diff --git a/.clang-tidy b/.clang-tidy
index 3be1d9e0c..c01182eb4 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -1,4 +1,4 @@
-Checks: 'modernize-*,-modernize-make-*,-modernize-use-auto,-modernize-raw-string-literal,-modernize-avoid-c-arrays,-modernize-use-trailing-return-type,google-*,-google-default-arguments,-clang-diagnostic-#pragma-messages,readability-identifier-naming'
+Checks: 'modernize-*,-modernize-use-nodiscard,-modernize-concat-nested-namespaces,-modernize-make-*,-modernize-use-auto,-modernize-raw-string-literal,-modernize-avoid-c-arrays,-modernize-use-trailing-return-type,google-*,-google-default-arguments,-clang-diagnostic-#pragma-messages,readability-identifier-naming'
 CheckOptions:
   - { key: readability-identifier-naming.ClassCase,                 value: CamelCase  }
   - { key: readability-identifier-naming.StructCase,                value: CamelCase  }
diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml
index c3df3e66d..8efcdc2ec 100644
--- a/.github/workflows/jvm_tests.yml
+++ b/.github/workflows/jvm_tests.yml
@@ -34,11 +34,11 @@ jobs:
         python -m pip install awscli
 
     - name: Cache Maven packages
-      uses: actions/cache@937d24475381cd9c75ae6db12cb4e79714b926ed # v3.0.11
+      uses: actions/cache@6998d139ddd3e68c71e9e398d8e40b71a2f39812 # v3.2.5
       with:
         path: ~/.m2
         key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }}
-        restore-keys: ${{ runner.os }}-m2
+        restore-keys: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }}
 
     - name: Test XGBoost4J
       run: |
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 822ae14d8..ac50b744b 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -144,7 +144,18 @@ jobs:
         python -m pip install wheel setuptools cpplint pylint
     - name: Run lint
       run: |
-        python dmlc-core/scripts/lint.py xgboost cpp R-package/src
+        python3 dmlc-core/scripts/lint.py xgboost cpp R-package/src
+
+        python3 dmlc-core/scripts/lint.py --exclude_path \
+            python-package/xgboost/dmlc-core \
+            python-package/xgboost/include \
+            python-package/xgboost/lib \
+            python-package/xgboost/rabit \
+            python-package/xgboost/src \
+            --pylint-rc python-package/.pylintrc \
+            xgboost \
+            cpp \
+            include src python-package
 
   sphinx:
     runs-on: ubuntu-latest
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1b79ccc4a..bfdbb6aa5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
 project(xgboost LANGUAGES CXX C VERSION 2.0.0)
 include(cmake/Utils.cmake)
 list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
@@ -212,9 +212,6 @@ find_package(Threads REQUIRED)
 
 if (USE_OPENMP)
   if (APPLE)
-    # Require CMake 3.16+ on Mac OSX, as previous versions of CMake had trouble locating
-    # OpenMP on Mac. See https://github.com/dmlc/xgboost/pull/5146#issuecomment-568312706
-    cmake_minimum_required(VERSION 3.16)
     find_package(OpenMP)
     if (NOT OpenMP_FOUND)
       # Try again with extra path info; required for libomp 15+ from Homebrew
diff --git a/R-package/CMakeLists.txt b/R-package/CMakeLists.txt
index c61fe1c61..003a635a5 100644
--- a/R-package/CMakeLists.txt
+++ b/R-package/CMakeLists.txt
@@ -30,7 +30,7 @@ if (USE_OPENMP)
 endif (USE_OPENMP)
 set_target_properties(
   xgboost-r PROPERTIES
-  CXX_STANDARD 14
+  CXX_STANDARD 17
   CXX_STANDARD_REQUIRED ON
   POSITION_INDEPENDENT_CODE ON)
 
diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index 3b230ac87..9ceef2fda 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -66,4 +66,4 @@ Imports:
     jsonlite (>= 1.0),
 RoxygenNote: 7.2.3
 Encoding: UTF-8
-SystemRequirements: GNU make, C++14
+SystemRequirements: GNU make, C++17
diff --git a/R-package/configure b/R-package/configure
index 834cb95c2..19ea48a91 100755
--- a/R-package/configure
+++ b/R-package/configure
@@ -2096,9 +2096,9 @@ if test -z "${R_HOME}"; then
   exit 1
 fi
 
-CXX14=`"${R_HOME}/bin/R" CMD config CXX14`
-CXX14STD=`"${R_HOME}/bin/R" CMD config CXX14STD`
-CXX="${CXX14} ${CXX14STD}"
+CXX17=`"${R_HOME}/bin/R" CMD config CXX17`
+CXX17STD=`"${R_HOME}/bin/R" CMD config CXX17STD`
+CXX="${CXX17} ${CXX17STD}"
 CXXFLAGS=`"${R_HOME}/bin/R" CMD config CXXFLAGS`
 
 CC=`"${R_HOME}/bin/R" CMD config CC`
diff --git a/R-package/configure.ac b/R-package/configure.ac
index 4e6cfee70..1fb6ea35a 100644
--- a/R-package/configure.ac
+++ b/R-package/configure.ac
@@ -10,9 +10,9 @@ if test -z "${R_HOME}"; then
   exit 1
 fi
 
-CXX14=`"${R_HOME}/bin/R" CMD config CXX14`
-CXX14STD=`"${R_HOME}/bin/R" CMD config CXX14STD`
-CXX="${CXX14} ${CXX14STD}"
+CXX17=`"${R_HOME}/bin/R" CMD config CXX17`
+CXX17STD=`"${R_HOME}/bin/R" CMD config CXX17STD`
+CXX="${CXX17} ${CXX17STD}"
 CXXFLAGS=`"${R_HOME}/bin/R" CMD config CXXFLAGS`
 
 CC=`"${R_HOME}/bin/R" CMD config CC`
diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in
index 630965e38..ed3f10571 100644
--- a/R-package/src/Makevars.in
+++ b/R-package/src/Makevars.in
@@ -3,7 +3,7 @@ PKGROOT=../../
 ENABLE_STD_THREAD=1
 # _*_ mode: Makefile; _*_
 
-CXX_STD = CXX14
+CXX_STD = CXX17
 
 XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\
            -DDMLC_ENABLE_STD_THREAD=$(ENABLE_STD_THREAD) -DDMLC_DISABLE_STDIN=1\
@@ -36,6 +36,8 @@ OBJECTS= \
     $(PKGROOT)/src/objective/hinge.o \
     $(PKGROOT)/src/objective/aft_obj.o \
     $(PKGROOT)/src/objective/adaptive.o \
+    $(PKGROOT)/src/objective/init_estimation.o \
+    $(PKGROOT)/src/objective/quantile_obj.o \
     $(PKGROOT)/src/gbm/gbm.o \
     $(PKGROOT)/src/gbm/gbtree.o \
     $(PKGROOT)/src/gbm/gbtree_model.o \
diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win
index 09f09598a..024ba1aa1 100644
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -3,7 +3,7 @@ PKGROOT=../../
 ENABLE_STD_THREAD=0
 # _*_ mode: Makefile; _*_
 
-CXX_STD = CXX14
+CXX_STD = CXX17
 
 XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\
            -DDMLC_ENABLE_STD_THREAD=$(ENABLE_STD_THREAD) -DDMLC_DISABLE_STDIN=1\
@@ -36,6 +36,8 @@ OBJECTS= \
     $(PKGROOT)/src/objective/hinge.o \
     $(PKGROOT)/src/objective/aft_obj.o \
     $(PKGROOT)/src/objective/adaptive.o \
+    $(PKGROOT)/src/objective/init_estimation.o \
+    $(PKGROOT)/src/objective/quantile_obj.o \
     $(PKGROOT)/src/gbm/gbm.o \
     $(PKGROOT)/src/gbm/gbtree.o \
     $(PKGROOT)/src/gbm/gbtree_model.o \
diff --git a/cmake/Sanitizer.cmake b/cmake/Sanitizer.cmake
index 2f7c913c3..77d7c93c1 100644
--- a/cmake/Sanitizer.cmake
+++ b/cmake/Sanitizer.cmake
@@ -8,9 +8,6 @@ macro(enable_sanitizer sanitizer)
   if(${sanitizer} MATCHES "address")
     find_package(ASan)
     set(SAN_COMPILE_FLAGS "${SAN_COMPILE_FLAGS} -fsanitize=address")
-    if (ASan_FOUND)
-      link_libraries(${ASan_LIBRARY})
-    endif (ASan_FOUND)
 
   elseif(${sanitizer} MATCHES "thread")
     find_package(TSan)
@@ -22,16 +19,10 @@ macro(enable_sanitizer sanitizer)
   elseif(${sanitizer} MATCHES "leak")
     find_package(LSan)
     set(SAN_COMPILE_FLAGS "${SAN_COMPILE_FLAGS} -fsanitize=leak")
-    if (LSan_FOUND)
-      link_libraries(${LSan_LIBRARY})
-    endif (LSan_FOUND)
 
   elseif(${sanitizer} MATCHES "undefined")
     find_package(UBSan)
     set(SAN_COMPILE_FLAGS "${SAN_COMPILE_FLAGS} -fsanitize=undefined -fno-sanitize-recover=undefined")
-    if (UBSan_FOUND)
-      link_libraries(${UBSan_LIBRARY})
-    endif (UBSan_FOUND)
 
   else()
     message(FATAL_ERROR "Santizer ${sanitizer} not supported.")
diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
index f28c1f270..3a66735fe 100644
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@@ -178,17 +178,10 @@ function(xgboost_set_cuda_flags target)
       $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/utf-8>)
   endif (MSVC)
 
-  if (PLUGIN_RMM)
-    set_target_properties(${target} PROPERTIES
-      CUDA_STANDARD 17
-      CUDA_STANDARD_REQUIRED ON
-      CUDA_SEPARABLE_COMPILATION OFF)
-  else ()
-    set_target_properties(${target} PROPERTIES
-      CUDA_STANDARD 14
-      CUDA_STANDARD_REQUIRED ON
-      CUDA_SEPARABLE_COMPILATION OFF)
-  endif (PLUGIN_RMM)
+  set_target_properties(${target} PROPERTIES
+    CUDA_STANDARD 17
+    CUDA_STANDARD_REQUIRED ON
+    CUDA_SEPARABLE_COMPILATION OFF)
 endfunction(xgboost_set_cuda_flags)
 
 macro(xgboost_link_nccl target)
@@ -205,17 +198,10 @@ endmacro(xgboost_link_nccl)
 
 # compile options
 macro(xgboost_target_properties target)
-  if (PLUGIN_RMM)
-    set_target_properties(${target} PROPERTIES
-      CXX_STANDARD 17
-      CXX_STANDARD_REQUIRED ON
-      POSITION_INDEPENDENT_CODE ON)
-  else ()
-    set_target_properties(${target} PROPERTIES
-      CXX_STANDARD 14
-      CXX_STANDARD_REQUIRED ON
-      POSITION_INDEPENDENT_CODE ON)
-  endif (PLUGIN_RMM)
+  set_target_properties(${target} PROPERTIES
+    CXX_STANDARD 17
+    CXX_STANDARD_REQUIRED ON
+    POSITION_INDEPENDENT_CODE ON)
 
   if (HIDE_CXX_SYMBOLS)
     #-- Hide all C++ symbols
diff --git a/cmake/modules/FindASan.cmake b/cmake/modules/FindASan.cmake
index e7b273853..9c4dc1566 100644
--- a/cmake/modules/FindASan.cmake
+++ b/cmake/modules/FindASan.cmake
@@ -1,7 +1,7 @@
 set(ASan_LIB_NAME ASan)
 
 find_library(ASan_LIBRARY
-  NAMES libasan.so libasan.so.5 libasan.so.4 libasan.so.3 libasan.so.2 libasan.so.1 libasan.so.0
+  NAMES libasan.so libasan.so.6 libasan.so.5 libasan.so.4 libasan.so.3 libasan.so.2 libasan.so.1 libasan.so.0
   PATHS ${SANITIZER_PATH} /usr/lib64 /usr/lib /usr/local/lib64 /usr/local/lib ${CMAKE_PREFIX_PATH}/lib)
 
 include(FindPackageHandleStandardArgs)
diff --git a/cmake/version_config.h.in b/cmake/version_config.h.in
index dfde79a5a..38d64fa9e 100644
--- a/cmake/version_config.h.in
+++ b/cmake/version_config.h.in
@@ -1,11 +1,11 @@
-/*!
- * Copyright 2019 XGBoost contributors
+/**
+ * Copyright 2019-2023 by XGBoost contributors
  */
 #ifndef XGBOOST_VERSION_CONFIG_H_
 #define XGBOOST_VERSION_CONFIG_H_
 
-#define XGBOOST_VER_MAJOR @xgboost_VERSION_MAJOR@
-#define XGBOOST_VER_MINOR @xgboost_VERSION_MINOR@
-#define XGBOOST_VER_PATCH @xgboost_VERSION_PATCH@
+#define XGBOOST_VER_MAJOR @xgboost_VERSION_MAJOR@  /* NOLINT */
+#define XGBOOST_VER_MINOR @xgboost_VERSION_MINOR@  /* NOLINT */
+#define XGBOOST_VER_PATCH @xgboost_VERSION_PATCH@  /* NOLINT */
 
 #endif  // XGBOOST_VERSION_CONFIG_H_
diff --git a/demo/c-api/CMakeLists.txt b/demo/c-api/CMakeLists.txt
index 25764c12a..9764267aa 100644
--- a/demo/c-api/CMakeLists.txt
+++ b/demo/c-api/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.13)
+cmake_minimum_required(VERSION 3.18)
 project(xgboost-c-examples)
 
 add_subdirectory(basic)
diff --git a/demo/c-api/external-memory/CMakeLists.txt b/demo/c-api/external-memory/CMakeLists.txt
index 0c21acb3c..5e68e9918 100644
--- a/demo/c-api/external-memory/CMakeLists.txt
+++ b/demo/c-api/external-memory/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.13)
+cmake_minimum_required(VERSION 3.18)
 project(external-memory-demo LANGUAGES C VERSION 0.0.1)
 
 find_package(xgboost REQUIRED)
diff --git a/demo/c-api/inference/CMakeLists.txt b/demo/c-api/inference/CMakeLists.txt
index 4d0f3cd6e..6aa8f1dd2 100644
--- a/demo/c-api/inference/CMakeLists.txt
+++ b/demo/c-api/inference/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.13)
+cmake_minimum_required(VERSION 3.18)
 project(inference-demo LANGUAGES C VERSION 0.0.1)
 find_package(xgboost REQUIRED)
 
diff --git a/demo/dask/cpu_survival.py b/demo/dask/cpu_survival.py
index 629667b12..83eddd361 100644
--- a/demo/dask/cpu_survival.py
+++ b/demo/dask/cpu_survival.py
@@ -8,9 +8,9 @@ import os
 
 import dask.dataframe as dd
 from dask.distributed import Client, LocalCluster
-from xgboost.dask import DaskDMatrix
 
 import xgboost as xgb
+from xgboost.dask import DaskDMatrix
 
 
 def main(client):
diff --git a/demo/dask/cpu_training.py b/demo/dask/cpu_training.py
index 7fc5d2d1c..a31e5d2a6 100644
--- a/demo/dask/cpu_training.py
+++ b/demo/dask/cpu_training.py
@@ -5,9 +5,9 @@ Example of training with Dask on CPU
 """
 from dask import array as da
 from dask.distributed import Client, LocalCluster
-from xgboost.dask import DaskDMatrix
 
 import xgboost as xgb
+from xgboost.dask import DaskDMatrix
 
 
 def main(client):
diff --git a/demo/dask/dask_callbacks.py b/demo/dask/dask_callbacks.py
index a80ede01f..408297d9e 100644
--- a/demo/dask/dask_callbacks.py
+++ b/demo/dask/dask_callbacks.py
@@ -6,9 +6,9 @@ import numpy as np
 from dask.distributed import Client, LocalCluster
 from dask_ml.datasets import make_regression
 from dask_ml.model_selection import train_test_split
-from xgboost.dask import DaskDMatrix
 
 import xgboost as xgb
+from xgboost.dask import DaskDMatrix
 
 
 def probability_for_going_backward(epoch):
diff --git a/demo/dask/gpu_training.py b/demo/dask/gpu_training.py
index cf09f8e44..23cbfb47c 100644
--- a/demo/dask/gpu_training.py
+++ b/demo/dask/gpu_training.py
@@ -7,10 +7,10 @@ from dask import array as da
 from dask import dataframe as dd
 from dask.distributed import Client
 from dask_cuda import LocalCUDACluster
-from xgboost.dask import DaskDMatrix
 
 import xgboost as xgb
 from xgboost import dask as dxgb
+from xgboost.dask import DaskDMatrix
 
 
 def using_dask_matrix(client: Client, X, y):
diff --git a/demo/guide-python/quantile_regression.py b/demo/guide-python/quantile_regression.py
new file mode 100644
index 000000000..d92115bf0
--- /dev/null
+++ b/demo/guide-python/quantile_regression.py
@@ -0,0 +1,122 @@
+"""
+Quantile Regression
+===================
+
+The script is inspired by this awesome example in sklearn:
+https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_quantile.html
+
+"""
+import argparse
+from typing import Dict
+
+import numpy as np
+from sklearn.model_selection import train_test_split
+
+import xgboost as xgb
+
+
+def f(x: np.ndarray) -> np.ndarray:
+    """Ground-truth function for the demo: ``x * sin(x)``, evaluated element-wise."""
+    return np.multiply(x, np.sin(x))
+
+
+def quantile_loss(args: argparse.Namespace) -> None:
+    """Train quantile and MSE models on synthetic data; plot them when ``args.plot``."""
+    rng = np.random.RandomState(1994)
+    # Generate a synthetic dataset for the demo; the generation process is from the
+    # sklearn example.
+    X = np.atleast_2d(rng.uniform(0, 10.0, size=1000)).T
+    expected_y = f(X).ravel()
+
+    sigma = 0.5 + X.ravel() / 10.0
+    noise = rng.lognormal(sigma=sigma) - np.exp(sigma**2.0 / 2.0)
+    y = expected_y + noise
+
+    # Train on the 0.05, 0.5 and 0.95 quantiles. The model is similar to multi-class
+    # and multi-target models.
+    alpha = np.array([0.05, 0.5, 0.95])
+    evals_result: Dict[str, Dict] = {}
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
+    # We will be using the `hist` tree method, so a QuantileDMatrix can be used to save
+    # memory. Train only on the training split so the test split stays held out.
+    # Do not use the `exact` tree method for quantile regression, otherwise the
+    # performance might drop.
+    Xy = xgb.QuantileDMatrix(X_train, y_train)
+    # Use the training DMatrix as the reference when building the test DMatrix.
+    Xy_test = xgb.QuantileDMatrix(X_test, y_test, ref=Xy)
+
+    booster = xgb.train(
+        {
+            # Use the quantile objective function.
+            "objective": "reg:quantileerror",
+            "tree_method": "hist",
+            "quantile_alpha": alpha,
+            # Let's try not to overfit.
+            "learning_rate": 0.04,
+            "max_depth": 5,
+        },
+        Xy,
+        num_boost_round=32,
+        early_stopping_rounds=2,
+        # The evaluation result is a weighted average across multiple quantiles.
+        evals=[(Xy, "Train"), (Xy_test, "Test")],
+        evals_result=evals_result,
+    )
+    xx = np.atleast_2d(np.linspace(0, 10, 1000)).T
+    scores = booster.inplace_predict(xx)
+    # dim 1 is the quantiles
+    assert scores.shape[0] == xx.shape[0]
+    assert scores.shape[1] == alpha.shape[0]
+
+    y_lower = scores[:, 0]  # alpha=0.05
+    y_med = scores[:, 1]  # alpha=0.5, median
+    y_upper = scores[:, 2]  # alpha=0.95
+
+    # Train an MSE model for comparison
+    booster = xgb.train(
+        {
+            "objective": "reg:squarederror",
+            "tree_method": "hist",
+            # Let's try not to overfit.
+            "learning_rate": 0.04,
+            "max_depth": 5,
+        },
+        Xy,
+        num_boost_round=32,
+        early_stopping_rounds=2,
+        evals=[(Xy, "Train"), (Xy_test, "Test")],
+        evals_result=evals_result,
+    )
+    xx = np.atleast_2d(np.linspace(0, 10, 1000)).T
+    y_pred = booster.inplace_predict(xx)
+
+    if args.plot:
+        from matplotlib import pyplot as plt
+
+        fig = plt.figure(figsize=(10, 10))
+        plt.plot(xx, f(xx), "g:", linewidth=3, label=r"$f(x) = x\,\sin(x)$")
+        plt.plot(X_test, y_test, "b.", markersize=10, label="Test observations")
+        plt.plot(xx, y_med, "r-", label="Predicted median")
+        plt.plot(xx, y_pred, "m-", label="Predicted mean")
+        plt.plot(xx, y_upper, "k-")
+        plt.plot(xx, y_lower, "k-")
+        plt.fill_between(
+            xx.ravel(), y_lower, y_upper, alpha=0.4, label="Predicted 90% interval"
+        )
+        plt.xlabel("$x$")
+        plt.ylabel("$f(x)$")
+        plt.ylim(-10, 25)
+        plt.legend(loc="upper left")
+        plt.show()
+
+
+if __name__ == "__main__":
+    cli = argparse.ArgumentParser()
+    cli.add_argument(
+        "--plot",
+        action="store_true",
+        help="Specify it to enable plotting the outputs.",
+    )
+    # Parse the command line and run the demo with the resulting options.
+    quantile_loss(cli.parse_args())
diff --git a/demo/guide-python/spark_estimator_examples.py b/demo/guide-python/spark_estimator_examples.py
index cbc3862e5..97caef610 100644
--- a/demo/guide-python/spark_estimator_examples.py
+++ b/demo/guide-python/spark_estimator_examples.py
@@ -10,6 +10,7 @@ from pyspark.ml.linalg import Vectors
 from pyspark.sql import SparkSession
 from pyspark.sql.functions import rand
 from sklearn.model_selection import train_test_split
+
 from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
 
 spark = SparkSession.builder.master("local[*]").getOrCreate()
diff --git a/demo/nvflare/custom/controller.py b/demo/nvflare/custom/controller.py
index ae2933ad8..dd3e39f46 100644
--- a/demo/nvflare/custom/controller.py
+++ b/demo/nvflare/custom/controller.py
@@ -4,7 +4,6 @@ Example of training controller with NVFlare
 """
 import multiprocessing
 
-import xgboost.federated
 from nvflare.apis.client import Client
 from nvflare.apis.fl_context import FLContext
 from nvflare.apis.impl.controller import Controller, Task
@@ -12,6 +11,8 @@ from nvflare.apis.shareable import Shareable
 from nvflare.apis.signal import Signal
 from trainer import SupportedTasks
 
+import xgboost.federated
+
 
 class XGBoostController(Controller):
     def __init__(self, port: int, world_size: int, server_key_path: str,
diff --git a/dmlc-core b/dmlc-core
index dfd936526..81db53948 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit dfd9365264a060a5096734b7d892e1858b6d2722
+Subproject commit 81db539486ce6525b31b971545edffee2754aced
diff --git a/doc/model.schema b/doc/model.schema
index d91039db3..07a871820 100644
--- a/doc/model.schema
+++ b/doc/model.schema
@@ -440,6 +440,20 @@
               },
               "type": "object"
             },
+            {
+              "properties": {
+                "name": {
+                  "const": "reg:quantileerror"
+                },
+                "quantile_loss_param": {
+                  "type": "object",
+                  "properties": {
+                    "quantile_alpha": {"type": "array"}
+                  }
+                }
+              },
+              "type": "object"
+            },
             {
               "type": "object",
               "properties": {
diff --git a/doc/parameter.rst b/doc/parameter.rst
index 6232884e8..99d6f0585 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -348,6 +348,7 @@ Specify the learning task and the corresponding learning objective. The objectiv
   - ``reg:logistic``: logistic regression.
   - ``reg:pseudohubererror``: regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss.
   - ``reg:absoluteerror``: Regression with L1 error. When tree model is used, leaf value is refreshed after tree construction. If used in distributed training, the leaf value is calculated as the mean value from all workers, which is not guaranteed to be optimal.
+  - ``reg:quantileerror``: Quantile loss, also known as ``pinball loss``. See later sections for its parameter and :ref:`sphx_glr_python_examples_quantile_regression.py` for a worked example.
   - ``binary:logistic``: logistic regression for binary classification, output probability
   - ``binary:logitraw``: logistic regression for binary classification, output score before logistic transformation
   - ``binary:hinge``: hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities.
@@ -441,6 +442,11 @@ Parameter for using Pseudo-Huber (``reg:pseudohubererror``)
 
 * ``huber_slope`` : A parameter used for Pseudo-Huber loss to define the :math:`\delta` term. [default = 1.0]
 
+Parameter for using Quantile Loss (``reg:quantileerror``)
+=========================================================
+
+* ``quantile_alpha``: A scalar or a list of targeted quantiles.
+
 ***********************
 Command Line Parameters
 ***********************
diff --git a/doc/tutorials/c_api_tutorial.rst b/doc/tutorials/c_api_tutorial.rst
index 1cfec70f4..ca121e1d2 100644
--- a/doc/tutorials/c_api_tutorial.rst
+++ b/doc/tutorials/c_api_tutorial.rst
@@ -45,7 +45,7 @@ Use ``find_package()`` and ``target_link_libraries()`` in your application's CMa
 
 .. code-block:: cmake
 
-    cmake_minimum_required(VERSION 3.13)
+    cmake_minimum_required(VERSION 3.18)
     project(your_project_name LANGUAGES C CXX VERSION your_project_version)
     find_package(xgboost REQUIRED)
     add_executable(your_project_name /path/to/project_file.c)
diff --git a/include/xgboost/base.h b/include/xgboost/base.h
index ba2ea7886..d12e71a3a 100644
--- a/include/xgboost/base.h
+++ b/include/xgboost/base.h
@@ -48,21 +48,6 @@
 #define XGBOOST_ALIGNAS(X)
 #endif  // defined(__GNUC__) && ((__GNUC__ == 4 && __GNUC_MINOR__ >= 8) || __GNUC__ > 4)
 
-#if defined(__GNUC__) && ((__GNUC__ == 4 && __GNUC_MINOR__ >= 8) || __GNUC__ > 4) && \
-    !defined(__CUDACC__) && !defined(__sun) && !defined(sun)
-#include <parallel/algorithm>
-#define XGBOOST_PARALLEL_SORT(X, Y, Z) __gnu_parallel::sort((X), (Y), (Z))
-#define XGBOOST_PARALLEL_STABLE_SORT(X, Y, Z) \
-  __gnu_parallel::stable_sort((X), (Y), (Z))
-#elif defined(_MSC_VER) && (!__INTEL_COMPILER)
-#include <ppl.h>
-#define XGBOOST_PARALLEL_SORT(X, Y, Z) concurrency::parallel_sort((X), (Y), (Z))
-#define XGBOOST_PARALLEL_STABLE_SORT(X, Y, Z) std::stable_sort((X), (Y), (Z))
-#else
-#define XGBOOST_PARALLEL_SORT(X, Y, Z) std::sort((X), (Y), (Z))
-#define XGBOOST_PARALLEL_STABLE_SORT(X, Y, Z) std::stable_sort((X), (Y), (Z))
-#endif  // GLIBC VERSION
-
 #if defined(__GNUC__)
 #define XGBOOST_EXPECT(cond, ret)  __builtin_expect((cond), (ret))
 #else
diff --git a/include/xgboost/cache.h b/include/xgboost/cache.h
index 142c33a57..781f45b1c 100644
--- a/include/xgboost/cache.h
+++ b/include/xgboost/cache.h
@@ -4,18 +4,21 @@
 #ifndef XGBOOST_CACHE_H_
 #define XGBOOST_CACHE_H_
 
-#include <xgboost/logging.h>  // CHECK_EQ
+#include <xgboost/logging.h>  // for CHECK_EQ, CHECK
 
-#include <cstddef>            // std::size_t
-#include <memory>             // std::weak_ptr,std::shared_ptr,std::make_shared
-#include <queue>              // std:queue
-#include <unordered_map>      // std::unordered_map
-#include <vector>             // std::vector
+#include <cstddef>            // for size_t
+#include <memory>             // for weak_ptr, shared_ptr, make_shared
+#include <mutex>              // for mutex, lock_guard
+#include <queue>              // for queue
+#include <thread>             // for thread
+#include <unordered_map>      // for unordered_map
+#include <utility>            // for move
+#include <vector>             // for vector
 
 namespace xgboost {
 class DMatrix;
 /**
- * \brief FIFO cache for DMatrix related data.
+ * \brief Thread-aware FIFO cache for DMatrix related data.
  *
  * \tparam CacheT The type that needs to be cached.
  */
@@ -30,13 +33,37 @@ class DMatrixCache {
 
     CacheT const& Value() const { return *value; }
     CacheT& Value() { return *value; }
+
+    Item(std::shared_ptr<DMatrix> m, std::shared_ptr<CacheT> v) : ref{m}, value{std::move(v)} {}
   };
 
   static constexpr std::size_t DefaultSize() { return 32; }
 
+ private:
+  mutable std::mutex lock_;
+
  protected:
-  std::unordered_map<DMatrix const*, Item> container_;
-  std::queue<DMatrix const*> queue_;
+  struct Key {
+    DMatrix const* ptr;
+    std::thread::id const thread_id;
+
+    bool operator==(Key const& that) const {
+      return ptr == that.ptr && thread_id == that.thread_id;
+    }
+  };
+  struct Hash {
+    std::size_t operator()(Key const& key) const noexcept {
+      std::size_t f = std::hash<DMatrix const*>()(key.ptr);
+      std::size_t s = std::hash<std::thread::id>()(key.thread_id);
+      if (f == s) {
+        return f;
+      }
+      return f ^ s;
+    }
+  };
+
+  std::unordered_map<Key, Item, Hash> container_;
+  std::queue<Key> queue_;
   std::size_t max_size_;
 
   void CheckConsistent() const { CHECK_EQ(queue_.size(), container_.size()); }
@@ -44,8 +71,8 @@ class DMatrixCache {
   void ClearExpired() {
     // Clear expired entries
     this->CheckConsistent();
-    std::vector<DMatrix const*> expired;
-    std::queue<DMatrix const*> remained;
+    std::vector<Key> expired;
+    std::queue<Key> remained;
 
     while (!queue_.empty()) {
       auto p_fmat = queue_.front();
@@ -61,8 +88,8 @@ class DMatrixCache {
     CHECK(queue_.empty());
     CHECK_EQ(remained.size() + expired.size(), container_.size());
 
-    for (auto const* p_fmat : expired) {
-      container_.erase(p_fmat);
+    for (auto const& key : expired) {
+      container_.erase(key);
     }
     while (!remained.empty()) {
       auto p_fmat = remained.front();
@@ -74,7 +101,9 @@ class DMatrixCache {
 
   void ClearExcess() {
     this->CheckConsistent();
-    while (queue_.size() >= max_size_) {
+    // Clear half of the entries to avoid repeatedly clearing the cache.
+    std::size_t half_size = max_size_ / 2;
+    while (queue_.size() >= half_size && !queue_.empty()) {
       auto p_fmat = queue_.front();
       queue_.pop();
       container_.erase(p_fmat);
@@ -88,7 +117,7 @@ class DMatrixCache {
    */
   explicit DMatrixCache(std::size_t cache_size) : max_size_{cache_size} {}
   /**
-   * \brief Cache a new DMatrix if it's no in the cache already.
+   * \brief Cache a new DMatrix if it's not in the cache already.
    *
    *  Passing in a `shared_ptr` is critical here.  First to create a `weak_ptr` inside the
    *  entry this shared pointer is necessary.  More importantly, the life time of this
@@ -101,35 +130,42 @@ class DMatrixCache {
    *         created.
    */
   template <typename... Args>
-  std::shared_ptr<CacheT>& CacheItem(std::shared_ptr<DMatrix> m, Args const&... args) {
+  std::shared_ptr<CacheT> CacheItem(std::shared_ptr<DMatrix> m, Args const&... args) {
     CHECK(m);
+    std::lock_guard<std::mutex> guard{lock_};
+
     this->ClearExpired();
     if (container_.size() >= max_size_) {
       this->ClearExcess();
     }
     // after clear, cache size < max_size
     CHECK_LT(container_.size(), max_size_);
-    auto it = container_.find(m.get());
+    auto key = Key{m.get(), std::this_thread::get_id()};
+    auto it = container_.find(key);
     if (it == container_.cend()) {
       // after the new DMatrix, cache size is at most max_size
-      container_[m.get()] = {m, std::make_shared<CacheT>(args...)};
-      queue_.push(m.get());
+      container_.emplace(key, Item{m, std::make_shared<CacheT>(args...)});
+      queue_.emplace(key);
     }
-    return container_.at(m.get()).value;
+    return container_.at(key).value;
   }
   /**
    * \brief Get a const reference to the underlying hash map.  Clear expired caches before
    *        returning.
    */
   decltype(container_) const& Container() {
+    std::lock_guard<std::mutex> guard{lock_};
+
     this->ClearExpired();
     return container_;
   }
 
   std::shared_ptr<CacheT> Entry(DMatrix const* m) const {
-    CHECK(container_.find(m) != container_.cend());
-    CHECK(!container_.at(m).ref.expired());
-    return container_.at(m).value;
+    std::lock_guard<std::mutex> guard{lock_};
+    auto key = Key{m, std::this_thread::get_id()};
+    CHECK(container_.find(key) != container_.cend());
+    CHECK(!container_.at(key).ref.expired());
+    return container_.at(key).value;
   }
 };
 }  // namespace xgboost
diff --git a/include/xgboost/data.h b/include/xgboost/data.h
index 9411fcfab..ec78c588d 100644
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@@ -124,18 +124,7 @@ class MetaInfo {
     return weights_.Size() != 0 ?  weights_.HostVector()[i] : 1.0f;
   }
   /*! \brief get sorted indexes (argsort) of labels by absolute value (used by cox loss) */
-  inline const std::vector<size_t>& LabelAbsSort() const {
-    if (label_order_cache_.size() == labels.Size()) {
-      return label_order_cache_;
-    }
-    label_order_cache_.resize(labels.Size());
-    std::iota(label_order_cache_.begin(), label_order_cache_.end(), 0);
-    const auto& l = labels.Data()->HostVector();
-    XGBOOST_PARALLEL_STABLE_SORT(label_order_cache_.begin(), label_order_cache_.end(),
-              [&l](size_t i1, size_t i2) {return std::abs(l[i1]) < std::abs(l[i2]);});
-
-    return label_order_cache_;
-  }
+  const std::vector<size_t>& LabelAbsSort(Context const* ctx) const;
   /*! \brief clear all the information */
   void Clear();
   /*!
@@ -540,6 +529,16 @@ class DMatrix {
     return Info().num_nonzero_ == Info().num_row_ * Info().num_col_;
   }
 
+  /*! \brief Whether the data is split row-wise. */
+  bool IsRowSplit() const {
+    return Info().data_split_mode == DataSplitMode::kRow;
+  }
+
+  /*! \brief Whether the data is split column-wise. */
+  bool IsColumnSplit() const {
+    return Info().data_split_mode == DataSplitMode::kCol;
+  }
+
   /*!
    * \brief Load DMatrix from URI.
    * \param uri The URI of input.
diff --git a/include/xgboost/json.h b/include/xgboost/json.h
index 3546e58d1..3b34c2874 100644
--- a/include/xgboost/json.h
+++ b/include/xgboost/json.h
@@ -1,5 +1,5 @@
 /**
- * Copyright by XGBoost Contributors 2019-2023
+ * Copyright 2019-2023 by XGBoost Contributors
  */
 #ifndef XGBOOST_JSON_H_
 #define XGBOOST_JSON_H_
@@ -372,7 +372,7 @@ class Json {
   /*! \brief Use your own JsonWriter. */
   static void Dump(Json json, JsonWriter* writer);
 
-  Json() : ptr_{new JsonNull} {}
+  Json() = default;
 
   // number
   explicit Json(JsonNumber number) : ptr_{new JsonNumber(std::move(number))} {}
@@ -462,7 +462,7 @@ class Json {
   IntrusivePtr<Value> const& Ptr() const { return ptr_; }
 
  private:
-  IntrusivePtr<Value> ptr_;
+  IntrusivePtr<Value> ptr_{new JsonNull};
 };
 
 /**
diff --git a/include/xgboost/json_io.h b/include/xgboost/json_io.h
index 742231055..e11545b04 100644
--- a/include/xgboost/json_io.h
+++ b/include/xgboost/json_io.h
@@ -22,13 +22,13 @@ namespace detail {
 // static_cast and std::to_string.
 template <typename Char, std::enable_if_t<std::is_signed<Char>::value>* = nullptr>
 std::string CharToStr(Char c) {
-  static_assert(std::is_same<Char, char>::value, "");
+  static_assert(std::is_same<Char, char>::value);
   return std::string{c};
 }
 
 template <typename Char, std::enable_if_t<!std::is_signed<Char>::value>* = nullptr>
 std::string CharToStr(Char c) {
-  static_assert(std::is_same<Char, char>::value, "");
+  static_assert(std::is_same<Char, char>::value);
   return (c <= static_cast<char>(127) ? std::string{c} : std::to_string(c));
 }
 }  // namespace detail
diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h
index 2f84bb1cb..3d6bcc962 100644
--- a/include/xgboost/linalg.h
+++ b/include/xgboost/linalg.h
@@ -15,14 +15,19 @@
 
 #include <algorithm>
 #include <cassert>
-#include <cinttypes>  // std::int32_t
+#include <cinttypes>  // for int32_t
+#include <cstddef>    // for size_t
 #include <limits>
 #include <string>
-#include <tuple>
+#include <tuple>  // for make_tuple
 #include <type_traits>
 #include <utility>
 #include <vector>
 
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif  // defined(_MSC_VER)
+
 // decouple it from xgboost.
 #ifndef LINALG_HD
 #if defined(__CUDA__) || defined(__NVCC__)
@@ -32,8 +37,7 @@
 #endif  // defined (__CUDA__) || defined(__NVCC__)
 #endif  // LINALG_HD
 
-namespace xgboost {
-namespace linalg {
+namespace xgboost::linalg {
 namespace detail {
 
 struct ArrayInterfaceHandler {
@@ -47,14 +51,14 @@ struct ArrayInterfaceHandler {
 
 template <size_t dim, typename S, typename Head, size_t D>
 constexpr size_t Offset(S (&strides)[D], size_t n, Head head) {
-  static_assert(dim < D, "");
+  static_assert(dim < D);
   return n + head * strides[dim];
 }
 
 template <size_t dim, typename S, size_t D, typename Head, typename... Tail>
 constexpr std::enable_if_t<sizeof...(Tail) != 0, size_t> Offset(S (&strides)[D], size_t n,
                                                                 Head head, Tail &&...rest) {
-  static_assert(dim < D, "");
+  static_assert(dim < D);
   return Offset<dim + 1>(strides, n + (head * strides[dim]), std::forward<Tail>(rest)...);
 }
 
@@ -81,7 +85,7 @@ template <typename I>
 struct RangeTag {
   I beg;
   I end;
-  constexpr size_t Size() const { return end - beg; }
+  [[nodiscard]] constexpr size_t Size() const { return end - beg; }
 };
 
 /**
@@ -146,21 +150,41 @@ inline LINALG_HD int Popc(uint64_t v) {
   return __popcll(v);
 #elif defined(__GNUC__) || defined(__clang__)
   return __builtin_popcountll(v);
-#elif defined(_MSC_VER)
+#elif defined(_MSC_VER) && defined(_M_X64)
   return __popcnt64(v);
 #else
   return NativePopc(v);
 #endif  // compiler
 }
 
+template <std::size_t D, typename Head>
+LINALG_HD void IndexToArr(std::size_t (&arr)[D], Head head) {
+  static_assert(std::is_integral<std::remove_reference_t<Head>>::value, "Invalid index type.");
+  arr[D - 1] = head;
+}
+
+/**
+ * \brief Convert index from parameter pack to C-style array.
+ */
+template <std::size_t D, typename Head, typename... Rest>
+LINALG_HD void IndexToArr(std::size_t (&arr)[D], Head head, Rest &&...index) {
+  static_assert(sizeof...(Rest) < D, "Index overflow.");
+  static_assert(std::is_integral<std::remove_reference_t<Head>>::value, "Invalid index type.");
+  arr[D - sizeof...(Rest) - 1] = head;
+  IndexToArr(arr, std::forward<Rest>(index)...);
+}
+
 template <class T, std::size_t N, std::size_t... Idx>
-constexpr auto Arr2Tup(T (&arr)[N], std::index_sequence<Idx...>) {
+constexpr auto ArrToTuple(T (&arr)[N], std::index_sequence<Idx...>) {
   return std::make_tuple(arr[Idx]...);
 }
 
+/**
+ * \brief Convert C-style array to std::tuple.
+ */
 template <class T, std::size_t N>
-constexpr auto Arr2Tup(T (&arr)[N]) {
-  return Arr2Tup(arr, std::make_index_sequence<N>{});
+constexpr auto ArrToTuple(T (&arr)[N]) {
+  return ArrToTuple(arr, std::make_index_sequence<N>{});
 }
 
 // uint division optimization inspired by the CIndexer in cupy.  Division operation is
@@ -183,19 +207,19 @@ LINALG_HD auto UnravelImpl(I idx, common::Span<size_t const, D> shape) {
     }
   }
   index[0] = idx;
-  return Arr2Tup(index);
+  return ArrToTuple(index);
 }
 
 template <size_t dim, typename I, int32_t D>
 void ReshapeImpl(size_t (&out_shape)[D], I s) {
-  static_assert(dim < D, "");
+  static_assert(dim < D);
   out_shape[dim] = s;
 }
 
 template <size_t dim, int32_t D, typename... S, typename I,
           std::enable_if_t<sizeof...(S) != 0> * = nullptr>
 void ReshapeImpl(size_t (&out_shape)[D], I &&s, S &&...rest) {
-  static_assert(dim < D, "");
+  static_assert(dim < D);
   out_shape[dim] = s;
   ReshapeImpl<dim + 1>(out_shape, std::forward<S>(rest)...);
 }
@@ -225,7 +249,8 @@ struct Conjunction : std::true_type {};
 template <class B1>
 struct Conjunction<B1> : B1 {};
 template <class B1, class... Bn>
-struct Conjunction<B1, Bn...> : std::conditional_t<bool(B1::value), Conjunction<Bn...>, B1> {};
+struct Conjunction<B1, Bn...>
+    : std::conditional_t<static_cast<bool>(B1::value), Conjunction<Bn...>, B1> {};
 
 template <typename... Index>
 using IsAllIntegral = Conjunction<std::is_integral<std::remove_reference_t<Index>>...>;
@@ -246,6 +271,11 @@ constexpr detail::RangeTag<I> Range(I beg, I end) {
   return {beg, end};
 }
 
+enum Order : std::uint8_t {
+  kC,  // Row major
+  kF,  // Col major
+};
+
 /**
  * \brief A tensor view with static type and dimension. It implements indexing and slicing.
  *
@@ -286,8 +316,8 @@ class TensorView {
   template <size_t old_dim, size_t new_dim, int32_t D, typename I>
   LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D],
                                 detail::RangeTag<I> &&range) const {
-    static_assert(new_dim < D, "");
-    static_assert(old_dim < kDim, "");
+    static_assert(new_dim < D);
+    static_assert(old_dim < kDim);
     new_stride[new_dim] = stride_[old_dim];
     new_shape[new_dim] = range.Size();
     assert(static_cast<decltype(shape_[old_dim])>(range.end) <= shape_[old_dim]);
@@ -301,8 +331,8 @@ class TensorView {
   template <size_t old_dim, size_t new_dim, int32_t D, typename I, typename... S>
   LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D],
                                 detail::RangeTag<I> &&range, S &&...slices) const {
-    static_assert(new_dim < D, "");
-    static_assert(old_dim < kDim, "");
+    static_assert(new_dim < D);
+    static_assert(old_dim < kDim);
     new_stride[new_dim] = stride_[old_dim];
     new_shape[new_dim] = range.Size();
     assert(static_cast<decltype(shape_[old_dim])>(range.end) <= shape_[old_dim]);
@@ -315,8 +345,8 @@ class TensorView {
 
   template <size_t old_dim, size_t new_dim, int32_t D>
   LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], detail::AllTag) const {
-    static_assert(new_dim < D, "");
-    static_assert(old_dim < kDim, "");
+    static_assert(new_dim < D);
+    static_assert(old_dim < kDim);
     new_stride[new_dim] = stride_[old_dim];
     new_shape[new_dim] = shape_[old_dim];
     return 0;
@@ -327,8 +357,8 @@ class TensorView {
   template <size_t old_dim, size_t new_dim, int32_t D, typename... S>
   LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], detail::AllTag,
                                 S &&...slices) const {
-    static_assert(new_dim < D, "");
-    static_assert(old_dim < kDim, "");
+    static_assert(new_dim < D);
+    static_assert(old_dim < kDim);
     new_stride[new_dim] = stride_[old_dim];
     new_shape[new_dim] = shape_[old_dim];
     return MakeSliceDim<old_dim + 1, new_dim + 1, D>(new_shape, new_stride,
@@ -338,7 +368,7 @@ class TensorView {
   template <size_t old_dim, size_t new_dim, int32_t D, typename Index>
   LINALG_HD size_t MakeSliceDim(DMLC_ATTRIBUTE_UNUSED size_t new_shape[D],
                                 DMLC_ATTRIBUTE_UNUSED size_t new_stride[D], Index i) const {
-    static_assert(old_dim < kDim, "");
+    static_assert(old_dim < kDim);
     return stride_[old_dim] * i;
   }
   /**
@@ -347,7 +377,7 @@ class TensorView {
   template <size_t old_dim, size_t new_dim, int32_t D, typename Index, typename... S>
   LINALG_HD std::enable_if_t<std::is_integral<Index>::value, size_t> MakeSliceDim(
       size_t new_shape[D], size_t new_stride[D], Index i, S &&...slices) const {
-    static_assert(old_dim < kDim, "");
+    static_assert(old_dim < kDim);
     auto offset = stride_[old_dim] * i;
     auto res =
         MakeSliceDim<old_dim + 1, new_dim, D>(new_shape, new_stride, std::forward<S>(slices)...);
@@ -371,7 +401,11 @@ class TensorView {
    * \param device Device ordinal
    */
   template <typename I, int32_t D>
-  LINALG_HD TensorView(common::Span<T> data, I const (&shape)[D], int32_t device)
+  LINALG_HD TensorView(common::Span<T> data, I const (&shape)[D], std::int32_t device)
+      : TensorView{data, shape, device, Order::kC} {}
+
+  template <typename I, int32_t D>
+  LINALG_HD TensorView(common::Span<T> data, I const (&shape)[D], std::int32_t device, Order order)
       : data_{data}, ptr_{data_.data()}, device_{device} {
     static_assert(D > 0 && D <= kDim, "Invalid shape.");
     // shape
@@ -380,7 +414,19 @@ class TensorView {
       shape_[i] = 1;
     }
     // stride
-    detail::CalcStride(shape_, stride_);
+    switch (order) {
+      case Order::kC: {
+        detail::CalcStride(shape_, stride_);
+        break;
+      }
+      case Order::kF: {
+        detail::CalcStride<kDim, true>(shape_, stride_);
+        break;
+      }
+      default: {
+        SPAN_CHECK(false);
+      }
+    }
     // size
     this->CalcSize();
   }
@@ -484,19 +530,19 @@ class TensorView {
   /**
    * \brief Number of items in the tensor.
    */
-  LINALG_HD size_t Size() const { return size_; }
+  LINALG_HD [[nodiscard]] std::size_t Size() const { return size_; }
   /**
    * \brief Whether this is a contiguous array, both C and F contiguous returns true.
    */
-  LINALG_HD bool Contiguous() const {
+  LINALG_HD [[nodiscard]] bool Contiguous() const {
     return data_.size() == this->Size() || this->CContiguous() || this->FContiguous();
   }
   /**
    * \brief Whether it's a c-contiguous array.
    */
-  LINALG_HD bool CContiguous() const {
+  LINALG_HD [[nodiscard]] bool CContiguous() const {
     StrideT stride;
-    static_assert(std::is_same<decltype(stride), decltype(stride_)>::value, "");
+    static_assert(std::is_same<decltype(stride), decltype(stride_)>::value);
     // It's contiguous if the stride can be calculated from shape.
     detail::CalcStride(shape_, stride);
     return common::Span<size_t const, kDim>{stride_} == common::Span<size_t const, kDim>{stride};
@@ -504,9 +550,9 @@ class TensorView {
   /**
    * \brief Whether it's a f-contiguous array.
    */
-  LINALG_HD bool FContiguous() const {
+  LINALG_HD [[nodiscard]] bool FContiguous() const {
     StrideT stride;
-    static_assert(std::is_same<decltype(stride), decltype(stride_)>::value, "");
+    static_assert(std::is_same<decltype(stride), decltype(stride_)>::value);
     // It's contiguous if the stride can be calculated from shape.
     detail::CalcStride<kDim, true>(shape_, stride);
     return common::Span<size_t const, kDim>{stride_} == common::Span<size_t const, kDim>{stride};
@@ -524,16 +570,38 @@ class TensorView {
 /**
  * \brief Constructor for automatic type deduction.
  */
-template <typename Container, typename I, int32_t D,
-          std::enable_if_t<!common::detail::IsSpan<Container>::value> * = nullptr>
-auto MakeTensorView(Container &data, I const (&shape)[D], int32_t device) {  // NOLINT
+template <typename Container, typename... S,
+          std::enable_if_t<!common::detail::IsSpan<Container>::value &&
+                           !std::is_pointer_v<Container>> * = nullptr>
+auto MakeTensorView(Context const *ctx, Container &data, S &&...shape) {  // NOLINT
   using T = typename Container::value_type;
-  return TensorView<T, D>{data, shape, device};
+  std::size_t in_shape[sizeof...(S)];
+  detail::IndexToArr(in_shape, std::forward<S>(shape)...);
+  return TensorView<T, sizeof...(S)>{data, in_shape, ctx->gpu_id};
 }
 
-template <typename T, typename I, int32_t D>
-LINALG_HD auto MakeTensorView(common::Span<T> data, I const (&shape)[D], int32_t device) {
-  return TensorView<T, D>{data, shape, device};
+template <typename T, typename... S>
+LINALG_HD auto MakeTensorView(std::int32_t device, common::Span<T> data, S &&...shape) {
+  std::size_t in_shape[sizeof...(S)];
+  detail::IndexToArr(in_shape, std::forward<S>(shape)...);
+  return TensorView<T, sizeof...(S)>{data, in_shape, device};
+}
+
+template <typename T, typename... S>
+auto MakeTensorView(Context const *ctx, common::Span<T> data, S &&...shape) {
+  return MakeTensorView(ctx->gpu_id, data, std::forward<S>(shape)...);
+}
+
+template <typename T, typename... S>
+auto MakeTensorView(Context const *ctx, HostDeviceVector<T> *data, S &&...shape) {
+  auto span = ctx->IsCPU() ? data->HostSpan() : data->DeviceSpan();
+  return MakeTensorView(ctx->gpu_id, span, std::forward<S>(shape)...);
+}
+
+template <typename T, typename... S>
+auto MakeTensorView(Context const *ctx, HostDeviceVector<T> const *data, S &&...shape) {
+  auto span = ctx->IsCPU() ? data->ConstHostSpan() : data->ConstDeviceSpan();
+  return MakeTensorView(ctx->gpu_id, span, std::forward<S>(shape)...);
 }
 
 /**
@@ -548,6 +616,18 @@ LINALG_HD auto UnravelIndex(size_t idx, common::Span<size_t const, D> shape) {
   }
 }
 
+template <size_t D>
+LINALG_HD auto UnravelIndex(size_t idx, std::size_t const (&shape)[D]) {
+  return UnravelIndex(idx, common::Span<std::size_t const, D>(shape));
+}
+
+template <typename... S>
+LINALG_HD auto UnravelIndex(std::size_t idx, S... shape) {
+  std::size_t s[sizeof...(S)];
+  detail::IndexToArr(s, shape...);
+  return UnravelIndex(idx, common::Span<std::size_t const, sizeof...(S)>(s));
+}
+
 /**
  * \brief A view over a vector, specialization of Tensor
  *
@@ -615,7 +695,7 @@ Json ArrayInterface(TensorView<T const, D> const &t) {
   array_interface["version"] = 3;
 
   char constexpr kT = detail::ArrayInterfaceHandler::TypeChar<T>();
-  static_assert(kT != '\0', "");
+  static_assert(kT != '\0');
   if (DMLC_LITTLE_ENDIAN) {
     array_interface["typestr"] = String{"<" + (kT + std::to_string(sizeof(T)))};
   } else {
@@ -665,6 +745,7 @@ class Tensor {
  private:
   HostDeviceVector<T> data_;
   ShapeT shape_{0};
+  Order order_{Order::kC};
 
   template <typename I, std::int32_t D>
   void Initialize(I const (&shape)[D], std::int32_t device) {
@@ -690,11 +771,12 @@ class Tensor {
    * See \ref TensorView for parameters of this constructor.
    */
   template <typename I, int32_t D>
-  explicit Tensor(I const (&shape)[D], int32_t device)
-      : Tensor{common::Span<I const, D>{shape}, device} {}
+  explicit Tensor(I const (&shape)[D], std::int32_t device, Order order = kC)
+      : Tensor{common::Span<I const, D>{shape}, device, order} {}
 
   template <typename I, size_t D>
-  explicit Tensor(common::Span<I const, D> shape, int32_t device) {
+  explicit Tensor(common::Span<I const, D> shape, std::int32_t device, Order order = kC)
+      : order_{order} {
     // No device unroll as this is a host only function.
     std::copy(shape.data(), shape.data() + D, shape_);
     for (auto i = D; i < kDim; ++i) {
@@ -713,7 +795,8 @@ class Tensor {
    * Initialize from 2 host iterators.
    */
   template <typename It, typename I, int32_t D>
-  explicit Tensor(It begin, It end, I const (&shape)[D], int32_t device) {
+  explicit Tensor(It begin, It end, I const (&shape)[D], std::int32_t device, Order order = kC)
+      : order_{order} {
     auto &h_vec = data_.HostVector();
     h_vec.insert(h_vec.begin(), begin, end);
     // shape
@@ -721,8 +804,9 @@ class Tensor {
   }
 
   template <typename I, int32_t D>
-  explicit Tensor(std::initializer_list<T> data, I const (&shape)[D],
-                  int32_t device = Context::kCpuId) {
+  explicit Tensor(std::initializer_list<T> data, I const (&shape)[D], std::int32_t device,
+                  Order order = kC)
+      : order_{order} {
     auto &h_vec = data_.HostVector();
     h_vec = data;
     // shape
@@ -752,27 +836,27 @@ class Tensor {
     if (device >= 0) {
       data_.SetDevice(device);
       auto span = data_.DeviceSpan();
-      return {span, shape_, device};
+      return {span, shape_, device, order_};
     } else {
       auto span = data_.HostSpan();
-      return {span, shape_, device};
+      return {span, shape_, device, order_};
     }
   }
   TensorView<T const, kDim> View(int32_t device) const {
     if (device >= 0) {
       data_.SetDevice(device);
       auto span = data_.ConstDeviceSpan();
-      return {span, shape_, device};
+      return {span, shape_, device, order_};
     } else {
       auto span = data_.ConstHostSpan();
-      return {span, shape_, device};
+      return {span, shape_, device, order_};
     }
   }
 
   auto HostView() const { return this->View(-1); }
   auto HostView() { return this->View(-1); }
 
-  size_t Size() const { return data_.Size(); }
+  [[nodiscard]] size_t Size() const { return data_.Size(); }
   auto Shape() const { return common::Span<size_t const, kDim>{shape_}; }
   auto Shape(size_t i) const { return shape_[i]; }
 
@@ -826,12 +910,26 @@ class Tensor {
   void Reshape(size_t (&shape)[D]) {
     this->Reshape(common::Span<size_t const, D>{shape});
   }
+  /**
+   * \brief Get a host view on the slice.
+   */
+  template <typename... S>
+  auto Slice(S &&...slices) const {
+    return this->HostView().Slice(std::forward<S>(slices)...);
+  }
+  /**
+   * \brief Get a host view on the slice.
+   */
+  template <typename... S>
+  auto Slice(S &&...slices) {
+    return this->HostView().Slice(std::forward<S>(slices)...);
+  }
 
   /**
    * \brief Set device ordinal for this tensor.
    */
   void SetDevice(int32_t device) const { data_.SetDevice(device); }
-  int32_t DeviceIdx() const { return data_.DeviceIdx(); }
+  [[nodiscard]] int32_t DeviceIdx() const { return data_.DeviceIdx(); }
 };
 
 template <typename T>
@@ -889,8 +987,7 @@ void Stack(Tensor<T, D> *l, Tensor<T, D> const &r) {
     shape[0] = l->Shape(0) + r.Shape(0);
   });
 }
-}  // namespace linalg
-}  // namespace xgboost
+}  // namespace xgboost::linalg
 
 #if defined(LINALG_HD)
 #undef LINALG_HD
diff --git a/include/xgboost/metric.h b/include/xgboost/metric.h
index 2be6d5591..3e405cf58 100644
--- a/include/xgboost/metric.h
+++ b/include/xgboost/metric.h
@@ -8,15 +8,16 @@
 #define XGBOOST_METRIC_H_
 
 #include <dmlc/registry.h>
-#include <xgboost/model.h>
-#include <xgboost/data.h>
 #include <xgboost/base.h>
+#include <xgboost/data.h>
 #include <xgboost/host_device_vector.h>
+#include <xgboost/model.h>
 
-#include <vector>
-#include <string>
 #include <functional>
+#include <memory>  // shared_ptr
+#include <string>
 #include <utility>
+#include <vector>
 
 namespace xgboost {
 struct Context;
@@ -27,7 +28,7 @@ struct Context;
  */
 class Metric : public Configurable {
  protected:
-  Context const* ctx_;
+  Context const* ctx_{nullptr};
 
  public:
   /*!
diff --git a/include/xgboost/objective.h b/include/xgboost/objective.h
index 0341a27a1..a04d2e453 100644
--- a/include/xgboost/objective.h
+++ b/include/xgboost/objective.h
@@ -116,12 +116,13 @@ class ObjFunction : public Configurable {
    *
    * \param position The leaf index for each rows.
    * \param info MetaInfo providing labels and weights.
+   * \param learning_rate The learning rate for current iteration.
    * \param prediction Model prediction after transformation.
    * \param group_idx The group index for this tree, 0 when it's not multi-target or multi-class.
    * \param p_tree Tree that needs to be updated.
    */
   virtual void UpdateTreeLeaf(HostDeviceVector<bst_node_t> const& /*position*/,
-                              MetaInfo const& /*info*/,
+                              MetaInfo const& /*info*/, float /*learning_rate*/,
                               HostDeviceVector<float> const& /*prediction*/,
                               std::int32_t /*group_idx*/, RegTree* /*p_tree*/) const {}
 
diff --git a/include/xgboost/predictor.h b/include/xgboost/predictor.h
index 438c23465..50665341a 100644
--- a/include/xgboost/predictor.h
+++ b/include/xgboost/predictor.h
@@ -14,6 +14,8 @@
 #include <functional>  // std::function
 #include <memory>
 #include <string>
+#include <thread>   // for get_id
+#include <utility>  // for make_pair
 #include <vector>
 
 // Forward declarations
@@ -48,18 +50,17 @@ struct PredictionCacheEntry {
  * \brief A container for managed prediction caches.
  */
 class PredictionContainer : public DMatrixCache<PredictionCacheEntry> {
-  // we cache up to 32 DMatrix
-  std::size_t static constexpr DefaultSize() { return 32; }
+  // We cache up to 64 DMatrix for all threads
+  std::size_t static constexpr DefaultSize() { return 64; }
 
  public:
   PredictionContainer() : DMatrixCache<PredictionCacheEntry>{DefaultSize()} {}
-  PredictionCacheEntry& Cache(std::shared_ptr<DMatrix> m, int32_t device) {
-    this->CacheItem(m);
-    auto p_cache = this->container_.find(m.get());
+  PredictionCacheEntry& Cache(std::shared_ptr<DMatrix> m, std::int32_t device) {
+    auto p_cache = this->CacheItem(m);
     if (device != Context::kCpuId) {
-      p_cache->second.Value().predictions.SetDevice(device);
+      p_cache->predictions.SetDevice(device);
     }
-    return p_cache->second.Value();
+    return *p_cache;
   }
 };
 
diff --git a/include/xgboost/tree_updater.h b/include/xgboost/tree_updater.h
index 5cf8fb05c..59f4c2cf8 100644
--- a/include/xgboost/tree_updater.h
+++ b/include/xgboost/tree_updater.h
@@ -24,6 +24,9 @@
 #include <vector>
 
 namespace xgboost {
+namespace tree {
+struct TrainParam;
+}
 
 class Json;
 struct Context;
@@ -56,8 +59,10 @@ class TreeUpdater : public Configurable {
    *        tree can be used.
    */
   virtual bool HasNodePosition() const { return false; }
-  /*!
+  /**
    * \brief perform update to the tree models
+   *
+   * \param param Hyper-parameter for constructing trees.
    * \param gpair the gradient pair statistics of the data
    * \param data The data matrix passed to the updater.
    * \param out_position The leaf index for each row.  The index is negated if that row is
@@ -67,8 +72,8 @@ class TreeUpdater : public Configurable {
    *         but maybe different random seeds, usually one tree is passed in at a time,
    *         there can be multiple trees when we train random forest style model
    */
-  virtual void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* data,
-                      common::Span<HostDeviceVector<bst_node_t>> out_position,
+  virtual void Update(tree::TrainParam const* param, HostDeviceVector<GradientPair>* gpair,
+                      DMatrix* data, common::Span<HostDeviceVector<bst_node_t>> out_position,
                       const std::vector<RegTree*>& out_trees) = 0;
 
   /*!
diff --git a/include/xgboost/version_config.h b/include/xgboost/version_config.h
index 3eb87e664..8005b8391 100644
--- a/include/xgboost/version_config.h
+++ b/include/xgboost/version_config.h
@@ -1,11 +1,11 @@
-/*!
- * Copyright 2019 XGBoost contributors
+/**
+ * Copyright 2019-2023 by XGBoost contributors
  */
 #ifndef XGBOOST_VERSION_CONFIG_H_
 #define XGBOOST_VERSION_CONFIG_H_
 
-#define XGBOOST_VER_MAJOR 2
-#define XGBOOST_VER_MINOR 0
-#define XGBOOST_VER_PATCH 0
+#define XGBOOST_VER_MAJOR 2  /* NOLINT */
+#define XGBOOST_VER_MINOR 0  /* NOLINT */
+#define XGBOOST_VER_PATCH 0  /* NOLINT */
 
 #endif  // XGBOOST_VERSION_CONFIG_H_
diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index 2c30d512c..852cf7f69 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -181,7 +181,7 @@
                     <plugin>
                         <groupId>org.apache.maven.plugins</groupId>
                         <artifactId>maven-assembly-plugin</artifactId>
-                        <version>3.4.2</version>
+                        <version>3.5.0</version>
                         <configuration>
                             <descriptorRefs>
                                 <descriptorRef>jar-with-dependencies</descriptorRef>
@@ -392,7 +392,7 @@
             <plugin>
                 <groupId>net.alchim31.maven</groupId>
                 <artifactId>scala-maven-plugin</artifactId>
-                <version>4.8.0</version>
+                <version>4.8.1</version>
                 <executions>
                     <execution>
                         <id>compile</id>
@@ -455,7 +455,7 @@
             <plugin>
                 <groupId>net.alchim31.maven</groupId>
                 <artifactId>scala-maven-plugin</artifactId>
-                <version>4.8.0</version>
+                <version>4.8.1</version>
                 <configuration>
                     <jvmArgs>
                         <jvmArg>-Xms64m</jvmArg>
diff --git a/jvm-packages/xgboost4j-gpu/pom.xml b/jvm-packages/xgboost4j-gpu/pom.xml
index cd3975156..4d35d2e76 100644
--- a/jvm-packages/xgboost4j-gpu/pom.xml
+++ b/jvm-packages/xgboost4j-gpu/pom.xml
@@ -68,7 +68,7 @@
             <plugin>
                 <groupId>org.apache.maven.plugins</groupId>
                 <artifactId>maven-javadoc-plugin</artifactId>
-                <version>3.4.1</version>
+                <version>3.5.0</version>
                 <configuration>
                     <show>protected</show>
                     <nohelp>true</nohelp>
diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml
index 66be34b88..dcc4bf60c 100644
--- a/jvm-packages/xgboost4j/pom.xml
+++ b/jvm-packages/xgboost4j/pom.xml
@@ -56,7 +56,7 @@
           <plugin>
               <groupId>org.apache.maven.plugins</groupId>
               <artifactId>maven-javadoc-plugin</artifactId>
-              <version>3.4.1</version>
+              <version>3.5.0</version>
               <configuration>
                   <show>protected</show>
                   <nohelp>true</nohelp>
diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt
index 485f1cc3c..7026238e3 100644
--- a/plugin/CMakeLists.txt
+++ b/plugin/CMakeLists.txt
@@ -15,7 +15,7 @@ if (PLUGIN_UPDATER_ONEAPI)
   target_link_libraries(oneapi_plugin PUBLIC -fsycl)
   set_target_properties(oneapi_plugin PROPERTIES
     COMPILE_FLAGS -fsycl
-    CXX_STANDARD 14
+    CXX_STANDARD 17
     CXX_STANDARD_REQUIRED ON
     POSITION_INDEPENDENT_CODE ON)
   if (USE_OPENMP)
diff --git a/python-package/xgboost/callback.py b/python-package/xgboost/callback.py
index 76350d839..5be6a058a 100644
--- a/python-package/xgboost/callback.py
+++ b/python-package/xgboost/callback.py
@@ -23,7 +23,13 @@ from typing import (
 import numpy
 
 from . import collective
-from .core import Booster, DMatrix, XGBoostError, _get_booster_layer_trees
+from .core import (
+    Booster,
+    DMatrix,
+    XGBoostError,
+    _get_booster_layer_trees,
+    _parse_eval_str,
+)
 
 __all__ = [
     "TrainingCallback",
@@ -250,11 +256,7 @@ class CallbackContainer:
             for _, name in evals:
                 assert name.find("-") == -1, "Dataset name should not contain `-`"
             score: str = model.eval_set(evals, epoch, self.metric, self._output_margin)
-            splited = score.split()[1:]  # into datasets
-            # split up `test-error:0.1234`
-            metric_score_str = [tuple(s.split(":")) for s in splited]
-            # convert to float
-            metric_score = [(n, float(s)) for n, s in metric_score_str]
+            metric_score = _parse_eval_str(score)
             self._update_history(metric_score, epoch)
         ret = any(c.after_iteration(model, epoch, self.history) for c in self.callbacks)
         return ret
diff --git a/python-package/xgboost/collective.py b/python-package/xgboost/collective.py
index 7c586cba7..4c67ccbfc 100644
--- a/python-package/xgboost/collective.py
+++ b/python-package/xgboost/collective.py
@@ -231,7 +231,7 @@ def allreduce(data: np.ndarray, op: Op) -> np.ndarray:  # pylint:disable=invalid
     if buf.base is data.base:
         buf = buf.copy()
     if buf.dtype not in DTYPE_ENUM__:
-        raise Exception(f"data type {buf.dtype} not supported")
+        raise TypeError(f"data type {buf.dtype} not supported")
     _check_call(
         _LIB.XGCommunicatorAllreduce(
             buf.ctypes.data_as(ctypes.c_void_p),
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index f3b986e93..5a0cfb3a2 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -111,6 +111,16 @@ def make_jcargs(**kwargs: Any) -> bytes:
     return from_pystr_to_cstr(json.dumps(kwargs))
 
 
+def _parse_eval_str(result: str) -> List[Tuple[str, float]]:
+    """Parse an eval result string from the booster."""
+    splited = result.split()[1:]
+    # split up `test-error:0.1234`
+    metric_score_str = [tuple(s.split(":")) for s in splited]
+    # convert to float
+    metric_score = [(n, float(s)) for n, s in metric_score_str]
+    return metric_score
+
+
 IterRange = TypeVar("IterRange", Optional[Tuple[int, int]], Tuple[int, int])
 
 
@@ -1926,6 +1936,8 @@ class Booster:
         elif isinstance(params, str) and value is not None:
             params = [(params, value)]
         for key, val in cast(Iterable[Tuple[str, str]], params):
+            if isinstance(val, np.ndarray):
+                val = val.tolist()
             if val is not None:
                 _check_call(
                     _LIB.XGBoosterSetParam(self.handle, c_str(key), c_str(str(val)))
diff --git a/python-package/xgboost/rabit.py b/python-package/xgboost/rabit.py
index 0b8f143ec..132d72178 100644
--- a/python-package/xgboost/rabit.py
+++ b/python-package/xgboost/rabit.py
@@ -136,7 +136,7 @@ def allreduce(  # pylint:disable=invalid-name
     """
     if prepare_fun is None:
         return collective.allreduce(data, collective.Op(op))
-    raise Exception("preprocessing function is no longer supported")
+    raise ValueError("preprocessing function is no longer supported")
 
 
 def version_number() -> int:
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 69bcac38d..3204f5a2a 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -43,8 +43,9 @@ from .core import (
     XGBoostError,
     _convert_ntree_limit,
     _deprecate_positional_args,
+    _parse_eval_str,
 )
-from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array
+from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array, _is_pandas_df
 from .training import train
 
 
@@ -1812,32 +1813,43 @@ class XGBRFRegressor(XGBRegressor):
         return self
 
 
+def _get_qid(
+    X: ArrayLike, qid: Optional[ArrayLike]
+) -> Tuple[ArrayLike, Optional[ArrayLike]]:
+    """Get the special `qid` column from `X` if it exists (and drop it from `X`)."""
+    if (_is_pandas_df(X) or _is_cudf_df(X)) and hasattr(X, "qid"):
+        if qid is not None:
+            raise ValueError(
+                "Found both the special column `qid` in `X` and the `qid` from the "
+                "`fit` method. Please remove one of them."
+            )
+        q_x = X.qid
+        X = X.drop("qid", axis=1)
+        return X, q_x
+    return X, qid
+
+
 @xgboost_model_doc(
-    "Implementation of the Scikit-Learn API for XGBoost Ranking.",
+    """Implementation of the Scikit-Learn API for XGBoost Ranking.""",
     ["estimators", "model"],
     end_note="""
-        .. note::
-
-            The default objective for XGBRanker is "rank:pairwise"
-
         .. note::
 
             A custom objective function is currently not supported by XGBRanker.
-            Likewise, a custom metric function is not supported either.
 
         .. note::
 
-            Query group information is required for ranking tasks by either using the
-            `group` parameter or `qid` parameter in `fit` method. This information is
-            not required in 'predict' method and multiple groups can be predicted on
-            a single call to `predict`.
+            Query group information is only required for ranking training but not
+            prediction. Multiple groups can be predicted on a single call to
+            :py:meth:`predict`.
 
         When fitting the model with the `group` parameter, your data need to be sorted
-        by query group first. `group` must be an array that contains the size of each
+        by the query group first. `group` is an array that contains the size of each
         query group.
-        When fitting the model with the `qid` parameter, your data does not need
-        sorting. `qid` must be an array that contains the group of each training
-        sample.
+
+        Similarly, when fitting the model with the `qid` parameter, the data should be
+        sorted according to query index and `qid` is an array that contains the query
+        index for each training sample.
 
         For example, if your original data look like:
 
@@ -1859,9 +1871,10 @@ class XGBRFRegressor(XGBRegressor):
         |   2   |   1       |   x_7         |
         +-------+-----------+---------------+
 
-        then `fit` method can be called with either `group` array as ``[3, 4]``
-        or with `qid` as ``[`1, 1, 1, 2, 2, 2, 2]``, that is the qid column.
-""",
+        then :py:meth:`fit` method can be called with either `group` array as ``[3, 4]``
+        or with `qid` as ``[1, 1, 1, 2, 2, 2, 2]``, that is the qid column.  Also, the
+        `qid` can be a special column of input `X` instead of a separate parameter, see
+        :py:meth:`fit` for more info.""",
 )
 class XGBRanker(XGBModel, XGBRankerMixIn):
     # pylint: disable=missing-docstring,too-many-arguments,invalid-name
@@ -1873,6 +1886,16 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
         if "rank:" not in objective:
             raise ValueError("please use XGBRanker for ranking task")
 
+    def _create_ltr_dmatrix(
+        self, ref: Optional[DMatrix], data: ArrayLike, qid: ArrayLike, **kwargs: Any
+    ) -> DMatrix:
+        data, qid = _get_qid(data, qid)
+
+        if kwargs.get("group", None) is None and qid is None:
+            raise ValueError("Either `group` or `qid` is required for ranking task")
+
+        return super()._create_dmatrix(ref=ref, data=data, qid=qid, **kwargs)
+
     @_deprecate_positional_args
     def fit(
         self,
@@ -1907,6 +1930,23 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
         X :
             Feature matrix. See :ref:`py-data` for a list of supported types.
 
+            When this is a :py:class:`pandas.DataFrame` or a :py:class:`cudf.DataFrame`,
+            it may contain a special column called ``qid`` for specifying the query
+            index. Using a special column is the same as using the `qid` parameter,
+            except for being compatible with sklearn utility functions like
+            :py:func:`sklearn.model_selection.cross_validate`. The same convention
+            applies to the :py:meth:`XGBRanker.score` and :py:meth:`XGBRanker.predict`.
+
+            +-----+----------------+----------------+
+            | qid | feat_0         | feat_1         |
+            +-----+----------------+----------------+
+            | 0   | :math:`x_{00}` | :math:`x_{01}` |
+            +-----+----------------+----------------+
+            | 1   | :math:`x_{10}` | :math:`x_{11}` |
+            +-----+----------------+----------------+
+            | 1   | :math:`x_{20}` | :math:`x_{21}` |
+            +-----+----------------+----------------+
+
             When the ``tree_method`` is set to ``hist`` or ``gpu_hist``, internally, the
             :py:class:`QuantileDMatrix` will be used instead of the :py:class:`DMatrix`
             for conserving memory. However, this has performance implications when the
@@ -1916,12 +1956,12 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
         y :
             Labels
         group :
-            Size of each query group of training data. Should have as many elements as the
-            query groups in the training data.  If this is set to None, then user must
-            provide qid.
+            Size of each query group of training data. Should have as many elements as
+            the query groups in the training data.  If this is set to None, then user
+            must provide qid.
         qid :
             Query ID for each training sample.  Should have the size of n_samples.  If
-            this is set to None, then user must provide group.
+            this is set to None, then user must provide group or a special column in X.
         sample_weight :
             Query group weights
 
@@ -1929,8 +1969,9 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
 
                 In ranking task, one weight is assigned to each query group/id (not each
                 data point). This is because we only care about the relative ordering of
-                data points within each group, so it doesn't make sense to assign weights
-                to individual data points.
+                data points within each group, so it doesn't make sense to assign
+                weights to individual data points.
+
         base_margin :
             Global bias for each instance.
         eval_set :
@@ -1942,7 +1983,8 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
             query groups in the ``i``-th pair in **eval_set**.
         eval_qid :
             A list in which ``eval_qid[i]`` is the array containing query ID of ``i``-th
-            pair in **eval_set**.
+            pair in **eval_set**. The special column convention in `X` applies to
+            validation datasets as well.
 
         eval_metric : str, list of str, optional
             .. deprecated:: 1.6.0
@@ -1985,16 +2027,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
                 Use `callbacks` in :py:meth:`__init__` or :py:meth:`set_params` instead.
 
         """
-        # check if group information is provided
         with config_context(verbosity=self.verbosity):
-            if group is None and qid is None:
-                raise ValueError("group or qid is required for ranking task")
-
-            if eval_set is not None:
-                if eval_group is None and eval_qid is None:
-                    raise ValueError(
-                        "eval_group or eval_qid is required if eval_set is not None"
-                    )
             train_dmatrix, evals = _wrap_evaluation_matrices(
                 missing=self.missing,
                 X=X,
@@ -2009,7 +2042,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
                 base_margin_eval_set=base_margin_eval_set,
                 eval_group=eval_group,
                 eval_qid=eval_qid,
-                create_dmatrix=self._create_dmatrix,
+                create_dmatrix=self._create_ltr_dmatrix,
                 enable_categorical=self.enable_categorical,
                 feature_types=self.feature_types,
             )
@@ -2044,3 +2077,59 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
 
             self._set_evaluation_result(evals_result)
             return self
+
+    def predict(
+        self,
+        X: ArrayLike,
+        output_margin: bool = False,
+        ntree_limit: Optional[int] = None,
+        validate_features: bool = True,
+        base_margin: Optional[ArrayLike] = None,
+        iteration_range: Optional[Tuple[int, int]] = None,
+    ) -> ArrayLike:
+        X, _ = _get_qid(X, None)
+        return super().predict(
+            X,
+            output_margin,
+            ntree_limit,
+            validate_features,
+            base_margin,
+            iteration_range,
+        )
+
+    def apply(
+        self,
+        X: ArrayLike,
+        ntree_limit: int = 0,
+        iteration_range: Optional[Tuple[int, int]] = None,
+    ) -> ArrayLike:
+        X, _ = _get_qid(X, None)
+        return super().apply(X, ntree_limit, iteration_range)
+
+    def score(self, X: ArrayLike, y: ArrayLike) -> float:
+        """Evaluate score for data using the last evaluation metric.
+
+        Parameters
+        ----------
+        X : pd.DataFrame|cudf.DataFrame
+          Feature matrix. A DataFrame with a special `qid` column.
+
+        y :
+          Labels
+
+        Returns
+        -------
+        score :
+          The result of the last evaluation metric for the ranker.
+
+        """
+        X, qid = _get_qid(X, None)
+        Xyq = DMatrix(X, y, qid=qid)
+        if callable(self.eval_metric):
+            metric = ltr_metric_decorator(self.eval_metric, self.n_jobs)
+            result_str = self.get_booster().eval_set([(Xyq, "eval")], feval=metric)
+        else:
+            result_str = self.get_booster().eval(Xyq)
+
+        metric_score = _parse_eval_str(result_str)
+        return metric_score[-1][1]
diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
index 6d9733817..745c9348f 100644
--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@@ -34,12 +34,12 @@ from pyspark.sql.types import (
     ShortType,
 )
 from scipy.special import expit, softmax  # pylint: disable=no-name-in-module
-from xgboost.compat import is_cudf_available
-from xgboost.core import Booster
-from xgboost.training import train as worker_train
 
 import xgboost
 from xgboost import XGBClassifier, XGBRanker, XGBRegressor
+from xgboost.compat import is_cudf_available
+from xgboost.core import Booster
+from xgboost.training import train as worker_train
 
 from .data import (
     _read_csr_matrix_from_unwrapped_spark_vec,
@@ -314,8 +314,19 @@ class _SparkXGBParams(
                 raise ValueError("Only string type 'objective' param is allowed.")
 
         if self.getOrDefault(self.eval_metric) is not None:
-            if not isinstance(self.getOrDefault(self.eval_metric), str):
-                raise ValueError("Only string type 'eval_metric' param is allowed.")
+            if not (
+                isinstance(self.getOrDefault(self.eval_metric), str)
+                or (
+                    isinstance(self.getOrDefault(self.eval_metric), List)
+                    and all(
+                        isinstance(metric, str)
+                        for metric in self.getOrDefault(self.eval_metric)
+                    )
+                )
+            ):
+                raise ValueError(
+                    "Only string type or list of string type 'eval_metric' param is allowed."
+                )
 
         if self.getOrDefault(self.early_stopping_rounds) is not None:
             if not (
diff --git a/python-package/xgboost/spark/data.py b/python-package/xgboost/spark/data.py
index e5a0eac94..6e2d4c6db 100644
--- a/python-package/xgboost/spark/data.py
+++ b/python-package/xgboost/spark/data.py
@@ -6,9 +6,9 @@ from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tupl
 import numpy as np
 import pandas as pd
 from scipy.sparse import csr_matrix
-from xgboost.compat import concat
 
 from xgboost import DataIter, DMatrix, QuantileDMatrix, XGBModel
+from xgboost.compat import concat
 
 from .._typing import ArrayLike
 from ..core import _convert_ntree_limit
diff --git a/python-package/xgboost/spark/model.py b/python-package/xgboost/spark/model.py
index 6b050a468..888bc9cc5 100644
--- a/python-package/xgboost/spark/model.py
+++ b/python-package/xgboost/spark/model.py
@@ -8,6 +8,7 @@ import uuid
 from pyspark import SparkFiles, cloudpickle
 from pyspark.ml.util import DefaultParamsReader, DefaultParamsWriter, MLReader, MLWriter
 from pyspark.sql import SparkSession
+
 from xgboost.core import Booster
 
 from .utils import get_class_name, get_logger
diff --git a/python-package/xgboost/spark/utils.py b/python-package/xgboost/spark/utils.py
index 189396089..979c40ea9 100644
--- a/python-package/xgboost/spark/utils.py
+++ b/python-package/xgboost/spark/utils.py
@@ -8,9 +8,9 @@ from typing import Any, Callable, Dict, Set, Type
 import pyspark
 from pyspark import BarrierTaskContext, SparkContext
 from pyspark.sql.session import SparkSession
-from xgboost.tracker import RabitTracker
 
 from xgboost import collective
+from xgboost.tracker import RabitTracker
 
 
 def get_class_name(cls: Type) -> str:
diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py
index 3702885c0..3b33e8774 100644
--- a/python-package/xgboost/testing/__init__.py
+++ b/python-package/xgboost/testing/__init__.py
@@ -33,10 +33,10 @@ from urllib import request
 import numpy as np
 import pytest
 from scipy import sparse
-from xgboost.core import ArrayLike
-from xgboost.sklearn import SklObjective
 
 import xgboost as xgb
+from xgboost.core import ArrayLike
+from xgboost.sklearn import SklObjective
 
 hypothesis = pytest.importorskip("hypothesis")
 
diff --git a/python-package/xgboost/testing/dask.py b/python-package/xgboost/testing/dask.py
index e1f714294..8b39ba122 100644
--- a/python-package/xgboost/testing/dask.py
+++ b/python-package/xgboost/testing/dask.py
@@ -2,9 +2,9 @@
 import numpy as np
 from dask import array as da
 from distributed import Client
-from xgboost.testing.updater import get_basescore
 
 import xgboost as xgb
+from xgboost.testing.updater import get_basescore
 
 
 def check_init_estimation_clf(tree_method: str, client: Client) -> None:
diff --git a/python-package/xgboost/testing/data.py b/python-package/xgboost/testing/data.py
index 791ffd7ec..4f79d7358 100644
--- a/python-package/xgboost/testing/data.py
+++ b/python-package/xgboost/testing/data.py
@@ -2,6 +2,7 @@
 from typing import Any, Generator, Tuple, Union
 
 import numpy as np
+
 from xgboost.data import pandas_pyarrow_mapper
 
 
diff --git a/python-package/xgboost/testing/ranking.py b/python-package/xgboost/testing/ranking.py
new file mode 100644
index 000000000..fe4fc8404
--- /dev/null
+++ b/python-package/xgboost/testing/ranking.py
@@ -0,0 +1,72 @@
+# pylint: disable=too-many-locals
+"""Tests for learning to rank."""
+from types import ModuleType
+from typing import Any
+
+import numpy as np
+import pytest
+
+import xgboost as xgb
+from xgboost import testing as tm
+
+
+def run_ranking_qid_df(impl: ModuleType, tree_method: str) -> None:
+    """Test ranking with qid packed into X."""
+    import scipy.sparse
+    from sklearn.metrics import mean_squared_error
+    from sklearn.model_selection import StratifiedGroupKFold, cross_val_score
+
+    X, y, q, _ = tm.make_ltr(n_samples=128, n_features=2, n_query_groups=8, max_rel=3)
+
+    # pack qid into x using dataframe
+    df = impl.DataFrame(X)
+    df["qid"] = q
+    ranker = xgb.XGBRanker(n_estimators=3, eval_metric="ndcg", tree_method=tree_method)
+    ranker.fit(df, y)
+    s = ranker.score(df, y)
+    assert s > 0.7
+
+    # works with validation datasets as well
+    valid_df = df.copy()
+    valid_df.iloc[0, 0] = 3.0
+    ranker.fit(df, y, eval_set=[(valid_df, y)])
+
+    # same as passing qid directly
+    ranker = xgb.XGBRanker(n_estimators=3, eval_metric="ndcg", tree_method=tree_method)
+    ranker.fit(X, y, qid=q)
+    s1 = ranker.score(df, y)
+    assert np.isclose(s, s1)
+
+    # Works with standard sklearn cv
+    if tree_method != "gpu_hist":
+        # we need cuML for this.
+        kfold = StratifiedGroupKFold(shuffle=False)
+        results = cross_val_score(ranker, df, y, cv=kfold, groups=df.qid)
+        assert len(results) == 5
+
+    # Works with custom metric
+    def neg_mse(*args: Any, **kwargs: Any) -> float:
+        return -float(mean_squared_error(*args, **kwargs))
+
+    ranker = xgb.XGBRanker(n_estimators=3, eval_metric=neg_mse, tree_method=tree_method)
+    ranker.fit(df, y, eval_set=[(valid_df, y)])
+    score = ranker.score(valid_df, y)
+    assert np.isclose(score, ranker.evals_result()["validation_0"]["neg_mse"][-1])
+
+    # Works with sparse data
+    if tree_method != "gpu_hist":
+        # no sparse with cuDF
+        X_csr = scipy.sparse.csr_matrix(X)
+        df = impl.DataFrame.sparse.from_spmatrix(
+            X_csr, columns=[str(i) for i in range(X.shape[1])]
+        )
+        df["qid"] = q
+        ranker = xgb.XGBRanker(
+            n_estimators=3, eval_metric="ndcg", tree_method=tree_method
+        )
+        ranker.fit(df, y)
+        s2 = ranker.score(df, y)
+        assert np.isclose(s2, s)
+
+    with pytest.raises(ValueError, match="Either `group` or `qid`."):
+        ranker.fit(df, y, eval_set=[(X, y)])
diff --git a/python-package/xgboost/testing/shared.py b/python-package/xgboost/testing/shared.py
index 92c5f1e0d..930873163 100644
--- a/python-package/xgboost/testing/shared.py
+++ b/python-package/xgboost/testing/shared.py
@@ -8,9 +8,9 @@ import tempfile
 from typing import Any, Callable, Dict, Type
 
 import numpy as np
-from xgboost._typing import ArrayLike
 
 import xgboost as xgb
+from xgboost._typing import ArrayLike
 
 
 def validate_leaf_output(leaf: np.ndarray, num_parallel_tree: int) -> None:
diff --git a/python-package/xgboost/testing/updater.py b/python-package/xgboost/testing/updater.py
index 1b675e51f..4086f92c8 100644
--- a/python-package/xgboost/testing/updater.py
+++ b/python-package/xgboost/testing/updater.py
@@ -1,9 +1,12 @@
 """Tests for updaters."""
 import json
+from functools import partial, update_wrapper
+from typing import Dict
 
 import numpy as np
 
 import xgboost as xgb
+import xgboost.testing as tm
 
 
 def get_basescore(model: xgb.XGBModel) -> float:
@@ -68,3 +71,91 @@ def check_init_estimation(tree_method: str) -> None:
         n_samples=4096, n_labels=3, n_classes=5, random_state=17
     )
     run_clf(X, y)
+
+
+# pylint: disable=too-many-locals
+def check_quantile_loss(tree_method: str, weighted: bool) -> None:
+    """Test for quantile loss."""
+    from sklearn.datasets import make_regression
+    from sklearn.metrics import mean_pinball_loss
+
+    from xgboost.sklearn import _metric_decorator
+
+    n_samples = 4096
+    n_features = 8
+    n_estimators = 8
+    # non-zero base score can cause floating point difference with GPU predictor.
+    # multi-class differs slightly from single target in the prediction kernel
+    base_score = 0.0
+    rng = np.random.RandomState(1994)
+    # pylint: disable=unbalanced-tuple-unpacking
+    X, y = make_regression(
+        n_samples=n_samples,
+        n_features=n_features,
+        random_state=rng,
+    )
+    if weighted:
+        weight = rng.random(size=n_samples)
+    else:
+        weight = None
+
+    Xy = xgb.QuantileDMatrix(X, y, weight=weight)
+
+    alpha = np.array([0.1, 0.5])
+    evals_result: Dict[str, Dict] = {}
+    booster_multi = xgb.train(
+        {
+            "objective": "reg:quantileerror",
+            "tree_method": tree_method,
+            "quantile_alpha": alpha,
+            "base_score": base_score,
+        },
+        Xy,
+        num_boost_round=n_estimators,
+        evals=[(Xy, "Train")],
+        evals_result=evals_result,
+    )
+    predt_multi = booster_multi.predict(Xy, strict_shape=True)
+
+    assert tm.non_increasing(evals_result["Train"]["quantile"])
+    assert evals_result["Train"]["quantile"][-1] < 20.0
+    # check that there's a way to use custom metric and compare the results.
+    metrics = [
+        _metric_decorator(
+            update_wrapper(
+                partial(mean_pinball_loss, sample_weight=weight, alpha=alpha[i]),
+                mean_pinball_loss,
+            )
+        )
+        for i in range(alpha.size)
+    ]
+
+    predts = np.empty(predt_multi.shape)
+    for i in range(alpha.shape[0]):
+        a = alpha[i]
+
+        booster_i = xgb.train(
+            {
+                "objective": "reg:quantileerror",
+                "tree_method": tree_method,
+                "quantile_alpha": a,
+                "base_score": base_score,
+            },
+            Xy,
+            num_boost_round=n_estimators,
+            evals=[(Xy, "Train")],
+            custom_metric=metrics[i],
+            evals_result=evals_result,
+        )
+        assert tm.non_increasing(evals_result["Train"]["quantile"])
+        assert evals_result["Train"]["quantile"][-1] < 30.0
+        np.testing.assert_allclose(
+            np.array(evals_result["Train"]["quantile"]),
+            np.array(evals_result["Train"]["mean_pinball_loss"]),
+            atol=1e-6,
+            rtol=1e-6,
+        )
+        predts[:, i] = booster_i.predict(Xy)
+
+    for i in range(alpha.shape[0]):
+        np.testing.assert_allclose(predts[:, i], predt_multi[:, i])
diff --git a/rabit/CMakeLists.txt b/rabit/CMakeLists.txt
index ad39fb249..ab8171b2b 100644
--- a/rabit/CMakeLists.txt
+++ b/rabit/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.3)
+cmake_minimum_required(VERSION 3.18)
 
 find_package(Threads REQUIRED)
 
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 6069da064..59cb429da 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -455,7 +455,8 @@ XGB_DLL int XGDMatrixCreateFromCSC(char const *indptr, char const *indices, char
   xgboost_CHECK_C_ARG_PTR(indptr);
   xgboost_CHECK_C_ARG_PTR(indices);
   xgboost_CHECK_C_ARG_PTR(data);
-  data::CSCArrayAdapter adapter{StringView{indptr}, StringView{indices}, StringView{data}, nrow};
+  data::CSCArrayAdapter adapter{StringView{indptr}, StringView{indices}, StringView{data},
+                                static_cast<std::size_t>(nrow)};
   xgboost_CHECK_C_ARG_PTR(c_json_config);
   auto config = Json::Load(StringView{c_json_config});
   float missing = GetMissing(config);
diff --git a/src/collective/communicator-inl.h b/src/collective/communicator-inl.h
index f9fe8f187..702bda256 100644
--- a/src/collective/communicator-inl.h
+++ b/src/collective/communicator-inl.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2022 XGBoost contributors
+/**
+ * Copyright 2022-2023 by XGBoost contributors
  */
 #pragma once
 #include <string>
@@ -9,7 +9,7 @@
 namespace xgboost {
 namespace collective {
 
-/*!
+/**
  * \brief Initialize the collective communicator.
  *
  *  Currently the communicator API is experimental, function signatures may change in the future
@@ -140,6 +140,19 @@ inline void Broadcast(std::string *sendrecv_data, int root) {
   }
 }
 
+/**
+ * @brief Gathers data from all processes and distributes it to all processes.
+ *
+ * This assumes all ranks have the same size, and input data has been sliced into the
+ * corresponding position.
+ *
+ * @param send_receive_buffer Buffer storing the data.
+ * @param size                Size of the data in bytes.
+ */
+inline void Allgather(void *send_receive_buffer, std::size_t size) {
+  Communicator::Get()->AllGather(send_receive_buffer, size);
+}
+
 /*!
  * \brief Perform in-place allreduce. This function is NOT thread-safe.
  *
@@ -197,7 +210,7 @@ inline void Allreduce(uint64_t *send_receive_buffer, size_t count) {
 template <Operation op, typename T,
           typename = std::enable_if_t<std::is_same<size_t, T>{} && !std::is_same<uint64_t, T>{}> >
 inline void Allreduce(T *send_receive_buffer, size_t count) {
-  static_assert(sizeof(T) == sizeof(uint64_t), "");
+  static_assert(sizeof(T) == sizeof(uint64_t));
   Communicator::Get()->AllReduce(send_receive_buffer, count, DataType::kUInt64, op);
 }
 
diff --git a/src/common/algorithm.h b/src/common/algorithm.h
index a5d2d1974..739a84968 100644
--- a/src/common/algorithm.h
+++ b/src/common/algorithm.h
@@ -1,10 +1,32 @@
-/*!
- * Copyright 2022 by XGBoost Contributors
+/**
+ * Copyright 2022-2023 by XGBoost Contributors
  */
 #ifndef XGBOOST_COMMON_ALGORITHM_H_
 #define XGBOOST_COMMON_ALGORITHM_H_
-#include <algorithm>  // std::upper_bound
-#include <cinttypes>  // std::size_t
+#include <algorithm>          // upper_bound, stable_sort, sort, max
+#include <cinttypes>          // size_t
+#include <functional>         // less
+#include <iterator>           // iterator_traits, distance
+#include <vector>             // vector
+
+#include "numeric.h"          // Iota
+#include "xgboost/context.h"  // Context
+
+// clang with libstdc++ works as well
+#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__sun) && !defined(sun) && \
+    !defined(__APPLE__) && __has_include(<omp.h>)
+#define GCC_HAS_PARALLEL 1
+#endif  // GCC_HAS_PARALLEL detection
+
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#define MSVC_HAS_PARALLEL 1
+#endif  // MSC
+
+#if defined(GCC_HAS_PARALLEL)
+#include <parallel/algorithm>
+#elif defined(MSVC_HAS_PARALLEL)
+#include <ppl.h>
+#endif  // GCC_HAS_PARALLEL, MSVC_HAS_PARALLEL
 
 namespace xgboost {
 namespace common {
@@ -13,6 +35,63 @@ auto SegmentId(It first, It last, Idx idx) {
   std::size_t segment_id = std::upper_bound(first, last, idx) - 1 - first;
   return segment_id;
 }
+
+template <typename Iter, typename Comp>
+void StableSort(Context const *ctx, Iter begin, Iter end, Comp &&comp) {
+  if (ctx->Threads() > 1) {
+#if defined(GCC_HAS_PARALLEL)
+    __gnu_parallel::stable_sort(begin, end, comp,
+                                __gnu_parallel::default_parallel_tag(ctx->Threads()));
+#else
+    // the only stable sort is radix sort for msvc ppl.
+    std::stable_sort(begin, end, comp);
+#endif  // defined(GCC_HAS_PARALLEL)
+  } else {
+    std::stable_sort(begin, end, comp);
+  }
+}
+
+template <typename Iter, typename Comp>
+void Sort(Context const *ctx, Iter begin, Iter end, Comp comp) {
+  if (ctx->Threads() > 1) {
+#if defined(GCC_HAS_PARALLEL)
+    __gnu_parallel::sort(begin, end, comp, __gnu_parallel::default_parallel_tag(ctx->Threads()));
+#elif defined(MSVC_HAS_PARALLEL)
+    auto n = std::distance(begin, end);
+    // use chunk size as hint to number of threads. No local policy/scheduler input with the
+    // concurrency module.
+    std::size_t chunk_size = n / ctx->Threads();
+    // 2048 is the default of msvc ppl as of v2022.
+    chunk_size = std::max(chunk_size, static_cast<std::size_t>(2048));
+    concurrency::parallel_sort(begin, end, comp, chunk_size);
+#else
+    std::sort(begin, end, comp);
+#endif  // GCC_HAS_PARALLEL, MSVC_HAS_PARALLEL
+  } else {
+    std::sort(begin, end, comp);
+  }
+}
+
+template <typename Idx, typename Iter, typename V = typename std::iterator_traits<Iter>::value_type,
+          typename Comp = std::less<V>>
+std::vector<Idx> ArgSort(Context const *ctx, Iter begin, Iter end, Comp comp = std::less<V>{}) {
+  CHECK(ctx->IsCPU());
+  auto n = std::distance(begin, end);
+  std::vector<Idx> result(n);
+  Iota(ctx, result.begin(), result.end(), 0);
+  auto op = [&](Idx const &l, Idx const &r) { return comp(begin[l], begin[r]); };
+  StableSort(ctx, result.begin(), result.end(), op);
+  return result;
+}
 }  // namespace common
 }  // namespace xgboost
+
+#if defined(GCC_HAS_PARALLEL)
+#undef GCC_HAS_PARALLEL
+#endif  // defined(GCC_HAS_PARALLEL)
+
+#if defined(MSVC_HAS_PARALLEL)
+#undef MSVC_HAS_PARALLEL
+#endif  // defined(MSVC_HAS_PARALLEL)
+
 #endif  // XGBOOST_COMMON_ALGORITHM_H_
diff --git a/src/common/categorical.h b/src/common/categorical.h
index 452aaa8c1..d7e262812 100644
--- a/src/common/categorical.h
+++ b/src/common/categorical.h
@@ -42,9 +42,9 @@ constexpr inline bst_cat_t OutOfRangeCat() {
 
 inline XGBOOST_DEVICE bool InvalidCat(float cat) {
   constexpr auto kMaxCat = OutOfRangeCat();
-  static_assert(static_cast<bst_cat_t>(static_cast<float>(kMaxCat)) == kMaxCat, "");
-  static_assert(static_cast<bst_cat_t>(static_cast<float>(kMaxCat + 1)) != kMaxCat + 1, "");
-  static_assert(static_cast<float>(kMaxCat + 1) == kMaxCat, "");
+  static_assert(static_cast<bst_cat_t>(static_cast<float>(kMaxCat)) == kMaxCat);
+  static_assert(static_cast<bst_cat_t>(static_cast<float>(kMaxCat + 1)) != kMaxCat + 1);
+  static_assert(static_cast<float>(kMaxCat + 1) == kMaxCat);
   return cat < 0 || cat >= kMaxCat;
 }
 
diff --git a/src/common/charconv.cc b/src/common/charconv.cc
index 8be2c0a81..3114a90e3 100644
--- a/src/common/charconv.cc
+++ b/src/common/charconv.cc
@@ -270,7 +270,9 @@ struct RyuPowLogUtils {
    */
   static uint32_t MulPow5InvDivPow2(const uint32_t m, const uint32_t q,
                                     const int32_t j) noexcept(true) {
-    return MulShift(m, kFloatPow5InvSplit[q], j);
+    static_assert(sizeof(kFloatPow5InvSplit) == 55 * sizeof(std::uint64_t));
+    assert(q < 55);
+    return MulShift(m, kFloatPow5InvSplit[q], j);  // NOLINT
   }
 
   /*
@@ -495,12 +497,10 @@ class PowerBaseComputer {
                              static_cast<int32_t>(IEEE754::kFloatBias) -
                              static_cast<int32_t>(IEEE754::kFloatMantissaBits) -
                              static_cast<int32_t>(2);
-      static_assert(static_cast<int32_t>(1) -
-                            static_cast<int32_t>(IEEE754::kFloatBias) -
-                            static_cast<int32_t>(IEEE754::kFloatMantissaBits) -
-                            static_cast<int32_t>(2) ==
-                        -151,
-                    "");
+      static_assert(static_cast<int32_t>(1) - static_cast<int32_t>(IEEE754::kFloatBias) -
+                        static_cast<int32_t>(IEEE754::kFloatMantissaBits) -
+                        static_cast<int32_t>(2) ==
+                    -151);
       mantissa_base2 = f.mantissa;
     } else {
       base2_range.exponent = static_cast<int32_t>(f.exponent) - IEEE754::kFloatBias -
@@ -544,7 +544,7 @@ class RyuPrinter {
     // Function precondition: v is not a 10-digit number.
     // (f2s: 9 digits are sufficient for round-tripping.)
     // (d2fixed: We print 9-digit blocks.)
-    static_assert(100000000 == Tens(8), "");
+    static_assert(100000000 == Tens(8));
     assert(v < Tens(9));
     if (v >= Tens(8)) {
       return 9;
@@ -911,7 +911,7 @@ from_chars_result FromCharFloatImpl(const char *buffer, const int len,
   // the bias and also special-case the value 0.
   int32_t shift = (f_e2 == 0 ? 1 : f_e2) - exp_b2 - IEEE754::kFloatBias -
                   IEEE754::kFloatMantissaBits;
-  assert(shift >= 0);
+  assert(shift >= 1);
 
   // We need to round up if the exact value is more than 0.5 above the value we
   // computed. That's equivalent to checking if the last removed bit was 1 and
@@ -920,7 +920,7 @@ from_chars_result FromCharFloatImpl(const char *buffer, const int len,
   //
   // We need to update trailingZeros given that we have the exact output
   // exponent ieee_e2 now.
-  trailing_zeros &= (mantissa_b2 & ((1u << (shift - 1)) - 1)) == 0;
+  trailing_zeros &= (mantissa_b2 & ((1u << (shift - 1)) - 1)) == 0;  // NOLINT
   uint32_t lastRemovedBit = (mantissa_b2 >> (shift - 1)) & 1;
   bool roundup = (lastRemovedBit != 0) &&
                  (!trailing_zeros || (((mantissa_b2 >> shift) & 1) != 0));
diff --git a/src/common/charconv.h b/src/common/charconv.h
index b931ed7ce..c37b0bd96 100644
--- a/src/common/charconv.h
+++ b/src/common/charconv.h
@@ -87,7 +87,7 @@ inline to_chars_result to_chars(char *first, char *last, int64_t value) { // NOL
   if (value < 0) {
     *first = '-';
     std::advance(first, 1);
-    unsigned_value = uint64_t(~value) + uint64_t(1);
+    unsigned_value = static_cast<uint64_t>(~value) + static_cast<uint64_t>(1);
   }
   return detail::ToCharsUnsignedImpl(first, last, unsigned_value);
 }
diff --git a/src/common/column_matrix.cc b/src/common/column_matrix.cc
index 91977b96d..d8acfa7a5 100644
--- a/src/common/column_matrix.cc
+++ b/src/common/column_matrix.cc
@@ -46,7 +46,7 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
     feature_offsets_[fid] = accum_index;
   }
 
-  SetTypeSize(gmat.max_num_bins);
+  SetTypeSize(gmat.MaxNumBinPerFeat());
   auto storage_size =
       feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
   index_.resize(storage_size, 0);
diff --git a/src/common/common.h b/src/common/common.h
index 5ac764817..35c807bef 100644
--- a/src/common/common.h
+++ b/src/common/common.h
@@ -188,17 +188,6 @@ inline void SetDevice(std::int32_t device) {
 }
 #endif
 
-template <typename Idx, typename Container,
-          typename V = typename Container::value_type,
-          typename Comp = std::less<V>>
-std::vector<Idx> ArgSort(Container const &array, Comp comp = std::less<V>{}) {
-  std::vector<Idx> result(array.size());
-  std::iota(result.begin(), result.end(), 0);
-  auto op = [&array, comp](Idx const &l, Idx const &r) { return comp(array[l], array[r]); };
-  XGBOOST_PARALLEL_STABLE_SORT(result.begin(), result.end(), op);
-  return result;
-}
-
 /**
  * Last index of a group in a CSR style of index pointer.
  */
@@ -206,31 +195,6 @@ template <typename Indexable>
 XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {
   return indptr[group + 1] - 1;
 }
-
-/**
- * \brief A CRTP (curiously recurring template pattern) helper function.
- *
- * https://www.fluentcpp.com/2017/05/19/crtp-helper/
- *
- * Does two things:
- * 1. Makes "crtp" explicit in the inheritance structure of a CRTP base class.
- * 2. Avoids having to `static_cast` in a lot of places.
- *
- * \tparam T The derived class in a CRTP hierarchy.
- */
-template <typename T>
-struct Crtp {
-  T &Underlying() { return static_cast<T &>(*this); }
-  T const &Underlying() const { return static_cast<T const &>(*this); }
-};
-
-/**
- * \brief C++17 std::as_const
- */
-template <typename T>
-typename std::add_const<T>::type &AsConst(T &v) noexcept {  // NOLINT(runtime/references)
-  return v;
-}
 }  // namespace common
 }  // namespace xgboost
 #endif  // XGBOOST_COMMON_COMMON_H_
diff --git a/src/common/compressed_iterator.h b/src/common/compressed_iterator.h
index 9f60722fb..5a5b5f252 100644
--- a/src/common/compressed_iterator.h
+++ b/src/common/compressed_iterator.h
@@ -1,12 +1,13 @@
-/*!
- * Copyright 2017 by Contributors
+/**
+ * Copyright 2017-2023 by XGBoost Contributors
  * \file compressed_iterator.h
  */
 #pragma once
 #include <xgboost/base.h>
-#include <cmath>
-#include <cstddef>
+
 #include <algorithm>
+#include <cmath>
+#include <cstddef>  // for size_t
 
 #include "common.h"
 
@@ -36,7 +37,7 @@ static const int kPadding = 4;  // Assign padding so we can read slightly off
 // The number of bits required to represent a given unsigned range
 inline XGBOOST_DEVICE size_t SymbolBits(size_t num_symbols) {
   auto bits = std::ceil(log2(static_cast<double>(num_symbols)));
-  return common::Max(static_cast<size_t>(bits), size_t(1));
+  return common::Max(static_cast<size_t>(bits), static_cast<std::size_t>(1));
 }
 }  // namespace detail
 
diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index d56965dfe..58300d06c 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -20,6 +20,7 @@
 
 #include <algorithm>
 #include <chrono>
+#include <cstddef>  // for size_t
 #include <cub/cub.cuh>
 #include <cub/util_allocator.cuh>
 #include <numeric>
@@ -178,7 +179,7 @@ inline size_t MaxSharedMemory(int device_idx) {
   dh::safe_cuda(cudaDeviceGetAttribute
                 (&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
                  device_idx));
-  return size_t(max_shared_memory);
+  return static_cast<std::size_t>(max_shared_memory);
 }
 
 /**
@@ -195,7 +196,7 @@ inline size_t MaxSharedMemoryOptin(int device_idx) {
   dh::safe_cuda(cudaDeviceGetAttribute
                 (&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlockOptin,
                  device_idx));
-  return size_t(max_shared_memory);
+  return static_cast<std::size_t>(max_shared_memory);
 }
 
 inline void CheckComputeCapability() {
diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc
index 3b4d42a8d..6e83c084e 100644
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -46,7 +46,7 @@ HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, int32_t n_threads, b
   if (!use_sorted) {
     HostSketchContainer container(max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
                                   HostSketchContainer::UseGroup(info),
-                                  m->Info().data_split_mode == DataSplitMode::kCol, n_threads);
+                                  m->IsColumnSplit(), n_threads);
     for (auto const& page : m->GetBatches<SparsePage>()) {
       container.PushRowPage(page, info, hessian);
     }
@@ -54,7 +54,7 @@ HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, int32_t n_threads, b
   } else {
     SortedSketchContainer container{max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
                                     HostSketchContainer::UseGroup(info),
-                                    m->Info().data_split_mode == DataSplitMode::kCol, n_threads};
+                                    m->IsColumnSplit(), n_threads};
     for (auto const& page : m->GetBatches<SortedCSCPage>()) {
       container.PushColPage(page, info, hessian);
     }
diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu
index 2d3dff054..08ef98ea1 100644
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@@ -1,33 +1,31 @@
-/*!
- * Copyright 2018~2020 XGBoost contributors
+/**
+ * Copyright 2018~2023 by XGBoost contributors
  */
-
-#include <xgboost/logging.h>
-
+#include <thrust/binary_search.h>
 #include <thrust/copy.h>
+#include <thrust/execution_policy.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
 #include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
 #include <thrust/reduce.h>
 #include <thrust/sort.h>
-#include <thrust/binary_search.h>
-#include <thrust/execution_policy.h>
+#include <xgboost/logging.h>
 
+#include <cstddef>  // for size_t
 #include <memory>
 #include <mutex>
 #include <utility>
 #include <vector>
 
+#include "categorical.h"
 #include "device_helpers.cuh"
-#include "hist_util.h"
 #include "hist_util.cuh"
+#include "hist_util.h"
 #include "math.h"  // NOLINT
 #include "quantile.h"
-#include "categorical.h"
 #include "xgboost/host_device_vector.h"
 
-
 namespace xgboost {
 namespace common {
 
@@ -318,7 +316,7 @@ HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
     size_t batch_nnz = batch.data.Size();
     auto const& info = dmat->Info();
     for (auto begin = 0ull; begin < batch_nnz; begin += sketch_batch_num_elements) {
-      size_t end = std::min(batch_nnz, size_t(begin + sketch_batch_num_elements));
+      size_t end = std::min(batch_nnz, static_cast<std::size_t>(begin + sketch_batch_num_elements));
       if (has_weights) {
         bool is_ranking = HostSketchContainer::UseGroup(dmat->Info());
         dh::caching_device_vector<uint32_t> groups(info.group_ptr_.cbegin(),
diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh
index 7dd62b382..856404107 100644
--- a/src/common/hist_util.cuh
+++ b/src/common/hist_util.cuh
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2020 XGBoost contributors
+/**
+ * Copyright 2020-2023 by XGBoost contributors
  *
  * \brief Front end and utilities for GPU based sketching.  Works on sliding window
  *        instead of stream.
@@ -9,11 +9,13 @@
 
 #include <thrust/host_vector.h>
 
+#include <cstddef>  // for size_t
+
+#include "../data/device_adapter.cuh"
+#include "device_helpers.cuh"
 #include "hist_util.h"
 #include "quantile.cuh"
-#include "device_helpers.cuh"
 #include "timer.h"
-#include "../data/device_adapter.cuh"
 
 namespace xgboost {
 namespace common {
@@ -304,7 +306,8 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
         num_rows, num_cols, std::numeric_limits<size_t>::max(),
         device, num_cuts_per_feature, true);
     for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
-      size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
+      size_t end =
+          std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
       ProcessWeightedSlidingWindow(batch, info,
                                    num_cuts_per_feature,
                                    HostSketchContainer::UseGroup(info), missing, device, num_cols, begin, end,
@@ -316,7 +319,8 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
         num_rows, num_cols, std::numeric_limits<size_t>::max(),
         device, num_cuts_per_feature, false);
     for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
-      size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
+      size_t end =
+          std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
       ProcessSlidingWindow(batch, info, device, num_cols, begin, end, missing,
                            sketch_container, num_cuts_per_feature);
     }
diff --git a/src/common/io.cc b/src/common/io.cc
index 8405e6604..da3a75d65 100644
--- a/src/common/io.cc
+++ b/src/common/io.cc
@@ -50,7 +50,7 @@ size_t PeekableInStream::PeekRead(void* dptr, size_t size) {
   }
 }
 
-FixedSizeStream::FixedSizeStream(PeekableInStream* stream) : PeekableInStream(stream), pointer_{0} {
+FixedSizeStream::FixedSizeStream(PeekableInStream* stream) : PeekableInStream(stream) {
   size_t constexpr kInitialSize = 4096;
   size_t size{kInitialSize}, total{0};
   buffer_.clear();
diff --git a/src/common/io.h b/src/common/io.h
index bcc6c4704..2dd593c60 100644
--- a/src/common/io.h
+++ b/src/common/io.h
@@ -27,8 +27,7 @@ using MemoryBufferStream = rabit::utils::MemoryBufferStream;
  */
 class PeekableInStream : public dmlc::Stream {
  public:
-  explicit PeekableInStream(dmlc::Stream* strm)
-      : strm_(strm), buffer_ptr_(0) {}
+  explicit PeekableInStream(dmlc::Stream* strm) : strm_(strm) {}
 
   size_t Read(void* dptr, size_t size) override;
   virtual size_t PeekRead(void* dptr, size_t size);
@@ -41,7 +40,7 @@ class PeekableInStream : public dmlc::Stream {
   /*! \brief input stream */
   dmlc::Stream *strm_;
   /*! \brief current buffer pointer */
-  size_t buffer_ptr_;
+  size_t buffer_ptr_{0};
   /*! \brief internal buffer */
   std::string buffer_;
 };
@@ -72,7 +71,7 @@ class FixedSizeStream : public PeekableInStream {
   void Take(std::string* out);
 
  private:
-  size_t pointer_;
+  size_t pointer_{0};
   std::string buffer_;
 };
 
diff --git a/src/common/json.cc b/src/common/json.cc
index 0fddf87d5..8e2dd05ff 100644
--- a/src/common/json.cc
+++ b/src/common/json.cc
@@ -710,10 +710,10 @@ void Json::Dump(Json json, JsonWriter* writer) {
   writer->Save(json);
 }
 
-static_assert(std::is_nothrow_move_constructible<Json>::value, "");
-static_assert(std::is_nothrow_move_constructible<Object>::value, "");
-static_assert(std::is_nothrow_move_constructible<Array>::value, "");
-static_assert(std::is_nothrow_move_constructible<String>::value, "");
+static_assert(std::is_nothrow_move_constructible<Json>::value);
+static_assert(std::is_nothrow_move_constructible<Object>::value);
+static_assert(std::is_nothrow_move_constructible<Array>::value);
+static_assert(std::is_nothrow_move_constructible<String>::value);
 
 Json UBJReader::ParseArray() {
   auto marker = PeekNextChar();
diff --git a/src/common/numeric.cc b/src/common/numeric.cc
index 2a1ca4d44..240e0234a 100644
--- a/src/common/numeric.cc
+++ b/src/common/numeric.cc
@@ -14,7 +14,7 @@ double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
   if (ctx->IsCPU()) {
     auto const& h_values = values.ConstHostVector();
     auto result = cpu_impl::Reduce(ctx, h_values.cbegin(), h_values.cend(), 0.0);
-    static_assert(std::is_same<decltype(result), double>::value, "");
+    static_assert(std::is_same<decltype(result), double>::value);
     return result;
   }
   return cuda_impl::Reduce(ctx, values);
diff --git a/src/common/numeric.h b/src/common/numeric.h
index 7b52b7ba6..6a1c15fd0 100644
--- a/src/common/numeric.h
+++ b/src/common/numeric.h
@@ -42,8 +42,8 @@ void RunLengthEncode(Iter begin, Iter end, std::vector<Idx>* p_out) {
  */
 template <typename InIt, typename OutIt, typename T>
 void PartialSum(int32_t n_threads, InIt begin, InIt end, T init, OutIt out_it) {
-  static_assert(std::is_same<T, typename std::iterator_traits<InIt>::value_type>::value, "");
-  static_assert(std::is_same<T, typename std::iterator_traits<OutIt>::value_type>::value, "");
+  static_assert(std::is_same<T, typename std::iterator_traits<InIt>::value_type>::value);
+  static_assert(std::is_same<T, typename std::iterator_traits<OutIt>::value_type>::value);
   // The number of threads is pegged to the batch size. If the OMP block is parallelized
   // on anything other than the batch/block size, it should be reassigned
   auto n = static_cast<size_t>(std::distance(begin, end));
diff --git a/src/common/partition_builder.h b/src/common/partition_builder.h
index d52bcef87..9a9c162d2 100644
--- a/src/common/partition_builder.h
+++ b/src/common/partition_builder.h
@@ -31,6 +31,8 @@ namespace common {
 // BlockSize is template to enable memory alignment easily with C++11 'alignas()' feature
 template<size_t BlockSize>
 class PartitionBuilder {
+  using BitVector = RBitField8;
+
  public:
   template<typename Func>
   void Init(const size_t n_tasks, size_t n_nodes, Func funcNTask) {
@@ -121,27 +123,11 @@ class PartitionBuilder {
     bool default_left = tree[nid].DefaultLeft();
     bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
     auto node_cats = tree.NodeCats(nid);
-
-    auto const& index = gmat.index;
     auto const& cut_values = gmat.cut.Values();
-    auto const& cut_ptrs = gmat.cut.Ptrs();
-
-    auto gidx_calc = [&](auto ridx) {
-      auto begin = gmat.RowIdx(ridx);
-      if (gmat.IsDense()) {
-        return static_cast<bst_bin_t>(index[begin + fid]);
-      }
-      auto end = gmat.RowIdx(ridx + 1);
-      auto f_begin = cut_ptrs[fid];
-      auto f_end = cut_ptrs[fid + 1];
-      // bypassing the column matrix as we need the cut value instead of bin idx for categorical
-      // features.
-      return BinarySearchBin(begin, end, index, f_begin, f_end);
-    };
 
     auto pred_hist = [&](auto ridx, auto bin_id) {
       if (any_cat && is_cat) {
-        auto gidx = gidx_calc(ridx);
+        auto gidx = gmat.GetGindex(ridx, fid);
         bool go_left = default_left;
         if (gidx > -1) {
           go_left = Decision(node_cats, cut_values[gidx]);
@@ -153,7 +139,7 @@ class PartitionBuilder {
     };
 
     auto pred_approx = [&](auto ridx) {
-      auto gidx = gidx_calc(ridx);
+      auto gidx = gmat.GetGindex(ridx, fid);
       bool go_left = default_left;
       if (gidx > -1) {
         if (is_cat) {
@@ -199,6 +185,84 @@ class PartitionBuilder {
     SetNRightElems(node_in_set, range.begin(), n_right);
   }
 
+  /**
+   * @brief When data is split by column, we don't have all the features locally on the current
+   * worker, so we go through all the rows and mark the bit vectors on whether the decision is made
+   * to go left, or if the feature value used for the split is missing.
+   */
+  void MaskRows(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes,
+                const common::Range1d range, GHistIndexMatrix const& gmat,
+                const common::ColumnMatrix& column_matrix,
+                const RegTree& tree, const size_t* rid,
+                BitVector* decision_bits, BitVector* missing_bits) {
+    common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
+    std::size_t nid = nodes[node_in_set].nid;
+    bst_feature_t fid = tree[nid].SplitIndex();
+    bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
+    auto node_cats = tree.NodeCats(nid);
+    auto const& cut_values = gmat.cut.Values();
+
+    if (!column_matrix.IsInitialized()) {
+      for (auto row_id : rid_span) {
+        auto gidx = gmat.GetGindex(row_id, fid);
+        if (gidx > -1) {
+          bool go_left = false;
+          if (is_cat) {
+            go_left = Decision(node_cats, cut_values[gidx]);
+          } else {
+            go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
+          }
+          if (go_left) {
+            decision_bits->Set(row_id - gmat.base_rowid);
+          }
+        } else {
+          missing_bits->Set(row_id - gmat.base_rowid);
+        }
+      }
+    } else {
+      LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
+    }
+  }
+
+  /**
+   * @brief Once we've aggregated the decision and missing bits from all the workers, we can then
+   * use them to partition the rows accordingly.
+   */
+  void PartitionByMask(const size_t node_in_set,
+                       std::vector<xgboost::tree::CPUExpandEntry> const& nodes,
+                       const common::Range1d range, GHistIndexMatrix const& gmat,
+                       const common::ColumnMatrix& column_matrix, const RegTree& tree,
+                       const size_t* rid, BitVector const& decision_bits,
+                       BitVector const& missing_bits) {
+    common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
+    common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
+    common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
+    std::size_t nid = nodes[node_in_set].nid;
+    bool default_left = tree[nid].DefaultLeft();
+
+    auto pred_approx = [&](auto ridx) {
+      bool go_left = default_left;
+      bool is_missing = missing_bits.Check(ridx - gmat.base_rowid);
+      if (!is_missing) {
+        go_left = decision_bits.Check(ridx - gmat.base_rowid);
+      }
+      return go_left;
+    };
+
+    std::pair<size_t, size_t> child_nodes_sizes;
+    if (!column_matrix.IsInitialized()) {
+      child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
+    } else {
+      LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
+    }
+
+    const size_t n_left  = child_nodes_sizes.first;
+    const size_t n_right = child_nodes_sizes.second;
+
+    SetNLeftElems(node_in_set, range.begin(), n_left);
+    SetNRightElems(node_in_set, range.begin(), n_right);
+  }
+
   // allocate thread local memory, should be called for each specific task
   void AllocateForTask(size_t id) {
     if (mem_blocks_[id].get() == nullptr) {
diff --git a/src/common/quantile.cu b/src/common/quantile.cu
index 8f89ed26f..cabdc603b 100644
--- a/src/common/quantile.cu
+++ b/src/common/quantile.cu
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2020-2022 by XGBoost Contributors
+/**
+ * Copyright 2020-2023 by XGBoost Contributors
  */
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
@@ -109,7 +109,7 @@ void PruneImpl(common::Span<SketchContainer::OffsetT const> cuts_ptr,
 template <typename T, typename U>
 void CopyTo(Span<T> out, Span<U> src) {
   CHECK_EQ(out.size(), src.size());
-  static_assert(std::is_same<std::remove_cv_t<T>, std::remove_cv_t<T>>::value, "");
+  static_assert(std::is_same<std::remove_cv_t<T>, std::remove_cv_t<U>>::value);
   dh::safe_cuda(cudaMemcpyAsync(out.data(), src.data(),
                                 out.size_bytes(),
                                 cudaMemcpyDefault));
@@ -143,7 +143,7 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
       thrust::make_zip_iterator(thrust::make_tuple(b_ind_iter, place_holder));
 
   dh::XGBCachingDeviceAllocator<Tuple> alloc;
-  static_assert(sizeof(Tuple) == sizeof(SketchEntry), "");
+  static_assert(sizeof(Tuple) == sizeof(SketchEntry));
   // We reuse the memory for storing merge path.
   common::Span<Tuple> merge_path{reinterpret_cast<Tuple *>(out.data()), out.size()};
   // Determine the merge path, 0 if element is from x, 1 if it's from y.
diff --git a/src/common/random.cc b/src/common/random.cc
index f66b084cc..d0e75729d 100644
--- a/src/common/random.cc
+++ b/src/common/random.cc
@@ -24,8 +24,9 @@ std::shared_ptr<HostDeviceVector<bst_feature_t>> ColumnSampler::ColSample(
     for (size_t i = 0; i < h_features.size(); ++i) {
       weights[i] = feature_weights_[h_features[i]];
     }
+    CHECK(ctx_);
     new_features.HostVector() =
-        WeightedSamplingWithoutReplacement(p_features->HostVector(), weights, n);
+        WeightedSamplingWithoutReplacement(ctx_, p_features->HostVector(), weights, n);
   } else {
     new_features.Resize(features.size());
     std::copy(features.begin(), features.end(), new_features.HostVector().begin());
diff --git a/src/common/random.h b/src/common/random.h
index 2d29bede3..5efdb486d 100644
--- a/src/common/random.h
+++ b/src/common/random.h
@@ -20,7 +20,9 @@
 #include <vector>
 
 #include "../collective/communicator-inl.h"
+#include "algorithm.h"  // ArgSort
 #include "common.h"
+#include "xgboost/context.h"  // Context
 #include "xgboost/host_device_vector.h"
 
 namespace xgboost {
@@ -87,8 +89,8 @@ GlobalRandomEngine& GlobalRandom(); // NOLINT(*)
  * https://timvieira.github.io/blog/post/2019/09/16/algorithms-for-sampling-without-replacement/
 */
 template <typename T>
-std::vector<T> WeightedSamplingWithoutReplacement(
-    std::vector<T> const &array, std::vector<float> const &weights, size_t n) {
+std::vector<T> WeightedSamplingWithoutReplacement(Context const* ctx, std::vector<T> const& array,
+                                                  std::vector<float> const& weights, size_t n) {
   // ES sampling.
   CHECK_EQ(array.size(), weights.size());
   std::vector<float> keys(weights.size());
@@ -100,7 +102,7 @@ std::vector<T> WeightedSamplingWithoutReplacement(
     auto k = std::log(u) / w;
     keys[i] = k;
   }
-  auto ind = ArgSort<size_t>(Span<float>{keys}, std::greater<>{});
+  auto ind = ArgSort<std::size_t>(ctx, keys.data(), keys.data() + keys.size(), std::greater<>{});
   ind.resize(n);
 
   std::vector<T> results(ind.size());
@@ -126,6 +128,7 @@ class ColumnSampler {
   float colsample_bytree_{1.0f};
   float colsample_bynode_{1.0f};
   GlobalRandomEngine rng_;
+  Context const* ctx_{nullptr};  // assigned in Init(); CHECK-ed before use in ColSample()
 
  public:
   std::shared_ptr<HostDeviceVector<bst_feature_t>> ColSample(
@@ -157,12 +160,13 @@ class ColumnSampler {
    * \param colsample_bytree
    * \param skip_index_0      (Optional) True to skip index 0.
    */
-  void Init(int64_t num_col, std::vector<float> feature_weights, float colsample_bynode,
-            float colsample_bylevel, float colsample_bytree) {
+  void Init(Context const* ctx, int64_t num_col, std::vector<float> feature_weights,
+            float colsample_bynode, float colsample_bylevel, float colsample_bytree) {
     feature_weights_ = std::move(feature_weights);
     colsample_bylevel_ = colsample_bylevel;
     colsample_bytree_ = colsample_bytree;
     colsample_bynode_ = colsample_bynode;
+    ctx_ = ctx;
 
     if (feature_set_tree_ == nullptr) {
       feature_set_tree_ = std::make_shared<HostDeviceVector<bst_feature_t>>();
diff --git a/src/common/row_set.h b/src/common/row_set.h
index 87d5f5287..11f12bda3 100644
--- a/src/common/row_set.h
+++ b/src/common/row_set.h
@@ -77,14 +77,14 @@ class RowSetCollection {
     if (row_indices_.empty()) {  // edge case: empty instance set
       constexpr size_t* kBegin = nullptr;
       constexpr size_t* kEnd = nullptr;
-      static_assert(kEnd - kBegin == 0, "");
-      elem_of_each_node_.emplace_back(Elem(kBegin, kEnd, 0));
+      static_assert(kEnd - kBegin == 0);
+      elem_of_each_node_.emplace_back(kBegin, kEnd, 0);
       return;
     }
 
     const size_t* begin = dmlc::BeginPtr(row_indices_);
     const size_t* end = dmlc::BeginPtr(row_indices_) + row_indices_.size();
-    elem_of_each_node_.emplace_back(Elem(begin, end, 0));
+    elem_of_each_node_.emplace_back(begin, end, 0);
   }
 
   std::vector<size_t>* Data() { return &row_indices_; }
diff --git a/src/common/stats.cc b/src/common/stats.cc
index 1770f521e..80fc2c50d 100644
--- a/src/common/stats.cc
+++ b/src/common/stats.cc
@@ -35,11 +35,11 @@ void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
     auto iter = linalg::cbegin(ti_v);
     float q{0};
     if (opt_weights.Empty()) {
-      q = common::Quantile(0.5, iter, iter + ti_v.Size());
+      q = common::Quantile(ctx, 0.5, iter, iter + ti_v.Size());
     } else {
       CHECK_NE(t_v.Shape(1), 0);
       auto w_it = common::MakeIndexTransformIter([&](std::size_t i) { return opt_weights[i]; });
-      q = common::WeightedQuantile(0.5, iter, iter + ti_v.Size(), w_it);
+      q = common::WeightedQuantile(ctx, 0.5, iter, iter + ti_v.Size(), w_it);
     }
     h_out(i) = q;
   }
diff --git a/src/common/stats.h b/src/common/stats.h
index 5f7892cb5..2f42a698e 100644
--- a/src/common/stats.h
+++ b/src/common/stats.h
@@ -4,46 +4,52 @@
 #ifndef XGBOOST_COMMON_STATS_H_
 #define XGBOOST_COMMON_STATS_H_
 #include <algorithm>
-#include <iterator>
+#include <iterator>  // for distance
 #include <limits>
 #include <vector>
 
+#include "algorithm.h"           // for StableSort
 #include "common.h"              // AssertGPUSupport, OptionalWeights
 #include "optional_weight.h"     // OptionalWeights
 #include "transform_iterator.h"  // MakeIndexTransformIter
 #include "xgboost/context.h"     // Context
-#include "xgboost/linalg.h"
-#include "xgboost/logging.h"  // CHECK_GE
+#include "xgboost/linalg.h"      // TensorView,VectorView
+#include "xgboost/logging.h"     // CHECK_GE
 
 namespace xgboost {
 namespace common {
 
 /**
- * \brief Percentile with masked array using linear interpolation.
+ * @brief Quantile using linear interpolation.
  *
  *   https://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm
  *
- * \param alpha Percentile, must be in range [0, 1].
+ * \param alpha Quantile, must be in range [0, 1].
  * \param begin Iterator begin for input array.
  * \param end   Iterator end for input array.
  *
  * \return The result of interpolation.
  */
 template <typename Iter>
-float Quantile(double alpha, Iter const& begin, Iter const& end) {
+float Quantile(Context const* ctx, double alpha, Iter const& begin, Iter const& end) {
   CHECK(alpha >= 0 && alpha <= 1);
   auto n = static_cast<double>(std::distance(begin, end));
   if (n == 0) {
     return std::numeric_limits<float>::quiet_NaN();
   }
 
-  std::vector<size_t> sorted_idx(n);
+  std::vector<std::size_t> sorted_idx(n);
   std::iota(sorted_idx.begin(), sorted_idx.end(), 0);
-  std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
-                   [&](size_t l, size_t r) { return *(begin + l) < *(begin + r); });
+  if (omp_in_parallel()) {
+    std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
+                     [&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
+  } else {
+    StableSort(ctx, sorted_idx.begin(), sorted_idx.end(),
+               [&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
+  }
 
   auto val = [&](size_t i) { return *(begin + sorted_idx[i]); };
-  static_assert(std::is_same<decltype(val(0)), float>::value, "");
+  static_assert(std::is_same<decltype(val(0)), float>::value);
 
   if (alpha <= (1 / (n + 1))) {
     return val(0);
@@ -51,7 +57,7 @@ float Quantile(double alpha, Iter const& begin, Iter const& end) {
   if (alpha >= (n / (n + 1))) {
     return val(sorted_idx.size() - 1);
   }
-  assert(n != 0 && "The number of rows in a leaf can not be zero.");
+
   double x = alpha * static_cast<double>((n + 1));
   double k = std::floor(x) - 1;
   CHECK_GE(k, 0);
@@ -66,30 +72,35 @@ float Quantile(double alpha, Iter const& begin, Iter const& end) {
  * \brief Calculate the weighted quantile with step function. Unlike the unweighted
  *        version, no interpolation is used.
  *
- *   See https://aakinshin.net/posts/weighted-quantiles/ for some discussion on computing
+ *   See https://aakinshin.net/posts/weighted-quantiles/ for some discussions on computing
  *   weighted quantile with interpolation.
  */
 template <typename Iter, typename WeightIter>
-float WeightedQuantile(double alpha, Iter begin, Iter end, WeightIter weights) {
+float WeightedQuantile(Context const* ctx, double alpha, Iter begin, Iter end, WeightIter w_begin) {
   auto n = static_cast<double>(std::distance(begin, end));
   if (n == 0) {
     return std::numeric_limits<float>::quiet_NaN();
   }
   std::vector<size_t> sorted_idx(n);
   std::iota(sorted_idx.begin(), sorted_idx.end(), 0);
-  std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
-                   [&](size_t l, size_t r) { return *(begin + l) < *(begin + r); });
+  if (omp_in_parallel()) {
+    std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
+                     [&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
+  } else {
+    StableSort(ctx, sorted_idx.begin(), sorted_idx.end(),
+               [&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
+  }
 
   auto val = [&](size_t i) { return *(begin + sorted_idx[i]); };
 
   std::vector<float> weight_cdf(n);  // S_n
   // weighted cdf is sorted during construction
-  weight_cdf[0] = *(weights + sorted_idx[0]);
+  weight_cdf[0] = *(w_begin + sorted_idx[0]);
   for (size_t i = 1; i < n; ++i) {
-    weight_cdf[i] = weight_cdf[i - 1] + *(weights + sorted_idx[i]);
+    weight_cdf[i] = weight_cdf[i - 1] + w_begin[sorted_idx[i]];
   }
   float thresh = weight_cdf.back() * alpha;
-  size_t idx =
+  std::size_t idx =
       std::lower_bound(weight_cdf.cbegin(), weight_cdf.cend(), thresh) - weight_cdf.cbegin();
   idx = std::min(idx, static_cast<size_t>(n - 1));
   return val(idx);
diff --git a/src/data/data.cc b/src/data/data.cc
index a935220e5..d24048a2a 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -10,12 +10,13 @@
 #include <cstring>
 
 #include "../collective/communicator-inl.h"
+#include "../common/algorithm.h"  // StableSort
 #include "../common/api_entry.h"  // XGBAPIThreadLocalEntry
 #include "../common/group_data.h"
 #include "../common/io.h"
 #include "../common/linalg_op.h"
 #include "../common/math.h"
-#include "../common/numeric.h"
+#include "../common/numeric.h"  // Iota
 #include "../common/threading_utils.h"
 #include "../common/version.h"
 #include "../data/adapter.h"
@@ -258,6 +259,19 @@ void LoadFeatureType(std::vector<std::string>const& type_names, std::vector<Feat
   }
 }
 
+const std::vector<size_t>& MetaInfo::LabelAbsSort(Context const* ctx) const {
+  if (label_order_cache_.size() == labels.Size()) {
+    return label_order_cache_;
+  }
+  label_order_cache_.resize(labels.Size());
+  common::Iota(ctx, label_order_cache_.begin(), label_order_cache_.end(), 0);
+  const auto& l = labels.Data()->HostVector();
+  common::StableSort(ctx, label_order_cache_.begin(), label_order_cache_.end(),
+                     [&l](size_t i1, size_t i2) { return std::abs(l[i1]) < std::abs(l[i2]); });
+
+  return label_order_cache_;
+}
+
 void MetaInfo::LoadBinary(dmlc::Stream *fi) {
   auto version = Version::Load(fi);
   auto major = std::get<0>(version);
@@ -898,6 +912,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s
     if (!cache_file.empty()) {
       LOG(FATAL) << "Column-wise data split is not support for external memory.";
     }
+    LOG(CONSOLE) << "Splitting data by column";
     auto* sliced = dmat->SliceCol(npart, partid);
     delete dmat;
     return sliced;
diff --git a/src/data/device_adapter.cuh b/src/data/device_adapter.cuh
index 4a635e92d..56c494dd1 100644
--- a/src/data/device_adapter.cuh
+++ b/src/data/device_adapter.cuh
@@ -1,12 +1,14 @@
-/*!
- *  Copyright (c) 2019 by Contributors
+/**
+ *  Copyright 2019-2023 by XGBoost Contributors
  * \file device_adapter.cuh
  */
 #ifndef XGBOOST_DATA_DEVICE_ADAPTER_H_
 #define XGBOOST_DATA_DEVICE_ADAPTER_H_
+#include <cstddef>  // for size_t
 #include <limits>
 #include <memory>
 #include <string>
+
 #include "../common/device_helpers.cuh"
 #include "../common/math.h"
 #include "adapter.h"
@@ -205,10 +207,10 @@ size_t GetRowCounts(const AdapterBatchT batch, common::Span<size_t> offset,
     }
   });
   dh::XGBCachingDeviceAllocator<char> alloc;
-  size_t row_stride = dh::Reduce(
-      thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
-      thrust::device_pointer_cast(offset.data()) + offset.size(), size_t(0),
-      thrust::maximum<size_t>());
+  size_t row_stride =
+      dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
+                 thrust::device_pointer_cast(offset.data()) + offset.size(),
+                 static_cast<std::size_t>(0), thrust::maximum<size_t>());
   return row_stride;
 }
 };  // namespace data
diff --git a/src/data/gradient_index.cc b/src/data/gradient_index.cc
index 140bcbff9..0a606ecd5 100644
--- a/src/data/gradient_index.cc
+++ b/src/data/gradient_index.cc
@@ -21,13 +21,13 @@ GHistIndexMatrix::GHistIndexMatrix() : columns_{std::make_unique<common::ColumnM
 
 GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
                                    double sparse_thresh, bool sorted_sketch, int32_t n_threads,
-                                   common::Span<float> hess) {
+                                   common::Span<float> hess)
+    : max_numeric_bins_per_feat{max_bins_per_feat} {
   CHECK(p_fmat->SingleColBlock());
   // We use sorted sketching for approx tree method since it's more efficient in
   // computation time (but higher memory usage).
   cut = common::SketchOnDMatrix(p_fmat, max_bins_per_feat, n_threads, sorted_sketch, hess);
 
-  max_num_bins = max_bins_per_feat;
   const uint32_t nbins = cut.Ptrs().back();
   hit_count.resize(nbins, 0);
   hit_count_tloc_.resize(n_threads * nbins, 0);
@@ -64,7 +64,7 @@ GHistIndexMatrix::GHistIndexMatrix(MetaInfo const &info, common::HistogramCuts &
     : row_ptr(info.num_row_ + 1, 0),
       hit_count(cuts.TotalBins(), 0),
       cut{std::forward<common::HistogramCuts>(cuts)},
-      max_num_bins(max_bin_per_feat),
+      max_numeric_bins_per_feat(max_bin_per_feat),
       isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}
 
 #if !defined(XGBOOST_USE_CUDA)
@@ -87,13 +87,13 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, common::Span<FeatureTy
 }
 
 GHistIndexMatrix::GHistIndexMatrix(SparsePage const &batch, common::Span<FeatureType const> ft,
-                                   common::HistogramCuts const &cuts, int32_t max_bins_per_feat,
-                                   bool isDense, double sparse_thresh, int32_t n_threads) {
+                                   common::HistogramCuts cuts, int32_t max_bins_per_feat,
+                                   bool isDense, double sparse_thresh, int32_t n_threads)
+    : cut{std::move(cuts)},
+      max_numeric_bins_per_feat{max_bins_per_feat},
+      base_rowid{batch.base_rowid},
+      isDense_{isDense} {
   CHECK_GE(n_threads, 1);
-  base_rowid = batch.base_rowid;
-  isDense_ = isDense;
-  cut = cuts;
-  max_num_bins = max_bins_per_feat;
   CHECK_EQ(row_ptr.size(), 0);
   // The number of threads is pegged to the batch size. If the OMP
   // block is parallelized on anything other than the batch/block size,
@@ -128,12 +128,13 @@ INSTANTIATION_PUSH(data::SparsePageAdapterBatch)
 #undef INSTANTIATION_PUSH
 
 void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
-  if ((max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) && isDense) {
+  if ((MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) &&
+      isDense) {
     // compress dense index to uint8
     index.SetBinTypeSize(common::kUint8BinsTypeSize);
     index.Resize((sizeof(uint8_t)) * n_index);
-  } else if ((max_num_bins - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
-              max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
+  } else if ((MaxNumBinPerFeat() - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
+              MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
              isDense) {
     // compress dense index to uint16
     index.SetBinTypeSize(common::kUint16BinsTypeSize);
@@ -149,16 +150,24 @@ common::ColumnMatrix const &GHistIndexMatrix::Transpose() const {
   return *columns_;
 }
 
+bst_bin_t GHistIndexMatrix::GetGindex(size_t ridx, size_t fidx) const {  // bin of (ridx, fidx)
+  auto begin = RowIdx(ridx);  // used below as the row's start offset into `index`
+  if (IsDense()) {
+    return static_cast<bst_bin_t>(index[begin + fidx]);  // dense: slot fidx of the row
+  }
+  auto end = RowIdx(ridx + 1);  // start of the next row, used as the end bound
+  auto const& cut_ptrs = cut.Ptrs();
+  auto f_begin = cut_ptrs[fidx];  // [f_begin, f_end) comes from the cut pointers of fidx
+  auto f_end = cut_ptrs[fidx + 1];
+  return BinarySearchBin(begin, end, index, f_begin, f_end);  // caller GetFvalue() checks for -1
+}
+
 float GHistIndexMatrix::GetFvalue(size_t ridx, size_t fidx, bool is_cat) const {
   auto const &values = cut.Values();
   auto const &mins = cut.MinValues();
   auto const &ptrs = cut.Ptrs();
   if (is_cat) {
-    auto f_begin = ptrs[fidx];
-    auto f_end = ptrs[fidx + 1];
-    auto begin = RowIdx(ridx);
-    auto end = RowIdx(ridx + 1);
-    auto gidx = BinarySearchBin(begin, end, index, f_begin, f_end);
+    auto gidx = GetGindex(ridx, fidx);
     if (gidx == -1) {
       return std::numeric_limits<float>::quiet_NaN();
     }
diff --git a/src/data/gradient_index.cu b/src/data/gradient_index.cu
index 42d935b3c..af5b0f67b 100644
--- a/src/data/gradient_index.cu
+++ b/src/data/gradient_index.cu
@@ -65,7 +65,7 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page,
 
 GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info,
                                    EllpackPage const& in_page, BatchParam const& p)
-    : max_num_bins{p.max_bin} {
+    : max_numeric_bins_per_feat{p.max_bin} {
   auto page = in_page.Impl();
   isDense_ = page->is_dense;
 
diff --git a/src/data/gradient_index.h b/src/data/gradient_index.h
index b914256af..9eba9637f 100644
--- a/src/data/gradient_index.h
+++ b/src/data/gradient_index.h
@@ -134,11 +134,15 @@ class GHistIndexMatrix {
   std::vector<size_t> hit_count;
   /*! \brief The corresponding cuts */
   common::HistogramCuts cut;
-  /*! \brief max_bin for each feature. */
-  bst_bin_t max_num_bins;
+  /** \brief max_bin for each numeric feature; see MaxNumBinPerFeat() for the overall bound. */
+  bst_bin_t max_numeric_bins_per_feat;
   /*! \brief base row index for current page (used by external memory) */
   size_t base_rowid{0};
 
+  bst_bin_t MaxNumBinPerFeat() const {  // larger of max category + 1 and the numeric bin limit
+    return std::max(static_cast<bst_bin_t>(cut.MaxCategory() + 1), max_numeric_bins_per_feat);
+  }
+
   ~GHistIndexMatrix();
   /**
    * \brief Constrcutor for SimpleDMatrix.
@@ -161,7 +165,7 @@ class GHistIndexMatrix {
    * \brief Constructor for external memory.
    */
   GHistIndexMatrix(SparsePage const& page, common::Span<FeatureType const> ft,
-                   common::HistogramCuts const& cuts, int32_t max_bins_per_feat, bool is_dense,
+                   common::HistogramCuts cuts, int32_t max_bins_per_feat, bool is_dense,
                    double sparse_thresh, int32_t n_threads);
   GHistIndexMatrix();  // also for ext mem, empty ctor so that we can read the cache back.
 
@@ -224,6 +228,8 @@ class GHistIndexMatrix {
 
   common::ColumnMatrix const& Transpose() const;
 
+  bst_bin_t GetGindex(size_t ridx, size_t fidx) const;
+
   float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const;
 
  private:
diff --git a/src/data/gradient_index_format.cc b/src/data/gradient_index_format.cc
index 4b3fd0ea0..204157682 100644
--- a/src/data/gradient_index_format.cc
+++ b/src/data/gradient_index_format.cc
@@ -35,7 +35,7 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
     if (!fi->Read(&page->hit_count)) {
       return false;
     }
-    if (!fi->Read(&page->max_num_bins)) {
+    if (!fi->Read(&page->max_numeric_bins_per_feat)) {
       return false;
     }
     if (!fi->Read(&page->base_rowid)) {
@@ -76,8 +76,8 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
         page.hit_count.size() * sizeof(decltype(page.hit_count)::value_type) +
         sizeof(uint64_t);
     // max_bins, base row, is_dense
-    fo->Write(page.max_num_bins);
-    bytes += sizeof(page.max_num_bins);
+    fo->Write(page.max_numeric_bins_per_feat);
+    bytes += sizeof(page.max_numeric_bins_per_feat);
     fo->Write(page.base_rowid);
     bytes += sizeof(page.base_rowid);
     fo->Write(page.IsDense());
diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc
index 472227e38..ae0cfc4a4 100644
--- a/src/data/iterative_dmatrix.cc
+++ b/src/data/iterative_dmatrix.cc
@@ -213,7 +213,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
         SyncFeatureType(&h_ft);
         p_sketch.reset(new common::HostSketchContainer{
             batch_param_.max_bin, h_ft, column_sizes, !proxy->Info().group_ptr_.empty(),
-            proxy->Info().data_split_mode == DataSplitMode::kCol, ctx_.Threads()});
+            proxy->IsColumnSplit(), ctx_.Threads()});
       }
       HostAdapterDispatch(proxy, [&](auto const& batch) {
         proxy->Info().num_nonzero_ = batch_nnz[i];
diff --git a/src/data/sparse_page_dmatrix.cc b/src/data/sparse_page_dmatrix.cc
index 7881c62d2..698e1e5b2 100644
--- a/src/data/sparse_page_dmatrix.cc
+++ b/src/data/sparse_page_dmatrix.cc
@@ -19,7 +19,7 @@ const MetaInfo &SparsePageDMatrix::Info() const { return info_; }
 
 namespace detail {
 // Use device dispatch
-std::size_t NSamplesDevice(DMatrixProxy *)
+std::size_t NSamplesDevice(DMatrixProxy *)  // NOLINT
 #if defined(XGBOOST_USE_CUDA)
 ;  // NOLINT
 #else
@@ -28,7 +28,7 @@ std::size_t NSamplesDevice(DMatrixProxy *)
   return 0;
 }
 #endif
-std::size_t NFeaturesDevice(DMatrixProxy *)
+std::size_t NFeaturesDevice(DMatrixProxy *)  // NOLINT
 #if defined(XGBOOST_USE_CUDA)
 ;  // NOLINT
 #else
diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc
index 84e766121..575820758 100644
--- a/src/gbm/gblinear.cc
+++ b/src/gbm/gblinear.cc
@@ -75,10 +75,7 @@ class GBLinear : public GradientBooster {
       : GradientBooster{ctx},
         learner_model_param_{learner_model_param},
         model_{learner_model_param},
-        previous_model_{learner_model_param},
-        sum_instance_weight_(0),
-        sum_weight_complete_(false),
-        is_converged_(false) {}
+        previous_model_{learner_model_param} {}
 
   void Configure(const Args& cfg) override {
     if (model_.weight.size() == 0) {
@@ -344,10 +341,10 @@ class GBLinear : public GradientBooster {
   GBLinearModel previous_model_;
   GBLinearTrainParam param_;
   std::unique_ptr<LinearUpdater> updater_;
-  double sum_instance_weight_;
-  bool sum_weight_complete_;
+  double sum_instance_weight_{};
+  bool sum_weight_complete_{false};
   common::Monitor monitor_;
-  bool is_converged_;
+  bool is_converged_{false};
 };
 
 // register the objective functions
diff --git a/src/gbm/gblinear_model.h b/src/gbm/gblinear_model.h
index 577494f87..80dd1ac04 100644
--- a/src/gbm/gblinear_model.h
+++ b/src/gbm/gblinear_model.h
@@ -47,12 +47,12 @@ class GBLinearModel : public Model {
   DeprecatedGBLinearModelParam param_;
 
  public:
-  int32_t num_boosted_rounds;
+  int32_t num_boosted_rounds{0};
   LearnerModelParam const* learner_model_param;
 
  public:
-  explicit GBLinearModel(LearnerModelParam const* learner_model_param) :
-      num_boosted_rounds{0}, learner_model_param {learner_model_param} {}
+  explicit GBLinearModel(LearnerModelParam const *learner_model_param)
+      : learner_model_param{learner_model_param} {}
   void Configure(Args const &) { }
 
   // weight for each of feature, bias is the last one
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index dc280217e..39f38c289 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -32,15 +32,14 @@
 #include "xgboost/string_view.h"
 #include "xgboost/tree_updater.h"
 
-namespace xgboost {
-namespace gbm {
-
+namespace xgboost::gbm {
 DMLC_REGISTRY_FILE_TAG(gbtree);
 
-void GBTree::Configure(const Args& cfg) {
+void GBTree::Configure(Args const& cfg) {
   this->cfg_ = cfg;
   std::string updater_seq = tparam_.updater_seq;
   tparam_.UpdateAllowUnknown(cfg);
+  tree_param_.UpdateAllowUnknown(cfg);
 
   model_.Configure(cfg);
 
@@ -235,9 +234,11 @@ void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const
   CHECK_EQ(model_.param.num_parallel_tree, trees.size());
   CHECK_EQ(model_.param.num_parallel_tree, 1)
       << "Boosting random forest is not supported for current objective.";
+  CHECK_EQ(trees.size(), model_.param.num_parallel_tree);
   for (std::size_t tree_idx = 0; tree_idx < trees.size(); ++tree_idx) {
     auto const& position = node_position.at(tree_idx);
-    obj->UpdateTreeLeaf(position, p_fmat->Info(), predictions, group_idx, trees[tree_idx].get());
+    obj->UpdateTreeLeaf(position, p_fmat->Info(), tree_param_.learning_rate / trees.size(),
+                        predictions, group_idx, trees[tree_idx].get());
   }
 }
 
@@ -388,9 +389,15 @@ void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fma
 
   CHECK(out_position);
   out_position->resize(new_trees.size());
+
+  // Temporarily divide the learning rate by the number of new trees; restored after the update.
+  auto lr = tree_param_.learning_rate;
+  tree_param_.learning_rate /= static_cast<float>(new_trees.size());
   for (auto& up : updaters_) {
-    up->Update(gpair, p_fmat, common::Span<HostDeviceVector<bst_node_t>>{*out_position}, new_trees);
+    up->Update(&tree_param_, gpair, p_fmat,
+               common::Span<HostDeviceVector<bst_node_t>>{*out_position}, new_trees);
   }
+  tree_param_.learning_rate = lr;
 }
 
 void GBTree::CommitModel(std::vector<std::vector<std::unique_ptr<RegTree>>>&& new_trees) {
@@ -404,6 +411,8 @@ void GBTree::CommitModel(std::vector<std::vector<std::unique_ptr<RegTree>>>&& ne
 void GBTree::LoadConfig(Json const& in) {
   CHECK_EQ(get<String>(in["name"]), "gbtree");
   FromJson(in["gbtree_train_param"], &tparam_);
+  FromJson(in["tree_train_param"], &tree_param_);
+
   // Process type cannot be kUpdate from loaded model
   // This would cause all trees to be pushed to trees_to_update
   // e.g. updating a model, then saving and loading it would result in an empty model
@@ -451,6 +460,7 @@ void GBTree::SaveConfig(Json* p_out) const {
   auto& out = *p_out;
   out["name"] = String("gbtree");
   out["gbtree_train_param"] = ToJson(tparam_);
+  out["tree_train_param"] = ToJson(tree_param_);
 
   // Process type cannot be kUpdate from loaded model
   // This would cause all trees to be pushed to trees_to_update
@@ -1058,5 +1068,4 @@ XGBOOST_REGISTER_GBM(Dart, "dart")
       GBTree* p = new Dart(booster_config, ctx);
       return p;
     });
-}  // namespace gbm
-}  // namespace xgboost
+}  // namespace xgboost::gbm
diff --git a/src/gbm/gbtree.h b/src/gbm/gbtree.h
index 6bf98916f..10e6c415f 100644
--- a/src/gbm/gbtree.h
+++ b/src/gbm/gbtree.h
@@ -20,6 +20,7 @@
 
 #include "../common/common.h"
 #include "../common/timer.h"
+#include "../tree/param.h"  // TrainParam
 #include "gbtree_model.h"
 #include "xgboost/base.h"
 #include "xgboost/data.h"
@@ -405,8 +406,8 @@ class GBTree : public GradientBooster {
         p_fmat, out_contribs, model_, tree_end, nullptr, approximate);
   }
 
-  std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
-                                     std::string format) const override {
+  [[nodiscard]] std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
+                                                   std::string format) const override {
     return model_.DumpModel(fmap, with_stats, this->ctx_->Threads(), format);
   }
 
@@ -428,6 +429,8 @@ class GBTree : public GradientBooster {
   GBTreeModel model_;
   // training parameter
   GBTreeTrainParam tparam_;
+  // Tree training parameter
+  tree::TrainParam tree_param_;
   // ----training fields----
   bool showed_updater_warning_ {false};
   bool specified_updater_   {false};
diff --git a/src/learner.cc b/src/learner.cc
index 390889e9c..0e47c694c 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -21,7 +21,7 @@
 #include <sstream>
 #include <stack>
 #include <string>
-#include <utility>
+#include <utility>  // for as_const
 #include <vector>
 
 #include "collective/communicator-inl.h"
@@ -257,11 +257,11 @@ LearnerModelParam::LearnerModelParam(Context const* ctx, LearnerModelParamLegacy
     : LearnerModelParam{user_param, t} {
   std::swap(base_score_, base_margin);
   // Make sure read access everywhere for thread-safe prediction.
-  common::AsConst(base_score_).HostView();
+  std::as_const(base_score_).HostView();
   if (!ctx->IsCPU()) {
-    common::AsConst(base_score_).View(ctx->gpu_id);
+    std::as_const(base_score_).View(ctx->gpu_id);
   }
-  CHECK(common::AsConst(base_score_).Data()->HostCanRead());
+  CHECK(std::as_const(base_score_).Data()->HostCanRead());
 }
 
 linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(int32_t device) const {
@@ -287,9 +287,9 @@ void LearnerModelParam::Copy(LearnerModelParam const& that) {
   base_score_.Reshape(that.base_score_.Shape());
   base_score_.Data()->SetDevice(that.base_score_.DeviceIdx());
   base_score_.Data()->Copy(*that.base_score_.Data());
-  common::AsConst(base_score_).HostView();
+  std::as_const(base_score_).HostView();
   if (that.base_score_.DeviceIdx() != Context::kCpuId) {
-    common::AsConst(base_score_).View(that.base_score_.DeviceIdx());
+    std::as_const(base_score_).View(that.base_score_.DeviceIdx());
   }
   CHECK_EQ(base_score_.Data()->DeviceCanRead(), that.base_score_.Data()->DeviceCanRead());
   CHECK(base_score_.Data()->HostCanRead());
@@ -328,9 +328,6 @@ DMLC_REGISTER_PARAMETER(LearnerTrainParam);
 using LearnerAPIThreadLocalStore =
     dmlc::ThreadLocalStore<std::map<Learner const *, XGBAPIThreadLocalEntry>>;
 
-using ThreadLocalPredictionCache =
-    dmlc::ThreadLocalStore<std::map<Learner const *, PredictionContainer>>;
-
 namespace {
 StringView ModelMsg() {
   return StringView{
@@ -368,6 +365,8 @@ class LearnerConfiguration : public Learner {
   LearnerModelParam learner_model_param_;
   LearnerTrainParam tparam_;
   // Initial prediction.
+  PredictionContainer prediction_container_;
+
   std::vector<std::string> metric_names_;
 
   void ConfigureModelParamWithoutBaseScore() {
@@ -426,22 +425,15 @@ class LearnerConfiguration : public Learner {
   }
 
  public:
-  explicit LearnerConfiguration(std::vector<std::shared_ptr<DMatrix> > cache)
+  explicit LearnerConfiguration(std::vector<std::shared_ptr<DMatrix>> cache)
       : need_configuration_{true} {
     monitor_.Init("Learner");
-    auto& local_cache = (*ThreadLocalPredictionCache::Get())[this];
     for (std::shared_ptr<DMatrix> const& d : cache) {
       if (d) {
-        local_cache.Cache(d, Context::kCpuId);
+        prediction_container_.Cache(d, Context::kCpuId);
       }
     }
   }
-  ~LearnerConfiguration() override {
-    auto local_cache = ThreadLocalPredictionCache::Get();
-    if (local_cache->find(this) != local_cache->cend()) {
-      local_cache->erase(this);
-    }
-  }
 
   // Configuration before data is known.
   void Configure() override {
@@ -499,10 +491,6 @@ class LearnerConfiguration : public Learner {
     CHECK_NE(learner_model_param_.BaseScore(this->Ctx()).Size(), 0) << ModelNotFitted();
   }
 
-  virtual PredictionContainer* GetPredictionCache() const {
-    return &((*ThreadLocalPredictionCache::Get())[this]);
-  }
-
   void LoadConfig(Json const& in) override {
     // If configuration is loaded, ensure that the model came from the same version
     CHECK(IsA<Object>(in));
@@ -741,11 +729,10 @@ class LearnerConfiguration : public Learner {
     if (mparam_.num_feature == 0) {
       // TODO(hcho3): Change num_feature to 64-bit integer
       unsigned num_feature = 0;
-      auto local_cache = this->GetPredictionCache();
-      for (auto& matrix : local_cache->Container()) {
-        CHECK(matrix.first);
+      for (auto const& matrix : prediction_container_.Container()) {
+        CHECK(matrix.first.ptr);
         CHECK(!matrix.second.ref.expired());
-        const uint64_t num_col = matrix.first->Info().num_col_;
+        const uint64_t num_col = matrix.first.ptr->Info().num_col_;
         CHECK_LE(num_col, static_cast<uint64_t>(std::numeric_limits<unsigned>::max()))
             << "Unfortunately, XGBoost does not support data matrices with "
             << std::numeric_limits<unsigned>::max() << " features or greater";
@@ -817,13 +804,13 @@ class LearnerConfiguration : public Learner {
    */
   void ConfigureTargets() {
     CHECK(this->obj_);
-    auto const& cache = this->GetPredictionCache()->Container();
+    auto const& cache = prediction_container_.Container();
     size_t n_targets = 1;
     for (auto const& d : cache) {
       if (n_targets == 1) {
-        n_targets = this->obj_->Targets(d.first->Info());
+        n_targets = this->obj_->Targets(d.first.ptr->Info());
       } else {
-        auto t = this->obj_->Targets(d.first->Info());
+        auto t = this->obj_->Targets(d.first.ptr->Info());
         CHECK(n_targets == t || 1 == t) << "Inconsistent labels.";
       }
     }
@@ -1275,8 +1262,7 @@ class LearnerImpl : public LearnerIO {
 
     this->ValidateDMatrix(train.get(), true);
 
-    auto local_cache = this->GetPredictionCache();
-    auto& predt = local_cache->Cache(train, ctx_.gpu_id);
+    auto& predt = prediction_container_.Cache(train, ctx_.gpu_id);
 
     monitor_.Start("PredictRaw");
     this->PredictRaw(train.get(), &predt, true, 0, 0);
@@ -1303,8 +1289,7 @@ class LearnerImpl : public LearnerIO {
 
     this->ValidateDMatrix(train.get(), true);
 
-    auto local_cache = this->GetPredictionCache();
-    auto& predt = local_cache->Cache(train, ctx_.gpu_id);
+    auto& predt = prediction_container_.Cache(train, ctx_.gpu_id);
     gbm_->DoBoost(train.get(), in_gpair, &predt, obj_.get());
     monitor_.Stop("BoostOneIter");
   }
@@ -1326,10 +1311,9 @@ class LearnerImpl : public LearnerIO {
       metrics_.back()->Configure({cfg_.begin(), cfg_.end()});
     }
 
-    auto local_cache = this->GetPredictionCache();
     for (size_t i = 0; i < data_sets.size(); ++i) {
       std::shared_ptr<DMatrix> m = data_sets[i];
-      auto &predt = local_cache->Cache(m, ctx_.gpu_id);
+      auto &predt = prediction_container_.Cache(m, ctx_.gpu_id);
       this->ValidateDMatrix(m.get(), false);
       this->PredictRaw(m.get(), &predt, false, 0, 0);
 
@@ -1370,8 +1354,7 @@ class LearnerImpl : public LearnerIO {
     } else if (pred_leaf) {
       gbm_->PredictLeaf(data.get(), out_preds, layer_begin, layer_end);
     } else {
-      auto local_cache = this->GetPredictionCache();
-      auto& prediction = local_cache->Cache(data, ctx_.gpu_id);
+      auto& prediction = prediction_container_.Cache(data, ctx_.gpu_id);
       this->PredictRaw(data.get(), &prediction, training, layer_begin, layer_end);
       // Copy the prediction cache to output prediction. out_preds comes from C API
       out_preds->SetDevice(ctx_.gpu_id);
diff --git a/src/metric/auc.cc b/src/metric/auc.cc
index 9bedd95ee..a926c2c5b 100644
--- a/src/metric/auc.cc
+++ b/src/metric/auc.cc
@@ -14,9 +14,11 @@
 #include <utility>
 #include <vector>
 
+#include "../common/algorithm.h"        // ArgSort
 #include "../common/math.h"
 #include "../common/optional_weight.h"  // OptionalWeights
 #include "metric_common.h"              // MetricNoCache
+#include "xgboost/context.h"
 #include "xgboost/host_device_vector.h"
 #include "xgboost/linalg.h"
 #include "xgboost/metric.h"
@@ -77,9 +79,8 @@ BinaryAUC(common::Span<float const> predts, linalg::VectorView<float const> labe
  *   Machine Learning Models
  */
 template <typename BinaryAUC>
-double MultiClassOVR(common::Span<float const> predts, MetaInfo const &info,
-                     size_t n_classes, int32_t n_threads,
-                     BinaryAUC &&binary_auc) {
+double MultiClassOVR(Context const *ctx, common::Span<float const> predts, MetaInfo const &info,
+                     size_t n_classes, int32_t n_threads, BinaryAUC &&binary_auc) {
   CHECK_NE(n_classes, 0);
   auto const labels = info.labels.View(Context::kCpuId);
   if (labels.Shape(0) != 0) {
@@ -108,7 +109,7 @@ double MultiClassOVR(common::Span<float const> predts, MetaInfo const &info,
       }
       double fp;
       std::tie(fp, tp(c), auc(c)) =
-          binary_auc(proba, linalg::MakeVec(response.data(), response.size(), -1), weights);
+          binary_auc(ctx, proba, linalg::MakeVec(response.data(), response.size(), -1), weights);
       local_area(c) = fp * tp(c);
     });
   }
@@ -139,23 +140,26 @@ double MultiClassOVR(common::Span<float const> predts, MetaInfo const &info,
   return auc_sum;
 }
 
-std::tuple<double, double, double> BinaryROCAUC(common::Span<float const> predts,
+std::tuple<double, double, double> BinaryROCAUC(Context const *ctx,
+                                                common::Span<float const> predts,
                                                 linalg::VectorView<float const> labels,
                                                 common::OptionalWeights weights) {
-  auto const sorted_idx = common::ArgSort<size_t>(predts, std::greater<>{});
+  auto const sorted_idx =
+      common::ArgSort<size_t>(ctx, predts.data(), predts.data() + predts.size(), std::greater<>{});
   return BinaryAUC(predts, labels, weights, sorted_idx, TrapezoidArea);
 }
 
 /**
  * Calculate AUC for 1 ranking group;
  */
-double GroupRankingROC(common::Span<float const> predts,
+double GroupRankingROC(Context const* ctx, common::Span<float const> predts,
                        linalg::VectorView<float const> labels, float w) {
   // on ranking, we just count all pairs.
   double auc{0};
   // argsort doesn't support tensor input yet.
   auto raw_labels = labels.Values().subspan(0, labels.Size());
-  auto const sorted_idx = common::ArgSort<size_t>(raw_labels, std::greater<>{});
+  auto const sorted_idx = common::ArgSort<size_t>(
+      ctx, raw_labels.data(), raw_labels.data() + raw_labels.size(), std::greater<>{});
   w = common::Sqr(w);
 
   double sum_w = 0.0f;
@@ -185,10 +189,11 @@ double GroupRankingROC(common::Span<float const> predts,
  *
  *   https://doi.org/10.1371/journal.pone.0092209
  */
-std::tuple<double, double, double> BinaryPRAUC(common::Span<float const> predts,
+std::tuple<double, double, double> BinaryPRAUC(Context const *ctx, common::Span<float const> predts,
                                                linalg::VectorView<float const> labels,
                                                common::OptionalWeights weights) {
-  auto const sorted_idx = common::ArgSort<size_t>(predts, std::greater<>{});
+  auto const sorted_idx =
+      common::ArgSort<size_t>(ctx, predts.data(), predts.data() + predts.size(), std::greater<>{});
   double total_pos{0}, total_neg{0};
   for (size_t i = 0; i < labels.Size(); ++i) {
     auto w = weights[i];
@@ -211,9 +216,8 @@ std::tuple<double, double, double> BinaryPRAUC(common::Span<float const> predts,
  * Cast LTR problem to binary classification problem by comparing pairs.
  */
 template <bool is_roc>
-std::pair<double, uint32_t> RankingAUC(std::vector<float> const &predts,
-                                       MetaInfo const &info,
-                                       int32_t n_threads) {
+std::pair<double, uint32_t> RankingAUC(Context const *ctx, std::vector<float> const &predts,
+                                       MetaInfo const &info, int32_t n_threads) {
   CHECK_GE(info.group_ptr_.size(), 2);
   uint32_t n_groups = info.group_ptr_.size() - 1;
   auto s_predts = common::Span<float const>{predts};
@@ -237,9 +241,9 @@ std::pair<double, uint32_t> RankingAUC(std::vector<float> const &predts,
       auc = 0;
     } else {
       if (is_roc) {
-        auc = GroupRankingROC(g_predts, g_labels, w);
+        auc = GroupRankingROC(ctx, g_predts, g_labels, w);
       } else {
-        auc = std::get<2>(BinaryPRAUC(g_predts, g_labels, common::OptionalWeights{w}));
+        auc = std::get<2>(BinaryPRAUC(ctx, g_predts, g_labels, common::OptionalWeights{w}));
       }
       if (std::isnan(auc)) {
         invalid_groups++;
@@ -344,7 +348,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
     auto n_threads = ctx_->Threads();
     if (ctx_->gpu_id == Context::kCpuId) {
       std::tie(auc, valid_groups) =
-          RankingAUC<true>(predts.ConstHostVector(), info, n_threads);
+          RankingAUC<true>(ctx_, predts.ConstHostVector(), info, n_threads);
     } else {
       std::tie(auc, valid_groups) =
           GPURankingAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_);
@@ -358,8 +362,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
     auto n_threads = ctx_->Threads();
     CHECK_NE(n_classes, 0);
     if (ctx_->gpu_id == Context::kCpuId) {
-      auc = MultiClassOVR(predts.ConstHostVector(), info, n_classes, n_threads,
-                          BinaryROCAUC);
+      auc = MultiClassOVR(ctx_, predts.ConstHostVector(), info, n_classes, n_threads, BinaryROCAUC);
     } else {
       auc = GPUMultiClassROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_, n_classes);
     }
@@ -370,9 +373,9 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
   EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) {
     double fp, tp, auc;
     if (ctx_->gpu_id == Context::kCpuId) {
-      std::tie(fp, tp, auc) =
-          BinaryROCAUC(predts.ConstHostVector(), info.labels.HostView().Slice(linalg::All(), 0),
-                       common::OptionalWeights{info.weights_.ConstHostSpan()});
+      std::tie(fp, tp, auc) = BinaryROCAUC(ctx_, predts.ConstHostVector(),
+                                           info.labels.HostView().Slice(linalg::All(), 0),
+                                           common::OptionalWeights{info.weights_.ConstHostSpan()});
     } else {
       std::tie(fp, tp, auc) = GPUBinaryROCAUC(predts.ConstDeviceSpan(), info,
                                               ctx_->gpu_id, &this->d_cache_);
@@ -422,7 +425,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
     double pr, re, auc;
     if (ctx_->gpu_id == Context::kCpuId) {
       std::tie(pr, re, auc) =
-          BinaryPRAUC(predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
+          BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
                       common::OptionalWeights{info.weights_.ConstHostSpan()});
     } else {
       std::tie(pr, re, auc) = GPUBinaryPRAUC(predts.ConstDeviceSpan(), info,
@@ -435,8 +438,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
                         size_t n_classes) {
     if (ctx_->gpu_id == Context::kCpuId) {
       auto n_threads = this->ctx_->Threads();
-      return MultiClassOVR(predts.ConstHostSpan(), info, n_classes, n_threads,
-                           BinaryPRAUC);
+      return MultiClassOVR(ctx_, predts.ConstHostSpan(), info, n_classes, n_threads, BinaryPRAUC);
     } else {
       return GPUMultiClassPRAUC(ctx_, predts.ConstDeviceSpan(), info, &d_cache_, n_classes);
     }
@@ -453,7 +455,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
         InvalidLabels();
       }
       std::tie(auc, valid_groups) =
-          RankingAUC<false>(predts.ConstHostVector(), info, n_threads);
+          RankingAUC<false>(ctx_, predts.ConstHostVector(), info, n_threads);
     } else {
       std::tie(auc, valid_groups) =
           GPURankingPRAUC(ctx_, predts.ConstDeviceSpan(), info, &d_cache_);
diff --git a/src/metric/auc.cu b/src/metric/auc.cu
index ae5ba676e..fdbf0501a 100644
--- a/src/metric/auc.cu
+++ b/src/metric/auc.cu
@@ -5,7 +5,7 @@
 
 #include <algorithm>
 #include <cassert>
-#include <cub/cub.cuh>
+#include <cub/cub.cuh>  // NOLINT
 #include <limits>
 #include <memory>
 #include <tuple>
diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu
index e06be9730..9006bdfca 100644
--- a/src/metric/elementwise_metric.cu
+++ b/src/metric/elementwise_metric.cu
@@ -451,9 +451,8 @@ class QuantileError : public MetricNoCache {
     auto alpha = ctx->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();
     std::size_t n_targets = preds.Size() / info.num_row_ / alpha_.Size();
     CHECK_NE(n_targets, 0);
-    auto y_predt = linalg::MakeTensorView(
-        ctx->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan(),
-        {static_cast<std::size_t>(info.num_row_), alpha_.Size(), n_targets}, ctx->gpu_id);
+    auto y_predt = linalg::MakeTensorView(ctx, &preds, static_cast<std::size_t>(info.num_row_),
+                                          alpha_.Size(), n_targets);
 
     info.weights_.SetDevice(ctx->gpu_id);
     common::OptionalWeights weight{ctx->IsCPU() ? info.weights_.ConstHostSpan()
diff --git a/src/metric/metric_common.h b/src/metric/metric_common.h
index 064608ebf..5fbd6f256 100644
--- a/src/metric/metric_common.h
+++ b/src/metric/metric_common.h
@@ -6,6 +6,7 @@
 #define XGBOOST_METRIC_METRIC_COMMON_H_
 
 #include <limits>
+#include <memory>  // shared_ptr
 #include <string>
 
 #include "../common/common.h"
diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc
index 7ca0243f2..d39c7302a 100644
--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@@ -27,6 +27,7 @@
 #include <vector>
 
 #include "../collective/communicator-inl.h"
+#include "../common/algorithm.h"  // Sort
 #include "../common/math.h"
 #include "../common/ranking_utils.h"  // MakeMetricName
 #include "../common/threading_utils.h"
@@ -113,7 +114,7 @@ struct EvalAMS : public MetricNoCache {
     const auto &h_preds = preds.ConstHostVector();
     common::ParallelFor(ndata, ctx_->Threads(),
                         [&](bst_omp_uint i) { rec[i] = std::make_pair(h_preds[i], i); });
-    XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst);
+    common::Sort(ctx_, rec.begin(), rec.end(), common::CmpFirst);
     auto ntop = static_cast<unsigned>(ratio_ * ndata);
     if (ntop == 0) ntop = ndata;
     const double br = 10.0;
@@ -330,7 +331,7 @@ struct EvalCox : public MetricNoCache {
     using namespace std;  // NOLINT(*)
 
     const auto ndata = static_cast<bst_omp_uint>(info.labels.Size());
-    const auto &label_order = info.LabelAbsSort();
+    const auto &label_order = info.LabelAbsSort(ctx_);
 
     // pre-compute a sum for the denominator
     double exp_p_sum = 0;  // we use double because we might need the precision with large datasets
diff --git a/src/objective/adaptive.cc b/src/objective/adaptive.cc
index 173decb96..4a67e848b 100644
--- a/src/objective/adaptive.cc
+++ b/src/objective/adaptive.cc
@@ -3,27 +3,34 @@
  */
 #include "adaptive.h"
 
-#include <limits>
-#include <vector>
+#include <algorithm>                       // std::transform,std::find_if,std::copy,std::unique
+#include <cmath>                           // std::isnan
+#include <cstddef>                         // std::size_t
+#include <iterator>                        // std::distance
+#include <vector>                          // std::vector
 
-#include "../common/common.h"
-#include "../common/numeric.h"
-#include "../common/stats.h"
-#include "../common/threading_utils.h"
+#include "../common/algorithm.h"           // ArgSort
+#include "../common/common.h"              // AssertGPUSupport
+#include "../common/numeric.h"             // RunLengthEncode
+#include "../common/stats.h"               // Quantile,WeightedQuantile
+#include "../common/threading_utils.h"     // ParallelFor
 #include "../common/transform_iterator.h"  // MakeIndexTransformIter
-#include "xgboost/linalg.h"
-#include "xgboost/tree_model.h"
+#include "xgboost/base.h"                  // bst_node_t
+#include "xgboost/context.h"               // Context
+#include "xgboost/data.h"                  // MetaInfo
+#include "xgboost/host_device_vector.h"    // HostDeviceVector
+#include "xgboost/linalg.h"                // MakeTensorView
+#include "xgboost/span.h"                  // Span
+#include "xgboost/tree_model.h"            // RegTree
 
-namespace xgboost {
-namespace obj {
-namespace detail {
-void EncodeTreeLeafHost(RegTree const& tree, std::vector<bst_node_t> const& position,
-                        std::vector<size_t>* p_nptr, std::vector<bst_node_t>* p_nidx,
-                        std::vector<size_t>* p_ridx) {
+namespace xgboost::obj::detail {
+void EncodeTreeLeafHost(Context const* ctx, RegTree const& tree,
+                        std::vector<bst_node_t> const& position, std::vector<size_t>* p_nptr,
+                        std::vector<bst_node_t>* p_nidx, std::vector<size_t>* p_ridx) {
   auto& nptr = *p_nptr;
   auto& nidx = *p_nidx;
   auto& ridx = *p_ridx;
-  ridx = common::ArgSort<size_t>(position);
+  ridx = common::ArgSort<size_t>(ctx, position.cbegin(), position.cend());
   std::vector<bst_node_t> sorted_pos(position);
   // permutation
   for (size_t i = 0; i < position.size(); ++i) {
@@ -67,18 +74,18 @@ void EncodeTreeLeafHost(RegTree const& tree, std::vector<bst_node_t> const& posi
 }
 
 void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& position,
-                        std::int32_t group_idx, MetaInfo const& info,
+                        std::int32_t group_idx, MetaInfo const& info, float learning_rate,
                         HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
   auto& tree = *p_tree;
 
   std::vector<bst_node_t> nidx;
   std::vector<size_t> nptr;
   std::vector<size_t> ridx;
-  EncodeTreeLeafHost(*p_tree, position, &nptr, &nidx, &ridx);
+  EncodeTreeLeafHost(ctx, *p_tree, position, &nptr, &nidx, &ridx);
   size_t n_leaf = nidx.size();
   if (nptr.empty()) {
     std::vector<float> quantiles;
-    UpdateLeafValues(&quantiles, nidx, p_tree);
+    UpdateLeafValues(&quantiles, nidx, learning_rate, p_tree);
     return;
   }
 
@@ -89,8 +96,8 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
   auto const& h_node_idx = nidx;
   auto const& h_node_ptr = nptr;
   CHECK_LE(h_node_ptr.back(), info.num_row_);
-  auto h_predt = linalg::MakeTensorView(predt.ConstHostSpan(),
-                                        {info.num_row_, predt.Size() / info.num_row_}, ctx->gpu_id);
+  auto h_predt = linalg::MakeTensorView(ctx, predt.ConstHostSpan(), info.num_row_,
+                                        predt.Size() / info.num_row_);
 
   // loop over each leaf
   common::ParallelFor(quantiles.size(), ctx->Threads(), [&](size_t k) {
@@ -99,8 +106,8 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
     CHECK_LT(k + 1, h_node_ptr.size());
     size_t n = h_node_ptr[k + 1] - h_node_ptr[k];
     auto h_row_set = common::Span<size_t const>{ridx}.subspan(h_node_ptr[k], n);
-    CHECK_LE(group_idx, info.labels.Shape(1));
-    auto h_labels = info.labels.HostView().Slice(linalg::All(), group_idx);
+
+    auto h_labels = info.labels.HostView().Slice(linalg::All(), IdxY(info, group_idx));
     auto h_weights = linalg::MakeVec(&info.weights_);
 
     auto iter = common::MakeIndexTransformIter([&](size_t i) -> float {
@@ -114,9 +121,9 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
 
     float q{0};
     if (info.weights_.Empty()) {
-      q = common::Quantile(alpha, iter, iter + h_row_set.size());
+      q = common::Quantile(ctx, alpha, iter, iter + h_row_set.size());
     } else {
-      q = common::WeightedQuantile(alpha, iter, iter + h_row_set.size(), w_it);
+      q = common::WeightedQuantile(ctx, alpha, iter, iter + h_row_set.size(), w_it);
     }
     if (std::isnan(q)) {
       CHECK(h_row_set.empty());
@@ -124,8 +131,13 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& posit
     quantiles.at(k) = q;
   });
 
-  UpdateLeafValues(&quantiles, nidx, p_tree);
+  UpdateLeafValues(&quantiles, nidx, learning_rate, p_tree);
 }
-}  // namespace detail
-}  // namespace obj
-}  // namespace xgboost
+
+#if !defined(XGBOOST_USE_CUDA)  // only compiled when XGBOOST_USE_CUDA is not defined
+void UpdateTreeLeafDevice(Context const*, common::Span<bst_node_t const>, std::int32_t,
+                          MetaInfo const&, float, HostDeviceVector<float> const&, float, RegTree*) {
+  common::AssertGPUSupport();  // stub: every argument is ignored
+}
+#endif  // !defined(XGBOOST_USE_CUDA)
+}  // namespace xgboost::obj::detail
diff --git a/src/objective/adaptive.cu b/src/objective/adaptive.cu
index 774149960..662b0330b 100644
--- a/src/objective/adaptive.cu
+++ b/src/objective/adaptive.cu
@@ -3,8 +3,8 @@
  */
 #include <thrust/sort.h>
 
-#include <cstdint>  // std::int32_t
-#include <cub/cub.cuh>
+#include <cstdint>                     // std::int32_t
+#include <cub/cub.cuh>                 // NOLINT
 
 #include "../common/cuda_context.cuh"  // CUDAContext
 #include "../common/device_helpers.cuh"
@@ -20,20 +20,19 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
                           HostDeviceVector<bst_node_t>* p_nidx, RegTree const& tree) {
   // copy position to buffer
   dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
+  auto cuctx = ctx->CUDACtx();
   size_t n_samples = position.size();
-  dh::XGBDeviceAllocator<char> alloc;
   dh::device_vector<bst_node_t> sorted_position(position.size());
   dh::safe_cuda(cudaMemcpyAsync(sorted_position.data().get(), position.data(),
-                                position.size_bytes(), cudaMemcpyDeviceToDevice));
+                                position.size_bytes(), cudaMemcpyDeviceToDevice, cuctx->Stream()));
 
   p_ridx->resize(position.size());
   dh::Iota(dh::ToSpan(*p_ridx));
   // sort row index according to node index
-  thrust::stable_sort_by_key(thrust::cuda::par(alloc), sorted_position.begin(),
+  thrust::stable_sort_by_key(cuctx->TP(), sorted_position.begin(),
                              sorted_position.begin() + n_samples, p_ridx->begin());
-  dh::XGBCachingDeviceAllocator<char> caching;
   size_t beg_pos =
-      thrust::find_if(thrust::cuda::par(caching), sorted_position.cbegin(), sorted_position.cend(),
+      thrust::find_if(cuctx->CTP(), sorted_position.cbegin(), sorted_position.cend(),
                       [] XGBOOST_DEVICE(bst_node_t nidx) { return nidx >= 0; }) -
       sorted_position.cbegin();
   if (beg_pos == sorted_position.size()) {
@@ -72,7 +71,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
   size_t* h_num_runs = reinterpret_cast<size_t*>(pinned.subspan(0, sizeof(size_t)).data());
 
   dh::CUDAEvent e;
-  e.Record(dh::DefaultStream());
+  e.Record(cuctx->Stream());
   copy_stream.View().Wait(e);
   // flag for whether there's ignored position
   bst_node_t* h_first_unique =
@@ -108,7 +107,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
       d_node_ptr[0] = beg_pos;
     }
   });
-  thrust::inclusive_scan(thrust::cuda::par(caching), dh::tbegin(d_node_ptr), dh::tend(d_node_ptr),
+  thrust::inclusive_scan(cuctx->CTP(), dh::tbegin(d_node_ptr), dh::tend(d_node_ptr),
                          dh::tbegin(d_node_ptr));
   copy_stream.View().Sync();
   CHECK_GT(*h_num_runs, 0);
@@ -141,7 +140,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
 }
 
 void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> position,
-                          std::int32_t group_idx, MetaInfo const& info,
+                          std::int32_t group_idx, MetaInfo const& info, float learning_rate,
                           HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
   dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
   dh::device_vector<size_t> ridx;
@@ -152,17 +151,17 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
 
   if (nptr.Empty()) {
     std::vector<float> quantiles;
-    UpdateLeafValues(&quantiles, nidx.ConstHostVector(), p_tree);
+    UpdateLeafValues(&quantiles, nidx.ConstHostVector(), learning_rate, p_tree);
   }
 
   HostDeviceVector<float> quantiles;
   predt.SetDevice(ctx->gpu_id);
 
-  auto d_predt = linalg::MakeTensorView(predt.ConstDeviceSpan(),
-                                        {info.num_row_, predt.Size() / info.num_row_}, ctx->gpu_id);
+  auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), info.num_row_,
+                                        predt.Size() / info.num_row_);
   CHECK_LT(group_idx, d_predt.Shape(1));
   auto t_predt = d_predt.Slice(linalg::All(), group_idx);
-  auto d_labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), group_idx);
+  auto d_labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), IdxY(info, group_idx));
 
   auto d_row_index = dh::ToSpan(ridx);
   auto seg_beg = nptr.DevicePointer();
@@ -187,7 +186,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
                                       w_it + d_weights.size(), &quantiles);
   }
 
-  UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), p_tree);
+  UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), learning_rate, p_tree);
 }
 }  // namespace detail
 }  // namespace obj
diff --git a/src/objective/adaptive.h b/src/objective/adaptive.h
index 10486c85c..fef920ec9 100644
--- a/src/objective/adaptive.h
+++ b/src/objective/adaptive.h
@@ -6,13 +6,15 @@
 #include <algorithm>
 #include <cstdint>  // std::int32_t
 #include <limits>
-#include <vector>
+#include <vector>  // std::vector
 
 #include "../collective/communicator-inl.h"
 #include "../common/common.h"
-#include "xgboost/context.h"
-#include "xgboost/host_device_vector.h"
-#include "xgboost/tree_model.h"
+#include "xgboost/base.h"                // bst_node_t
+#include "xgboost/context.h"             // Context
+#include "xgboost/data.h"                // MetaInfo
+#include "xgboost/host_device_vector.h"  // HostDeviceVector
+#include "xgboost/tree_model.h"          // RegTree
 
 namespace xgboost {
 namespace obj {
@@ -34,7 +36,7 @@ inline void FillMissingLeaf(std::vector<bst_node_t> const& maybe_missing,
 }
 
 inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_node_t> const& nidx,
-                             RegTree* p_tree) {
+                             float learning_rate, RegTree* p_tree) {
   auto& tree = *p_tree;
   auto& quantiles = *p_quantiles;
   auto const& h_node_idx = nidx;
@@ -69,17 +71,39 @@ inline void UpdateLeafValues(std::vector<float>* p_quantiles, std::vector<bst_no
     auto nidx = h_node_idx[i];
     auto q = quantiles[i];
     CHECK(tree[nidx].IsLeaf());
-    tree[nidx].SetLeaf(q);
+    tree[nidx].SetLeaf(q * learning_rate);
   }
 }
 
+inline std::size_t IdxY(MetaInfo const& info, bst_group_t group_idx) {  // label column to use
+  std::size_t y_idx{0};  // stays 0 unless the labels have more than one column
+  if (info.labels.Shape(1) > 1) {
+    y_idx = group_idx;
+  }
+  CHECK_LE(y_idx, info.labels.Shape(1));  // NOTE(review): LE admits y_idx == Shape(1); confirm
+  return y_idx;
+}
+
 void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> position,
-                          std::int32_t group_idx, MetaInfo const& info,
+                          std::int32_t group_idx, MetaInfo const& info, float learning_rate,
                           HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree);
 
 void UpdateTreeLeafHost(Context const* ctx, std::vector<bst_node_t> const& position,
-                        std::int32_t group_idx, MetaInfo const& info,
+                        std::int32_t group_idx, MetaInfo const& info, float learning_rate,
                         HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree);
 }  // namespace detail
+
+inline void UpdateTreeLeaf(Context const* ctx, HostDeviceVector<bst_node_t> const& position,
+                           std::int32_t group_idx, MetaInfo const& info, float learning_rate,
+                           HostDeviceVector<float> const& predt, float alpha, RegTree* p_tree) {
+  if (ctx->IsCPU()) {  // dispatch on the context: host implementation
+    detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate,
+                               predt, alpha, p_tree);
+  } else {  // otherwise the device implementation
+    position.SetDevice(ctx->gpu_id);  // set the device before taking a device span
+    detail::UpdateTreeLeafDevice(ctx, position.ConstDeviceSpan(), group_idx, info, learning_rate,
+                                 predt, alpha, p_tree);
+  }
+}
 }  // namespace obj
 }  // namespace xgboost
diff --git a/src/objective/init_estimation.cc b/src/objective/init_estimation.cc
new file mode 100644
index 000000000..96fd5d653
--- /dev/null
+++ b/src/objective/init_estimation.cc
@@ -0,0 +1,44 @@
+/**
+ * Copyright 2022-2023 by XGBoost contributors
+ */
+#include "init_estimation.h"
+
+#include <memory>                        // unique_ptr
+
+#include "../common/stats.h"             // Mean
+#include "../tree/fit_stump.h"           // FitStump
+#include "xgboost/base.h"                // GradientPair
+#include "xgboost/data.h"                // MetaInfo
+#include "xgboost/host_device_vector.h"  // HostDeviceVector
+#include "xgboost/json.h"                // Json
+#include "xgboost/linalg.h"              // Tensor,Vector
+#include "xgboost/task.h"                // ObjInfo
+
+namespace xgboost {
+namespace obj {
+void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector<float>* base_score) const {
+  if (this->Task().task == ObjInfo::kRegression) {  // shape checks only for regression tasks
+    CheckInitInputs(info);
+  }
+  // Avoid altering any state in child objective: use a fresh one rebuilt from the saved config.
+  HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->gpu_id);
+  HostDeviceVector<GradientPair> gpair(info.labels.Size(), GradientPair{}, this->ctx_->gpu_id);
+
+  Json config{Object{}};
+  this->SaveConfig(&config);
+
+  std::unique_ptr<ObjFunction> new_obj{
+      ObjFunction::Create(get<String const>(config["name"]), this->ctx_)};
+  new_obj->LoadConfig(config);
+  new_obj->GetGradient(dummy_predt, info, 0, &gpair);  // gradient at zero predictions
+  bst_target_t n_targets = this->Targets(info);
+  linalg::Vector<float> leaf_weight;
+  tree::FitStump(this->ctx_, gpair, n_targets, &leaf_weight);
+
+  // Workaround: we don't support multi-target due to binary model serialization for the
+  // base margin, so the per-target leaf weights are reduced with Mean.
+  common::Mean(this->ctx_, leaf_weight, base_score);
+  this->PredTransform(base_score->Data());
+}
+}  // namespace obj
+}  // namespace xgboost
diff --git a/src/objective/init_estimation.h b/src/objective/init_estimation.h
new file mode 100644
index 000000000..b0a91d8c3
--- /dev/null
+++ b/src/objective/init_estimation.h
@@ -0,0 +1,25 @@
+/**
+ * Copyright 2022-2023 by XGBoost contributors
+ */
+#ifndef XGBOOST_OBJECTIVE_INIT_ESTIMATION_H_
+#define XGBOOST_OBJECTIVE_INIT_ESTIMATION_H_
+#include "xgboost/data.h"       // MetaInfo
+#include "xgboost/linalg.h"     // Tensor
+#include "xgboost/objective.h"  // ObjFunction
+
+namespace xgboost {
+namespace obj {
+class FitIntercept : public ObjFunction {  // base class supplying a shared InitEstimation
+  void InitEstimation(MetaInfo const& info, linalg::Vector<float>* base_score) const override;
+};
+
+inline void CheckInitInputs(MetaInfo const& info) {  // validate label/weight sizes vs num_row_
+  CHECK_EQ(info.labels.Shape(0), info.num_row_) << "Invalid shape of labels.";
+  if (!info.weights_.Empty()) {  // weights are optional; checked only when present
+    CHECK_EQ(info.weights_.Size(), info.num_row_)
+        << "Number of weights should be equal to number of data points.";
+  }
+}
+}  // namespace obj
+}  // namespace xgboost
+#endif  // XGBOOST_OBJECTIVE_INIT_ESTIMATION_H_
diff --git a/src/objective/objective.cc b/src/objective/objective.cc
index 9512233dc..d3b01d80b 100644
--- a/src/objective/objective.cc
+++ b/src/objective/objective.cc
@@ -44,11 +44,13 @@ namespace obj {
 // List of files that will be force linked in static links.
 #ifdef XGBOOST_USE_CUDA
 DMLC_REGISTRY_LINK_TAG(regression_obj_gpu);
+DMLC_REGISTRY_LINK_TAG(quantile_obj_gpu);
 DMLC_REGISTRY_LINK_TAG(hinge_obj_gpu);
 DMLC_REGISTRY_LINK_TAG(multiclass_obj_gpu);
 DMLC_REGISTRY_LINK_TAG(rank_obj_gpu);
 #else
 DMLC_REGISTRY_LINK_TAG(regression_obj);
+DMLC_REGISTRY_LINK_TAG(quantile_obj);
 DMLC_REGISTRY_LINK_TAG(hinge_obj);
 DMLC_REGISTRY_LINK_TAG(multiclass_obj);
 DMLC_REGISTRY_LINK_TAG(rank_obj);
diff --git a/src/objective/quantile_obj.cc b/src/objective/quantile_obj.cc
new file mode 100644
index 000000000..89e2d6010
--- /dev/null
+++ b/src/objective/quantile_obj.cc
@@ -0,0 +1,18 @@
+/**
+ * Copyright 2023 by XGBoost Contributors
+ */
+
+// Dummy file to enable the CUDA conditional compile trick.
+
+#include <dmlc/registry.h>
+namespace xgboost {
+namespace obj {
+
+DMLC_REGISTRY_FILE_TAG(quantile_obj);
+
+}  // namespace obj
+}  // namespace xgboost
+
+#ifndef XGBOOST_USE_CUDA
+#include "quantile_obj.cu"
+#endif  // !defined(XGBOOST_USE_CUDA)
diff --git a/src/objective/quantile_obj.cu b/src/objective/quantile_obj.cu
new file mode 100644
index 000000000..0a40758bc
--- /dev/null
+++ b/src/objective/quantile_obj.cu
@@ -0,0 +1,222 @@
+/**
+ * Copyright 2023 by XGBoost contributors
+ */
+#include <cstddef>                          // std::size_t
+#include <cstdint>                          // std::int32_t
+#include <vector>                           // std::vector
+
+#include "../common/linalg_op.h"            // ElementWiseKernel,cbegin,cend
+#include "../common/quantile_loss_utils.h"  // QuantileLossParam
+#include "../common/stats.h"                // Quantile,WeightedQuantile
+#include "adaptive.h"                       // UpdateTreeLeaf
+#include "dmlc/parameter.h"                 // DMLC_DECLARE_PARAMETER
+#include "init_estimation.h"                // CheckInitInputs
+#include "xgboost/base.h"                   // GradientPair,XGBOOST_DEVICE,bst_target_t
+#include "xgboost/data.h"                   // MetaInfo
+#include "xgboost/host_device_vector.h"     // HostDeviceVector
+#include "xgboost/json.h"                   // Json,String,ToJson,FromJson
+#include "xgboost/linalg.h"                 // Tensor,MakeTensorView,MakeVec
+#include "xgboost/objective.h"              // ObjFunction
+#include "xgboost/parameter.h"              // XGBoostParameter
+
+#if defined(XGBOOST_USE_CUDA)
+
+#include "../common/linalg_op.cuh"  // ElementWiseKernel
+#include "../common/stats.cuh"      // SegmentedQuantile
+
+#endif                              // defined(XGBOOST_USE_CUDA)
+
+namespace xgboost {
+namespace obj {
+class QuantileRegression : public ObjFunction {
+  common::QuantileLossParam param_;
+  HostDeviceVector<float> alpha_;
+
+  bst_target_t Targets(MetaInfo const& info) const override {  // outputs = n_alphas * n_y
+    auto const& alpha = param_.quantile_alpha.Get();
+    CHECK_EQ(alpha.size(), alpha_.Size()) << "The objective is not yet configured.";
+    CHECK_EQ(info.labels.Shape(1), 1) << "Multi-target is not yet supported by the quantile loss.";
+    CHECK(!alpha.empty());
+    // We have some placeholders for multi-target in the quantile loss. But it's not
+    // supported as the gbtree doesn't know how to slice the gradient and there's no 3-dim
+    // model shape in general.
+    auto n_y = std::max(static_cast<std::size_t>(1), info.labels.Shape(1));
+    return alpha_.Size() * n_y;  // n_y is 1 here given the Shape(1) check above
+  }
+
+ public:
+  void GetGradient(HostDeviceVector<float> const& preds, const MetaInfo& info, std::int32_t iter,
+                   HostDeviceVector<GradientPair>* out_gpair) override {
+    if (iter == 0) {  // validate inputs once, on the first iteration
+      CheckInitInputs(info);
+    }
+    CHECK_EQ(param_.quantile_alpha.Get().size(), alpha_.Size());
+
+    using SizeT = decltype(info.num_row_);
+    SizeT n_targets = this->Targets(info);
+    SizeT n_alphas = alpha_.Size();
+    CHECK_NE(n_alphas, 0);
+    CHECK_GE(n_targets, n_alphas);
+    CHECK_EQ(preds.Size(), info.num_row_ * n_targets);
+
+    auto labels = info.labels.View(ctx_->gpu_id);
+
+    out_gpair->SetDevice(ctx_->gpu_id);
+    out_gpair->Resize(n_targets * info.num_row_);
+    auto gpair =  // 3-dim view: (num_row_, n_alphas, n_targets / n_alphas)
+        linalg::MakeTensorView(ctx_, out_gpair, info.num_row_, n_alphas, n_targets / n_alphas);
+
+    info.weights_.SetDevice(ctx_->gpu_id);
+    common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
+                                                 : info.weights_.ConstDeviceSpan()};
+
+    preds.SetDevice(ctx_->gpu_id);
+    auto predt = linalg::MakeVec(&preds);
+    auto n_samples = info.num_row_;
+
+    alpha_.SetDevice(ctx_->gpu_id);
+    auto alpha = ctx_->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();
+
+    linalg::ElementWiseKernel(
+        ctx_, gpair, [=] XGBOOST_DEVICE(std::size_t i, GradientPair const&) mutable {
+          auto [sample_id, quantile_id, target_id] =
+              linalg::UnravelIndex(i, n_samples, alpha.size(), n_targets / alpha.size());
+
+          auto d = predt(i) - labels(sample_id, target_id);  // prediction minus label
+          auto h = weight[sample_id];  // the sample weight is used as the second pair field
+          if (d >= 0) {  // prediction >= label: g = (1 - alpha) * w
+            auto g = (1.0f - alpha[quantile_id]) * weight[sample_id];
+            gpair(sample_id, quantile_id, target_id) = GradientPair{g, h};
+          } else {  // prediction < label: g = -alpha * w
+            auto g = (-alpha[quantile_id] * weight[sample_id]);
+            gpair(sample_id, quantile_id, target_id) = GradientPair{g, h};
+          }
+        });
+  }
+
+  void InitEstimation(MetaInfo const& info, linalg::Vector<float>* base_score) const override {
+    CHECK(!alpha_.Empty());  // requires Configure/LoadConfig to have populated alpha_
+
+    auto n_targets = this->Targets(info);
+    base_score->SetDevice(ctx_->gpu_id);
+    base_score->Reshape(n_targets);
+
+    double sw{0};  // local sum of sample weights; num_row_ when there are no weights
+    if (ctx_->IsCPU()) {
+      auto quantiles = base_score->HostView();
+      auto const& h_weights = info.weights_.ConstHostVector();  // reference: avoid a copy
+      if (info.weights_.Empty()) {
+        sw = info.num_row_;
+      } else {
+        sw = std::accumulate(std::cbegin(h_weights), std::cend(h_weights), 0.0);
+      }
+      for (bst_target_t t{0}; t < n_targets; ++t) {  // one quantile of the labels per alpha
+        auto alpha = param_.quantile_alpha[t];
+        auto h_labels = info.labels.HostView();
+        if (h_weights.empty()) {
+          quantiles(t) =
+              common::Quantile(ctx_, alpha, linalg::cbegin(h_labels), linalg::cend(h_labels));
+        } else {
+          CHECK_EQ(h_weights.size(), h_labels.Size());
+          quantiles(t) = common::WeightedQuantile(ctx_, alpha, linalg::cbegin(h_labels),
+                                                  linalg::cend(h_labels), std::cbegin(h_weights));
+        }
+      }
+    } else {
+#if defined(XGBOOST_USE_CUDA)
+      alpha_.SetDevice(ctx_->gpu_id);
+      auto d_alpha = alpha_.ConstDeviceSpan();
+      auto d_labels = info.labels.View(ctx_->gpu_id);
+      auto seg_it = dh::MakeTransformIterator<std::size_t>(  // segment i starts at i * Shape(0)
+          thrust::make_counting_iterator(0ul),
+          [=] XGBOOST_DEVICE(std::size_t i) { return i * d_labels.Shape(0); });
+      CHECK_EQ(d_labels.Shape(1), 1);
+      auto val_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
+                                                     [=] XGBOOST_DEVICE(std::size_t i) {
+                                                       auto sample_idx = i % d_labels.Shape(0);
+                                                       return d_labels(sample_idx, 0);
+                                                     });
+      auto n = d_labels.Size() * d_alpha.size();  // labels are repeated once per alpha
+      CHECK_EQ(base_score->Size(), d_alpha.size());
+      if (info.weights_.Empty()) {
+        common::SegmentedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1, val_it,
+                                  val_it + n, base_score->Data());
+        sw = info.num_row_;
+      } else {
+        info.weights_.SetDevice(ctx_->gpu_id);
+        auto d_weights = info.weights_.ConstDeviceSpan();
+        auto weight_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
+                                                          [=] XGBOOST_DEVICE(std::size_t i) {
+                                                            auto sample_idx = i % d_labels.Shape(0);
+                                                            return d_weights[sample_idx];
+                                                          });
+        common::SegmentedWeightedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1,
+                                          val_it, val_it + n, weight_it, weight_it + n,
+                                          base_score->Data());
+        sw = dh::Reduce(ctx_->CUDACtx()->CTP(), dh::tcbegin(d_weights), dh::tcend(d_weights), 0.0,
+                        thrust::plus<double>{});
+      }
+#else
+      common::AssertGPUSupport();
+#endif  // defined(XGBOOST_USE_CUDA)
+    }
+
+    // For multiple quantiles, we should extend the base score to a vector instead of
+    // computing the average. For now, this is a workaround.
+    linalg::Vector<float> temp;
+    common::Mean(ctx_, *base_score, &temp);
+    double meanq = temp(0) * sw;  // weight the local mean by the local sum of weights
+
+    collective::Allreduce<collective::Operation::kSum>(&meanq, 1);  // sum across workers
+    collective::Allreduce<collective::Operation::kSum>(&sw, 1);
+    meanq /= (sw + kRtEps);  // NOTE(review): kRtEps presumably avoids division by zero
+    base_score->Reshape(1);
+    base_score->Data()->Fill(meanq);
+  }
+
+  void UpdateTreeLeaf(HostDeviceVector<bst_node_t> const& position, MetaInfo const& info,
+                      float learning_rate, HostDeviceVector<float> const& prediction,
+                      std::int32_t group_idx, RegTree* p_tree) const override {
+    auto alpha = param_.quantile_alpha[group_idx];  // quantile level for this output group
+    ::xgboost::obj::UpdateTreeLeaf(ctx_, position, group_idx, info, learning_rate, prediction,
+                                   alpha, p_tree);
+  }
+
+  void Configure(Args const& args) override {
+    param_.UpdateAllowUnknown(args);
+    param_.Validate();
+    this->alpha_.HostVector() = param_.quantile_alpha.Get();  // copy alphas into alpha_
+  }
+  ObjInfo Task() const override { return {ObjInfo::kRegression, true, true}; }
+  static char const* Name() { return "reg:quantileerror"; }
+
+  void SaveConfig(Json* p_out) const override {
+    auto& out = *p_out;
+    out["name"] = String(Name());  // LoadConfig checks this value on read
+    out["quantile_loss_param"] = ToJson(param_);
+  }
+  void LoadConfig(Json const& in) override {
+    CHECK_EQ(get<String const>(in["name"]), Name());  // reject a config saved by another objective
+    FromJson(in["quantile_loss_param"], &param_);
+    alpha_.HostVector() = param_.quantile_alpha.Get();  // keep alpha_ in sync with param_
+  }
+
+  const char* DefaultEvalMetric() const override { return "quantile"; }
+  Json DefaultMetricConfig() const override {
+    CHECK(param_.GetInitialised());  // the parameters must have been set before this is called
+    Json config{Object{}};
+    config["name"] = String{this->DefaultEvalMetric()};
+    config["quantile_loss_param"] = ToJson(param_);  // hand the objective's parameters along
+    return config;
+  }
+};
+
+XGBOOST_REGISTER_OBJECTIVE(QuantileRegression, QuantileRegression::Name())
+    .describe("Regression with quantile loss.")
+    .set_body([]() { return new QuantileRegression(); });
+
+#if defined(XGBOOST_USE_CUDA)
+DMLC_REGISTRY_FILE_TAG(quantile_obj_gpu);
+#endif  // defined(XGBOOST_USE_CUDA)
+}  // namespace obj
+}  // namespace xgboost
diff --git a/src/objective/regression_loss.h b/src/objective/regression_loss.h
index 1fd1621af..1ef7106cf 100644
--- a/src/objective/regression_loss.h
+++ b/src/objective/regression_loss.h
@@ -1,15 +1,16 @@
-/*!
- * Copyright 2017-2022 XGBoost contributors
+/**
+ * Copyright 2017-2023 by XGBoost contributors
  */
 #ifndef XGBOOST_OBJECTIVE_REGRESSION_LOSS_H_
 #define XGBOOST_OBJECTIVE_REGRESSION_LOSS_H_
 
 #include <dmlc/omp.h>
-#include <xgboost/logging.h>
 
 #include <cmath>
 
 #include "../common/math.h"
+#include "xgboost/data.h"  // MetaInfo
+#include "xgboost/logging.h"
 #include "xgboost/task.h"  // ObjInfo
 
 namespace xgboost {
@@ -105,7 +106,6 @@ struct LogisticRaw : public LogisticRegression {
 
   static ObjInfo Info() { return ObjInfo::kRegression; }
 };
-
 }  // namespace obj
 }  // namespace xgboost
 
diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu
index 7a0df336a..d7999f8c1 100644
--- a/src/objective/regression_obj.cu
+++ b/src/objective/regression_obj.cu
@@ -20,12 +20,12 @@
 #include "../common/stats.h"
 #include "../common/threading_utils.h"
 #include "../common/transform.h"
-#include "../tree/fit_stump.h"  // FitStump
 #include "./regression_loss.h"
 #include "adaptive.h"
+#include "init_estimation.h"  // FitIntercept
 #include "xgboost/base.h"
-#include "xgboost/context.h"
-#include "xgboost/data.h"  // MetaInfo
+#include "xgboost/context.h"  // Context
+#include "xgboost/data.h"     // MetaInfo
 #include "xgboost/host_device_vector.h"
 #include "xgboost/json.h"
 #include "xgboost/linalg.h"
@@ -43,45 +43,12 @@
 namespace xgboost {
 namespace obj {
 namespace {
-void CheckInitInputs(MetaInfo const& info) {
-  CHECK_EQ(info.labels.Shape(0), info.num_row_) << "Invalid shape of labels.";
-  if (!info.weights_.Empty()) {
-    CHECK_EQ(info.weights_.Size(), info.num_row_)
-        << "Number of weights should be equal to number of data points.";
-  }
-}
-
 void CheckRegInputs(MetaInfo const& info, HostDeviceVector<bst_float> const& preds) {
   CheckInitInputs(info);
   CHECK_EQ(info.labels.Size(), preds.Size()) << "Invalid shape of labels.";
 }
 }  // anonymous namespace
 
-class RegInitEstimation : public ObjFunction {
-  void InitEstimation(MetaInfo const& info, linalg::Tensor<float, 1>* base_score) const override {
-    CheckInitInputs(info);
-    // Avoid altering any state in child objective.
-    HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->gpu_id);
-    HostDeviceVector<GradientPair> gpair(info.labels.Size(), GradientPair{}, this->ctx_->gpu_id);
-
-    Json config{Object{}};
-    this->SaveConfig(&config);
-
-    std::unique_ptr<ObjFunction> new_obj{
-        ObjFunction::Create(get<String const>(config["name"]), this->ctx_)};
-    new_obj->LoadConfig(config);
-    new_obj->GetGradient(dummy_predt, info, 0, &gpair);
-    bst_target_t n_targets = this->Targets(info);
-    linalg::Vector<float> leaf_weight;
-    tree::FitStump(this->ctx_, gpair, n_targets, &leaf_weight);
-
-    // workaround, we don't support multi-target due to binary model serialization for
-    // base margin.
-    common::Mean(this->ctx_, leaf_weight, base_score);
-    this->PredTransform(base_score->Data());
-  }
-};
-
 #if defined(XGBOOST_USE_CUDA)
 DMLC_REGISTRY_FILE_TAG(regression_obj_gpu);
 #endif  // defined(XGBOOST_USE_CUDA)
@@ -96,7 +63,7 @@ struct RegLossParam : public XGBoostParameter<RegLossParam> {
 };
 
 template<typename Loss>
-class RegLossObj : public RegInitEstimation {
+class RegLossObj : public FitIntercept {
  protected:
   HostDeviceVector<float> additional_input_;
 
@@ -243,7 +210,7 @@ XGBOOST_REGISTER_OBJECTIVE(LinearRegression, "reg:linear")
     return new RegLossObj<LinearSquareLoss>(); });
 // End deprecated
 
-class PseudoHuberRegression : public RegInitEstimation {
+class PseudoHuberRegression : public FitIntercept {
   PesudoHuberParam param_;
 
  public:
@@ -318,7 +285,7 @@ struct PoissonRegressionParam : public XGBoostParameter<PoissonRegressionParam>
 };
 
 // poisson regression for count
-class PoissonRegression : public RegInitEstimation {
+class PoissonRegression : public FitIntercept {
  public:
   // declare functions
   void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
@@ -413,7 +380,7 @@ XGBOOST_REGISTER_OBJECTIVE(PoissonRegression, "count:poisson")
 
 
 // cox regression for survival data (negative values mean they are censored)
-class CoxRegression : public RegInitEstimation {
+class CoxRegression : public FitIntercept {
  public:
   void Configure(Args const&) override {}
   ObjInfo Task() const override { return ObjInfo::kRegression; }
@@ -426,7 +393,7 @@ class CoxRegression : public RegInitEstimation {
     const auto& preds_h = preds.HostVector();
     out_gpair->Resize(preds_h.size());
     auto& gpair = out_gpair->HostVector();
-    const std::vector<size_t> &label_order = info.LabelAbsSort();
+    const std::vector<size_t> &label_order = info.LabelAbsSort(ctx_);
 
     const omp_ulong ndata = static_cast<omp_ulong>(preds_h.size()); // NOLINT(*)
     const bool is_null_weight = info.weights_.Size() == 0;
@@ -510,7 +477,7 @@ XGBOOST_REGISTER_OBJECTIVE(CoxRegression, "survival:cox")
 .set_body([]() { return new CoxRegression(); });
 
 // gamma regression
-class GammaRegression : public RegInitEstimation {
+class GammaRegression : public FitIntercept {
  public:
   void Configure(Args const&) override {}
   ObjInfo Task() const override { return ObjInfo::kRegression; }
@@ -601,7 +568,7 @@ struct TweedieRegressionParam : public XGBoostParameter<TweedieRegressionParam>
 };
 
 // tweedie regression
-class TweedieRegression : public RegInitEstimation {
+class TweedieRegression : public FitIntercept {
  public:
   // declare functions
   void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
@@ -775,20 +742,10 @@ class MeanAbsoluteError : public ObjFunction {
   }
 
   void UpdateTreeLeaf(HostDeviceVector<bst_node_t> const& position, MetaInfo const& info,
-                      HostDeviceVector<float> const& prediction, std::int32_t group_idx,
-                      RegTree* p_tree) const override {
-    if (ctx_->IsCPU()) {
-      auto const& h_position = position.ConstHostVector();
-      detail::UpdateTreeLeafHost(ctx_, h_position, group_idx, info, prediction, 0.5, p_tree);
-    } else {
-#if defined(XGBOOST_USE_CUDA)
-      position.SetDevice(ctx_->gpu_id);
-      auto d_position = position.ConstDeviceSpan();
-      detail::UpdateTreeLeafDevice(ctx_, d_position, group_idx, info, prediction, 0.5, p_tree);
-#else
-      common::AssertGPUSupport();
-#endif  //  defined(XGBOOST_USE_CUDA)
-    }
+                      float learning_rate, HostDeviceVector<float> const& prediction,
+                      std::int32_t group_idx, RegTree* p_tree) const override {
+    ::xgboost::obj::UpdateTreeLeaf(ctx_, position, group_idx, info, learning_rate, prediction, 0.5,
+                                   p_tree);
   }
 
   const char* DefaultEvalMetric() const override { return "mae"; }
diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc
index 2f578fae7..288dc5fb0 100644
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -164,7 +164,7 @@ struct GHistIndexMatrixView {
   SparsePage::Inst operator[](size_t r) {
     auto t = omp_get_thread_num();
     auto const beg = (n_features_ * kUnroll * t) + (current_unroll_[t] * n_features_);
-    size_t non_missing{beg};
+    size_t non_missing{static_cast<std::size_t>(beg)};
 
     for (bst_feature_t c = 0; c < n_features_; ++c) {
       float f = page_.GetFvalue(r, c, common::IsCat(ft_, c));
@@ -477,7 +477,8 @@ class ColumnSplitHelper {
     // auto block_id has the same type as `n_blocks`.
     common::ParallelFor(n_blocks, n_threads_, [&](auto block_id) {
       auto const batch_offset = block_id * block_of_rows_size;
-      auto const block_size = std::min(nsize - batch_offset, block_of_rows_size);
+      auto const block_size = std::min(static_cast<std::size_t>(nsize - batch_offset),
+                                       static_cast<std::size_t>(block_of_rows_size));
       auto const fvec_offset = omp_get_thread_num() * block_of_rows_size;
 
       FVecFill(block_size, batch_offset, num_feature, &batch, fvec_offset, &feat_vecs_);
@@ -490,7 +491,8 @@ class ColumnSplitHelper {
     // auto block_id has the same type as `n_blocks`.
     common::ParallelFor(n_blocks, n_threads_, [&](auto block_id) {
       auto const batch_offset = block_id * block_of_rows_size;
-      auto const block_size = std::min(nsize - batch_offset, block_of_rows_size);
+      auto const block_size = std::min(static_cast<std::size_t>(nsize - batch_offset),
+                                       static_cast<std::size_t>(block_of_rows_size));
       PredictAllTrees(out_preds, batch_offset, batch_offset + batch.base_rowid, num_group,
                       block_size);
     });
@@ -584,7 +586,7 @@ class CPUPredictor : public Predictor {
 
   void PredictDMatrix(DMatrix *p_fmat, std::vector<bst_float> *out_preds,
                       gbm::GBTreeModel const &model, int32_t tree_begin, int32_t tree_end) const {
-    if (p_fmat->Info().data_split_mode == DataSplitMode::kCol) {
+    if (p_fmat->IsColumnSplit()) {
       ColumnSplitHelper helper(this->ctx_->Threads(), model, tree_begin, tree_end);
       helper.PredictDMatrix(p_fmat, out_preds);
       return;
diff --git a/src/predictor/cpu_treeshap.cc b/src/predictor/cpu_treeshap.cc
index 87da2612c..64b195d78 100644
--- a/src/predictor/cpu_treeshap.cc
+++ b/src/predictor/cpu_treeshap.cc
@@ -3,10 +3,11 @@
  */
 #include "cpu_treeshap.h"
 
-#include <cinttypes>  // std::uint32_t
+#include <algorithm>             // copy
+#include <cinttypes>             // std::uint32_t
 
-#include "predict_fn.h"    // GetNextNode
-#include "xgboost/base.h"  // bst_node_t
+#include "predict_fn.h"          // GetNextNode
+#include "xgboost/base.h"        // bst_node_t
 #include "xgboost/logging.h"
 #include "xgboost/tree_model.h"  // RegTree
 
diff --git a/src/predictor/cpu_treeshap.h b/src/predictor/cpu_treeshap.h
index 702b3d571..3cdbcc4a9 100644
--- a/src/predictor/cpu_treeshap.h
+++ b/src/predictor/cpu_treeshap.h
@@ -1,6 +1,10 @@
+#ifndef XGBOOST_PREDICTOR_CPU_TREESHAP_H_
+#define XGBOOST_PREDICTOR_CPU_TREESHAP_H_
 /**
  * Copyright by XGBoost Contributors 2017-2022
  */
+#include <vector>                // vector
+
 #include "xgboost/tree_model.h"  // RegTree
 
 namespace xgboost {
@@ -15,3 +19,4 @@ void CalculateContributions(RegTree const &tree, const RegTree::FVec &feat,
                             std::vector<float> *mean_values, bst_float *out_contribs, int condition,
                             unsigned condition_feature);
 }  // namespace xgboost
+#endif  // XGBOOST_PREDICTOR_CPU_TREESHAP_H_
diff --git a/src/tree/common_row_partitioner.h b/src/tree/common_row_partitioner.h
index a5f4aac2d..3a46a168a 100644
--- a/src/tree/common_row_partitioner.h
+++ b/src/tree/common_row_partitioner.h
@@ -9,6 +9,7 @@
 #include <limits>  // std::numeric_limits
 #include <vector>
 
+#include "../collective/communicator-inl.h"
 #include "../common/numeric.h"  // Iota
 #include "../common/partition_builder.h"
 #include "hist/expand_entry.h"  // CPUExpandEntry
@@ -16,17 +17,73 @@
 
 namespace xgboost {
 namespace tree {
-class CommonRowPartitioner {
-  static constexpr size_t kPartitionBlockSize = 2048;
-  common::PartitionBuilder<kPartitionBlockSize> partition_builder_;
-  common::RowSetCollection row_set_collection_;
 
+static constexpr size_t kPartitionBlockSize = 2048;
+
+/** Partitions rows when data is split by column, so no worker sees every feature value. */
+class ColumnSplitHelper {
+ public:
+  ColumnSplitHelper() = default;
+  /** Stores the two pointers without taking ownership; both are dereferenced in Partition(). */
+  ColumnSplitHelper(bst_row_t num_row,
+                    common::PartitionBuilder<kPartitionBlockSize>* partition_builder,
+                    common::RowSetCollection* row_set_collection)
+      : partition_builder_{partition_builder}, row_set_collection_{row_set_collection} {
+    decision_storage_.resize(num_row);
+    decision_bits_ = BitVector(common::Span<BitVector::value_type>(decision_storage_));
+    missing_storage_.resize(num_row);
+    missing_bits_ = BitVector(common::Span<BitVector::value_type>(missing_storage_));
+  }
+
+  /** Masks rows locally, merges the masks across workers, then partitions rows by the result. */
+  void Partition(common::BlockedSpace2d const& space, std::int32_t n_threads,
+                 GHistIndexMatrix const& gmat, common::ColumnMatrix const& column_matrix,
+                 std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) {
+    // When data is split by column, we don't have all the feature values in the local worker, so
+    // we first collect all the decisions and whether the feature is missing into bit vectors.
+    std::fill(decision_storage_.begin(), decision_storage_.end(), 0);
+    std::fill(missing_storage_.begin(), missing_storage_.end(), 0);
+    common::ParallelFor2d(space, n_threads, [&](size_t node_in_set, common::Range1d r) {
+      const int32_t nid = nodes[node_in_set].nid;
+      partition_builder_->MaskRows(node_in_set, nodes, r, gmat, column_matrix, *p_tree,
+                                   (*row_set_collection_)[nid].begin, &decision_bits_,
+                                   &missing_bits_);
+    });
+
+    // Then aggregate the bit vectors across all the workers.
+    collective::Allreduce<collective::Operation::kBitwiseOR>(decision_storage_.data(),
+                                                             decision_storage_.size());
+    collective::Allreduce<collective::Operation::kBitwiseAND>(missing_storage_.data(),
+                                                              missing_storage_.size());
+
+    // Finally use the bit vectors to partition the rows.
+    common::ParallelFor2d(space, n_threads, [&](size_t node_in_set, common::Range1d r) {
+      const int32_t nid = nodes[node_in_set].nid;
+      partition_builder_->AllocateForTask(partition_builder_->GetTaskIdx(node_in_set, r.begin()));
+      partition_builder_->PartitionByMask(node_in_set, nodes, r, gmat, column_matrix, *p_tree,
+                                          (*row_set_collection_)[nid].begin, decision_bits_,
+                                          missing_bits_);
+    });
+  }
+
+ private:
+  using BitVector = RBitField8;
+  std::vector<BitVector::value_type> decision_storage_{};
+  BitVector decision_bits_{};
+  std::vector<BitVector::value_type> missing_storage_{};
+  BitVector missing_bits_{};
+  common::PartitionBuilder<kPartitionBlockSize>* partition_builder_{nullptr};  // not owned
+  common::RowSetCollection* row_set_collection_{nullptr};                      // not owned
+};
+
+class CommonRowPartitioner {
  public:
   bst_row_t base_rowid = 0;
 
   CommonRowPartitioner() = default;
-  CommonRowPartitioner(Context const* ctx, bst_row_t num_row, bst_row_t _base_rowid)
-      : base_rowid{_base_rowid} {
+  CommonRowPartitioner(Context const* ctx, bst_row_t num_row, bst_row_t _base_rowid,
+                       bool is_col_split)
+      : base_rowid{_base_rowid}, is_col_split_{is_col_split} {
     row_set_collection_.Clear();
     std::vector<size_t>& row_indices = *row_set_collection_.Data();
     row_indices.resize(num_row);
@@ -34,6 +91,10 @@ class CommonRowPartitioner {
     std::size_t* p_row_indices = row_indices.data();
     common::Iota(ctx, p_row_indices, p_row_indices + row_indices.size(), base_rowid);
     row_set_collection_.Init();
+
+    if (is_col_split_) {
+      column_split_helper_ = ColumnSplitHelper{num_row, &partition_builder_, &row_set_collection_};
+    }
   }
 
   void FindSplitConditions(const std::vector<CPUExpandEntry>& nodes, const RegTree& tree,
@@ -156,16 +217,20 @@ class CommonRowPartitioner {
 
     // 2.3 Split elements of row_set_collection_ to left and right child-nodes for each node
     // Store results in intermediate buffers from partition_builder_
-    common::ParallelFor2d(space, ctx->Threads(), [&](size_t node_in_set, common::Range1d r) {
-      size_t begin = r.begin();
-      const int32_t nid = nodes[node_in_set].nid;
-      const size_t task_id = partition_builder_.GetTaskIdx(node_in_set, begin);
-      partition_builder_.AllocateForTask(task_id);
-      bst_bin_t split_cond = column_matrix.IsInitialized() ? split_conditions[node_in_set] : 0;
-      partition_builder_.template Partition<BinIdxType, any_missing, any_cat>(
-          node_in_set, nodes, r, split_cond, gmat, column_matrix, *p_tree,
-          row_set_collection_[nid].begin);
-    });
+    if (is_col_split_) {
+      column_split_helper_.Partition(space, ctx->Threads(), gmat, column_matrix, nodes, p_tree);
+    } else {
+      common::ParallelFor2d(space, ctx->Threads(), [&](size_t node_in_set, common::Range1d r) {
+        size_t begin = r.begin();
+        const int32_t nid = nodes[node_in_set].nid;
+        const size_t task_id = partition_builder_.GetTaskIdx(node_in_set, begin);
+        partition_builder_.AllocateForTask(task_id);
+        bst_bin_t split_cond = column_matrix.IsInitialized() ? split_conditions[node_in_set] : 0;
+        partition_builder_.template Partition<BinIdxType, any_missing, any_cat>(
+            node_in_set, nodes, r, split_cond, gmat, column_matrix, *p_tree,
+            row_set_collection_[nid].begin);
+      });
+    }
 
     // 3. Compute offsets to copy blocks of row-indexes
     // from partition_builder_ to row_set_collection_
@@ -205,6 +270,12 @@ class CommonRowPartitioner {
         ctx, tree, this->Partitions(), p_out_position,
         [&](size_t idx) -> bool { return gpair[idx].GetHess() - .0f == .0f; });
   }
+
+ private:
+  common::PartitionBuilder<kPartitionBlockSize> partition_builder_;
+  common::RowSetCollection row_set_collection_;
+  bool is_col_split_{false};  // stays false for a default-constructed partitioner
+  ColumnSplitHelper column_split_helper_;
 };
 
 }  // namespace tree
diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu
index 781fff92a..c48c8ddf3 100644
--- a/src/tree/gpu_hist/evaluate_splits.cu
+++ b/src/tree/gpu_hist/evaluate_splits.cu
@@ -97,7 +97,7 @@ class EvaluateSplitAgent {
          idx += kBlockSize) {
       local_sum += LoadGpair(node_histogram + idx);
     }
-    local_sum = SumReduceT(temp_storage->sum_reduce).Sum(local_sum);
+    local_sum = SumReduceT(temp_storage->sum_reduce).Sum(local_sum);  // NOLINT
     // Broadcast result from thread 0
     return {__shfl_sync(0xffffffff, local_sum.GetQuantisedGrad(), 0),
             __shfl_sync(0xffffffff, local_sum.GetQuantisedHess(), 0)};
@@ -359,8 +359,8 @@ void GPUHistEvaluator::LaunchEvaluateSplits(
 
   // One block for each feature
   uint32_t constexpr kBlockThreads = 32;
-  dh::LaunchKernel{static_cast<uint32_t>(combined_num_features), kBlockThreads,
-                   0}(
+  dh::LaunchKernel {static_cast<uint32_t>(combined_num_features), kBlockThreads,
+                    0}(
       EvaluateSplitsKernel<kBlockThreads>, max_active_features, d_inputs,
       shared_inputs,
       this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()),
diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu
index 1344ecf4f..489c8d6f7 100644
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@@ -1,15 +1,15 @@
-/*!
- * Copyright 2020-2021 by XGBoost Contributors
+/**
+ * Copyright 2020-2023 by XGBoost Contributors
  */
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/reduce.h>
 
 #include <algorithm>
-#include <ctgmath>
+#include <cstdint>  // uint32_t
 #include <limits>
 
-#include "../../common/device_helpers.cuh"
 #include "../../common/deterministic.cuh"
+#include "../../common/device_helpers.cuh"
 #include "../../data/ellpack_page.cuh"
 #include "histogram.cuh"
 #include "row_partitioner.cuh"
@@ -83,7 +83,8 @@ GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair) {
    */
   to_floating_point_ =
       histogram_rounding /
-      T(IntT(1) << (sizeof(typename GradientSumT::ValueT) * 8 - 2));  // keep 1 for sign bit
+      static_cast<T>(static_cast<IntT>(1)
+                     << (sizeof(typename GradientSumT::ValueT) * 8 - 2));  // keep 1 for sign bit
   /**
    * Factor for converting gradients from floating-point to fixed-point. For
    * f64:
@@ -93,8 +94,8 @@ GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair) {
    * rounding is calcuated as exp(m), see the rounding factor calcuation for
    * details.
    */
-  to_fixed_point_ =
-      GradientSumT(T(1) / to_floating_point_.GetGrad(), T(1) / to_floating_point_.GetHess());
+  to_fixed_point_ = GradientSumT(static_cast<T>(1) / to_floating_point_.GetGrad(),
+                                 static_cast<T>(1) / to_floating_point_.GetHess());
 }
 
 
@@ -153,7 +154,8 @@ class HistogramAgent {
         d_gpair_(d_gpair) {}
   __device__ void ProcessPartialTileShared(std::size_t offset) {
     for (std::size_t idx = offset + threadIdx.x;
-         idx < min(offset + kBlockThreads * kItemsPerTile, n_elements_); idx += kBlockThreads) {
+         idx < std::min(offset + kBlockThreads * kItemsPerTile, n_elements_);
+         idx += kBlockThreads) {
       int ridx = d_ridx_[idx / feature_stride_];
       int gidx =
           matrix_
@@ -295,11 +297,10 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
 
     // Allocate number of blocks such that each block has about kMinItemsPerBlock work
     // Up to a maximum where the device is saturated
-    grid_size =
-        min(grid_size,
-            unsigned(common::DivRoundUp(items_per_group, kMinItemsPerBlock)));
+    grid_size = std::min(grid_size, static_cast<std::uint32_t>(
+                                        common::DivRoundUp(items_per_group, kMinItemsPerBlock)));
 
-    dh::LaunchKernel{dim3(grid_size, num_groups), static_cast<uint32_t>(kBlockThreads), smem_size,
+    dh::LaunchKernel {dim3(grid_size, num_groups), static_cast<uint32_t>(kBlockThreads), smem_size,
                      ctx->Stream()} (kernel, matrix, feature_groups, d_ridx, histogram.data(),
                                      gpair.data(), rounding);
   };
diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh
index a2519ae6f..f1c420ba0 100644
--- a/src/tree/gpu_hist/row_partitioner.cuh
+++ b/src/tree/gpu_hist/row_partitioner.cuh
@@ -130,7 +130,7 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
         std::size_t item_idx;
         AssignBatch(batch_info_itr, idx, &batch_idx, &item_idx);
         auto op_res = op(ridx[item_idx], batch_info_itr[batch_idx].data);
-        return IndexFlagTuple{bst_uint(item_idx), op_res, batch_idx, op_res};
+        return IndexFlagTuple{static_cast<bst_uint>(item_idx), op_res, batch_idx, op_res};
       });
   size_t temp_bytes = 0;
   if (tmp->empty()) {
diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h
index f76565e9a..31a61fb9d 100644
--- a/src/tree/hist/evaluate_splits.h
+++ b/src/tree/hist/evaluate_splits.h
@@ -1,10 +1,11 @@
-/*!
- * Copyright 2021-2022 by XGBoost Contributors
+/**
+ * Copyright 2021-2023 by XGBoost Contributors
  */
 #ifndef XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
 #define XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
 
 #include <algorithm>
+#include <cstddef>  // for size_t
 #include <limits>
 #include <memory>
 #include <numeric>
@@ -16,13 +17,11 @@
 #include "../../common/random.h"
 #include "../../data/gradient_index.h"
 #include "../constraints.h"
-#include "../param.h"
+#include "../param.h"  // for TrainParam
 #include "../split_evaluator.h"
 #include "xgboost/context.h"
 
-namespace xgboost {
-namespace tree {
-
+namespace xgboost::tree {
 template <typename ExpandEntry>
 class HistEvaluator {
  private:
@@ -34,10 +33,11 @@ class HistEvaluator {
   };
 
  private:
-  TrainParam param_;
+  Context const* ctx_;
+  TrainParam const* param_;
   std::shared_ptr<common::ColumnSampler> column_sampler_;
   TreeEvaluator tree_evaluator_;
-  int32_t n_threads_ {0};
+  bool is_col_split_{false};
   FeatureInteractionConstraintHost interaction_constraints_;
   std::vector<NodeEntry> snode_;
 
@@ -53,8 +53,9 @@ class HistEvaluator {
     }
   }
 
-  bool IsValid(GradStats const &left, GradStats const &right) const {
-    return left.GetHess() >= param_.min_child_weight && right.GetHess() >= param_.min_child_weight;
+  [[nodiscard]] bool IsValid(GradStats const &left, GradStats const &right) const {
+    return left.GetHess() >= param_->min_child_weight &&
+           right.GetHess() >= param_->min_child_weight;
   }
 
   /**
@@ -93,9 +94,10 @@ class HistEvaluator {
       right_sum = GradStats{hist[i]};
       left_sum.SetSubstract(parent.stats, right_sum);
       if (IsValid(left_sum, right_sum)) {
-        auto missing_left_chg = static_cast<float>(
-            evaluator.CalcSplitGain(param_, nidx, fidx, GradStats{left_sum}, GradStats{right_sum}) -
-            parent.root_gain);
+        auto missing_left_chg =
+            static_cast<float>(evaluator.CalcSplitGain(*param_, nidx, fidx, GradStats{left_sum},
+                                                       GradStats{right_sum}) -
+                               parent.root_gain);
         best.Update(missing_left_chg, fidx, split_pt, true, true, left_sum, right_sum);
       }
 
@@ -103,9 +105,10 @@ class HistEvaluator {
       right_sum.Add(missing);
       left_sum.SetSubstract(parent.stats, right_sum);
       if (IsValid(left_sum, right_sum)) {
-        auto missing_right_chg = static_cast<float>(
-            evaluator.CalcSplitGain(param_, nidx, fidx, GradStats{left_sum}, GradStats{right_sum}) -
-            parent.root_gain);
+        auto missing_right_chg =
+            static_cast<float>(evaluator.CalcSplitGain(*param_, nidx, fidx, GradStats{left_sum},
+                                                       GradStats{right_sum}) -
+                               parent.root_gain);
         best.Update(missing_right_chg, fidx, split_pt, false, true, left_sum, right_sum);
       }
     }
@@ -150,7 +153,7 @@ class HistEvaluator {
     bst_bin_t f_begin = cut_ptr[fidx];
     bst_bin_t f_end = cut_ptr[fidx + 1];
     bst_bin_t n_bins_feature{f_end - f_begin};
-    auto n_bins = std::min(param_.max_cat_threshold, n_bins_feature);
+    auto n_bins = std::min(param_->max_cat_threshold, n_bins_feature);
 
     // statistics on both sides of split
     GradStats left_sum;
@@ -179,9 +182,9 @@ class HistEvaluator {
         right_sum.SetSubstract(parent.stats, left_sum);  // missing on right
       }
       if (IsValid(left_sum, right_sum)) {
-        auto loss_chg =
-            evaluator.CalcSplitGain(param_, nidx, fidx, GradStats{left_sum}, GradStats{right_sum}) -
-            parent.root_gain;
+        auto loss_chg = evaluator.CalcSplitGain(*param_, nidx, fidx, GradStats{left_sum},
+                                                GradStats{right_sum}) -
+                        parent.root_gain;
         // We don't have a numeric split point, nan here is a dummy split.
         if (best.Update(loss_chg, fidx, std::numeric_limits<float>::quiet_NaN(), d_step == 1, true,
                         left_sum, right_sum)) {
@@ -254,7 +257,7 @@ class HistEvaluator {
         if (d_step > 0) {
           // forward enumeration: split at right bound of each bin
           loss_chg =
-              static_cast<float>(evaluator.CalcSplitGain(param_, nidx, fidx, GradStats{left_sum},
+              static_cast<float>(evaluator.CalcSplitGain(*param_, nidx, fidx, GradStats{left_sum},
                                                          GradStats{right_sum}) -
                                  parent.root_gain);
           split_pt = cut_val[i];  // not used for partition based
@@ -262,7 +265,7 @@ class HistEvaluator {
         } else {
           // backward enumeration: split at left bound of each bin
           loss_chg =
-              static_cast<float>(evaluator.CalcSplitGain(param_, nidx, fidx, GradStats{right_sum},
+              static_cast<float>(evaluator.CalcSplitGain(*param_, nidx, fidx, GradStats{right_sum},
                                                          GradStats{left_sum}) -
                                  parent.root_gain);
           if (i == imin) {
@@ -283,6 +286,7 @@ class HistEvaluator {
   void EvaluateSplits(const common::HistCollection &hist, common::HistogramCuts const &cut,
                       common::Span<FeatureType const> feature_types, const RegTree &tree,
                       std::vector<ExpandEntry> *p_entries) {
+    auto n_threads = ctx_->Threads();
     auto& entries = *p_entries;
     // All nodes are on the same level, so we can store the shared ptr.
     std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> features(
@@ -294,23 +298,23 @@ class HistEvaluator {
     }
     CHECK(!features.empty());
     const size_t grain_size =
-        std::max<size_t>(1, features.front()->Size() / n_threads_);
+        std::max<size_t>(1, features.front()->Size() / n_threads);
     common::BlockedSpace2d space(entries.size(), [&](size_t nidx_in_set) {
       return features[nidx_in_set]->Size();
     }, grain_size);
 
-    std::vector<ExpandEntry> tloc_candidates(n_threads_ * entries.size());
+    std::vector<ExpandEntry> tloc_candidates(n_threads * entries.size());
     for (size_t i = 0; i < entries.size(); ++i) {
-      for (decltype(n_threads_) j = 0; j < n_threads_; ++j) {
-        tloc_candidates[i * n_threads_ + j] = entries[i];
+      for (decltype(n_threads) j = 0; j < n_threads; ++j) {
+        tloc_candidates[i * n_threads + j] = entries[i];
       }
     }
     auto evaluator = tree_evaluator_.GetEvaluator();
     auto const& cut_ptrs = cut.Ptrs();
 
-    common::ParallelFor2d(space, n_threads_, [&](size_t nidx_in_set, common::Range1d r) {
+    common::ParallelFor2d(space, n_threads, [&](size_t nidx_in_set, common::Range1d r) {
       auto tidx = omp_get_thread_num();
-      auto entry = &tloc_candidates[n_threads_ * nidx_in_set + tidx];
+      auto entry = &tloc_candidates[n_threads * nidx_in_set + tidx];
       auto best = &entry->split;
       auto nidx = entry->nid;
       auto histogram = hist[nidx];
@@ -323,7 +327,7 @@ class HistEvaluator {
         }
         if (is_cat) {
           auto n_bins = cut_ptrs.at(fidx + 1) - cut_ptrs[fidx];
-          if (common::UseOneHot(n_bins, param_.max_cat_to_onehot)) {
+          if (common::UseOneHot(n_bins, param_->max_cat_to_onehot)) {
             EnumerateOneHot(cut, histogram, fidx, nidx, evaluator, best);
           } else {
             std::vector<size_t> sorted_idx(n_bins);
@@ -331,8 +335,8 @@ class HistEvaluator {
             auto feat_hist = histogram.subspan(cut_ptrs[fidx], n_bins);
             // Sort the histogram to get contiguous partitions.
             std::stable_sort(sorted_idx.begin(), sorted_idx.end(), [&](size_t l, size_t r) {
-              auto ret = evaluator.CalcWeightCat(param_, feat_hist[l]) <
-                         evaluator.CalcWeightCat(param_, feat_hist[r]);
+              auto ret = evaluator.CalcWeightCat(*param_, feat_hist[l]) <
+                         evaluator.CalcWeightCat(*param_, feat_hist[r]);
               return ret;
             });
             EnumeratePart<+1>(cut, sorted_idx, histogram, fidx, nidx, evaluator, best);
@@ -349,12 +353,29 @@ class HistEvaluator {
 
     for (unsigned nidx_in_set = 0; nidx_in_set < entries.size();
          ++nidx_in_set) {
-      for (auto tidx = 0; tidx < n_threads_; ++tidx) {
+      for (auto tidx = 0; tidx < n_threads; ++tidx) {
         entries[nidx_in_set].split.Update(
-            tloc_candidates[n_threads_ * nidx_in_set + tidx].split);
+            tloc_candidates[n_threads * nidx_in_set + tidx].split);
+      }
+    }
+
+    if (is_col_split_) {
+      // With column-wise data split, we gather the best splits from all the workers and update the
+      // expand entries accordingly.
+      auto const world = collective::GetWorldSize();
+      auto const rank = collective::GetRank();
+      auto const num_entries = entries.size();
+      std::vector<ExpandEntry> buffer{num_entries * world};
+      std::copy_n(entries.cbegin(), num_entries, buffer.begin() + num_entries * rank);
+      collective::Allgather(buffer.data(), buffer.size() * sizeof(ExpandEntry));
+      for (auto worker = 0; worker < world; ++worker) {
+        for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) {
+          entries[nidx_in_set].split.Update(buffer[worker * num_entries + nidx_in_set].split);
+        }
       }
     }
   }
+
   // Add splits to tree, handles all statistic
   void ApplyTreeSplit(ExpandEntry const& candidate, RegTree *p_tree) {
     auto evaluator = tree_evaluator_.GetEvaluator();
@@ -362,24 +383,22 @@ class HistEvaluator {
 
     GradStats parent_sum = candidate.split.left_sum;
     parent_sum.Add(candidate.split.right_sum);
-    auto base_weight =
-        evaluator.CalcWeight(candidate.nid, param_, GradStats{parent_sum});
-
+    auto base_weight = evaluator.CalcWeight(candidate.nid, *param_, GradStats{parent_sum});
     auto left_weight =
-        evaluator.CalcWeight(candidate.nid, param_, GradStats{candidate.split.left_sum});
+        evaluator.CalcWeight(candidate.nid, *param_, GradStats{candidate.split.left_sum});
     auto right_weight =
-        evaluator.CalcWeight(candidate.nid, param_, GradStats{candidate.split.right_sum});
+        evaluator.CalcWeight(candidate.nid, *param_, GradStats{candidate.split.right_sum});
 
     if (candidate.split.is_cat) {
       tree.ExpandCategorical(
           candidate.nid, candidate.split.SplitIndex(), candidate.split.cat_bits,
-          candidate.split.DefaultLeft(), base_weight, left_weight * param_.learning_rate,
-          right_weight * param_.learning_rate, candidate.split.loss_chg, parent_sum.GetHess(),
+          candidate.split.DefaultLeft(), base_weight, left_weight * param_->learning_rate,
+          right_weight * param_->learning_rate, candidate.split.loss_chg, parent_sum.GetHess(),
           candidate.split.left_sum.GetHess(), candidate.split.right_sum.GetHess());
     } else {
       tree.ExpandNode(candidate.nid, candidate.split.SplitIndex(), candidate.split.split_value,
                       candidate.split.DefaultLeft(), base_weight,
-                      left_weight * param_.learning_rate, right_weight * param_.learning_rate,
+                      left_weight * param_->learning_rate, right_weight * param_->learning_rate,
                       candidate.split.loss_chg, parent_sum.GetHess(),
                       candidate.split.left_sum.GetHess(), candidate.split.right_sum.GetHess());
     }
@@ -395,11 +414,11 @@ class HistEvaluator {
     max_node = std::max(candidate.nid, max_node);
     snode_.resize(tree.GetNodes().size());
     snode_.at(left_child).stats = candidate.split.left_sum;
-    snode_.at(left_child).root_gain = evaluator.CalcGain(
-        candidate.nid, param_, GradStats{candidate.split.left_sum});
+    snode_.at(left_child).root_gain =
+        evaluator.CalcGain(candidate.nid, *param_, GradStats{candidate.split.left_sum});
     snode_.at(right_child).stats = candidate.split.right_sum;
-    snode_.at(right_child).root_gain = evaluator.CalcGain(
-        candidate.nid, param_, GradStats{candidate.split.right_sum});
+    snode_.at(right_child).root_gain =
+        evaluator.CalcGain(candidate.nid, *param_, GradStats{candidate.split.right_sum});
 
     interaction_constraints_.Split(candidate.nid,
                                    tree[candidate.nid].SplitIndex(), left_child,
@@ -409,30 +428,31 @@ class HistEvaluator {
   auto Evaluator() const { return tree_evaluator_.GetEvaluator(); }
   auto const& Stats() const { return snode_; }
 
-  float InitRoot(GradStats const& root_sum) {
+  float InitRoot(GradStats const &root_sum) {
     snode_.resize(1);
     auto root_evaluator = tree_evaluator_.GetEvaluator();
 
     snode_[0].stats = GradStats{root_sum.GetGrad(), root_sum.GetHess()};
-    snode_[0].root_gain = root_evaluator.CalcGain(RegTree::kRoot, param_,
-                                                  GradStats{snode_[0].stats});
-    auto weight = root_evaluator.CalcWeight(RegTree::kRoot, param_,
-                                            GradStats{snode_[0].stats});
+    snode_[0].root_gain =
+        root_evaluator.CalcGain(RegTree::kRoot, *param_, GradStats{snode_[0].stats});
+    auto weight = root_evaluator.CalcWeight(RegTree::kRoot, *param_, GradStats{snode_[0].stats});
     return weight;
   }
 
  public:
   // The column sampler must be constructed by caller since we need to preserve the rng
   // for the entire training session.
-  explicit HistEvaluator(TrainParam const &param, MetaInfo const &info, int32_t n_threads,
+  explicit HistEvaluator(Context const *ctx, TrainParam const *param, MetaInfo const &info,
                          std::shared_ptr<common::ColumnSampler> sampler)
-      : param_{param},
+      : ctx_{ctx},
+        param_{param},
         column_sampler_{std::move(sampler)},
-        tree_evaluator_{param, static_cast<bst_feature_t>(info.num_col_), Context::kCpuId},
-        n_threads_{n_threads} {
-    interaction_constraints_.Configure(param, info.num_col_);
-    column_sampler_->Init(info.num_col_, info.feature_weights.HostVector(), param_.colsample_bynode,
-                          param_.colsample_bylevel, param_.colsample_bytree);
+        tree_evaluator_{*param, static_cast<bst_feature_t>(info.num_col_), Context::kCpuId},
+        is_col_split_{info.data_split_mode == DataSplitMode::kCol} {
+    interaction_constraints_.Configure(*param, info.num_col_);
+    column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(),
+                          param_->colsample_bynode, param_->colsample_bylevel,
+                          param_->colsample_bytree);
   }
 };
 
@@ -467,6 +487,5 @@ void UpdatePredictionCacheImpl(Context const *ctx, RegTree const *p_last_tree,
     });
   }
 }
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
 #endif  // XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
diff --git a/src/tree/hist/histogram.h b/src/tree/hist/histogram.h
index f3ed27a88..4e64cbd75 100644
--- a/src/tree/hist/histogram.h
+++ b/src/tree/hist/histogram.h
@@ -29,6 +29,7 @@ class HistogramBuilder {
   size_t n_batches_{0};
   // Whether XGBoost is running in distributed environment.
   bool is_distributed_{false};
+  bool is_col_split_{false};
 
  public:
   /**
@@ -40,7 +41,7 @@ class HistogramBuilder {
    *                         of using global rabit variable.
    */
   void Reset(uint32_t total_bins, BatchParam p, int32_t n_threads, size_t n_batches,
-             bool is_distributed) {
+             bool is_distributed, bool is_col_split) {
     CHECK_GE(n_threads, 1);
     n_threads_ = n_threads;
     n_batches_ = n_batches;
@@ -50,6 +51,7 @@ class HistogramBuilder {
     buffer_.Init(total_bins);
     builder_ = common::GHistBuilder(total_bins);
     is_distributed_ = is_distributed;
+    is_col_split_ = is_col_split;
     // Workaround s390x gcc 7.5.0
     auto DMLC_ATTRIBUTE_UNUSED __force_instantiation = &GradientPairPrecise::Reduce;
   }
@@ -96,7 +98,7 @@ class HistogramBuilder {
                    std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
                    std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
                    RegTree const *p_tree) {
-    if (is_distributed_) {
+    if (is_distributed_ && !is_col_split_) {
       this->AddHistRowsDistributed(starting_index, sync_count, nodes_for_explicit_hist_build,
                                    nodes_for_subtraction_trick, p_tree);
     } else {
@@ -130,7 +132,7 @@ class HistogramBuilder {
       return;
     }
 
-    if (is_distributed_) {
+    if (is_distributed_ && !is_col_split_) {
       this->SyncHistogramDistributed(p_tree, nodes_for_explicit_hist_build,
                                      nodes_for_subtraction_trick,
                                      starting_index, sync_count);
diff --git a/src/tree/param.h b/src/tree/param.h
index 3f5e4ec7b..98895e5a2 100644
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2014-2021 by Contributors
+/**
+ * Copyright 2014-2023 by XGBoost Contributors
  * \file param.h
  * \brief training parameters, statistics used to support tree construction.
  * \author Tianqi Chen
@@ -238,9 +238,8 @@ XGBOOST_DEVICE inline static T1 ThresholdL1(T1 w, T2 alpha) {
 
 // calculate the cost of loss function
 template <typename TrainingParams, typename T>
-XGBOOST_DEVICE inline T CalcGainGivenWeight(const TrainingParams &p,
-                                            T sum_grad, T sum_hess, T w) {
-  return -(T(2.0) * sum_grad * w + (sum_hess + p.reg_lambda) * common::Sqr(w));
+XGBOOST_DEVICE inline T CalcGainGivenWeight(const TrainingParams &p, T sum_grad, T sum_hess, T w) {
+  return -(static_cast<T>(2.0) * sum_grad * w + (sum_hess + p.reg_lambda) * common::Sqr(w));
 }
 
 // calculate weight given the statistics
@@ -261,7 +260,7 @@ XGBOOST_DEVICE inline T CalcWeight(const TrainingParams &p, T sum_grad,
 template <typename TrainingParams, typename T>
 XGBOOST_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess) {
   if (sum_hess < p.min_child_weight || sum_hess <= 0.0) {
-    return T(0.0);
+    return static_cast<T>(0.0);
   }
   if (p.max_delta_step == 0.0f) {
     if (p.reg_alpha == 0.0f) {
diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc
index 4bd2294d1..55e37a919 100644
--- a/src/tree/tree_model.cc
+++ b/src/tree/tree_model.cc
@@ -1069,8 +1069,8 @@ bool LoadModelImpl(Json const& in, TreeParam* param, std::vector<RTreeNodeStat>*
   split_types = std::remove_reference_t<decltype(split_types)>(n_nodes);
   split_categories_segments = std::remove_reference_t<decltype(split_categories_segments)>(n_nodes);
 
-  static_assert(std::is_integral<decltype(GetElem<Integer>(lefts, 0))>::value, "");
-  static_assert(std::is_floating_point<decltype(GetElem<Number>(loss_changes, 0))>::value, "");
+  static_assert(std::is_integral<decltype(GetElem<Integer>(lefts, 0))>::value);
+  static_assert(std::is_floating_point<decltype(GetElem<Number>(loss_changes, 0))>::value);
   CHECK_EQ(n_nodes, split_categories_segments.size());
 
   // Set node
diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc
index 0e3675888..2bc3ff543 100644
--- a/src/tree/updater_approx.cc
+++ b/src/tree/updater_approx.cc
@@ -23,8 +23,7 @@
 #include "xgboost/tree_model.h"
 #include "xgboost/tree_updater.h"
 
-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
 
 DMLC_REGISTRY_FILE_TAG(updater_approx);
 
@@ -41,7 +40,7 @@ auto BatchSpec(TrainParam const &p, common::Span<float> hess) {
 
 class GloablApproxBuilder {
  protected:
-  TrainParam param_;
+  TrainParam const* param_;
   std::shared_ptr<common::ColumnSampler> col_sampler_;
   HistEvaluator<CPUExpandEntry> evaluator_;
   HistogramBuilder<CPUExpandEntry> histogram_builder_;
@@ -64,19 +63,19 @@ class GloablApproxBuilder {
     bst_bin_t n_total_bins = 0;
     partitioner_.clear();
     // Generating the GHistIndexMatrix is quite slow, is there a way to speed it up?
-    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(param_, hess, task_))) {
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(*param_, hess, task_))) {
       if (n_total_bins == 0) {
         n_total_bins = page.cut.TotalBins();
         feature_values_ = page.cut;
       } else {
         CHECK_EQ(n_total_bins, page.cut.TotalBins());
       }
-      partitioner_.emplace_back(this->ctx_, page.Size(), page.base_rowid);
+      partitioner_.emplace_back(this->ctx_, page.Size(), page.base_rowid, p_fmat->IsColumnSplit());
       n_batches_++;
     }
 
-    histogram_builder_.Reset(n_total_bins, BatchSpec(param_, hess), ctx_->Threads(), n_batches_,
-                             collective::IsDistributed());
+    histogram_builder_.Reset(n_total_bins, BatchSpec(*param_, hess), ctx_->Threads(), n_batches_,
+                             collective::IsDistributed(), p_fmat->IsColumnSplit());
     monitor_->Stop(__func__);
   }
 
@@ -90,11 +89,13 @@ class GloablApproxBuilder {
     for (auto const &g : gpair) {
       root_sum.Add(g);
     }
-    collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&root_sum), 2);
+    if (p_fmat->IsRowSplit()) {
+      collective::Allreduce<collective::Operation::kSum>(reinterpret_cast<double *>(&root_sum), 2);
+    }
     std::vector<CPUExpandEntry> nodes{best};
     size_t i = 0;
     auto space = ConstructHistSpace(partitioner_, nodes);
-    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(param_, hess))) {
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(*param_, hess))) {
       histogram_builder_.BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(), nodes,
                                    {}, gpair);
       i++;
@@ -103,7 +104,7 @@ class GloablApproxBuilder {
     auto weight = evaluator_.InitRoot(root_sum);
     p_tree->Stat(RegTree::kRoot).sum_hess = root_sum.GetHess();
     p_tree->Stat(RegTree::kRoot).base_weight = weight;
-    (*p_tree)[RegTree::kRoot].SetLeaf(param_.learning_rate * weight);
+    (*p_tree)[RegTree::kRoot].SetLeaf(param_->learning_rate * weight);
 
     auto const &histograms = histogram_builder_.Histogram();
     auto ft = p_fmat->Info().feature_types.ConstHostSpan();
@@ -145,7 +146,7 @@ class GloablApproxBuilder {
 
     size_t i = 0;
     auto space = ConstructHistSpace(partitioner_, nodes_to_build);
-    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(param_, hess))) {
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(*param_, hess))) {
       histogram_builder_.BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(),
                                    nodes_to_build, nodes_to_sub, gpair);
       i++;
@@ -166,12 +167,12 @@ class GloablApproxBuilder {
   }
 
  public:
-  explicit GloablApproxBuilder(TrainParam param, MetaInfo const &info, Context const *ctx,
+  explicit GloablApproxBuilder(TrainParam const *param, MetaInfo const &info, Context const *ctx,
                                std::shared_ptr<common::ColumnSampler> column_sampler, ObjInfo task,
                                common::Monitor *monitor)
-      : param_{std::move(param)},
+      : param_{param},
         col_sampler_{std::move(column_sampler)},
-        evaluator_{param_, info, ctx->Threads(), col_sampler_},
+        evaluator_{ctx, param_, info, col_sampler_},
         ctx_{ctx},
         task_{task},
         monitor_{monitor} {}
@@ -181,7 +182,7 @@ class GloablApproxBuilder {
     p_last_tree_ = p_tree;
     this->InitData(p_fmat, hess);
 
-    Driver<CPUExpandEntry> driver(param_);
+    Driver<CPUExpandEntry> driver(*param_);
     auto &tree = *p_tree;
     driver.Push({this->InitRoot(p_fmat, gpair, hess, p_tree)});
     auto expand_set = driver.Pop();
@@ -211,7 +212,7 @@ class GloablApproxBuilder {
 
       monitor_->Start("UpdatePosition");
       size_t page_id = 0;
-      for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(param_, hess))) {
+      for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(BatchSpec(*param_, hess))) {
         partitioner_.at(page_id).UpdatePosition(ctx_, page, applied, p_tree);
         page_id++;
       }
@@ -248,7 +249,6 @@ class GloablApproxBuilder {
  *        iteration.
  */
 class GlobalApproxUpdater : public TreeUpdater {
-  TrainParam param_;
   common::Monitor monitor_;
   // specializations for different histogram precision.
   std::unique_ptr<GloablApproxBuilder> pimpl_;
@@ -263,15 +263,9 @@ class GlobalApproxUpdater : public TreeUpdater {
     monitor_.Init(__func__);
   }
 
-  void Configure(const Args &args) override { param_.UpdateAllowUnknown(args); }
-  void LoadConfig(Json const &in) override {
-    auto const &config = get<Object const>(in);
-    FromJson(config.at("train_param"), &this->param_);
-  }
-  void SaveConfig(Json *p_out) const override {
-    auto &out = *p_out;
-    out["train_param"] = ToJson(param_);
-  }
+  void Configure(Args const &) override {}
+  void LoadConfig(Json const &) override {}
+  void SaveConfig(Json *) const override {}
 
   void InitData(TrainParam const &param, HostDeviceVector<GradientPair> const *gpair,
                 linalg::Matrix<GradientPair> *sampled) {
@@ -281,20 +275,17 @@ class GlobalApproxUpdater : public TreeUpdater {
     SampleGradient(ctx_, param, sampled->HostView());
   }
 
-  char const *Name() const override { return "grow_histmaker"; }
+  [[nodiscard]] char const *Name() const override { return "grow_histmaker"; }
 
-  void Update(HostDeviceVector<GradientPair> *gpair, DMatrix *m,
+  void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *m,
               common::Span<HostDeviceVector<bst_node_t>> out_position,
               const std::vector<RegTree *> &trees) override {
-    float lr = param_.learning_rate;
-    param_.learning_rate = lr / trees.size();
-
-    pimpl_ = std::make_unique<GloablApproxBuilder>(param_, m->Info(), ctx_, column_sampler_, task_,
+    pimpl_ = std::make_unique<GloablApproxBuilder>(param, m->Info(), ctx_, column_sampler_, task_,
                                                    &monitor_);
 
     linalg::Matrix<GradientPair> h_gpair;
     // Obtain the hessian values for weighted sketching
-    InitData(param_, gpair, &h_gpair);
+    InitData(*param, gpair, &h_gpair);
     std::vector<float> hess(h_gpair.Size());
     auto const &s_gpair = h_gpair.Data()->ConstHostVector();
     std::transform(s_gpair.begin(), s_gpair.end(), hess.begin(),
@@ -302,12 +293,11 @@ class GlobalApproxUpdater : public TreeUpdater {
 
     cached_ = m;
 
-    size_t t_idx = 0;
+    std::size_t t_idx = 0;
     for (auto p_tree : trees) {
       this->pimpl_->UpdateTree(m, s_gpair, hess, p_tree, &out_position[t_idx]);
       ++t_idx;
     }
-    param_.learning_rate = lr;
   }
 
   bool UpdatePredictionCache(const DMatrix *data, linalg::VectorView<float> out_preds) override {
@@ -318,7 +308,7 @@ class GlobalApproxUpdater : public TreeUpdater {
     return true;
   }
 
-  bool HasNodePosition() const override { return true; }
+  [[nodiscard]] bool HasNodePosition() const override { return true; }
 };
 
 DMLC_REGISTRY_FILE_TAG(grow_histmaker);
@@ -328,5 +318,4 @@ XGBOOST_REGISTER_TREE_UPDATER(GlobalHistMaker, "grow_histmaker")
         "Tree constructor that uses approximate histogram construction "
         "for each node.")
     .set_body([](Context const *ctx, ObjInfo task) { return new GlobalApproxUpdater(ctx, task); });
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc
index 07483038c..070bfe578 100644
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2014-2022 by XGBoost Contributors
+/**
+ * Copyright 2014-2023 by XGBoost Contributors
  * \file updater_colmaker.cc
  * \brief use columnwise update to construct a tree
  * \author Tianqi Chen
@@ -17,8 +17,7 @@
 #include "../common/random.h"
 #include "split_evaluator.h"
 
-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
 
 DMLC_REGISTRY_FILE_TAG(updater_colmaker);
 
@@ -57,18 +56,15 @@ class ColMaker: public TreeUpdater {
  public:
   explicit ColMaker(Context const *ctx) : TreeUpdater(ctx) {}
   void Configure(const Args &args) override {
-    param_.UpdateAllowUnknown(args);
     colmaker_param_.UpdateAllowUnknown(args);
   }
 
   void LoadConfig(Json const& in) override {
     auto const& config = get<Object const>(in);
-    FromJson(config.at("train_param"), &this->param_);
     FromJson(config.at("colmaker_train_param"), &this->colmaker_param_);
   }
-  void SaveConfig(Json* p_out) const override {
-    auto& out = *p_out;
-    out["train_param"] = ToJson(param_);
+  void SaveConfig(Json *p_out) const override {
+    auto &out = *p_out;
     out["colmaker_train_param"] = ToJson(colmaker_param_);
   }
 
@@ -95,7 +91,7 @@ class ColMaker: public TreeUpdater {
     }
   }
 
-  void Update(HostDeviceVector<GradientPair> *gpair, DMatrix *dmat,
+  void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *dmat,
               common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
               const std::vector<RegTree *> &trees) override {
     if (collective::IsDistributed()) {
@@ -108,22 +104,16 @@ class ColMaker: public TreeUpdater {
     }
     this->LazyGetColumnDensity(dmat);
     // rescale learning rate according to size of trees
-    float lr = param_.learning_rate;
-    param_.learning_rate = lr / trees.size();
-    interaction_constraints_.Configure(param_, dmat->Info().num_row_);
+    interaction_constraints_.Configure(*param, dmat->Info().num_row_);
     // build tree
     for (auto tree : trees) {
       CHECK(ctx_);
-      Builder builder(param_, colmaker_param_, interaction_constraints_, ctx_,
-                      column_densities_);
+      Builder builder(*param, colmaker_param_, interaction_constraints_, ctx_, column_densities_);
       builder.Update(gpair->ConstHostVector(), dmat, tree);
     }
-    param_.learning_rate = lr;
   }
 
  protected:
-  // training parameter
-  TrainParam param_;
   ColMakerTrainParam colmaker_param_;
   // SplitEvaluator that will be cloned for each Builder
   std::vector<float> column_densities_;
@@ -234,9 +224,9 @@ class ColMaker: public TreeUpdater {
         }
       }
       {
-        column_sampler_.Init(fmat.Info().num_col_, fmat.Info().feature_weights.ConstHostVector(),
-                             param_.colsample_bynode, param_.colsample_bylevel,
-                             param_.colsample_bytree);
+        column_sampler_.Init(ctx_, fmat.Info().num_col_,
+                             fmat.Info().feature_weights.ConstHostVector(), param_.colsample_bynode,
+                             param_.colsample_bylevel, param_.colsample_bytree);
       }
       {
         // setup temp space for each thread
@@ -614,5 +604,4 @@ class ColMaker: public TreeUpdater {
 XGBOOST_REGISTER_TREE_UPDATER(ColMaker, "grow_colmaker")
     .describe("Grow tree with parallelization over columns.")
     .set_body([](Context const *ctx, ObjInfo) { return new ColMaker(ctx); });
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index a02ee5cdd..32b3f4a03 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2017-2022 XGBoost contributors
+/**
+ * Copyright 2017-2023 by XGBoost contributors
  */
 #include <thrust/copy.h>
 #include <thrust/reduce.h>
@@ -160,11 +160,11 @@ class DeviceHistogramStorage {
     if (nidx_map_.find(nidx) != nidx_map_.cend()) {
       // Fetch from normal cache
       auto ptr = data_.data().get() + nidx_map_.at(nidx);
-      return common::Span<GradientSumT>(reinterpret_cast<GradientSumT*>(ptr), n_bins_);
+      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
     } else {
       // Fetch from overflow
       auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx);
-      return common::Span<GradientSumT>(reinterpret_cast<GradientSumT*>(ptr), n_bins_);
+      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
     }
   }
 };
@@ -243,7 +243,7 @@ struct GPUHistMakerDevice {
   // thread safe
   void Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* dmat, int64_t num_columns) {
     auto const& info = dmat->Info();
-    this->column_sampler.Init(num_columns, info.feature_weights.HostVector(),
+    this->column_sampler.Init(ctx_, num_columns, info.feature_weights.HostVector(),
                               param.colsample_bynode, param.colsample_bylevel,
                               param.colsample_bytree);
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
@@ -306,6 +306,8 @@ struct GPUHistMakerDevice {
         matrix.is_dense
     };
     dh::TemporaryArray<GPUExpandEntry> entries(2 * candidates.size());
+    // Store the feature set ptrs so they don't go out of scope before the kernel is called
+    std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> feature_sets;
     for (size_t i = 0; i < candidates.size(); i++) {
       auto candidate = candidates.at(i);
       int left_nidx = tree[candidate.nid].LeftChild();
@@ -314,10 +316,12 @@ struct GPUHistMakerDevice {
       nidx[i * 2 + 1] = right_nidx;
       auto left_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(left_nidx));
       left_sampled_features->SetDevice(ctx_->gpu_id);
+      feature_sets.emplace_back(left_sampled_features);
       common::Span<bst_feature_t> left_feature_set =
           interaction_constraints.Query(left_sampled_features->DeviceSpan(), left_nidx);
       auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
       right_sampled_features->SetDevice(ctx_->gpu_id);
+      feature_sets.emplace_back(right_sampled_features);
       common::Span<bst_feature_t> right_feature_set =
           interaction_constraints.Query(right_sampled_features->DeviceSpan(),
                                         right_nidx);
@@ -330,8 +334,8 @@ struct GPUHistMakerDevice {
     }
     bst_feature_t max_active_features = 0;
     for (auto input : h_node_inputs) {
-      max_active_features = std::max(max_active_features,
-                                     bst_feature_t(input.feature_set.size()));
+      max_active_features =
+          std::max(max_active_features, static_cast<bst_feature_t>(input.feature_set.size()));
     }
     dh::safe_cuda(cudaMemcpyAsync(
         d_node_inputs.data().get(), h_node_inputs.data(),
@@ -752,7 +756,6 @@ class GPUHistMaker : public TreeUpdater {
   void Configure(const Args& args) override {
     // Used in test to count how many configurations are performed
     LOG(DEBUG) << "[GPU Hist]: Configure";
-    param_.UpdateAllowUnknown(args);
     hist_maker_param_.UpdateAllowUnknown(args);
     dh::CheckComputeCapability();
     initialised_ = false;
@@ -764,32 +767,26 @@ class GPUHistMaker : public TreeUpdater {
     auto const& config = get<Object const>(in);
     FromJson(config.at("gpu_hist_train_param"), &this->hist_maker_param_);
     initialised_ = false;
-    FromJson(config.at("train_param"), &param_);
   }
   void SaveConfig(Json* p_out) const override {
     auto& out = *p_out;
     out["gpu_hist_train_param"] = ToJson(hist_maker_param_);
-    out["train_param"] = ToJson(param_);
   }
 
   ~GPUHistMaker() {  // NOLINT
     dh::GlobalMemoryLogger().Log();
   }
 
-  void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
+  void Update(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
               common::Span<HostDeviceVector<bst_node_t>> out_position,
               const std::vector<RegTree*>& trees) override {
     monitor_.Start("Update");
 
-    // rescale learning rate according to size of trees
-    float lr = param_.learning_rate;
-    param_.learning_rate = lr / trees.size();
-
     // build tree
     try {
       size_t t_idx{0};
       for (xgboost::RegTree* tree : trees) {
-        this->UpdateTree(gpair, dmat, tree, &out_position[t_idx]);
+        this->UpdateTree(param, gpair, dmat, tree, &out_position[t_idx]);
 
         if (hist_maker_param_.debug_synchronize) {
           this->CheckTreesSynchronized(tree);
@@ -800,12 +797,10 @@ class GPUHistMaker : public TreeUpdater {
     } catch (const std::exception& e) {
       LOG(FATAL) << "Exception in gpu_hist: " << e.what() << std::endl;
     }
-
-    param_.learning_rate = lr;
     monitor_.Stop("Update");
   }
 
-  void InitDataOnce(DMatrix* dmat) {
+  void InitDataOnce(TrainParam const* param, DMatrix* dmat) {
     CHECK_GE(ctx_->gpu_id, 0) << "Must have at least one device";
     info_ = &dmat->Info();
 
@@ -814,24 +809,24 @@ class GPUHistMaker : public TreeUpdater {
     collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
 
     BatchParam batch_param{
-      ctx_->gpu_id,
-      param_.max_bin,
+        ctx_->gpu_id,
+        param->max_bin,
     };
     auto page = (*dmat->GetBatches<EllpackPage>(batch_param).begin()).Impl();
     dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
     info_->feature_types.SetDevice(ctx_->gpu_id);
     maker.reset(new GPUHistMakerDevice<GradientSumT>(
-        ctx_, page, info_->feature_types.ConstDeviceSpan(), info_->num_row_, param_,
+        ctx_, page, info_->feature_types.ConstDeviceSpan(), info_->num_row_, *param,
         column_sampling_seed, info_->num_col_, batch_param));
 
     p_last_fmat_ = dmat;
     initialised_ = true;
   }
 
-  void InitData(DMatrix* dmat, RegTree const* p_tree) {
+  void InitData(TrainParam const* param, DMatrix* dmat, RegTree const* p_tree) {
     if (!initialised_) {
       monitor_.Start("InitDataOnce");
-      this->InitDataOnce(dmat);
+      this->InitDataOnce(param, dmat);
       monitor_.Stop("InitDataOnce");
     }
     p_last_tree_ = p_tree;
@@ -852,10 +847,10 @@ class GPUHistMaker : public TreeUpdater {
     CHECK(*local_tree == reference_tree);
   }
 
-  void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, RegTree* p_tree,
-                  HostDeviceVector<bst_node_t>* p_out_position) {
+  void UpdateTree(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
+                  RegTree* p_tree, HostDeviceVector<bst_node_t>* p_out_position) {
     monitor_.Start("InitData");
-    this->InitData(p_fmat, p_tree);
+    this->InitData(param, p_fmat, p_tree);
     monitor_.Stop("InitData");
 
     gpair->SetDevice(ctx_->gpu_id);
@@ -874,7 +869,6 @@ class GPUHistMaker : public TreeUpdater {
     return result;
   }
 
-  TrainParam param_;  // NOLINT
   MetaInfo* info_{};  // NOLINT
 
   std::unique_ptr<GPUHistMakerDevice<GradientSumT>> maker;  // NOLINT
diff --git a/src/tree/updater_prune.cc b/src/tree/updater_prune.cc
index bec49bf47..c591ce454 100644
--- a/src/tree/updater_prune.cc
+++ b/src/tree/updater_prune.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2014-2022 by XGBoost Contributors
+/**
+ * Copyright 2014-2023 by XGBoost Contributors
  * \file updater_prune.cc
  * \brief prune a tree given the statistics
  * \author Tianqi Chen
@@ -8,13 +8,11 @@
 
 #include <memory>
 
+#include "../common/timer.h"
+#include "./param.h"
 #include "xgboost/base.h"
 #include "xgboost/json.h"
-#include "./param.h"
-#include "../common/timer.h"
-namespace xgboost {
-namespace tree {
-
+namespace xgboost::tree {
 DMLC_REGISTRY_FILE_TAG(updater_prune);
 
 /*! \brief pruner that prunes a tree after growing finishes */
@@ -24,47 +22,31 @@ class TreePruner : public TreeUpdater {
     syncher_.reset(TreeUpdater::Create("sync", ctx_, task));
     pruner_monitor_.Init("TreePruner");
   }
-  char const* Name() const override {
-    return "prune";
-  }
-
+  [[nodiscard]] char const* Name() const override { return "prune"; }
   // set training parameter
-  void Configure(const Args& args) override {
-    param_.UpdateAllowUnknown(args);
-    syncher_->Configure(args);
-  }
+  void Configure(const Args& args) override { syncher_->Configure(args); }
 
-  void LoadConfig(Json const& in) override {
-    auto const& config = get<Object const>(in);
-    FromJson(config.at("train_param"), &this->param_);
-  }
-  void SaveConfig(Json* p_out) const override {
-    auto& out = *p_out;
-    out["train_param"] = ToJson(param_);
-  }
-  bool CanModifyTree() const override {
-    return true;
-  }
+  void LoadConfig(Json const&) override {}
+  void SaveConfig(Json*) const override {}
+  [[nodiscard]] bool CanModifyTree() const override { return true; }
 
   // update the tree, do pruning
-  void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
+  void Update(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
               common::Span<HostDeviceVector<bst_node_t>> out_position,
               const std::vector<RegTree*>& trees) override {
     pruner_monitor_.Start("PrunerUpdate");
-    // rescale learning rate according to size of trees
-    float lr = param_.learning_rate;
-    param_.learning_rate = lr / trees.size();
     for (auto tree : trees) {
-      this->DoPrune(tree);
+      this->DoPrune(param, tree);
     }
-    param_.learning_rate = lr;
-    syncher_->Update(gpair, p_fmat, out_position, trees);
+    syncher_->Update(param, gpair, p_fmat, out_position, trees);
     pruner_monitor_.Stop("PrunerUpdate");
   }
 
  private:
   // try to prune off current leaf
-  bst_node_t TryPruneLeaf(RegTree &tree, int nid, int depth, int npruned) { // NOLINT(*)
+  bst_node_t TryPruneLeaf(TrainParam const* param, RegTree* p_tree, int nid, int depth,
+                          int npruned) {
+    auto& tree = *p_tree;
     CHECK(tree[nid].IsLeaf());
     if (tree[nid].IsRoot()) {
       return npruned;
@@ -77,22 +59,22 @@ class TreePruner : public TreeUpdater {
     auto right = tree[pid].RightChild();
     bool balanced = tree[left].IsLeaf() &&
                     right != RegTree::kInvalidNodeId && tree[right].IsLeaf();
-    if (balanced && param_.NeedPrune(s.loss_chg, depth)) {
+    if (balanced && param->NeedPrune(s.loss_chg, depth)) {
       // need to be pruned
-      tree.ChangeToLeaf(pid, param_.learning_rate * s.base_weight);
+      tree.ChangeToLeaf(pid, param->learning_rate * s.base_weight);
       // tail recursion
-      return this->TryPruneLeaf(tree, pid, depth - 1, npruned + 2);
+      return this->TryPruneLeaf(param, p_tree, pid, depth - 1, npruned + 2);
     } else {
       return npruned;
     }
   }
   /*! \brief do pruning of a tree */
-  void DoPrune(RegTree* p_tree) {
+  void DoPrune(TrainParam const* param, RegTree* p_tree) {
     auto& tree = *p_tree;
     bst_node_t npruned = 0;
     for (int nid = 0; nid < tree.param.num_nodes; ++nid) {
       if (tree[nid].IsLeaf() && !tree[nid].IsDeleted()) {
-        npruned = this->TryPruneLeaf(tree, nid, tree.GetDepth(nid), npruned);
+        npruned = this->TryPruneLeaf(param, p_tree, nid, tree.GetDepth(nid), npruned);
       }
     }
     LOG(INFO) << "tree pruning end, "
@@ -103,13 +85,10 @@ class TreePruner : public TreeUpdater {
  private:
   // synchronizer
   std::unique_ptr<TreeUpdater> syncher_;
-  // training parameter
-  TrainParam param_;
   common::Monitor pruner_monitor_;
 };
 
 XGBOOST_REGISTER_TREE_UPDATER(TreePruner, "prune")
     .describe("Pruner that prune the tree according to statistics.")
     .set_body([](Context const* ctx, ObjInfo task) { return new TreePruner(ctx, task); });
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc
index f7cf73f1d..1929efb28 100644
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -28,21 +28,14 @@ namespace tree {
 
 DMLC_REGISTRY_FILE_TAG(updater_quantile_hist);
 
-void QuantileHistMaker::Configure(const Args &args) {
-  param_.UpdateAllowUnknown(args);
-}
-
-void QuantileHistMaker::Update(HostDeviceVector<GradientPair> *gpair, DMatrix *dmat,
+void QuantileHistMaker::Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair,
+                               DMatrix *dmat,
                                common::Span<HostDeviceVector<bst_node_t>> out_position,
                                const std::vector<RegTree *> &trees) {
-  // rescale learning rate according to size of trees
-  float lr = param_.learning_rate;
-  param_.learning_rate = lr / trees.size();
-
   // build tree
   const size_t n_trees = trees.size();
   if (!pimpl_) {
-    pimpl_.reset(new Builder(n_trees, param_, dmat, task_, ctx_));
+    pimpl_.reset(new Builder(n_trees, param, dmat, task_, ctx_));
   }
 
   size_t t_idx{0};
@@ -51,8 +44,6 @@ void QuantileHistMaker::Update(HostDeviceVector<GradientPair> *gpair, DMatrix *d
     this->pimpl_->UpdateTree(gpair, dmat, p_tree, &t_row_position);
     ++t_idx;
   }
-
-  param_.learning_rate = lr;
 }
 
 bool QuantileHistMaker::UpdatePredictionCache(const DMatrix *data,
@@ -107,7 +98,7 @@ CPUExpandEntry QuantileHistMaker::Builder::InitRoot(
     auto weight = evaluator_->InitRoot(GradStats{grad_stat});
     p_tree->Stat(RegTree::kRoot).sum_hess = grad_stat.GetHess();
     p_tree->Stat(RegTree::kRoot).base_weight = weight;
-    (*p_tree)[RegTree::kRoot].SetLeaf(param_.learning_rate * weight);
+    (*p_tree)[RegTree::kRoot].SetLeaf(param_->learning_rate * weight);
 
     std::vector<CPUExpandEntry> entries{node};
     monitor_->Start("EvaluateSplits");
@@ -173,7 +164,7 @@ void QuantileHistMaker::Builder::ExpandTree(DMatrix *p_fmat, RegTree *p_tree,
                                             HostDeviceVector<bst_node_t> *p_out_position) {
   monitor_->Start(__func__);
 
-  Driver<CPUExpandEntry> driver(param_);
+  Driver<CPUExpandEntry> driver(*param_);
   driver.Push(this->InitRoot(p_fmat, p_tree, gpair_h));
   auto const &tree = *p_tree;
   auto expand_set = driver.Pop();
@@ -277,21 +268,19 @@ void QuantileHistMaker::Builder::InitData(DMatrix *fmat, const RegTree &tree,
       } else {
         CHECK_EQ(n_total_bins, page.cut.TotalBins());
       }
-      partitioner_.emplace_back(this->ctx_, page.Size(), page.base_rowid);
+      partitioner_.emplace_back(this->ctx_, page.Size(), page.base_rowid, fmat->IsColumnSplit());
       ++page_id;
     }
     histogram_builder_->Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id,
-                              collective::IsDistributed());
+                              collective::IsDistributed(), fmat->IsColumnSplit());
 
-    auto m_gpair =
-        linalg::MakeTensorView(*gpair, {gpair->size(), static_cast<std::size_t>(1)}, ctx_->gpu_id);
-    SampleGradient(ctx_, param_, m_gpair);
+    auto m_gpair = linalg::MakeTensorView(ctx_, *gpair, gpair->size(), static_cast<std::size_t>(1));
+    SampleGradient(ctx_, *param_, m_gpair);
   }
 
   // store a pointer to the tree
   p_last_tree_ = &tree;
-  evaluator_.reset(
-      new HistEvaluator<CPUExpandEntry>{param_, info, this->ctx_->Threads(), column_sampler_});
+  evaluator_.reset(new HistEvaluator<CPUExpandEntry>{ctx_, param_, info, column_sampler_});
 
   monitor_->Stop(__func__);
 }
diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h
index ea7000651..f2e562691 100644
--- a/src/tree/updater_quantile_hist.h
+++ b/src/tree/updater_quantile_hist.h
@@ -35,49 +35,36 @@
 #include "../common/partition_builder.h"
 #include "../common/column_matrix.h"
 
-namespace xgboost {
-namespace tree {
-inline BatchParam HistBatch(TrainParam const& param) {
-  return {param.max_bin, param.sparse_threshold};
+namespace xgboost::tree {
+inline BatchParam HistBatch(TrainParam const* param) {
+  return {param->max_bin, param->sparse_threshold};
 }
 
 /*! \brief construct a tree using quantized feature values */
 class QuantileHistMaker: public TreeUpdater {
  public:
   explicit QuantileHistMaker(Context const* ctx, ObjInfo task) : TreeUpdater(ctx), task_{task} {}
-  void Configure(const Args& args) override;
+  void Configure(const Args&) override {}
 
-  void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
+  void Update(TrainParam const* param, HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
               common::Span<HostDeviceVector<bst_node_t>> out_position,
               const std::vector<RegTree*>& trees) override;
 
   bool UpdatePredictionCache(const DMatrix *data,
                              linalg::VectorView<float> out_preds) override;
 
-  void LoadConfig(Json const& in) override {
-    auto const& config = get<Object const>(in);
-    FromJson(config.at("train_param"), &this->param_);
-  }
-  void SaveConfig(Json* p_out) const override {
-    auto& out = *p_out;
-    out["train_param"] = ToJson(param_);
-  }
+  void LoadConfig(Json const&) override {}
+  void SaveConfig(Json*) const override {}
 
-  char const* Name() const override {
-    return "grow_quantile_histmaker";
-  }
-
-  bool HasNodePosition() const override { return true; }
+  [[nodiscard]] char const* Name() const override { return "grow_quantile_histmaker"; }
+  [[nodiscard]] bool HasNodePosition() const override { return true; }
 
  protected:
-  // training parameter
-  TrainParam param_;
-
   // actual builder that runs the algorithm
   struct Builder {
    public:
     // constructor
-    explicit Builder(const size_t n_trees, const TrainParam& param, DMatrix const* fmat,
+    explicit Builder(const size_t n_trees, TrainParam const* param, DMatrix const* fmat,
                      ObjInfo task, Context const* ctx)
         : n_trees_(n_trees),
           param_(param),
@@ -115,7 +102,7 @@ class QuantileHistMaker: public TreeUpdater {
 
    private:
     const size_t n_trees_;
-    const TrainParam& param_;
+    TrainParam const* param_;
     std::shared_ptr<common::ColumnSampler> column_sampler_{
         std::make_shared<common::ColumnSampler>()};
 
@@ -140,7 +127,6 @@ class QuantileHistMaker: public TreeUpdater {
   std::unique_ptr<Builder> pimpl_;
   ObjInfo task_;
 };
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
 
 #endif  // XGBOOST_TREE_UPDATER_QUANTILE_HIST_H_
diff --git a/src/tree/updater_refresh.cc b/src/tree/updater_refresh.cc
index 864c704fa..ebda2a999 100644
--- a/src/tree/updater_refresh.cc
+++ b/src/tree/updater_refresh.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2014-2022 by XGBoost Contributors
+/**
+ * Copyright 2014-2023 by XGBoost Contributors
  * \file updater_refresh.cc
  * \brief refresh the statistics and leaf value on the tree on the dataset
  * \author Tianqi Chen
@@ -16,8 +16,7 @@
 #include "./param.h"
 #include "xgboost/json.h"
 
-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
 
 DMLC_REGISTRY_FILE_TAG(updater_refresh);
 
@@ -25,23 +24,14 @@ DMLC_REGISTRY_FILE_TAG(updater_refresh);
 class TreeRefresher : public TreeUpdater {
  public:
   explicit TreeRefresher(Context const *ctx) : TreeUpdater(ctx) {}
-  void Configure(const Args &args) override { param_.UpdateAllowUnknown(args); }
-  void LoadConfig(Json const& in) override {
-    auto const& config = get<Object const>(in);
-    FromJson(config.at("train_param"), &this->param_);
-  }
-  void SaveConfig(Json* p_out) const override {
-    auto& out = *p_out;
-    out["train_param"] = ToJson(param_);
-  }
-  char const* Name() const override {
-    return "refresh";
-  }
-  bool CanModifyTree() const override {
-    return true;
-  }
+  void Configure(const Args &) override {}
+  void LoadConfig(Json const &) override {}
+  void SaveConfig(Json *) const override {}
+
+  [[nodiscard]] char const *Name() const override { return "refresh"; }
+  [[nodiscard]] bool CanModifyTree() const override { return true; }
   // update the tree, do pruning
-  void Update(HostDeviceVector<GradientPair> *gpair, DMatrix *p_fmat,
+  void Update(TrainParam const *param, HostDeviceVector<GradientPair> *gpair, DMatrix *p_fmat,
               common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
               const std::vector<RegTree *> &trees) override {
     if (trees.size() == 0) return;
@@ -103,16 +93,11 @@ class TreeRefresher : public TreeUpdater {
     lazy_get_stats();
     collective::Allreduce<collective::Operation::kSum>(&dmlc::BeginPtr(stemp[0])->sum_grad,
                                                        stemp[0].size() * 2);
-    // rescale learning rate according to size of trees
-    float lr = param_.learning_rate;
-    param_.learning_rate = lr / trees.size();
     int offset = 0;
     for (auto tree : trees) {
-      this->Refresh(dmlc::BeginPtr(stemp[0]) + offset, 0, tree);
+      this->Refresh(param, dmlc::BeginPtr(stemp[0]) + offset, 0, tree);
       offset += tree->param.num_nodes;
     }
-    // set learning rate back
-    param_.learning_rate = lr;
   }
 
  private:
@@ -135,31 +120,27 @@ class TreeRefresher : public TreeUpdater {
       gstats[pid].Add(gpair[ridx]);
     }
   }
-  inline void Refresh(const GradStats *gstats,
-                      int nid, RegTree *p_tree) {
+  inline void Refresh(TrainParam const *param, const GradStats *gstats, int nid, RegTree *p_tree) {
     RegTree &tree = *p_tree;
     tree.Stat(nid).base_weight =
-        static_cast<bst_float>(CalcWeight(param_, gstats[nid]));
+        static_cast<bst_float>(CalcWeight(*param, gstats[nid]));
     tree.Stat(nid).sum_hess = static_cast<bst_float>(gstats[nid].sum_hess);
     if (tree[nid].IsLeaf()) {
-      if (param_.refresh_leaf) {
-        tree[nid].SetLeaf(tree.Stat(nid).base_weight * param_.learning_rate);
+      if (param->refresh_leaf) {
+        tree[nid].SetLeaf(tree.Stat(nid).base_weight * param->learning_rate);
       }
     } else {
-      tree.Stat(nid).loss_chg = static_cast<bst_float>(
-          xgboost::tree::CalcGain(param_, gstats[tree[nid].LeftChild()]) +
-          xgboost::tree::CalcGain(param_, gstats[tree[nid].RightChild()]) -
-          xgboost::tree::CalcGain(param_, gstats[nid]));
-      this->Refresh(gstats, tree[nid].LeftChild(), p_tree);
-      this->Refresh(gstats, tree[nid].RightChild(), p_tree);
+      tree.Stat(nid).loss_chg =
+          static_cast<bst_float>(xgboost::tree::CalcGain(*param, gstats[tree[nid].LeftChild()]) +
+                                 xgboost::tree::CalcGain(*param, gstats[tree[nid].RightChild()]) -
+                                 xgboost::tree::CalcGain(*param, gstats[nid]));
+      this->Refresh(param, gstats, tree[nid].LeftChild(), p_tree);
+      this->Refresh(param, gstats, tree[nid].RightChild(), p_tree);
     }
   }
-  // training parameter
-  TrainParam param_;
 };
 
 XGBOOST_REGISTER_TREE_UPDATER(TreeRefresher, "refresh")
     .describe("Refresher that refreshes the weight and statistics according to data.")
     .set_body([](Context const *ctx, ObjInfo) { return new TreeRefresher(ctx); });
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
diff --git a/src/tree/updater_sync.cc b/src/tree/updater_sync.cc
index a3f99362e..bb28bc4e6 100644
--- a/src/tree/updater_sync.cc
+++ b/src/tree/updater_sync.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2014-2019 by Contributors
+/**
+ * Copyright 2014-2023 by XGBoost Contributors
  * \file updater_sync.cc
  * \brief synchronize the tree in all distributed nodes
  */
@@ -13,8 +13,7 @@
 #include "../common/io.h"
 #include "xgboost/json.h"
 
-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
 
 DMLC_REGISTRY_FILE_TAG(updater_sync);
 
@@ -30,11 +29,9 @@ class TreeSyncher : public TreeUpdater {
   void LoadConfig(Json const&) override {}
   void SaveConfig(Json*) const override {}
 
-  char const* Name() const override {
-    return "prune";
-  }
+  [[nodiscard]] char const* Name() const override { return "sync"; }  // registered as "sync"
 
-  void Update(HostDeviceVector<GradientPair>*, DMatrix*,
+  void Update(TrainParam const*, HostDeviceVector<GradientPair>*, DMatrix*,
               common::Span<HostDeviceVector<bst_node_t>> /*out_position*/,
               const std::vector<RegTree*>& trees) override {
     if (collective::GetWorldSize() == 1) return;
@@ -57,5 +54,4 @@ class TreeSyncher : public TreeUpdater {
 XGBOOST_REGISTER_TREE_UPDATER(TreeSyncher, "sync")
     .describe("Syncher that synchronize the tree in all distributed nodes.")
     .set_body([](Context const* ctx, ObjInfo) { return new TreeSyncher(ctx); });
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
diff --git a/tests/buildkite/build-containers.sh b/tests/buildkite/build-containers.sh
index 41a13eaea..899976a7d 100755
--- a/tests/buildkite/build-containers.sh
+++ b/tests/buildkite/build-containers.sh
@@ -23,10 +23,15 @@ case "${container}" in
   gpu|rmm)
     BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
     BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
+    if [[ $container == "rmm" ]]
+    then
+      BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION"
+    fi
     ;;
 
   gpu_build_centos7|jvm_gpu_build)
     BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
+    BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION"
     ;;
 
   *)
diff --git a/tests/buildkite/build-cuda-with-rmm.sh b/tests/buildkite/build-cuda-with-rmm.sh
index ae704ce66..f474f318b 100755
--- a/tests/buildkite/build-cuda-with-rmm.sh
+++ b/tests/buildkite/build-cuda-with-rmm.sh
@@ -15,7 +15,8 @@ fi
 
 command_wrapper="tests/ci_build/ci_build.sh rmm docker --build-arg "`
                 `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "`
-                `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
+                `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION --build-arg "`
+                `"NCCL_VERSION_ARG=$NCCL_VERSION"
 
 echo "--- Build libxgboost from the source"
 $command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=gpu_test -DUSE_CUDA=ON \
diff --git a/tests/buildkite/build-cuda.sh b/tests/buildkite/build-cuda.sh
index a50963f7c..b25345b1b 100755
--- a/tests/buildkite/build-cuda.sh
+++ b/tests/buildkite/build-cuda.sh
@@ -16,7 +16,8 @@ else
 fi
 
 command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg "`
-                `"CUDA_VERSION_ARG=$CUDA_VERSION"
+                `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "`
+                `"NCCL_VERSION_ARG=$NCCL_VERSION"
 
 echo "--- Build libxgboost from the source"
 $command_wrapper tests/ci_build/prune_libnccl.sh
diff --git a/tests/buildkite/build-jvm-packages-gpu.sh b/tests/buildkite/build-jvm-packages-gpu.sh
index 30e73eb37..6a9a29cb3 100755
--- a/tests/buildkite/build-jvm-packages-gpu.sh
+++ b/tests/buildkite/build-jvm-packages-gpu.sh
@@ -14,5 +14,7 @@ else
 fi
 
 tests/ci_build/ci_build.sh jvm_gpu_build nvidia-docker \
-  --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} tests/ci_build/build_jvm_packages.sh \
+  --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \
+  --build-arg NCCL_VERSION_ARG=${NCCL_VERSION} \
+  tests/ci_build/build_jvm_packages.sh \
   ${SPARK_VERSION} -Duse.cuda=ON ${arch_flag}
diff --git a/tests/buildkite/build-win64-gpu.ps1 b/tests/buildkite/build-win64-gpu.ps1
index 6ee723abb..05d7aefb9 100644
--- a/tests/buildkite/build-win64-gpu.ps1
+++ b/tests/buildkite/build-win64-gpu.ps1
@@ -12,10 +12,10 @@ if ( $is_release_branch -eq 0 ) {
 }
 mkdir build
 cd build
-cmake .. -G"Visual Studio 15 2017 Win64" -DUSE_CUDA=ON -DCMAKE_VERBOSE_MAKEFILE=ON `
-  -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_UNITY_BUILD=ON ${arch_flag}
+cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DCMAKE_VERBOSE_MAKEFILE=ON `
+  -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON ${arch_flag}
 $msbuild = -join @(
-  "C:\\Program Files (x86)\\Microsoft Visual Studio\\2017\\Community\\MSBuild\\15.0"
+  "C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\MSBuild\\Current"
   "\\Bin\\MSBuild.exe"
 )
 & $msbuild xgboost.sln /m /p:Configuration=Release /nodeReuse:false
diff --git a/tests/buildkite/conftest.sh b/tests/buildkite/conftest.sh
index 8e315c9cd..cf9270c11 100755
--- a/tests/buildkite/conftest.sh
+++ b/tests/buildkite/conftest.sh
@@ -22,8 +22,9 @@ function set_buildkite_env_vars_in_container {
 
 set -x
 
-CUDA_VERSION=11.0.3
-RAPIDS_VERSION=22.10
+CUDA_VERSION=11.8.0
+NCCL_VERSION=2.16.5-1
+RAPIDS_VERSION=23.02
 SPARK_VERSION=3.1.1
 JDK_VERSION=8
 
diff --git a/tests/buildkite/deploy-jvm-packages.sh b/tests/buildkite/deploy-jvm-packages.sh
index 6ae5a719d..a3410b294 100755
--- a/tests/buildkite/deploy-jvm-packages.sh
+++ b/tests/buildkite/deploy-jvm-packages.sh
@@ -9,5 +9,6 @@ then
   echo "--- Deploy JVM packages to xgboost-maven-repo S3 repo"
   tests/ci_build/ci_build.sh jvm_gpu_build docker \
     --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \
+    --build-arg NCCL_VERSION_ARG=${NCCL_VERSION} \
     tests/ci_build/deploy_jvm_packages.sh ${SPARK_VERSION}
 fi
diff --git a/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py b/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py
index b9409de4c..4277eed53 100644
--- a/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py
+++ b/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py
@@ -2,12 +2,16 @@ import argparse
 import copy
 import os
 import re
+import sys
 
 import boto3
 import botocore
 from metadata import AMI_ID, COMMON_STACK_PARAMS, STACK_PARAMS
 
 current_dir = os.path.dirname(__file__)
+sys.path.append(os.path.join(current_dir, ".."))
+
+from common_blocks.utils import create_or_update_stack, wait
 
 TEMPLATE_URL = "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml"
 
@@ -68,72 +72,7 @@ def get_full_stack_id(stack_id):
     return f"buildkite-{stack_id}-autoscaling-group"
 
 
-def stack_exists(args, *, stack_name):
-    client = boto3.client("cloudformation", region_name=args.aws_region)
-    waiter = client.get_waiter("stack_exists")
-    try:
-        waiter.wait(StackName=stack_name, WaiterConfig={"MaxAttempts": 1})
-        return True
-    except botocore.exceptions.WaiterError as e:
-        return False
-
-
-def create_or_update_stack(
-    args, *, stack_name, template_url=None, template_body=None, params=None
-):
-    kwargs = {
-        "StackName": stack_name,
-        "Capabilities": [
-            "CAPABILITY_IAM",
-            "CAPABILITY_NAMED_IAM",
-            "CAPABILITY_AUTO_EXPAND",
-        ],
-    }
-    if template_url:
-        kwargs["TemplateURL"] = template_url
-    if template_body:
-        kwargs["TemplateBody"] = template_body
-    if params:
-        kwargs["Parameters"] = params
-
-    client = boto3.client("cloudformation", region_name=args.aws_region)
-
-    if stack_exists(args, stack_name=stack_name):
-        print(f"Stack {stack_name} already exists. Updating...")
-        try:
-            response = client.update_stack(**kwargs)
-            return {"StackName": stack_name, "Action": "update"}
-        except botocore.exceptions.ClientError as e:
-            if e.response["Error"]["Code"] == "ValidationError" and re.search(
-                "No updates are to be performed", e.response["Error"]["Message"]
-            ):
-                print(f"No update was made to {stack_name}")
-                return {"StackName": stack_name, "Action": "noop"}
-            else:
-                raise e
-    else:
-        kwargs.update({"OnFailure": "ROLLBACK", "EnableTerminationProtection": False})
-        response = client.create_stack(**kwargs)
-        return {"StackName": stack_name, "Action": "create"}
-
-
-def wait(promise):
-    client = boto3.client("cloudformation", region_name=args.aws_region)
-    stack_name = promise["StackName"]
-    print(f"Waiting for {stack_name}...")
-    if promise["Action"] == "create":
-        waiter = client.get_waiter("stack_create_complete")
-        waiter.wait(StackName=stack_name)
-        print(f"Finished creating stack {stack_name}")
-    elif promise["Action"] == "update":
-        waiter = client.get_waiter("stack_update_complete")
-        waiter.wait(StackName=stack_name)
-        print(f"Finished updating stack {stack_name}")
-    elif promise["Action"] != "noop":
-        raise ValueError(f"Invalid promise {promise}")
-
-
-def create_agent_iam_policy(args):
+def create_agent_iam_policy(args, *, client):
     policy_stack_name = "buildkite-agent-iam-policy"
     print(f"Creating stack {policy_stack_name} for agent IAM policy...")
     with open(
@@ -142,9 +81,9 @@ def create_agent_iam_policy(args):
     ) as f:
         policy_template = f.read()
     promise = create_or_update_stack(
-        args, stack_name=policy_stack_name, template_body=policy_template
+        args, client=client, stack_name=policy_stack_name, template_body=policy_template
     )
-    wait(promise)
+    wait(promise, client=client)
 
     cf = boto3.resource("cloudformation", region_name=args.aws_region)
     policy = cf.StackResource(policy_stack_name, "BuildkiteAgentManagedPolicy")
@@ -152,10 +91,10 @@ def create_agent_iam_policy(args):
 
 
 def main(args):
-    agent_iam_policy = create_agent_iam_policy(args)
-
     client = boto3.client("cloudformation", region_name=args.aws_region)
 
+    agent_iam_policy = create_agent_iam_policy(args, client=client)
+
     promises = []
 
     for stack_id in AMI_ID:
@@ -167,13 +106,17 @@ def main(args):
         )
 
         promise = create_or_update_stack(
-            args, stack_name=stack_id_full, template_url=TEMPLATE_URL, params=params
+            args,
+            client=client,
+            stack_name=stack_id_full,
+            template_url=TEMPLATE_URL,
+            params=params,
         )
         promises.append(promise)
         print(f"CI stack {stack_id_full} is in progress in the background")
 
     for promise in promises:
-        wait(promise)
+        wait(promise, client=client)
 
 
 if __name__ == "__main__":
diff --git a/tests/buildkite/infrastructure/aws-stack-creator/metadata.py b/tests/buildkite/infrastructure/aws-stack-creator/metadata.py
index edb4cc036..30aa20a09 100644
--- a/tests/buildkite/infrastructure/aws-stack-creator/metadata.py
+++ b/tests/buildkite/infrastructure/aws-stack-creator/metadata.py
@@ -1,27 +1,27 @@
 AMI_ID = {
     # Managed by XGBoost team
     "linux-amd64-gpu": {
-        "us-west-2": "ami-00ed92bd37f77bc33",
+        "us-west-2": "ami-094271bed4788ddb5",
     },
     "linux-amd64-mgpu": {
-        "us-west-2": "ami-00ed92bd37f77bc33",
+        "us-west-2": "ami-094271bed4788ddb5",
     },
     "windows-gpu": {
-        "us-west-2": "ami-0a1a2ea551a07ad5f",
+        "us-west-2": "ami-0839681594a1d7627",
     },
     "windows-cpu": {
-        "us-west-2": "ami-0a1a2ea551a07ad5f",
+        "us-west-2": "ami-0839681594a1d7627",
     },
     # Managed by BuildKite
     # from https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml
     "linux-amd64-cpu": {
-        "us-west-2": "ami-075d4c25d5f0c17c1",
+        "us-west-2": "ami-00f2127550cf03658",
     },
     "pipeline-loader": {
-        "us-west-2": "ami-075d4c25d5f0c17c1",
+        "us-west-2": "ami-00f2127550cf03658",
     },
     "linux-arm64-cpu": {
-        "us-west-2": "ami-0952c6fb6db9a9891",
+        "us-west-2": "ami-0c5789068f4a2d1b5",
     },
 }
 
diff --git a/tests/buildkite/infrastructure/common_blocks/utils.py b/tests/buildkite/infrastructure/common_blocks/utils.py
new file mode 100644
index 000000000..27a0835e8
--- /dev/null
+++ b/tests/buildkite/infrastructure/common_blocks/utils.py
@@ -0,0 +1,97 @@
+import re
+
+import boto3
+import botocore
+
+
+def stack_exists(args, *, stack_name):
+    client = boto3.client("cloudformation", region_name=args.aws_region)
+    waiter = client.get_waiter("stack_exists")
+    try:
+        waiter.wait(StackName=stack_name, WaiterConfig={"MaxAttempts": 1})
+        return True
+    except botocore.exceptions.WaiterError as e:
+        return False
+
+
+def create_or_update_stack(
+    args, *, client, stack_name, template_url=None, template_body=None, params=None
+):
+    kwargs = {
+        "StackName": stack_name,
+        "Capabilities": [
+            "CAPABILITY_IAM",
+            "CAPABILITY_NAMED_IAM",
+            "CAPABILITY_AUTO_EXPAND",
+        ],
+    }
+    if template_url:
+        kwargs["TemplateURL"] = template_url
+    if template_body:
+        kwargs["TemplateBody"] = template_body
+    if params:
+        kwargs["Parameters"] = params
+
+    if stack_exists(args, stack_name=stack_name):
+        print(f"Stack {stack_name} already exists. Updating...")
+        try:
+            response = client.update_stack(**kwargs)
+            return {"StackName": stack_name, "Action": "update"}
+        except botocore.exceptions.ClientError as e:
+            if e.response["Error"]["Code"] == "ValidationError" and re.search(
+                "No updates are to be performed", e.response["Error"]["Message"]
+            ):
+                print(f"No update was made to {stack_name}")
+                return {"StackName": stack_name, "Action": "noop"}
+            else:
+                raise e
+    else:
+        kwargs.update({"OnFailure": "ROLLBACK", "EnableTerminationProtection": False})
+        response = client.create_stack(**kwargs)
+        return {"StackName": stack_name, "Action": "create"}
+
+
+def replace_stack(
+    args, *, client, stack_name, template_url=None, template_body=None, params=None
+):
+    """Delete an existing stack and create a new stack with identical name"""
+
+    if not stack_exists(args, stack_name=stack_name):
+        raise ValueError(f"Stack {stack_name} does not exist")
+    r = client.delete_stack(StackName=stack_name)
+    delete_waiter = client.get_waiter("stack_delete_complete")
+    delete_waiter.wait(StackName=stack_name)
+
+    kwargs = {
+        "StackName": stack_name,
+        "Capabilities": [
+            "CAPABILITY_IAM",
+            "CAPABILITY_NAMED_IAM",
+            "CAPABILITY_AUTO_EXPAND",
+        ],
+        "OnFailure": "ROLLBACK",
+        "EnableTerminationProtection": False,
+    }
+    if template_url:
+        kwargs["TemplateURL"] = template_url
+    if template_body:
+        kwargs["TemplateBody"] = template_body
+    if params:
+        kwargs["Parameters"] = params
+    response = client.create_stack(**kwargs)
+    return {"StackName": stack_name, "Action": "create"}
+
+
+def wait(promise, *, client):
+    stack_name = promise["StackName"]
+    print(f"Waiting for {stack_name}...")
+    if promise["Action"] == "create":
+        waiter = client.get_waiter("stack_create_complete")
+        waiter.wait(StackName=stack_name)
+        print(f"Finished creating stack {stack_name}")
+    elif promise["Action"] == "update":
+        waiter = client.get_waiter("stack_update_complete")
+        waiter.wait(StackName=stack_name)
+        print(f"Finished updating stack {stack_name}")
+    elif promise["Action"] != "noop":
+        raise ValueError(f"Invalid promise {promise}")
diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py b/tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py
index 0c71d5e77..8051b991d 100644
--- a/tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py
+++ b/tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py
@@ -2,6 +2,7 @@ import argparse
 import copy
 import json
 import os
+import sys
 from urllib.request import urlopen
 
 import boto3
@@ -9,6 +10,9 @@ import cfn_flip
 from metadata import IMAGE_PARAMS
 
 current_dir = os.path.dirname(__file__)
+sys.path.append(os.path.join(current_dir, ".."))
+
+from common_blocks.utils import replace_stack, wait
 
 BUILDKITE_CF_TEMPLATE_URL = (
     "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml"
@@ -47,6 +51,9 @@ def main(args):
 
     ami_mapping = get_ami_mapping()
 
+    client = boto3.client("cloudformation", region_name=args.aws_region)
+    promises = []
+
     for stack_id in IMAGE_PARAMS:
         stack_id_full = get_full_stack_id(stack_id)
         print(f"Creating EC2 image builder stack {stack_id_full}...")
@@ -55,28 +62,20 @@ def main(args):
             stack_id=stack_id, aws_region=args.aws_region, ami_mapping=ami_mapping
         )
 
-        client = boto3.client("cloudformation", region_name=args.aws_region)
-        response = client.create_stack(
-            StackName=stack_id_full,
-            TemplateBody=ec2_image_pipeline_template,
-            Capabilities=[
-                "CAPABILITY_IAM",
-                "CAPABILITY_NAMED_IAM",
-                "CAPABILITY_AUTO_EXPAND",
-            ],
-            OnFailure="ROLLBACK",
-            EnableTerminationProtection=False,
-            Parameters=params,
+        promise = replace_stack(
+            args,
+            client=client,
+            stack_name=stack_id_full,
+            template_body=ec2_image_pipeline_template,
+            params=params,
         )
+        promises.append(promise)
         print(
             f"EC2 image builder stack {stack_id_full} is in progress in the background"
         )
 
-    for stack_id in IMAGE_PARAMS:
-        stack_id_full = get_full_stack_id(stack_id)
-        waiter = client.get_waiter("stack_create_complete")
-        waiter.wait(StackName=stack_id_full)
-        print(f"EC2 image builder stack {stack_id_full} is now finished.")
+    for promise in promises:
+        wait(promise, client=client)
 
 
 if __name__ == "__main__":
diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/ec2-image-builder-pipeline-template.yml b/tests/buildkite/infrastructure/worker-image-pipeline/ec2-image-builder-pipeline-template.yml
index 478adf3d4..8d3bafa72 100644
--- a/tests/buildkite/infrastructure/worker-image-pipeline/ec2-image-builder-pipeline-template.yml
+++ b/tests/buildkite/infrastructure/worker-image-pipeline/ec2-image-builder-pipeline-template.yml
@@ -58,7 +58,7 @@ Resources:
   BootstrapComponent:
     Type: AWS::ImageBuilder::Component
     Properties:
-      Name: !Sub "${AWS::StackName}-bootstrap-component"
+      Name: !Join ["-", [!Ref AWS::StackName, "bootstrap-component", !Select [2, !Split ['/', !Ref AWS::StackId]]]]
       Platform: !Ref InstanceOperatingSystem
       Version: "1.0.0"
       Description: Execute a bootstrap script.
@@ -67,7 +67,7 @@ Resources:
   Recipe:
     Type: AWS::ImageBuilder::ImageRecipe
     Properties:
-      Name: !Sub "${AWS::StackName}-image"
+      Name: !Join ["-", [!Ref AWS::StackName, "image", !Select [2, !Split ['/', !Ref AWS::StackId]]]]
       Components:
         - ComponentArn: !Ref BootstrapComponent
       ParentImage: !Ref BaseImageId
@@ -83,7 +83,7 @@ Resources:
   Infrastructure:
     Type: AWS::ImageBuilder::InfrastructureConfiguration
     Properties:
-      Name: !Sub "${AWS::StackName}-image-pipeline-infrastructure"
+      Name: !Join ["-", [!Ref AWS::StackName, "image-pipeline-infrastructure", !Select [2, !Split ['/', !Ref AWS::StackId]]]]
       InstanceProfileName: !Ref InstanceProfile
       InstanceTypes:
         - !Ref InstanceType
@@ -93,7 +93,7 @@ Resources:
   Distribution:
     Type: AWS::ImageBuilder::DistributionConfiguration
     Properties:
-      Name: !Sub "${AWS::StackName}-image-pipeline-distribution-config"
+      Name: !Join ["-", [!Ref AWS::StackName, "image-pipeline-distribution-config", !Select [2, !Split ['/', !Ref AWS::StackId]]]]
       Distributions:
         - Region: !Ref AWS::Region
           AmiDistributionConfiguration: {}
@@ -102,7 +102,7 @@ Resources:
   Pipeline:
     Type: AWS::ImageBuilder::ImagePipeline
     Properties:
-      Name: !Sub "${AWS::StackName}-image-pipeline"
+      Name: !Join ["-", [!Ref AWS::StackName, "image-pipeline", !Select [2, !Split ['/', !Ref AWS::StackId]]]]
       DistributionConfigurationArn: !Ref Distribution
       ImageRecipeArn: !Ref Recipe
       InfrastructureConfigurationArn: !Ref Infrastructure
diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/metadata.py b/tests/buildkite/infrastructure/worker-image-pipeline/metadata.py
index c74914e54..37100209f 100644
--- a/tests/buildkite/infrastructure/worker-image-pipeline/metadata.py
+++ b/tests/buildkite/infrastructure/worker-image-pipeline/metadata.py
@@ -13,6 +13,6 @@ IMAGE_PARAMS = {
         "BootstrapScript": "windows-gpu-bootstrap.yml",
         "InstanceType": "g4dn.2xlarge",
         "InstanceOperatingSystem": "Windows",
-        "VolumeSize": "80",  # in GiBs
+        "VolumeSize": "120",  # in GiBs
     },
 }
diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml b/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml
index ef3fade44..03fb105a7 100644
--- a/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml
+++ b/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml
@@ -15,9 +15,9 @@ phases:
               choco --version
               choco feature enable -n=allowGlobalConfirmation
 
-              # CMake 3.18
-              Write-Host '>>> Installing CMake 3.18...'
-              choco install cmake --version 3.18.0 --installargs "ADD_CMAKE_TO_PATH=System"
+              # CMake 3.25
+              Write-Host '>>> Installing CMake 3.25...'
+              choco install cmake --version 3.25.2 --installargs "ADD_CMAKE_TO_PATH=System"
               if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
 
               # Notepad++
@@ -45,18 +45,18 @@ phases:
               choco install graphviz
               if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
 
-              # Install Visual Studio Community 2017 (15.9)
-              Write-Host '>>> Installing Visual Studio 2017 Community (15.9)...'
-              choco install visualstudio2017community --version 15.9.23.0 `
+              # Install Visual Studio 2022 Community
+              Write-Host '>>> Installing Visual Studio 2022 Community...'
+              choco install visualstudio2022community `
                   --params "--wait --passive --norestart"
               if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
-              choco install visualstudio2017-workload-nativedesktop --params `
+              choco install visualstudio2022-workload-nativedesktop --params `
                   "--wait --passive --norestart --includeOptional"
               if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
 
-              # Install CUDA 11.0
-              Write-Host '>>> Installing CUDA 11.0...'
-              choco install cuda --version 11.0.3
+              # Install CUDA 11.8
+              Write-Host '>>> Installing CUDA 11.8...'
+              choco install cuda --version=11.8.0.52206
               if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
 
               # Install Python packages
diff --git a/tests/buildkite/test-cpp-gpu.sh b/tests/buildkite/test-cpp-gpu.sh
index f1ddf9d5f..75a600d7a 100755
--- a/tests/buildkite/test-cpp-gpu.sh
+++ b/tests/buildkite/test-cpp-gpu.sh
@@ -20,4 +20,5 @@ tests/ci_build/ci_build.sh gpu nvidia-docker \
 # tests/ci_build/ci_build.sh rmm nvidia-docker \
 #   --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
-#   --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION bash -c \
+#   --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
+#   --build-arg NCCL_VERSION_ARG=$NCCL_VERSION bash -c \
 #   "source activate gpu_test && build/testxgboost --use-rmm-pool"
diff --git a/tests/ci_build/Dockerfile.aarch64 b/tests/ci_build/Dockerfile.aarch64
index 848b50263..9b06e1c83 100644
--- a/tests/ci_build/Dockerfile.aarch64
+++ b/tests/ci_build/Dockerfile.aarch64
@@ -8,15 +8,15 @@ RUN \
     yum install -y tar unzip wget xz git centos-release-scl-rh yum-utils && \
     yum-config-manager --enable centos-sclo-rh-testing && \
     yum update -y && \
-    yum install -y devtoolset-7 && \
+    yum install -y devtoolset-9 && \
     # Python
     wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-aarch64.sh && \
     bash conda.sh -b -p /opt/mambaforge
 
 ENV PATH=/opt/mambaforge/bin:$PATH
-ENV CC=/opt/rh/devtoolset-7/root/usr/bin/gcc
-ENV CXX=/opt/rh/devtoolset-7/root/usr/bin/c++
-ENV CPP=/opt/rh/devtoolset-7/root/usr/bin/cpp
+ENV CC=/opt/rh/devtoolset-9/root/usr/bin/gcc
+ENV CXX=/opt/rh/devtoolset-9/root/usr/bin/c++
+ENV CPP=/opt/rh/devtoolset-9/root/usr/bin/cpp
 ENV GOSU_VERSION 1.10
 
 # Create new Conda environment
diff --git a/tests/ci_build/Dockerfile.clang_tidy b/tests/ci_build/Dockerfile.clang_tidy
index b0166f240..967f24d3c 100644
--- a/tests/ci_build/Dockerfile.clang_tidy
+++ b/tests/ci_build/Dockerfile.clang_tidy
@@ -1,5 +1,5 @@
 ARG CUDA_VERSION_ARG
-FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu18.04
+FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu20.04
 ARG CUDA_VERSION_ARG
 
 # Environment
@@ -7,21 +7,21 @@ ENV DEBIAN_FRONTEND noninteractive
 
 # Install all basic requirements
 RUN \
-    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub && \
+    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
     apt-get update && \
     apt-get install -y tar unzip wget git build-essential python3 python3-pip software-properties-common \
                        apt-transport-https ca-certificates gnupg-agent && \
     wget -nv -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \
-    add-apt-repository -u 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main' && \
+    add-apt-repository -u 'deb http://apt.llvm.org/focal/ llvm-toolchain-focal-15 main' && \
     apt-get update && \
-    apt-get install -y llvm-11 clang-tidy-11 clang-11 && \
-    wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
-    bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr
+    apt-get install -y llvm-15 clang-tidy-15 clang-15 libomp-15-dev && \
+    wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \
+    bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr
 
 # Set default clang-tidy version
 RUN \
-    update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-11 100 && \
-    update-alternatives --install /usr/bin/clang clang /usr/bin/clang-11 100
+    update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-15 100 && \
+    update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 100
 
 # Install Python packages
 RUN \
diff --git a/tests/ci_build/Dockerfile.cpu b/tests/ci_build/Dockerfile.cpu
index c1c20ba37..fa9ea772d 100644
--- a/tests/ci_build/Dockerfile.cpu
+++ b/tests/ci_build/Dockerfile.cpu
@@ -1,4 +1,4 @@
-FROM ubuntu:18.04
+FROM ubuntu:22.04
 
 # Environment
 ENV DEBIAN_FRONTEND noninteractive
@@ -10,18 +10,15 @@ RUN \
     apt-get install -y software-properties-common && \
     add-apt-repository ppa:ubuntu-toolchain-r/test && \
     apt-get update && \
-    apt-get install -y tar unzip wget git build-essential doxygen graphviz llvm libasan2 libidn11 ninja-build gcc-8 g++-8 openjdk-8-jdk-headless && \
-    # CMake
-    wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
-    bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
+    apt-get install -y tar unzip wget git build-essential doxygen graphviz llvm libidn12 cmake ninja-build gcc-9 g++-9 openjdk-8-jdk-headless && \
     # Python
     wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \
     bash conda.sh -b -p /opt/mambaforge
 
 ENV PATH=/opt/mambaforge/bin:$PATH
-ENV CC=gcc-8
-ENV CXX=g++-8
-ENV CPP=cpp-8
+ENV CC=gcc-9
+ENV CXX=g++-9
+ENV CPP=cpp-9
 
 ENV GOSU_VERSION 1.10
 ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
diff --git a/tests/ci_build/Dockerfile.gpu b/tests/ci_build/Dockerfile.gpu
index 04dc5bcd0..3b5701693 100644
--- a/tests/ci_build/Dockerfile.gpu
+++ b/tests/ci_build/Dockerfile.gpu
@@ -22,10 +22,10 @@ ENV PATH=/opt/mambaforge/bin:$PATH
 RUN \
     conda install -c conda-forge mamba && \
     mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
-        python=3.9 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
+        python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
         dask dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
         numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
-        pyspark cloudpickle cuda-python=11.7.0 && \
+        pyspark cloudpickle cuda-python && \
     mamba clean --all && \
     conda run --no-capture-output -n gpu_test pip install buildkite-test-collector
 
diff --git a/tests/ci_build/Dockerfile.gpu_build_centos7 b/tests/ci_build/Dockerfile.gpu_build_centos7
index fb27cf4f2..bfc79c216 100644
--- a/tests/ci_build/Dockerfile.gpu_build_centos7
+++ b/tests/ci_build/Dockerfile.gpu_build_centos7
@@ -1,6 +1,7 @@
 ARG CUDA_VERSION_ARG
 FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
 ARG CUDA_VERSION_ARG
+ARG NCCL_VERSION_ARG
 
 # Install all basic requirements
 RUN \
@@ -9,7 +10,7 @@ RUN \
     yum install -y epel-release centos-release-scl && \
     yum-config-manager --enable centos-sclo-rh-testing && \
     yum -y update && \
-    yum install -y tar unzip wget xz git which ninja-build devtoolset-8-gcc devtoolset-8-binutils devtoolset-8-gcc-c++ && \
+    yum install -y tar unzip wget xz git which ninja-build devtoolset-9-gcc devtoolset-9-binutils devtoolset-9-gcc-c++ && \
     # Python
     wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \
     bash conda.sh -b -p /opt/mambaforge && \
@@ -21,7 +22,7 @@ RUN \
 # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
 RUN \
     export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
-    export NCCL_VERSION=2.13.4-1 && \
+    export NCCL_VERSION=$NCCL_VERSION_ARG && \
     wget -nv -nc https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
     rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
     yum -y update && \
@@ -29,9 +30,9 @@ RUN \
     rm -f nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm;
 
 ENV PATH=/opt/mambaforge/bin:/usr/local/ninja:$PATH
-ENV CC=/opt/rh/devtoolset-8/root/usr/bin/gcc
-ENV CXX=/opt/rh/devtoolset-8/root/usr/bin/c++
-ENV CPP=/opt/rh/devtoolset-8/root/usr/bin/cpp
+ENV CC=/opt/rh/devtoolset-9/root/usr/bin/gcc
+ENV CXX=/opt/rh/devtoolset-9/root/usr/bin/c++
+ENV CPP=/opt/rh/devtoolset-9/root/usr/bin/cpp
 
 ENV GOSU_VERSION 1.10
 
diff --git a/tests/ci_build/Dockerfile.gpu_build_r_centos7 b/tests/ci_build/Dockerfile.gpu_build_r_centos7
index ad5f15495..6cfd30fe5 100644
--- a/tests/ci_build/Dockerfile.gpu_build_r_centos7
+++ b/tests/ci_build/Dockerfile.gpu_build_r_centos7
@@ -12,16 +12,16 @@ RUN \
     yum install -y tar unzip wget xz git which ninja-build readline-devel libX11-devel libXt-devel \
                    xorg-x11-server-devel openssl-devel zlib-devel bzip2-devel xz-devel \
                    pcre-devel libcurl-devel texlive-* \
-                   devtoolset-8-gcc devtoolset-8-binutils devtoolset-8-gcc-c++ \
-                   devtoolset-8-gcc-gfortran devtoolset-8-libquadmath-devel \
-                   devtoolset-8-runtime devtoolset-8-libstdc++-devel
+                   devtoolset-9-gcc devtoolset-9-binutils devtoolset-9-gcc-c++ \
+                   devtoolset-9-gcc-gfortran devtoolset-9-libquadmath-devel \
+                   devtoolset-9-runtime devtoolset-9-libstdc++-devel
 
 ENV PATH=/opt/mambaforge/bin:/usr/local/ninja:/opt/software/packages/bin:/opt/R/3.3.0/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/software/packages/lib:/opt/R/3.3.0/lib64:$LD_LIBRARY_PATH
-ENV CC=/opt/rh/devtoolset-8/root/usr/bin/gcc
-ENV CXX=/opt/rh/devtoolset-8/root/usr/bin/c++
-ENV CPP=/opt/rh/devtoolset-8/root/usr/bin/cpp
-ENV F77=/opt/rh/devtoolset-8/root/usr/bin/gfortran
+ENV CC=/opt/rh/devtoolset-9/root/usr/bin/gcc
+ENV CXX=/opt/rh/devtoolset-9/root/usr/bin/c++
+ENV CPP=/opt/rh/devtoolset-9/root/usr/bin/cpp
+ENV F77=/opt/rh/devtoolset-9/root/usr/bin/gfortran
 
 # R 3.3.0
 RUN \
@@ -36,8 +36,8 @@ RUN \
     bash conda.sh -b -p /opt/mambaforge && \
     /opt/mambaforge/bin/python -m pip install auditwheel awscli && \
     # CMake
-    wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
-    bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr
+    wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \
+    bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr
 
 ENV GOSU_VERSION 1.10
 
diff --git a/tests/ci_build/Dockerfile.jvm b/tests/ci_build/Dockerfile.jvm
index 4c5e21203..43fbd8ff5 100644
--- a/tests/ci_build/Dockerfile.jvm
+++ b/tests/ci_build/Dockerfile.jvm
@@ -6,23 +6,23 @@ RUN \
     yum-config-manager --enable centos-sclo-rh-testing && \
     yum -y update && \
     yum install -y tar unzip make bzip2 wget xz git which ninja-build java-1.8.0-openjdk-devel \
-                   devtoolset-8-gcc devtoolset-8-binutils devtoolset-8-gcc-c++ \
-                   devtoolset-8-runtime devtoolset-8-libstdc++-devel && \
+                   devtoolset-9-gcc devtoolset-9-binutils devtoolset-9-gcc-c++ \
+                   devtoolset-9-runtime devtoolset-9-libstdc++-devel && \
     # Python
     wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \
     bash conda.sh -b -p /opt/mambaforge && \
     # CMake
-    wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
-    bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
+    wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \
+    bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
     # Maven
     wget -nv -nc https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
     tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
     ln -s /opt/apache-maven-3.6.1/ /opt/maven
 
 ENV PATH=/opt/mambaforge/bin:/opt/maven/bin:$PATH
-ENV CC=/opt/rh/devtoolset-8/root/usr/bin/gcc
-ENV CXX=/opt/rh/devtoolset-8/root/usr/bin/c++
-ENV CPP=/opt/rh/devtoolset-8/root/usr/bin/cpp
+ENV CC=/opt/rh/devtoolset-9/root/usr/bin/gcc
+ENV CXX=/opt/rh/devtoolset-9/root/usr/bin/c++
+ENV CPP=/opt/rh/devtoolset-9/root/usr/bin/cpp
 
 # Install Python packages
 RUN \
diff --git a/tests/ci_build/Dockerfile.jvm_gpu_build b/tests/ci_build/Dockerfile.jvm_gpu_build
index 304db2d52..d4a580495 100644
--- a/tests/ci_build/Dockerfile.jvm_gpu_build
+++ b/tests/ci_build/Dockerfile.jvm_gpu_build
@@ -1,6 +1,7 @@
 ARG CUDA_VERSION_ARG
 FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
 ARG CUDA_VERSION_ARG
+ARG NCCL_VERSION_ARG
 
 # Install all basic requirements
 RUN \
@@ -9,13 +10,13 @@ RUN \
     yum install -y epel-release centos-release-scl && \
     yum-config-manager --enable centos-sclo-rh-testing && \
     yum -y update && \
-    yum install -y tar unzip wget xz git which ninja-build java-1.8.0-openjdk-devel devtoolset-8-gcc devtoolset-8-binutils devtoolset-8-gcc-c++ && \
+    yum install -y tar unzip wget xz git which ninja-build java-1.8.0-openjdk-devel devtoolset-9-gcc devtoolset-9-binutils devtoolset-9-gcc-c++ && \
     # Python
     wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \
     bash conda.sh -b -p /opt/mambaforge && \
     # CMake
-    wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
-    bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
+    wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \
+    bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
     # Maven
     wget -nv -nc https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
     tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
@@ -24,15 +25,15 @@ RUN \
 # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
 RUN \
     export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
-    export NCCL_VERSION=2.13.4-1 && \
+    export NCCL_VERSION=$NCCL_VERSION_ARG && \
     yum-config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
     yum -y update && \
     yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-static-${NCCL_VERSION}+cuda${CUDA_SHORT}
 
 ENV PATH=/opt/mambaforge/bin:/opt/maven/bin:$PATH
-ENV CC=/opt/rh/devtoolset-8/root/usr/bin/gcc
-ENV CXX=/opt/rh/devtoolset-8/root/usr/bin/c++
-ENV CPP=/opt/rh/devtoolset-8/root/usr/bin/cpp
+ENV CC=/opt/rh/devtoolset-9/root/usr/bin/gcc
+ENV CXX=/opt/rh/devtoolset-9/root/usr/bin/c++
+ENV CPP=/opt/rh/devtoolset-9/root/usr/bin/cpp
 
 # Install Python packages
 RUN \
diff --git a/tests/ci_build/Dockerfile.rmm b/tests/ci_build/Dockerfile.rmm
index a1fce9c00..16db377c2 100644
--- a/tests/ci_build/Dockerfile.rmm
+++ b/tests/ci_build/Dockerfile.rmm
@@ -1,7 +1,8 @@
 ARG CUDA_VERSION_ARG
-FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu18.04
+FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu20.04
 ARG CUDA_VERSION_ARG
 ARG RAPIDS_VERSION_ARG
+ARG NCCL_VERSION_ARG
 
 # Environment
 ENV DEBIAN_FRONTEND noninteractive
@@ -19,7 +20,7 @@ RUN \
 # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
 RUN \
     export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
-    export NCCL_VERSION=2.13.4-1 && \
+    export NCCL_VERSION=$NCCL_VERSION_ARG && \
     apt-get update && \
     apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT}
 
@@ -29,7 +30,7 @@ ENV PATH=/opt/mambaforge/bin:$PATH
 RUN \
     conda install -c conda-forge mamba && \
     mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
-        python=3.9 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake && \
+        python=3.10 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake && \
     mamba clean --all
 
 ENV GOSU_VERSION 1.10
diff --git a/tests/ci_build/build_r_pkg_with_cuda_win64.sh b/tests/ci_build/build_r_pkg_with_cuda_win64.sh
index 7d32bfe6a..ca67704b5 100644
--- a/tests/ci_build/build_r_pkg_with_cuda_win64.sh
+++ b/tests/ci_build/build_r_pkg_with_cuda_win64.sh
@@ -18,7 +18,7 @@ mv xgboost/ xgboost_rpack/
 
 mkdir build
 cd build
-cmake .. -G"Visual Studio 15 2017 Win64" -DUSE_CUDA=ON -DR_LIB=ON -DLIBR_HOME="c:\\Program Files\\R\\R-3.6.3"
+cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DR_LIB=ON -DLIBR_HOME="c:\\Program Files\\R\\R-3.6.3"
 cmake --build . --config Release --parallel
 cd ..
 
diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py
index 34eb92fa6..8d601f355 100644
--- a/tests/ci_build/lint_python.py
+++ b/tests/ci_build/lint_python.py
@@ -3,12 +3,15 @@ import os
 import subprocess
 import sys
 from multiprocessing import Pool, cpu_count
-from typing import Dict, Tuple
+from typing import Dict, Optional, Tuple
 
 from pylint import epylint
 from test_utils import PY_PACKAGE, ROOT, cd, print_time, record_time
 
 CURDIR = os.path.normpath(os.path.abspath(os.path.dirname(__file__)))
+SRCPATH = os.path.normpath(
+    os.path.join(CURDIR, os.path.pardir, os.path.pardir, "python-package")
+)
 
 
 @record_time
@@ -29,7 +32,7 @@ Please run the following command on your machine to address the formatting error
 
 @record_time
 def run_isort(rel_path: str) -> bool:
-    cmd = ["isort", "--check", "--profile=black", rel_path]
+    cmd = ["isort", f"--src={SRCPATH}", "--check", "--profile=black", rel_path]
     ret = subprocess.run(cmd).returncode
     if ret != 0:
         subprocess.run(["isort", "--version"])
@@ -151,6 +154,7 @@ def main(args: argparse.Namespace) -> None:
                 "demo/guide-python/sklearn_parallel.py",
                 "demo/guide-python/spark_estimator_examples.py",
                 "demo/guide-python/individual_trees.py",
+                "demo/guide-python/quantile_regression.py",
                 # CI
                 "tests/ci_build/lint_python.py",
                 "tests/ci_build/test_r_package.py",
@@ -193,6 +197,7 @@ def main(args: argparse.Namespace) -> None:
                 "demo/guide-python/cat_in_the_dat.py",
                 "demo/guide-python/feature_weights.py",
                 "demo/guide-python/individual_trees.py",
+                "demo/guide-python/quantile_regression.py",
                 # tests
                 "tests/python/test_dt.py",
                 "tests/python/test_data_iterator.py",
diff --git a/tests/ci_build/tidy.py b/tests/ci_build/tidy.py
index 107e62662..33e153850 100755
--- a/tests/ci_build/tidy.py
+++ b/tests/ci_build/tidy.py
@@ -109,6 +109,10 @@ class ClangTidy(object):
                 continue
             elif components[i] == '-rdynamic':
                 continue
+            elif components[i] == "-Xfatbin=-compress-all":
+                continue
+            elif components[i] == "-forward-unknown-to-host-compiler":
+                continue
             elif (components[i] == '-x' and
                   components[i+1] == 'cu'):
                 # -x cu -> -x cuda
diff --git a/tests/cpp/c_api/test_c_api.cc b/tests/cpp/c_api/test_c_api.cc
index 6b5bc7cb8..675da940c 100644
--- a/tests/cpp/c_api/test_c_api.cc
+++ b/tests/cpp/c_api/test_c_api.cc
@@ -267,7 +267,7 @@ TEST(CAPI, DMatrixSetFeatureName) {
   }
 
   char const* feat_types [] {"i", "q"};
-  static_assert(sizeof(feat_types)/ sizeof(feat_types[0]) == kCols, "");
+  static_assert(sizeof(feat_types) / sizeof(feat_types[0]) == kCols);
   XGDMatrixSetStrFeatureInfo(handle, "feature_type", feat_types, kCols);
   char const **c_out_types;
   XGDMatrixGetStrFeatureInfo(handle, u8"feature_type", &out_len,
diff --git a/tests/cpp/common/test_algorithm.cc b/tests/cpp/common/test_algorithm.cc
new file mode 100644
index 000000000..630460714
--- /dev/null
+++ b/tests/cpp/common/test_algorithm.cc
@@ -0,0 +1,35 @@
+/**
+ * Copyright 2020-2023 by XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+#include <xgboost/context.h>  // Context
+#include <xgboost/span.h>
+
+#include <algorithm>  // is_sorted
+
+#include "../../../src/common/algorithm.h"
+
+namespace xgboost {
+namespace common {
+TEST(Algorithm, ArgSort) {  // ArgSort on a descending input must yield the reversed indices
+  Context ctx;
+  std::vector<float> inputs{3.0, 2.0, 1.0};
+  auto ret = ArgSort<bst_feature_t>(&ctx, inputs.cbegin(), inputs.cend());
+  std::vector<bst_feature_t> sol{2, 1, 0};  // expected: positions of 1.0, 2.0, 3.0 in inputs
+  ASSERT_EQ(ret, sol);
+}
+
+TEST(Algorithm, Sort) {  // Sort and StableSort must both leave the range in ascending order
+  Context ctx;
+  ctx.Init(Args{{"nthread", "8"}});  // set the context's nthread parameter to 8
+  std::vector<float> inputs{3.0, 1.0, 2.0};
+
+  Sort(&ctx, inputs.begin(), inputs.end(), std::less<>{});
+  ASSERT_TRUE(std::is_sorted(inputs.cbegin(), inputs.cend()));
+
+  inputs = {3.0, 1.0, 2.0};  // restore the unsorted input before exercising StableSort
+  StableSort(&ctx, inputs.begin(), inputs.end(), std::less<>{});
+  ASSERT_TRUE(std::is_sorted(inputs.cbegin(), inputs.cend()));
+}
+}  // namespace common
+}  // namespace xgboost
diff --git a/tests/cpp/common/test_algorithm.cu b/tests/cpp/common/test_algorithm.cu
index c2e159dc4..982f0c9ca 100644
--- a/tests/cpp/common/test_algorithm.cu
+++ b/tests/cpp/common/test_algorithm.cu
@@ -52,9 +52,9 @@ void TestSegmentedArgSort() {
   }
 }
 
-TEST(Algorithms, SegmentedArgSort) { TestSegmentedArgSort(); }
+TEST(Algorithm, SegmentedArgSort) { TestSegmentedArgSort(); }
 
-TEST(Algorithms, ArgSort) {
+TEST(Algorithm, GpuArgSort) {
   Context ctx;
   ctx.gpu_id = 0;
 
@@ -80,7 +80,7 @@ TEST(Algorithms, ArgSort) {
       thrust::is_sorted(sorted_idx.begin() + 10, sorted_idx.end(), thrust::greater<size_t>{}));
 }
 
-TEST(Algorithms, SegmentedSequence) {
+TEST(Algorithm, SegmentedSequence) {
   dh::device_vector<std::size_t> idx(16);
   dh::device_vector<std::size_t> ptr(3);
   Context ctx = CreateEmptyGenericParam(0);
diff --git a/tests/cpp/common/test_charconv.cc b/tests/cpp/common/test_charconv.cc
index cce48f76f..0e43ea51e 100644
--- a/tests/cpp/common/test_charconv.cc
+++ b/tests/cpp/common/test_charconv.cc
@@ -128,7 +128,7 @@ TEST(Ryu, Regression) {
   TestRyu("2E2", 200.0f);
   TestRyu("3.3554432E7", 3.3554432E7f);
 
-  static_assert(1.1920929E-7f == std::numeric_limits<float>::epsilon(), "");
+  static_assert(1.1920929E-7f == std::numeric_limits<float>::epsilon());
   TestRyu("1.1920929E-7", std::numeric_limits<float>::epsilon());
 }
 
diff --git a/tests/cpp/common/test_common.cc b/tests/cpp/common/test_common.cc
deleted file mode 100644
index adaf21fea..000000000
--- a/tests/cpp/common/test_common.cc
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <gtest/gtest.h>
-#include <xgboost/span.h>
-#include "../../../src/common/common.h"
-
-namespace xgboost {
-namespace common {
-TEST(ArgSort, Basic) {
-  std::vector<float> inputs {3.0, 2.0, 1.0};
-  auto ret = ArgSort<bst_feature_t>(Span<float>{inputs});
-  std::vector<bst_feature_t> sol{2, 1, 0};
-  ASSERT_EQ(ret, sol);
-}
-}  // namespace common
-}  // namespace xgboost
diff --git a/tests/cpp/common/test_group_data.cc b/tests/cpp/common/test_group_data.cc
index 94bb23e4a..719bc3fc5 100644
--- a/tests/cpp/common/test_group_data.cc
+++ b/tests/cpp/common/test_group_data.cc
@@ -43,8 +43,8 @@ TEST(GroupData, ParallelGroupBuilder) {
   builder2.Push(2, Entry(0, 4), 0);
   builder2.Push(2, Entry(1, 5), 0);
 
-  expected_data.emplace_back(Entry(0, 4));
-  expected_data.emplace_back(Entry(1, 5));
+  expected_data.emplace_back(0, 4);
+  expected_data.emplace_back(1, 5);
   expected_offsets.emplace_back(6);
 
   EXPECT_EQ(data, expected_data);
diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu
index c9db7f646..45948b711 100644
--- a/tests/cpp/common/test_hist_util.cu
+++ b/tests/cpp/common/test_hist_util.cu
@@ -143,7 +143,7 @@ void TestMixedSketch() {
   size_t n_samples = 1000, n_features = 2, n_categories = 3;
   std::vector<float> data(n_samples * n_features);
   SimpleLCG gen;
-  SimpleRealUniformDistribution<float> cat_d{0.0f, float(n_categories)};
+  SimpleRealUniformDistribution<float> cat_d{0.0f, static_cast<float>(n_categories)};
   SimpleRealUniformDistribution<float> num_d{0.0f, 3.0f};
   for (size_t i = 0; i < n_samples * n_features; ++i) {
     if (i % 2 == 0) {
diff --git a/tests/cpp/common/test_intrusive_ptr.cc b/tests/cpp/common/test_intrusive_ptr.cc
index a41697f17..5b0747625 100644
--- a/tests/cpp/common/test_intrusive_ptr.cc
+++ b/tests/cpp/common/test_intrusive_ptr.cc
@@ -13,9 +13,9 @@ class NotCopyConstructible {
   NotCopyConstructible(NotCopyConstructible&& that) = default;
 };
 static_assert(
-    !std::is_trivially_copy_constructible<NotCopyConstructible>::value, "");
+    !std::is_trivially_copy_constructible<NotCopyConstructible>::value);
 static_assert(
-    !std::is_trivially_copy_assignable<NotCopyConstructible>::value, "");
+    !std::is_trivially_copy_assignable<NotCopyConstructible>::value);
 
 class ForIntrusivePtrTest {
  public:
diff --git a/tests/cpp/common/test_linalg.cc b/tests/cpp/common/test_linalg.cc
index 3da4c482c..b1a90d773 100644
--- a/tests/cpp/common/test_linalg.cc
+++ b/tests/cpp/common/test_linalg.cc
@@ -1,22 +1,23 @@
-/*!
- * Copyright 2021 by XGBoost Contributors
+/**
+ * Copyright 2021-2023 by XGBoost Contributors
  */
 #include <gtest/gtest.h>
 #include <xgboost/context.h>
 #include <xgboost/host_device_vector.h>
 #include <xgboost/linalg.h>
 
-#include <numeric>
+#include <cstddef>  // size_t
+#include <numeric>  // iota
+#include <vector>
 
 #include "../../../src/common/linalg_op.h"
 
-namespace xgboost {
-namespace linalg {
+namespace xgboost::linalg {
 namespace {
 auto kCpuId = Context::kCpuId;
 }
 
-auto MakeMatrixFromTest(HostDeviceVector<float> *storage, size_t n_rows, size_t n_cols) {
+auto MakeMatrixFromTest(HostDeviceVector<float> *storage, std::size_t n_rows, std::size_t n_cols) {
   storage->Resize(n_rows * n_cols);
   auto &h_storage = storage->HostVector();
 
@@ -48,10 +49,11 @@ TEST(Linalg, VectorView) {
 }
 
 TEST(Linalg, TensorView) {
+  Context ctx;
   std::vector<double> data(2 * 3 * 4, 0);
   std::iota(data.begin(), data.end(), 0);
 
-  auto t = MakeTensorView(data, {2, 3, 4}, -1);
+  auto t = MakeTensorView(&ctx, data, 2, 3, 4);
   ASSERT_EQ(t.Shape()[0], 2);
   ASSERT_EQ(t.Shape()[1], 3);
   ASSERT_EQ(t.Shape()[2], 4);
@@ -106,12 +108,12 @@ TEST(Linalg, TensorView) {
   {
     // Don't assign the initial dimension, tensor should be able to deduce the correct dim
     // for Slice.
-    auto t = MakeTensorView(data, {2, 3, 4}, 0);
+    auto t = MakeTensorView(&ctx, data, 2, 3, 4);
     auto s = t.Slice(1, 2, All());
-    static_assert(decltype(s)::kDimension == 1, "");
+    static_assert(decltype(s)::kDimension == 1);
   }
   {
-    auto t = MakeTensorView(data, {2, 3, 4}, 0);
+    auto t = MakeTensorView(&ctx, data, 2, 3, 4);
     auto s = t.Slice(1, linalg::All(), 1);
     ASSERT_EQ(s(0), 13);
     ASSERT_EQ(s(1), 17);
@@ -119,9 +121,9 @@ TEST(Linalg, TensorView) {
   }
   {
     // range slice
-    auto t = MakeTensorView(data, {2, 3, 4}, 0);
+    auto t = MakeTensorView(&ctx, data, 2, 3, 4);
     auto s = t.Slice(linalg::All(), linalg::Range(1, 3), 2);
-    static_assert(decltype(s)::kDimension == 2, "");
+    static_assert(decltype(s)::kDimension == 2);
     std::vector<double> sol{6, 10, 18, 22};
     auto k = 0;
     for (size_t i = 0; i < s.Shape(0); ++i) {
@@ -134,9 +136,9 @@ TEST(Linalg, TensorView) {
   }
   {
     // range slice
-    auto t = MakeTensorView(data, {2, 3, 4}, 0);
+    auto t = MakeTensorView(&ctx, data, 2, 3, 4);
     auto s = t.Slice(1, linalg::Range(1, 3), linalg::Range(1, 3));
-    static_assert(decltype(s)::kDimension == 2, "");
+    static_assert(decltype(s)::kDimension == 2);
     std::vector<double> sol{17, 18, 21, 22};
     auto k = 0;
     for (size_t i = 0; i < s.Shape(0); ++i) {
@@ -149,9 +151,9 @@ TEST(Linalg, TensorView) {
   }
   {
     // same as no slice.
-    auto t = MakeTensorView(data, {2, 3, 4}, 0);
+    auto t = MakeTensorView(&ctx, data, 2, 3, 4);
     auto s = t.Slice(linalg::All(), linalg::Range(0, 3), linalg::Range(0, 4));
-    static_assert(decltype(s)::kDimension == 3, "");
+    static_assert(decltype(s)::kDimension == 3);
     auto all = t.Slice(linalg::All(), linalg::All(), linalg::All());
     for (size_t i = 0; i < s.Shape(0); ++i) {
       for (size_t j = 0; j < s.Shape(1); ++j) {
@@ -166,7 +168,7 @@ TEST(Linalg, TensorView) {
 
   {
     // copy and move constructor.
-    auto t = MakeTensorView(data, {2, 3, 4}, kCpuId);
+    auto t = MakeTensorView(&ctx, data, 2, 3, 4);
     auto from_copy = t;
     auto from_move = std::move(t);
     for (size_t i = 0; i < t.Shape().size(); ++i) {
@@ -177,7 +179,7 @@ TEST(Linalg, TensorView) {
 
   {
     // multiple slices
-    auto t = MakeTensorView(data, {2, 3, 4}, kCpuId);
+    auto t = MakeTensorView(&ctx, data, 2, 3, 4);
     auto s_0 = t.Slice(linalg::All(), linalg::Range(0, 2), linalg::Range(1, 4));
     ASSERT_FALSE(s_0.CContiguous());
     auto s_1 = s_0.Slice(1, 1, linalg::Range(0, 2));
@@ -208,7 +210,7 @@ TEST(Linalg, TensorView) {
 
 TEST(Linalg, Tensor) {
   {
-    Tensor<float, 3> t{{2, 3, 4}, kCpuId};
+    Tensor<float, 3> t{{2, 3, 4}, kCpuId, Order::kC};
     auto view = t.View(kCpuId);
 
     auto const &as_const = t;
@@ -227,7 +229,7 @@ TEST(Linalg, Tensor) {
   }
   {
     // Reshape
-    Tensor<float, 3> t{{2, 3, 4}, kCpuId};
+    Tensor<float, 3> t{{2, 3, 4}, kCpuId, Order::kC};
     t.Reshape(4, 3, 2);
     ASSERT_EQ(t.Size(), 24);
     ASSERT_EQ(t.Shape(2), 2);
@@ -245,7 +247,7 @@ TEST(Linalg, Tensor) {
 
 TEST(Linalg, Empty) {
   {
-    auto t = TensorView<double, 2>{{}, {0, 3}, kCpuId};
+    auto t = TensorView<double, 2>{{}, {0, 3}, kCpuId, Order::kC};
     for (int32_t i : {0, 1, 2}) {
       auto s = t.Slice(All(), i);
       ASSERT_EQ(s.Size(), 0);
@@ -254,7 +256,7 @@ TEST(Linalg, Empty) {
     }
   }
   {
-    auto t = Tensor<double, 2>{{0, 3}, kCpuId};
+    auto t = Tensor<double, 2>{{0, 3}, kCpuId, Order::kC};
     ASSERT_EQ(t.Size(), 0);
     auto view = t.View(kCpuId);
 
@@ -269,7 +271,7 @@ TEST(Linalg, Empty) {
 
 TEST(Linalg, ArrayInterface) {
   auto cpu = kCpuId;
-  auto t = Tensor<double, 2>{{3, 3}, cpu};
+  auto t = Tensor<double, 2>{{3, 3}, cpu, Order::kC};
   auto v = t.View(cpu);
   std::iota(v.Values().begin(), v.Values().end(), 0);
   auto arr = Json::Load(StringView{ArrayInterfaceStr(v)});
@@ -313,21 +315,48 @@ TEST(Linalg, Popc) {
 }
 
 TEST(Linalg, Stack) {
-  Tensor<float, 3> l{{2, 3, 4}, kCpuId};
+  Tensor<float, 3> l{{2, 3, 4}, kCpuId, Order::kC};
   ElementWiseTransformHost(l.View(kCpuId), omp_get_max_threads(),
                            [=](size_t i, float) { return i; });
-  Tensor<float, 3> r_0{{2, 3, 4}, kCpuId};
+  Tensor<float, 3> r_0{{2, 3, 4}, kCpuId, Order::kC};
   ElementWiseTransformHost(r_0.View(kCpuId), omp_get_max_threads(),
                            [=](size_t i, float) { return i; });
 
   Stack(&l, r_0);
 
-  Tensor<float, 3> r_1{{0, 3, 4}, kCpuId};
+  Tensor<float, 3> r_1{{0, 3, 4}, kCpuId, Order::kC};
   Stack(&l, r_1);
   ASSERT_EQ(l.Shape(0), 4);
 
   Stack(&r_1, l);
   ASSERT_EQ(r_1.Shape(0), l.Shape(0));
 }
-}  // namespace linalg
-}  // namespace xgboost
+
+TEST(Linalg, FOrder) {  // Column slice of a matrix view built with Order::kF.
+  std::size_t constexpr kRows = 16, kCols = 3;
+  std::vector<float> data(kRows * kCols);
+  MatrixView<float> mat{data, {kRows, kCols}, Context::kCpuId, Order::kF};
+  float k{0};
+  for (std::size_t i = 0; i < kRows; ++i) {
+    for (std::size_t j = 0; j < kCols; ++j) {
+      mat(i, j) = k;  // element (i, j) receives i * kCols + j
+      k++;
+    }
+  }
+  auto column = mat.Slice(linalg::All(), 1);  // all rows, column index 1
+  ASSERT_TRUE(column.FContiguous());
+  ASSERT_EQ(column.Stride(0), 1);
+  ASSERT_TRUE(column.CContiguous());
+  k = 1;  // value stored at (0, 1); each later row is kCols larger
+  for (auto it = linalg::cbegin(column); it != linalg::cend(column); ++it) {
+    ASSERT_EQ(*it, k);
+    k += kCols;
+  }
+  k = 1;  // repeat the same check through the raw pointer of the slice
+  auto ptr = column.Values().data();
+  for (auto it = ptr; it != ptr + kRows; ++it) {
+    ASSERT_EQ(*it, k);
+    k += kCols;
+  }
+}
+}  // namespace xgboost::linalg
diff --git a/tests/cpp/common/test_linalg.cu b/tests/cpp/common/test_linalg.cu
index 14f89774b..fe38f0f9b 100644
--- a/tests/cpp/common/test_linalg.cu
+++ b/tests/cpp/common/test_linalg.cu
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2021-2022 by XGBoost Contributors
+/**
+ * Copyright 2021-2023 by XGBoost Contributors
  */
 #include <gtest/gtest.h>
 
@@ -7,8 +7,7 @@
 #include "xgboost/context.h"
 #include "xgboost/linalg.h"
 
-namespace xgboost {
-namespace linalg {
+namespace xgboost::linalg {
 namespace {
 void TestElementWiseKernel() {
   Tensor<float, 3> l{{2, 3, 4}, 0};
@@ -55,12 +54,14 @@ void TestElementWiseKernel() {
 }
 
 void TestSlice() {
+  Context ctx;
+  ctx.gpu_id = 1;
   thrust::device_vector<double> data(2 * 3 * 4);
-  auto t = MakeTensorView(dh::ToSpan(data), {2, 3, 4}, 0);
+  auto t = MakeTensorView(&ctx, dh::ToSpan(data), 2, 3, 4);
   dh::LaunchN(1, [=] __device__(size_t) {
     auto s = t.Slice(linalg::All(), linalg::Range(0, 3), linalg::Range(0, 4));
     auto all = t.Slice(linalg::All(), linalg::All(), linalg::All());
-    static_assert(decltype(s)::kDimension == 3, "");
+    static_assert(decltype(s)::kDimension == 3);
     for (size_t i = 0; i < s.Shape(0); ++i) {
       for (size_t j = 0; j < s.Shape(1); ++j) {
         for (size_t k = 0; k < s.Shape(2); ++k) {
@@ -75,5 +76,4 @@ void TestSlice() {
 TEST(Linalg, GPUElementWise) { TestElementWiseKernel(); }
 
 TEST(Linalg, GPUTensorView) { TestSlice(); }
-}  // namespace linalg
-}  // namespace xgboost
+}  // namespace xgboost::linalg
diff --git a/tests/cpp/common/test_random.cc b/tests/cpp/common/test_random.cc
index 201f7b407..e2ecd0990 100644
--- a/tests/cpp/common/test_random.cc
+++ b/tests/cpp/common/test_random.cc
@@ -2,16 +2,18 @@
 #include "../../../src/common/random.h"
 #include "../helpers.h"
 #include "gtest/gtest.h"
+#include "xgboost/context.h"  // Context
 
 namespace xgboost {
 namespace common {
 TEST(ColumnSampler, Test) {
+  Context ctx;
   int n = 128;
   ColumnSampler cs;
   std::vector<float> feature_weights;
 
   // No node sampling
-  cs.Init(n, feature_weights, 1.0f, 0.5f, 0.5f);
+  cs.Init(&ctx, n, feature_weights, 1.0f, 0.5f, 0.5f);
   auto set0 = cs.GetFeatureSet(0);
   ASSERT_EQ(set0->Size(), 32);
 
@@ -24,7 +26,7 @@ TEST(ColumnSampler, Test) {
   ASSERT_EQ(set2->Size(), 32);
 
   // Node sampling
-  cs.Init(n, feature_weights, 0.5f, 1.0f, 0.5f);
+  cs.Init(&ctx, n, feature_weights, 0.5f, 1.0f, 0.5f);
   auto set3 = cs.GetFeatureSet(0);
   ASSERT_EQ(set3->Size(), 32);
 
@@ -34,24 +36,25 @@ TEST(ColumnSampler, Test) {
   ASSERT_EQ(set4->Size(), 32);
 
   // No level or node sampling, should be the same at different depth
-  cs.Init(n, feature_weights, 1.0f, 1.0f, 0.5f);
+  cs.Init(&ctx, n, feature_weights, 1.0f, 1.0f, 0.5f);
   ASSERT_EQ(cs.GetFeatureSet(0)->HostVector(),
             cs.GetFeatureSet(1)->HostVector());
 
-  cs.Init(n, feature_weights, 1.0f, 1.0f, 1.0f);
+  cs.Init(&ctx, n, feature_weights, 1.0f, 1.0f, 1.0f);
   auto set5 = cs.GetFeatureSet(0);
   ASSERT_EQ(set5->Size(), n);
-  cs.Init(n, feature_weights, 1.0f, 1.0f, 1.0f);
+  cs.Init(&ctx, n, feature_weights, 1.0f, 1.0f, 1.0f);
   auto set6 = cs.GetFeatureSet(0);
   ASSERT_EQ(set5->HostVector(), set6->HostVector());
 
   // Should always be a minimum of one feature
-  cs.Init(n, feature_weights, 1e-16f, 1e-16f, 1e-16f);
+  cs.Init(&ctx, n, feature_weights, 1e-16f, 1e-16f, 1e-16f);
   ASSERT_EQ(cs.GetFeatureSet(0)->Size(), 1);
 }
 
 // Test if different threads using the same seed produce the same result
 TEST(ColumnSampler, ThreadSynchronisation) {
+  Context ctx;
   const int64_t num_threads = 100;
   int n = 128;
   size_t iterations = 10;
@@ -63,7 +66,7 @@ TEST(ColumnSampler, ThreadSynchronisation) {
   {
     for (auto j = 0ull; j < iterations; j++) {
       ColumnSampler cs(j);
-      cs.Init(n, feature_weights, 0.5f, 0.5f, 0.5f);
+      cs.Init(&ctx, n, feature_weights, 0.5f, 0.5f, 0.5f);
       for (auto level = 0ull; level < levels; level++) {
         auto result = cs.GetFeatureSet(level)->ConstHostVector();
 #pragma omp single
@@ -80,11 +83,12 @@ TEST(ColumnSampler, ThreadSynchronisation) {
 
 TEST(ColumnSampler, WeightedSampling) {
   auto test_basic = [](int first) {
+    Context ctx;
     std::vector<float> feature_weights(2);
     feature_weights[0] = std::abs(first - 1.0f);
     feature_weights[1] = first - 0.0f;
     ColumnSampler cs{0};
-    cs.Init(2, feature_weights, 1.0, 1.0, 0.5);
+    cs.Init(&ctx, 2, feature_weights, 1.0, 1.0, 0.5);
     auto feature_sets = cs.GetFeatureSet(0);
     auto const &h_feat_set = feature_sets->HostVector();
     ASSERT_EQ(h_feat_set.size(), 1);
@@ -100,7 +104,8 @@ TEST(ColumnSampler, WeightedSampling) {
   SimpleRealUniformDistribution<float> dist(.0f, 12.0f);
   std::generate(feature_weights.begin(), feature_weights.end(), [&]() { return dist(&rng); });
   ColumnSampler cs{0};
-  cs.Init(kCols, feature_weights, 0.5f, 1.0f, 1.0f);
+  Context ctx;
+  cs.Init(&ctx, kCols, feature_weights, 0.5f, 1.0f, 1.0f);
   std::vector<bst_feature_t> features(kCols);
   std::iota(features.begin(), features.end(), 0);
   std::vector<float> freq(kCols, 0);
@@ -135,7 +140,8 @@ TEST(ColumnSampler, WeightedMultiSampling) {
   }
   ColumnSampler cs{0};
   float bytree{0.5}, bylevel{0.5}, bynode{0.5};
-  cs.Init(feature_weights.size(), feature_weights, bytree, bylevel, bynode);
+  Context ctx;
+  cs.Init(&ctx, feature_weights.size(), feature_weights, bytree, bylevel, bynode);
   auto feature_set = cs.GetFeatureSet(0);
   size_t n_sampled = kCols * bytree * bylevel * bynode;
   ASSERT_EQ(feature_set->Size(), n_sampled);
diff --git a/tests/cpp/common/test_span.cc b/tests/cpp/common/test_span.cc
index 3ee99c0ae..133fae9fd 100644
--- a/tests/cpp/common/test_span.cc
+++ b/tests/cpp/common/test_span.cc
@@ -522,9 +522,9 @@ TEST(Span, Empty) {
 TEST(SpanDeathTest, Empty) {
   std::vector<float> data(1, 0);
   ASSERT_TRUE(data.data());
-  Span<float> s{data.data(), Span<float>::index_type(0)};  // ok to define 0 size span.
+  // ok to define 0 size span.
+  Span<float> s{data.data(), static_cast<Span<float>::index_type>(0)};
   EXPECT_DEATH(s[0], "");  // not ok to use it.
 }
-
 }  // namespace common
 }  // namespace xgboost
diff --git a/tests/cpp/common/test_stats.cc b/tests/cpp/common/test_stats.cc
index 3f3786809..abdf00425 100644
--- a/tests/cpp/common/test_stats.cc
+++ b/tests/cpp/common/test_stats.cc
@@ -11,19 +11,20 @@
 namespace xgboost {
 namespace common {
 TEST(Stats, Quantile) {
+  Context ctx;
   {
     linalg::Tensor<float, 1> arr({20.f, 0.f, 15.f, 50.f, 40.f, 0.f, 35.f}, {7}, Context::kCpuId);
     std::vector<size_t> index{0, 2, 3, 4, 6};
     auto h_arr = arr.HostView();
     auto beg = MakeIndexTransformIter([&](size_t i) { return h_arr(index[i]); });
     auto end = beg + index.size();
-    auto q = Quantile(0.40f, beg, end);
+    auto q = Quantile(&ctx, 0.40f, beg, end);
     ASSERT_EQ(q, 26.0);
 
-    q = Quantile(0.20f, beg, end);
+    q = Quantile(&ctx, 0.20f, beg, end);
     ASSERT_EQ(q, 16.0);
 
-    q = Quantile(0.10f, beg, end);
+    q = Quantile(&ctx, 0.10f, beg, end);
     ASSERT_EQ(q, 15.0);
   }
 
@@ -31,12 +32,13 @@ TEST(Stats, Quantile) {
     std::vector<float> vec{1., 2., 3., 4., 5.};
     auto beg = MakeIndexTransformIter([&](size_t i) { return vec[i]; });
     auto end = beg + vec.size();
-    auto q = Quantile(0.5f, beg, end);
+    auto q = Quantile(&ctx, 0.5f, beg, end);
     ASSERT_EQ(q, 3.);
   }
 }
 
 TEST(Stats, WeightedQuantile) {
+  Context ctx;
   linalg::Tensor<float, 1> arr({1.f, 2.f, 3.f, 4.f, 5.f}, {5}, Context::kCpuId);
   linalg::Tensor<float, 1> weight({1.f, 1.f, 1.f, 1.f, 1.f}, {5}, Context::kCpuId);
 
@@ -47,13 +49,13 @@ TEST(Stats, WeightedQuantile) {
   auto end = beg + arr.Size();
   auto w = MakeIndexTransformIter([&](size_t i) { return h_weight(i); });
 
-  auto q = WeightedQuantile(0.50f, beg, end, w);
+  auto q = WeightedQuantile(&ctx, 0.50f, beg, end, w);
   ASSERT_EQ(q, 3);
 
-  q = WeightedQuantile(0.0, beg, end, w);
+  q = WeightedQuantile(&ctx, 0.0, beg, end, w);
   ASSERT_EQ(q, 1);
 
-  q = WeightedQuantile(1.0, beg, end, w);
+  q = WeightedQuantile(&ctx, 1.0, beg, end, w);
   ASSERT_EQ(q, 5);
 }
 
diff --git a/tests/cpp/data/test_array_interface.cc b/tests/cpp/data/test_array_interface.cc
index 72e5ccc10..7e0484842 100644
--- a/tests/cpp/data/test_array_interface.cc
+++ b/tests/cpp/data/test_array_interface.cc
@@ -119,13 +119,13 @@ TEST(ArrayInterface, TrivialDim) {
 }
 
 TEST(ArrayInterface, ToDType) {
-  static_assert(ToDType<float>::kType == ArrayInterfaceHandler::kF4, "");
-  static_assert(ToDType<double>::kType == ArrayInterfaceHandler::kF8, "");
+  static_assert(ToDType<float>::kType == ArrayInterfaceHandler::kF4);
+  static_assert(ToDType<double>::kType == ArrayInterfaceHandler::kF8);
 
-  static_assert(ToDType<uint32_t>::kType == ArrayInterfaceHandler::kU4, "");
-  static_assert(ToDType<uint64_t>::kType == ArrayInterfaceHandler::kU8, "");
+  static_assert(ToDType<uint32_t>::kType == ArrayInterfaceHandler::kU4);
+  static_assert(ToDType<uint64_t>::kType == ArrayInterfaceHandler::kU8);
 
-  static_assert(ToDType<int32_t>::kType == ArrayInterfaceHandler::kI4, "");
-  static_assert(ToDType<int64_t>::kType == ArrayInterfaceHandler::kI8, "");
+  static_assert(ToDType<int32_t>::kType == ArrayInterfaceHandler::kI4);
+  static_assert(ToDType<int64_t>::kType == ArrayInterfaceHandler::kI8);
 }
 }  // namespace xgboost
diff --git a/tests/cpp/data/test_data.cc b/tests/cpp/data/test_data.cc
index 7b35c6f6f..c37328192 100644
--- a/tests/cpp/data/test_data.cc
+++ b/tests/cpp/data/test_data.cc
@@ -21,7 +21,7 @@ TEST(SparsePage, PushCSC) {
 
   offset = {0, 1, 4};
   for (size_t i = 0; i < offset.back(); ++i) {
-    data.emplace_back(Entry(i, 0.1f));
+    data.emplace_back(i, 0.1f);
   }
 
   SparsePage other;
diff --git a/tests/cpp/data/test_gradient_index.cc b/tests/cpp/data/test_gradient_index.cc
index 2bfb756c1..93194972f 100644
--- a/tests/cpp/data/test_gradient_index.cc
+++ b/tests/cpp/data/test_gradient_index.cc
@@ -68,6 +68,30 @@ TEST(GradientIndex, FromCategoricalBasic) {
   }
 }
 
+TEST(GradientIndex, FromCategoricalLarge) {  // One column, kCats far above max_bins.
+  size_t constexpr kRows = 1000, kCats = 512, kCols = 1;
+  bst_bin_t max_bins = 8;
+  auto x = GenerateRandomCategoricalSingleColumn(kRows, kCats);
+  auto m = GetDMatrixFromData(x, kRows, 1);
+  Context ctx;  // NOTE(review): not referenced again within this test
+
+  auto &h_ft = m->Info().feature_types.HostVector();
+  h_ft.resize(kCols, FeatureType::kCategorical);  // assumes h_ft was empty -- TODO confirm
+
+  BatchParam p{max_bins, 0.8};
+  {  // Index built directly from the DMatrix.
+    GHistIndexMatrix gidx(m.get(), max_bins, p.sparse_thresh, false, AllThreadsForTest(), {});
+    ASSERT_TRUE(gidx.index.GetBinTypeSize() == common::kUint16BinsTypeSize);
+  }
+  {  // Index rebuilt from a copy of the cuts of each existing page.
+    for (auto const &page : m->GetBatches<GHistIndexMatrix>(p)) {
+      common::HistogramCuts cut = page.cut;
+      GHistIndexMatrix gidx{m->Info(), std::move(cut), max_bins};
+      ASSERT_EQ(gidx.MaxNumBinPerFeat(), kCats);
+    }
+  }
+}
+
 TEST(GradientIndex, PushBatch) {
   size_t constexpr kRows = 64, kCols = 4;
   bst_bin_t max_bins = 64;
diff --git a/tests/cpp/data/test_simple_dmatrix.cu b/tests/cpp/data/test_simple_dmatrix.cu
index 4b020c0a6..04859ed1e 100644
--- a/tests/cpp/data/test_simple_dmatrix.cu
+++ b/tests/cpp/data/test_simple_dmatrix.cu
@@ -189,8 +189,8 @@ TEST(SimpleCSRSource, FromColumnarSparse) {
     auto& mask = column_bitfields[0];
     mask.resize(8);
 
-    for (size_t j = 0; j < mask.size(); ++j) {
-      mask[j] = ~0;
+    for (auto && j : mask) {
+      j = ~0;
     }
     // the 2^th entry of first column is invalid
     // [0 0 0 0 0 1 0 0]
@@ -201,8 +201,8 @@ TEST(SimpleCSRSource, FromColumnarSparse) {
     auto& mask = column_bitfields[1];
     mask.resize(8);
 
-    for (size_t j = 0; j < mask.size(); ++j) {
-      mask[j] = ~0;
+    for (auto && j : mask) {
+      j = ~0;
     }
     // the 19^th entry of second column is invalid
     // [~0~], [~0~], [0 0 0 0 1 0 0 0]
diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cc b/tests/cpp/data/test_sparse_page_dmatrix.cc
index 8c2ff9514..24dc40949 100644
--- a/tests/cpp/data/test_sparse_page_dmatrix.cc
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cc
@@ -96,7 +96,7 @@ void TestRetainPage() {
 
   // make sure it's const and the caller can not modify the content of page.
   for (auto& page : m->GetBatches<Page>()) {
-    static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value, "");
+    static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value);
   }
 }
 
diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu
index 64ce0568c..bb562ffb7 100644
--- a/tests/cpp/data/test_sparse_page_dmatrix.cu
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cu
@@ -1,5 +1,6 @@
-// Copyright by Contributors
-
+/**
+ * Copyright 2019-2023 by XGBoost Contributors
+ */
 #include "../../../src/common/compressed_iterator.h"
 #include "../../../src/data/ellpack_page.cuh"
 #include "../../../src/data/sparse_page_dmatrix.h"
@@ -69,7 +70,7 @@ TEST(SparsePageDMatrix, RetainEllpackPage) {
   std::vector<std::shared_ptr<EllpackPage const>> iterators;
   for (auto it = begin; it != end; ++it) {
     iterators.push_back(it.Page());
-    gidx_buffers.emplace_back(HostDeviceVector<common::CompressedByteT>{});
+    gidx_buffers.emplace_back();
     gidx_buffers.back().Resize((*it).Impl()->gidx_buffer.Size());
     gidx_buffers.back().Copy((*it).Impl()->gidx_buffer);
   }
@@ -87,7 +88,7 @@ TEST(SparsePageDMatrix, RetainEllpackPage) {
 
   // make sure it's const and the caller can not modify the content of page.
   for (auto& page : m->GetBatches<EllpackPage>({0, 32})) {
-    static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value, "");
+    static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value);
   }
 
   // The above iteration clears out all references inside DMatrix.
diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index fcaffa5c6..ebb56d2d3 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -186,7 +186,7 @@ SimpleLCG::StateType SimpleLCG::operator()() {
 SimpleLCG::StateType SimpleLCG::Min() const { return min(); }
 SimpleLCG::StateType SimpleLCG::Max() const { return max(); }
 // Make sure it's compile time constant.
-static_assert(SimpleLCG::max() - SimpleLCG::min(), "");
+static_assert(SimpleLCG::max() - SimpleLCG::min());
 
 void RandomDataGenerator::GenerateDense(HostDeviceVector<float> *out) const {
   xgboost::SimpleRealUniformDistribution<bst_float> dist(lower_, upper_);
diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h
index 63ef6ac50..ec1ace796 100644
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -46,7 +46,7 @@ class GradientBooster;
 
 template <typename Float>
 Float RelError(Float l, Float r) {
-  static_assert(std::is_floating_point<Float>::value, "");
+  static_assert(std::is_floating_point<Float>::value);
   return std::abs(1.0f - l / r);
 }
 
@@ -164,7 +164,7 @@ class SimpleRealUniformDistribution {
     ResultT sum_value = 0, r_k = 1;
 
     for (size_t k = m; k != 0; --k) {
-      sum_value += ResultT((*rng)() - rng->Min()) * r_k;
+      sum_value += static_cast<ResultT>((*rng)() - rng->Min()) * r_k;
       r_k *= r;
     }
 
@@ -191,12 +191,10 @@ Json GetArrayInterface(HostDeviceVector<T> *storage, size_t rows, size_t cols) {
   Json array_interface{Object()};
   array_interface["data"] = std::vector<Json>(2);
   if (storage->DeviceCanRead()) {
-    array_interface["data"][0] =
-        Integer(reinterpret_cast<int64_t>(storage->ConstDevicePointer()));
+    array_interface["data"][0] = Integer{reinterpret_cast<int64_t>(storage->ConstDevicePointer())};
     array_interface["stream"] = nullptr;
   } else {
-    array_interface["data"][0] =
-        Integer(reinterpret_cast<int64_t>(storage->ConstHostPointer()));
+    array_interface["data"][0] = Integer{reinterpret_cast<int64_t>(storage->ConstHostPointer())};
   }
   array_interface["data"][1] = Boolean(false);
 
diff --git a/tests/cpp/objective/test_objective.cc b/tests/cpp/objective/test_objective.cc
index 2f13b8bb3..718f8f659 100644
--- a/tests/cpp/objective/test_objective.cc
+++ b/tests/cpp/objective/test_objective.cc
@@ -1,4 +1,6 @@
-// Copyright by Contributors
+/**
+ * Copyright 2016-2023 by XGBoost contributors
+ */
 #include <gtest/gtest.h>
 #include <xgboost/context.h>
 #include <xgboost/objective.h>
@@ -25,11 +27,14 @@ TEST(Objective, PredTransform) {
   tparam.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
   size_t n = 100;
 
-  for (const auto &entry :
-       ::dmlc::Registry<::xgboost::ObjFunctionReg>::List()) {
-    std::unique_ptr<xgboost::ObjFunction> obj{
-        xgboost::ObjFunction::Create(entry->name, &tparam)};
-    obj->Configure(Args{{"num_class", "2"}});
+  for (const auto& entry : ::dmlc::Registry<::xgboost::ObjFunctionReg>::List()) {
+    std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create(entry->name, &tparam)};
+    if (entry->name.find("multi") != std::string::npos) {
+      obj->Configure(Args{{"num_class", "2"}});
+    }
+    if (entry->name.find("quantile") != std::string::npos) {
+      obj->Configure(Args{{"quantile_alpha", "0.5"}});
+    }
     HostDeviceVector<float> predts;
     predts.Resize(n, 3.14f);  // prediction is performed on host.
     ASSERT_FALSE(predts.DeviceCanRead());
diff --git a/tests/cpp/objective/test_quantile_obj.cc b/tests/cpp/objective/test_quantile_obj.cc
new file mode 100644
index 000000000..76233975a
--- /dev/null
+++ b/tests/cpp/objective/test_quantile_obj.cc
@@ -0,0 +1,74 @@
+/**
+ * Copyright 2023 by XGBoost contributors
+ */
+#include <gtest/gtest.h>
+#include <xgboost/base.h>       // Args
+#include <xgboost/context.h>    // Context
+#include <xgboost/objective.h>  // ObjFunction
+#include <xgboost/span.h>       // Span
+
+#include <memory>               // std::unique_ptr
+#include <vector>               // std::vector
+
+#include "../helpers.h"         // CheckConfigReload,CreateEmptyGenericParam,DeclareUnifiedTest
+
+namespace xgboost {
+TEST(Objective, DeclareUnifiedTest(Quantile)) {
+  Context ctx = CreateEmptyGenericParam(GPUIDX);
+
+  {  // Array-valued quantile_alpha.
+    Args args{{"quantile_alpha", "[0.6, 0.8]"}};
+    std::unique_ptr<ObjFunction> obj{ObjFunction::Create("reg:quantileerror", &ctx)};
+    obj->Configure(args);
+    CheckConfigReload(obj, "reg:quantileerror");
+  }
+
+  Args args{{"quantile_alpha", "0.6"}};  // scalar quantile_alpha
+  std::unique_ptr<ObjFunction> obj{ObjFunction::Create("reg:quantileerror", &ctx)};
+  obj->Configure(args);
+  CheckConfigReload(obj, "reg:quantileerror");
+
+  std::vector<float> predts{1.0f, 2.0f, 3.0f};
+  std::vector<float> labels{3.0f, 2.0f, 1.0f};
+  std::vector<float> weights{1.0f, 1.0f, 1.0f};
+  std::vector<float> grad{-0.6f, 0.4f, 0.4f};  // presumably the expected gradients
+  std::vector<float> hess = weights;  // presumably the expected hessians
+  CheckObjFunction(obj, predts, labels, weights, grad, hess);
+}
+
+TEST(Objective, DeclareUnifiedTest(QuantileIntercept)) {
+  Context ctx = CreateEmptyGenericParam(GPUIDX);
+  Args args{{"quantile_alpha", "[0.6, 0.8]"}};
+  std::unique_ptr<ObjFunction> obj{ObjFunction::Create("reg:quantileerror", &ctx)};
+  obj->Configure(args);
+
+  MetaInfo info;
+  info.num_row_ = 10;
+  info.labels.ModifyInplace([&](HostDeviceVector<float>* data, common::Span<std::size_t> shape) {
+    data->SetDevice(ctx.gpu_id);
+    data->Resize(info.num_row_);
+    shape[0] = info.num_row_;
+    shape[1] = 1;  // labels shaped (num_row_, 1)
+
+    auto& h_labels = data->HostVector();
+    for (std::size_t i = 0; i < info.num_row_; ++i) {
+      h_labels[i] = i;  // labels are 0, 1, ..., num_row_ - 1
+    }
+  });
+
+  linalg::Vector<float> base_scores;
+  obj->InitEstimation(info, &base_scores);  // first pass: weights_ not filled yet
+  ASSERT_EQ(base_scores.Size(), 1) << "Vector is not yet supported.";
+  // mean([5.6, 7.8])
+  ASSERT_NEAR(base_scores(0), 6.7, kRtEps);
+
+  for (std::size_t i = 0; i < info.num_row_; ++i) {  // appends weights 9, 8, ..., 0
+    info.weights_.HostVector().emplace_back(info.num_row_ - i - 1.0);
+  }
+
+  obj->InitEstimation(info, &base_scores);  // second pass: with the weights above
+  ASSERT_EQ(base_scores.Size(), 1) << "Vector is not yet supported.";
+  // mean([3, 5])
+  ASSERT_NEAR(base_scores(0), 4.0, kRtEps);
+}
+}  // namespace xgboost
diff --git a/tests/cpp/objective/test_quantile_obj_gpu.cu b/tests/cpp/objective/test_quantile_obj_gpu.cu
new file mode 100644
index 000000000..518692411
--- /dev/null
+++ b/tests/cpp/objective/test_quantile_obj_gpu.cu
@@ -0,0 +1,5 @@
+/**
+ * Copyright 2023 XGBoost contributors
+ */
+// Dummy file to enable the CUDA tests.
+#include "test_quantile_obj.cc"
diff --git a/tests/cpp/objective/test_regression_obj.cc b/tests/cpp/objective/test_regression_obj.cc
index c5cd2537c..4e37eef18 100644
--- a/tests/cpp/objective/test_regression_obj.cc
+++ b/tests/cpp/objective/test_regression_obj.cc
@@ -6,8 +6,9 @@
 #include <xgboost/json.h>
 #include <xgboost/objective.h>
 
-#include "../../../src/common/linalg_op.h"  // begin,end
+#include "../../../src/common/linalg_op.h"  // for begin, end
 #include "../../../src/objective/adaptive.h"
+#include "../../../src/tree/param.h"        // for TrainParam
 #include "../helpers.h"
 #include "xgboost/base.h"
 #include "xgboost/data.h"
@@ -157,7 +158,7 @@ TEST(Objective, DeclareUnifiedTest(PoissonRegressionGPair)) {
     ObjFunction::Create("count:poisson", &ctx)
   };
 
-  args.emplace_back(std::make_pair("max_delta_step", "0.1f"));
+  args.emplace_back("max_delta_step", "0.1f");
   obj->Configure(args);
 
   CheckObjFunction(obj,
@@ -259,7 +260,7 @@ TEST(Objective, DeclareUnifiedTest(TweedieRegressionGPair)) {
   std::vector<std::pair<std::string, std::string>> args;
   std::unique_ptr<ObjFunction> obj{ObjFunction::Create("reg:tweedie", &ctx)};
 
-  args.emplace_back(std::make_pair("tweedie_variance_power", "1.1f"));
+  args.emplace_back("tweedie_variance_power", "1.1f");
   obj->Configure(args);
 
   CheckObjFunction(obj,
@@ -408,9 +409,13 @@ TEST(Objective, DeclareUnifiedTest(AbsoluteError)) {
     h_predt[i] = labels[i] + i;
   }
 
-  obj->UpdateTreeLeaf(position, info, predt, 0, &tree);
-  ASSERT_EQ(tree[1].LeafValue(), -1);
-  ASSERT_EQ(tree[2].LeafValue(), -4);
+  tree::TrainParam param;
+  param.Init(Args{});
+  auto lr = param.learning_rate;
+
+  obj->UpdateTreeLeaf(position, info, param.learning_rate, predt, 0, &tree);
+  ASSERT_EQ(tree[1].LeafValue(), -1.0f * lr);
+  ASSERT_EQ(tree[2].LeafValue(), -4.0f * lr);
 }
 
 TEST(Objective, DeclareUnifiedTest(AbsoluteErrorLeaf)) {
@@ -428,8 +433,8 @@ TEST(Objective, DeclareUnifiedTest(AbsoluteErrorLeaf)) {
     auto h_labels = info.labels.HostView().Slice(linalg::All(), t);
     std::iota(linalg::begin(h_labels), linalg::end(h_labels), 0);
 
-    auto h_predt = linalg::MakeTensorView(predt.HostSpan(), {kRows, kTargets}, Context::kCpuId)
-                       .Slice(linalg::All(), t);
+    auto h_predt =
+        linalg::MakeTensorView(&ctx, predt.HostSpan(), kRows, kTargets).Slice(linalg::All(), t);
     for (size_t i = 0; i < h_predt.Size(); ++i) {
       h_predt(i) = h_labels(i) + i;
     }
@@ -457,11 +462,16 @@ TEST(Objective, DeclareUnifiedTest(AbsoluteErrorLeaf)) {
     ASSERT_EQ(tree.GetNumLeaves(), 4);
 
     auto empty_leaf = tree[4].LeafValue();
-    obj->UpdateTreeLeaf(position, info, predt, t, &tree);
-    ASSERT_EQ(tree[3].LeafValue(), -5);
-    ASSERT_EQ(tree[4].LeafValue(), empty_leaf);
-    ASSERT_EQ(tree[5].LeafValue(), -10);
-    ASSERT_EQ(tree[6].LeafValue(), -14);
+
+    tree::TrainParam param;
+    param.Init(Args{});
+    auto lr = param.learning_rate;
+
+    obj->UpdateTreeLeaf(position, info, lr, predt, t, &tree);
+    ASSERT_EQ(tree[3].LeafValue(), -5.0f * lr);
+    ASSERT_EQ(tree[4].LeafValue(), empty_leaf * lr);
+    ASSERT_EQ(tree[5].LeafValue(), -10.0f * lr);
+    ASSERT_EQ(tree[6].LeafValue(), -14.0f * lr);
   }
 }
 
diff --git a/tests/cpp/test_cache.cc b/tests/cpp/test_cache.cc
index 4099fa2de..351730181 100644
--- a/tests/cpp/test_cache.cc
+++ b/tests/cpp/test_cache.cc
@@ -3,16 +3,18 @@
  */
 #include <gtest/gtest.h>
 #include <xgboost/cache.h>
-#include <xgboost/data.h>  // DMatrix
+#include <xgboost/data.h>  // for DMatrix
 
-#include <cstddef>         // std::size_t
+#include <cstddef>         // for size_t
+#include <cstdint>         // for uint32_t
+#include <thread>          // for thread
 
-#include "helpers.h"       // RandomDataGenerator
+#include "helpers.h"       // for RandomDataGenerator
 
 namespace xgboost {
 namespace {
 struct CacheForTest {
-  std::size_t i;
+  std::size_t const i;
 
   explicit CacheForTest(std::size_t k) : i{k} {}
 };
@@ -20,7 +22,7 @@ struct CacheForTest {
 
 TEST(DMatrixCache, Basic) {
   std::size_t constexpr kRows = 2, kCols = 1, kCacheSize = 4;
-  DMatrixCache<CacheForTest> cache(kCacheSize);
+  DMatrixCache<CacheForTest> cache{kCacheSize};
 
   auto add_cache = [&]() {
     // Create a lambda function here, so that p_fmat gets deleted upon the
@@ -52,4 +54,63 @@ TEST(DMatrixCache, Basic) {
     }
   }
 }
+
+TEST(DMatrixCache, MultiThread) {  // Concurrent CacheItem calls on one DMatrixCache.
+  std::size_t constexpr kRows = 2, kCols = 1, kCacheSize = 3;
+  auto p_fmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
+
+  auto n = std::thread::hardware_concurrency() * 128u;  // number of threads per round
+  CHECK_NE(n, 0);  // hardware_concurrency() is allowed to return 0
+  std::vector<std::shared_ptr<CacheForTest>> results(n);
+
+  {  // Cache constructed with kCacheSize (3), far fewer than the n threads.
+    DMatrixCache<CacheForTest> cache{kCacheSize};
+    std::vector<std::thread> tasks;
+    for (std::uint32_t tidx = 0; tidx < n; ++tidx) {
+      tasks.emplace_back([&, i = tidx]() {
+        cache.CacheItem(p_fmat, i);  // shared matrix; returned item is discarded
+
+        auto p_fmat_local = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
+        results[i] = cache.CacheItem(p_fmat_local, i);  // each thread writes only slot i
+      });
+    }
+    for (auto& t : tasks) {
+      t.join();
+    }
+    for (std::uint32_t tidx = 0; tidx < n; ++tidx) {
+      ASSERT_EQ(results[tidx]->i, tidx);
+    }
+
+    tasks.clear();  // all joined above; the next round spawns in descending index order
+
+    for (std::int32_t tidx = static_cast<std::int32_t>(n - 1); tidx >= 0; --tidx) {
+      tasks.emplace_back([&, i = tidx]() {
+        cache.CacheItem(p_fmat, i);
+
+        auto p_fmat_local = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
+        results[i] = cache.CacheItem(p_fmat_local, i);
+      });
+    }
+    for (auto& t : tasks) {
+      t.join();
+    }
+    for (std::uint32_t tidx = 0; tidx < n; ++tidx) {
+      ASSERT_EQ(results[tidx]->i, tidx);
+    }
+  }
+
+  {  // Cache constructed with n; every thread passes the shared p_fmat.
+    DMatrixCache<CacheForTest> cache{n};
+    std::vector<std::thread> tasks;
+    for (std::uint32_t tidx = 0; tidx < n; ++tidx) {
+      tasks.emplace_back([&, tidx]() { results[tidx] = cache.CacheItem(p_fmat, tidx); });
+    }
+    for (auto& t : tasks) {
+      t.join();
+    }
+    for (std::uint32_t tidx = 0; tidx < n; ++tidx) {
+      ASSERT_EQ(results[tidx]->i, tidx);
+    }
+  }
+}
 }  // namespace xgboost
diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc
index c45ed5385..fc94f3130 100644
--- a/tests/cpp/tree/hist/test_evaluate_splits.cc
+++ b/tests/cpp/tree/hist/test_evaluate_splits.cc
@@ -9,12 +9,14 @@
 #include "../../../../src/tree/hist/evaluate_splits.h"
 #include "../test_evaluate_splits.h"
 #include "../../helpers.h"
+#include "xgboost/context.h"  // Context
 
 namespace xgboost {
 namespace tree {
 void TestEvaluateSplits(bool force_read_by_column) {
+  Context ctx;
+  ctx.nthread = 4;
   int static constexpr kRows = 8, kCols = 16;
-  int32_t n_threads = std::min(omp_get_max_threads(), 4);
   auto sampler = std::make_shared<common::ColumnSampler>();
 
   TrainParam param;
@@ -22,7 +24,7 @@ void TestEvaluateSplits(bool force_read_by_column) {
 
   auto dmat = RandomDataGenerator(kRows, kCols, 0).Seed(3).GenerateDMatrix();
 
-  auto evaluator = HistEvaluator<CPUExpandEntry>{param, dmat->Info(), n_threads, sampler};
+  auto evaluator = HistEvaluator<CPUExpandEntry>{&ctx, &param, dmat->Info(), sampler};
   common::HistCollection hist;
   std::vector<GradientPair> row_gpairs = {
       {1.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f},  {2.27f, 0.28f},
@@ -86,13 +88,15 @@ TEST(HistEvaluator, Evaluate) {
 }
 
 TEST(HistEvaluator, Apply) {
+  Context ctx;
+  ctx.nthread = 4;
   RegTree tree;
   int static constexpr kNRows = 8, kNCols = 16;
   TrainParam param;
   param.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, {"reg_lambda", "0.0"}});
   auto dmat = RandomDataGenerator(kNRows, kNCols, 0).Seed(3).GenerateDMatrix();
   auto sampler = std::make_shared<common::ColumnSampler>();
-  auto evaluator_ = HistEvaluator<CPUExpandEntry>{param, dmat->Info(), 4, sampler};
+  auto evaluator_ = HistEvaluator<CPUExpandEntry>{&ctx, &param, dmat->Info(), sampler};
 
   CPUExpandEntry entry{0, 0, 10.0f};
   entry.split.left_sum = GradStats{0.4, 0.6f};
@@ -115,10 +119,11 @@ TEST(HistEvaluator, Apply) {
 }
 
 TEST_F(TestPartitionBasedSplit, CPUHist) {
+  Context ctx;
   // check the evaluator is returning the optimal split
   std::vector<FeatureType> ft{FeatureType::kCategorical};
   auto sampler = std::make_shared<common::ColumnSampler>();
-  HistEvaluator<CPUExpandEntry> evaluator{param_, info_, AllThreadsForTest(), sampler};
+  HistEvaluator<CPUExpandEntry> evaluator{&ctx, &param_, info_, sampler};
   evaluator.InitRoot(GradStats{total_gpair_});
   RegTree tree;
   std::vector<CPUExpandEntry> entries(1);
@@ -128,6 +133,7 @@ TEST_F(TestPartitionBasedSplit, CPUHist) {
 
 namespace {
 auto CompareOneHotAndPartition(bool onehot) {
+  Context ctx;
   int static constexpr kRows = 128, kCols = 1;
   std::vector<FeatureType> ft(kCols, FeatureType::kCategorical);
 
@@ -147,8 +153,7 @@ auto CompareOneHotAndPartition(bool onehot) {
       RandomDataGenerator(kRows, kCols, 0).Seed(3).Type(ft).MaxCategory(n_cats).GenerateDMatrix();
 
   auto sampler = std::make_shared<common::ColumnSampler>();
-  auto evaluator =
-      HistEvaluator<CPUExpandEntry>{param, dmat->Info(), AllThreadsForTest(), sampler};
+  auto evaluator = HistEvaluator<CPUExpandEntry>{&ctx, &param, dmat->Info(), sampler};
   std::vector<CPUExpandEntry> entries(1);
 
   for (auto const &gmat : dmat->GetBatches<GHistIndexMatrix>({32, param.sparse_threshold})) {
@@ -198,8 +203,8 @@ TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) {
   MetaInfo info;
   info.num_col_ = 1;
   info.feature_types = {FeatureType::kCategorical};
-  auto evaluator =
-      HistEvaluator<CPUExpandEntry>{param_, info, AllThreadsForTest(), sampler};
+  Context ctx;
+  auto evaluator = HistEvaluator<CPUExpandEntry>{&ctx, &param_, info, sampler};
   evaluator.InitRoot(GradStats{parent_sum_});
 
   std::vector<CPUExpandEntry> entries(1);
diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc
index 1e37f1cd4..8462fa7d5 100644
--- a/tests/cpp/tree/hist/test_histogram.cc
+++ b/tests/cpp/tree/hist/test_histogram.cc
@@ -48,7 +48,7 @@ void TestAddHistRows(bool is_distributed) {
 
   HistogramBuilder<CPUExpandEntry> histogram_builder;
   histogram_builder.Reset(gmat.cut.TotalBins(), {kMaxBins, 0.5}, omp_get_max_threads(), 1,
-                          is_distributed);
+                          is_distributed, false);
   histogram_builder.AddHistRows(&starting_index, &sync_count,
                                 nodes_for_explicit_hist_build_,
                                 nodes_for_subtraction_trick_, &tree);
@@ -86,7 +86,7 @@ void TestSyncHist(bool is_distributed) {
 
   HistogramBuilder<CPUExpandEntry> histogram;
   uint32_t total_bins = gmat.cut.Ptrs().back();
-  histogram.Reset(total_bins, {kMaxBins, 0.5}, omp_get_max_threads(), 1, is_distributed);
+  histogram.Reset(total_bins, {kMaxBins, 0.5}, omp_get_max_threads(), 1, is_distributed, false);
 
   common::RowSetCollection row_set_collection_;
   {
@@ -226,11 +226,14 @@ TEST(CPUHistogram, SyncHist) {
   TestSyncHist(false);
 }
 
-void TestBuildHistogram(bool is_distributed, bool force_read_by_column) {
+void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_col_split) {
   size_t constexpr kNRows = 8, kNCols = 16;
   int32_t constexpr kMaxBins = 4;
-  auto p_fmat =
-      RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix();
+  auto p_fmat = RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix();
+  if (is_col_split) {
+    p_fmat = std::shared_ptr<DMatrix>{
+        p_fmat->SliceCol(collective::GetWorldSize(), collective::GetRank())};
+  }
   auto const &gmat = *(p_fmat->GetBatches<GHistIndexMatrix>(BatchParam{kMaxBins, 0.5}).begin());
   uint32_t total_bins = gmat.cut.Ptrs().back();
 
@@ -241,7 +244,8 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column) {
 
   bst_node_t nid = 0;
   HistogramBuilder<CPUExpandEntry> histogram;
-  histogram.Reset(total_bins, {kMaxBins, 0.5}, omp_get_max_threads(), 1, is_distributed);
+  histogram.Reset(total_bins, {kMaxBins, 0.5}, omp_get_max_threads(), 1, is_distributed,
+                  is_col_split);
 
   RegTree tree;
 
@@ -284,11 +288,16 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column) {
 }
 
 TEST(CPUHistogram, BuildHist) {
-  TestBuildHistogram(true, false);
-  TestBuildHistogram(false, false);
-  TestBuildHistogram(true, true);
-  TestBuildHistogram(false, true);
+  TestBuildHistogram(true, false, false);
+  TestBuildHistogram(false, false, false);
+  TestBuildHistogram(true, true, false);
+  TestBuildHistogram(false, true, false);
+}
 
+TEST(CPUHistogram, BuildHistColSplit) {
+  auto constexpr kWorkers = 4;  // args below: is_distributed, force_read_by_column, is_col_split
+  RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, true, true);
+  RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, false, true);
 }
 
 namespace {
@@ -340,7 +349,7 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
   HistogramBuilder<CPUExpandEntry> cat_hist;
   for (auto const &gidx : cat_m->GetBatches<GHistIndexMatrix>({kBins, 0.5})) {
     auto total_bins = gidx.cut.TotalBins();
-    cat_hist.Reset(total_bins, {kBins, 0.5}, omp_get_max_threads(), 1, false);
+    cat_hist.Reset(total_bins, {kBins, 0.5}, omp_get_max_threads(), 1, false, false);
     cat_hist.BuildHist(0, gidx, &tree, row_set_collection,
                         nodes_for_explicit_hist_build, {}, gpair.HostVector(),
                         force_read_by_column);
@@ -354,7 +363,7 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
   HistogramBuilder<CPUExpandEntry> onehot_hist;
   for (auto const &gidx : encode_m->GetBatches<GHistIndexMatrix>({kBins, 0.5})) {
     auto total_bins = gidx.cut.TotalBins();
-    onehot_hist.Reset(total_bins, {kBins, 0.5}, omp_get_max_threads(), 1, false);
+    onehot_hist.Reset(total_bins, {kBins, 0.5}, omp_get_max_threads(), 1, false, false);
     onehot_hist.BuildHist(0, gidx, &tree, row_set_collection, nodes_for_explicit_hist_build, {},
                           gpair.HostVector(),
                           force_read_by_column);
@@ -419,7 +428,7 @@ void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx, bool fo
         1, [&](size_t nidx_in_set) { return partition_size.at(nidx_in_set); },
         256};
 
-    multi_build.Reset(total_bins, batch_param, ctx.Threads(), rows_set.size(), false);
+    multi_build.Reset(total_bins, batch_param, ctx.Threads(), rows_set.size(), false, false);
 
     size_t page_idx{0};
     for (auto const &page : m->GetBatches<GHistIndexMatrix>(batch_param)) {
@@ -440,7 +449,7 @@ void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx, bool fo
     common::RowSetCollection row_set_collection;
     InitRowPartitionForTest(&row_set_collection, n_samples);
 
-    single_build.Reset(total_bins, batch_param, ctx.Threads(), 1, false);
+    single_build.Reset(total_bins, batch_param, ctx.Threads(), 1, false, false);
     SparsePage concat;
     std::vector<float> hess(m->Info().num_row_, 1.0f);
     for (auto const& page : m->GetBatches<SparsePage>()) {
diff --git a/tests/cpp/tree/test_approx.cc b/tests/cpp/tree/test_approx.cc
index 0b2d95100..cae76c373 100644
--- a/tests/cpp/tree/test_approx.cc
+++ b/tests/cpp/tree/test_approx.cc
@@ -10,29 +10,36 @@
 
 namespace xgboost {
 namespace tree {
-TEST(Approx, Partitioner) {
-  size_t n_samples = 1024, n_features = 1, base_rowid = 0;
-  Context ctx;
-  CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid};
-  ASSERT_EQ(partitioner.base_rowid, base_rowid);
-  ASSERT_EQ(partitioner.Size(), 1);
-  ASSERT_EQ(partitioner.Partitions()[0].Size(), n_samples);
-
-  auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
-  ctx.InitAllowUnknown(Args{});
-  std::vector<CPUExpandEntry> candidates{{0, 0, 0.4}};
 
+namespace {
+std::vector<float> GenerateHess(size_t n_samples) {
   auto grad = GenerateRandomGradients(n_samples);
   std::vector<float> hess(grad.Size());
   std::transform(grad.HostVector().cbegin(), grad.HostVector().cend(), hess.begin(),
                  [](auto gpair) { return gpair.GetHess(); });
+  return hess;
+}
+}  // anonymous namespace
+
+TEST(Approx, Partitioner) {
+  size_t n_samples = 1024, n_features = 1, base_rowid = 0;
+  Context ctx;
+  ctx.InitAllowUnknown(Args{});
+  CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
+  ASSERT_EQ(partitioner.base_rowid, base_rowid);
+  ASSERT_EQ(partitioner.Size(), 1);
+  ASSERT_EQ(partitioner.Partitions()[0].Size(), n_samples);
+
+  auto const Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
+  auto hess = GenerateHess(n_samples);
+  std::vector<CPUExpandEntry> candidates{{0, 0, 0.4}};
 
   for (auto const& page : Xy->GetBatches<GHistIndexMatrix>({64, hess, true})) {
     bst_feature_t const split_ind = 0;
     {
       auto min_value = page.cut.MinValues()[split_ind];
       RegTree tree;
-      CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid};
+      CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
       GetSplit(&tree, min_value, &candidates);
       partitioner.UpdatePosition(&ctx, page, candidates, &tree);
       ASSERT_EQ(partitioner.Size(), 3);
@@ -40,7 +47,7 @@ TEST(Approx, Partitioner) {
       ASSERT_EQ(partitioner[2].Size(), n_samples);
     }
     {
-      CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid};
+      CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
       auto ptr = page.cut.Ptrs()[split_ind + 1];
       float split_value = page.cut.Values().at(ptr / 2);
       RegTree tree;
@@ -66,12 +73,85 @@ TEST(Approx, Partitioner) {
   }
 }
 
+namespace {
+void TestColumnSplitPartitioner(size_t n_samples, size_t base_rowid, std::shared_ptr<DMatrix> Xy,
+                                std::vector<float>* hess, float min_value, float mid_value,
+                                CommonRowPartitioner const& expected_mid_partitioner) {
+  auto dmat =  // slice of Xy selected via SliceCol(world size, rank of this process)
+      std::unique_ptr<DMatrix>{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};
+  std::vector<CPUExpandEntry> candidates{{0, 0, 0.4}};
+  Context ctx;
+  ctx.InitAllowUnknown(Args{});
+  for (auto const& page : dmat->GetBatches<GHistIndexMatrix>({64, *hess, true})) {
+    {  // split at min_value: node 1 must be empty and node 2 must hold all n_samples
+      RegTree tree;
+      CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, true};
+      GetSplit(&tree, min_value, &candidates);
+      partitioner.UpdatePosition(&ctx, page, candidates, &tree);
+      ASSERT_EQ(partitioner.Size(), 3);
+      ASSERT_EQ(partitioner[1].Size(), 0);
+      ASSERT_EQ(partitioner[2].Size(), n_samples);
+    }
+    {  // split at mid_value: both children must match expected_mid_partitioner
+      CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, true};
+      RegTree tree;
+      GetSplit(&tree, mid_value, &candidates);
+      partitioner.UpdatePosition(&ctx, page, candidates, &tree);
+      // Left child: size strictly between 1 and n_samples, element-wise equal to the expected one.
+      auto left_nidx = tree[RegTree::kRoot].LeftChild();
+      auto elem = partitioner[left_nidx];
+      ASSERT_LT(elem.Size(), n_samples);
+      ASSERT_GT(elem.Size(), 1);
+      auto expected_elem = expected_mid_partitioner[left_nidx];
+      ASSERT_EQ(elem.Size(), expected_elem.Size());  // equal sizes keep eit in range below
+      for (auto it = elem.begin, eit = expected_elem.begin; it != elem.end; ++it, ++eit) {
+        ASSERT_EQ(*it, *eit);
+      }
+      // Right child: same size check and element-wise comparison.
+      auto right_nidx = tree[RegTree::kRoot].RightChild();
+      elem = partitioner[right_nidx];
+      expected_elem = expected_mid_partitioner[right_nidx];
+      ASSERT_EQ(elem.Size(), expected_elem.Size());
+      for (auto it = elem.begin, eit = expected_elem.begin; it != elem.end; ++it, ++eit) {
+        ASSERT_EQ(*it, *eit);
+      }
+    }
+  }
+}
+}  // anonymous namespace
+
+TEST(Approx, PartitionerColSplit) {
+  size_t n_samples = 1024, n_features = 16, base_rowid = 0;
+  auto const Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
+  auto hess = GenerateHess(n_samples);
+  std::vector<CPUExpandEntry> candidates{{0, 0, 0.4}};
+  // Value-initialised: both are passed by value to the workers even if the loop never runs.
+  float min_value{0.0f}, mid_value{0.0f};
+  Context ctx;
+  ctx.InitAllowUnknown(Args{});
+  CommonRowPartitioner mid_partitioner{&ctx, n_samples, base_rowid, false};
+  for (auto const& page : Xy->GetBatches<GHistIndexMatrix>({64, hess, true})) {
+    bst_feature_t const split_ind = 0;
+    min_value = page.cut.MinValues()[split_ind];
+    // mid_value is the cut value at index ptr / 2, with ptr = Ptrs()[split_ind + 1].
+    auto ptr = page.cut.Ptrs()[split_ind + 1];
+    mid_value = page.cut.Values().at(ptr / 2);
+    RegTree tree;
+    GetSplit(&tree, mid_value, &candidates);
+    mid_partitioner.UpdatePosition(&ctx, page, candidates, &tree);
+  }
+  // Workers slice Xy by column and compare their partitions against mid_partitioner.
+  auto constexpr kWorkers = 4;
+  RunWithInMemoryCommunicator(kWorkers, TestColumnSplitPartitioner, n_samples, base_rowid, Xy,
+                              &hess, min_value, mid_value, mid_partitioner);
+}
+
 namespace {
 void TestLeafPartition(size_t n_samples) {
   size_t const n_features = 2, base_rowid = 0;
   Context ctx;
   common::RowSetCollection row_set;
-  CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid};
+  CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
 
   auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
   std::vector<CPUExpandEntry> candidates{{0, 0, 0.4}};
diff --git a/tests/cpp/tree/test_evaluate_splits.h b/tests/cpp/tree/test_evaluate_splits.h
index 2421b8ba0..a74739faa 100644
--- a/tests/cpp/tree/test_evaluate_splits.h
+++ b/tests/cpp/tree/test_evaluate_splits.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2022 by XGBoost Contributors
+/**
+ * Copyright 2022-2023 by XGBoost Contributors
  */
 #include <gtest/gtest.h>
 #include <xgboost/data.h>
@@ -12,8 +12,7 @@
 #include "../../../src/tree/split_evaluator.h"
 #include "../helpers.h"
 
-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
 /**
  * \brief Enumerate all possible partitions for categorical split.
  */
@@ -151,5 +150,4 @@ class TestCategoricalSplitWithMissing : public testing::Test {
     ASSERT_EQ(right_sum.GetHess(), parent_sum_.GetHess() - left_sum.GetHess());
   }
 };
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index 100a4c393..e828d1379 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2017-2022 XGBoost contributors
+/**
+ * Copyright 2017-2023 by XGBoost contributors
  */
 #include <gtest/gtest.h>
 #include <thrust/device_vector.h>
@@ -13,6 +13,7 @@
 #include "../../../src/common/common.h"
 #include "../../../src/data/sparse_page_source.h"
 #include "../../../src/tree/constraints.cuh"
+#include "../../../src/tree/param.h"  // for TrainParam
 #include "../../../src/tree/updater_gpu_common.cuh"
 #include "../../../src/tree/updater_gpu_hist.cu"
 #include "../filesystem.h"  // dmlc::TemporaryDirectory
@@ -21,8 +22,7 @@
 #include "xgboost/context.h"
 #include "xgboost/json.h"
 
-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
 TEST(GpuHist, DeviceHistogram) {
   // Ensures that node allocates correctly after reaching `kStopGrowingSize`.
   dh::safe_cuda(cudaSetDevice(0));
@@ -83,11 +83,12 @@ void TestBuildHist(bool use_shared_memory_histograms) {
   int const kNRows = 16, kNCols = 8;
 
   TrainParam param;
-  std::vector<std::pair<std::string, std::string>> args {
-    {"max_depth", "6"},
-    {"max_leaves", "0"},
+  Args args{
+      {"max_depth", "6"},
+      {"max_leaves", "0"},
   };
   param.Init(args);
+
   auto page = BuildEllpackPage(kNRows, kNCols);
   BatchParam batch_param{};
   Context ctx{CreateEmptyGenericParam(0)};
@@ -168,7 +169,6 @@ void TestHistogramIndexImpl() {
   int constexpr kNRows = 1000, kNCols = 10;
 
   // Build 2 matrices and build a histogram maker with that
-
   Context ctx(CreateEmptyGenericParam(0));
   tree::GPUHistMaker hist_maker{&ctx, ObjInfo{ObjInfo::kRegression}},
       hist_maker_ext{&ctx, ObjInfo{ObjInfo::kRegression}};
@@ -179,15 +179,14 @@ void TestHistogramIndexImpl() {
   std::unique_ptr<DMatrix> hist_maker_ext_dmat(
     CreateSparsePageDMatrixWithRC(kNRows, kNCols, 128UL, true, tempdir));
 
-  std::vector<std::pair<std::string, std::string>> training_params = {
-    {"max_depth", "10"},
-    {"max_leaves", "0"}
-  };
+  Args training_params = {{"max_depth", "10"}, {"max_leaves", "0"}};
+  TrainParam param;
+  param.UpdateAllowUnknown(training_params);
 
   hist_maker.Configure(training_params);
-  hist_maker.InitDataOnce(hist_maker_dmat.get());
+  hist_maker.InitDataOnce(&param, hist_maker_dmat.get());
   hist_maker_ext.Configure(training_params);
-  hist_maker_ext.InitDataOnce(hist_maker_ext_dmat.get());
+  hist_maker_ext.InitDataOnce(&param, hist_maker_ext_dmat.get());
 
   // Extract the device maker from the histogram makers and from that its compressed
   // histogram index
@@ -237,13 +236,15 @@ void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
       {"subsample", std::to_string(subsample)},
       {"sampling_method", sampling_method},
   };
+  TrainParam param;
+  param.UpdateAllowUnknown(args);
 
   Context ctx(CreateEmptyGenericParam(0));
   tree::GPUHistMaker hist_maker{&ctx,ObjInfo{ObjInfo::kRegression}};
-  hist_maker.Configure(args);
 
   std::vector<HostDeviceVector<bst_node_t>> position(1);
-  hist_maker.Update(gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position}, {tree});
+  hist_maker.Update(&param, gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
+                    {tree});
   auto cache = linalg::VectorView<float>{preds->DeviceSpan(), {preds->Size()}, 0};
   hist_maker.UpdatePredictionCache(dmat, cache);
 }
@@ -391,13 +392,11 @@ TEST(GpuHist, ConfigIO) {
   Json j_updater { Object() };
   updater->SaveConfig(&j_updater);
   ASSERT_TRUE(IsA<Object>(j_updater["gpu_hist_train_param"]));
-  ASSERT_TRUE(IsA<Object>(j_updater["train_param"]));
   updater->LoadConfig(j_updater);
 
   Json j_updater_roundtrip { Object() };
   updater->SaveConfig(&j_updater_roundtrip);
   ASSERT_TRUE(IsA<Object>(j_updater_roundtrip["gpu_hist_train_param"]));
-  ASSERT_TRUE(IsA<Object>(j_updater_roundtrip["train_param"]));
 
   ASSERT_EQ(j_updater, j_updater_roundtrip);
 }
@@ -414,5 +413,4 @@ TEST(GpuHist, MaxDepth) {
 
   ASSERT_THROW({learner->UpdateOneIter(0, p_mat);}, dmlc::Error);
 }
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
diff --git a/tests/cpp/tree/test_histmaker.cc b/tests/cpp/tree/test_histmaker.cc
index 17dcb4c93..20340f539 100644
--- a/tests/cpp/tree/test_histmaker.cc
+++ b/tests/cpp/tree/test_histmaker.cc
@@ -1,33 +1,42 @@
+/**
+ * Copyright 2019-2023 by XGBoost Contributors
+ */
 #include <gtest/gtest.h>
-
 #include <xgboost/tree_model.h>
 #include <xgboost/tree_updater.h>
 
+#include "../../../src/tree/param.h"  // for TrainParam
 #include "../helpers.h"
 
-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
+std::shared_ptr<DMatrix> GenerateDMatrix(std::size_t rows, std::size_t cols) {
+  return RandomDataGenerator{rows, cols, 0.6f}.Seed(3).GenerateDMatrix();  // same seed every call
+}
 
-TEST(GrowHistMaker, InteractionConstraint) {
-  size_t constexpr kRows = 32;
-  size_t constexpr kCols = 16;
-
-  Context ctx;
-
-  auto p_dmat = RandomDataGenerator{kRows, kCols, 0.6f}.Seed(3).GenerateDMatrix();
-
-  HostDeviceVector<GradientPair> gradients (kRows);
-  std::vector<GradientPair>& h_gradients = gradients.HostVector();
+std::unique_ptr<HostDeviceVector<GradientPair>> GenerateGradients(std::size_t rows) {
+  auto p_gradients = std::make_unique<HostDeviceVector<GradientPair>>(rows);
+  auto& h_gradients = p_gradients->HostVector();
 
   xgboost::SimpleLCG gen;
   xgboost::SimpleRealUniformDistribution<bst_float> dist(0.0f, 1.0f);
 
-  for (size_t i = 0; i < kRows; ++i) {
-    bst_float grad = dist(&gen);
-    bst_float hess = dist(&gen);
-    h_gradients[i] = GradientPair(grad, hess);
+  for (std::size_t i = 0; i < rows; ++i) {
+    auto grad = dist(&gen);
+    auto hess = dist(&gen);
+    h_gradients[i] = GradientPair{grad, hess};
   }
 
+  return p_gradients;
+}
+
+TEST(GrowHistMaker, InteractionConstraint)
+{
+  auto constexpr kRows = 32;
+  auto constexpr kCols = 16;
+  auto p_dmat = GenerateDMatrix(kRows, kCols);
+  auto p_gradients = GenerateGradients(kRows);
+
+  Context ctx;
   {
     // With constraints
     RegTree tree;
@@ -35,11 +44,11 @@ TEST(GrowHistMaker, InteractionConstraint) {
 
     std::unique_ptr<TreeUpdater> updater{
         TreeUpdater::Create("grow_histmaker", &ctx, ObjInfo{ObjInfo::kRegression})};
-    updater->Configure(Args{
-        {"interaction_constraints", "[[0, 1]]"},
-        {"num_feature", std::to_string(kCols)}});
+    TrainParam param;
+    param.UpdateAllowUnknown(
+        Args{{"interaction_constraints", "[[0, 1]]"}, {"num_feature", std::to_string(kCols)}});
     std::vector<HostDeviceVector<bst_node_t>> position(1);
-    updater->Update(&gradients, p_dmat.get(), position, {&tree});
+    updater->Update(&param, p_gradients.get(), p_dmat.get(), position, {&tree});
 
     ASSERT_EQ(tree.NumExtraNodes(), 4);
     ASSERT_EQ(tree[0].SplitIndex(), 1);
@@ -54,9 +63,10 @@ TEST(GrowHistMaker, InteractionConstraint) {
 
     std::unique_ptr<TreeUpdater> updater{
         TreeUpdater::Create("grow_histmaker", &ctx, ObjInfo{ObjInfo::kRegression})};
-    updater->Configure(Args{{"num_feature", std::to_string(kCols)}});
     std::vector<HostDeviceVector<bst_node_t>> position(1);
-    updater->Update(&gradients, p_dmat.get(), position, {&tree});
+    TrainParam param;
+    param.Init(Args{});
+    updater->Update(&param, p_gradients.get(), p_dmat.get(), position, {&tree});
 
     ASSERT_EQ(tree.NumExtraNodes(), 10);
     ASSERT_EQ(tree[0].SplitIndex(), 1);
@@ -66,5 +76,53 @@ TEST(GrowHistMaker, InteractionConstraint) {
   }
 }
 
-}  // namespace tree
-}  // namespace xgboost
+namespace {
+void TestColumnSplit(int32_t rows, int32_t cols, RegTree const& expected_tree) {
+  auto p_dmat = GenerateDMatrix(rows, cols);
+  auto p_gradients = GenerateGradients(rows);
+  Context ctx;
+  std::unique_ptr<TreeUpdater> updater{
+      TreeUpdater::Create("grow_histmaker", &ctx, ObjInfo{ObjInfo::kRegression})};
+  std::vector<HostDeviceVector<bst_node_t>> position(1);
+  // The updater is run on the slice returned by SliceCol(world size, rank), not on p_dmat.
+  std::unique_ptr<DMatrix> sliced{
+      p_dmat->SliceCol(collective::GetWorldSize(), collective::GetRank())};
+
+  RegTree tree;
+  tree.param.num_feature = cols;
+  TrainParam param;
+  param.Init(Args{});
+  updater->Update(&param, p_gradients.get(), sliced.get(), position, {&tree});
+
+  EXPECT_EQ(tree.NumExtraNodes(), 10);
+  EXPECT_EQ(tree[0].SplitIndex(), 1);
+
+  EXPECT_NE(tree[tree[0].LeftChild()].SplitIndex(), 0);
+  EXPECT_NE(tree[tree[0].RightChild()].SplitIndex(), 0);
+  // The resulting tree must compare equal to the expected_tree supplied by the caller.
+  EXPECT_EQ(tree, expected_tree);
+}
+}  // anonymous namespace
+
+TEST(GrowHistMaker, ColumnSplit) {
+  auto constexpr kRows = 32;
+  auto constexpr kCols = 16;
+  // Reference tree: grown in the scoped block below on the full matrix from GenerateDMatrix.
+  RegTree expected_tree;
+  expected_tree.param.num_feature = kCols;
+  {
+    auto p_dmat = GenerateDMatrix(kRows, kCols);
+    auto p_gradients = GenerateGradients(kRows);
+    Context ctx;
+    std::unique_ptr<TreeUpdater> updater{
+        TreeUpdater::Create("grow_histmaker", &ctx, ObjInfo{ObjInfo::kRegression})};
+    std::vector<HostDeviceVector<bst_node_t>> position(1);
+    TrainParam param;
+    param.Init(Args{});
+    updater->Update(&param, p_gradients.get(), p_dmat.get(), position, {&expected_tree});
+  }
+  // TestColumnSplit regrows the tree on a column slice; the reference goes in via std::cref.
+  auto constexpr kWorldSize = 2;
+  RunWithInMemoryCommunicator(kWorldSize, TestColumnSplit, kRows, kCols, std::cref(expected_tree));
+}
+}  // namespace xgboost::tree
diff --git a/tests/cpp/tree/test_prediction_cache.cc b/tests/cpp/tree/test_prediction_cache.cc
index dc41b3edd..f4e67d836 100644
--- a/tests/cpp/tree/test_prediction_cache.cc
+++ b/tests/cpp/tree/test_prediction_cache.cc
@@ -7,6 +7,7 @@
 
 #include <memory>
 
+#include "../../../src/tree/param.h"  // for TrainParam
 #include "../helpers.h"
 
 namespace xgboost {
@@ -75,9 +76,11 @@ class TestPredictionCache : public ::testing::Test {
       RegTree tree;
       std::vector<RegTree *> trees{&tree};
       auto gpair = GenerateRandomGradients(n_samples_);
-      updater->Configure(Args{{"max_bin", "64"}});
+      tree::TrainParam param;
+      param.UpdateAllowUnknown(Args{{"max_bin", "64"}});
+
       std::vector<HostDeviceVector<bst_node_t>> position(1);
-      updater->Update(&gpair, Xy_.get(), position, trees);
+      updater->Update(&param, &gpair, Xy_.get(), position, trees);
       HostDeviceVector<float> out_prediction_cached;
       out_prediction_cached.SetDevice(ctx.gpu_id);
       out_prediction_cached.Resize(n_samples_);
diff --git a/tests/cpp/tree/test_prune.cc b/tests/cpp/tree/test_prune.cc
index 52fa58a2d..258396976 100644
--- a/tests/cpp/tree/test_prune.cc
+++ b/tests/cpp/tree/test_prune.cc
@@ -1,28 +1,26 @@
-/*!
- * Copyright 2018-2019 by Contributors
+/**
+ * Copyright 2018-2023 by XGBoost Contributors
  */
+#include <gtest/gtest.h>
 #include <xgboost/data.h>
 #include <xgboost/host_device_vector.h>
-#include <xgboost/tree_updater.h>
 #include <xgboost/learner.h>
-#include <gtest/gtest.h>
-#include <vector>
-#include <string>
-#include <memory>
+#include <xgboost/tree_updater.h>
 
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "../../../src/tree/param.h"  // for TrainParam
 #include "../helpers.h"
 
-namespace xgboost {
-namespace tree {
-
+namespace xgboost::tree {
 TEST(Updater, Prune) {
   int constexpr kCols = 16;
 
   std::vector<std::pair<std::string, std::string>> cfg;
-  cfg.emplace_back(std::pair<std::string, std::string>("num_feature",
-                                                       std::to_string(kCols)));
-  cfg.emplace_back(std::pair<std::string, std::string>(
-      "min_split_loss", "10"));
+  cfg.emplace_back("num_feature", std::to_string(kCols));
+  cfg.emplace_back("min_split_loss", "10");
 
   // These data are just place holders.
   HostDeviceVector<GradientPair> gpair =
@@ -38,28 +36,30 @@ TEST(Updater, Prune) {
   tree.param.UpdateAllowUnknown(cfg);
   std::vector<RegTree*> trees {&tree};
   // prepare pruner
+  TrainParam param;
+  param.UpdateAllowUnknown(cfg);
+
   std::unique_ptr<TreeUpdater> pruner(
       TreeUpdater::Create("prune", &ctx, ObjInfo{ObjInfo::kRegression}));
-  pruner->Configure(cfg);
 
   // loss_chg < min_split_loss;
   std::vector<HostDeviceVector<bst_node_t>> position(trees.size());
   tree.ExpandNode(0, 0, 0, true, 0.0f, 0.3f, 0.4f, 0.0f, 0.0f,
                   /*left_sum=*/0.0f, /*right_sum=*/0.0f);
-  pruner->Update(&gpair, p_dmat.get(), position, trees);
+  pruner->Update(&param, &gpair, p_dmat.get(), position, trees);
 
   ASSERT_EQ(tree.NumExtraNodes(), 0);
 
   // loss_chg > min_split_loss;
   tree.ExpandNode(0, 0, 0, true, 0.0f, 0.3f, 0.4f, 11.0f, 0.0f,
                   /*left_sum=*/0.0f, /*right_sum=*/0.0f);
-  pruner->Update(&gpair, p_dmat.get(), position, trees);
+  pruner->Update(&param, &gpair, p_dmat.get(), position, trees);
 
   ASSERT_EQ(tree.NumExtraNodes(), 2);
 
   // loss_chg == min_split_loss;
   tree.Stat(0).loss_chg = 10;
-  pruner->Update(&gpair, p_dmat.get(), position, trees);
+  pruner->Update(&param, &gpair, p_dmat.get(), position, trees);
 
   ASSERT_EQ(tree.NumExtraNodes(), 2);
 
@@ -73,20 +73,20 @@ TEST(Updater, Prune) {
                   0, 0.5f, true, 0.3, 0.4, 0.5,
                   /*loss_chg=*/19.0f, 0.0f,
                   /*left_sum=*/0.0f, /*right_sum=*/0.0f);
-  cfg.emplace_back(std::make_pair("max_depth", "1"));
-  pruner->Configure(cfg);
-  pruner->Update(&gpair, p_dmat.get(), position, trees);
 
+  cfg.emplace_back("max_depth", "1");
+  param.UpdateAllowUnknown(cfg);
+  pruner->Update(&param, &gpair, p_dmat.get(), position, trees);
   ASSERT_EQ(tree.NumExtraNodes(), 2);
 
   tree.ExpandNode(tree[0].LeftChild(),
                   0, 0.5f, true, 0.3, 0.4, 0.5,
                   /*loss_chg=*/18.0f, 0.0f,
                   /*left_sum=*/0.0f, /*right_sum=*/0.0f);
-  cfg.emplace_back(std::make_pair("min_split_loss", "0"));
-  pruner->Configure(cfg);
-  pruner->Update(&gpair, p_dmat.get(), position, trees);
+  cfg.emplace_back("min_split_loss", "0");
+  param.UpdateAllowUnknown(cfg);
+
+  pruner->Update(&param, &gpair, p_dmat.get(), position, trees);
   ASSERT_EQ(tree.NumExtraNodes(), 2);
 }
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc
index 23cb868ee..ad98d1d6b 100644
--- a/tests/cpp/tree/test_quantile_hist.cc
+++ b/tests/cpp/tree/test_quantile_hist.cc
@@ -23,7 +23,7 @@ TEST(QuantileHist, Partitioner) {
   Context ctx;
   ctx.InitAllowUnknown(Args{});
 
-  CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid};
+  CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
   ASSERT_EQ(partitioner.base_rowid, base_rowid);
   ASSERT_EQ(partitioner.Size(), 1);
   ASSERT_EQ(partitioner.Partitions()[0].Size(), n_samples);
@@ -41,7 +41,7 @@ TEST(QuantileHist, Partitioner) {
     {
       auto min_value = gmat.cut.MinValues()[split_ind];
       RegTree tree;
-      CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid};
+      CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
       GetSplit(&tree, min_value, &candidates);
       partitioner.UpdatePosition<false, true>(&ctx, gmat, column_indices, candidates, &tree);
       ASSERT_EQ(partitioner.Size(), 3);
@@ -49,7 +49,7 @@ TEST(QuantileHist, Partitioner) {
       ASSERT_EQ(partitioner[2].Size(), n_samples);
     }
     {
-      CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid};
+      CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
       auto ptr = gmat.cut.Ptrs()[split_ind + 1];
       float split_value = gmat.cut.Values().at(ptr / 2);
       RegTree tree;
diff --git a/tests/cpp/tree/test_refresh.cc b/tests/cpp/tree/test_refresh.cc
index 953d2eea4..870022724 100644
--- a/tests/cpp/tree/test_refresh.cc
+++ b/tests/cpp/tree/test_refresh.cc
@@ -1,14 +1,15 @@
-/*!
- * Copyright 2018-2019 by Contributors
+/**
+ * Copyright 2018-2023 by XGBoost Contributors
  */
+#include <gtest/gtest.h>
 #include <xgboost/host_device_vector.h>
 #include <xgboost/tree_updater.h>
-#include <gtest/gtest.h>
 
-#include <vector>
-#include <string>
 #include <memory>
+#include <string>
+#include <vector>
 
+#include "../../../src/tree/param.h"  // for TrainParam
 #include "../helpers.h"
 
 namespace xgboost {
@@ -43,9 +44,11 @@ TEST(Updater, Refresh) {
   tree.Stat(cleft).base_weight = 1.2;
   tree.Stat(cright).base_weight = 1.3;
 
-  refresher->Configure(cfg);
   std::vector<HostDeviceVector<bst_node_t>> position;
-  refresher->Update(&gpair, p_dmat.get(), position, trees);
+  tree::TrainParam param;
+  param.UpdateAllowUnknown(cfg);
+
+  refresher->Update(&param, &gpair, p_dmat.get(), position, trees);
 
   bst_float constexpr kEps = 1e-6;
   ASSERT_NEAR(-0.183392, tree[cright].LeafValue(), kEps);
diff --git a/tests/cpp/tree/test_tree_stat.cc b/tests/cpp/tree/test_tree_stat.cc
index 5b52534c1..4757bb3c1 100644
--- a/tests/cpp/tree/test_tree_stat.cc
+++ b/tests/cpp/tree/test_tree_stat.cc
@@ -1,7 +1,11 @@
+/**
+ * Copyright 2020-2023 by XGBoost Contributors
+ */
 #include <gtest/gtest.h>
 #include <xgboost/tree_model.h>
 #include <xgboost/tree_updater.h>
 
+#include "../../../src/tree/param.h"  // for TrainParam
 #include "../helpers.h"
 
 namespace xgboost {
@@ -21,6 +25,9 @@ class UpdaterTreeStatTest : public ::testing::Test {
   }
 
   void RunTest(std::string updater) {
+    tree::TrainParam param;
+    param.Init(Args{});
+
     Context ctx(updater == "grow_gpu_hist" ? CreateEmptyGenericParam(0)
                                            : CreateEmptyGenericParam(Context::kCpuId));
     auto up = std::unique_ptr<TreeUpdater>{
@@ -29,7 +36,7 @@ class UpdaterTreeStatTest : public ::testing::Test {
     RegTree tree;
     tree.param.num_feature = kCols;
     std::vector<HostDeviceVector<bst_node_t>> position(1);
-    up->Update(&gpairs_, p_dmat_.get(), position, {&tree});
+    up->Update(&param, &gpairs_, p_dmat_.get(), position, {&tree});
 
     tree.WalkTree([&tree](bst_node_t nidx) {
       if (tree[nidx].IsLeaf()) {
@@ -69,28 +76,33 @@ class UpdaterEtaTest : public ::testing::Test {
   void RunTest(std::string updater) {
     Context ctx(updater == "grow_gpu_hist" ? CreateEmptyGenericParam(0)
                                            : CreateEmptyGenericParam(Context::kCpuId));
+
     float eta = 0.4;
     auto up_0 = std::unique_ptr<TreeUpdater>{
         TreeUpdater::Create(updater, &ctx, ObjInfo{ObjInfo::kClassification})};
-    up_0->Configure(Args{{"eta", std::to_string(eta)}});
+    up_0->Configure(Args{});
+    tree::TrainParam param0;
+    param0.Init(Args{{"eta", std::to_string(eta)}});
 
     auto up_1 = std::unique_ptr<TreeUpdater>{
         TreeUpdater::Create(updater, &ctx, ObjInfo{ObjInfo::kClassification})};
     up_1->Configure(Args{{"eta", "1.0"}});
+    tree::TrainParam param1;
+    param1.Init(Args{{"eta", "1.0"}});
 
     for (size_t iter = 0; iter < 4; ++iter) {
       RegTree tree_0;
       {
         tree_0.param.num_feature = kCols;
         std::vector<HostDeviceVector<bst_node_t>> position(1);
-        up_0->Update(&gpairs_, p_dmat_.get(), position, {&tree_0});
+        up_0->Update(&param0, &gpairs_, p_dmat_.get(), position, {&tree_0});
       }
 
       RegTree tree_1;
       {
         tree_1.param.num_feature = kCols;
         std::vector<HostDeviceVector<bst_node_t>> position(1);
-        up_1->Update(&gpairs_, p_dmat_.get(), position, {&tree_1});
+        up_1->Update(&param1, &gpairs_, p_dmat_.get(), position, {&tree_1});
       }
       tree_0.WalkTree([&](bst_node_t nidx) {
         if (tree_0[nidx].IsLeaf()) {
@@ -139,17 +151,18 @@ class TestMinSplitLoss : public ::testing::Test {
 
               // test gamma
               {"gamma", std::to_string(gamma)}};
+    tree::TrainParam param;
+    param.UpdateAllowUnknown(args);
 
     Context ctx(updater == "grow_gpu_hist" ? CreateEmptyGenericParam(0)
                                            : CreateEmptyGenericParam(Context::kCpuId));
-    std::cout << ctx.gpu_id << std::endl;
     auto up = std::unique_ptr<TreeUpdater>{
         TreeUpdater::Create(updater, &ctx, ObjInfo{ObjInfo::kRegression})};
-    up->Configure(args);
+    up->Configure({});
 
     RegTree tree;
     std::vector<HostDeviceVector<bst_node_t>> position(1);
-    up->Update(&gpair_, dmat_.get(), position, {&tree});
+    up->Update(&param, &gpair_, dmat_.get(), position, {&tree});
 
     auto n_nodes = tree.NumExtraNodes();
     return n_nodes;
diff --git a/tests/python-gpu/test_gpu_basic_models.py b/tests/python-gpu/test_gpu_basic_models.py
index 83d1a2557..a6f50c224 100644
--- a/tests/python-gpu/test_gpu_basic_models.py
+++ b/tests/python-gpu/test_gpu_basic_models.py
@@ -42,9 +42,15 @@ class TestGPUBasicModels:
     def test_custom_objective(self):
         self.cpu_test_bm.run_custom_objective("gpu_hist")
 
-    def test_eta_decay_gpu_hist(self):
+    def test_eta_decay(self):
         self.cpu_test_cb.run_eta_decay('gpu_hist')
 
+    @pytest.mark.parametrize(
+        "objective", ["binary:logistic", "reg:absoluteerror", "reg:quantileerror"]
+    )
+    def test_eta_decay_leaf_output(self, objective) -> None:
+        self.cpu_test_cb.run_eta_decay_leaf_output("gpu_hist", objective)
+
     def test_deterministic_gpu_hist(self):
         kRows = 1000
         kCols = 64
diff --git a/tests/python-gpu/test_gpu_data_iterator.py b/tests/python-gpu/test_gpu_data_iterator.py
index 23e495bcc..4325b6308 100644
--- a/tests/python-gpu/test_gpu_data_iterator.py
+++ b/tests/python-gpu/test_gpu_data_iterator.py
@@ -2,6 +2,7 @@ import sys
 
 import pytest
 from hypothesis import given, settings, strategies
+
 from xgboost.testing import no_cupy
 
 sys.path.append("tests/python")
diff --git a/tests/python-gpu/test_gpu_eval_metrics.py b/tests/python-gpu/test_gpu_eval_metrics.py
index 2e3b29f99..6d16aa44e 100644
--- a/tests/python-gpu/test_gpu_eval_metrics.py
+++ b/tests/python-gpu/test_gpu_eval_metrics.py
@@ -1,10 +1,10 @@
 import sys
 
 import pytest
-from xgboost.testing.metrics import check_quantile_error
 
 import xgboost
 from xgboost import testing as tm
+from xgboost.testing.metrics import check_quantile_error
 
 sys.path.append("tests/python")
 import test_eval_metrics as test_em  # noqa
diff --git a/tests/python-gpu/test_gpu_prediction.py b/tests/python-gpu/test_gpu_prediction.py
index 3f8b4557f..c4d9abba5 100644
--- a/tests/python-gpu/test_gpu_prediction.py
+++ b/tests/python-gpu/test_gpu_prediction.py
@@ -3,10 +3,10 @@ import sys
 import numpy as np
 import pytest
 from hypothesis import assume, given, settings, strategies
-from xgboost.compat import PANDAS_INSTALLED
 
 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.compat import PANDAS_INSTALLED
 
 if PANDAS_INSTALLED:
     from hypothesis.extra.pandas import column, data_frames, range_indexes
@@ -215,6 +215,7 @@ class TestGPUPredict:
     def test_inplace_predict_cupy(self):
         self.run_inplace_predict_cupy(0)
 
+    @pytest.mark.xfail
     @pytest.mark.skipif(**tm.no_cupy())
     @pytest.mark.mgpu
     def test_inplace_predict_cupy_specified_device(self):
diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py
index 571c4a171..6b28296b2 100644
--- a/tests/python-gpu/test_gpu_updaters.py
+++ b/tests/python-gpu/test_gpu_updaters.py
@@ -4,11 +4,11 @@ from typing import Any, Dict
 import numpy as np
 import pytest
 from hypothesis import assume, given, note, settings, strategies
-from xgboost.testing.params import cat_parameter_strategy, hist_parameter_strategy
-from xgboost.testing.updater import check_init_estimation
 
 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.testing.params import cat_parameter_strategy, hist_parameter_strategy
+from xgboost.testing.updater import check_init_estimation, check_quantile_loss
 
 sys.path.append("tests/python")
 import test_updaters as test_up
@@ -209,3 +209,38 @@ class TestGPUUpdaters:
 
     def test_init_estimation(self) -> None:
         check_init_estimation("gpu_hist")
+
+    @pytest.mark.parametrize("weighted", [True, False])
+    def test_quantile_loss(self, weighted: bool) -> None:
+        check_quantile_loss("gpu_hist", weighted)
+
+    @pytest.mark.skipif(**tm.no_pandas())
+    def test_issue8824(self):
+        # column sampling by node crashes because shared pointers go out of scope
+        import pandas as pd
+
+        data = pd.DataFrame(np.random.rand(1024, 8))
+        data.columns = "x" + data.columns.astype(str)
+        features = data.columns
+        data["y"] = data.sum(axis=1) < 4
+        dtrain = xgb.DMatrix(data[features], label=data["y"])
+        model = xgb.train(
+            dtrain=dtrain,
+            params={
+                "max_depth": 5,
+                "learning_rate": 0.05,
+                "objective": "binary:logistic",
+                "tree_method": "gpu_hist",
+                "colsample_bytree": 0.5,
+                "colsample_bylevel": 0.5,
+                "colsample_bynode": 0.5,  # Causes issues
+                "reg_alpha": 0.05,
+                "reg_lambda": 0.005,
+                "seed": 66,
+                "subsample": 0.5,
+                "gamma": 0.2,
+                "predictor": "auto",
+                "eval_metric": "auc",
+            },
+            num_boost_round=150,
+        )
diff --git a/tests/python-gpu/test_gpu_with_sklearn.py b/tests/python-gpu/test_gpu_with_sklearn.py
index 8ecb4bdc7..c9d3ab4eb 100644
--- a/tests/python-gpu/test_gpu_with_sklearn.py
+++ b/tests/python-gpu/test_gpu_with_sklearn.py
@@ -8,6 +8,7 @@ import pytest
 
 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.testing.ranking import run_ranking_qid_df
 
 sys.path.append("tests/python")
 import test_with_sklearn as twskl  # noqa
@@ -153,3 +154,10 @@ def test_classififer():
     y *= 10
     with pytest.raises(ValueError, match=r"Invalid classes.*"):
         clf.fit(X, y)
+
+
+@pytest.mark.skipif(**tm.no_cudf())  # the body imports cudf, so guard on cudf
+def test_ranking_qid_df():
+    import cudf  # imported lazily so collection works without cuDF installed
+
+    run_ranking_qid_df(cudf, "gpu_hist")
diff --git a/tests/python/test_callback.py b/tests/python/test_callback.py
index 3e972345b..fabf8672e 100644
--- a/tests/python/test_callback.py
+++ b/tests/python/test_callback.py
@@ -1,3 +1,4 @@
+import json
 import os
 import tempfile
 from contextlib import nullcontext
@@ -355,47 +356,125 @@ class TestCallbacks:
         with warning_check:
             xgb.cv(param, dtrain, num_round, callbacks=[scheduler(eta_decay)])
 
-    @pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"])
+    def run_eta_decay_leaf_output(self, tree_method: str, objective: str) -> None:
+        """Check that learning-rate decay has an effect on the leaf output."""
+        num_round = 4
+        scheduler = xgb.callback.LearningRateScheduler
+
+        dpath = tm.data_dir(__file__)
+        dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
+        dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
+        watchlist = [(dtest, 'eval'), (dtrain, 'train')]
+
+        param = {
+            "max_depth": 2,
+            "objective": objective,
+            "eval_metric": "error",
+            "tree_method": tree_method,
+        }
+        if objective == "reg:quantileerror":
+            param["quantile_alpha"] = 0.3
+
+        def eta_decay_0(i: int) -> float:
+            return num_round / (i + 1)
+
+        bst0 = xgb.train(
+            param,
+            dtrain,
+            num_round,
+            watchlist,
+            callbacks=[scheduler(eta_decay_0)],
+        )
+
+        def eta_decay_1(i: int) -> float:
+            if i > 1:  # differs from eta_decay_0 only when i >= 2
+                return 5.0
+            return num_round / (i + 1)
+
+        bst1 = xgb.train(
+            param,
+            dtrain,
+            num_round,
+            watchlist,
+            callbacks=[scheduler(eta_decay_1)],
+        )
+        bst_json0 = bst0.save_raw(raw_format="json")
+        bst_json1 = bst1.save_raw(raw_format="json")
+
+        j0 = json.loads(bst_json0)
+        j1 = json.loads(bst_json1)
+        # Trees at index 2 must match between the two runs; trees at index 3 must not.
+        tree_2th_0 = j0["learner"]["gradient_booster"]["model"]["trees"][2]
+        tree_2th_1 = j1["learner"]["gradient_booster"]["model"]["trees"][2]
+        assert tree_2th_0["base_weights"] == tree_2th_1["base_weights"]
+        assert tree_2th_0["split_conditions"] == tree_2th_1["split_conditions"]
+
+        tree_3th_0 = j0["learner"]["gradient_booster"]["model"]["trees"][3]
+        tree_3th_1 = j1["learner"]["gradient_booster"]["model"]["trees"][3]
+        assert tree_3th_0["base_weights"] != tree_3th_1["base_weights"]
+        assert tree_3th_0["split_conditions"] != tree_3th_1["split_conditions"]
+
+    @pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"])
     def test_eta_decay(self, tree_method):
         self.run_eta_decay(tree_method)
 
+    @pytest.mark.parametrize(
+        "tree_method,objective",
+        [
+            ("hist", "binary:logistic"),
+            ("hist", "reg:absoluteerror"),
+            ("hist", "reg:quantileerror"),
+            ("approx", "binary:logistic"),
+            ("approx", "reg:absoluteerror"),
+            ("approx", "reg:quantileerror"),
+        ],
+    )
+    def test_eta_decay_leaf_output(self, tree_method: str, objective: str) -> None:
+        self.run_eta_decay_leaf_output(tree_method, objective)
+
     def test_check_point(self):
         from sklearn.datasets import load_breast_cancer
+
         X, y = load_breast_cancer(return_X_y=True)
         m = xgb.DMatrix(X, y)
         with tempfile.TemporaryDirectory() as tmpdir:
-            check_point = xgb.callback.TrainingCheckPoint(directory=tmpdir,
-                                                          iterations=1,
-                                                          name='model')
-            xgb.train({'objective': 'binary:logistic'}, m,
-                      num_boost_round=10,
-                      verbose_eval=False,
-                      callbacks=[check_point])
+            check_point = xgb.callback.TrainingCheckPoint(
+                directory=tmpdir, iterations=1, name="model"
+            )
+            xgb.train(
+                {"objective": "binary:logistic"},
+                m,
+                num_boost_round=10,
+                verbose_eval=False,
+                callbacks=[check_point],
+            )
             for i in range(1, 10):
-                assert os.path.exists(
-                    os.path.join(tmpdir, 'model_' + str(i) + '.json'))
+                assert os.path.exists(os.path.join(tmpdir, "model_" + str(i) + ".json"))
 
-            check_point = xgb.callback.TrainingCheckPoint(directory=tmpdir,
-                                                          iterations=1,
-                                                          as_pickle=True,
-                                                          name='model')
-            xgb.train({'objective': 'binary:logistic'}, m,
-                      num_boost_round=10,
-                      verbose_eval=False,
-                      callbacks=[check_point])
+            check_point = xgb.callback.TrainingCheckPoint(
+                directory=tmpdir, iterations=1, as_pickle=True, name="model"
+            )
+            xgb.train(
+                {"objective": "binary:logistic"},
+                m,
+                num_boost_round=10,
+                verbose_eval=False,
+                callbacks=[check_point],
+            )
             for i in range(1, 10):
-                assert os.path.exists(
-                    os.path.join(tmpdir, 'model_' + str(i) + '.pkl'))
+                assert os.path.exists(os.path.join(tmpdir, "model_" + str(i) + ".pkl"))
 
     def test_callback_list(self):
         X, y = tm.get_california_housing()
         m = xgb.DMatrix(X, y)
         callbacks = [xgb.callback.EarlyStopping(rounds=10)]
         for i in range(4):
-            xgb.train({'objective': 'reg:squarederror',
-                       'eval_metric': 'rmse'}, m,
-                      evals=[(m, 'Train')],
-                      num_boost_round=1,
-                      verbose_eval=True,
-                      callbacks=callbacks)
+            xgb.train(
+                {"objective": "reg:squarederror", "eval_metric": "rmse"},
+                m,
+                evals=[(m, "Train")],
+                num_boost_round=1,
+                verbose_eval=True,
+                callbacks=callbacks,
+            )
         assert len(callbacks) == 1
diff --git a/tests/python/test_data_iterator.py b/tests/python/test_data_iterator.py
index 4b4258a21..0590a4954 100644
--- a/tests/python/test_data_iterator.py
+++ b/tests/python/test_data_iterator.py
@@ -4,11 +4,11 @@ import numpy as np
 import pytest
 from hypothesis import given, settings, strategies
 from scipy.sparse import csr_matrix
-from xgboost.data import SingleBatchInternalIter as SingleBatch
-from xgboost.testing import IteratorForTest, make_batches, non_increasing
 
 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.data import SingleBatchInternalIter as SingleBatch
+from xgboost.testing import IteratorForTest, make_batches, non_increasing
 
 pytestmark = tm.timeout(30)
 
diff --git a/tests/python/test_demos.py b/tests/python/test_demos.py
index 28797f160..c54f35046 100644
--- a/tests/python/test_demos.py
+++ b/tests/python/test_demos.py
@@ -146,6 +146,13 @@ def test_multioutput_reg() -> None:
     subprocess.check_call(cmd)
 
 
+@pytest.mark.skipif(**tm.no_sklearn())
+def test_quantile_reg() -> None:
+    script = os.path.join(PYTHON_DEMO_DIR, "quantile_regression.py")
+    cmd = ['python', script]
+    subprocess.check_call(cmd)
+
+
 @pytest.mark.skipif(**tm.no_ubjson())
 def test_json_model() -> None:
     script = os.path.join(DEMO_DIR, "json-model", "json_parser.py")
diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py
index 610a46639..ef56ff656 100644
--- a/tests/python/test_dmatrix.py
+++ b/tests/python/test_dmatrix.py
@@ -6,10 +6,10 @@ import pytest
 import scipy.sparse
 from hypothesis import given, settings, strategies
 from scipy.sparse import csr_matrix, rand
-from xgboost.testing.data import np_dtypes
 
 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.testing.data import np_dtypes
 
 rng = np.random.RandomState(1)
 
diff --git a/tests/python/test_early_stopping.py b/tests/python/test_early_stopping.py
index 000d5e347..47f58cbd6 100644
--- a/tests/python/test_early_stopping.py
+++ b/tests/python/test_early_stopping.py
@@ -1,9 +1,9 @@
 import numpy as np
 import pytest
-from xgboost.testing.updater import get_basescore
 
 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.testing.updater import get_basescore
 
 rng = np.random.RandomState(1994)
 
diff --git a/tests/python/test_eval_metrics.py b/tests/python/test_eval_metrics.py
index 5b4e5751c..3b7dc5b8e 100644
--- a/tests/python/test_eval_metrics.py
+++ b/tests/python/test_eval_metrics.py
@@ -1,9 +1,9 @@
 import numpy as np
 import pytest
-from xgboost.testing.metrics import check_quantile_error
 
 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.testing.metrics import check_quantile_error
 
 rng = np.random.RandomState(1337)
 
diff --git a/tests/python/test_pickling.py b/tests/python/test_pickling.py
index 161a5fd4e..2f4d77bf0 100644
--- a/tests/python/test_pickling.py
+++ b/tests/python/test_pickling.py
@@ -51,11 +51,8 @@ class TestPickling:
 
     def test_model_pickling_json(self):
         def check(config):
-            updater = config["learner"]["gradient_booster"]["updater"]
-            if params["tree_method"] == "exact":
-                subsample = updater["grow_colmaker"]["train_param"]["subsample"]
-            else:
-                subsample = updater["grow_quantile_histmaker"]["train_param"]["subsample"]
+            tree_param = config["learner"]["gradient_booster"]["tree_train_param"]
+            subsample = tree_param["subsample"]
             assert float(subsample) == 0.5
 
         params = {"nthread": 8, "tree_method": "hist", "subsample": 0.5}
diff --git a/tests/python/test_predict.py b/tests/python/test_predict.py
index 63c0ff9d7..cb400df87 100644
--- a/tests/python/test_predict.py
+++ b/tests/python/test_predict.py
@@ -5,11 +5,11 @@ import numpy as np
 import pandas as pd
 import pytest
 from scipy import sparse
-from xgboost.testing.data import np_dtypes, pd_dtypes
-from xgboost.testing.shared import validate_leaf_output
 
 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.testing.data import np_dtypes, pd_dtypes
+from xgboost.testing.shared import validate_leaf_output
 
 
 def run_threaded_predict(X, rows, predict_func):
diff --git a/tests/python/test_quantile_dmatrix.py b/tests/python/test_quantile_dmatrix.py
index 82815d883..316d0e5f6 100644
--- a/tests/python/test_quantile_dmatrix.py
+++ b/tests/python/test_quantile_dmatrix.py
@@ -4,6 +4,8 @@ import numpy as np
 import pytest
 from hypothesis import given, settings, strategies
 from scipy import sparse
+
+import xgboost as xgb
 from xgboost.testing import (
     IteratorForTest,
     make_batches,
@@ -15,8 +17,6 @@ from xgboost.testing import (
 )
 from xgboost.testing.data import np_dtypes
 
-import xgboost as xgb
-
 
 class TestQuantileDMatrix:
     def test_basic(self) -> None:
diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py
index 130af619c..be72793e7 100644
--- a/tests/python/test_updaters.py
+++ b/tests/python/test_updaters.py
@@ -5,15 +5,15 @@ from typing import Any, Dict, List
 import numpy as np
 import pytest
 from hypothesis import given, note, settings, strategies
+
+import xgboost as xgb
+from xgboost import testing as tm
 from xgboost.testing.params import (
     cat_parameter_strategy,
     exact_parameter_strategy,
     hist_parameter_strategy,
 )
-from xgboost.testing.updater import check_init_estimation
-
-import xgboost as xgb
-from xgboost import testing as tm
+from xgboost.testing.updater import check_init_estimation, check_quantile_loss
 
 
 def train_result(param, dmat, num_rounds):
@@ -447,7 +447,8 @@ class TestTreeMethod:
             {
                 "tree_method": tree_method,
                 "objective": "reg:absoluteerror",
-                "subsample": 0.8
+                "subsample": 0.8,
+                "eta": 1.0,
             },
             Xy,
             num_boost_round=10,
@@ -469,3 +470,7 @@ class TestTreeMethod:
 
     def test_init_estimation(self) -> None:
         check_init_estimation("hist")
+
+    @pytest.mark.parametrize("weighted", [True, False])
+    def test_quantile_loss(self, weighted: bool) -> None:
+        check_quantile_loss("hist", weighted)
diff --git a/tests/python/test_with_pandas.py b/tests/python/test_with_pandas.py
index 99b34c336..e5783b24d 100644
--- a/tests/python/test_with_pandas.py
+++ b/tests/python/test_with_pandas.py
@@ -3,10 +3,10 @@ from typing import Type
 import numpy as np
 import pytest
 from test_dmatrix import set_base_margin_info
-from xgboost.testing.data import pd_arrow_dtypes, pd_dtypes
 
 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.testing.data import pd_arrow_dtypes, pd_dtypes
 
 try:
     import pandas as pd
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index 55e14ae97..baef690ee 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -8,11 +8,12 @@ from typing import Callable, Optional
 import numpy as np
 import pytest
 from sklearn.utils.estimator_checks import parametrize_with_checks
-from xgboost.testing.shared import get_feature_weights, validate_data_initialization
-from xgboost.testing.updater import get_basescore
 
 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.testing.ranking import run_ranking_qid_df
+from xgboost.testing.shared import get_feature_weights, validate_data_initialization
+from xgboost.testing.updater import get_basescore
 
 rng = np.random.RandomState(1994)
 pytestmark = [pytest.mark.skipif(**tm.no_sklearn()), tm.timeout(30)]
@@ -180,6 +181,13 @@ def test_ranking_metric() -> None:
     assert results["validation_0"]["roc_auc_score"][-1] > 0.6
 
 
+@pytest.mark.skipif(**tm.no_pandas())
+def test_ranking_qid_df():
+    import pandas as pd
+
+    run_ranking_qid_df(pd, "hist")
+
+
 def test_stacking_regression():
     from sklearn.datasets import load_diabetes
     from sklearn.ensemble import RandomForestRegressor, StackingRegressor
@@ -1018,14 +1026,18 @@ def test_XGBClassifier_resume():
 
 
 def test_constraint_parameters():
-    reg = xgb.XGBRegressor(interaction_constraints='[[0, 1], [2, 3, 4]]')
+    reg = xgb.XGBRegressor(interaction_constraints="[[0, 1], [2, 3, 4]]")
     X = np.random.randn(10, 10)
     y = np.random.randn(10)
     reg.fit(X, y)
 
     config = json.loads(reg.get_booster().save_config())
-    assert config['learner']['gradient_booster']['updater']['grow_colmaker'][
-        'train_param']['interaction_constraints'] == '[[0, 1], [2, 3, 4]]'
+    assert (
+        config["learner"]["gradient_booster"]["tree_train_param"][
+            "interaction_constraints"
+        ]
+        == "[[0, 1], [2, 3, 4]]"
+    )
 
 
 def test_parameter_validation():
diff --git a/tests/test_distributed/test_federated/test_federated.py b/tests/test_distributed/test_federated/test_federated.py
index a534b8121..9b8e55915 100644
--- a/tests/test_distributed/test_federated/test_federated.py
+++ b/tests/test_distributed/test_federated/test_federated.py
@@ -3,9 +3,8 @@ import multiprocessing
 import sys
 import time
 
-import xgboost.federated
-
 import xgboost as xgb
+import xgboost.federated
 
 SERVER_KEY = 'server-key.pem'
 SERVER_CERT = 'server-cert.pem'
diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py
index cf36e92b2..2e3b031c1 100644
--- a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py
+++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py
@@ -10,10 +10,10 @@ import numpy as np
 import pytest
 from hypothesis import given, note, settings, strategies
 from hypothesis._settings import duration
-from xgboost.testing.params import hist_parameter_strategy
 
 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.testing.params import hist_parameter_strategy
 
 pytestmark = [
     pytest.mark.skipif(**tm.no_dask()),
@@ -42,9 +42,9 @@ try:
     from dask import array as da
     from dask.distributed import Client
     from dask_cuda import LocalCUDACluster
-    from xgboost.testing.dask import check_init_estimation
 
     from xgboost import dask as dxgb
+    from xgboost.testing.dask import check_init_estimation
 except ImportError:
     pass
 
diff --git a/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py b/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py
index db0650f09..1f986f96e 100644
--- a/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py
+++ b/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py
@@ -12,6 +12,7 @@ pytestmark = pytest.mark.skipif(**tm.no_spark())
 from pyspark.ml.linalg import Vectors
 from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
 from pyspark.sql import SparkSession
+
 from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
 
 gpu_discovery_script_path = "tests/test_distributed/test_gpu_with_spark/discover_gpu.sh"
diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py
index ba76c04db..369dcd421 100644
--- a/tests/test_distributed/test_with_dask/test_with_dask.py
+++ b/tests/test_distributed/test_with_dask/test_with_dask.py
@@ -21,6 +21,9 @@ import scipy
 import sklearn
 from hypothesis import HealthCheck, given, note, settings
 from sklearn.datasets import make_classification, make_regression
+
+import xgboost as xgb
+from xgboost import testing as tm
 from xgboost.data import _is_cudf_df
 from xgboost.testing.params import hist_parameter_strategy
 from xgboost.testing.shared import (
@@ -29,9 +32,6 @@ from xgboost.testing.shared import (
     validate_leaf_output,
 )
 
-import xgboost as xgb
-from xgboost import testing as tm
-
 pytestmark = [tm.timeout(60), pytest.mark.skipif(**tm.no_dask())]
 
 import dask
@@ -39,6 +39,7 @@ import dask.array as da
 import dask.dataframe as dd
 from distributed import Client, LocalCluster
 from toolz import sliding_window  # dependency of dask
+
 from xgboost.dask import DaskDMatrix
 from xgboost.testing.dask import check_init_estimation
 
diff --git a/tests/test_distributed/test_with_spark/test_data.py b/tests/test_distributed/test_with_spark/test_data.py
index af6732df7..b08fcdf1d 100644
--- a/tests/test_distributed/test_with_spark/test_data.py
+++ b/tests/test_distributed/test_with_spark/test_data.py
@@ -8,6 +8,7 @@ from xgboost import testing as tm
 
 pytestmark = [pytest.mark.skipif(**tm.no_spark())]
 
+from xgboost import DMatrix, QuantileDMatrix
 from xgboost.spark.data import (
     _read_csr_matrix_from_unwrapped_spark_vec,
     alias,
@@ -15,8 +16,6 @@ from xgboost.spark.data import (
     stack_series,
 )
 
-from xgboost import DMatrix, QuantileDMatrix
-
 
 def test_stack() -> None:
     a = pd.DataFrame({"a": [[1, 2], [3, 4]]})
diff --git a/tests/test_distributed/test_with_spark/test_spark_local.py b/tests/test_distributed/test_with_spark/test_spark_local.py
index 27f1ef06f..a8c64713f 100644
--- a/tests/test_distributed/test_with_spark/test_spark_local.py
+++ b/tests/test_distributed/test_with_spark/test_spark_local.py
@@ -8,10 +8,10 @@ from typing import Generator, Sequence, Type
 
 import numpy as np
 import pytest
-from xgboost.spark.data import pred_contribs
 
 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.spark.data import pred_contribs
 
 pytestmark = [tm.timeout(60), pytest.mark.skipif(**tm.no_spark())]
 
@@ -23,6 +23,8 @@ from pyspark.ml.linalg import Vectors
 from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
 from pyspark.sql import SparkSession
 from pyspark.sql import functions as spark_sql_func
+
+from xgboost import XGBClassifier, XGBModel, XGBRegressor
 from xgboost.spark import (
     SparkXGBClassifier,
     SparkXGBClassifierModel,
@@ -32,8 +34,6 @@ from xgboost.spark import (
 )
 from xgboost.spark.core import _non_booster_params
 
-from xgboost import XGBClassifier, XGBModel, XGBRegressor
-
 from .utils import SparkTestCase
 
 logging.getLogger("py4j").setLevel(logging.INFO)
@@ -730,6 +730,16 @@ class TestPySparkLocal:
         train_params = py_cls._get_distributed_train_params(clf_data.cls_df_train)
         assert train_params["tree_method"] == "gpu_hist"
 
+    def test_classifier_with_list_eval_metric(self, clf_data: ClfData) -> None:
+        classifier = SparkXGBClassifier(eval_metric=["auc", "rmse"])
+        model = classifier.fit(clf_data.cls_df_train)
+        model.transform(clf_data.cls_df_test).collect()
+
+    def test_classifier_with_string_eval_metric(self, clf_data: ClfData) -> None:
+        classifier = SparkXGBClassifier(eval_metric="auc")
+        model = classifier.fit(clf_data.cls_df_train)
+        model.transform(clf_data.cls_df_test).collect()
+
 
 class XgboostLocalTest(SparkTestCase):
     def setUp(self):
diff --git a/tests/test_distributed/test_with_spark/test_spark_local_cluster.py b/tests/test_distributed/test_with_spark/test_spark_local_cluster.py
index cd8acbb6e..528b770ff 100644
--- a/tests/test_distributed/test_with_spark/test_spark_local_cluster.py
+++ b/tests/test_distributed/test_with_spark/test_spark_local_cluster.py
@@ -11,6 +11,7 @@ from xgboost import testing as tm
 pytestmark = pytest.mark.skipif(**tm.no_spark())
 
 from pyspark.ml.linalg import Vectors
+
 from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
 from xgboost.spark.utils import _get_max_num_concurrent_tasks
 
@@ -421,10 +422,10 @@ class XgboostLocalClusterTestCase(SparkLocalClusterTestCase):
         self.assertTrue(hasattr(classifier, "max_depth"))
         self.assertEqual(classifier.getOrDefault(classifier.max_depth), 7)
         booster_config = json.loads(model.get_booster().save_config())
-        max_depth = booster_config["learner"]["gradient_booster"]["updater"][
-            "grow_histmaker"
-        ]["train_param"]["max_depth"]
-        self.assertEqual(int(max_depth), 7)
+        max_depth = booster_config["learner"]["gradient_booster"]["tree_train_param"][
+            "max_depth"
+        ]
+        assert int(max_depth) == 7
 
     def test_repartition(self):
         # The following test case has a few partitioned datasets that are either
diff --git a/tests/test_distributed/test_with_spark/utils.py b/tests/test_distributed/test_with_spark/utils.py
index 847316fea..adc6b6069 100644
--- a/tests/test_distributed/test_with_spark/utils.py
+++ b/tests/test_distributed/test_with_spark/utils.py
@@ -13,6 +13,7 @@ from xgboost import testing as tm
 pytestmark = [pytest.mark.skipif(**tm.no_spark())]
 
 from pyspark.sql import SparkSession
+
 from xgboost.spark.utils import _get_default_params_from_func