temp merge, disable 1 line, SetValid

2023-10-12 16:16:44 -07:00
parent 2e7e9d3b2d 85d3017ca5
commit ea19555474
492 changed files with 15533 additions and 9376 deletions
--- a/tests/buildkite/build-containers.sh
+++ b/tests/buildkite/build-containers.sh
@@ -20,16 +20,18 @@ case "${container}" in
  cpu)
    ;;

-  gpu|rmm)
+  gpu)
    BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
    BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
-    if [[ $container == "rmm" ]]
-    then
-      BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION"
-    fi
    ;;

-  gpu_build_centos7|jvm_gpu_build)
+  gpu_build_centos7)
+    BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
+    BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION"
+    BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
+    ;;
+
+  jvm_gpu_build)
    BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
    BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION"
    ;;
--- a/tests/buildkite/build-cuda-with-rmm.sh
+++ b/tests/buildkite/build-cuda-with-rmm.sh
@@ -2,9 +2,11 @@

 set -euo pipefail

+WHEEL_TAG=manylinux2014_x86_64
+
 source tests/buildkite/conftest.sh

-echo "--- Build with CUDA ${CUDA_VERSION}, RMM enabled"
+echo "--- Build with CUDA ${CUDA_VERSION} with RMM"

 if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]]
 then
@@ -13,14 +15,40 @@ else
  arch_flag=""
 fi

-command_wrapper="tests/ci_build/ci_build.sh rmm docker --build-arg "`
+command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg "`
                `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "`
-                `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION --build-arg "`
-                `"NCCL_VERSION_ARG=$NCCL_VERSION"
+                `"NCCL_VERSION_ARG=$NCCL_VERSION --build-arg "`
+                `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION"

 echo "--- Build libxgboost from the source"
-$command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=gpu_test -DUSE_CUDA=ON \
-  -DUSE_NCCL=ON -DPLUGIN_RMM=ON ${arch_flag}
+$command_wrapper tests/ci_build/prune_libnccl.sh
+$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm" \
+  -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \
+  -DPLUGIN_RMM=ON -DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \
+  -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag}
+echo "--- Build binary wheel"
+$command_wrapper bash -c \
+  "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/"
+$command_wrapper python tests/ci_build/rename_whl.py python-package/dist/*.whl \
+  ${BUILDKITE_COMMIT} ${WHEEL_TAG}
+
+echo "--- Audit binary wheel to ensure it's compliant with manylinux2014 standard"
+tests/ci_build/ci_build.sh auditwheel_x86_64 docker auditwheel repair \
+  --plat ${WHEEL_TAG} python-package/dist/*.whl
+$command_wrapper python tests/ci_build/rename_whl.py wheelhouse/*.whl \
+  ${BUILDKITE_COMMIT} ${WHEEL_TAG}
+mv -v wheelhouse/*.whl python-package/dist/
+# Make sure that libgomp.so is vendored in the wheel
+tests/ci_build/ci_build.sh auditwheel_x86_64 docker bash -c \
+  "unzip -l python-package/dist/*.whl | grep libgomp  || exit -1"
+
+echo "--- Upload Python wheel"
+buildkite-agent artifact upload python-package/dist/*.whl
+if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
+then
+  aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/experimental_build_with_rmm/ \
+    --acl public-read --no-progress
+fi

 echo "-- Stash C++ test executable (testxgboost)"
 buildkite-agent artifact upload build/testxgboost
--- a/tests/buildkite/build-cuda.sh
+++ b/tests/buildkite/build-cuda.sh
@@ -17,11 +17,12 @@ fi

 command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg "`
                `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "`
-                `"NCCL_VERSION_ARG=$NCCL_VERSION"
+                `"NCCL_VERSION_ARG=$NCCL_VERSION --build-arg "`
+                `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION"

 echo "--- Build libxgboost from the source"
 $command_wrapper tests/ci_build/prune_libnccl.sh
-$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH=/opt/grpc \
+$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH="/opt/grpc" \
  -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \
  -DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \
  -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag}
--- a/tests/buildkite/build-win64-gpu.ps1
+++ b/tests/buildkite/build-win64-gpu.ps1
@@ -13,7 +13,7 @@ if ( $is_release_branch -eq 0 ) {
 mkdir build
 cd build
 cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DCMAKE_VERBOSE_MAKEFILE=ON `
-  -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON ${arch_flag}
+  -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON ${arch_flag}
 $msbuild = -join @(
  "C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\MSBuild\\Current"
  "\\Bin\\MSBuild.exe"
--- a/tests/buildkite/conftest.sh
+++ b/tests/buildkite/conftest.sh
@@ -24,7 +24,7 @@ set -x

 CUDA_VERSION=11.8.0
 NCCL_VERSION=2.16.5-1
-RAPIDS_VERSION=23.06
+RAPIDS_VERSION=23.08
 SPARK_VERSION=3.4.0
 JDK_VERSION=8

--- a/tests/buildkite/pipeline-mgpu.yml
+++ b/tests/buildkite/pipeline-mgpu.yml
@@ -12,7 +12,7 @@ steps:
      queue: pipeline-loader
  - wait
  - block: ":rocket: Run this test job"
-    if: build.pull_request.id != null
+    if: build.pull_request.id != null || build.branch =~ /^dependabot\//
  #### -------- CONTAINER BUILD --------
  - label: ":docker: Build containers"
    commands:
--- a/tests/buildkite/pipeline-win64.yml
+++ b/tests/buildkite/pipeline-win64.yml
@@ -6,7 +6,7 @@ steps:
      queue: pipeline-loader
  - wait
  - block: ":rocket: Run this test job"
-    if: build.pull_request.id != null
+    if: build.pull_request.id != null || build.branch =~ /^dependabot\//
  #### -------- BUILD --------
  - label: ":windows: Build XGBoost for Windows with CUDA"
    command: "tests/buildkite/build-win64-gpu.ps1"
--- a/tests/buildkite/pipeline.yml
+++ b/tests/buildkite/pipeline.yml
@@ -9,14 +9,13 @@ steps:
      queue: pipeline-loader
  - wait
  - block: ":rocket: Run this test job"
-    if: build.pull_request.id != null
+    if: build.pull_request.id != null || build.branch =~ /^dependabot\//
  #### -------- CONTAINER BUILD --------
  - label: ":docker: Build containers"
    commands:
      - "tests/buildkite/build-containers.sh cpu"
      - "tests/buildkite/build-containers.sh gpu"
      - "tests/buildkite/build-containers.sh gpu_build_centos7"
-      - "tests/buildkite/build-containers.sh rmm"
    key: build-containers
    agents:
      queue: linux-amd64-cpu
--- a/tests/buildkite/test-cpp-gpu.sh
+++ b/tests/buildkite/test-cpp-gpu.sh
@@ -16,8 +16,8 @@ echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled"
 rm -rfv build/
 buildkite-agent artifact download "build/testxgboost" . --step build-cuda-with-rmm
 chmod +x build/testxgboost
-tests/ci_build/ci_build.sh rmm nvidia-docker \
+tests/ci_build/ci_build.sh gpu nvidia-docker \
  --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
  --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
-  --build-arg NCCL_VERSION_ARG=$NCCL_VERSION bash -c \
-  "source activate gpu_test && build/testxgboost --use-rmm-pool"
+  --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \
+  build/testxgboost --use-rmm-pool
--- a/tests/ci_build/Dockerfile.gpu_build_centos7
+++ b/tests/ci_build/Dockerfile.gpu_build_centos7
@@ -2,6 +2,7 @@ ARG CUDA_VERSION_ARG
 FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
 ARG CUDA_VERSION_ARG
 ARG NCCL_VERSION_ARG
+ARG RAPIDS_VERSION_ARG

 # Install all basic requirements
 RUN \
@@ -16,8 +17,8 @@ RUN \
    bash conda.sh -b -p /opt/mambaforge && \
    /opt/mambaforge/bin/python -m pip install awscli && \
    # CMake
-    wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \
-    bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr
+    wget -nv -nc https://cmake.org/files/v3.26/cmake-3.26.4-linux-x86_64.sh --no-check-certificate && \
+    bash cmake-3.26.4-linux-x86_64.sh --skip-license --prefix=/usr

 # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
 RUN \
@@ -33,9 +34,21 @@ ENV PATH=/opt/mambaforge/bin:/usr/local/ninja:$PATH
 ENV CC=/opt/rh/devtoolset-9/root/usr/bin/gcc
 ENV CXX=/opt/rh/devtoolset-9/root/usr/bin/c++
 ENV CPP=/opt/rh/devtoolset-9/root/usr/bin/cpp
+ENV CUDAHOSTCXX=/opt/rh/devtoolset-9/root/usr/bin/c++

 ENV GOSU_VERSION 1.10

+# Install RMM
+RUN git clone -b v${RAPIDS_VERSION_ARG}.00 https://github.com/rapidsai/rmm.git --recurse-submodules --depth 1 && \
+    pushd rmm && \
+    mkdir build && \
+    pushd build && \
+    cmake .. -GNinja -DCMAKE_INSTALL_PREFIX=/opt/rmm -DCUDA_STATIC_RUNTIME=ON && \
+    cmake --build . --target install && \
+    popd && \
+    popd && \
+    rm -rf rmm
+
 # Install gRPC
 RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \
      --recurse-submodules --depth 1 && \
--- a/tests/ci_build/Dockerfile.rmm
+++ b/tests/ci_build/Dockerfile.rmm
@@ -1,49 +0,0 @@
-ARG CUDA_VERSION_ARG
-FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu20.04
-ARG CUDA_VERSION_ARG
-ARG RAPIDS_VERSION_ARG
-ARG NCCL_VERSION_ARG
-
-# Environment
-ENV DEBIAN_FRONTEND noninteractive
-SHELL ["/bin/bash", "-c"]   # Use Bash as shell
-
-# Install all basic requirements
-RUN \
-    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub && \
-    apt-get update && \
-    apt-get install -y wget unzip bzip2 libgomp1 build-essential ninja-build git && \
-    # Python
-    wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \
-    bash conda.sh -b -p /opt/mambaforge
-
-# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
-RUN \
-    export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
-    export NCCL_VERSION=$NCCL_VERSION_ARG && \
-    apt-get update && \
-    apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT}
-
-ENV PATH=/opt/mambaforge/bin:$PATH
-
-# Create new Conda environment with RMM
-RUN \
-    conda install -c conda-forge mamba && \
-    mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
-        python=3.10 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake && \
-    mamba clean --all
-
-ENV GOSU_VERSION 1.10
-
-# Install lightweight sudo (not bound to TTY)
-RUN set -ex; \
-    wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
-    chmod +x /usr/local/bin/gosu && \
-    gosu nobody true
-
-# Default entry-point to use if running locally
-# It will preserve attributes of created files
-COPY entrypoint.sh /scripts/
-
-WORKDIR /workspace
-ENTRYPOINT ["/scripts/entrypoint.sh"]
--- a/tests/ci_build/build_python_wheels.sh
+++ b/tests/ci_build/build_python_wheels.sh
@@ -35,7 +35,7 @@ if [[ "$platform_id" == macosx_* ]]; then
        # MacOS, Intel
        wheel_tag=macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64
        cpython_ver=38
-        export MACOSX_DEPLOYMENT_TARGET=10.13
+        export MACOSX_DEPLOYMENT_TARGET=10.15
        #OPENMP_URL="https://anaconda.org/conda-forge/llvm-openmp/11.1.0/download/osx-64/llvm-openmp-11.1.0-hda6cdc1_1.tar.bz2"
        OPENMP_URL="https://xgboost-ci-jenkins-artifacts.s3.us-west-2.amazonaws.com/llvm-openmp-11.1.0-hda6cdc1_1-osx-64.tar.bz2"
    else
--- a/tests/ci_build/build_r_pkg_with_cuda.sh
+++ b/tests/ci_build/build_r_pkg_with_cuda.sh
@@ -19,7 +19,6 @@ cmake .. -GNinja -DUSE_CUDA=ON -DR_LIB=ON
 ninja
 cd ..

-rm xgboost
 # This super wacky hack is found in cmake/RPackageInstall.cmake.in and
 # cmake/RPackageInstallTargetSetup.cmake. This hack lets us bypass the normal build process of R
 # and have R use xgboost.so that we've already built.
--- a/tests/ci_build/build_r_pkg_with_cuda_win64.sh
+++ b/tests/ci_build/build_r_pkg_with_cuda_win64.sh
@@ -22,7 +22,6 @@ cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DR_LIB=ON -DLIBR_HOME="
 cmake --build . --config Release --parallel
 cd ..

-rm xgboost
 # This super wacky hack is found in cmake/RPackageInstall.cmake.in and
 # cmake/RPackageInstallTargetSetup.cmake. This hack lets us bypass the normal build process of R
 # and have R use xgboost.dll that we've already built.
--- a/tests/ci_build/build_via_cmake.sh
+++ b/tests/ci_build/build_via_cmake.sh
@@ -24,7 +24,7 @@ fi
 rm -rf build
 mkdir build
 cd build
-cmake .. ${cmake_args} -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_ALL_WARNINGS=ON -GNinja ${cmake_prefix_flag} -DHIDE_CXX_SYMBOLS=ON
+cmake .. ${cmake_args} -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_ALL_WARNINGS=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -GNinja ${cmake_prefix_flag} -DHIDE_CXX_SYMBOLS=ON -DBUILD_DEPRECATED_CLI=ON
 ninja clean
 time ninja -v
 cd ..
--- a/tests/ci_build/change_version.py
+++ b/tests/ci_build/change_version.py
@@ -61,8 +61,11 @@ def pypkg(


@cd(R_PACKAGE)
-def rpkg(major: int, minor: int, patch: int) -> None:
-    version = f"{major}.{minor}.{patch}.1"
+def rpkg(major: int, minor: int, patch: int, is_dev: bool) -> None:
+    if is_dev:
+        version = f"{major}.{minor}.{patch}.0"
+    else:
+        version = f"{major}.{minor}.{patch}.1"
    # Version: 2.0.0.1
    desc_path = "DESCRIPTION"
    with open(desc_path, "r") as fd:
@@ -119,8 +122,8 @@ def main(args: argparse.Namespace) -> None:
    minor = args.minor
    patch = args.patch
    rc = args.rc
-    is_rc = args.is_rc == 1
-    is_dev = args.is_dev == 1
+    is_rc = args.is_rc
+    is_dev = args.is_dev
    if is_rc and is_dev:
        raise ValueError("It cannot be both a rc and a dev branch.")
    if is_rc:
@@ -130,7 +133,7 @@ def main(args: argparse.Namespace) -> None:

    cmake(major, minor, patch)
    pypkg(major, minor, patch, rc, is_rc, is_dev)
-    rpkg(major, minor, patch)
+    rpkg(major, minor, patch, is_dev=is_dev)
    jvmpkgs(major, minor, patch, rc, is_rc, is_dev)

    print(
@@ -149,8 +152,8 @@ if __name__ == "__main__":
    parser.add_argument("--minor", type=int)
    parser.add_argument("--patch", type=int)
    parser.add_argument("--rc", type=int, default=0)
-    parser.add_argument("--is-rc", type=int, choices=[0, 1])
-    parser.add_argument("--is-dev", type=int, choices=[0, 1])
+    parser.add_argument("--is-rc", action="store_true")
+    parser.add_argument("--is-dev", action="store_true")
    args = parser.parse_args()
    try:
        main(args)
--- a/tests/ci_build/lint_python.py
+++ b/tests/ci_build/lint_python.py
@@ -23,20 +23,33 @@ class LintersPaths:
        "tests/python/test_predict.py",
        "tests/python/test_quantile_dmatrix.py",
        "tests/python/test_tree_regularization.py",
+        "tests/python/test_shap.py",
        "tests/python-gpu/test_gpu_data_iterator.py",
+        "tests/python-gpu/test_gpu_prediction.py",
+        "tests/python-gpu/load_pickle.py",
+        "tests/python-gpu/test_gpu_pickling.py",
+        "tests/python-gpu/test_gpu_eval_metrics.py",
+        "tests/python-gpu/test_gpu_with_sklearn.py",
        "tests/test_distributed/test_with_spark/",
        "tests/test_distributed/test_gpu_with_spark/",
        # demo
+        "demo/dask/",
+        "demo/rmm_plugin",
        "demo/json-model/json_parser.py",
        "demo/guide-python/cat_in_the_dat.py",
+        "demo/guide-python/callbacks.py",
        "demo/guide-python/categorical.py",
        "demo/guide-python/feature_weights.py",
        "demo/guide-python/sklearn_parallel.py",
        "demo/guide-python/spark_estimator_examples.py",
+        "demo/guide-python/external_memory.py",
        "demo/guide-python/individual_trees.py",
        "demo/guide-python/quantile_regression.py",
        "demo/guide-python/multioutput_regression.py",
        "demo/guide-python/learning_to_rank.py",
+        "demo/guide-python/quantile_data_iterator.py",
+        "demo/guide-python/update_process.py",
+        "demo/aft_survival/aft_survival_viz_demo.py",
        # CI
        "tests/ci_build/lint_python.py",
        "tests/ci_build/test_r_package.py",
@@ -66,6 +79,7 @@ class LintersPaths:
        "tests/python/test_dt.py",
        "tests/python/test_data_iterator.py",
        "tests/python-gpu/test_gpu_data_iterator.py",
+        "tests/python-gpu/load_pickle.py",
        "tests/test_distributed/test_with_spark/test_data.py",
        "tests/test_distributed/test_gpu_with_spark/test_data.py",
        "tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py",
@@ -78,6 +92,7 @@ class LintersPaths:
        "demo/guide-python/quantile_regression.py",
        "demo/guide-python/multioutput_regression.py",
        "demo/guide-python/learning_to_rank.py",
+        "demo/aft_survival/aft_survival_viz_demo.py",
        # CI
        "tests/ci_build/lint_python.py",
        "tests/ci_build/test_r_package.py",
@@ -114,7 +129,13 @@ def run_black(rel_path: str, fix: bool) -> bool:
@cd(PY_PACKAGE)
 def run_isort(rel_path: str, fix: bool) -> bool:
    # Isort gets confused when trying to find the config file, so specified explicitly.
-    cmd = ["isort", "--settings-path", PY_PACKAGE, os.path.join(ROOT, rel_path)]
+    cmd = [
+        "isort",
+        "--settings-path",
+        PY_PACKAGE,
+        f"--src={PY_PACKAGE}",
+        os.path.join(ROOT, rel_path),
+    ]
    if not fix:
        cmd += ["--check"]

--- a/tests/ci_build/lint_r.R
+++ b/tests/ci_build/lint_r.R
@@ -20,15 +20,23 @@ my_linters <- list(
  any_duplicated = lintr::any_duplicated_linter(),
  any_is_na = lintr::any_is_na_linter(),
  assignment_linter = lintr::assignment_linter(),
+  boolean_arithmetic = lintr::boolean_arithmetic_linter(),
  brace_linter = lintr::brace_linter(),
+  class_equals = lintr::class_equals_linter(),
  commas_linter = lintr::commas_linter(),
+  empty_assignment = lintr::empty_assignment_linter(),
  equals_na = lintr::equals_na_linter(),
  fixed_regex = lintr::fixed_regex_linter(),
+  for_loop_index = lintr::for_loop_index_linter(),
+  function_return = lintr::function_return_linter(),
  infix_spaces_linter = lintr::infix_spaces_linter(),
+  is_numeric = lintr::is_numeric_linter(),
  line_length_linter = lintr::line_length_linter(length = 150L),
-  no_tab_linter = lintr::no_tab_linter(),
+  lengths = lintr::lengths_linter(),
+  matrix = lintr::matrix_apply_linter(),
  object_usage_linter = lintr::object_usage_linter(),
  object_length_linter = lintr::object_length_linter(),
+  routine_registration = lintr::routine_registration_linter(),
  semicolon = lintr::semicolon_linter(),
  seq = lintr::seq_linter(),
  spaces_inside_linter = lintr::spaces_inside_linter(),
@@ -37,9 +45,10 @@ my_linters <- list(
  trailing_blank_lines_linter = lintr::trailing_blank_lines_linter(),
  trailing_whitespace_linter = lintr::trailing_whitespace_linter(),
  true_false = lintr::T_and_F_symbol_linter(),
-  unneeded_concatenation = lintr::unneeded_concatenation_linter(),
+  unnecessary_concatenation = lintr::unnecessary_concatenation_linter(),
  unreachable_code = lintr::unreachable_code_linter(),
-  vector_logic = lintr::vector_logic_linter()
+  vector_logic = lintr::vector_logic_linter(),
+  whitespace = lintr::whitespace_linter()
 )

 noquote(paste0(length(FILES_TO_LINT), " R files need linting"))
--- a/tests/ci_build/prune_libnccl.sh
+++ b/tests/ci_build/prune_libnccl.sh
@@ -26,7 +26,7 @@ set_property(TARGET test PROPERTY CUDA_ARCHITECTURES \${CMAKE_CUDA_ARCHITECTURES
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 EOF

-cmake . -GNinja
+cmake . -GNinja -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
 gen_code=$(grep -o -- '--generate-code=\S*' compile_commands.json | paste -sd ' ')

 nvprune ${gen_code} /usr/lib64/libnccl_static.a -o ../libnccl_static.a
--- a/tests/cpp/c_api/test_c_api.cc
+++ b/tests/cpp/c_api/test_c_api.cc
@@ -8,13 +8,20 @@
 #include <xgboost/learner.h>
 #include <xgboost/version_config.h>

-#include <cstddef>  // std::size_t
-#include <limits>   // std::numeric_limits
-#include <string>   // std::string
+#include <array>      // for array
+#include <cstddef>    // std::size_t
+#include <filesystem> // std::filesystem
+#include <limits>     // std::numeric_limits
+#include <string>     // std::string
 #include <vector>

 #include "../../../src/c_api/c_api_error.h"
 #include "../../../src/common/io.h"
+#include "../../../src/data/adapter.h"              // for ArrayAdapter
+#include "../../../src/data/array_interface.h"      // for ArrayInterface
+#include "../../../src/data/gradient_index.h"       // for GHistIndexMatrix
+#include "../../../src/data/iterative_dmatrix.h"    // for IterativeDMatrix
+#include "../../../src/data/sparse_page_dmatrix.h"  // for SparsePageDMatrix
 #include "../helpers.h"

 TEST(CAPI, XGDMatrixCreateFromMatDT) {
@@ -137,9 +144,9 @@ TEST(CAPI, ConfigIO) {
  BoosterHandle handle = learner.get();
  learner->UpdateOneIter(0, p_dmat);

-  char const* out[1];
+  std::array<char const* , 1> out;
  bst_ulong len {0};
-  XGBoosterSaveJsonConfig(handle, &len, out);
+  XGBoosterSaveJsonConfig(handle, &len, out.data());

  std::string config_str_0 { out[0] };
  auto config_0 = Json::Load({config_str_0.c_str(), config_str_0.size()});
@@ -147,7 +154,7 @@ TEST(CAPI, ConfigIO) {

  bst_ulong len_1 {0};
  std::string config_str_1 { out[0] };
-  XGBoosterSaveJsonConfig(handle, &len_1, out);
+  XGBoosterSaveJsonConfig(handle, &len_1, out.data());
  auto config_1 = Json::Load({config_str_1.c_str(), config_str_1.size()});

  ASSERT_EQ(config_0, config_1);
@@ -156,7 +163,7 @@ TEST(CAPI, ConfigIO) {
 TEST(CAPI, JsonModelIO) {
  size_t constexpr kRows = 10;
  size_t constexpr kCols = 10;
-  dmlc::TemporaryDirectory tempdir;
+  auto tempdir = std::filesystem::temp_directory_path();

  auto p_dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
  std::vector<std::shared_ptr<DMatrix>> mat {p_dmat};
@@ -172,19 +179,19 @@ TEST(CAPI, JsonModelIO) {
  learner->UpdateOneIter(0, p_dmat);
  BoosterHandle handle = learner.get();

-  std::string modelfile_0 = tempdir.path + "/model_0.json";
-  XGBoosterSaveModel(handle, modelfile_0.c_str());
-  XGBoosterLoadModel(handle, modelfile_0.c_str());
+  auto modelfile_0 = tempdir / std::filesystem::u8path(u8"모델_0.json");
+  XGBoosterSaveModel(handle, modelfile_0.u8string().c_str());
+  XGBoosterLoadModel(handle, modelfile_0.u8string().c_str());

  bst_ulong num_feature {0};
  ASSERT_EQ(XGBoosterGetNumFeature(handle, &num_feature), 0);
  ASSERT_EQ(num_feature, kCols);

-  std::string modelfile_1 = tempdir.path + "/model_1.json";
-  XGBoosterSaveModel(handle, modelfile_1.c_str());
+  auto modelfile_1 = tempdir / "model_1.json";
+  XGBoosterSaveModel(handle, modelfile_1.u8string().c_str());

-  auto model_str_0 = common::LoadSequentialFile(modelfile_0);
-  auto model_str_1 = common::LoadSequentialFile(modelfile_1);
+  auto model_str_0 = common::LoadSequentialFile(modelfile_0.u8string());
+  auto model_str_1 = common::LoadSequentialFile(modelfile_1.u8string());

  ASSERT_EQ(model_str_0.front(), '{');
  ASSERT_EQ(model_str_0, model_str_1);
@@ -209,8 +216,8 @@ TEST(CAPI, JsonModelIO) {

  std::string buffer;
  Json::Dump(Json::Load(l, std::ios::binary), &buffer);
-  ASSERT_EQ(model_str_0.size() - 1, buffer.size());
-  ASSERT_EQ(model_str_0.back(), '\0');
+  ASSERT_EQ(model_str_0.size(), buffer.size());
+  ASSERT_EQ(model_str_0.back(), '}');
  ASSERT_TRUE(std::equal(model_str_0.begin(), model_str_0.end() - 1, buffer.begin()));

  ASSERT_EQ(XGBoosterSaveModelToBuffer(handle, R"({})", &len, &data), -1);
@@ -266,9 +273,9 @@ TEST(CAPI, DMatrixSetFeatureName) {
    ASSERT_EQ(std::to_string(i), c_out_features[i]);
  }

-  char const* feat_types [] {"i", "q"};
+  std::array<char const *, 2> feat_types{"i", "q"};
  static_assert(sizeof(feat_types) / sizeof(feat_types[0]) == kCols);
-  XGDMatrixSetStrFeatureInfo(handle, "feature_type", feat_types, kCols);
+  XGDMatrixSetStrFeatureInfo(handle, "feature_type", feat_types.data(), kCols);
  char const **c_out_types;
  XGDMatrixGetStrFeatureInfo(handle, u8"feature_type", &out_len,
                             &c_out_types);
@@ -410,4 +417,210 @@ TEST(CAPI, JArgs) {
    ASSERT_THROW({ RequiredArg<String>(args, "null", __func__); }, dmlc::Error);
  }
 }
+
+namespace {
+void MakeLabelForTest(std::shared_ptr<DMatrix> Xy, DMatrixHandle cxy) {
+  auto n_samples = Xy->Info().num_row_;
+  std::vector<float> y(n_samples);
+  for (std::size_t i = 0; i < y.size(); ++i) {
+    y[i] = static_cast<float>(i);
+  }
+  // Store the labels 0, 1, ..., n_samples - 1 on the C++ DMatrix.
+  Xy->Info().labels.Reshape(n_samples);
+  Xy->Info().labels.Data()->HostVector() = y;
+  // Dump what GetArrayInterface returns for that label storage to a JSON string.
+  auto y_int = GetArrayInterface(Xy->Info().labels.Data(), n_samples, 1);
+  std::string s_y_int;
+  Json::Dump(y_int, &s_y_int);
+  // Pass that string to the C API handle under the "label" field.
+  XGDMatrixSetInfoFromInterface(cxy, "label", s_y_int.c_str());
+}
+
+auto MakeSimpleDMatrixForTest(bst_row_t n_samples, bst_feature_t n_features, Json dconfig) {
+  HostDeviceVector<float> storage;
+  auto arr_int = RandomDataGenerator{n_samples, n_features, 0.5f}.GenerateArrayInterface(&storage);
+  // Reference DMatrix: created in C++ from an ArrayAdapter over the generated array interface.
+  data::ArrayAdapter adapter{StringView{arr_int}};
+  std::shared_ptr<DMatrix> Xy{
+      DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), Context{}.Threads())};
+  // Handle under test: created through the C API from the same array interface string.
+  DMatrixHandle p_fmat;
+  std::string s_dconfig;
+  Json::Dump(dconfig, &s_dconfig);
+  CHECK_EQ(XGDMatrixCreateFromDense(arr_int.c_str(), s_dconfig.c_str(), &p_fmat), 0);
+  // Labels are set on the reference matrix and on the handle by the same helper call.
+  MakeLabelForTest(Xy, p_fmat);
+  return std::pair{p_fmat, Xy};  // caller frees p_fmat (see XGDMatrixFree at the call site)
+}
+
+auto MakeQDMForTest(Context const *ctx, bst_row_t n_samples, bst_feature_t n_features,
+                    Json dconfig) {
+  bst_bin_t n_bins{16};
+  dconfig["max_bin"] = Integer{n_bins};  // n_bins is also passed to the reference matrix below
+  // iter_0 feeds the C API handle; the iterator class is chosen by ctx->IsCUDA().
+  std::size_t n_batches{4};
+  std::unique_ptr<ArrayIterForTest> iter_0;
+  if (ctx->IsCUDA()) {
+    iter_0 = std::make_unique<CudaArrayIterForTest>(0.0f, n_samples, n_features, n_batches);
+  } else {
+    iter_0 = std::make_unique<NumpyArrayIterForTest>(0.0f, n_samples, n_features, n_batches);
+  }
+  std::string s_dconfig;
+  Json::Dump(dconfig, &s_dconfig);
+  DMatrixHandle p_fmat;
+  CHECK_EQ(XGQuantileDMatrixCreateFromCallback(static_cast<DataIterHandle>(iter_0.get()),
+                                               iter_0->Proxy(), nullptr, Reset, Next,
+                                               s_dconfig.c_str(), &p_fmat),
+           0);
+  // iter_1 is built with the same arguments and feeds the reference IterativeDMatrix.
+  std::unique_ptr<ArrayIterForTest> iter_1;
+  if (ctx->IsCUDA()) {
+    iter_1 = std::make_unique<CudaArrayIterForTest>(0.0f, n_samples, n_features, n_batches);
+  } else {
+    iter_1 = std::make_unique<NumpyArrayIterForTest>(0.0f, n_samples, n_features, n_batches);
+  }
+  auto Xy =
+      std::make_shared<data::IterativeDMatrix>(iter_1.get(), iter_1->Proxy(), nullptr, Reset, Next,
+                                               std::numeric_limits<float>::quiet_NaN(), 0, n_bins);
+  return std::pair{p_fmat, Xy};  // NOTE(review): iterators are destroyed here - confirm OK
+}
+
+auto MakeExtMemForTest(bst_row_t n_samples, bst_feature_t n_features, Json dconfig) {
+  std::size_t n_batches{4};
+  NumpyArrayIterForTest iter_0{0.0f, n_samples, n_features, n_batches};
+  std::string s_dconfig;
+  dconfig["cache_prefix"] = String{"cache"};
+  Json::Dump(dconfig, &s_dconfig);
+  DMatrixHandle p_fmat;
+  CHECK_EQ(XGDMatrixCreateFromCallback(static_cast<DataIterHandle>(&iter_0), iter_0.Proxy(), Reset,
+                                       Next, s_dconfig.c_str(), &p_fmat),
+           0);
+  // Reference matrix: built in C++ from a second iterator constructed with the same arguments.
+  NumpyArrayIterForTest iter_1{0.0f, n_samples, n_features, n_batches};
+  auto Xy = std::make_shared<data::SparsePageDMatrix>(
+      &iter_1, iter_1.Proxy(), Reset, Next, std::numeric_limits<float>::quiet_NaN(), 0, "");
+  MakeLabelForTest(Xy, p_fmat);
+  return std::pair{p_fmat, Xy};  // NOTE(review): iter_0/iter_1 are stack locals - confirm OK
+}
+
+template <typename Page>  // Page: batch type whose Cuts() are compared with the outputs
+void CheckResult(Context const *ctx, bst_feature_t n_features, std::shared_ptr<DMatrix> Xy,
+                 float const *out_data, std::uint64_t const *out_indptr) {
+  for (auto const &page : Xy->GetBatches<Page>(ctx, BatchParam{16, 0.2})) {
+    auto const &cut = page.Cuts();
+    auto const &ptrs = cut.Ptrs();
+    auto const &vals = cut.Values();
+    auto const &mins = cut.MinValues();
+    for (bst_feature_t f = 0; f < Xy->Info().num_col_; ++f) {
+      ASSERT_EQ(ptrs[f] + f, out_indptr[f]);  // each earlier feature adds one extra slot
+      ASSERT_EQ(mins[f], out_data[out_indptr[f]]);  // segment begins with the min value
+      auto beg = out_indptr[f];
+      auto end = out_indptr[f + 1];
+      auto val_beg = ptrs[f];
+      for (std::uint64_t i = beg + 1, j = val_beg; i < end; ++i, ++j) {
+        ASSERT_EQ(vals[j], out_data[i]);  // remaining entries mirror the stored cut values
+      }
+    }
+    // Final offset: every cut value plus one leading min value per feature.
+    ASSERT_EQ(ptrs[n_features] + n_features, out_indptr[n_features]);
+  }
+}
+
+void TestXGDMatrixGetQuantileCut(Context const *ctx) {
+  // Exercises XGDMatrixGetQuantileCut on three DMatrix kinds, using the device given by `ctx`.
+  bst_row_t n_samples{1024};
+  bst_feature_t n_features{16};
+  Json dconfig{Object{}};
+  dconfig["nthread"] = Integer{Context{}.Threads()};
+  dconfig["missing"] = Number{std::numeric_limits<float>::quiet_NaN()};
+  // Parses both returned array-interface strings and compares them with the cuts held by Xy.
+  auto check_result = [n_features, &ctx](std::shared_ptr<DMatrix> Xy, StringView s_out_data,
+                                         StringView s_out_indptr) {
+    auto i_out_data = ArrayInterface<1, false>{s_out_data};
+    ASSERT_EQ(i_out_data.type, ArrayInterfaceHandler::kF4);
+    auto out_data = static_cast<float const *>(i_out_data.data);
+    ASSERT_TRUE(out_data);
+
+    auto i_out_indptr = ArrayInterface<1, false>{s_out_indptr};
+    ASSERT_EQ(i_out_indptr.type, ArrayInterfaceHandler::kU8);
+    auto out_indptr = static_cast<std::uint64_t const *>(i_out_indptr.data);
+    ASSERT_TRUE(out_indptr);
+
+    if (ctx->IsCPU()) {
+      CheckResult<GHistIndexMatrix>(ctx, n_features, Xy, out_data, out_indptr);
+    } else {
+      CheckResult<EllpackPage>(ctx, n_features, Xy, out_data, out_indptr);
+    }
+  };
+
+  Json config{Null{}};
+  std::string s_config;
+  Json::Dump(config, &s_config);
+  char const *out_indptr{nullptr};
+  char const *out_data{nullptr};
+
+  {
+    // SimpleDMatrix
+    auto [p_fmat, Xy] = MakeSimpleDMatrixForTest(n_samples, n_features, dconfig);
+    // Expected to fail (-1): no quantile cuts exist before the booster update below.
+    ASSERT_EQ(XGDMatrixGetQuantileCut(p_fmat, s_config.c_str(), &out_indptr, &out_data), -1);
+
+    std::array<DMatrixHandle, 1> mats{p_fmat};
+    BoosterHandle booster;
+    ASSERT_EQ(XGBoosterCreate(mats.data(), 1, &booster), 0);
+    ASSERT_EQ(XGBoosterSetParam(booster, "max_bin", "16"), 0);
+    if (ctx->IsCUDA()) {
+      ASSERT_EQ(XGBoosterSetParam(booster, "device", ctx->DeviceName().c_str()), 0);
+    }
+    ASSERT_EQ(XGBoosterUpdateOneIter(booster, 0, p_fmat), 0);
+    ASSERT_EQ(XGDMatrixGetQuantileCut(p_fmat, s_config.c_str(), &out_indptr, &out_data), 0);
+
+    check_result(Xy, out_data, out_indptr);
+
+    XGDMatrixFree(p_fmat);
+    XGBoosterFree(booster);
+  }
+
+  {
+    // IterativeDMatrix: the call is expected to succeed without any training step.
+    auto [p_fmat, Xy] = MakeQDMForTest(ctx, n_samples, n_features, dconfig);
+    ASSERT_EQ(XGDMatrixGetQuantileCut(p_fmat, s_config.c_str(), &out_indptr, &out_data), 0);
+
+    check_result(Xy, out_data, out_indptr);
+    XGDMatrixFree(p_fmat);
+  }
+
+  {
+    // SparsePageDMatrix
+    auto [p_fmat, Xy] = MakeExtMemForTest(n_samples, n_features, dconfig);
+    // Expected to fail (-1): no quantile cuts exist before the booster update below.
+    ASSERT_EQ(XGDMatrixGetQuantileCut(p_fmat, s_config.c_str(), &out_indptr, &out_data), -1);
+
+    std::array<DMatrixHandle, 1> mats{p_fmat};
+    BoosterHandle booster;
+    ASSERT_EQ(XGBoosterCreate(mats.data(), 1, &booster), 0);
+    ASSERT_EQ(XGBoosterSetParam(booster, "max_bin", "16"), 0);
+    if (ctx->IsCUDA()) {
+      ASSERT_EQ(XGBoosterSetParam(booster, "device", ctx->DeviceName().c_str()), 0);
+    }
+    ASSERT_EQ(XGBoosterUpdateOneIter(booster, 0, p_fmat), 0);
+    ASSERT_EQ(XGDMatrixGetQuantileCut(p_fmat, s_config.c_str(), &out_indptr, &out_data), 0);
+    // NOTE(review): unlike the two cases above, the returned cuts are not compared with Xy.
+    XGDMatrixFree(p_fmat);
+    XGBoosterFree(booster);
+  }
+}
+}  // namespace
+
+TEST(CAPI, XGDMatrixGetQuantileCut) {
+  Context ctx;  // default-constructed; presumably selects the CPU path - confirm
+  TestXGDMatrixGetQuantileCut(&ctx);  // shared test body, also used by the GPU variant
+}
+
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
+TEST(CAPI, GPUXGDMatrixGetQuantileCut) {
+  auto ctx = MakeCUDACtx(0);
+  TestXGDMatrixGetQuantileCut(&ctx);  // same test body as the non-GPU test above
+}
+#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
 }  // namespace xgboost
--- a/tests/cpp/collective/test_nccl_device_communicator.cu
+++ b/tests/cpp/collective/test_nccl_device_communicator.cu
@@ -5,26 +5,22 @@

 #include <gtest/gtest.h>

+#include <bitset>
 #include <string>  // for string

 #if defined(XGBOOST_USE_NCCL)
-#include "../../../src/collective/nccl_device_communicator.cuh"
 #include "../../../src/collective/communicator-inl.cuh"
+#include "../../../src/collective/nccl_device_communicator.cuh"
 #elif defined(XGBOOST_USE_RCCL)
-#include "../../../src/collective/nccl_device_communicator.hip.h"
 #include "../../../src/collective/communicator-inl.hip.h"
+#include "../../../src/collective/nccl_device_communicator.hip.h"
 #endif

 namespace xgboost {
 namespace collective {

 TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidDeviceOrdinal) {
-  auto construct = []() { NcclDeviceCommunicator comm{-1, nullptr}; };
-  EXPECT_THROW(construct(), dmlc::Error);
-}
-
-TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidCommunicator) {
-  auto construct = []() { NcclDeviceCommunicator comm{0, nullptr}; };
+  auto construct = []() { NcclDeviceCommunicator comm{-1, false}; };
  EXPECT_THROW(construct(), dmlc::Error);
 }

@@ -36,6 +32,72 @@ TEST(NcclDeviceCommunicatorSimpleTest, SystemError) {
    ASSERT_TRUE(str.find("environment variables") != std::string::npos);
  }
 }
+
+namespace {
+void VerifyAllReduceBitwiseAND() {
+  auto const rank = collective::GetRank();
+  std::bitset<64> original{};
+  original[rank] = true;
+  HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank);
+  collective::AllReduce<collective::Operation::kBitwiseAND>(rank, buffer.DevicePointer(), 1);
+  collective::Synchronize(rank);
+  EXPECT_EQ(buffer.HostVector()[0], 0ULL);
+}
+}  // anonymous namespace
+
+TEST(NcclDeviceCommunicator, MGPUAllReduceBitwiseAND) {
+  auto const n_gpus = common::AllVisibleGPUs();
+  if (n_gpus <= 1) {
+    GTEST_SKIP() << "Skipping MGPUAllReduceBitwiseAND test with # GPUs = " << n_gpus;
+  }
+  auto constexpr kUseNccl = true;
+  RunWithInMemoryCommunicator<kUseNccl>(n_gpus, VerifyAllReduceBitwiseAND);
+}
+
+namespace {
+void VerifyAllReduceBitwiseOR() {
+  auto const world_size = collective::GetWorldSize();
+  auto const rank = collective::GetRank();
+  std::bitset<64> original{};
+  original[rank] = true;
+  HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank);
+  collective::AllReduce<collective::Operation::kBitwiseOR>(rank, buffer.DevicePointer(), 1);
+  collective::Synchronize(rank);
+  EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1);
+}
+}  // anonymous namespace
+
+TEST(NcclDeviceCommunicator, MGPUAllReduceBitwiseOR) {
+  auto const n_gpus = common::AllVisibleGPUs();
+  if (n_gpus <= 1) {
+    GTEST_SKIP() << "Skipping MGPUAllReduceBitwiseOR test with # GPUs = " << n_gpus;
+  }
+  auto constexpr kUseNccl = true;
+  RunWithInMemoryCommunicator<kUseNccl>(n_gpus, VerifyAllReduceBitwiseOR);
+}
+
+namespace {
+void VerifyAllReduceBitwiseXOR() {
+  auto const world_size = collective::GetWorldSize();
+  auto const rank = collective::GetRank();
+  // All ones except this rank's bit: XOR keeps a bit iff an odd number of ranks set it.
+  HostDeviceVector<uint64_t> buffer({~(1ULL << rank)}, rank);
+  collective::AllReduce<collective::Operation::kBitwiseXOR>(rank, buffer.DevicePointer(), 1);
+  collective::Synchronize(rank);
+  auto const low_bits = (1ULL << world_size) - 1;  // set by n-1 ranks; the rest by all n
+  EXPECT_EQ(buffer.HostVector()[0], world_size % 2 == 0 ? low_bits : ~low_bits);
+}
+}  // anonymous namespace
+
+TEST(NcclDeviceCommunicator, MGPUAllReduceBitwiseXOR) {
+  auto const n_gpus = common::AllVisibleGPUs();
+  if (n_gpus <= 1) {
+    GTEST_SKIP() << "Skipping MGPUAllReduceBitwiseXOR test with # GPUs = " << n_gpus;
+  }
+  auto constexpr kUseNccl = true;
+  RunWithInMemoryCommunicator<kUseNccl>(n_gpus, VerifyAllReduceBitwiseXOR);
+}
+
 }  // namespace collective
 }  // namespace xgboost

--- a/tests/cpp/collective/test_socket.cc
+++ b/tests/cpp/collective/test_socket.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright (c) 2022 by XGBoost Contributors
+/**
+ * Copyright 2022-2023 by XGBoost Contributors
 */
 #include <gtest/gtest.h>
 #include <xgboost/collective/socket.h>
@@ -10,8 +10,7 @@

 #include "../helpers.h"

-namespace xgboost {
-namespace collective {
+namespace xgboost::collective {
 TEST(Socket, Basic) {
  system::SocketStartup();

@@ -31,15 +30,16 @@ TEST(Socket, Basic) {
    TCPSocket client;
    if (domain == SockDomain::kV4) {
      auto const& addr = SockAddrV4::Loopback().Addr();
-      ASSERT_EQ(Connect(MakeSockAddress(StringView{addr}, port), &client), std::errc{});
+      auto rc = Connect(StringView{addr}, port, 1, std::chrono::seconds{3}, &client);
+      ASSERT_TRUE(rc.OK()) << rc.Report();
    } else {
      auto const& addr = SockAddrV6::Loopback().Addr();
-      auto rc = Connect(MakeSockAddress(StringView{addr}, port), &client);
+      auto rc = Connect(StringView{addr}, port, 1, std::chrono::seconds{3}, &client);
      // some environment (docker) has restricted network configuration.
-      if (rc == std::error_code{EADDRNOTAVAIL, std::system_category()}) {
+      if (!rc.OK() && rc.Code() == std::error_code{EADDRNOTAVAIL, std::system_category()}) {
        GTEST_SKIP_(msg.c_str());
      }
-      ASSERT_EQ(rc, std::errc{});
+      ASSERT_EQ(rc, Success()) << rc.Report();
    }
    ASSERT_EQ(client.Domain(), domain);

@@ -73,5 +73,4 @@ TEST(Socket, Basic) {

  system::SocketFinalize();
 }
-}  // namespace collective
-}  // namespace xgboost
+}  // namespace xgboost::collective
--- a/tests/cpp/common/test_algorithm.cu
+++ b/tests/cpp/common/test_algorithm.cu
@@ -21,8 +21,7 @@
 namespace xgboost {
 namespace common {
 void TestSegmentedArgSort() {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);

  size_t constexpr kElements = 100, kGroups = 3;
  dh::device_vector<size_t> sorted_idx(kElements, 0);
@@ -60,8 +59,7 @@ void TestSegmentedArgSort() {
 TEST(Algorithm, SegmentedArgSort) { TestSegmentedArgSort(); }

 TEST(Algorithm, GpuArgSort) {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);

  dh::device_vector<float> values(20);
  dh::Iota(dh::ToSpan(values));                                    // accending
--- a/tests/cpp/common/test_bitfield.cc
+++ b/tests/cpp/common/test_bitfield.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019 XGBoost contributors
+/**
+ * Copyright 2019-2023, XGBoost contributors
 */
 #include <gtest/gtest.h>
 #include "../../../src/common/bitfield.h"
@@ -14,7 +14,7 @@ TEST(BitField, Check) {
                static_cast<typename common::Span<LBitField64::value_type>::index_type>(
                    storage.size())});
    size_t true_bit = 190;
-    for (size_t i = true_bit + 1; i < bits.Size(); ++i) {
+    for (size_t i = true_bit + 1; i < bits.Capacity(); ++i) {
      ASSERT_FALSE(bits.Check(i));
    }
    ASSERT_TRUE(bits.Check(true_bit));
@@ -34,7 +34,7 @@ TEST(BitField, Check) {
      ASSERT_FALSE(bits.Check(i));
    }
    ASSERT_TRUE(bits.Check(true_bit));
-    for (size_t i = true_bit + 1; i < bits.Size(); ++i) {
+    for (size_t i = true_bit + 1; i < bits.Capacity(); ++i) {
      ASSERT_FALSE(bits.Check(i));
    }
  }
--- a/tests/cpp/common/test_bitfield.cu
+++ b/tests/cpp/common/test_bitfield.cu
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019 XGBoost contributors
+/**
+ * Copyright 2019-2023, XGBoost contributors
 */
 #include <gtest/gtest.h>
 #include <thrust/copy.h>
@@ -16,7 +16,7 @@ namespace xgboost {

 __global__ void TestSetKernel(LBitField64 bits) {
  auto tid = threadIdx.x + blockIdx.x * blockDim.x;
-  if (tid < bits.Size()) {
+  if (tid < bits.Capacity()) {
    bits.Set(tid);
  }
 }
@@ -40,20 +40,16 @@ TEST(BitField, GPUSet) {

  std::vector<LBitField64::value_type> h_storage(storage.size());
  thrust::copy(storage.begin(), storage.end(), h_storage.begin());
-
-  LBitField64 outputs {
-    common::Span<LBitField64::value_type>{h_storage.data(),
-                                       h_storage.data() + h_storage.size()}};
+  LBitField64 outputs{
+      common::Span<LBitField64::value_type>{h_storage.data(), h_storage.data() + h_storage.size()}};
  for (size_t i = 0; i < kBits; ++i) {
    ASSERT_TRUE(outputs.Check(i));
  }
 }

-__global__ void TestOrKernel(LBitField64 lhs, LBitField64 rhs) {
-  lhs |= rhs;
-}
-
-TEST(BitField, GPUAnd) {
+namespace {
+template <bool is_and, typename Op>
+void TestGPULogic(Op op) {
  uint32_t constexpr kBits = 128;
  dh::device_vector<LBitField64::value_type> lhs_storage(kBits);
  dh::device_vector<LBitField64::value_type> rhs_storage(kBits);
@@ -61,13 +57,32 @@ TEST(BitField, GPUAnd) {
  auto rhs = LBitField64(dh::ToSpan(rhs_storage));
  thrust::fill(lhs_storage.begin(), lhs_storage.end(), 0UL);
  thrust::fill(rhs_storage.begin(), rhs_storage.end(), ~static_cast<LBitField64::value_type>(0UL));
-  TestOrKernel<<<1, kBits>>>(lhs, rhs);
+  dh::LaunchN(kBits, [=] __device__(auto) mutable { op(lhs, rhs); });

  std::vector<LBitField64::value_type> h_storage(lhs_storage.size());
  thrust::copy(lhs_storage.begin(), lhs_storage.end(), h_storage.begin());
-  LBitField64 outputs {{h_storage.data(), h_storage.data() + h_storage.size()}};
-  for (size_t i = 0; i < kBits; ++i) {
-    ASSERT_TRUE(outputs.Check(i));
+  LBitField64 outputs{{h_storage.data(), h_storage.data() + h_storage.size()}};
+  if (is_and) {
+    for (size_t i = 0; i < kBits; ++i) {
+      ASSERT_FALSE(outputs.Check(i));
+    }
+  } else {
+    for (size_t i = 0; i < kBits; ++i) {
+      ASSERT_TRUE(outputs.Check(i));
+    }
  }
 }
+
+void TestGPUAnd() {
+  TestGPULogic<true>([] XGBOOST_DEVICE(LBitField64 & lhs, LBitField64 const& rhs) { lhs &= rhs; });
+}
+
+void TestGPUOr() {
+  TestGPULogic<false>([] XGBOOST_DEVICE(LBitField64 & lhs, LBitField64 const& rhs) { lhs |= rhs; });
+}
+}  // namespace
+
+TEST(BitField, GPUAnd) { TestGPUAnd(); }
+
+TEST(BitField, GPUOr) { TestGPUOr(); }
 }  // namespace xgboost
--- a/tests/cpp/common/test_column_matrix.cc
+++ b/tests/cpp/common/test_column_matrix.cc
@@ -2,15 +2,26 @@
 * Copyright 2018-2023 by XGBoost Contributors
 */
 #include <gtest/gtest.h>
+#include <xgboost/base.h>     // for bst_bin_t
+#include <xgboost/context.h>  // for Context
+#include <xgboost/data.h>     // for BatchIterator, BatchSet, DMatrix, Met...

-#include "../../../src/common/column_matrix.h"
-#include "../helpers.h"
+#include <cstddef>      // for size_t
+#include <cstdint>      // for int32_t, uint16_t, uint8_t
+#include <limits>       // for numeric_limits
+#include <memory>       // for shared_ptr, __shared_ptr_access, allo...
+#include <type_traits>  // for remove_reference_t

+#include "../../../src/common/column_matrix.h"      // for ColumnMatrix, Column, DenseColumnIter
+#include "../../../src/common/hist_util.h"          // for DispatchBinType, BinTypeSize, Index
+#include "../../../src/common/ref_resource_view.h"  // for RefResourceView
+#include "../../../src/data/gradient_index.h"       // for GHistIndexMatrix
+#include "../../../src/data/iterative_dmatrix.h"    // for IterativeDMatrix
+#include "../../../src/tree/param.h"                // for TrainParam
+#include "../helpers.h"                             // for RandomDataGenerator, NumpyArrayIterFo...

-namespace xgboost {
-namespace common {
-
-TEST(DenseColumn, Test) {
+namespace xgboost::common {
+TEST(ColumnMatrix, Basic) {
  int32_t max_num_bins[] = {static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) + 1,
                            static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 1,
                            static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 2};
@@ -22,7 +33,7 @@ TEST(DenseColumn, Test) {
    GHistIndexMatrix gmat{&ctx, dmat.get(), max_num_bin, sparse_thresh, false};
    ColumnMatrix column_matrix;
    for (auto const& page : dmat->GetBatches<SparsePage>()) {
-      column_matrix.InitFromSparse(page, gmat, sparse_thresh, AllThreadsForTest());
+      column_matrix.InitFromSparse(page, gmat, sparse_thresh, ctx.Threads());
    }
    ASSERT_GE(column_matrix.GetTypeSize(), last);
    ASSERT_LE(column_matrix.GetTypeSize(), kUint32BinsTypeSize);
@@ -59,7 +70,7 @@ void CheckSparseColumn(SparseColumnIter<BinIdxType>* p_col, const GHistIndexMatr
  }
 }

-TEST(SparseColumn, Test) {
+TEST(ColumnMatrix, SparseColumn) {
  int32_t max_num_bins[] = {static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) + 1,
                            static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 1,
                            static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 2};
@@ -69,7 +80,7 @@ TEST(SparseColumn, Test) {
    GHistIndexMatrix gmat{&ctx, dmat.get(), max_num_bin, 0.5f, false};
    ColumnMatrix column_matrix;
    for (auto const& page : dmat->GetBatches<SparsePage>()) {
-      column_matrix.InitFromSparse(page, gmat, 1.0, AllThreadsForTest());
+      column_matrix.InitFromSparse(page, gmat, 1.0, ctx.Threads());
    }
    common::DispatchBinType(column_matrix.GetTypeSize(), [&](auto dtype) {
      using T = decltype(dtype);
@@ -83,12 +94,14 @@ template <typename BinIdxType>
 void CheckColumWithMissingValue(const DenseColumnIter<BinIdxType, true>& col,
                                const GHistIndexMatrix& gmat) {
  for (auto i = 0ull; i < col.Size(); i++) {
-    if (col.IsMissing(i)) continue;
+    if (col.IsMissing(i)) {
+      continue;
+    }
    EXPECT_EQ(gmat.index[gmat.row_ptr[i]], col.GetGlobalBinIdx(i));
  }
 }

-TEST(DenseColumnWithMissing, Test) {
+TEST(ColumnMatrix, DenseColumnWithMissing) {
  int32_t max_num_bins[] = {static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) + 1,
                            static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 1,
                            static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 2};
@@ -98,7 +111,7 @@ TEST(DenseColumnWithMissing, Test) {
    GHistIndexMatrix gmat(&ctx, dmat.get(), max_num_bin, 0.2, false);
    ColumnMatrix column_matrix;
    for (auto const& page : dmat->GetBatches<SparsePage>()) {
-      column_matrix.InitFromSparse(page, gmat, 0.2, AllThreadsForTest());
+      column_matrix.InitFromSparse(page, gmat, 0.2, ctx.Threads());
    }
    ASSERT_TRUE(column_matrix.AnyMissing());
    DispatchBinType(column_matrix.GetTypeSize(), [&](auto dtype) {
@@ -108,5 +121,29 @@ TEST(DenseColumnWithMissing, Test) {
    });
  }
 }
-}  // namespace common
-}  // namespace xgboost
+
+TEST(ColumnMatrix, GrowMissing) {
+  float sparsity = 0.5;
+  NumpyArrayIterForTest iter(sparsity);
+  auto n_threads = 0;
+  bst_bin_t n_bins = 16;
+  BatchParam batch{n_bins, tree::TrainParam::DftSparseThreshold()};
+  Context ctx;
+  auto m = std::make_shared<data::IterativeDMatrix>(&iter, iter.Proxy(), nullptr, Reset, Next,
+                                                    std::numeric_limits<float>::quiet_NaN(),
+                                                    n_threads, n_bins);
+  for (auto const& page : m->GetBatches<GHistIndexMatrix>(&ctx, batch)) {
+    auto const& column_matrix = page.Transpose();
+    auto const& missing = column_matrix.Missing();
+    auto n = NumpyArrayIterForTest::Rows() * NumpyArrayIterForTest::Cols();
+    auto expected = std::remove_reference_t<decltype(missing)>::BitFieldT::ComputeStorageSize(n);
+    auto got = missing.storage.size();
+    ASSERT_EQ(expected, got);
+    DispatchBinType(column_matrix.GetTypeSize(), [&](auto dtype) {
+      using T = decltype(dtype);
+      auto col = column_matrix.DenseColumn<T, true>(0);
+      CheckColumWithMissingValue(col, page);
+    });
+  }
+}
+}  // namespace xgboost::common
--- a/tests/cpp/common/test_hist_util.cc
+++ b/tests/cpp/common/test_hist_util.cc
@@ -27,8 +27,8 @@ void ParallelGHistBuilderReset() {

  for(size_t inode = 0; inode < kNodesExtended; inode++) {
    collection.AddHistRow(inode);
+    collection.AllocateData(inode);
  }
-  collection.AllocateAllData();
  ParallelGHistBuilder hist_builder;
  hist_builder.Init(kBins);
  std::vector<GHistRow> target_hist(kNodes);
@@ -83,8 +83,8 @@ void ParallelGHistBuilderReduceHist(){

  for(size_t inode = 0; inode < kNodes; inode++) {
    collection.AddHistRow(inode);
+    collection.AllocateData(inode);
  }
-  collection.AllocateAllData();
  ParallelGHistBuilder hist_builder;
  hist_builder.Init(kBins);
  std::vector<GHistRow> target_hist(kNodes);
@@ -129,7 +129,7 @@ TEST(CutsBuilder, SearchGroupInd) {

  auto p_mat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();

-  std::vector<bst_int> group(kNumGroups);
+  std::vector<bst_group_t> group(kNumGroups);
  group[0] = 2;
  group[1] = 3;
  group[2] = 7;
--- a/tests/cpp/common/test_hist_util.cu
+++ b/tests/cpp/common/test_hist_util.cu
@@ -3,18 +3,23 @@
 */
 #include <gtest/gtest.h>
 #include <thrust/device_vector.h>
+#include <xgboost/base.h>  // for bst_bin_t
 #include <xgboost/c_api.h>
 #include <xgboost/data.h>

-#include <algorithm>
-#include <cmath>
+#include <algorithm>  // for transform
+#include <cmath>      // for floor
+#include <cstddef>    // for size_t
+#include <limits>     // for numeric_limits
+#include <string>     // for string, to_string
+#include <tuple>      // for tuple, make_tuple
+#include <vector>     // for vector

 #include "../../../include/xgboost/logging.h"
 #if defined(XGBOOST_USE_CUDA)
 #include "../../../src/common/device_helpers.cuh"
 #include "../../../src/common/hist_util.cuh"
 #include "../../../src/common/hist_util.h"
-#include "../../../src/common/math.h"
 #include "../../../src/data/device_adapter.cuh"
 #elif defined(XGBOOST_USE_HIP)
 #include "../../../src/common/device_helpers.hip.h"
@@ -29,8 +34,7 @@
 #include "../helpers.h"
 #include "test_hist_util.h"

-namespace xgboost {
-namespace common {
+namespace xgboost::common {

 template <typename AdapterT>
 HistogramCuts GetHostCuts(Context const* ctx, AdapterT* adapter, int num_bins, float missing) {
@@ -40,16 +44,17 @@ HistogramCuts GetHostCuts(Context const* ctx, AdapterT* adapter, int num_bins, f
 }

 TEST(HistUtil, DeviceSketch) {
+  auto ctx = MakeCUDACtx(0);
  int num_columns = 1;
  int num_bins = 4;
  std::vector<float> x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 7.0f, -1.0f};
  int num_rows = x.size();
  auto dmat = GetDMatrixFromData(x, num_rows, num_columns);

-  auto device_cuts = DeviceSketch(0, dmat.get(), num_bins);
+  auto device_cuts = DeviceSketch(&ctx, dmat.get(), num_bins);

-  Context ctx;
-  HistogramCuts host_cuts = SketchOnDMatrix(&ctx, dmat.get(), num_bins);
+  Context cpu_ctx;
+  HistogramCuts host_cuts = SketchOnDMatrix(&cpu_ctx, dmat.get(), num_bins);

  EXPECT_EQ(device_cuts.Values(), host_cuts.Values());
  EXPECT_EQ(device_cuts.Ptrs(), host_cuts.Ptrs());
@@ -79,6 +84,7 @@ TEST(HistUtil, SketchBatchNumElements) {
 }

 TEST(HistUtil, DeviceSketchMemory) {
+  auto ctx = MakeCUDACtx(0);
  int num_columns = 100;
  int num_rows = 1000;
  int num_bins = 256;
@@ -87,7 +93,7 @@ TEST(HistUtil, DeviceSketchMemory) {

  dh::GlobalMemoryLogger().Clear();
  ConsoleLogger::Configure({{"verbosity", "3"}});
-  auto device_cuts = DeviceSketch(0, dmat.get(), num_bins);
+  auto device_cuts = DeviceSketch(&ctx, dmat.get(), num_bins);

  size_t bytes_required = detail::RequiredMemory(
      num_rows, num_columns, num_rows * num_columns, num_bins, false);
@@ -97,6 +103,7 @@ TEST(HistUtil, DeviceSketchMemory) {
 }

 TEST(HistUtil, DeviceSketchWeightsMemory) {
+  auto ctx = MakeCUDACtx(0);
  int num_columns = 100;
  int num_rows = 1000;
  int num_bins = 256;
@@ -106,7 +113,7 @@ TEST(HistUtil, DeviceSketchWeightsMemory) {

  dh::GlobalMemoryLogger().Clear();
  ConsoleLogger::Configure({{"verbosity", "3"}});
-  auto device_cuts = DeviceSketch(0, dmat.get(), num_bins);
+  auto device_cuts = DeviceSketch(&ctx, dmat.get(), num_bins);
  ConsoleLogger::Configure({{"verbosity", "0"}});

  size_t bytes_required = detail::RequiredMemory(
@@ -116,52 +123,56 @@ TEST(HistUtil, DeviceSketchWeightsMemory) {
 }

 TEST(HistUtil, DeviceSketchDeterminism) {
+  auto ctx = MakeCUDACtx(0);
  int num_rows = 500;
  int num_columns = 5;
  int num_bins = 256;
  auto x = GenerateRandom(num_rows, num_columns);
  auto dmat = GetDMatrixFromData(x, num_rows, num_columns);
-  auto reference_sketch = DeviceSketch(0, dmat.get(), num_bins);
+  auto reference_sketch = DeviceSketch(&ctx, dmat.get(), num_bins);
  size_t constexpr kRounds{ 100 };
  for (size_t r = 0; r < kRounds; ++r) {
-    auto new_sketch = DeviceSketch(0, dmat.get(), num_bins);
+    auto new_sketch = DeviceSketch(&ctx, dmat.get(), num_bins);
    ASSERT_EQ(reference_sketch.Values(), new_sketch.Values());
    ASSERT_EQ(reference_sketch.MinValues(), new_sketch.MinValues());
  }
 }

 TEST(HistUtil, DeviceSketchCategoricalAsNumeric) {
-  int categorical_sizes[] = {2, 6, 8, 12};
+  auto ctx = MakeCUDACtx(0);
+  auto categorical_sizes = {2, 6, 8, 12};
  int num_bins = 256;
-  int sizes[] = {25, 100, 1000};
+  auto sizes = {25, 100, 1000};
  for (auto n : sizes) {
    for (auto num_categories : categorical_sizes) {
      auto x = GenerateRandomCategoricalSingleColumn(n, num_categories);
      auto dmat = GetDMatrixFromData(x, n, 1);
-      auto cuts = DeviceSketch(0, dmat.get(), num_bins);
+      auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins);
      ValidateCuts(cuts, dmat.get(), num_bins);
    }
  }
 }

 TEST(HistUtil, DeviceSketchCategoricalFeatures) {
-  TestCategoricalSketch(1000, 256, 32, false,
-                        [](DMatrix *p_fmat, int32_t num_bins) {
-                          return DeviceSketch(0, p_fmat, num_bins);
-                        });
-  TestCategoricalSketch(1000, 256, 32, true,
-                        [](DMatrix *p_fmat, int32_t num_bins) {
-                          return DeviceSketch(0, p_fmat, num_bins);
-                        });
+  auto ctx = MakeCUDACtx(0);
+  TestCategoricalSketch(1000, 256, 32, false, [ctx](DMatrix* p_fmat, int32_t num_bins) {
+    return DeviceSketch(&ctx, p_fmat, num_bins);
+  });
+  TestCategoricalSketch(1000, 256, 32, true, [ctx](DMatrix* p_fmat, int32_t num_bins) {
+    return DeviceSketch(&ctx, p_fmat, num_bins);
+  });
 }

 void TestMixedSketch() {
  size_t n_samples = 1000, n_features = 2, n_categories = 3;
+  bst_bin_t n_bins = 64;
+
  std::vector<float> data(n_samples * n_features);
  SimpleLCG gen;
  SimpleRealUniformDistribution<float> cat_d{0.0f, static_cast<float>(n_categories)};
  SimpleRealUniformDistribution<float> num_d{0.0f, 3.0f};
  for (size_t i = 0; i < n_samples * n_features; ++i) {
+    // two features, row major. The first column is categorical and the second is numeric.
    if (i % 2 == 0) {
      data[i] = std::floor(cat_d(&gen));
    } else {
@@ -173,46 +184,113 @@ void TestMixedSketch() {
  m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);
  m->Info().feature_types.HostVector().push_back(FeatureType::kNumerical);

-  auto cuts = DeviceSketch(0, m.get(), 64);
-  ASSERT_EQ(cuts.Values().size(), 64 + n_categories);
+  auto ctx = MakeCUDACtx(0);
+  auto cuts = DeviceSketch(&ctx, m.get(), n_bins);
+  ASSERT_EQ(cuts.Values().size(), n_bins + n_categories);
 }

-TEST(HistUtil, DeviceSketchMixedFeatures) {
-  TestMixedSketch();
+TEST(HistUtil, DeviceSketchMixedFeatures) { TestMixedSketch(); }
+
+TEST(HistUtil, RemoveDuplicatedCategories) {
+  bst_row_t n_samples = 512;
+  bst_feature_t n_features = 3;
+  bst_cat_t n_categories = 5;
+
+  auto ctx = MakeCUDACtx(0);
+  SimpleLCG rng;
+  SimpleRealUniformDistribution<float> cat_d{0.0f, static_cast<float>(n_categories)};
+
+  dh::device_vector<Entry> sorted_entries(n_samples * n_features);
+  for (std::size_t i = 0; i < n_samples; ++i) {
+    for (bst_feature_t j = 0; j < n_features; ++j) {
+      float fvalue{0.0f};
+      // The second column is categorical
+      if (j == 1) {
+        fvalue = std::floor(cat_d(&rng));
+      } else {
+        fvalue = i;
+      }
+      sorted_entries[i * n_features + j] = Entry{j, fvalue};
+    }
+  }
+
+  MetaInfo info;
+  info.num_col_ = n_features;
+  info.num_row_ = n_samples;
+  info.feature_types.HostVector() = std::vector<FeatureType>{
+      FeatureType::kNumerical, FeatureType::kCategorical, FeatureType::kNumerical};
+  ASSERT_EQ(info.feature_types.Size(), n_features);
+
+  HostDeviceVector<bst_row_t> cuts_ptr{0, n_samples, n_samples * 2, n_samples * 3};
+  cuts_ptr.SetDevice(0);
+
+  dh::device_vector<float> weight(n_samples * n_features, 0);
+  dh::Iota(dh::ToSpan(weight));
+
+  dh::caching_device_vector<bst_row_t> columns_ptr(4);
+  for (std::size_t i = 0; i < columns_ptr.size(); ++i) {
+    columns_ptr[i] = i * n_samples;
+  }
+  // sort into column major
+  thrust::sort_by_key(sorted_entries.begin(), sorted_entries.end(), weight.begin(),
+                      detail::EntryCompareOp());
+
+  detail::RemoveDuplicatedCategories(ctx.gpu_id, info, cuts_ptr.DeviceSpan(), &sorted_entries,
+                                     &weight, &columns_ptr);
+
+  auto const& h_cptr = cuts_ptr.ConstHostVector();
+  ASSERT_EQ(h_cptr.back(), n_samples * 2 + n_categories);
+  // check numerical
+  for (std::size_t i = 0; i < n_samples; ++i) {
+    ASSERT_EQ(weight[i], i * 3);
+  }
+  auto beg = n_samples + n_categories;
+  for (std::size_t i = 0; i < n_samples; ++i) {
+    ASSERT_EQ(weight[i + beg], i * 3 + 2);
+  }
+  // check categorical
+  beg = n_samples;
+  for (bst_cat_t i = 0; i < n_categories; ++i) {
+    // all from the second column
+    ASSERT_EQ(static_cast<bst_feature_t>(weight[i + beg]) % n_features, 1);
+  }
 }

 TEST(HistUtil, DeviceSketchMultipleColumns) {
-  int bin_sizes[] = {2, 16, 256, 512};
-  int sizes[] = {100, 1000, 1500};
+  auto ctx = MakeCUDACtx(0);
+  auto bin_sizes = {2, 16, 256, 512};
+  auto sizes = {100, 1000, 1500};
  int num_columns = 5;
  for (auto num_rows : sizes) {
    auto x = GenerateRandom(num_rows, num_columns);
    auto dmat = GetDMatrixFromData(x, num_rows, num_columns);
    for (auto num_bins : bin_sizes) {
-      auto cuts = DeviceSketch(0, dmat.get(), num_bins);
+      auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins);
      ValidateCuts(cuts, dmat.get(), num_bins);
    }
  }
 }

 TEST(HistUtil, DeviceSketchMultipleColumnsWeights) {
-  int bin_sizes[] = {2, 16, 256, 512};
-  int sizes[] = {100, 1000, 1500};
+  auto ctx = MakeCUDACtx(0);
+  auto bin_sizes = {2, 16, 256, 512};
+  auto sizes = {100, 1000, 1500};
  int num_columns = 5;
  for (auto num_rows : sizes) {
    auto x = GenerateRandom(num_rows, num_columns);
    auto dmat = GetDMatrixFromData(x, num_rows, num_columns);
    dmat->Info().weights_.HostVector() = GenerateRandomWeights(num_rows);
    for (auto num_bins : bin_sizes) {
-      auto cuts = DeviceSketch(0, dmat.get(), num_bins);
+      auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins);
      ValidateCuts(cuts, dmat.get(), num_bins);
    }
  }
 }

 TEST(HistUitl, DeviceSketchWeights) {
-  int bin_sizes[] = {2, 16, 256, 512};
-  int sizes[] = {100, 1000, 1500};
+  auto ctx = MakeCUDACtx(0);
+  auto bin_sizes = {2, 16, 256, 512};
+  auto sizes = {100, 1000, 1500};
  int num_columns = 5;
  for (auto num_rows : sizes) {
    auto x = GenerateRandom(num_rows, num_columns);
@@ -222,8 +300,8 @@ TEST(HistUitl, DeviceSketchWeights) {
    h_weights.resize(num_rows);
    std::fill(h_weights.begin(), h_weights.end(), 1.0f);
    for (auto num_bins : bin_sizes) {
-      auto cuts = DeviceSketch(0, dmat.get(), num_bins);
-      auto wcuts = DeviceSketch(0, weighted_dmat.get(), num_bins);
+      auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins);
+      auto wcuts = DeviceSketch(&ctx, weighted_dmat.get(), num_bins);
      ASSERT_EQ(cuts.MinValues(), wcuts.MinValues());
      ASSERT_EQ(cuts.Ptrs(), wcuts.Ptrs());
      ASSERT_EQ(cuts.Values(), wcuts.Values());
@@ -234,14 +312,15 @@ TEST(HistUitl, DeviceSketchWeights) {
 }

 TEST(HistUtil, DeviceSketchBatches) {
+  auto ctx = MakeCUDACtx(0);
  int num_bins = 256;
  int num_rows = 5000;
-  int batch_sizes[] = {0, 100, 1500, 6000};
+  auto batch_sizes = {0, 100, 1500, 6000};
  int num_columns = 5;
  for (auto batch_size : batch_sizes) {
    auto x = GenerateRandom(num_rows, num_columns);
    auto dmat = GetDMatrixFromData(x, num_rows, num_columns);
-    auto cuts = DeviceSketch(0, dmat.get(), num_bins, batch_size);
+    auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins, batch_size);
    ValidateCuts(cuts, dmat.get(), num_bins);
  }

@@ -249,8 +328,8 @@ TEST(HistUtil, DeviceSketchBatches) {
  size_t batches = 16;
  auto x = GenerateRandom(num_rows * batches, num_columns);
  auto dmat = GetDMatrixFromData(x, num_rows * batches, num_columns);
-  auto cuts_with_batches = DeviceSketch(0, dmat.get(), num_bins, num_rows);
-  auto cuts = DeviceSketch(0, dmat.get(), num_bins, 0);
+  auto cuts_with_batches = DeviceSketch(&ctx, dmat.get(), num_bins, num_rows);
+  auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins, 0);

  auto const& cut_values_batched = cuts_with_batches.Values();
  auto const& cut_values = cuts.Values();
@@ -261,15 +340,16 @@ TEST(HistUtil, DeviceSketchBatches) {
 }

 TEST(HistUtil, DeviceSketchMultipleColumnsExternal) {
-  int bin_sizes[] = {2, 16, 256, 512};
-  int sizes[] = {100, 1000, 1500};
+  auto ctx = MakeCUDACtx(0);
+  auto bin_sizes = {2, 16, 256, 512};
+  auto sizes = {100, 1000, 1500};
  int num_columns =5;
  for (auto num_rows : sizes) {
    auto x = GenerateRandom(num_rows, num_columns);
    dmlc::TemporaryDirectory temp;
    auto dmat = GetExternalMemoryDMatrixFromData(x, num_rows, num_columns, temp);
    for (auto num_bins : bin_sizes) {
-      auto cuts = DeviceSketch(0, dmat.get(), num_bins);
+      auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins);
      ValidateCuts(cuts, dmat.get(), num_bins);
    }
  }
@@ -277,8 +357,9 @@ TEST(HistUtil, DeviceSketchMultipleColumnsExternal) {

 // See https://github.com/dmlc/xgboost/issues/5866.
 TEST(HistUtil, DeviceSketchExternalMemoryWithWeights) {
-  int bin_sizes[] = {2, 16, 256, 512};
-  int sizes[] = {100, 1000, 1500};
+  auto ctx = MakeCUDACtx(0);
+  auto bin_sizes = {2, 16, 256, 512};
+  auto sizes = {100, 1000, 1500};
  int num_columns = 5;
  dmlc::TemporaryDirectory temp;
  for (auto num_rows : sizes) {
@@ -286,7 +367,7 @@ TEST(HistUtil, DeviceSketchExternalMemoryWithWeights) {
    auto dmat = GetExternalMemoryDMatrixFromData(x, num_rows, num_columns, temp);
    dmat->Info().weights_.HostVector() = GenerateRandomWeights(num_rows);
    for (auto num_bins : bin_sizes) {
-      auto cuts = DeviceSketch(0, dmat.get(), num_bins);
+      auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins);
      ValidateCuts(cuts, dmat.get(), num_bins);
    }
  }
@@ -299,7 +380,7 @@ auto MakeUnweightedCutsForTest(Adapter adapter, int32_t num_bins, float missing,
  SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(), 0);
  MetaInfo info;
  AdapterDeviceSketch(adapter.Value(), num_bins, info, missing, &sketch_container, batch_size);
-  sketch_container.MakeCuts(&batched_cuts);
+  sketch_container.MakeCuts(&batched_cuts, info.IsColumnSplit());
  return batched_cuts;
 }

@@ -367,7 +448,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowMemory) {
  AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits<float>::quiet_NaN(),
                      &sketch_container);
  HistogramCuts cuts;
-  sketch_container.MakeCuts(&cuts);
+  sketch_container.MakeCuts(&cuts, info.IsColumnSplit());
  size_t bytes_required = detail::RequiredMemory(
      num_rows, num_columns, num_rows * num_columns, num_bins, false);
  EXPECT_LE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required * 1.05);
@@ -397,7 +478,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) {
                      &sketch_container);

  HistogramCuts cuts;
-  sketch_container.MakeCuts(&cuts);
+  sketch_container.MakeCuts(&cuts, info.IsColumnSplit());
  ConsoleLogger::Configure({{"verbosity", "0"}});
  size_t bytes_required = detail::RequiredMemory(
      num_rows, num_columns, num_rows * num_columns, num_bins, true);
@@ -430,7 +511,7 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories,
  AdapterDeviceSketch(adapter.Value(), num_bins, info,
                      std::numeric_limits<float>::quiet_NaN(), &container);
  HistogramCuts cuts;
-  container.MakeCuts(&cuts);
+  container.MakeCuts(&cuts, info.IsColumnSplit());

  thrust::sort(x.begin(), x.end());
  auto n_uniques = thrust::unique(x.begin(), x.end()) - x.begin();
@@ -452,9 +533,9 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories,
 }

 TEST(HistUtil, AdapterDeviceSketchCategorical) {
-  int categorical_sizes[] = {2, 6, 8, 12};
+  auto categorical_sizes = {2, 6, 8, 12};
  int num_bins = 256;
-  int sizes[] = {25, 100, 1000};
+  auto sizes = {25, 100, 1000};
  for (auto n : sizes) {
    for (auto num_categories : categorical_sizes) {
      auto x = GenerateRandomCategoricalSingleColumn(n, num_categories);
@@ -469,8 +550,8 @@ TEST(HistUtil, AdapterDeviceSketchCategorical) {
 }

 TEST(HistUtil, AdapterDeviceSketchMultipleColumns) {
-  int bin_sizes[] = {2, 16, 256, 512};
-  int sizes[] = {100, 1000, 1500};
+  auto bin_sizes = {2, 16, 256, 512};
+  auto sizes = {100, 1000, 1500};
  int num_columns = 5;
  for (auto num_rows : sizes) {
    auto x = GenerateRandom(num_rows, num_columns);
@@ -486,7 +567,7 @@ TEST(HistUtil, AdapterDeviceSketchMultipleColumns) {
 TEST(HistUtil, AdapterDeviceSketchBatches) {
  int num_bins = 256;
  int num_rows = 5000;
-  int batch_sizes[] = {0, 100, 1500, 6000};
+  auto batch_sizes = {0, 100, 1500, 6000};
  int num_columns = 5;
  for (auto batch_size : batch_sizes) {
    auto x = GenerateRandom(num_rows, num_columns);
@@ -571,14 +652,15 @@ TEST(HistUtil, GetColumnSize) {
 // Check sketching from adapter or DMatrix results in the same answer
 // Consistency here is useful for testing and user experience
 TEST(HistUtil, SketchingEquivalent) {
-  int bin_sizes[] = {2, 16, 256, 512};
-  int sizes[] = {100, 1000, 1500};
+  auto ctx = MakeCUDACtx(0);
+  auto bin_sizes = {2, 16, 256, 512};
+  auto sizes = {100, 1000, 1500};
  int num_columns = 5;
  for (auto num_rows : sizes) {
    auto x = GenerateRandom(num_rows, num_columns);
    auto dmat = GetDMatrixFromData(x, num_rows, num_columns);
    for (auto num_bins : bin_sizes) {
-      auto dmat_cuts = DeviceSketch(0, dmat.get(), num_bins);
+      auto dmat_cuts = DeviceSketch(&ctx, dmat.get(), num_bins);
      auto x_device = thrust::device_vector<float>(x);
      auto adapter = AdapterFromData(x_device, num_rows, num_columns);
      common::HistogramCuts adapter_cuts = MakeUnweightedCutsForTest(
@@ -593,21 +675,25 @@ TEST(HistUtil, SketchingEquivalent) {
 }

 TEST(HistUtil, DeviceSketchFromGroupWeights) {
+  auto ctx = MakeCUDACtx(0);
  size_t constexpr kRows = 3000, kCols = 200, kBins = 256;
  size_t constexpr kGroups = 10;
  auto m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
+
+  // sketch with group weight
  auto& h_weights = m->Info().weights_.HostVector();
-  h_weights.resize(kRows);
+  h_weights.resize(kGroups);
  std::fill(h_weights.begin(), h_weights.end(), 1.0f);
  std::vector<bst_group_t> groups(kGroups);
  for (size_t i = 0; i < kGroups; ++i) {
    groups[i] = kRows / kGroups;
  }
  m->SetInfo("group", groups.data(), DataType::kUInt32, kGroups);
-  HistogramCuts weighted_cuts = DeviceSketch(0, m.get(), kBins, 0);
+  HistogramCuts weighted_cuts = DeviceSketch(&ctx, m.get(), kBins, 0);

+  // sketch with no weight
  h_weights.clear();
-  HistogramCuts cuts = DeviceSketch(0, m.get(), kBins, 0);
+  HistogramCuts cuts = DeviceSketch(&ctx, m.get(), kBins, 0);

  ASSERT_EQ(cuts.Values().size(), weighted_cuts.Values().size());
  ASSERT_EQ(cuts.MinValues().size(), weighted_cuts.MinValues().size());
@@ -662,7 +748,7 @@ void TestAdapterSketchFromWeights(bool with_group) {
                      &sketch_container);

  common::HistogramCuts cuts;
-  sketch_container.MakeCuts(&cuts);
+  sketch_container.MakeCuts(&cuts, info.IsColumnSplit());

  auto dmat = GetDMatrixFromData(storage.HostVector(), kRows, kCols);
  if (with_group) {
@@ -675,9 +761,10 @@ void TestAdapterSketchFromWeights(bool with_group) {
  ASSERT_EQ(cuts.Ptrs().size(), kCols + 1);
  ValidateCuts(cuts, dmat.get(), kBins);

+  auto cuda_ctx = MakeCUDACtx(0);
  if (with_group) {
    dmat->Info().weights_ = decltype(dmat->Info().weights_)();  // remove weight
-    HistogramCuts non_weighted = DeviceSketch(0, dmat.get(), kBins, 0);
+    HistogramCuts non_weighted = DeviceSketch(&cuda_ctx, dmat.get(), kBins, 0);
    for (size_t i = 0; i < cuts.Values().size(); ++i) {
      ASSERT_EQ(cuts.Values()[i], non_weighted.Values()[i]);
    }
@@ -703,7 +790,7 @@ void TestAdapterSketchFromWeights(bool with_group) {
    SketchContainer sketch_container(ft, kBins, kCols, kRows, 0);
    AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(),
                        &sketch_container);
-    sketch_container.MakeCuts(&weighted);
+    sketch_container.MakeCuts(&weighted, info.IsColumnSplit());
    ValidateCuts(weighted, dmat.get(), kBins);
  }
 }
@@ -712,5 +799,156 @@ TEST(HistUtil, AdapterSketchFromWeights) {
  TestAdapterSketchFromWeights(false);
  TestAdapterSketchFromWeights(true);
 }
-}  // namespace common
-}  // namespace xgboost
+
+namespace {
+// Fixture parameterized by (is_ltr, n_samples, n_bins). It checks that the cuts produced
+// by DeviceSketchWithHessian agree with the cuts produced by DeviceSketch when the hessian
+// has already been multiplied into the weights stored in the DMatrix.
+class DeviceSketchWithHessianTest
+    : public ::testing::TestWithParam<std::tuple<bool, bst_row_t, bst_bin_t>> {
+  bst_feature_t n_features_ = 5;
+  bst_group_t n_groups_{3};
+
+  // Build a per-sample hessian from GenerateRandomWeights, shuffle it with an engine
+  // seeded with 0 and set the vector's device to the one carried by the context.
+  auto GenerateHessian(Context const* ctx, bst_row_t n_samples) const {
+    HostDeviceVector<float> hessian;
+    auto& h_hess = hessian.HostVector();
+    h_hess = GenerateRandomWeights(n_samples);
+    std::mt19937 rng(0);
+    std::shuffle(h_hess.begin(), h_hess.end(), rng);
+    hessian.SetDevice(ctx->Device());
+    return hessian;
+  }
+
+  // Regression check: sketch once with sample weight `w` plus a separate hessian, then
+  // sketch again with `w[i] * hessian[i]` stored as the sample weight, and require both
+  // sets of cut values to match within kRtEps. The weights of `p_fmat` are reset to `w`
+  // before returning.
+  void CheckReg(Context const* ctx, std::shared_ptr<DMatrix> p_fmat, bst_bin_t n_bins,
+                HostDeviceVector<float> const& hessian, std::vector<float> const& w,
+                std::size_t n_elements) const {
+    auto const& h_hess = hessian.ConstHostVector();
+    {
+      auto& h_weight = p_fmat->Info().weights_.HostVector();
+      h_weight = w;
+    }
+
+    HistogramCuts cuts_hess =
+        DeviceSketchWithHessian(ctx, p_fmat.get(), n_bins, hessian.ConstDeviceSpan(), n_elements);
+    ValidateCuts(cuts_hess, p_fmat.get(), n_bins);
+
+    // merge hessian
+    {
+      auto& h_weight = p_fmat->Info().weights_.HostVector();
+      ASSERT_EQ(h_weight.size(), h_hess.size());
+      for (std::size_t i = 0; i < h_weight.size(); ++i) {
+        h_weight[i] = w[i] * h_hess[i];
+      }
+    }
+
+    HistogramCuts cuts_wh = DeviceSketch(ctx, p_fmat.get(), n_bins, n_elements);
+    ValidateCuts(cuts_wh, p_fmat.get(), n_bins);
+    ASSERT_EQ(cuts_hess.Values().size(), cuts_wh.Values().size());
+    for (std::size_t i = 0; i < cuts_hess.Values().size(); ++i) {
+      ASSERT_NEAR(cuts_wh.Values()[i], cuts_hess.Values()[i], kRtEps);
+    }
+
+    p_fmat->Info().weights_.HostVector() = w;
+  }
+
+ protected:
+  Context ctx_ = MakeCUDACtx(0);
+
+  // Learning-to-rank check: the DMatrix carries group pointers and one weight per group.
+  // The hessian sketch is validated first with constant group weights and then with
+  // weights from GenerateRandomWeights; the latter result is compared (tolerance 1e-4)
+  // against DeviceSketch run on per-sample weights `w[group(i)] * hessian[i]`.
+  void TestLTR(Context const* ctx, bst_row_t n_samples, bst_bin_t n_bins,
+               std::size_t n_elements) const {
+    auto x = GenerateRandom(n_samples, n_features_);
+
+    // Every group but the last gets n_samples / n_groups_ rows; the last group absorbs
+    // the remainder. Derived from n_groups_ so the layout stays valid if it changes.
+    std::vector<bst_group_t> gptr(n_groups_ + 1, 0);
+    for (std::size_t i = 1; i < n_groups_; ++i) {
+      gptr[i] = n_samples / n_groups_ + gptr[i - 1];
+    }
+    gptr.back() = n_samples;
+
+    auto hessian = this->GenerateHessian(ctx, n_samples);
+    auto const& h_hess = hessian.ConstHostVector();
+    auto p_fmat = GetDMatrixFromData(x, n_samples, n_features_);
+    p_fmat->Info().group_ptr_ = gptr;
+
+    // test with constant group weight
+    std::vector<float> w(n_groups_, 1.0f);
+    p_fmat->Info().weights_.HostVector() = w;
+    HistogramCuts cuts_hess =
+        DeviceSketchWithHessian(ctx, p_fmat.get(), n_bins, hessian.ConstDeviceSpan(), n_elements);
+    // make validation easier by converting it into sample weight.
+    p_fmat->Info().weights_.HostVector() = h_hess;
+    p_fmat->Info().group_ptr_.clear();
+    ValidateCuts(cuts_hess, p_fmat.get(), n_bins);
+    // restore ltr properties
+    p_fmat->Info().weights_.HostVector() = w;
+    p_fmat->Info().group_ptr_ = gptr;
+
+    // test with random group weight
+    w = GenerateRandomWeights(n_groups_);
+    p_fmat->Info().weights_.HostVector() = w;
+    cuts_hess =
+        DeviceSketchWithHessian(ctx, p_fmat.get(), n_bins, hessian.ConstDeviceSpan(), n_elements);
+    // make validation easier by converting it into sample weight.
+    p_fmat->Info().weights_.HostVector() = h_hess;
+    p_fmat->Info().group_ptr_.clear();
+    ValidateCuts(cuts_hess, p_fmat.get(), n_bins);
+
+    // merge hessian with sample weight
+    p_fmat->Info().weights_.Resize(n_samples);
+    p_fmat->Info().group_ptr_.clear();
+    for (std::size_t i = 0; i < h_hess.size(); ++i) {
+      // Look up the group that sample i belongs to in the local copy of the group pointers.
+      auto gidx = dh::SegmentId(Span{gptr.data(), gptr.size()}, i);
+      p_fmat->Info().weights_.HostVector()[i] = w[gidx] * h_hess[i];
+    }
+    auto cuts = DeviceSketch(ctx, p_fmat.get(), n_bins, n_elements);
+    ValidateCuts(cuts, p_fmat.get(), n_bins);
+    ASSERT_EQ(cuts.Values().size(), cuts_hess.Values().size());
+    for (std::size_t i = 0; i < cuts.Values().size(); ++i) {
+      EXPECT_NEAR(cuts.Values()[i], cuts_hess.Values()[i], 1e-4f);
+    }
+  }
+
+  // Regression entry point: generate data, per-sample weights and a hessian, then
+  // delegate the comparison to CheckReg.
+  void TestRegression(Context const* ctx, bst_row_t n_samples, bst_bin_t n_bins,
+                      std::size_t n_elements) const {
+    auto x = GenerateRandom(n_samples, n_features_);
+    auto p_fmat = GetDMatrixFromData(x, n_samples, n_features_);
+    std::vector<float> w = GenerateRandomWeights(n_samples);
+
+    auto hessian = this->GenerateHessian(ctx, n_samples);
+
+    this->CheckReg(ctx, p_fmat, n_bins, hessian, w, n_elements);
+  }
+};
+
+// Enumerate every (is_ltr, n_samples, n_bins) combination used by the parameterized
+// test. For each sample/bin pair the LTR variant (true) is emitted before the
+// regression variant (false).
+auto MakeParamsForTest() {
+  std::vector<std::tuple<bool, bst_row_t, bst_bin_t>> params;
+  for (bst_row_t n_rows : std::vector<bst_row_t>{1, 2, 256, 512, 1000, 1500}) {
+    for (bst_bin_t max_bin : std::vector<bst_bin_t>{2, 16, 256, 512}) {
+      for (bool is_ltr : {true, false}) {
+        params.emplace_back(is_ltr, n_rows, max_bin);
+      }
+    }
+  }
+  return params;
+}
+}  // namespace
+
+// Run the LTR or the regression check, depending on the first tuple element, once
+// with n_elements = 0 and once with n_elements = 512.
+TEST_P(DeviceSketchWithHessianTest, DeviceSketchWithHessian) {
+  auto const [is_ltr, n_samples, n_bins] = GetParam();
+  for (std::size_t n_elements : {std::size_t{0}, std::size_t{512}}) {
+    if (is_ltr) {
+      this->TestLTR(&ctx_, n_samples, n_bins, n_elements);
+    } else {
+      this->TestRegression(&ctx_, n_samples, n_bins, n_elements);
+    }
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    HistUtil, DeviceSketchWithHessianTest, ::testing::ValuesIn(MakeParamsForTest()),
+    [](::testing::TestParamInfo<DeviceSketchWithHessianTest::ParamType> const& info) {
+      // Test names have the form <task>_<n_samples>_<n_bins>.
+      std::string name = std::get<0>(info.param) ? "ltr" : "reg";
+      name += "_";
+      name += std::to_string(std::get<1>(info.param));
+      name += "_";
+      name += std::to_string(std::get<2>(info.param));
+      return name;
+    });
+}  // namespace xgboost::common
--- a/tests/cpp/common/test_io.cc
+++ b/tests/cpp/common/test_io.cc
@@ -1,16 +1,16 @@
-/*!
- * Copyright (c) by XGBoost Contributors 2019
+/**
+ * Copyright 2019-2023, XGBoost Contributors
 */
 #include <gtest/gtest.h>

-#include <fstream>
+#include <cstddef>  // for size_t
+#include <fstream>  // for ofstream

 #include "../../../src/common/io.h"
-#include "../helpers.h"
 #include "../filesystem.h"  // dmlc::TemporaryDirectory
+#include "../helpers.h"

-namespace xgboost {
-namespace common {
+namespace xgboost::common {
 TEST(MemoryFixSizeBuffer, Seek) {
  size_t constexpr kSize { 64 };
  std::vector<int32_t> memory( kSize );
@@ -63,31 +63,159 @@ TEST(IO, LoadSequentialFile) {

  // Generate a JSON file.
  size_t constexpr kRows = 1000, kCols = 100;
-  std::shared_ptr<DMatrix> p_dmat{
-    RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true)};
-  std::unique_ptr<Learner> learner { Learner::Create({p_dmat}) };
+  std::shared_ptr<DMatrix> p_dmat{RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true)};
+  std::unique_ptr<Learner> learner{Learner::Create({p_dmat})};
  learner->SetParam("tree_method", "hist");
  learner->Configure();

  for (int32_t iter = 0; iter < 10; ++iter) {
    learner->UpdateOneIter(iter, p_dmat);
  }
-  Json out { Object() };
+  Json out{Object()};
  learner->SaveModel(&out);
-  std::string str;
+  std::vector<char> str;
  Json::Dump(out, &str);

  std::string tmpfile = tempdir.path + "/model.json";
  {
-    std::unique_ptr<dmlc::Stream> fo(
-        dmlc::Stream::Create(tmpfile.c_str(), "w"));
-    fo->Write(str.c_str(), str.size());
+    std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(tmpfile.c_str(), "w"));
+    fo->Write(str.data(), str.size());
  }

-  auto loaded = LoadSequentialFile(tmpfile, true);
+  auto loaded = LoadSequentialFile(tmpfile);
  ASSERT_EQ(loaded, str);
-
-  ASSERT_THROW(LoadSequentialFile("non-exist", true), dmlc::Error);
 }
-}  // namespace common
-}  // namespace xgboost
+
+// Exercises the resource handlers: construction and resizing of MallocResource, and
+// reading a value back through MmapResource.
+TEST(IO, Resource) {
+  {
+    // test malloc basic
+    std::size_t n = 128;
+    std::shared_ptr<ResourceHandler> resource = std::make_shared<MallocResource>(n);
+    ASSERT_EQ(resource->Size(), n);
+    ASSERT_EQ(resource->Type(), ResourceHandler::kMalloc);
+  }
+
+  // test malloc resize
+  // The flag only selects between Resize<true> and Resize<false>; both instantiations
+  // must yield the same observable contents.
+  auto test_malloc_resize = [](bool force_malloc) {
+    std::size_t n = 64;
+    std::shared_ptr<ResourceHandler> resource = std::make_shared<MallocResource>(n);
+    auto ptr = reinterpret_cast<std::uint8_t *>(resource->Data());
+    // Fill with 0, 1, ..., n - 1 so that preserved bytes can be recognised after resizing.
+    std::iota(ptr, ptr + n, 0);
+
+    auto malloc_resource = std::dynamic_pointer_cast<MallocResource>(resource);
+    ASSERT_TRUE(malloc_resource);
+    if (force_malloc) {
+      malloc_resource->Resize<true>(n * 2);
+    } else {
+      malloc_resource->Resize<false>(n * 2);
+    }
+    // The original n bytes must survive the resize.
+    for (std::size_t i = 0; i < n; ++i) {
+      ASSERT_EQ(malloc_resource->DataAs<std::uint8_t>()[i], i) << force_malloc;
+    }
+    // No fill value was passed, and the newly added bytes are expected to be zero.
+    for (std::size_t i = n; i < 2 * n; ++i) {
+      ASSERT_EQ(malloc_resource->DataAs<std::uint8_t>()[i], 0);
+    }
+
+    // Overwrite everything with 7, then grow again with an explicit fill byte of 3.
+    ptr = malloc_resource->DataAs<std::uint8_t>();
+    std::fill_n(ptr, malloc_resource->Size(), 7);
+    if (force_malloc) {
+      malloc_resource->Resize<true>(n * 3, std::byte{3});
+    } else {
+      malloc_resource->Resize<false>(n * 3, std::byte{3});
+    }
+    for (std::size_t i = 0; i < n * 2; ++i) {
+      ASSERT_EQ(malloc_resource->DataAs<std::uint8_t>()[i], 7);
+    }
+    for (std::size_t i = n * 2; i < n * 3; ++i) {
+      ASSERT_EQ(malloc_resource->DataAs<std::uint8_t>()[i], 3);
+    }
+  };
+  test_malloc_resize(true);
+  test_malloc_resize(false);
+
+  {
+    // test mmap
+    dmlc::TemporaryDirectory tmpdir;
+    auto path = tmpdir.path + "/testfile";
+
+    // The file holds the raw bytes of one double followed by some formatted text.
+    std::ofstream fout(path, std::ios::binary);
+    double val{1.0};
+    fout.write(reinterpret_cast<char const *>(&val), sizeof(val));
+    fout << 1.0 << std::endl;
+    fout.close();
+
+    // Request only sizeof(double) bytes; the reported size must match that, not the
+    // full file size.
+    auto resource = std::make_shared<MmapResource>(path, 0, sizeof(double));
+    ASSERT_EQ(resource->Size(), sizeof(double));
+    ASSERT_EQ(resource->Type(), ResourceHandler::kMmap);
+    ASSERT_EQ(resource->DataAs<double>()[0], val);
+  }
+}
+
+// Writes a sequence of variable-sized batches into one file, then reads each batch back
+// through PrivateMmapConstStream, once with Read() and once with Consume().
+TEST(IO, PrivateMmapStream) {
+  dmlc::TemporaryDirectory tempdir;
+  auto path = tempdir.path + "/testfile";
+
+  // The page size on Linux is usually set to 4096, while the allocation granularity on
+  // the Windows machine where this test is written is 65536. We span the test to cover
+  // all of them.
+  std::size_t n_batches{64};
+  std::size_t multiplier{2048};
+
+  std::vector<std::vector<std::int32_t>> batches;
+  std::vector<std::size_t> offset{0ul};
+
+  using T = std::int32_t;
+
+  {
+    std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
+    for (std::size_t i = 0; i < n_batches; ++i) {
+      // Batch i has (i + 1) * multiplier elements counting up from i * i.
+      std::size_t size = (i + 1) * multiplier;
+      std::vector<T> data(size, 0);
+      std::iota(data.begin(), data.end(), i * i);
+
+      // On-disk layout per batch: a 64-bit element count followed by the raw elements.
+      fo->Write(static_cast<std::uint64_t>(data.size()));
+      fo->Write(data.data(), data.size() * sizeof(T));
+
+      // Record the byte size of this batch; converted into offsets further below.
+      std::size_t bytes = sizeof(std::uint64_t) + data.size() * sizeof(T);
+      offset.push_back(bytes);
+
+      batches.emplace_back(std::move(data));
+    }
+  }
+
+  // Turn size into offset
+  std::partial_sum(offset.begin(), offset.end(), offset.begin());
+
+  // Test read
+  for (std::size_t i = 0; i < n_batches; ++i) {
+    // Start of batch i and its length in bytes.
+    std::size_t off = offset[i];
+    std::size_t n = offset.at(i + 1) - offset[i];
+    auto fi{std::make_unique<PrivateMmapConstStream>(path, off, n)};
+    std::vector<T> data;
+
+    std::uint64_t size{0};
+    ASSERT_TRUE(fi->Read(&size));
+    ASSERT_EQ(fi->Tell(), sizeof(size));
+    data.resize(size);
+
+    ASSERT_EQ(fi->Read(data.data(), size * sizeof(T)), size * sizeof(T));
+    ASSERT_EQ(data, batches[i]);
+  }
+
+  // Test consume
+  for (std::size_t i = 0; i < n_batches; ++i) {
+    std::size_t off = offset[i];
+    std::size_t n = offset.at(i + 1) - offset[i];
+    // Accessed through the AlignedResourceReadStream interface so that Consume() is used
+    // for the element count.
+    std::unique_ptr<AlignedResourceReadStream> fi{std::make_unique<PrivateMmapConstStream>(path, off, n)};
+    std::vector<T> data;
+
+    std::uint64_t size{0};
+    ASSERT_TRUE(fi->Consume(&size));
+    ASSERT_EQ(fi->Tell(), sizeof(size));
+    data.resize(size);
+
+    ASSERT_EQ(fi->Read(data.data(), size * sizeof(T)), sizeof(T) * size);
+    ASSERT_EQ(data, batches[i]);
+  }
+}
+}  // namespace xgboost::common
--- a/tests/cpp/common/test_json.cc
+++ b/tests/cpp/common/test_json.cc
@@ -41,7 +41,6 @@ std::string GetModelStr() {
    "num_class": "0",
    "num_feature": "10",
    "objective": "reg:linear",
-    "predictor": "gpu_predictor",
    "tree_method": "gpu_hist",
    "updater": "grow_gpu_hist"
  },
@@ -419,7 +418,7 @@ TEST(Json, AssigningString) {

 TEST(Json, LoadDump) {
  std::string ori_buffer = GetModelStr();
-  Json origin {Json::Load(StringView{ori_buffer.c_str(), ori_buffer.size()})};
+  Json origin{Json::Load(StringView{ori_buffer.c_str(), ori_buffer.size()})};

  dmlc::TemporaryDirectory tempdir;
  auto const& path = tempdir.path + "test_model_dump";
@@ -431,9 +430,9 @@ TEST(Json, LoadDump) {
  ASSERT_TRUE(fout);
  fout << out << std::flush;

-  std::string new_buffer = common::LoadSequentialFile(path);
+  std::vector<char> new_buffer = common::LoadSequentialFile(path);

-  Json load_back {Json::Load(StringView(new_buffer.c_str(), new_buffer.size()))};
+  Json load_back{Json::Load(StringView(new_buffer.data(), new_buffer.size()))};
  ASSERT_EQ(load_back, origin);
 }

@@ -652,7 +651,7 @@ TEST(UBJson, Basic) {
    }

    auto data = common::LoadSequentialFile("test.ubj");
-    UBJReader reader{StringView{data}};
+    UBJReader reader{StringView{data.data(), data.size()}};
    json = reader.Load();
    return json;
  };
--- a/tests/cpp/common/test_linalg.cc
+++ b/tests/cpp/common/test_linalg.cc
@@ -3,7 +3,7 @@
 */
 #include <gtest/gtest.h>
 #include <xgboost/context.h>
-#include <xgboost/host_device_vector.h>
+#include <xgboost/host_device_vector.h>  // for HostDeviceVector
 #include <xgboost/linalg.h>

 #include <cstddef>  // size_t
@@ -14,8 +14,8 @@

 namespace xgboost::linalg {
 namespace {
-auto kCpuId = Context::kCpuId;
-}
+// Shorthand for DeviceOrd::CPU(), used wherever these tests need the CPU device.
+DeviceOrd CPU() { return DeviceOrd::CPU(); }
+}  // namespace

 auto MakeMatrixFromTest(HostDeviceVector<float> *storage, std::size_t n_rows, std::size_t n_cols) {
  storage->Resize(n_rows * n_cols);
@@ -23,7 +23,7 @@ auto MakeMatrixFromTest(HostDeviceVector<float> *storage, std::size_t n_rows, st

  std::iota(h_storage.begin(), h_storage.end(), 0);

-  auto m = linalg::TensorView<float, 2>{h_storage, {n_rows, static_cast<size_t>(n_cols)}, -1};
+  auto m = linalg::TensorView<float, 2>{h_storage, {n_rows, static_cast<size_t>(n_cols)}, CPU()};
  return m;
 }

@@ -31,7 +31,7 @@ TEST(Linalg, MatrixView) {
  size_t kRows = 31, kCols = 77;
  HostDeviceVector<float> storage;
  auto m = MakeMatrixFromTest(&storage, kRows, kCols);
-  ASSERT_EQ(m.DeviceIdx(), kCpuId);
+  ASSERT_EQ(m.Device(), CPU());
  ASSERT_EQ(m(0, 0), 0);
  ASSERT_EQ(m(kRows - 1, kCols - 1), storage.Size() - 1);
 }
@@ -76,7 +76,7 @@ TEST(Linalg, TensorView) {

  {
    // as vector
-    TensorView<double, 1> vec{data, {data.size()}, -1};
+    TensorView<double, 1> vec{data, {data.size()}, CPU()};
    ASSERT_EQ(vec.Size(), data.size());
    ASSERT_EQ(vec.Shape(0), data.size());
    ASSERT_EQ(vec.Shape().size(), 1);
@@ -87,7 +87,7 @@ TEST(Linalg, TensorView) {

  {
    // as matrix
-    TensorView<double, 2> mat(data, {6, 4}, -1);
+    TensorView<double, 2> mat(data, {6, 4}, CPU());
    auto s = mat.Slice(2, All());
    ASSERT_EQ(s.Shape().size(), 1);
    s = mat.Slice(All(), 1);
@@ -96,7 +96,7 @@ TEST(Linalg, TensorView) {

  {
    // assignment
-    TensorView<double, 3> t{data, {2, 3, 4}, 0};
+    TensorView<double, 3> t{data, {2, 3, 4}, CPU()};
    double pi = 3.14159;
    auto old = t(1, 2, 3);
    t(1, 2, 3) = pi;
@@ -201,7 +201,7 @@ TEST(Linalg, TensorView) {
  }
  {
    // f-contiguous
-    TensorView<double, 3> t{data, {4, 3, 2}, {1, 4, 12}, kCpuId};
+    TensorView<double, 3> t{data, {4, 3, 2}, {1, 4, 12}, CPU()};
    ASSERT_TRUE(t.Contiguous());
    ASSERT_TRUE(t.FContiguous());
    ASSERT_FALSE(t.CContiguous());
@@ -210,11 +210,11 @@ TEST(Linalg, TensorView) {

 TEST(Linalg, Tensor) {
  {
-    Tensor<float, 3> t{{2, 3, 4}, kCpuId, Order::kC};
-    auto view = t.View(kCpuId);
+    Tensor<float, 3> t{{2, 3, 4}, CPU(), Order::kC};
+    auto view = t.View(CPU());

    auto const &as_const = t;
-    auto k_view = as_const.View(kCpuId);
+    auto k_view = as_const.View(CPU());

    size_t n = 2 * 3 * 4;
    ASSERT_EQ(t.Size(), n);
@@ -229,7 +229,7 @@ TEST(Linalg, Tensor) {
  }
  {
    // Reshape
-    Tensor<float, 3> t{{2, 3, 4}, kCpuId, Order::kC};
+    Tensor<float, 3> t{{2, 3, 4}, CPU(), Order::kC};
    t.Reshape(4, 3, 2);
    ASSERT_EQ(t.Size(), 24);
    ASSERT_EQ(t.Shape(2), 2);
@@ -247,7 +247,7 @@ TEST(Linalg, Tensor) {

 TEST(Linalg, Empty) {
  {
-    auto t = TensorView<double, 2>{{}, {0, 3}, kCpuId, Order::kC};
+    auto t = TensorView<double, 2>{{}, {0, 3}, CPU(), Order::kC};
    for (int32_t i : {0, 1, 2}) {
      auto s = t.Slice(All(), i);
      ASSERT_EQ(s.Size(), 0);
@@ -256,9 +256,9 @@ TEST(Linalg, Empty) {
    }
  }
  {
-    auto t = Tensor<double, 2>{{0, 3}, kCpuId, Order::kC};
+    auto t = Tensor<double, 2>{{0, 3}, CPU(), Order::kC};
    ASSERT_EQ(t.Size(), 0);
-    auto view = t.View(kCpuId);
+    auto view = t.View(CPU());

    for (int32_t i : {0, 1, 2}) {
      auto s = view.Slice(All(), i);
@@ -270,7 +270,7 @@ TEST(Linalg, Empty) {
 }

 TEST(Linalg, ArrayInterface) {
-  auto cpu = kCpuId;
+  auto cpu = CPU();
  auto t = Tensor<double, 2>{{3, 3}, cpu, Order::kC};
  auto v = t.View(cpu);
  std::iota(v.Values().begin(), v.Values().end(), 0);
@@ -315,16 +315,16 @@ TEST(Linalg, Popc) {
 }

 TEST(Linalg, Stack) {
-  Tensor<float, 3> l{{2, 3, 4}, kCpuId, Order::kC};
-  ElementWiseTransformHost(l.View(kCpuId), omp_get_max_threads(),
+  Tensor<float, 3> l{{2, 3, 4}, CPU(), Order::kC};
+  ElementWiseTransformHost(l.View(CPU()), omp_get_max_threads(),
                           [=](size_t i, float) { return i; });
-  Tensor<float, 3> r_0{{2, 3, 4}, kCpuId, Order::kC};
-  ElementWiseTransformHost(r_0.View(kCpuId), omp_get_max_threads(),
+  Tensor<float, 3> r_0{{2, 3, 4}, CPU(), Order::kC};
+  ElementWiseTransformHost(r_0.View(CPU()), omp_get_max_threads(),
                           [=](size_t i, float) { return i; });

  Stack(&l, r_0);

-  Tensor<float, 3> r_1{{0, 3, 4}, kCpuId, Order::kC};
+  Tensor<float, 3> r_1{{0, 3, 4}, CPU(), Order::kC};
  Stack(&l, r_1);
  ASSERT_EQ(l.Shape(0), 4);

@@ -335,7 +335,7 @@ TEST(Linalg, Stack) {
 TEST(Linalg, FOrder) {
  std::size_t constexpr kRows = 16, kCols = 3;
  std::vector<float> data(kRows * kCols);
-  MatrixView<float> mat{data, {kRows, kCols}, Context::kCpuId, Order::kF};
+  MatrixView<float> mat{data, {kRows, kCols}, CPU(), Order::kF};
  float k{0};
  for (std::size_t i = 0; i < kRows; ++i) {
    for (std::size_t j = 0; j < kCols; ++j) {
--- a/tests/cpp/common/test_linalg.cu
+++ b/tests/cpp/common/test_linalg.cu
@@ -8,23 +8,25 @@
 #elif defined(XGBOOST_USE_HIP)
 #include "../../../src/common/linalg_op.hip.h"
 #endif
+#include "../helpers.h"
 #include "xgboost/context.h"
 #include "xgboost/linalg.h"

 namespace xgboost::linalg {
 namespace {
 void TestElementWiseKernel() {
+  auto device = DeviceOrd::CUDA(0);
  Tensor<float, 3> l{{2, 3, 4}, 0};
  {
    /**
     * Non-contiguous
     */
    // GPU view
-    auto t = l.View(0).Slice(linalg::All(), 1, linalg::All());
+    auto t = l.View(device).Slice(linalg::All(), 1, linalg::All());
    ASSERT_FALSE(t.CContiguous());
    ElementWiseTransformDevice(t, [] __device__(size_t i, float) { return i; });
    // CPU view
-    t = l.View(Context::kCpuId).Slice(linalg::All(), 1, linalg::All());
+    t = l.View(DeviceOrd::CPU()).Slice(linalg::All(), 1, linalg::All());
    size_t k = 0;
    for (size_t i = 0; i < l.Shape(0); ++i) {
      for (size_t j = 0; j < l.Shape(2); ++j) {
@@ -32,7 +34,7 @@ void TestElementWiseKernel() {
      }
    }

-    t = l.View(0).Slice(linalg::All(), 1, linalg::All());
+    t = l.View(device).Slice(linalg::All(), 1, linalg::All());
    ElementWiseKernelDevice(t, [] XGBOOST_DEVICE(size_t i, float v) { SPAN_CHECK(v == i); });
  }

@@ -40,11 +42,11 @@ void TestElementWiseKernel() {
    /**
     * Contiguous
     */
-    auto t = l.View(0);
+    auto t = l.View(device);
    ElementWiseTransformDevice(t, [] XGBOOST_DEVICE(size_t i, float) { return i; });
    ASSERT_TRUE(t.CContiguous());
    // CPU view
-    t = l.View(Context::kCpuId);
+    t = l.View(DeviceOrd::CPU());

    size_t ind = 0;
    for (size_t i = 0; i < l.Shape(0); ++i) {
@@ -58,8 +60,7 @@ void TestElementWiseKernel() {
 }

 void TestSlice() {
-  Context ctx;
-  ctx.gpu_id = 1;
+  auto ctx = MakeCUDACtx(1);
  thrust::device_vector<double> data(2 * 3 * 4);
  auto t = MakeTensorView(&ctx, dh::ToSpan(data), 2, 3, 4);
  dh::LaunchN(1, [=] __device__(size_t) {
--- a/tests/cpp/common/test_quantile.cu
+++ b/tests/cpp/common/test_quantile.cu
@@ -1,15 +1,21 @@
+/**
+ * Copyright 2020-2023, XGBoost contributors
+ */
 #include <gtest/gtest.h>
-#include "test_quantile.h"
-#include "../helpers.h"
+
 #if defined(XGBOOST_USE_CUDA)
 #include "../../../src/collective/communicator-inl.cuh"
 #include "../../../src/common/hist_util.cuh"
 #include "../../../src/common/quantile.cuh"
+#include "../../../src/data/device_adapter.cuh"  // CupyAdapter
 #elif defined(XGBOOST_USE_HIP)
 #include "../../../src/collective/communicator-inl.hip.h"
 #include "../../../src/common/hist_util.hip.h"
 #include "../../../src/common/quantile.hip.h"
+#include "../../../src/data/device_adapter.hip.h"  // CupyAdapter
 #endif
+#include "../helpers.h"
+#include "test_quantile.h"

 namespace xgboost {
 namespace {
@@ -20,6 +26,9 @@ struct IsSorted {
 };
 }
 namespace common {
+
+// Fixture for the TEST_F quantile cases in this file that invoke DoTest(); it adds
+// nothing on top of BaseMGPUTest.
+class MGPUQuantileTest : public BaseMGPUTest {};
+
 TEST(GPUQuantile, Basic) {
  constexpr size_t kRows = 1000, kCols = 100, kBins = 256;
  HostDeviceVector<FeatureType> ft;
@@ -349,12 +358,11 @@ TEST(GPUQuantile, MultiMerge) {
 }

 namespace {
-void TestAllReduceBasic(int32_t n_gpus) {
+void TestAllReduceBasic() {
  auto const world = collective::GetWorldSize();
-  CHECK_EQ(world, n_gpus);
  constexpr size_t kRows = 1000, kCols = 100;
  RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
-    auto const device = collective::GetRank();
+    auto const device = GPUIDX;

    // Set up single node version;
    HostDeviceVector<FeatureType> ft({}, device);
@@ -398,7 +406,7 @@ void TestAllReduceBasic(int32_t n_gpus) {
    AdapterDeviceSketch(adapter.Value(), n_bins, info,
                        std::numeric_limits<float>::quiet_NaN(),
                        &sketch_distributed);
-    sketch_distributed.AllReduce();
+    sketch_distributed.AllReduce(false);
    sketch_distributed.Unique();

    ASSERT_EQ(sketch_distributed.ColumnsPtr().size(),
@@ -427,23 +435,66 @@ void TestAllReduceBasic(int32_t n_gpus) {
 }
 }  // anonymous namespace

-TEST(GPUQuantile, MGPUAllReduceBasic) {
-  auto const n_gpus = AllVisibleGPUs();
-  if (n_gpus <= 1) {
-    GTEST_SKIP() << "Skipping MGPUAllReduceBasic test with # GPUs = " << n_gpus;
-  }
-  RunWithInMemoryCommunicator(n_gpus, TestAllReduceBasic, n_gpus);
+TEST_F(MGPUQuantileTest, AllReduceBasic) {
+  DoTest(TestAllReduceBasic);
 }

 namespace {
-void TestSameOnAllWorkers(std::int32_t n_gpus) {
+// Sketches a column slice of a random DMatrix twice -- once while the collective
+// communicator is active and once after it has been finalized -- and compares the
+// resulting cut pointers, cut values and minimum values.
+void TestColumnSplitBasic() {
+  auto const world = collective::GetWorldSize();
+  auto const rank = collective::GetRank();
+  std::size_t constexpr kRows = 1000, kCols = 100, kBins = 64;
+
+  // NOTE(review): SliceCol(world, rank) presumably returns this worker's share of the
+  // columns -- confirm against DMatrix::SliceCol.
+  auto m = std::unique_ptr<DMatrix>{[=]() {
+    auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
+    return dmat->SliceCol(world, rank);
+  }()};
+
+  // Generate cuts for distributed environment.
+  auto ctx = MakeCUDACtx(GPUIDX);
+  HistogramCuts distributed_cuts = common::DeviceSketch(&ctx, m.get(), kBins);
+
+  // Generate cuts for single node environment
+  // After Finalize() the world size is expected to be 1; the same sliced matrix is
+  // sketched again.
+  collective::Finalize();
+  CHECK_EQ(collective::GetWorldSize(), 1);
+  HistogramCuts single_node_cuts = common::DeviceSketch(&ctx, m.get(), kBins);
+
+  auto const& sptrs = single_node_cuts.Ptrs();
+  auto const& dptrs = distributed_cuts.Ptrs();
+  auto const& svals = single_node_cuts.Values();
+  auto const& dvals = distributed_cuts.Values();
+  auto const& smins = single_node_cuts.MinValues();
+  auto const& dmins = distributed_cuts.MinValues();
+
+  // Cut pointers must match exactly.
+  EXPECT_EQ(sptrs.size(), dptrs.size());
+  for (size_t i = 0; i < sptrs.size(); ++i) {
+    EXPECT_EQ(sptrs[i], dptrs[i]) << "rank: " << rank << ", i: " << i;
+  }
+
+  // Cut values are only required to agree within an absolute tolerance of 2e-2.
+  EXPECT_EQ(svals.size(), dvals.size());
+  for (size_t i = 0; i < svals.size(); ++i) {
+    EXPECT_NEAR(svals[i], dvals[i], 2e-2f) << "rank: " << rank << ", i: " << i;
+  }
+
+  EXPECT_EQ(smins.size(), dmins.size());
+  for (size_t i = 0; i < smins.size(); ++i) {
+    EXPECT_FLOAT_EQ(smins[i], dmins[i]) << "rank: " << rank << ", i: " << i;
+  }
+}
+}  // anonymous namespace
+
+TEST_F(MGPUQuantileTest, ColumnSplitBasic) {
+  DoTest(TestColumnSplitBasic);
+}
+
+namespace {
+void TestSameOnAllWorkers() {
  auto world = collective::GetWorldSize();
-  CHECK_EQ(world, n_gpus);
  constexpr size_t kRows = 1000, kCols = 100;
  RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
                                 MetaInfo const &info) {
    auto const rank = collective::GetRank();
-    auto const device = rank;
+    auto const device = GPUIDX;
    HostDeviceVector<FeatureType> ft({}, device);
    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device);
    HostDeviceVector<float> storage({}, device);
@@ -455,7 +506,7 @@ void TestSameOnAllWorkers(std::int32_t n_gpus) {
    AdapterDeviceSketch(adapter.Value(), n_bins, info,
                        std::numeric_limits<float>::quiet_NaN(),
                        &sketch_distributed);
-    sketch_distributed.AllReduce();
+    sketch_distributed.AllReduce(false);
    sketch_distributed.Unique();
    TestQuantileElemRank(device, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true);

@@ -497,12 +548,8 @@ void TestSameOnAllWorkers(std::int32_t n_gpus) {
 }
 }  // anonymous namespace

-TEST(GPUQuantile, MGPUSameOnAllWorkers) {
-  auto const n_gpus = AllVisibleGPUs();
-  if (n_gpus <= 1) {
-    GTEST_SKIP() << "Skipping MGPUSameOnAllWorkers test with # GPUs = " << n_gpus;
-  }
-  RunWithInMemoryCommunicator(n_gpus, TestSameOnAllWorkers, n_gpus);
+TEST_F(MGPUQuantileTest, SameOnAllWorkers) {
+  DoTest(TestSameOnAllWorkers);
 }

 TEST(GPUQuantile, Push) {
--- a/tests/cpp/common/test_ranking_utils.cu
+++ b/tests/cpp/common/test_ranking_utils.cu
@@ -30,8 +30,7 @@

 namespace xgboost::ltr {
 void TestCalcQueriesInvIDCG() {
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  auto ctx = MakeCUDACtx(0);
  std::size_t n_groups = 5, n_samples_per_group = 32;

  dh::device_vector<float> scores(n_samples_per_group * n_groups);
@@ -49,7 +48,7 @@ void TestCalcQueriesInvIDCG() {
  p.UpdateAllowUnknown(Args{{"ndcg_exp_gain", "false"}});

  cuda_impl::CalcQueriesInvIDCG(&ctx, linalg::MakeTensorView(&ctx, d_scores, d_scores.size()),
-                                dh::ToSpan(group_ptr), inv_IDCG.View(ctx.gpu_id), p);
+                                dh::ToSpan(group_ptr), inv_IDCG.View(ctx.Device()), p);
  for (std::size_t i = 0; i < n_groups; ++i) {
    double inv_idcg = inv_IDCG(i);
    ASSERT_NEAR(inv_idcg, 0.00551782, kRtEps);
@@ -92,20 +91,17 @@ void TestRankingCache(Context const* ctx) {
 }  // namespace

 TEST(RankingCache, InitFromGPU) {
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  auto ctx = MakeCUDACtx(0);
  TestRankingCache(&ctx);
 }

 TEST(NDCGCache, InitFromGPU) {
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  auto ctx = MakeCUDACtx(0);
  TestNDCGCache(&ctx);
 }

 TEST(MAPCache, InitFromGPU) {
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  auto ctx = MakeCUDACtx(0);
  TestMAPCache(&ctx);
 }
 }  // namespace xgboost::ltr
--- a/tests/cpp/common/test_ref_resource_view.cc
+++ b/tests/cpp/common/test_ref_resource_view.cc
@@ -0,0 +1,118 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+
+#include <cstddef>  // for size_t
+#include <memory>   // for make_shared, make_unique
+#include <numeric>  // for iota
+#include <vector>   // for vector
+
+#include "../../../src/common/ref_resource_view.h"
+#include "dmlc/filesystem.h"  // for TemporaryDirectory
+
+namespace xgboost::common {
+// Checks construction, resource reference counting, fill value and element access.
+TEST(RefResourceView, Basic) {
+  std::size_t n_bytes = 1024;
+  auto mem = std::make_shared<MallocResource>(n_bytes);
+  {
+    RefResourceView view{reinterpret_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem};
+
+    RefResourceView kview{reinterpret_cast<float const*>(mem->Data()), mem->Size() / sizeof(float),
+                          mem};
+    // `mem` plus one reference for each of the two views.
+    ASSERT_EQ(mem.use_count(), 3);
+    ASSERT_EQ(view.size(), n_bytes / sizeof(float));
+    ASSERT_EQ(kview.size(), n_bytes / sizeof(float));
+  }
+  {
+    // The last constructor argument is the value every element is expected to start with.
+    RefResourceView view{reinterpret_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem,
+                         1.5f};
+    for (auto v : view) {
+      ASSERT_EQ(v, 1.5f);
+    }
+    std::iota(view.begin(), view.end(), 0.0f);
+    ASSERT_EQ(view.front(), 0.0f);
+    ASSERT_EQ(view.back(), static_cast<float>(view.size() - 1));
+
+    view.front() = 1.0f;
+    view.back() = 2.0f;
+    ASSERT_EQ(view.front(), 1.0f);
+    ASSERT_EQ(view.back(), 2.0f);
+  }
+  // Both views are gone; only `mem` should still reference the resource.
+  ASSERT_EQ(mem.use_count(), 1);
+}
+
+// Writes a std::size_t vector to a file and reads it back through a memory map.
+TEST(RefResourceView, IO) {
+  dmlc::TemporaryDirectory tmpdir;
+  auto path = tmpdir.path + "/testfile";
+  auto data = MakeFixedVecWithMalloc(123, std::size_t{1});
+
+  {
+    // Raw write of the payload alone.
+    auto fo = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
+    ASSERT_EQ(fo->Write(data.data(), data.size_bytes()), data.size_bytes());
+  }
+  {
+    // The file is reopened with "wb"; WriteVec should report the payload plus one size_type.
+    auto fo = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
+    ASSERT_EQ(WriteVec(fo.get(), data),
+              data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type));
+  }
+  {
+    auto fi = std::make_unique<PrivateMmapConstStream>(
+        path, 0, data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type));
+    // Start from a value that differs from the stored one so the check below cannot pass
+    // unless the contents really come from the file.
+    auto read = MakeFixedVecWithMalloc(123, std::size_t{2});
+    ASSERT_TRUE(ReadVec(fi.get(), &read));
+    for (auto v : read) {
+      ASSERT_EQ(v, 1ul);
+    }
+  }
+}
+
+// Same round trip with float elements, where the written size includes alignment padding.
+TEST(RefResourceView, IOAligned) {
+  dmlc::TemporaryDirectory tmpdir;
+  auto path = tmpdir.path + "/testfile";
+  auto data = MakeFixedVecWithMalloc(123, 1.0f);
+
+  {
+    auto fo = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
+    // + sizeof(float) for alignment
+    ASSERT_EQ(WriteVec(fo.get(), data),
+              data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type) + sizeof(float));
+  }
+  {
+    auto fi = std::make_unique<PrivateMmapConstStream>(
+        path, 0, data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type));
+    // wrong type, float vs. double
+    auto read = MakeFixedVecWithMalloc(123, 2.0);
+    ASSERT_FALSE(ReadVec(fi.get(), &read));
+  }
+  {
+    auto fi = std::make_unique<PrivateMmapConstStream>(
+        path, 0, data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type));
+    auto read = MakeFixedVecWithMalloc(123, 2.0f);
+    ASSERT_TRUE(ReadVec(fi.get(), &read));
+    for (auto v : read) {
+      ASSERT_EQ(v, 1.0f);
+    }
+  }
+  {
+    // Test std::vector
+    std::vector<float> vec(123);
+    std::iota(vec.begin(), vec.end(), 0.0f);
+    auto fo = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
+    // + sizeof(float) for alignment
+    ASSERT_EQ(WriteVec(fo.get(), vec), vec.size() * sizeof(float) +
+                                           sizeof(RefResourceView<std::size_t>::size_type) +
+                                           sizeof(float));
+  }
+}
+}  // namespace xgboost::common
--- a/tests/cpp/common/test_stats.cc
+++ b/tests/cpp/common/test_stats.cc
@@ -7,6 +7,7 @@

 #include "../../../src/common/stats.h"
 #include "../../../src/common/transform_iterator.h"  // common::MakeIndexTransformIter
+#include "../helpers.h"

 namespace xgboost {
 namespace common {
@@ -71,7 +72,7 @@ TEST(Stats, Median) {
    ASSERT_EQ(m, .5f);

 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
-    ctx.gpu_id = 0;
+    ctx = ctx.MakeCUDA(0);
    ASSERT_FALSE(ctx.IsCPU());
    Median(&ctx, values, weights, &out);
    m = out(0);
@@ -80,7 +81,7 @@ TEST(Stats, Median) {
  }

  {
-    ctx.gpu_id = Context::kCpuId;
+    ctx = ctx.MakeCPU();
    // 4x2 matrix
    linalg::Tensor<float, 2> values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.gpu_id};
    HostDeviceVector<float> weights;
@@ -90,7 +91,7 @@ TEST(Stats, Median) {
    ASSERT_EQ(out(1), .5f);

 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
-    ctx.gpu_id = 0;
+    ctx = ctx.MakeCUDA(0);
    Median(&ctx, values, weights, &out);
    ASSERT_EQ(out(0), .5f);
    ASSERT_EQ(out(1), .5f);
@@ -123,8 +124,7 @@ TEST(Stats, Mean) {

 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
 TEST(Stats, GPUMean) {
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  auto ctx = MakeCUDACtx(0);
  TestMean(&ctx);
 }
 #endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
--- a/tests/cpp/common/test_stats.cu
+++ b/tests/cpp/common/test_stats.cu
@@ -3,9 +3,9 @@
 */
 #include <gtest/gtest.h>

-#include <cstddef>                            // std::size_t
-#include <utility>                            // std::pair
-#include <vector>                             // std::vector
+#include <cstddef>  // std::size_t
+#include <utility>  // std::pair
+#include <vector>   // std::vector

 #if defined(XGBOOST_USE_CUDA)
 #include "../../../src/common/linalg_op.cuh"  // ElementWiseTransformDevice
@@ -14,10 +14,11 @@
 #include "../../../src/common/linalg_op.hip.h"  // ElementWiseTransformDevice
 #include "../../../src/common/stats.hip.h"
 #endif
-#include "xgboost/base.h"                     // XGBOOST_DEVICE
-#include "xgboost/context.h"                  // Context
-#include "xgboost/host_device_vector.h"       // HostDeviceVector
-#include "xgboost/linalg.h"                   // Tensor
+#include "../helpers.h"
+#include "xgboost/base.h"                // XGBOOST_DEVICE
+#include "xgboost/context.h"             // Context
+#include "xgboost/host_device_vector.h"  // HostDeviceVector
+#include "xgboost/linalg.h"              // Tensor

 namespace xgboost {
 namespace common {
@@ -38,7 +39,7 @@ class StatsGPU : public ::testing::Test {
  }

 public:
-  void SetUp() override { ctx_.gpu_id = 0; }
+  void SetUp() override { ctx_  = MakeCUDACtx(0); }

  void WeightedMulti() {
    // data for one segment
@@ -51,7 +52,7 @@ class StatsGPU : public ::testing::Test {
    data.insert(data.cend(), seg.begin(), seg.end());
    data.insert(data.cend(), seg.begin(), seg.end());
    linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, 0};
-    auto d_arr = arr.View(0);
+    auto d_arr = arr.View(DeviceOrd::CUDA(0));

    auto key_it = dh::MakeTransformIterator<std::size_t>(
        thrust::make_counting_iterator(0ul),
@@ -75,8 +76,8 @@ class StatsGPU : public ::testing::Test {
  }

  void Weighted() {
-    auto d_arr = arr_.View(0);
-    auto d_key = indptr_.View(0);
+    auto d_arr = arr_.View(DeviceOrd::CUDA(0));
+    auto d_key = indptr_.View(DeviceOrd::CUDA(0));

    auto key_it = dh::MakeTransformIterator<std::size_t>(
        thrust::make_counting_iterator(0ul),
@@ -85,7 +86,7 @@ class StatsGPU : public ::testing::Test {
        dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
                                         [=] XGBOOST_DEVICE(std::size_t i) { return d_arr(i); });
    linalg::Tensor<float, 1> weights{{10}, 0};
-    linalg::ElementWiseTransformDevice(weights.View(0),
+    linalg::ElementWiseTransformDevice(weights.View(DeviceOrd::CUDA(0)),
                                       [=] XGBOOST_DEVICE(std::size_t, float) { return 1.0; });
    auto w_it = weights.Data()->ConstDevicePointer();
    for (auto const& pair : TestSet{{0.0f, 1.0f}, {0.5f, 3.0f}, {1.0f, 5.0f}}) {
@@ -106,7 +107,7 @@ class StatsGPU : public ::testing::Test {
    data.insert(data.cend(), seg.begin(), seg.end());
    data.insert(data.cend(), seg.begin(), seg.end());
    linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, 0};
-    auto d_arr = arr.View(0);
+    auto d_arr = arr.View(DeviceOrd::CUDA(0));

    auto key_it = dh::MakeTransformIterator<std::size_t>(
        thrust::make_counting_iterator(0ul),
@@ -129,8 +130,8 @@ class StatsGPU : public ::testing::Test {
  }

  void NonWeighted() {
-    auto d_arr = arr_.View(0);
-    auto d_key = indptr_.View(0);
+    auto d_arr = arr_.View(DeviceOrd::CUDA(0));
+    auto d_key = indptr_.View(DeviceOrd::CUDA(0));

    auto key_it = dh::MakeTransformIterator<std::size_t>(
        thrust::make_counting_iterator(0ul), [=] __device__(std::size_t i) { return d_key(i); });
--- a/tests/cpp/data/test_array_interface.cc
+++ b/tests/cpp/data/test_array_interface.cc
@@ -22,7 +22,7 @@ TEST(ArrayInterface, Initialize) {

  HostDeviceVector<size_t> u64_storage(storage.Size());
  std::string u64_arr_str{ArrayInterfaceStr(linalg::TensorView<size_t const, 2>{
-      u64_storage.ConstHostSpan(), {kRows, kCols}, Context::kCpuId})};
+      u64_storage.ConstHostSpan(), {kRows, kCols}, DeviceOrd::CPU()})};
  std::copy(storage.ConstHostVector().cbegin(), storage.ConstHostVector().cend(),
            u64_storage.HostSpan().begin());
  auto u64_arr = ArrayInterface<2>{u64_arr_str};
--- a/tests/cpp/data/test_ellpack_page.cu
+++ b/tests/cpp/data/test_ellpack_page.cu
@@ -12,6 +12,7 @@
 #elif defined(XGBOOST_USE_HIP)
 #include "../../../src/data/ellpack_page.hip.h"
 #endif
+#include "../../../src/data/ellpack_page.h"
 #include "../../../src/tree/param.h"  // TrainParam
 #include "../helpers.h"
 #include "../histogram_helpers.h"
--- a/tests/cpp/data/test_ellpack_page_raw_format.cu
+++ b/tests/cpp/data/test_ellpack_page_raw_format.cu
@@ -5,8 +5,10 @@
 #include <xgboost/data.h>

 #if defined(XGBOOST_USE_CUDA)
+#include "../../../src/common/io.h"  // for PrivateMmapConstStream, AlignedResourceReadStream...
 #include "../../../src/data/ellpack_page.cuh"
 #elif defined(XGBOOST_USE_HIP)
+#include "../../../src/common/io.h"  // for PrivateMmapConstStream, AlignedResourceReadStream...
 #include "../../../src/data/ellpack_page.hip.h"
 #endif
 #include "../../../src/data/sparse_page_source.h"
@@ -14,8 +16,7 @@
 #include "../filesystem.h"            // dmlc::TemporaryDirectory
 #include "../helpers.h"

-namespace xgboost {
-namespace data {
+namespace xgboost::data {
 TEST(EllpackPageRawFormat, IO) {
  Context ctx{MakeCUDACtx(0)};
  auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
@@ -26,15 +27,17 @@ TEST(EllpackPageRawFormat, IO) {
  dmlc::TemporaryDirectory tmpdir;
  std::string path = tmpdir.path + "/ellpack.page";

+  std::size_t n_bytes{0};
  {
-    std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
+    auto fo = std::make_unique<common::AlignedFileWriteStream>(StringView{path}, "wb");
    for (auto const &ellpack : m->GetBatches<EllpackPage>(&ctx, param)) {
-      format->Write(ellpack, fo.get());
+      n_bytes += format->Write(ellpack, fo.get());
    }
  }

  EllpackPage page;
-  std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(path.c_str())};
+  std::unique_ptr<common::AlignedResourceReadStream> fi{
+      std::make_unique<common::PrivateMmapConstStream>(path.c_str(), 0, n_bytes)};
  format->Read(&page, fi.get());

  for (auto const &ellpack : m->GetBatches<EllpackPage>(&ctx, param)) {
@@ -48,5 +51,4 @@ TEST(EllpackPageRawFormat, IO) {
    ASSERT_EQ(loaded->gidx_buffer.HostVector(), orig->gidx_buffer.HostVector());
  }
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
--- a/tests/cpp/data/test_gradient_index.cc
+++ b/tests/cpp/data/test_gradient_index.cc
@@ -26,28 +26,32 @@
 #include "xgboost/context.h"                    // for Context
 #include "xgboost/host_device_vector.h"         // for HostDeviceVector

-namespace xgboost {
-namespace data {
-TEST(GradientIndex, ExternalMemory) {
+namespace xgboost::data {
+TEST(GradientIndex, ExternalMemoryBaseRowID) {
  Context ctx;
-  std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(10000);
+  auto p_fmat = RandomDataGenerator{4096, 256, 0.5}
+                    .Device(ctx.gpu_id)
+                    .Batches(8)
+                    .GenerateSparsePageDMatrix("cache", true);
+
  std::vector<size_t> base_rowids;
-  std::vector<float> hessian(dmat->Info().num_row_, 1);
-  for (auto const &page : dmat->GetBatches<GHistIndexMatrix>(&ctx, {64, hessian, true})) {
+  std::vector<float> hessian(p_fmat->Info().num_row_, 1);
+  for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(&ctx, {64, hessian, true})) {
    base_rowids.push_back(page.base_rowid);
  }
-  size_t i = 0;
-  for (auto const &page : dmat->GetBatches<SparsePage>()) {
+
+  std::size_t i = 0;
+  for (auto const &page : p_fmat->GetBatches<SparsePage>()) {
    ASSERT_EQ(base_rowids[i], page.base_rowid);
    ++i;
  }

  base_rowids.clear();
-  for (auto const &page : dmat->GetBatches<GHistIndexMatrix>(&ctx, {64, hessian, false})) {
+  for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(&ctx, {64, hessian, false})) {
    base_rowids.push_back(page.base_rowid);
  }
  i = 0;
-  for (auto const &page : dmat->GetBatches<SparsePage>()) {
+  for (auto const &page : p_fmat->GetBatches<SparsePage>()) {
    ASSERT_EQ(base_rowids[i], page.base_rowid);
    ++i;
  }
@@ -167,11 +171,10 @@ class GHistIndexMatrixTest : public testing::TestWithParam<std::tuple<float, flo
    ASSERT_TRUE(Xy->SingleColBlock());
    bst_bin_t constexpr kBins{17};
    auto p = BatchParam{kBins, threshold};
-    Context gpu_ctx;
-    gpu_ctx.gpu_id = 0;
+    auto gpu_ctx = MakeCUDACtx(0);
    for (auto const &page : Xy->GetBatches<EllpackPage>(
             &gpu_ctx, BatchParam{kBins, tree::TrainParam::DftSparseThreshold()})) {
-      from_ellpack.reset(new GHistIndexMatrix{&ctx, Xy->Info(), page, p});
+      from_ellpack = std::make_unique<GHistIndexMatrix>(&ctx, Xy->Info(), page, p);
    }

    for (auto const &from_sparse_page : Xy->GetBatches<GHistIndexMatrix>(&ctx, p)) {
@@ -199,13 +202,15 @@ class GHistIndexMatrixTest : public testing::TestWithParam<std::tuple<float, flo

      std::string from_sparse_buf;
      {
-        common::MemoryBufferStream fo{&from_sparse_buf};
-        columns_from_sparse.Write(&fo);
+        common::AlignedMemWriteStream fo{&from_sparse_buf};
+        auto n_bytes = columns_from_sparse.Write(&fo);
+        ASSERT_EQ(fo.Tell(), n_bytes);
      }
      std::string from_ellpack_buf;
      {
-        common::MemoryBufferStream fo{&from_ellpack_buf};
-        columns_from_sparse.Write(&fo);
+        common::AlignedMemWriteStream fo{&from_ellpack_buf};
+        auto n_bytes = columns_from_sparse.Write(&fo);
+        ASSERT_EQ(fo.Tell(), n_bytes);
      }
      ASSERT_EQ(from_sparse_buf, from_ellpack_buf);
    }
@@ -228,6 +233,5 @@ INSTANTIATE_TEST_SUITE_P(GHistIndexMatrix, GHistIndexMatrixTest,
                                         std::make_tuple(.5f, .6),    // sparse columns
                                         std::make_tuple(.6f, .4)));  // dense columns

-#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
-}  // namespace data
-}  // namespace xgboost
+#endif  // defined(XGBOOST_USE_CUDA)
+}  // namespace xgboost::data
--- a/tests/cpp/data/test_gradient_index_page_raw_format.cc
+++ b/tests/cpp/data/test_gradient_index_page_raw_format.cc
@@ -2,14 +2,18 @@
 * Copyright 2021-2023, XGBoost contributors
 */
 #include <gtest/gtest.h>
+#include <xgboost/context.h>  // for Context
+
+#include <cstddef>  // for size_t
+#include <memory>   // for unique_ptr

 #include "../../../src/common/column_matrix.h"
-#include "../../../src/data/gradient_index.h"
+#include "../../../src/common/io.h"            // for MmapResource, AlignedResourceReadStream...
+#include "../../../src/data/gradient_index.h"  // for GHistIndexMatrix
 #include "../../../src/data/sparse_page_source.h"
-#include "../helpers.h"
+#include "../helpers.h"  // for RandomDataGenerator

-namespace xgboost {
-namespace data {
+namespace xgboost::data {
 TEST(GHistIndexPageRawFormat, IO) {
  Context ctx;

@@ -20,15 +24,18 @@ TEST(GHistIndexPageRawFormat, IO) {
  std::string path = tmpdir.path + "/ghistindex.page";
  auto batch = BatchParam{256, 0.5};

+  std::size_t bytes{0};
  {
-    std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
+    auto fo = std::make_unique<common::AlignedFileWriteStream>(StringView{path}, "wb");
    for (auto const &index : m->GetBatches<GHistIndexMatrix>(&ctx, batch)) {
-      format->Write(index, fo.get());
+      bytes += format->Write(index, fo.get());
    }
  }

  GHistIndexMatrix page;
-  std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(path.c_str())};
+
+  std::unique_ptr<common::AlignedResourceReadStream> fi{
+      std::make_unique<common::PrivateMmapConstStream>(path, 0, bytes)};
  format->Read(&page, fi.get());

  for (auto const &gidx : m->GetBatches<GHistIndexMatrix>(&ctx, batch)) {
@@ -37,6 +44,8 @@ TEST(GHistIndexPageRawFormat, IO) {
    ASSERT_EQ(loaded.cut.MinValues(), page.cut.MinValues());
    ASSERT_EQ(loaded.cut.Values(), page.cut.Values());
    ASSERT_EQ(loaded.base_rowid, page.base_rowid);
+    ASSERT_EQ(loaded.row_ptr.size(), page.row_ptr.size());
+    ASSERT_TRUE(std::equal(loaded.row_ptr.cbegin(), loaded.row_ptr.cend(), page.row_ptr.cbegin()));
    ASSERT_EQ(loaded.IsDense(), page.IsDense());
    ASSERT_TRUE(std::equal(loaded.index.begin(), loaded.index.end(), page.index.begin()));
    ASSERT_TRUE(std::equal(loaded.index.Offset(), loaded.index.Offset() + loaded.index.OffsetSize(),
@@ -45,5 +54,4 @@ TEST(GHistIndexPageRawFormat, IO) {
    ASSERT_EQ(loaded.Transpose().GetTypeSize(), loaded.Transpose().GetTypeSize());
  }
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
--- a/tests/cpp/data/test_iterative_dmatrix.cc
+++ b/tests/cpp/data/test_iterative_dmatrix.cc
@@ -12,8 +12,7 @@
 #include "../helpers.h"
 #include "xgboost/data.h"  // DMatrix

-namespace xgboost {
-namespace data {
+namespace xgboost::data {
 TEST(IterativeDMatrix, Ref) {
  Context ctx;
  TestRefDMatrix<GHistIndexMatrix, NumpyArrayIterForTest>(
@@ -21,7 +20,7 @@ TEST(IterativeDMatrix, Ref) {
 }

 TEST(IterativeDMatrix, IsDense) {
-  int n_bins = 16;
+  bst_bin_t n_bins = 16;
  auto test = [n_bins](float sparsity) {
    NumpyArrayIterForTest iter(sparsity);
    auto n_threads = 0;
@@ -38,5 +37,4 @@ TEST(IterativeDMatrix, IsDense) {
  test(0.1);
  test(1.0);
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
--- a/tests/cpp/data/test_iterative_dmatrix.cu
+++ b/tests/cpp/data/test_iterative_dmatrix.cu
@@ -10,6 +10,7 @@
 #include "../../../src/data/device_adapter.hip.h"
 #include "../../../src/data/ellpack_page.hip.h"
 #endif
+#include "../../../src/data/ellpack_page.h"
 #include "../../../src/data/iterative_dmatrix.h"
 #include "../../../src/tree/param.h"  // TrainParam
 #include "../helpers.h"
--- a/tests/cpp/data/test_metainfo.cc
+++ b/tests/cpp/data/test_metainfo.cc
@@ -129,8 +129,8 @@ TEST(MetaInfo, SaveLoadBinary) {
    EXPECT_EQ(inforead.group_ptr_, info.group_ptr_);
    EXPECT_EQ(inforead.weights_.HostVector(), info.weights_.HostVector());

-    auto orig_margin = info.base_margin_.View(xgboost::Context::kCpuId);
-    auto read_margin = inforead.base_margin_.View(xgboost::Context::kCpuId);
+    auto orig_margin = info.base_margin_.View(xgboost::DeviceOrd::CPU());
+    auto read_margin = inforead.base_margin_.View(xgboost::DeviceOrd::CPU());
    EXPECT_TRUE(std::equal(orig_margin.Values().cbegin(), orig_margin.Values().cend(),
                           read_margin.Values().cbegin()));

@@ -267,8 +267,8 @@ TEST(MetaInfo, Validate) {
  xgboost::HostDeviceVector<xgboost::bst_group_t> d_groups{groups};
  d_groups.SetDevice(0);
  d_groups.DevicePointer();  // pull to device
-  std::string arr_interface_str{ArrayInterfaceStr(
-      xgboost::linalg::MakeVec(d_groups.ConstDevicePointer(), d_groups.Size(), 0))};
+  std::string arr_interface_str{ArrayInterfaceStr(xgboost::linalg::MakeVec(
+      d_groups.ConstDevicePointer(), d_groups.Size(), xgboost::DeviceOrd::CUDA(0)))};
  EXPECT_THROW(info.SetInfo(ctx, "group", xgboost::StringView{arr_interface_str}), dmlc::Error);
 #endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
 }
@@ -307,5 +307,5 @@ TEST(MetaInfo, HostExtend) {
 }

 namespace xgboost {
-TEST(MetaInfo, CPUStridedData) { TestMetaInfoStridedData(Context::kCpuId); }
+TEST(MetaInfo, CPUStridedData) { TestMetaInfoStridedData(DeviceOrd::CPU()); }
 }  // namespace xgboost
--- a/tests/cpp/data/test_metainfo.cu
+++ b/tests/cpp/data/test_metainfo.cu
@@ -74,7 +74,7 @@ TEST(MetaInfo, FromInterface) {
  }

  info.SetInfo(ctx, "base_margin", str.c_str());
-  auto const h_base_margin = info.base_margin_.View(Context::kCpuId);
+  auto const h_base_margin = info.base_margin_.View(DeviceOrd::CPU());
  ASSERT_EQ(h_base_margin.Size(), d_data.size());
  for (size_t i = 0; i < d_data.size(); ++i) {
    ASSERT_EQ(h_base_margin(i), d_data[i]);
@@ -92,7 +92,7 @@ TEST(MetaInfo, FromInterface) {
 }

 TEST(MetaInfo, GPUStridedData) {
-  TestMetaInfoStridedData(0);
+  TestMetaInfoStridedData(DeviceOrd::CUDA(0));
 }

 TEST(MetaInfo, Group) {
--- a/tests/cpp/data/test_metainfo.h
+++ b/tests/cpp/data/test_metainfo.h
@@ -14,10 +14,10 @@
 #include "../../../src/data/array_interface.h"

 namespace xgboost {
-inline void TestMetaInfoStridedData(int32_t device) {
+inline void TestMetaInfoStridedData(DeviceOrd device) {
  MetaInfo info;
  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"gpu_id", std::to_string(device)}});
+  ctx.UpdateAllowUnknown(Args{{"device", device.Name()}});
  {
    // labels
    linalg::Tensor<float, 3> labels;
@@ -28,9 +28,9 @@ inline void TestMetaInfoStridedData(int32_t device) {
    ASSERT_EQ(t_labels.Shape().size(), 2);

    info.SetInfo(ctx, "label", StringView{ArrayInterfaceStr(t_labels)});
-    auto const& h_result = info.labels.View(-1);
+    auto const& h_result = info.labels.View(DeviceOrd::CPU());
    ASSERT_EQ(h_result.Shape().size(), 2);
-    auto in_labels = labels.View(-1);
+    auto in_labels = labels.View(DeviceOrd::CPU());
    linalg::ElementWiseKernelHost(h_result, omp_get_max_threads(), [&](size_t i, float& v_0) {
      auto tup = linalg::UnravelIndex(i, h_result.Shape());
      auto i0 = std::get<0>(tup);
@@ -62,9 +62,9 @@ inline void TestMetaInfoStridedData(int32_t device) {
    ASSERT_EQ(t_margin.Shape().size(), 2);

    info.SetInfo(ctx, "base_margin", StringView{ArrayInterfaceStr(t_margin)});
-    auto const& h_result = info.base_margin_.View(-1);
+    auto const& h_result = info.base_margin_.View(DeviceOrd::CPU());
    ASSERT_EQ(h_result.Shape().size(), 2);
-    auto in_margin = base_margin.View(-1);
+    auto in_margin = base_margin.View(DeviceOrd::CPU());
    linalg::ElementWiseKernelHost(h_result, omp_get_max_threads(), [&](size_t i, float v_0) {
      auto tup = linalg::UnravelIndex(i, h_result.Shape());
      auto i0 = std::get<0>(tup);
--- a/tests/cpp/data/test_simple_dmatrix.cc
+++ b/tests/cpp/data/test_simple_dmatrix.cc
@@ -298,8 +298,8 @@ TEST(SimpleDMatrix, Slice) {
        ASSERT_EQ(p_m->Info().weights_.HostVector().at(ridx),
                  out->Info().weights_.HostVector().at(i));

-        auto out_margin = out->Info().base_margin_.View(Context::kCpuId);
-        auto in_margin = margin.View(Context::kCpuId);
+        auto out_margin = out->Info().base_margin_.View(DeviceOrd::CPU());
+        auto in_margin = margin.View(DeviceOrd::CPU());
        for (size_t j = 0; j < kClasses; ++j) {
          ASSERT_EQ(out_margin(i, j), in_margin(ridx, j));
        }
@@ -372,8 +372,8 @@ TEST(SimpleDMatrix, SliceCol) {
                    out->Info().labels_upper_bound_.HostVector().at(i));
          ASSERT_EQ(p_m->Info().weights_.HostVector().at(i), out->Info().weights_.HostVector().at(i));

-          auto out_margin = out->Info().base_margin_.View(Context::kCpuId);
-          auto in_margin = margin.View(Context::kCpuId);
+          auto out_margin = out->Info().base_margin_.View(DeviceOrd::CPU());
+          auto in_margin = margin.View(DeviceOrd::CPU());
          for (size_t j = 0; j < kClasses; ++j) {
            ASSERT_EQ(out_margin(i, j), in_margin(i, j));
          }
--- a/tests/cpp/data/test_sparse_page_dmatrix.cc
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cc
@@ -76,9 +76,11 @@ TEST(SparsePageDMatrix, LoadFile) {
 // allow caller to retain pages so they can process multiple pages at the same time.
 template <typename Page>
 void TestRetainPage() {
-  auto m = CreateSparsePageDMatrix(10000);
+  std::size_t n_batches = 4;
+  auto p_fmat = RandomDataGenerator{1024, 128, 0.5f}.Batches(n_batches).GenerateSparsePageDMatrix(
+      "cache", true);
  Context ctx;
-  auto batches = m->GetBatches<Page>(&ctx);
+  auto batches = p_fmat->GetBatches<Page>(&ctx);
  auto begin = batches.begin();
  auto end = batches.end();

@@ -94,7 +96,7 @@ void TestRetainPage() {
    }
    ASSERT_EQ(pages.back().Size(), (*it).Size());
  }
-  ASSERT_GE(iterators.size(), 2);
+  ASSERT_GE(iterators.size(), n_batches);

  for (size_t i = 0; i < iterators.size(); ++i) {
    ASSERT_EQ((*iterators[i]).Size(), pages.at(i).Size());
@@ -102,7 +104,7 @@ void TestRetainPage() {
  }

  // make sure it's const and the caller can not modify the content of page.
-  for (auto &page : m->GetBatches<Page>({&ctx})) {
+  for (auto &page : p_fmat->GetBatches<Page>({&ctx})) {
    static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value);
  }
 }
@@ -248,7 +250,7 @@ auto TestSparsePageDMatrixDeterminism(int32_t threads) {

  auto cache_name =
      data::MakeId(filename, dynamic_cast<data::SparsePageDMatrix *>(sparse.get())) + ".row.page";
-  std::string cache = common::LoadSequentialFile(cache_name);
+  auto cache = common::LoadSequentialFile(cache_name);
  return cache;
 }

@@ -256,7 +258,7 @@ TEST(SparsePageDMatrix, Determinism) {
 #if defined(_MSC_VER)
  return;
 #endif  // defined(_MSC_VER)
-  std::vector<std::string> caches;
+  std::vector<std::vector<char>> caches;
  for (size_t i = 1; i < 18; i += 2) {
    caches.emplace_back(TestSparsePageDMatrixDeterminism(i));
  }
--- a/tests/cpp/data/test_sparse_page_dmatrix.cu
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cu
@@ -9,6 +9,7 @@
 #elif defined(XGBOOST_USE_HIP)
 #include "../../../src/data/ellpack_page.hip.h"
 #endif
+#include "../../../src/data/ellpack_page.h"
 #include "../../../src/data/sparse_page_dmatrix.h"
 #include "../../../src/tree/param.h"  // TrainParam
 #include "../filesystem.h"            // dmlc::TemporaryDirectory
--- a/tests/cpp/data/test_sparse_page_raw_format.cc
+++ b/tests/cpp/data/test_sparse_page_raw_format.cc
@@ -2,20 +2,20 @@
 * Copyright 2021-2023, XGBoost contributors
 */
 #include <gtest/gtest.h>
-#include <xgboost/data.h>                          // for CSCPage, SortedCSCPage, SparsePage
+#include <xgboost/data.h>  // for CSCPage, SortedCSCPage, SparsePage

-#include <memory>                                  // for allocator, unique_ptr, __shared_ptr_ac...
-#include <string>                                  // for char_traits, operator+, basic_string
+#include <memory>  // for allocator, unique_ptr, __shared_ptr_ac...
+#include <string>  // for char_traits, operator+, basic_string

+#include "../../../src/common/io.h"  // for PrivateMmapConstStream, AlignedResourceReadStream...
 #include "../../../src/data/sparse_page_writer.h"  // for CreatePageFormat
 #include "../helpers.h"                            // for RandomDataGenerator
 #include "dmlc/filesystem.h"                       // for TemporaryDirectory
-#include "dmlc/io.h"                               // for SeekStream, Stream
+#include "dmlc/io.h"                               // for Stream
 #include "gtest/gtest_pred_impl.h"                 // for Test, AssertionResult, ASSERT_EQ, TEST
 #include "xgboost/context.h"                       // for Context

-namespace xgboost {
-namespace data {
+namespace xgboost::data {
 template <typename S> void TestSparsePageRawFormat() {
  std::unique_ptr<SparsePageFormat<S>> format{CreatePageFormat<S>("raw")};
  Context ctx;
@@ -25,17 +25,19 @@ template <typename S> void TestSparsePageRawFormat() {
  dmlc::TemporaryDirectory tmpdir;
  std::string path = tmpdir.path + "/sparse.page";
  S orig;
+  std::size_t n_bytes{0};
  {
    // block code to flush the stream
-    std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
+    auto fo = std::make_unique<common::AlignedFileWriteStream>(StringView{path}, "wb");
    for (auto const &page : m->GetBatches<S>(&ctx)) {
      orig.Push(page);
-      format->Write(page, fo.get());
+      n_bytes = format->Write(page, fo.get());
    }
  }

  S page;
-  std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(path.c_str())};
+  std::unique_ptr<common::AlignedResourceReadStream> fi{
+      std::make_unique<common::PrivateMmapConstStream>(path.c_str(), 0, n_bytes)};
  format->Read(&page, fi.get());
  for (size_t i = 0; i < orig.data.Size(); ++i) {
    ASSERT_EQ(page.data.HostVector()[i].fvalue,
@@ -59,5 +61,4 @@ TEST(SparsePageRawFormat, CSCPage) {
 TEST(SparsePageRawFormat, SortedCSCPage) {
  TestSparsePageRawFormat<SortedCSCPage>();
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
--- a/tests/cpp/gbm/test_gblinear.cu
+++ b/tests/cpp/gbm/test_gblinear.cu
@@ -0,0 +1,51 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+#include <xgboost/global_config.h>  // for GlobalConfigThreadLocalStore
+#include <xgboost/json.h>           // for Json, Object
+#include <xgboost/learner.h>        // for Learner
+
+#include <algorithm>  // for transform
+#include <cctype>     // for toupper
+#include <cstdint>    // for int32_t
+#include <memory>     // for unique_ptr
+#include <string>     // for string
+#include <utility>    // for swap
+
+#include "../helpers.h"  // for RandomDataGenerator
+
+namespace xgboost {
+// Trains gblinear with the coord_descent updater on each device and expects the upper-cased
+// device name to show up in what is logged to stderr while the configuration is saved.
+TEST(GBlinear, DispatchUpdater) {
+  // Raise the verbosity for the duration of the test; the swap below keeps the old value.
+  auto verbosity = 3;
+  std::swap(GlobalConfigThreadLocalStore::Get()->verbosity, verbosity);
+
+  auto test = [](std::string device) {
+    auto p_fmat = RandomDataGenerator{10, 10, 0.0f}.GenerateDMatrix(true);
+    std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
+    learner->SetParams(
+        Args{{"booster", "gblinear"}, {"updater", "coord_descent"}, {"device", device}});
+    learner->Configure();
+    for (std::int32_t iter = 0; iter < 3; ++iter) {
+      learner->UpdateOneIter(iter, p_fmat);
+    }
+    Json config{Object{}};
+    ::testing::internal::CaptureStderr();
+    learner->SaveConfig(&config);
+    auto str = ::testing::internal::GetCapturedStderr();
+    // std::toupper requires a value representable as unsigned char.
+    std::transform(device.cbegin(), device.cend(), device.begin(), [](unsigned char c) {
+      return static_cast<char>(std::toupper(c));
+    });
+    ASSERT_NE(str.find(device), std::string::npos);
+  };
+  test("cpu");
+  test("gpu");
+
+  // Put the previous verbosity back.
+  std::swap(GlobalConfigThreadLocalStore::Get()->verbosity, verbosity);
+}
+}  // namespace xgboost
--- a/tests/cpp/gbm/test_gbtree.cc
+++ b/tests/cpp/gbm/test_gbtree.cc
@@ -1,17 +1,22 @@
-/*!
- * Copyright 2019-2022 XGBoost contributors
+/**
+ * Copyright 2019-2023, XGBoost contributors
 */
 #include <gtest/gtest.h>
 #include <xgboost/context.h>
+#include <xgboost/host_device_vector.h>  // for HostDeviceVector
+#include <xgboost/json.h>                // for Json, Object
+#include <xgboost/learner.h>             // for Learner

-#include "../../../src/data/adapter.h"
-#include "../../../src/data/proxy_dmatrix.h"
+#include <limits>    // for numeric_limits
+#include <memory>    // for shared_ptr
+#include <optional>  // for optional
+#include <string>    // for string
+
+#include "../../../src/data/proxy_dmatrix.h"  // for DMatrixProxy
 #include "../../../src/gbm/gbtree.h"
 #include "../filesystem.h"  // dmlc::TemporaryDirectory
 #include "../helpers.h"
 #include "xgboost/base.h"
-#include "xgboost/host_device_vector.h"
-#include "xgboost/learner.h"
 #include "xgboost/predictor.h"

 namespace xgboost {
@@ -60,7 +65,9 @@ TEST(GBTree, PredictionCache) {

  gbtree.Configure({{"tree_method", "hist"}});
  auto p_m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
-  auto gpair = GenerateRandomGradients(kRows);
+  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Ordinal());
+  gpair.Data()->Copy(GenerateRandomGradients(kRows));
+
  PredictionCacheEntry out_predictions;
  gbtree.DoBoost(p_m.get(), &gpair, &out_predictions, nullptr);

@@ -113,12 +120,11 @@ TEST(GBTree, WrongUpdater) {
 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
 TEST(GBTree, ChoosePredictor) {
  // The test ensures data don't get pulled into device.
-  size_t constexpr kRows = 17;
-  size_t constexpr kCols = 15;
+  std::size_t constexpr kRows = 17, kCols = 15;

  auto p_dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();

-  auto& data = (*(p_dmat->GetBatches<SparsePage>().begin())).data;
+  auto const& data = (*(p_dmat->GetBatches<SparsePage>().begin())).data;
  p_dmat->Info().labels.Reshape(kRows);

  auto learner = std::unique_ptr<Learner>(Learner::Create({p_dmat}));
@@ -127,14 +133,13 @@ TEST(GBTree, ChoosePredictor) {
    learner->UpdateOneIter(i, p_dmat);
  }
  ASSERT_TRUE(data.HostCanWrite());
+
  dmlc::TemporaryDirectory tempdir;
  const std::string fname = tempdir.path + "/model_param.bst";
-
  {
    std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(fname.c_str(), "w"));
    learner->Save(fo.get());
  }
-
  // a new learner
  learner = std::unique_ptr<Learner>(Learner::Create({p_dmat}));
  {
@@ -146,6 +151,8 @@ TEST(GBTree, ChoosePredictor) {
    learner->UpdateOneIter(i, p_dmat);
  }
  ASSERT_TRUE(data.HostCanWrite());
+  ASSERT_FALSE(data.DeviceCanWrite());
+  ASSERT_FALSE(data.DeviceCanRead());

  // pull data into device.
  data.HostVector();
@@ -162,7 +169,140 @@ TEST(GBTree, ChoosePredictor) {
  // data is not pulled back into host
  ASSERT_FALSE(data.HostCanWrite());
 }
-#endif  // XGBOOST_USE_CUDA || XGBOOST_USE_HIP
+
+TEST(GBTree, ChooseTreeMethod) {
+  bst_row_t n_samples{128};
+  bst_feature_t n_features{64};
+  auto Xy = RandomDataGenerator{n_samples, n_features, 0.5f}.GenerateDMatrix(true);
+
+  auto with_update = [&](std::optional<std::string> device,
+                         std::optional<std::string> tree_method) {
+    auto learner = std::unique_ptr<Learner>(Learner::Create({Xy}));
+    if (tree_method.has_value()) {
+      learner->SetParam("tree_method", tree_method.value());
+    }
+    if (device.has_value()) {
+      auto const& d = device.value();
+      if (std::isdigit(d.front()) || d.front() == '-') {
+        learner->SetParam("gpu_id", d);
+      } else {
+        learner->SetParam("device", d);
+      }
+    }
+    learner->Configure();
+    for (std::int32_t i = 0; i < 3; ++i) {
+      learner->UpdateOneIter(0, Xy);
+    }
+    Json config{Object{}};
+    learner->SaveConfig(&config);
+    auto updater = config["learner"]["gradient_booster"]["updater"];
+    CHECK(!IsA<Null>(updater));
+    return updater;
+  };
+
+  auto with_boost = [&](std::optional<std::string> device, std::optional<std::string> tree_method) {
+    auto learner = std::unique_ptr<Learner>(Learner::Create({Xy}));
+    if (tree_method.has_value()) {
+      learner->SetParam("tree_method", tree_method.value());
+    }
+    if (device.has_value()) {
+      auto const& d = device.value();
+      if (std::isdigit(d.front()) || d.front() == '-') {
+        learner->SetParam("gpu_id", d);
+      } else {
+        learner->SetParam("device", d);
+      }
+    }
+    learner->Configure();
+    for (std::int32_t i = 0; i < 3; ++i) {
+      linalg::Matrix<GradientPair> gpair{{Xy->Info().num_row_}, Context::kCpuId};
+      gpair.Data()->Copy(GenerateRandomGradients(Xy->Info().num_row_));
+      learner->BoostOneIter(0, Xy, &gpair);
+    }
+
+    Json config{Object{}};
+    learner->SaveConfig(&config);
+    auto updater = config["learner"]["gradient_booster"]["updater"];
+    return updater;
+  };
+
+  // |        | hist    | gpu_hist | exact | NA  |
+  // |--------+---------+----------+-------+-----|
+  // | CUDA:0 | GPU     | GPU (w)  | Err   | GPU |
+  // | CPU    | CPU     | GPU (w)  | CPU   | CPU |
+  // |--------+---------+----------+-------+-----|
+  // | -1     | CPU     | GPU (w)  | CPU   | CPU |
+  // | 0      | GPU     | GPU (w)  | Err   | GPU |
+  // |--------+---------+----------+-------+-----|
+  // | NA     | CPU     | GPU (w)  | CPU   | CPU |
+  //
+  // - (w): warning
+  // - CPU: Run on CPU.
+  // - GPU: Run on CUDA.
+  // - Err: Not feasible.
+  // - NA:  Parameter is not specified.
+
+  // When GPU hist is specified with a CPU context, we should emit an error. However, it's
+  // quite difficult to detect whether the CPU context is being used because it's the
+  // default or because it's specified by the user.
+
+  std::map<std::pair<std::optional<std::string>, std::optional<std::string>>, std::string>
+      expectation{
+          // hist
+          {{"hist", "-1"}, "grow_quantile_histmaker"},
+          {{"hist", "0"}, "grow_gpu_hist"},
+          {{"hist", "cpu"}, "grow_quantile_histmaker"},
+          {{"hist", "cuda"}, "grow_gpu_hist"},
+          {{"hist", "cuda:0"}, "grow_gpu_hist"},
+          {{"hist", std::nullopt}, "grow_quantile_histmaker"},
+          // gpu_hist
+          {{"gpu_hist", "-1"}, "grow_gpu_hist"},
+          {{"gpu_hist", "0"}, "grow_gpu_hist"},
+          {{"gpu_hist", "cpu"}, "grow_gpu_hist"},
+          {{"gpu_hist", "cuda"}, "grow_gpu_hist"},
+          {{"gpu_hist", "cuda:0"}, "grow_gpu_hist"},
+          {{"gpu_hist", std::nullopt}, "grow_gpu_hist"},
+          // exact
+          {{"exact", "-1"}, "grow_colmaker,prune"},
+          {{"exact", "0"}, "err"},
+          {{"exact", "cpu"}, "grow_colmaker,prune"},
+          {{"exact", "cuda"}, "err"},
+          {{"exact", "cuda:0"}, "err"},
+          {{"exact", std::nullopt}, "grow_colmaker,prune"},
+          // NA
+          {{std::nullopt, "-1"}, "grow_quantile_histmaker"},
+          {{std::nullopt, "0"}, "grow_gpu_hist"},  // default to hist
+          {{std::nullopt, "cpu"}, "grow_quantile_histmaker"},
+          {{std::nullopt, "cuda"}, "grow_gpu_hist"},
+          {{std::nullopt, "cuda:0"}, "grow_gpu_hist"},
+          {{std::nullopt, std::nullopt}, "grow_quantile_histmaker"},
+      };
+
+  auto run_test = [&](auto fn) {
+    for (auto const& kv : expectation) {
+      auto device = kv.first.second;
+      auto tm = kv.first.first;
+
+      if (kv.second == "err") {
+        ASSERT_THROW({ fn(device, tm); }, dmlc::Error)
+            << " device:" << device.value_or("NA") << " tm:" << tm.value_or("NA");
+        continue;
+      }
+      auto up = fn(device, tm);
+      auto ups = get<Array const>(up);
+      auto exp_names = common::Split(kv.second, ',');
+      ASSERT_EQ(exp_names.size(), ups.size());
+      for (std::size_t i = 0; i < exp_names.size(); ++i) {
+        ASSERT_EQ(get<String const>(ups[i]["name"]), exp_names[i])
+            << " device:" << device.value_or("NA") << " tm:" << tm.value_or("NA");
+      }
+    }
+  };
+
+  run_test(with_update);
+  run_test(with_boost);
+}
+#endif  // XGBOOST_USE_CUDA || XGBOOST_USE_HIP

 // Some other parts of test are in `Tree.JsonIO'.
 TEST(GBTree, JsonIO) {
@@ -171,32 +311,52 @@ TEST(GBTree, JsonIO) {
  Context ctx;
  LearnerModelParam mparam{MakeMP(kCols, .5, 1)};

-  std::unique_ptr<GradientBooster> gbm {
-    CreateTrainedGBM("gbtree", Args{}, kRows, kCols, &mparam, &ctx) };
+  std::unique_ptr<GradientBooster> gbm{
+      CreateTrainedGBM("gbtree", Args{{"tree_method", "exact"}, {"default_direction", "left"}},
+                       kRows, kCols, &mparam, &ctx)};

-  Json model {Object()};
+  Json model{Object()};
  model["model"] = Object();
-  auto& j_model = model["model"];
+  auto j_model = model["model"];

  model["config"] = Object();
-  auto& j_param = model["config"];
+  auto j_config = model["config"];

  gbm->SaveModel(&j_model);
-  gbm->SaveConfig(&j_param);
+  gbm->SaveConfig(&j_config);

  std::string model_str;
  Json::Dump(model, &model_str);

  model = Json::Load({model_str.c_str(), model_str.size()});
-  ASSERT_EQ(get<String>(model["model"]["name"]), "gbtree");
+  j_model = model["model"];
+  j_config = model["config"];
+  ASSERT_EQ(get<String>(j_model["name"]), "gbtree");

-  auto const& gbtree_model = model["model"]["model"];
+  auto gbtree_model = j_model["model"];
  ASSERT_EQ(get<Array>(gbtree_model["trees"]).size(), 1ul);
  ASSERT_EQ(get<Integer>(get<Object>(get<Array>(gbtree_model["trees"]).front()).at("id")), 0);
  ASSERT_EQ(get<Array>(gbtree_model["tree_info"]).size(), 1ul);
-
-  auto j_train_param = model["config"]["gbtree_model_param"];
+  auto j_train_param = j_config["gbtree_model_param"];
  ASSERT_EQ(get<String>(j_train_param["num_parallel_tree"]), "1");
+
+  auto check_config = [](Json j_up_config) {
+    auto colmaker = get<Array const>(j_up_config).front();
+    auto pruner = get<Array const>(j_up_config).back();
+    ASSERT_EQ(get<String const>(colmaker["name"]), "grow_colmaker");
+    ASSERT_EQ(get<String const>(pruner["name"]), "prune");
+    ASSERT_EQ(get<String const>(colmaker["colmaker_train_param"]["default_direction"]), "left");
+  };
+  check_config(j_config["updater"]);
+
+  std::unique_ptr<GradientBooster> loaded(gbm::GBTree::Create("gbtree", &ctx, &mparam));
+  loaded->LoadModel(j_model);
+  loaded->LoadConfig(j_config);
+
+  // roundtrip test
+  Json j_config_rt{Object{}};
+  loaded->SaveConfig(&j_config_rt);
+  check_config(j_config_rt["updater"]);
 }

 TEST(Dart, JsonIO) {
@@ -232,14 +392,15 @@ TEST(Dart, JsonIO) {
 namespace {
 class Dart : public testing::TestWithParam<char const*> {
 public:
-  void Run(std::string predictor) {
+  void Run(std::string device) {
    size_t constexpr kRows = 16, kCols = 10;

    HostDeviceVector<float> data;
-    auto rng = RandomDataGenerator(kRows, kCols, 0);
-    if (predictor == "gpu_predictor") {
-      rng.Device(0);
+    Context ctx;
+    if (device == "GPU") {
+      ctx = MakeCUDACtx(0);
    }
+    auto rng = RandomDataGenerator(kRows, kCols, 0).Device(ctx.gpu_id);
    auto array_str = rng.GenerateArrayInterface(&data);
    auto p_mat = GetDMatrixFromData(data.HostVector(), kRows, kCols);

@@ -257,15 +418,14 @@ class Dart : public testing::TestWithParam<char const*> {
    for (size_t i = 0; i < 16; ++i) {
      learner->UpdateOneIter(i, p_mat);
    }
-
-    learner->SetParam("predictor", predictor);
+    learner->SetParam("device", ctx.DeviceName());

    HostDeviceVector<float> predts_training;
    learner->Predict(p_mat, false, &predts_training, 0, 0, true);

    HostDeviceVector<float>* inplace_predts;
    std::shared_ptr<data::DMatrixProxy> x{new data::DMatrixProxy{}};
-    if (predictor == "gpu_predictor") {
+    if (ctx.IsCUDA()) {
      x->SetCUDAArray(array_str.c_str());
    } else {
      x->SetArrayData(array_str.c_str());
@@ -295,11 +455,10 @@ class Dart : public testing::TestWithParam<char const*> {
 TEST_P(Dart, Prediction) { this->Run(GetParam()); }

 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
-INSTANTIATE_TEST_SUITE_P(PredictorTypes, Dart,
-                         testing::Values("auto", "cpu_predictor", "gpu_predictor"));
+INSTANTIATE_TEST_SUITE_P(PredictorTypes, Dart, testing::Values("CPU", "GPU"));
 #else
-INSTANTIATE_TEST_SUITE_P(PredictorTypes, Dart, testing::Values("auto", "cpu_predictor"));
-#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
+INSTANTIATE_TEST_SUITE_P(PredictorTypes, Dart, testing::Values("CPU"));
+#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)


 std::pair<Json, Json> TestModelSlice(std::string booster) {
@@ -511,4 +670,85 @@ TEST(GBTree, PredictRange) {
                 dmlc::Error);
  }
 }
+
+TEST(GBTree, InplacePredictionError) {
+  std::size_t n_samples{2048}, n_features{32};
+
+  auto test_ext_err = [&](std::string booster, Context const* ctx) {
+    std::shared_ptr<DMatrix> p_fmat =
+        RandomDataGenerator{n_samples, n_features, 0.5f}.Batches(2).GenerateSparsePageDMatrix(
+            "cache", true);
+    std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
+    learner->SetParams(Args{{"booster", booster}, {"device", ctx->DeviceName()}});
+    learner->Configure();
+    for (std::int32_t i = 0; i < 3; ++i) {
+      learner->UpdateOneIter(i, p_fmat);
+    }
+    HostDeviceVector<float>* out_predt;
+    ASSERT_THROW(
+        {
+          learner->InplacePredict(p_fmat, PredictionType::kValue,
+                                  std::numeric_limits<float>::quiet_NaN(), &out_predt, 0, 0);
+        },
+        dmlc::Error);
+  };
+
+  {
+    Context ctx;
+    test_ext_err("gbtree", &ctx);
+    test_ext_err("dart", &ctx);
+  }
+
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
+  {
+    auto ctx = MakeCUDACtx(0);
+    test_ext_err("gbtree", &ctx);
+    test_ext_err("dart", &ctx);
+  }
+#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
+
+  auto test_qdm_err = [&](std::string booster, Context const* ctx) {
+    std::shared_ptr<DMatrix> p_fmat;
+    bst_bin_t max_bins = 16;
+    auto rng = RandomDataGenerator{n_samples, n_features, 0.5f}.Device(ctx->gpu_id).Bins(max_bins);
+    if (ctx->IsCPU()) {
+      p_fmat = rng.GenerateQuantileDMatrix(true);
+    } else {
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
+      p_fmat = rng.GenerateDeviceDMatrix(true);
+#else
+      CHECK(p_fmat);
+#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
+    };
+    std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
+    learner->SetParams(Args{{"booster", booster},
+                            {"max_bin", std::to_string(max_bins)},
+                            {"device", ctx->DeviceName()}});
+    learner->Configure();
+    for (std::int32_t i = 0; i < 3; ++i) {
+      learner->UpdateOneIter(i, p_fmat);
+    }
+    HostDeviceVector<float>* out_predt;
+    ASSERT_THROW(
+        {
+          learner->InplacePredict(p_fmat, PredictionType::kValue,
+                                  std::numeric_limits<float>::quiet_NaN(), &out_predt, 0, 0);
+        },
+        dmlc::Error);
+  };
+
+  {
+    Context ctx;
+    test_qdm_err("gbtree", &ctx);
+    test_qdm_err("dart", &ctx);
+  }
+
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
+  {
+    auto ctx = MakeCUDACtx(0);
+    test_qdm_err("gbtree", &ctx);
+    test_qdm_err("dart", &ctx);
+  }
+#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
+}
 }  // namespace xgboost
--- a/tests/cpp/gbm/test_gbtree.cu
+++ b/tests/cpp/gbm/test_gbtree.cu
@@ -0,0 +1,86 @@
+/**
+ * Copyright 2023, XGBoost contributors
+ */
+#include <xgboost/context.h>      // for Context
+#include <xgboost/learner.h>      // for Learner
+#include <xgboost/string_view.h>  // for StringView
+
+#include <limits>  // for numeric_limits
+#include <memory>  // for shared_ptr
+#include <string>  // for string
+#include <thread>  // for thread
+
+#include "../../../src/data/adapter.h"           // for ArrayAdapter
+#include "../../../src/data/device_adapter.cuh"  // for CupyAdapter
+#include "../../../src/data/proxy_dmatrix.h"     // for DMatrixProxy
+#include "../helpers.h"                          // for RandomDataGenerator
+
+namespace xgboost {
+void TestInplaceFallback(Context const* ctx) {
+  // prepare data
+  bst_row_t n_samples{1024};
+  bst_feature_t n_features{32};
+  HostDeviceVector<float> X_storage;
+  // use a different device than the learner
+  std::int32_t data_ordinal = ctx->IsCPU() ? 0 : -1;
+  auto X = RandomDataGenerator{n_samples, n_features, 0.0}
+               .Device(data_ordinal)
+               .GenerateArrayInterface(&X_storage);
+  HostDeviceVector<float> y_storage;
+  auto y = RandomDataGenerator{n_samples, 1u, 0.0}.GenerateArrayInterface(&y_storage);
+
+  std::shared_ptr<DMatrix> Xy;
+  if (data_ordinal == Context::kCpuId) {
+    auto X_adapter = data::ArrayAdapter{StringView{X}};
+    Xy.reset(DMatrix::Create(&X_adapter, std::numeric_limits<float>::quiet_NaN(), ctx->Threads()));
+  } else {
+    auto X_adapter = data::CupyAdapter{StringView{X}};
+    Xy.reset(DMatrix::Create(&X_adapter, std::numeric_limits<float>::quiet_NaN(), ctx->Threads()));
+  }
+
+  Xy->SetInfo("label", y);
+
+  // learner is configured to the device specified by ctx
+  std::unique_ptr<Learner> learner{Learner::Create({Xy})};
+  learner->SetParam("device", ctx->DeviceName());
+  for (std::int32_t i = 0; i < 3; ++i) {
+    learner->UpdateOneIter(i, Xy);
+  }
+
+  std::shared_ptr<DMatrix> p_m{new data::DMatrixProxy};
+  auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
+  if (data_ordinal == Context::kCpuId) {
+    proxy->SetArrayData(StringView{X});
+  } else {
+    proxy->SetCUDAArray(X.c_str());
+  }
+
+  HostDeviceVector<float>* out_predt{nullptr};
+  ConsoleLogger::Configure(Args{{"verbosity", "1"}});
+  std::string output;
+
+  learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
+                          &out_predt, 0, 0);
+
+  // test when the contexts match
+  Context new_ctx = *proxy->Ctx();
+  ASSERT_NE(new_ctx.gpu_id, ctx->gpu_id);
+
+  learner->SetParam("device", new_ctx.DeviceName());
+  HostDeviceVector<float>* out_predt_1{nullptr};
+  // no warning is raised
+  ::testing::internal::CaptureStderr();
+  learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
+                          &out_predt_1, 0, 0);
+  output = testing::internal::GetCapturedStderr();
+
+  ASSERT_TRUE(output.empty());
+
+  ASSERT_EQ(out_predt->ConstHostVector(), out_predt_1->ConstHostVector());
+}
+
+TEST(GBTree, InplacePredictFallback) {
+  auto ctx = MakeCUDACtx(0);
+  TestInplaceFallback(&ctx);
+}
+}  // namespace xgboost
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -96,9 +96,9 @@ void CheckObjFunctionImpl(std::unique_ptr<xgboost::ObjFunction> const& obj,
                          std::vector<xgboost::bst_float> out_grad,
                          std::vector<xgboost::bst_float> out_hess) {
  xgboost::HostDeviceVector<xgboost::bst_float> in_preds(preds);
-  xgboost::HostDeviceVector<xgboost::GradientPair> out_gpair;
-  obj->GetGradient(in_preds, info, 1, &out_gpair);
-  std::vector<xgboost::GradientPair>& gpair = out_gpair.HostVector();
+  xgboost::linalg::Matrix<xgboost::GradientPair> out_gpair;
+  obj->GetGradient(in_preds, info, 0, &out_gpair);
+  std::vector<xgboost::GradientPair>& gpair = out_gpair.Data()->HostVector();

  ASSERT_EQ(gpair.size(), in_preds.Size());
  for (int i = 0; i < static_cast<int>(gpair.size()); ++i) {
@@ -119,8 +119,8 @@ void CheckObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
                      std::vector<xgboost::bst_float> out_hess) {
  xgboost::MetaInfo info;
  info.num_row_ = labels.size();
-  info.labels =
-      xgboost::linalg::Tensor<float, 2>{labels.cbegin(), labels.cend(), {labels.size()}, -1};
+  info.labels = xgboost::linalg::Tensor<float, 2>{
+      labels.cbegin(), labels.cend(), {labels.size(), static_cast<std::size_t>(1)}, -1};
  info.weights_.HostVector() = weights;

  CheckObjFunctionImpl(obj, preds, labels, weights, info, out_grad, out_hess);
@@ -155,8 +155,8 @@ void CheckRankingObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
                             std::vector<xgboost::bst_float> out_hess) {
  xgboost::MetaInfo info;
  info.num_row_ = labels.size();
-  info.labels = xgboost::linalg::Tensor<float, 2>{
-      labels.cbegin(), labels.cend(), {labels.size(), static_cast<size_t>(1)}, -1};
+  info.labels = xgboost::linalg::Matrix<float>{
+      labels.cbegin(), labels.cend(), {labels.size(), static_cast<std::size_t>(1)}, -1};
  info.weights_.HostVector() = weights;
  info.group_ptr_ = groups;

@@ -210,6 +210,16 @@ SimpleLCG::StateType SimpleLCG::Max() const { return max(); }
 // Make sure it's compile time constant.
 static_assert(SimpleLCG::max() - SimpleLCG::min());

+void RandomDataGenerator::GenerateLabels(std::shared_ptr<DMatrix> p_fmat) const {
+  RandomDataGenerator{p_fmat->Info().num_row_, this->n_targets_, 0.0f}.GenerateDense(
+      p_fmat->Info().labels.Data());
+  CHECK_EQ(p_fmat->Info().labels.Size(), this->rows_ * this->n_targets_);
+  p_fmat->Info().labels.Reshape(this->rows_, this->n_targets_);
+  if (device_ != Context::kCpuId) {
+    p_fmat->Info().labels.SetDevice(device_);
+  }
+}
+
 void RandomDataGenerator::GenerateDense(HostDeviceVector<float> *out) const {
  xgboost::SimpleRealUniformDistribution<bst_float> dist(lower_, upper_);
  CHECK(out);
@@ -363,8 +373,9 @@ void RandomDataGenerator::GenerateCSR(
  CHECK_EQ(columns->Size(), value->Size());
 }

-std::shared_ptr<DMatrix> RandomDataGenerator::GenerateDMatrix(bool with_label, bool float_label,
-                                                              size_t classes) const {
+[[nodiscard]] std::shared_ptr<DMatrix> RandomDataGenerator::GenerateDMatrix(bool with_label,
+                                                                            bool float_label,
+                                                                            size_t classes) const {
  HostDeviceVector<float> data;
  HostDeviceVector<bst_row_t> rptrs;
  HostDeviceVector<bst_feature_t> columns;
@@ -395,6 +406,9 @@ std::shared_ptr<DMatrix> RandomDataGenerator::GenerateDMatrix(bool with_label, b
    for (auto const& page : out->GetBatches<SparsePage>()) {
      page.data.SetDevice(device_);
      page.offset.SetDevice(device_);
+      // pull to device
+      page.data.ConstDeviceSpan();
+      page.offset.ConstDeviceSpan();
    }
  }
  if (!ft_.empty()) {
@@ -403,13 +417,73 @@ std::shared_ptr<DMatrix> RandomDataGenerator::GenerateDMatrix(bool with_label, b
  return out;
 }

-std::shared_ptr<DMatrix> RandomDataGenerator::GenerateQuantileDMatrix() {
+[[nodiscard]] std::shared_ptr<DMatrix> RandomDataGenerator::GenerateSparsePageDMatrix(
+    std::string prefix, bool with_label) const {
+  CHECK_GE(this->rows_, this->n_batches_);
+  CHECK_GE(this->n_batches_, 1)
+      << "Must set the n_batches before generating an external memory DMatrix.";
+  std::unique_ptr<ArrayIterForTest> iter;
+  if (device_ == Context::kCpuId) {
+    iter = std::make_unique<NumpyArrayIterForTest>(this->sparsity_, rows_, cols_, n_batches_);
+  } else {
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
+    iter = std::make_unique<CudaArrayIterForTest>(this->sparsity_, rows_, cols_, n_batches_);
+#else
+    CHECK(iter);
+#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
+  }
+
+  std::unique_ptr<DMatrix> dmat{
+      DMatrix::Create(static_cast<DataIterHandle>(iter.get()), iter->Proxy(), Reset, Next,
+                      std::numeric_limits<float>::quiet_NaN(), Context{}.Threads(), prefix)};
+
+  auto row_page_path =
+      data::MakeId(prefix, dynamic_cast<data::SparsePageDMatrix*>(dmat.get())) + ".row.page";
+  EXPECT_TRUE(FileExists(row_page_path)) << row_page_path;
+
+  // Loop over the batches and count the number of pages
+  std::size_t batch_count = 0;
+  bst_row_t row_count = 0;
+  for (const auto& batch : dmat->GetBatches<xgboost::SparsePage>()) {
+    batch_count++;
+    row_count += batch.Size();
+    CHECK_NE(batch.data.Size(), 0);
+  }
+
+  EXPECT_EQ(batch_count, n_batches_);
+  EXPECT_EQ(row_count, dmat->Info().num_row_);
+
+  if (with_label) {
+    RandomDataGenerator{dmat->Info().num_row_, this->n_targets_, 0.0f}.GenerateDense(
+        dmat->Info().labels.Data());
+    CHECK_EQ(dmat->Info().labels.Size(), this->rows_ * this->n_targets_);
+    dmat->Info().labels.Reshape(this->rows_, this->n_targets_);
+  }
+  return dmat;
+}
+
+std::shared_ptr<DMatrix> RandomDataGenerator::GenerateQuantileDMatrix(bool with_label) {
  NumpyArrayIterForTest iter{this->sparsity_, this->rows_, this->cols_, 1};
  auto m = std::make_shared<data::IterativeDMatrix>(
      &iter, iter.Proxy(), nullptr, Reset, Next, std::numeric_limits<float>::quiet_NaN(), 0, bins_);
+  if (with_label) {
+    this->GenerateLabels(m);
+  }
  return m;
 }

+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
+CudaArrayIterForTest::CudaArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches)
+    : ArrayIterForTest{sparsity, rows, cols, batches} {
+  common::AssertGPUSupport();
+}
+
+int CudaArrayIterForTest::Next() {
+  common::AssertGPUSupport();
+  return 0;
+}
+#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
+
 NumpyArrayIterForTest::NumpyArrayIterForTest(float sparsity, size_t rows, size_t cols,
                                             size_t batches)
    : ArrayIterForTest{sparsity, rows, cols, batches} {
@@ -571,11 +645,10 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(std::string name, Args kwargs,
  }
  p_dmat->Info().labels =
      linalg::Tensor<float, 2>{labels.cbegin(), labels.cend(), {labels.size()}, -1};
-  HostDeviceVector<GradientPair> gpair;
-  auto& h_gpair = gpair.HostVector();
-  h_gpair.resize(kRows);
+  linalg::Matrix<GradientPair> gpair({kRows}, ctx->Ordinal());
+  auto h_gpair = gpair.HostView();
  for (size_t i = 0; i < kRows; ++i) {
-    h_gpair[i] = GradientPair{static_cast<float>(i), 1};
+    h_gpair(i) = GradientPair{static_cast<float>(i), 1};
  }

  PredictionCacheEntry predts;
@@ -588,7 +661,7 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(std::string name, Args kwargs,
 ArrayIterForTest::ArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches)
    : rows_{rows}, cols_{cols}, n_batches_{batches} {
  XGProxyDMatrixCreate(&proxy_);
-  rng_.reset(new RandomDataGenerator{rows_, cols_, sparsity});
+  rng_ = std::make_unique<RandomDataGenerator>(rows_, cols_, sparsity);
  std::tie(batches_, interface_) = rng_->GenerateArrayInterfaceBatch(&data_, n_batches_);
 }

--- a/tests/cpp/helpers.cu
+++ b/tests/cpp/helpers.cu
@@ -28,10 +28,13 @@ int CudaArrayIterForTest::Next() {
  return 1;
 }

-std::shared_ptr<DMatrix> RandomDataGenerator::GenerateDeviceDMatrix() {
+std::shared_ptr<DMatrix> RandomDataGenerator::GenerateDeviceDMatrix(bool with_label) {
  CudaArrayIterForTest iter{this->sparsity_, this->rows_, this->cols_, 1};
  auto m = std::make_shared<data::IterativeDMatrix>(
      &iter, iter.Proxy(), nullptr, Reset, Next, std::numeric_limits<float>::quiet_NaN(), 0, bins_);
+  if (with_label) {
+    this->GenerateLabels(m);
+  }
  return m;
 }
 }  // namespace xgboost
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -35,9 +35,9 @@
 #endif

 #if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
-#define GPUIDX 0
+#define GPUIDX (common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank())
 #else
-#define GPUIDX -1
+#define GPUIDX (-1)
 #endif

 #if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
@@ -46,12 +46,6 @@
 #define DeclareUnifiedDistributedTest(name) name
 #endif

-#if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
-#define WORLD_SIZE_FOR_TEST (xgboost::common::AllVisibleGPUs())
-#else
-#define WORLD_SIZE_FOR_TEST (3)
-#endif
-
 namespace xgboost {
 class ObjFunction;
 class Metric;
@@ -183,7 +177,7 @@ class SimpleRealUniformDistribution {

    for (size_t k = m; k != 0; --k) {
      sum_value += static_cast<ResultT>((*rng)() - rng->Min()) * r_k;
-      r_k *= r;
+      r_k *= static_cast<ResultT>(r);
    }

    ResultT res = sum_value / r_k;
@@ -238,15 +232,18 @@ class RandomDataGenerator {
  bst_target_t n_targets_{1};

  std::int32_t device_{Context::kCpuId};
+  std::size_t n_batches_{0};
  std::uint64_t seed_{0};
  SimpleLCG lcg_;

-  std::size_t bins_{0};
+  bst_bin_t bins_{0};
  std::vector<FeatureType> ft_;
  bst_cat_t max_cat_;

  Json ArrayInterfaceImpl(HostDeviceVector<float>* storage, size_t rows, size_t cols) const;

+  void GenerateLabels(std::shared_ptr<DMatrix> p_fmat) const;
+
 public:
  RandomDataGenerator(bst_row_t rows, size_t cols, float sparsity)
      : rows_{rows}, cols_{cols}, sparsity_{sparsity}, lcg_{seed_} {}
@@ -263,12 +260,16 @@ class RandomDataGenerator {
    device_ = d;
    return *this;
  }
+  RandomDataGenerator& Batches(std::size_t n_batches) {
+    n_batches_ = n_batches;
+    return *this;
+  }
  RandomDataGenerator& Seed(uint64_t s) {
    seed_ = s;
    lcg_.Seed(seed_);
    return *this;
  }
-  RandomDataGenerator& Bins(size_t b) {
+  RandomDataGenerator& Bins(bst_bin_t b) {
    bins_ = b;
    return *this;
  }
@@ -309,12 +310,17 @@ class RandomDataGenerator {
  void GenerateCSR(HostDeviceVector<float>* value, HostDeviceVector<bst_row_t>* row_ptr,
                   HostDeviceVector<bst_feature_t>* columns) const;

-  std::shared_ptr<DMatrix> GenerateDMatrix(bool with_label = false, bool float_label = true,
-                                           size_t classes = 1) const;
+  [[nodiscard]] std::shared_ptr<DMatrix> GenerateDMatrix(bool with_label = false,
+                                                         bool float_label = true,
+                                                         size_t classes = 1) const;
+
+  [[nodiscard]] std::shared_ptr<DMatrix> GenerateSparsePageDMatrix(std::string prefix,
+                                                                   bool with_label) const;
+
 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
-  std::shared_ptr<DMatrix> GenerateDeviceDMatrix();
+  std::shared_ptr<DMatrix> GenerateDeviceDMatrix(bool with_label);
 #endif
-  std::shared_ptr<DMatrix> GenerateQuantileDMatrix();
+  std::shared_ptr<DMatrix> GenerateQuantileDMatrix(bool with_label);
 };

 // Generate an empty DMatrix, mostly for its meta info.
@@ -322,15 +328,14 @@ inline std::shared_ptr<DMatrix> EmptyDMatrix() {
  return RandomDataGenerator{0, 0, 0.0}.GenerateDMatrix();
 }

-inline std::vector<float>
-GenerateRandomCategoricalSingleColumn(int n, size_t num_categories) {
+inline std::vector<float> GenerateRandomCategoricalSingleColumn(int n, size_t num_categories) {
  std::vector<float> x(n);
  std::mt19937 rng(0);
  std::uniform_int_distribution<size_t> dist(0, num_categories - 1);
  std::generate(x.begin(), x.end(), [&]() { return dist(rng); });
  // Make sure each category is present
-  for(size_t i = 0; i < num_categories; i++) {
-    x[i] = i;
+  for (size_t i = 0; i < num_categories; i++) {
+    x[i] = static_cast<decltype(x)::value_type>(i);
  }
  return x;
 }
@@ -382,23 +387,6 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(std::string name, Args kwargs,
                                                  LearnerModelParam const* learner_model_param,
                                                  Context const* generic_param);

-inline std::unique_ptr<HostDeviceVector<GradientPair>> GenerateGradients(
-    std::size_t rows, bst_target_t n_targets = 1) {
-  auto p_gradients = std::make_unique<HostDeviceVector<GradientPair>>(rows * n_targets);
-  auto& h_gradients = p_gradients->HostVector();
-
-  xgboost::SimpleLCG gen;
-  xgboost::SimpleRealUniformDistribution<bst_float> dist(0.0f, 1.0f);
-
-  for (std::size_t i = 0; i < rows * n_targets; ++i) {
-    auto grad = dist(&gen);
-    auto hess = dist(&gen);
-    h_gradients[i] = GradientPair{grad, hess};
-  }
-
-  return p_gradients;
-}
-
 /**
 * \brief Make a context that uses CUDA if device >= 0.
 */
@@ -410,11 +398,12 @@ inline Context MakeCUDACtx(std::int32_t device) {
 }

 inline HostDeviceVector<GradientPair> GenerateRandomGradients(const size_t n_rows,
-                                                              float lower= 0.0f, float upper = 1.0f) {
+                                                              float lower = 0.0f,
+                                                              float upper = 1.0f) {
  xgboost::SimpleLCG gen;
  xgboost::SimpleRealUniformDistribution<bst_float> dist(lower, upper);
  std::vector<GradientPair> h_gpair(n_rows);
-  for (auto &gpair : h_gpair) {
+  for (auto& gpair : h_gpair) {
    bst_float grad = dist(&gen);
    bst_float hess = dist(&gen);
    gpair = GradientPair(grad, hess);
@@ -423,6 +412,16 @@ inline HostDeviceVector<GradientPair> GenerateRandomGradients(const size_t n_row
  return gpair;
 }

+inline linalg::Matrix<GradientPair> GenerateRandomGradients(Context const* ctx, bst_row_t n_rows,
+                                                            bst_target_t n_targets,
+                                                            float lower = 0.0f,
+                                                            float upper = 1.0f) {
+  auto g = GenerateRandomGradients(n_rows * n_targets, lower, upper);
+  linalg::Matrix<GradientPair> gpair({n_rows, static_cast<bst_row_t>(n_targets)}, ctx->Device());
+  gpair.Data()->Copy(g);
+  return gpair;
+}
+
 typedef void *DMatrixHandle;  // NOLINT(*);

 class ArrayIterForTest {
@@ -444,11 +443,11 @@ class ArrayIterForTest {
  size_t static constexpr Cols() { return 13; }

 public:
-  std::string AsArray() const { return interface_; }
+  [[nodiscard]] std::string AsArray() const { return interface_; }

  virtual int Next() = 0;
  virtual void Reset() { iter_ = 0; }
-  size_t Iter() const { return iter_; }
+  [[nodiscard]] std::size_t Iter() const { return iter_; }
  auto Proxy() -> decltype(proxy_) { return proxy_; }

  explicit ArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches);
@@ -511,11 +510,15 @@ inline LearnerModelParam MakeMP(bst_feature_t n_features, float base_score, uint

 inline std::int32_t AllThreadsForTest() { return Context{}.Threads(); }

-template <typename Function, typename... Args>
+template <bool use_nccl = false, typename Function, typename... Args>
 void RunWithInMemoryCommunicator(int32_t world_size, Function&& function, Args&&... args) {
  auto run = [&](auto rank) {
    Json config{JsonObject()};
-    config["xgboost_communicator"] = String("in-memory");
+    if constexpr (use_nccl) {
+      config["xgboost_communicator"] = String("in-memory-nccl");
+    } else {
+      config["xgboost_communicator"] = String("in-memory");
+    }
    config["in_memory_world_size"] = world_size;
    config["in_memory_rank"] = rank;
    xgboost::collective::Init(config);
@@ -537,16 +540,35 @@ void RunWithInMemoryCommunicator(int32_t world_size, Function&& function, Args&&
 #endif
 }

-class DeclareUnifiedDistributedTest(MetricTest) : public ::testing::Test {
+class BaseMGPUTest : public ::testing::Test {
 protected:
  int world_size_;
+  bool use_nccl_{false};

  void SetUp() override {
-    world_size_ = WORLD_SIZE_FOR_TEST;
-    if (world_size_ <= 1) {
-      GTEST_SKIP() << "Skipping MGPU test with # GPUs = " << world_size_;
+    auto const n_gpus = common::AllVisibleGPUs();
+    if (n_gpus <= 1) {
+      // Use a single GPU to simulate distributed environment.
+      world_size_ = 3;
+      // NCCL doesn't like sharing a single GPU, so we use the adapter instead.
+      use_nccl_ = false;
+    } else {
+      // Use multiple GPUs for real.
+      world_size_ = n_gpus;
+      use_nccl_ = true;
+    }
+  }
+
+  template <typename Function, typename... Args>
+  void DoTest(Function&& function, Args&&... args) {
+    if (use_nccl_) {
+      RunWithInMemoryCommunicator<true>(world_size_, function, args...);
+    } else {
+      RunWithInMemoryCommunicator<false>(world_size_, function, args...);
    }
  }
 };

+class DeclareUnifiedDistributedTest(MetricTest) : public BaseMGPUTest{};
+
 }  // namespace xgboost
--- a/tests/cpp/helpers.hip
+++ b/tests/cpp/helpers.hip
@@ -1,2 +1,4 @@

+#if defined(XGBOOST_USE_HIP)
 #include "helpers.cu"
+#endif
--- a/tests/cpp/histogram_helpers.h
+++ b/tests/cpp/histogram_helpers.h
@@ -2,6 +2,10 @@
 #include "../../src/data/ellpack_page.cuh"
 #endif

+#include <xgboost/data.h>  // for SparsePage
+
+#include "./helpers.h"  // for RandomDataGenerator
+
 namespace xgboost {
 #if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
 namespace {
--- a/tests/cpp/linear/test_linear.cc
+++ b/tests/cpp/linear/test_linear.cc
@@ -24,8 +24,8 @@ TEST(Linear, Shotgun) {
    auto updater =
        std::unique_ptr<xgboost::LinearUpdater>(xgboost::LinearUpdater::Create("shotgun", &ctx));
    updater->Configure({{"eta", "1."}});
-    xgboost::HostDeviceVector<xgboost::GradientPair> gpair(
-        p_fmat->Info().num_row_, xgboost::GradientPair(-5, 1.0));
+    linalg::Matrix<xgboost::GradientPair> gpair{
+        linalg::Constant(&ctx, xgboost::GradientPair(-5, 1.0), p_fmat->Info().num_row_, 1)};
    xgboost::gbm::GBLinearModel model{&mparam};
    model.LazyInitModel();
    updater->Update(&gpair, p_fmat.get(), &model, gpair.Size());
@@ -55,8 +55,8 @@ TEST(Linear, coordinate) {
  auto updater = std::unique_ptr<xgboost::LinearUpdater>(
      xgboost::LinearUpdater::Create("coord_descent", &ctx));
  updater->Configure({{"eta", "1."}});
-  xgboost::HostDeviceVector<xgboost::GradientPair> gpair(
-      p_fmat->Info().num_row_, xgboost::GradientPair(-5, 1.0));
+  linalg::Matrix<xgboost::GradientPair> gpair{
+      linalg::Constant(&ctx, xgboost::GradientPair(-5, 1.0), p_fmat->Info().num_row_, 1)};
  xgboost::gbm::GBLinearModel model{&mparam};
  model.LazyInitModel();
  updater->Update(&gpair, p_fmat.get(), &model, gpair.Size());
--- a/tests/cpp/linear/test_linear.cu
+++ b/tests/cpp/linear/test_linear.cu
@@ -1,4 +1,6 @@
-// Copyright by Contributors
+/**
+ * Copyright 2018-2023, XGBoost Contributors
+ */
 #include <xgboost/linear_updater.h>
 #include <xgboost/gbm.h>

@@ -19,8 +21,7 @@ TEST(Linear, GPUCoordinate) {
  auto updater = std::unique_ptr<xgboost::LinearUpdater>(
      xgboost::LinearUpdater::Create("gpu_coord_descent", &ctx));
  updater->Configure({{"eta", "1."}});
-  xgboost::HostDeviceVector<xgboost::GradientPair> gpair(
-      mat->Info().num_row_, xgboost::GradientPair(-5, 1.0));
+  auto gpair = linalg::Constant(&ctx, xgboost::GradientPair(-5, 1.0), mat->Info().num_row_, 1);
  xgboost::gbm::GBLinearModel model{&mparam};

  model.LazyInitModel();
--- a/tests/cpp/metric/test_auc.cc
+++ b/tests/cpp/metric/test_auc.cc
@@ -18,51 +18,51 @@ TEST(Metric, DeclareUnifiedTest(MultiClassPRAUC)) { VerifyMultiClassPRAUC(); }
 TEST(Metric, DeclareUnifiedTest(RankingPRAUC)) { VerifyRankingPRAUC(); }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), BinaryAUCRowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyBinaryAUC, DataSplitMode::kRow);
+  DoTest(VerifyBinaryAUC, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), BinaryAUCColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyBinaryAUC, DataSplitMode::kCol);
+  DoTest(VerifyBinaryAUC, DataSplitMode::kCol);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassAUCRowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassAUC, DataSplitMode::kRow);
+  DoTest(VerifyMultiClassAUC, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassAUCColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassAUC, DataSplitMode::kCol);
+  DoTest(VerifyMultiClassAUC, DataSplitMode::kCol);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), RankingAUCRowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyRankingAUC, DataSplitMode::kRow);
+  DoTest(VerifyRankingAUC, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), RankingAUCColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyRankingAUC, DataSplitMode::kCol);
+  DoTest(VerifyRankingAUC, DataSplitMode::kCol);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), PRAUCRowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyPRAUC, DataSplitMode::kRow);
+  DoTest(VerifyPRAUC, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), PRAUCColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyPRAUC, DataSplitMode::kCol);
+  DoTest(VerifyPRAUC, DataSplitMode::kCol);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassPRAUCRowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassPRAUC, DataSplitMode::kRow);
+  DoTest(VerifyMultiClassPRAUC, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassPRAUCColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassPRAUC, DataSplitMode::kCol);
+  DoTest(VerifyMultiClassPRAUC, DataSplitMode::kCol);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), RankingPRAUCRowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyRankingPRAUC, DataSplitMode::kRow);
+  DoTest(VerifyRankingPRAUC, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), RankingPRAUCColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyRankingPRAUC, DataSplitMode::kCol);
+  DoTest(VerifyRankingPRAUC, DataSplitMode::kCol);
 }
 }  // namespace metric
 }  // namespace xgboost
--- a/tests/cpp/metric/test_elementwise_metric.cc
+++ b/tests/cpp/metric/test_elementwise_metric.cc
@@ -26,83 +26,83 @@ TEST(Metric, DeclareUnifiedTest(MultiRMSE)) { VerifyMultiRMSE(); }
 TEST(Metric, DeclareUnifiedTest(Quantile)) { VerifyQuantile(); }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), RMSERowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyRMSE, DataSplitMode::kRow);
+  DoTest(VerifyRMSE, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), RMSEColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyRMSE, DataSplitMode::kCol);
+  DoTest(VerifyRMSE, DataSplitMode::kCol);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), RMSLERowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyRMSLE, DataSplitMode::kRow);
+  DoTest(VerifyRMSLE, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), RMSLEColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyRMSLE, DataSplitMode::kCol);
+  DoTest(VerifyRMSLE, DataSplitMode::kCol);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAERowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyMAE, DataSplitMode::kRow);
+  DoTest(VerifyMAE, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAEColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyMAE, DataSplitMode::kCol);
+  DoTest(VerifyMAE, DataSplitMode::kCol);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAPERowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyMAPE, DataSplitMode::kRow);
+  DoTest(VerifyMAPE, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAPEColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyMAPE, DataSplitMode::kCol);
+  DoTest(VerifyMAPE, DataSplitMode::kCol);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), MPHERowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyMPHE, DataSplitMode::kRow);
+  DoTest(VerifyMPHE, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), MPHEColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyMPHE, DataSplitMode::kCol);
+  DoTest(VerifyMPHE, DataSplitMode::kCol);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), LogLossRowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyLogLoss, DataSplitMode::kRow);
+  DoTest(VerifyLogLoss, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), LogLossColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyLogLoss, DataSplitMode::kCol);
+  DoTest(VerifyLogLoss, DataSplitMode::kCol);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), ErrorRowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyError, DataSplitMode::kRow);
+  DoTest(VerifyError, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), ErrorColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyError, DataSplitMode::kCol);
+  DoTest(VerifyError, DataSplitMode::kCol);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), PoissonNegLogLikRowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyPoissonNegLogLik, DataSplitMode::kRow);
+  DoTest(VerifyPoissonNegLogLik, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), PoissonNegLogLikColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyPoissonNegLogLik, DataSplitMode::kCol);
+  DoTest(VerifyPoissonNegLogLik, DataSplitMode::kCol);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiRMSERowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyMultiRMSE, DataSplitMode::kRow);
+  DoTest(VerifyMultiRMSE, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiRMSEColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyMultiRMSE, DataSplitMode::kCol);
+  DoTest(VerifyMultiRMSE, DataSplitMode::kCol);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), QuantileRowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyQuantile, DataSplitMode::kRow);
+  DoTest(VerifyQuantile, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), QuantileColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyQuantile, DataSplitMode::kCol);
+  DoTest(VerifyQuantile, DataSplitMode::kCol);
 }
 }  // namespace metric
 }  // namespace xgboost
--- a/tests/cpp/metric/test_metric.cc
+++ b/tests/cpp/metric/test_metric.cc
@@ -8,14 +8,10 @@ TEST(Metric, UnknownMetric) {
  xgboost::Metric* metric = nullptr;
  EXPECT_ANY_THROW(metric = xgboost::Metric::Create("unknown_name", &ctx));
  EXPECT_NO_THROW(metric = xgboost::Metric::Create("rmse", &ctx));
-  if (metric) {
-    delete metric;
-  }
+  delete metric;
  metric = nullptr;
  EXPECT_ANY_THROW(metric = xgboost::Metric::Create("unknown_name@1", &ctx));
  EXPECT_NO_THROW(metric = xgboost::Metric::Create("error@0.5f", &ctx));
-  if (metric) {
-    delete metric;
-  }
+  delete metric;
 }
 }  // namespace xgboost
--- a/tests/cpp/metric/test_multiclass_metric.cc
+++ b/tests/cpp/metric/test_multiclass_metric.cc
@@ -11,19 +11,19 @@ TEST(Metric, DeclareUnifiedTest(MultiClassError)) { VerifyMultiClassError(); }
 TEST(Metric, DeclareUnifiedTest(MultiClassLogLoss)) { VerifyMultiClassLogLoss(); }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassErrorRowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassError, DataSplitMode::kRow);
+  DoTest(VerifyMultiClassError, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassErrorColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassError, DataSplitMode::kCol);
+  DoTest(VerifyMultiClassError, DataSplitMode::kCol);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassLogLossRowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassLogLoss, DataSplitMode::kRow);
+  DoTest(VerifyMultiClassLogLoss, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassLogLossColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassLogLoss, DataSplitMode::kCol);
+  DoTest(VerifyMultiClassLogLoss, DataSplitMode::kCol);
 }
 }  // namespace metric
 }  // namespace xgboost
--- a/tests/cpp/metric/test_multiclass_metric.h
+++ b/tests/cpp/metric/test_multiclass_metric.h
@@ -46,7 +46,6 @@ inline void CheckDeterministicMetricMultiClass(StringView name, int32_t device)

 inline void TestMultiClassError(int device, DataSplitMode data_split_mode) {
  auto ctx = MakeCUDACtx(device);
-  ctx.gpu_id = device;
  xgboost::Metric * metric = xgboost::Metric::Create("merror", &ctx);
  metric->Configure({});
  ASSERT_STREQ(metric->Name(), "merror");
@@ -67,7 +66,6 @@ inline void VerifyMultiClassError(DataSplitMode data_split_mode = DataSplitMode:

 inline void TestMultiClassLogLoss(int device, DataSplitMode data_split_mode) {
  auto ctx = MakeCUDACtx(device);
-  ctx.gpu_id = device;
  xgboost::Metric * metric = xgboost::Metric::Create("mlogloss", &ctx);
  metric->Configure({});
  ASSERT_STREQ(metric->Name(), "mlogloss");
--- a/tests/cpp/metric/test_rank_metric.cc
+++ b/tests/cpp/metric/test_rank_metric.cc
@@ -50,35 +50,35 @@ TEST(Metric, DeclareUnifiedTest(MAP)) { VerifyMAP(); }
 TEST(Metric, DeclareUnifiedTest(NDCGExpGain)) { VerifyNDCGExpGain(); }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), PrecisionRowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyPrecision, DataSplitMode::kRow);
+  DoTest(VerifyPrecision, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), PrecisionColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyPrecision, DataSplitMode::kCol);
+  DoTest(VerifyPrecision, DataSplitMode::kCol);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), NDCGRowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyNDCG, DataSplitMode::kRow);
+  DoTest(VerifyNDCG, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), NDCGColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyNDCG, DataSplitMode::kCol);
+  DoTest(VerifyNDCG, DataSplitMode::kCol);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAPRowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyMAP, DataSplitMode::kRow);
+  DoTest(VerifyMAP, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAPColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyMAP, DataSplitMode::kCol);
+  DoTest(VerifyMAP, DataSplitMode::kCol);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), NDCGExpGainRowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyNDCGExpGain, DataSplitMode::kRow);
+  DoTest(VerifyNDCGExpGain, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), NDCGExpGainColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyNDCGExpGain, DataSplitMode::kCol);
+  DoTest(VerifyNDCGExpGain, DataSplitMode::kCol);
 }
 }  // namespace metric
 }  // namespace xgboost
--- a/tests/cpp/metric/test_survival_metric.cu
+++ b/tests/cpp/metric/test_survival_metric.cu
@@ -12,21 +12,21 @@ namespace common {
 TEST(Metric, DeclareUnifiedTest(AFTNegLogLik)) { VerifyAFTNegLogLik(); }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), AFTNegLogLikRowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyAFTNegLogLik, DataSplitMode::kRow);
+  DoTest(VerifyAFTNegLogLik, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), AFTNegLogLikColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyAFTNegLogLik, DataSplitMode::kCol);
+  DoTest(VerifyAFTNegLogLik, DataSplitMode::kCol);
 }

 TEST(Metric, DeclareUnifiedTest(IntervalRegressionAccuracy)) { VerifyIntervalRegressionAccuracy(); }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), IntervalRegressionAccuracyRowSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyIntervalRegressionAccuracy, DataSplitMode::kRow);
+  DoTest(VerifyIntervalRegressionAccuracy, DataSplitMode::kRow);
 }

 TEST_F(DeclareUnifiedDistributedTest(MetricTest), IntervalRegressionAccuracyColumnSplit) {
-  RunWithInMemoryCommunicator(world_size_, &VerifyIntervalRegressionAccuracy, DataSplitMode::kCol);
+  DoTest(VerifyIntervalRegressionAccuracy, DataSplitMode::kCol);
 }

 // Test configuration of AFT metric
--- a/tests/cpp/objective/test_aft_obj.cc
+++ b/tests/cpp/objective/test_aft_obj.cc
@@ -1,5 +1,5 @@
-/*!
- * Copyright (c) by Contributors 2020
+/**
+ * Copyright 2020-2023, XGBoost Contributors 
 */
 #include <gtest/gtest.h>
 #include <memory>
@@ -12,9 +12,7 @@
 #include "../helpers.h"
 #include "../../../src/common/survival_util.h"

-namespace xgboost {
-namespace common {
-
+namespace xgboost::common {
 TEST(Objective, DeclareUnifiedTest(AFTObjConfiguration)) {
  auto ctx = MakeCUDACtx(GPUIDX);
  std::unique_ptr<ObjFunction> objective(ObjFunction::Create("survival:aft", &ctx));
@@ -65,14 +63,14 @@ static inline void CheckGPairOverGridPoints(
    preds[i] = std::log(std::pow(2.0, i * (log_y_high - log_y_low) / (num_point - 1) + log_y_low));
  }

-  HostDeviceVector<GradientPair> out_gpair;
+  linalg::Matrix<GradientPair> out_gpair;
  obj->GetGradient(HostDeviceVector<bst_float>(preds), info, 1, &out_gpair);
-  const auto& gpair = out_gpair.HostVector();
+  const auto gpair = out_gpair.HostView();
  CHECK_EQ(num_point, expected_grad.size());
  CHECK_EQ(num_point, expected_hess.size());
  for (int i = 0; i < num_point; ++i) {
-    EXPECT_NEAR(gpair[i].GetGrad(), expected_grad[i], ftol);
-    EXPECT_NEAR(gpair[i].GetHess(), expected_hess[i], ftol);
+    EXPECT_NEAR(gpair(i).GetGrad(), expected_grad[i], ftol);
+    EXPECT_NEAR(gpair(i).GetHess(), expected_hess[i], ftol);
  }
 }

@@ -169,5 +167,4 @@ TEST(Objective, DeclareUnifiedTest(AFTObjGPairIntervalCensoredLabels)) {
      0.2757f, 0.1776f, 0.1110f, 0.0682f, 0.0415f, 0.0251f, 0.0151f, 0.0091f, 0.0055f, 0.0033f });
 }

-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common
--- a/tests/cpp/objective/test_lambdarank_obj.cc
+++ b/tests/cpp/objective/test_lambdarank_obj.cc
@@ -74,35 +74,35 @@ void TestNDCGGPair(Context const* ctx) {
  info.labels = linalg::Tensor<float, 2>{{0, 1, 0, 1}, {4, 1}, GPUIDX};
  info.group_ptr_ = {0, 2, 4};
  info.num_row_ = 4;
-  HostDeviceVector<GradientPair> gpairs;
+  linalg::Matrix<GradientPair> gpairs;
  obj->GetGradient(predts, info, 0, &gpairs);
  ASSERT_EQ(gpairs.Size(), predts.Size());

  {
    predts = {1, 0, 1, 0};
-    HostDeviceVector<GradientPair> gpairs;
+    linalg::Matrix<GradientPair> gpairs;
    obj->GetGradient(predts, info, 0, &gpairs);
-    for (size_t i = 0; i < gpairs.Size(); ++i) {
-      ASSERT_GT(gpairs.HostSpan()[i].GetHess(), 0);
+    for (std::size_t i = 0; i < gpairs.Size(); ++i) {
+      ASSERT_GT(gpairs.HostView()(i).GetHess(), 0);
    }
-    ASSERT_LT(gpairs.HostSpan()[1].GetGrad(), 0);
-    ASSERT_LT(gpairs.HostSpan()[3].GetGrad(), 0);
+    ASSERT_LT(gpairs.HostView()(1).GetGrad(), 0);
+    ASSERT_LT(gpairs.HostView()(3).GetGrad(), 0);

-    ASSERT_GT(gpairs.HostSpan()[0].GetGrad(), 0);
-    ASSERT_GT(gpairs.HostSpan()[2].GetGrad(), 0);
+    ASSERT_GT(gpairs.HostView()(0).GetGrad(), 0);
+    ASSERT_GT(gpairs.HostView()(2).GetGrad(), 0);

    info.weights_ = {2, 3};
-    HostDeviceVector<GradientPair> weighted_gpairs;
+    linalg::Matrix<GradientPair> weighted_gpairs;
    obj->GetGradient(predts, info, 0, &weighted_gpairs);
-    auto const& h_gpairs = gpairs.ConstHostSpan();
-    auto const& h_weighted_gpairs = weighted_gpairs.ConstHostSpan();
+    auto const& h_gpairs = gpairs.HostView();
+    auto const& h_weighted_gpairs = weighted_gpairs.HostView();
    for (size_t i : {0ul, 1ul}) {
-      ASSERT_FLOAT_EQ(h_weighted_gpairs[i].GetGrad(), h_gpairs[i].GetGrad() * 2.0f);
-      ASSERT_FLOAT_EQ(h_weighted_gpairs[i].GetHess(), h_gpairs[i].GetHess() * 2.0f);
+      ASSERT_FLOAT_EQ(h_weighted_gpairs(i).GetGrad(), h_gpairs(i).GetGrad() * 2.0f);
+      ASSERT_FLOAT_EQ(h_weighted_gpairs(i).GetHess(), h_gpairs(i).GetHess() * 2.0f);
    }
    for (size_t i : {2ul, 3ul}) {
-      ASSERT_FLOAT_EQ(h_weighted_gpairs[i].GetGrad(), h_gpairs[i].GetGrad() * 3.0f);
-      ASSERT_FLOAT_EQ(h_weighted_gpairs[i].GetHess(), h_gpairs[i].GetHess() * 3.0f);
+      ASSERT_FLOAT_EQ(h_weighted_gpairs(i).GetGrad(), h_gpairs(i).GetGrad() * 3.0f);
+      ASSERT_FLOAT_EQ(h_weighted_gpairs(i).GetHess(), h_gpairs(i).GetHess() * 3.0f);
    }
  }

@@ -125,7 +125,7 @@ void TestUnbiasedNDCG(Context const* ctx) {
  std::sort(h_label.begin(), h_label.end(), std::greater<>{});
  HostDeviceVector<float> predt(p_fmat->Info().num_row_, 1.0f);

-  HostDeviceVector<GradientPair> out_gpair;
+  linalg::Matrix<GradientPair> out_gpair;
  obj->GetGradient(predt, p_fmat->Info(), 0, &out_gpair);

  Json config{Object{}};
--- a/tests/cpp/objective/test_lambdarank_obj.cu
+++ b/tests/cpp/objective/test_lambdarank_obj.cu
@@ -18,26 +18,22 @@

 namespace xgboost::obj {
 TEST(LambdaRank, GPUNDCGJsonIO) {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);
  TestNDCGJsonIO(&ctx);
 }

 TEST(LambdaRank, GPUMAPStat) {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);
  TestMAPStat(&ctx);
 }

 TEST(LambdaRank, GPUNDCGGPair) {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);
  TestNDCGGPair(&ctx);
 }

 void TestGPUMakePair() {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);

  MetaInfo info;
  HostDeviceVector<float> predt;
@@ -48,23 +44,24 @@ void TestGPUMakePair() {
  auto make_args = [&](std::shared_ptr<ltr::RankingCache> p_cache, auto rank_idx,
                       common::Span<std::size_t const> y_sorted_idx) {
    linalg::Vector<double> dummy;
-    auto d = dummy.View(ctx.gpu_id);
+    auto d = dummy.View(ctx.Device());
    linalg::Vector<GradientPair> dgpair;
-    auto dg = dgpair.View(ctx.gpu_id);
-    cuda_impl::KernelInputs args{d,
-                                 d,
-                                 d,
-                                 d,
-                                 p_cache->DataGroupPtr(&ctx),
-                                 p_cache->CUDAThreadsGroupPtr(),
-                                 rank_idx,
-                                 info.labels.View(ctx.gpu_id),
-                                 predt.ConstDeviceSpan(),
-                                 {},
-                                 dg,
-                                 nullptr,
-                                 y_sorted_idx,
-                                 0};
+    auto dg = dgpair.View(ctx.Device());
+    cuda_impl::KernelInputs args{
+        d,
+        d,
+        d,
+        d,
+        p_cache->DataGroupPtr(&ctx),
+        p_cache->CUDAThreadsGroupPtr(),
+        rank_idx,
+        info.labels.View(ctx.Device()),
+        predt.ConstDeviceSpan(),
+        linalg::MatrixView<GradientPair>{common::Span<GradientPair>{}, {0}, DeviceOrd::CUDA(0)},
+        dg,
+        nullptr,
+        y_sorted_idx,
+        0};
    return args;
  };

@@ -131,8 +128,7 @@ void TestGPUMakePair() {
 TEST(LambdaRank, GPUMakePair) { TestGPUMakePair(); }

 TEST(LambdaRank, GPUUnbiasedNDCG) {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);
  TestUnbiasedNDCG(&ctx);
 }

@@ -166,8 +162,7 @@ TEST(LambdaRank, RankItemCountOnRight) {
 }

 TEST(LambdaRank, GPUMAPGPair) {
-  Context ctx;
-  ctx.gpu_id = 0;
+  auto ctx = MakeCUDACtx(0);
  TestMAPGPair(&ctx);
 }
 }  // namespace xgboost::obj
--- a/tests/cpp/objective/test_regression_obj.cc
+++ b/tests/cpp/objective/test_regression_obj.cc
@@ -122,8 +122,8 @@ TEST(Objective, DeclareUnifiedTest(LogisticRegressionBasic)) {
  EXPECT_NEAR(obj->ProbToMargin(0.1f), -2.197f, 0.01f);
  EXPECT_NEAR(obj->ProbToMargin(0.5f), 0, 0.01f);
  EXPECT_NEAR(obj->ProbToMargin(0.9f), 2.197f, 0.01f);
-  EXPECT_ANY_THROW(obj->ProbToMargin(10))
-    << "Expected error when base_score not in range [0,1f] for LogisticRegression";
+  EXPECT_ANY_THROW((void)obj->ProbToMargin(10))
+      << "Expected error when base_score not in range [0,1f] for LogisticRegression";

  // test PredTransform
  HostDeviceVector<bst_float> io_preds = {0, 0.1f, 0.5f, 0.9f, 1};
@@ -282,9 +282,9 @@ TEST(Objective, DeclareUnifiedTest(TweedieRegressionGPair)) {
 TEST(Objective, CPU_vs_CUDA) {
  Context ctx = MakeCUDACtx(GPUIDX);

-  ObjFunction* obj = ObjFunction::Create("reg:squarederror", &ctx);
-  HostDeviceVector<GradientPair> cpu_out_preds;
-  HostDeviceVector<GradientPair> cuda_out_preds;
+  std::unique_ptr<ObjFunction> obj{ObjFunction::Create("reg:squarederror", &ctx)};
+  linalg::Matrix<GradientPair> cpu_out_preds;
+  linalg::Matrix<GradientPair> cuda_out_preds;

  constexpr size_t kRows = 400;
  constexpr size_t kCols = 100;
@@ -300,33 +300,31 @@ TEST(Objective, CPU_vs_CUDA) {
  info.labels.Reshape(kRows);
  auto& h_labels = info.labels.Data()->HostVector();
  for (size_t i = 0; i < h_labels.size(); ++i) {
-    h_labels[i] = 1 / (float)(i+1);
+    h_labels[i] = 1 / static_cast<float>(i+1);
  }

  {
    // CPU
-    ctx.gpu_id = -1;
+    ctx = ctx.MakeCPU();
    obj->GetGradient(preds, info, 0, &cpu_out_preds);
  }
  {
    // CUDA
-    ctx.gpu_id = 0;
+    ctx = ctx.MakeCUDA(0);
    obj->GetGradient(preds, info, 0, &cuda_out_preds);
  }

-  auto& h_cpu_out = cpu_out_preds.HostVector();
-  auto& h_cuda_out = cuda_out_preds.HostVector();
+  auto h_cpu_out = cpu_out_preds.HostView();
+  auto h_cuda_out = cuda_out_preds.HostView();

  float sgrad = 0;
  float shess = 0;
  for (size_t i = 0; i < kRows; ++i) {
-    sgrad += std::pow(h_cpu_out[i].GetGrad() - h_cuda_out[i].GetGrad(), 2);
-    shess += std::pow(h_cpu_out[i].GetHess() - h_cuda_out[i].GetHess(), 2);
+    sgrad += std::pow(h_cpu_out(i).GetGrad() - h_cuda_out(i).GetGrad(), 2);
+    shess += std::pow(h_cpu_out(i).GetHess() - h_cuda_out(i).GetHess(), 2);
  }
  ASSERT_NEAR(sgrad, 0.0f, kRtEps);
  ASSERT_NEAR(shess, 0.0f, kRtEps);
-
-  delete obj;
 }
 #endif

--- a/tests/cpp/plugin/helpers.h
+++ b/tests/cpp/plugin/helpers.h
@@ -37,7 +37,14 @@ class ServerForTest {
  }

  ~ServerForTest() {
+    using namespace std::chrono_literals;
+    while (!server_) {
+      std::this_thread::sleep_for(100ms);
+    }
    server_->Shutdown();
+    while (!server_thread_) {
+      std::this_thread::sleep_for(100ms);
+    }
    server_thread_->join();
  }

@@ -56,7 +63,7 @@ class BaseFederatedTest : public ::testing::Test {

  void TearDown() override { server_.reset(nullptr); }

-  static int constexpr kWorldSize{3};
+  static int constexpr kWorldSize{2};
  std::unique_ptr<ServerForTest> server_;
 };

--- a/tests/cpp/plugin/test_federated_adapter.cu
+++ b/tests/cpp/plugin/test_federated_adapter.cu
@@ -9,7 +9,9 @@
 #include <thread>

 #include "../../../plugin/federated/federated_communicator.h"
+#include "../../../src/collective/communicator-inl.cuh"
 #include "../../../src/collective/device_communicator_adapter.cuh"
+#include "../helpers.h"
 #include "./helpers.h"

 namespace xgboost::collective {
@@ -17,67 +19,80 @@ namespace xgboost::collective {
 class FederatedAdapterTest : public BaseFederatedTest {};

 TEST(FederatedAdapterSimpleTest, ThrowOnInvalidDeviceOrdinal) {
-  auto construct = []() { DeviceCommunicatorAdapter adapter{-1, nullptr}; };
+  auto construct = []() { DeviceCommunicatorAdapter adapter{-1}; };
  EXPECT_THROW(construct(), dmlc::Error);
 }

-TEST(FederatedAdapterSimpleTest, ThrowOnInvalidCommunicator) {
-  auto construct = []() { DeviceCommunicatorAdapter adapter{0, nullptr}; };
-  EXPECT_THROW(construct(), dmlc::Error);
-}
-
-TEST_F(FederatedAdapterTest, DeviceAllReduceSum) {
-  std::vector<std::thread> threads;
-  for (auto rank = 0; rank < kWorldSize; rank++) {
-    threads.emplace_back([rank, server_address = server_->Address()] {
-      FederatedCommunicator comm{kWorldSize, rank, server_address};
-      // Assign device 0 to all workers, since we run gtest in a single-GPU machine
-      DeviceCommunicatorAdapter adapter{0, &comm};
-      int count = 3;
-      thrust::device_vector<double> buffer(count, 0);
-      thrust::sequence(buffer.begin(), buffer.end());
-      adapter.AllReduce(buffer.data().get(), count, DataType::kDouble, Operation::kSum);
-      thrust::host_vector<double> host_buffer = buffer;
-      EXPECT_EQ(host_buffer.size(), count);
-      for (auto i = 0; i < count; i++) {
-        EXPECT_EQ(host_buffer[i], i * kWorldSize);
-      }
-    });
-  }
-  for (auto& thread : threads) {
-    thread.join();
+namespace {
+void VerifyAllReduceSum() {
+  auto const world_size = collective::GetWorldSize();
+  auto const rank = collective::GetRank();
+  auto const device = GPUIDX;
+  int count = 3;
+  common::SetDevice(device);
+  thrust::device_vector<double> buffer(count, 0);
+  thrust::sequence(buffer.begin(), buffer.end());
+  collective::AllReduce<collective::Operation::kSum>(device, buffer.data().get(), count);
+  thrust::host_vector<double> host_buffer = buffer;
+  EXPECT_EQ(host_buffer.size(), count);
+  for (auto i = 0; i < count; i++) {
+    EXPECT_EQ(host_buffer[i], i * world_size);
  }
 }
+}  // anonymous namespace

-TEST_F(FederatedAdapterTest, DeviceAllGatherV) {
-  std::vector<std::thread> threads;
-  for (auto rank = 0; rank < kWorldSize; rank++) {
-    threads.emplace_back([rank, server_address = server_->Address()] {
-      FederatedCommunicator comm{kWorldSize, rank, server_address};
-      // Assign device 0 to all workers, since we run gtest in a single-GPU machine
-      DeviceCommunicatorAdapter adapter{0, &comm};
-
-      int const count = rank + 2;
-      thrust::device_vector<char> buffer(count, 0);
-      thrust::sequence(buffer.begin(), buffer.end());
-      std::vector<std::size_t> segments(kWorldSize);
-      dh::caching_device_vector<char> receive_buffer{};
-
-      adapter.AllGatherV(buffer.data().get(), count, &segments, &receive_buffer);
-
-      EXPECT_EQ(segments[0], 2);
-      EXPECT_EQ(segments[1], 3);
-      thrust::host_vector<char> host_buffer = receive_buffer;
-      EXPECT_EQ(host_buffer.size(), 9);
-      int expected[] = {0, 1, 0, 1, 2, 0, 1, 2, 3};
-      for (auto i = 0; i < 9; i++) {
-        EXPECT_EQ(host_buffer[i], expected[i]);
-      }
-    });
-  }
-  for (auto& thread : threads) {
-    thread.join();
-  }
+TEST_F(FederatedAdapterTest, MGPUAllReduceSum) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyAllReduceSum);
 }

+namespace {
+void VerifyAllGather() {
+  auto const world_size = collective::GetWorldSize();
+  auto const rank = collective::GetRank();
+  auto const device = GPUIDX;
+  common::SetDevice(device);
+  thrust::device_vector<double> send_buffer(1, rank);
+  thrust::device_vector<double> receive_buffer(world_size, 0);
+  collective::AllGather(device, send_buffer.data().get(), receive_buffer.data().get(),
+                        sizeof(double));
+  thrust::host_vector<double> host_buffer = receive_buffer;
+  EXPECT_EQ(host_buffer.size(), world_size);
+  for (auto i = 0; i < world_size; i++) {
+    EXPECT_EQ(host_buffer[i], i);
+  }
+}
+}  // anonymous namespace
+
+TEST_F(FederatedAdapterTest, MGPUAllGather) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyAllGather);
+}
+
+namespace {
+void VerifyAllGatherV() {
+  auto const world_size = collective::GetWorldSize();
+  auto const rank = collective::GetRank();
+  auto const device = GPUIDX;
+  int const count = rank + 2;
+  common::SetDevice(device);
+  thrust::device_vector<char> buffer(count, 0);
+  thrust::sequence(buffer.begin(), buffer.end());
+  std::vector<std::size_t> segments(world_size);
+  dh::caching_device_vector<char> receive_buffer{};
+
+  collective::AllGatherV(device, buffer.data().get(), count, &segments, &receive_buffer);
+
+  EXPECT_EQ(segments[0], 2);
+  EXPECT_EQ(segments[1], 3);
+  thrust::host_vector<char> host_buffer = receive_buffer;
+  EXPECT_EQ(host_buffer.size(), 5);
+  int expected[] = {0, 1, 0, 1, 2};
+  for (auto i = 0; i < 5; i++) {
+    EXPECT_EQ(host_buffer[i], expected[i]);
+  }
+}
+}  // anonymous namespace
+
+TEST_F(FederatedAdapterTest, MGPUAllGatherV) {
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyAllGatherV);
+}
 }  // namespace xgboost::collective
--- a/tests/cpp/plugin/test_federated_communicator.cc
+++ b/tests/cpp/plugin/test_federated_communicator.cc
@@ -31,7 +31,7 @@ class FederatedCommunicatorTest : public BaseFederatedTest {

 protected:
  static void CheckAllgather(FederatedCommunicator &comm, int rank) {
-    int buffer[kWorldSize] = {0, 0, 0};
+    int buffer[kWorldSize] = {0, 0};
    buffer[rank] = rank;
    comm.AllGather(buffer, sizeof(buffer));
    for (auto i = 0; i < kWorldSize; i++) {
@@ -42,7 +42,7 @@ class FederatedCommunicatorTest : public BaseFederatedTest {
  static void CheckAllreduce(FederatedCommunicator &comm) {
    int buffer[] = {1, 2, 3, 4, 5};
    comm.AllReduce(buffer, sizeof(buffer) / sizeof(buffer[0]), DataType::kInt32, Operation::kSum);
-    int expected[] = {3, 6, 9, 12, 15};
+    int expected[] = {2, 4, 6, 8, 10};
    for (auto i = 0; i < 5; i++) {
      EXPECT_EQ(buffer[i], expected[i]);
    }
--- a/tests/cpp/plugin/test_federated_data.cc
+++ b/tests/cpp/plugin/test_federated_data.cc
@@ -30,7 +30,7 @@ void VerifyLoadUri() {
  std::string uri = path + "?format=csv";
  dmat.reset(DMatrix::Load(uri, false, DataSplitMode::kCol));

-  ASSERT_EQ(dmat->Info().num_col_, 8 * collective::GetWorldSize() + 3);
+  ASSERT_EQ(dmat->Info().num_col_, 8 * collective::GetWorldSize() + 1);
  ASSERT_EQ(dmat->Info().num_row_, kRows);

  for (auto const& page : dmat->GetBatches<SparsePage>()) {
--- a/tests/cpp/plugin/test_federated_learner.cc
+++ b/tests/cpp/plugin/test_federated_learner.cc
@@ -15,9 +15,11 @@

 namespace xgboost {
 namespace {
-auto MakeModel(std::string tree_method, std::string objective, std::shared_ptr<DMatrix> dmat) {
+auto MakeModel(std::string tree_method, std::string device, std::string objective,
+               std::shared_ptr<DMatrix> dmat) {
  std::unique_ptr<Learner> learner{Learner::Create({dmat})};
  learner->SetParam("tree_method", tree_method);
+  learner->SetParam("device", device);
  learner->SetParam("objective", objective);
  if (objective.find("quantile") != std::string::npos) {
    learner->SetParam("quantile_alpha", "0.5");
@@ -35,7 +37,7 @@ auto MakeModel(std::string tree_method, std::string objective, std::shared_ptr<D
 }

 void VerifyObjective(size_t rows, size_t cols, float expected_base_score, Json expected_model,
-                     std::string tree_method, std::string objective) {
+                     std::string tree_method, std::string device, std::string objective) {
  auto const world_size = collective::GetWorldSize();
  auto const rank = collective::GetRank();
  std::shared_ptr<DMatrix> dmat{RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(rank == 0)};
@@ -61,14 +63,14 @@ void VerifyObjective(size_t rows, size_t cols, float expected_base_score, Json e
  }
  std::shared_ptr<DMatrix> sliced{dmat->SliceCol(world_size, rank)};

-  auto model = MakeModel(tree_method, objective, sliced);
+  auto model = MakeModel(tree_method, device, objective, sliced);
  auto base_score = GetBaseScore(model);
-  ASSERT_EQ(base_score, expected_base_score);
-  ASSERT_EQ(model, expected_model);
+  ASSERT_EQ(base_score, expected_base_score) << " rank " << rank;
+  ASSERT_EQ(model, expected_model) << " rank " << rank;
 }
 }  // namespace

-class FederatedLearnerTest : public ::testing::TestWithParam<std::string> {
+class VerticalFederatedLearnerTest : public ::testing::TestWithParam<std::string> {
  std::unique_ptr<ServerForTest> server_;
  static int constexpr kWorldSize{3};

@@ -76,7 +78,7 @@ class FederatedLearnerTest : public ::testing::TestWithParam<std::string> {
  void SetUp() override { server_ = std::make_unique<ServerForTest>(kWorldSize); }
  void TearDown() override { server_.reset(nullptr); }

-  void Run(std::string tree_method, std::string objective) {
+  void Run(std::string tree_method, std::string device, std::string objective) {
    static auto constexpr kRows{16};
    static auto constexpr kCols{16};

@@ -99,27 +101,35 @@ class FederatedLearnerTest : public ::testing::TestWithParam<std::string> {
      }
    }

-    auto model = MakeModel(tree_method, objective, dmat);
+    auto model = MakeModel(tree_method, device, objective, dmat);
    auto score = GetBaseScore(model);

    RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyObjective, kRows, kCols,
-                                 score, model, tree_method, objective);
+                                 score, model, tree_method, device, objective);
  }
 };

-TEST_P(FederatedLearnerTest, Approx) {
+TEST_P(VerticalFederatedLearnerTest, Approx) {
  std::string objective = GetParam();
-  this->Run("approx", objective);
+  this->Run("approx", "cpu", objective);
 }

-TEST_P(FederatedLearnerTest, Hist) {
+TEST_P(VerticalFederatedLearnerTest, Hist) {
  std::string objective = GetParam();
-  this->Run("hist", objective);
+  this->Run("hist", "cpu", objective);
 }

-INSTANTIATE_TEST_SUITE_P(FederatedLearnerObjective, FederatedLearnerTest,
-                         ::testing::ValuesIn(MakeObjNamesForTest()),
-                         [](const ::testing::TestParamInfo<FederatedLearnerTest::ParamType> &info) {
-                           return ObjTestNameGenerator(info);
-                         });
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
+TEST_P(VerticalFederatedLearnerTest, GPUHist) {
+  std::string objective = GetParam();
+  this->Run("hist", "cuda:0", objective);
+}
+#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
+
+INSTANTIATE_TEST_SUITE_P(
+    FederatedLearnerObjective, VerticalFederatedLearnerTest,
+    ::testing::ValuesIn(MakeObjNamesForTest()),
+    [](const ::testing::TestParamInfo<VerticalFederatedLearnerTest::ParamType> &info) {
+      return ObjTestNameGenerator(info);
+    });
 }  // namespace xgboost
--- a/tests/cpp/plugin/test_federated_server.cc
+++ b/tests/cpp/plugin/test_federated_server.cc
@@ -39,7 +39,7 @@ class FederatedServerTest : public BaseFederatedTest {

 protected:
  static void CheckAllgather(federated::FederatedClient& client, int rank) {
-    int data[kWorldSize] = {0, 0, 0};
+    int data[kWorldSize] = {0, 0};
    data[rank] = rank;
    std::string send_buffer(reinterpret_cast<char const*>(data), sizeof(data));
    auto reply = client.Allgather(send_buffer);
@@ -54,7 +54,7 @@ class FederatedServerTest : public BaseFederatedTest {
    std::string send_buffer(reinterpret_cast<char const*>(data), sizeof(data));
    auto reply = client.Allreduce(send_buffer, federated::INT32, federated::SUM);
    auto const* result = reinterpret_cast<int const*>(reply.data());
-    int expected[] = {3, 6, 9, 12, 15};
+    int expected[] = {2, 4, 6, 8, 10};
    for (auto i = 0; i < 5; i++) {
      EXPECT_EQ(result[i], expected[i]);
    }
--- a/tests/cpp/plugin/test_regression_obj_oneapi.cc
+++ b/tests/cpp/plugin/test_regression_obj_oneapi.cc
@@ -148,7 +148,7 @@ TEST(Plugin, CPUvsOneAPI) {

  {
    // CPU
-    ctx.gpu_id = -1;
+    ctx = ctx.MakeCPU();
    obj_cpu->GetGradient(preds, info, 0, &cpu_out_preds);
  }
  {
--- a/tests/cpp/predictor/test_cpu_predictor.cc
+++ b/tests/cpp/predictor/test_cpu_predictor.cc
@@ -122,11 +122,13 @@ TEST(CpuPredictor, BasicColumnSplit) {
 }

 TEST(CpuPredictor, IterationRange) {
-  TestIterationRange("cpu_predictor");
+  Context ctx;
+  TestIterationRange(&ctx);
 }

 TEST(CpuPredictor, IterationRangeColmnSplit) {
-  TestIterationRangeColumnSplit("cpu_predictor");
+  Context ctx;
+  TestIterationRangeColumnSplit(&ctx);
 }

 TEST(CpuPredictor, ExternalMemory) {
@@ -139,7 +141,8 @@ TEST(CpuPredictor, ExternalMemory) {
 TEST(CpuPredictor, InplacePredict) {
  bst_row_t constexpr kRows{128};
  bst_feature_t constexpr kCols{64};
-  auto gen = RandomDataGenerator{kRows, kCols, 0.5}.Device(-1);
+  Context ctx;
+  auto gen = RandomDataGenerator{kRows, kCols, 0.5}.Device(ctx.gpu_id);
  {
    HostDeviceVector<float> data;
    gen.GenerateDense(&data);
@@ -149,7 +152,7 @@ TEST(CpuPredictor, InplacePredict) {
    std::string arr_str;
    Json::Dump(array_interface, &arr_str);
    x->SetArrayData(arr_str.data());
-    TestInplacePrediction(x, "cpu_predictor", kRows, kCols, Context::kCpuId);
+    TestInplacePrediction(&ctx, x, kRows, kCols);
  }

  {
@@ -166,76 +169,80 @@ TEST(CpuPredictor, InplacePredict) {
    Json::Dump(col_interface, &col_str);
    std::shared_ptr<data::DMatrixProxy> x{new data::DMatrixProxy};
    x->SetCSRData(rptr_str.data(), col_str.data(), data_str.data(), kCols, true);
-    TestInplacePrediction(x, "cpu_predictor", kRows, kCols, Context::kCpuId);
+    TestInplacePrediction(&ctx, x, kRows, kCols);
  }
 }

+namespace {
 void TestUpdatePredictionCache(bool use_subsampling) {
-  size_t constexpr kRows = 64, kCols = 16, kClasses = 4;
+  std::size_t constexpr kRows = 64, kCols = 16, kClasses = 4;
  LearnerModelParam mparam{MakeMP(kCols, .0, kClasses)};
  Context ctx;

  std::unique_ptr<gbm::GBTree> gbm;
  gbm.reset(static_cast<gbm::GBTree*>(GradientBooster::Create("gbtree", &ctx, &mparam)));
-  std::map<std::string, std::string> cfg;
-  cfg["tree_method"] = "hist";
-  cfg["predictor"]   = "cpu_predictor";
+  Args args{{"tree_method", "hist"}};
  if (use_subsampling) {
-    cfg["subsample"] = "0.5";
+    args.emplace_back("subsample", "0.5");
  }
-  Args args = {cfg.cbegin(), cfg.cend()};
  gbm->Configure(args);

  auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(true, true, kClasses);

-  HostDeviceVector<GradientPair> gpair;
-  auto& h_gpair = gpair.HostVector();
-  h_gpair.resize(kRows*kClasses);
-  for (size_t i = 0; i < kRows*kClasses; ++i) {
-    h_gpair[i] = {static_cast<float>(i), 1};
+  linalg::Matrix<GradientPair> gpair({kRows, kClasses}, ctx.Device());
+  auto h_gpair = gpair.HostView();
+  for (size_t i = 0; i < kRows * kClasses; ++i) {
+    std::apply(h_gpair, linalg::UnravelIndex(i, kRows, kClasses)) = {static_cast<float>(i), 1};
  }

  PredictionCacheEntry predtion_cache;
-  predtion_cache.predictions.Resize(kRows*kClasses, 0);
-  // after one training iteration predtion_cache is filled with cached in QuantileHistMaker::Builder prediction values
+  predtion_cache.predictions.Resize(kRows * kClasses, 0);
+  // After one training iteration, predtion_cache is filled with the prediction values
+  // cached in QuantileHistMaker.
  gbm->DoBoost(dmat.get(), &gpair, &predtion_cache, nullptr);

  PredictionCacheEntry out_predictions;
-  // perform fair prediction on the same input data, should be equal to cached result
+  // perform prediction from scratch on the same input data, should be equal to cached result
  gbm->PredictBatch(dmat.get(), &out_predictions, false, 0, 0);

-  std::vector<float> &out_predictions_h = out_predictions.predictions.HostVector();
-  std::vector<float> &predtion_cache_from_train = predtion_cache.predictions.HostVector();
+  std::vector<float>& out_predictions_h = out_predictions.predictions.HostVector();
+  std::vector<float>& predtion_cache_from_train = predtion_cache.predictions.HostVector();
  for (size_t i = 0; i < out_predictions_h.size(); ++i) {
    ASSERT_NEAR(out_predictions_h[i], predtion_cache_from_train[i], kRtEps);
  }
 }
+}  // namespace

-TEST(CPUPredictor, GHistIndex) {
+TEST(CPUPredictor, GHistIndexTraining) {
  size_t constexpr kRows{128}, kCols{16}, kBins{64};
-  auto p_hist = RandomDataGenerator{kRows, kCols, 0.0}.Bins(kBins).GenerateQuantileDMatrix();
+  Context ctx;
+  auto p_hist = RandomDataGenerator{kRows, kCols, 0.0}.Bins(kBins).GenerateQuantileDMatrix(false);
  HostDeviceVector<float> storage(kRows * kCols);
  auto columnar = RandomDataGenerator{kRows, kCols, 0.0}.GenerateArrayInterface(&storage);
  auto adapter = data::ArrayAdapter(columnar.c_str());
  std::shared_ptr<DMatrix> p_full{
      DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1)};
-  TestTrainingPrediction(kRows, kBins, "hist", p_full, p_hist);
+  TestTrainingPrediction(&ctx, kRows, kBins, p_full, p_hist);
 }

 TEST(CPUPredictor, CategoricalPrediction) {
-  TestCategoricalPrediction("cpu_predictor");
+  Context ctx;
+  TestCategoricalPrediction(&ctx, false);
 }

 TEST(CPUPredictor, CategoricalPredictionColumnSplit) {
-  TestCategoricalPredictionColumnSplit("cpu_predictor");
+  Context ctx;
+  TestCategoricalPredictionColumnSplit(&ctx);
 }

 TEST(CPUPredictor, CategoricalPredictLeaf) {
-  TestCategoricalPredictLeaf(StringView{"cpu_predictor"});
+  Context ctx;
+  TestCategoricalPredictLeaf(&ctx, false);
 }

 TEST(CPUPredictor, CategoricalPredictLeafColumnSplit) {
-  TestCategoricalPredictLeafColumnSplit(StringView{"cpu_predictor"});
+  Context ctx;
+  TestCategoricalPredictLeafColumnSplit(&ctx);
 }

 TEST(CpuPredictor, UpdatePredictionCache) {
@@ -244,21 +251,25 @@ TEST(CpuPredictor, UpdatePredictionCache) {
 }

 TEST(CpuPredictor, LesserFeatures) {
-  TestPredictionWithLesserFeatures("cpu_predictor");
+  Context ctx;
+  TestPredictionWithLesserFeatures(&ctx);
 }

 TEST(CpuPredictor, LesserFeaturesColumnSplit) {
-  TestPredictionWithLesserFeaturesColumnSplit("cpu_predictor");
+  Context ctx;
+  TestPredictionWithLesserFeaturesColumnSplit(&ctx);
 }

 TEST(CpuPredictor, Sparse) {
-  TestSparsePrediction(0.2, "cpu_predictor");
-  TestSparsePrediction(0.8, "cpu_predictor");
+  Context ctx;
+  TestSparsePrediction(&ctx, 0.2);
+  TestSparsePrediction(&ctx, 0.8);
 }

 TEST(CpuPredictor, SparseColumnSplit) {
-  TestSparsePredictionColumnSplit(0.2, "cpu_predictor");
-  TestSparsePredictionColumnSplit(0.8, "cpu_predictor");
+  Context ctx;
+  TestSparsePredictionColumnSplit(&ctx, 0.2);
+  TestSparsePredictionColumnSplit(&ctx, 0.8);
 }

 TEST(CpuPredictor, Multi) {
@@ -266,4 +277,6 @@ TEST(CpuPredictor, Multi) {
  ctx.nthread = 1;
  TestVectorLeafPrediction(&ctx);
 }
+
+TEST(CpuPredictor, Access) { TestPredictionDeviceAccess(); }
 }  // namespace xgboost
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -19,8 +19,7 @@
 #include "../helpers.h"
 #include "test_predictor.h"

-namespace xgboost {
-namespace predictor {
+namespace xgboost::predictor {

 TEST(GPUPredictor, Basic) {
  auto cpu_lparam = MakeCUDACtx(-1);
@@ -38,9 +37,8 @@ TEST(GPUPredictor, Basic) {
    int n_row = i, n_col = i;
    auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix();

-    Context ctx;
-    ctx.gpu_id = 0;
-    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.gpu_id)};
+    auto ctx = MakeCUDACtx(0);
+    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())};
    gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);

    // Test predict batch
@@ -61,30 +59,92 @@ TEST(GPUPredictor, Basic) {
  }
 }

+namespace {
+void VerifyBasicColumnSplit(std::array<std::vector<float>, 32> const& expected_result) {
+  auto const world_size = collective::GetWorldSize();
+  auto const rank = collective::GetRank();
+
+  auto ctx = MakeCUDACtx(GPUIDX);
+  std::unique_ptr<Predictor> predictor =
+      std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &ctx));
+  predictor->Configure({});
+
+  for (size_t i = 1; i < 33; i *= 2) {
+    size_t n_row = i, n_col = i;
+    auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix();
+    std::unique_ptr<DMatrix> sliced{dmat->SliceCol(world_size, rank)};
+
+    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())};
+    gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);
+
+    // Test predict batch
+    PredictionCacheEntry out_predictions;
+
+    predictor->InitOutPredictions(sliced->Info(), &out_predictions.predictions, model);
+    predictor->PredictBatch(sliced.get(), &out_predictions, model, 0);
+
+    std::vector<float>& out_predictions_h = out_predictions.predictions.HostVector();
+    EXPECT_EQ(out_predictions_h, expected_result[i - 1]);
+  }
+}
+}  // anonymous namespace
+
+class MGPUPredictorTest : public BaseMGPUTest {};
+
+TEST_F(MGPUPredictorTest, BasicColumnSplit) {
+  auto ctx = MakeCUDACtx(0);
+  std::unique_ptr<Predictor> predictor =
+      std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &ctx));
+  predictor->Configure({});
+
+  std::array<std::vector<float>, 32> result{};
+  for (size_t i = 1; i < 33; i *= 2) {
+    size_t n_row = i, n_col = i;
+    auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix();
+
+    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())};
+    gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);
+
+    // Test predict batch
+    PredictionCacheEntry out_predictions;
+
+    predictor->InitOutPredictions(dmat->Info(), &out_predictions.predictions, model);
+    predictor->PredictBatch(dmat.get(), &out_predictions, model, 0);
+
+    std::vector<float>& out_predictions_h = out_predictions.predictions.HostVector();
+    result[i - 1] = out_predictions_h;
+  }
+
+  DoTest(VerifyBasicColumnSplit, result);
+}
+
 TEST(GPUPredictor, EllpackBasic) {
-  size_t constexpr kCols {8};
+  size_t constexpr kCols{8};
+  auto ctx = MakeCUDACtx(0);
  for (size_t bins = 2; bins < 258; bins += 16) {
    size_t rows = bins * 16;
-    auto p_m = RandomDataGenerator{rows, kCols, 0.0}.Bins(bins).Device(0).GenerateDeviceDMatrix();
+    auto p_m =
+        RandomDataGenerator{rows, kCols, 0.0}.Bins(bins).Device(0).GenerateDeviceDMatrix(false);
    ASSERT_FALSE(p_m->PageExists<SparsePage>());
-    TestPredictionFromGradientIndex<EllpackPage>("gpu_predictor", rows, kCols, p_m);
-    TestPredictionFromGradientIndex<EllpackPage>("gpu_predictor", bins, kCols, p_m);
+    TestPredictionFromGradientIndex<EllpackPage>(&ctx, rows, kCols, p_m);
+    TestPredictionFromGradientIndex<EllpackPage>(&ctx, bins, kCols, p_m);
  }
 }

 TEST(GPUPredictor, EllpackTraining) {
-  size_t constexpr kRows { 128 }, kCols { 16 }, kBins { 64 };
-  auto p_ellpack =
-      RandomDataGenerator{kRows, kCols, 0.0}.Bins(kBins).Device(0).GenerateDeviceDMatrix();
+  auto ctx = MakeCUDACtx(0);
+  size_t constexpr kRows{128}, kCols{16}, kBins{64};
+  auto p_ellpack = RandomDataGenerator{kRows, kCols, 0.0}
+                       .Bins(kBins)
+                       .Device(ctx.Ordinal())
+                       .GenerateDeviceDMatrix(false);
  HostDeviceVector<float> storage(kRows * kCols);
-  auto columnar = RandomDataGenerator{kRows, kCols, 0.0}
-       .Device(0)
-       .GenerateArrayInterface(&storage);
+  auto columnar =
+      RandomDataGenerator{kRows, kCols, 0.0}.Device(ctx.Ordinal()).GenerateArrayInterface(&storage);
  auto adapter = data::CupyAdapter(columnar);
-  std::shared_ptr<DMatrix> p_full {
-    DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1)
-  };
-  TestTrainingPrediction(kRows, kBins, "gpu_hist", p_full, p_ellpack);
+  std::shared_ptr<DMatrix> p_full{
+      DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1)};
+  TestTrainingPrediction(&ctx, kRows, kBins, p_full, p_ellpack);
 }

 TEST(GPUPredictor, ExternalMemoryTest) {
@@ -94,9 +154,8 @@ TEST(GPUPredictor, ExternalMemoryTest) {
  gpu_predictor->Configure({});

  const int n_classes = 3;
-  Context ctx;
-  ctx.gpu_id = 0;
-  LearnerModelParam mparam{MakeMP(5, .5, n_classes, ctx.gpu_id)};
+  Context ctx = MakeCUDACtx(0);
+  LearnerModelParam mparam{MakeMP(5, .5, n_classes, ctx.Ordinal())};

  gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx, n_classes);
  std::vector<std::unique_ptr<DMatrix>> dmats;
@@ -123,29 +182,32 @@ TEST(GPUPredictor, ExternalMemoryTest) {
 }

 TEST(GPUPredictor, InplacePredictCupy) {
+  auto ctx = MakeCUDACtx(0);
  size_t constexpr kRows{128}, kCols{64};
  RandomDataGenerator gen(kRows, kCols, 0.5);
-  gen.Device(0);
+  gen.Device(ctx.Ordinal());
  HostDeviceVector<float> data;
  std::string interface_str = gen.GenerateArrayInterface(&data);
  std::shared_ptr<DMatrix> p_fmat{new data::DMatrixProxy};
  dynamic_cast<data::DMatrixProxy*>(p_fmat.get())->SetCUDAArray(interface_str.c_str());
-  TestInplacePrediction(p_fmat, "gpu_predictor", kRows, kCols, 0);
+  TestInplacePrediction(&ctx, p_fmat, kRows, kCols);
 }

 TEST(GPUPredictor, InplacePredictCuDF) {
+  auto ctx = MakeCUDACtx(0);
  size_t constexpr kRows{128}, kCols{64};
  RandomDataGenerator gen(kRows, kCols, 0.5);
-  gen.Device(0);
+  gen.Device(ctx.Ordinal());
  std::vector<HostDeviceVector<float>> storage(kCols);
  auto interface_str = gen.GenerateColumnarArrayInterface(&storage);
  std::shared_ptr<DMatrix> p_fmat{new data::DMatrixProxy};
  dynamic_cast<data::DMatrixProxy*>(p_fmat.get())->SetCUDAArray(interface_str.c_str());
-  TestInplacePrediction(p_fmat, "gpu_predictor", kRows, kCols, 0);
+  TestInplacePrediction(&ctx, p_fmat, kRows, kCols);
 }

 TEST(GpuPredictor, LesserFeatures) {
-  TestPredictionWithLesserFeatures("gpu_predictor");
+  auto ctx = MakeCUDACtx(0);
+  TestPredictionWithLesserFeatures(&ctx);
 }

 // Very basic test of empty model
@@ -156,13 +218,12 @@ TEST(GPUPredictor, ShapStump) {
  hipSetDevice(0);
 #endif

-  Context ctx;
-  ctx.gpu_id = 0;
-  LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.gpu_id)};
+  auto ctx = MakeCUDACtx(0);
+  LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Ordinal())};
  gbm::GBTreeModel model(&mparam, &ctx);

  std::vector<std::unique_ptr<RegTree>> trees;
-  trees.push_back(std::unique_ptr<RegTree>(new RegTree));
+  trees.push_back(std::make_unique<RegTree>());
  model.CommitModelGroup(std::move(trees), 0);

  auto gpu_lparam = MakeCUDACtx(0);
@@ -173,7 +234,7 @@ TEST(GPUPredictor, ShapStump) {
  auto dmat = RandomDataGenerator(3, 1, 0).GenerateDMatrix();
  gpu_predictor->PredictContribution(dmat.get(), &predictions, model);
  auto& phis = predictions.HostVector();
-  auto base_score = mparam.BaseScore(Context::kCpuId)(0);
+  auto base_score = mparam.BaseScore(DeviceOrd::CPU())(0);
  EXPECT_EQ(phis[0], 0.0);
  EXPECT_EQ(phis[1], base_score);
  EXPECT_EQ(phis[2], 0.0);
@@ -183,13 +244,12 @@ TEST(GPUPredictor, ShapStump) {
 }

 TEST(GPUPredictor, Shap) {
-  Context ctx;
-  ctx.gpu_id = 0;
-  LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.gpu_id)};
+  auto ctx = MakeCUDACtx(0);
+  LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Ordinal())};
  gbm::GBTreeModel model(&mparam, &ctx);

  std::vector<std::unique_ptr<RegTree>> trees;
-  trees.push_back(std::unique_ptr<RegTree>(new RegTree));
+  trees.push_back(std::make_unique<RegTree>());
  trees[0]->ExpandNode(0, 0, 0.5, true, 1.0, -1.0, 1.0, 0.0, 5.0, 2.0, 3.0);
  model.CommitModelGroup(std::move(trees), 0);

@@ -214,15 +274,18 @@ TEST(GPUPredictor, Shap) {
 }

 TEST(GPUPredictor, IterationRange) {
-  TestIterationRange("gpu_predictor");
+  auto ctx = MakeCUDACtx(0);
+  TestIterationRange(&ctx);
 }

 TEST(GPUPredictor, CategoricalPrediction) {
-  TestCategoricalPrediction("gpu_predictor");
+  auto ctx = MakeCUDACtx(0);
+  TestCategoricalPrediction(&ctx, false);
 }

 TEST(GPUPredictor, CategoricalPredictLeaf) {
-  TestCategoricalPredictLeaf(StringView{"gpu_predictor"});
+  auto ctx = MakeCUDACtx(0);
+  TestCategoricalPredictLeaf(&ctx, false);
 }

 TEST(GPUPredictor, PredictLeafBasic) {
@@ -246,8 +309,8 @@ TEST(GPUPredictor, PredictLeafBasic) {
 }

 TEST(GPUPredictor, Sparse) {
-  TestSparsePrediction(0.2, "gpu_predictor");
-  TestSparsePrediction(0.8, "gpu_predictor");
+  auto ctx = MakeCUDACtx(0);
+  TestSparsePrediction(&ctx, 0.2);
+  TestSparsePrediction(&ctx, 0.8);
 }
-}  // namespace predictor
-}  // namespace xgboost
+}  // namespace xgboost::predictor
--- a/tests/cpp/predictor/test_predictor.cc
+++ b/tests/cpp/predictor/test_predictor.cc
@@ -8,9 +8,11 @@
 #include <xgboost/data.h>                         // for DMatrix, BatchIterator, BatchSet, MetaInfo
 #include <xgboost/host_device_vector.h>           // for HostDeviceVector
 #include <xgboost/predictor.h>                    // for PredictionCacheEntry, Predictor, Predic...
+#include <xgboost/string_view.h>                  // for StringView

 #include <algorithm>                              // for max
 #include <limits>                                 // for numeric_limits
+#include <memory>                                 // for shared_ptr
 #include <unordered_map>                          // for unordered_map

 #include "../../../src/common/bitfield.h"         // for LBitField32
@@ -42,67 +44,56 @@ TEST(Predictor, PredictionCache) {
  EXPECT_ANY_THROW(container.Entry(m));
 }

-void TestTrainingPrediction(size_t rows, size_t bins,
-                            std::string tree_method,
-                            std::shared_ptr<DMatrix> p_full,
-                            std::shared_ptr<DMatrix> p_hist) {
+void TestTrainingPrediction(Context const *ctx, size_t rows, size_t bins,
+                            std::shared_ptr<DMatrix> p_full, std::shared_ptr<DMatrix> p_hist) {
  size_t constexpr kCols = 16;
  size_t constexpr kClasses = 3;
  size_t constexpr kIters = 3;

  std::unique_ptr<Learner> learner;
-  auto train = [&](std::string predictor) {
-    p_hist->Info().labels.Reshape(rows, 1);
-    auto &h_label = p_hist->Info().labels.Data()->HostVector();

-    for (size_t i = 0; i < rows; ++i) {
-      h_label[i] = i % kClasses;
-    }
+  p_hist->Info().labels.Reshape(rows, 1);
+  auto &h_label = p_hist->Info().labels.Data()->HostVector();

-    learner.reset(Learner::Create({}));
-    learner->SetParam("tree_method", tree_method);
-    learner->SetParam("objective", "multi:softprob");
-    learner->SetParam("num_feature", std::to_string(kCols));
-    learner->SetParam("num_class", std::to_string(kClasses));
-    learner->SetParam("max_bin", std::to_string(bins));
-    learner->SetParam("predictor", predictor);
-    learner->Configure();
+  for (size_t i = 0; i < rows; ++i) {
+    h_label[i] = i % kClasses;
+  }

-    for (size_t i = 0; i < kIters; ++i) {
-      learner->UpdateOneIter(i, p_hist);
-    }
+  learner.reset(Learner::Create({}));
+  learner->SetParams(Args{{"objective", "multi:softprob"},
+                          {"num_feature", std::to_string(kCols)},
+                          {"num_class", std::to_string(kClasses)},
+                          {"max_bin", std::to_string(bins)},
+                          {"device", ctx->DeviceName()}});
+  learner->Configure();

-    Json model{Object{}};
-    learner->SaveModel(&model);
+  for (size_t i = 0; i < kIters; ++i) {
+    learner->UpdateOneIter(i, p_hist);
+  }

-    learner.reset(Learner::Create({}));
-    learner->LoadModel(model);
-    learner->SetParam("predictor", predictor);
-    learner->Configure();
+  Json model{Object{}};
+  learner->SaveModel(&model);

-    HostDeviceVector<float> from_full;
-    learner->Predict(p_full, false, &from_full, 0, 0);
+  learner.reset(Learner::Create({}));
+  learner->LoadModel(model);
+  learner->SetParam("device", ctx->DeviceName());
+  learner->Configure();

-    HostDeviceVector<float> from_hist;
-    learner->Predict(p_hist, false, &from_hist, 0, 0);
+  HostDeviceVector<float> from_full;
+  learner->Predict(p_full, false, &from_full, 0, 0);

-    for (size_t i = 0; i < rows; ++i) {
-      EXPECT_NEAR(from_hist.ConstHostVector()[i],
-                  from_full.ConstHostVector()[i], kRtEps);
-    }
-  };
+  HostDeviceVector<float> from_hist;
+  learner->Predict(p_hist, false, &from_hist, 0, 0);

-  if (tree_method == "gpu_hist") {
-    train("gpu_predictor");
-  } else {
-    train("cpu_predictor");
+  for (size_t i = 0; i < rows; ++i) {
+    EXPECT_NEAR(from_hist.ConstHostVector()[i], from_full.ConstHostVector()[i], kRtEps);
  }
 }

-void TestInplacePrediction(std::shared_ptr<DMatrix> x, std::string predictor, bst_row_t rows,
-                           bst_feature_t cols, int32_t device) {
-  size_t constexpr kClasses { 4 };
-  auto gen = RandomDataGenerator{rows, cols, 0.5}.Device(device);
+void TestInplacePrediction(Context const *ctx, std::shared_ptr<DMatrix> x, bst_row_t rows,
+                           bst_feature_t cols) {
+  std::size_t constexpr kClasses { 4 };
+  auto gen = RandomDataGenerator{rows, cols, 0.5}.Device(ctx->gpu_id);
  std::shared_ptr<DMatrix> m = gen.GenerateDMatrix(true, false, kClasses);

  std::unique_ptr<Learner> learner {
@@ -113,12 +104,14 @@ void TestInplacePrediction(std::shared_ptr<DMatrix> x, std::string predictor, bs
  learner->SetParam("num_class", std::to_string(kClasses));
  learner->SetParam("seed", "0");
  learner->SetParam("subsample", "0.5");
-  learner->SetParam("gpu_id", std::to_string(device));
-  learner->SetParam("predictor", predictor);
+  learner->SetParam("tree_method", "hist");
  for (int32_t it = 0; it < 4; ++it) {
    learner->UpdateOneIter(it, m);
  }

+  learner->SetParam("device", ctx->DeviceName());
+  learner->Configure();
+
  HostDeviceVector<float> *p_out_predictions_0{nullptr};
  learner->InplacePredict(x, PredictionType::kMargin, std::numeric_limits<float>::quiet_NaN(),
                          &p_out_predictions_0, 0, 2);
@@ -149,67 +142,37 @@ void TestInplacePrediction(std::shared_ptr<DMatrix> x, std::string predictor, bs
    ASSERT_NEAR(h_pred[i], h_pred_0[i] + h_pred_1[i] - 0.5f, kRtEps);
  }

-  learner->SetParam("gpu_id", "-1");
+  learner->SetParam("device", "cpu");
  learner->Configure();
 }

 namespace {
-std::unique_ptr<Learner> LearnerForTest(std::shared_ptr<DMatrix> dmat, size_t iters,
-                                        size_t forest = 1) {
+std::unique_ptr<Learner> LearnerForTest(Context const *ctx, std::shared_ptr<DMatrix> dmat,
+                                        size_t iters, size_t forest = 1) {
  std::unique_ptr<Learner> learner{Learner::Create({dmat})};
-  learner->SetParams(Args{{"num_parallel_tree", std::to_string(forest)}});
+  learner->SetParams(
+      Args{{"num_parallel_tree", std::to_string(forest)}, {"device", ctx->DeviceName()}});
  for (size_t i = 0; i < iters; ++i) {
    learner->UpdateOneIter(i, dmat);
  }
+
  return learner;
 }

-void VerifyPredictionWithLesserFeatures(Learner *learner, std::string const &predictor_name,
-                                        size_t rows, std::shared_ptr<DMatrix> const &m_test,
-                                        std::shared_ptr<DMatrix> const &m_invalid) {
+void VerifyPredictionWithLesserFeatures(Learner *learner, bst_row_t kRows,
+                                        std::shared_ptr<DMatrix> m_test,
+                                        std::shared_ptr<DMatrix> m_invalid) {
  HostDeviceVector<float> prediction;
-  learner->SetParam("predictor", predictor_name);
-  learner->Configure();
  Json config{Object()};
  learner->SaveConfig(&config);
-  ASSERT_EQ(get<String>(config["learner"]["gradient_booster"]["gbtree_train_param"]["predictor"]),
-            predictor_name);

  learner->Predict(m_test, false, &prediction, 0, 0);
-  ASSERT_EQ(prediction.Size(), rows);
+  ASSERT_EQ(prediction.Size(), kRows);

  ASSERT_THROW({ learner->Predict(m_invalid, false, &prediction, 0, 0); }, dmlc::Error);
-
-#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
-  HostDeviceVector<float> from_cpu;
-  learner->SetParam("predictor", "cpu_predictor");
-  learner->Predict(m_test, false, &from_cpu, 0, 0);
-
-  HostDeviceVector<float> from_cuda;
-  learner->SetParam("predictor", "gpu_predictor");
-  learner->Predict(m_test, false, &from_cuda, 0, 0);
-
-  auto const &h_cpu = from_cpu.ConstHostVector();
-  auto const &h_gpu = from_cuda.ConstHostVector();
-  for (size_t i = 0; i < h_cpu.size(); ++i) {
-    ASSERT_NEAR(h_cpu[i], h_gpu[i], kRtEps);
-  }
-#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
-}
-}  // anonymous namespace
-
-void TestPredictionWithLesserFeatures(std::string predictor_name) {
-  size_t constexpr kRows = 256, kTrainCols = 256, kTestCols = 4, kIters = 4;
-  auto m_train = RandomDataGenerator(kRows, kTrainCols, 0.5).GenerateDMatrix(true);
-  auto learner = LearnerForTest(m_train, kIters);
-  auto m_test = RandomDataGenerator(kRows, kTestCols, 0.5).GenerateDMatrix(false);
-  auto m_invalid = RandomDataGenerator(kRows, kTrainCols + 1, 0.5).GenerateDMatrix(false);
-  VerifyPredictionWithLesserFeatures(learner.get(), predictor_name, kRows, m_test, m_invalid);
 }

-namespace {
-void VerifyPredictionWithLesserFeaturesColumnSplit(Learner *learner,
-                                                   std::string const &predictor_name, size_t rows,
+void VerifyPredictionWithLesserFeaturesColumnSplit(Learner *learner, size_t rows,
                                                   std::shared_ptr<DMatrix> m_test,
                                                   std::shared_ptr<DMatrix> m_invalid) {
  auto const world_size = collective::GetWorldSize();
@@ -217,20 +180,65 @@ void VerifyPredictionWithLesserFeaturesColumnSplit(Learner *learner,
  std::shared_ptr<DMatrix> sliced_test{m_test->SliceCol(world_size, rank)};
  std::shared_ptr<DMatrix> sliced_invalid{m_invalid->SliceCol(world_size, rank)};

-  VerifyPredictionWithLesserFeatures(learner, predictor_name, rows, sliced_test, sliced_invalid);
+  VerifyPredictionWithLesserFeatures(learner, rows, sliced_test, sliced_invalid);
 }
 }  // anonymous namespace

-void TestPredictionWithLesserFeaturesColumnSplit(std::string predictor_name) {
+void TestPredictionWithLesserFeatures(Context const *ctx) {
  size_t constexpr kRows = 256, kTrainCols = 256, kTestCols = 4, kIters = 4;
  auto m_train = RandomDataGenerator(kRows, kTrainCols, 0.5).GenerateDMatrix(true);
-  auto learner = LearnerForTest(m_train, kIters);
+  auto learner = LearnerForTest(ctx, m_train, kIters);
+  auto m_test = RandomDataGenerator(kRows, kTestCols, 0.5).GenerateDMatrix(false);
+  auto m_invalid = RandomDataGenerator(kRows, kTrainCols + 1, 0.5).GenerateDMatrix(false);
+  VerifyPredictionWithLesserFeatures(learner.get(), kRows, m_test, m_invalid);
+}
+
+void TestPredictionDeviceAccess() {
+  Context ctx;
+  size_t constexpr kRows = 256, kTrainCols = 256, kTestCols = 4, kIters = 4;
+  auto m_train = RandomDataGenerator(kRows, kTrainCols, 0.5).GenerateDMatrix(true);
+  auto m_test = RandomDataGenerator(kRows, kTestCols, 0.5).GenerateDMatrix(false);
+  auto learner = LearnerForTest(&ctx, m_train, kIters);
+
+  HostDeviceVector<float> from_cpu;
+  {
+    ASSERT_EQ(from_cpu.DeviceIdx(), Context::kCpuId);
+    Context cpu_ctx;
+    learner->SetParam("device", cpu_ctx.DeviceName());
+    learner->Predict(m_test, false, &from_cpu, 0, 0);
+    ASSERT_TRUE(from_cpu.HostCanWrite());
+    ASSERT_FALSE(from_cpu.DeviceCanRead());
+  }
+
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
+  HostDeviceVector<float> from_cuda;
+  {
+    Context cuda_ctx = MakeCUDACtx(0);
+    learner->SetParam("device", cuda_ctx.DeviceName());
+    learner->Predict(m_test, false, &from_cuda, 0, 0);
+    ASSERT_EQ(from_cuda.DeviceIdx(), 0);
+    ASSERT_TRUE(from_cuda.DeviceCanWrite());
+    ASSERT_FALSE(from_cuda.HostCanRead());
+  }
+
+  auto const &h_cpu = from_cpu.ConstHostVector();
+  auto const &h_gpu = from_cuda.ConstHostVector();
+  for (size_t i = 0; i < h_cpu.size(); ++i) {
+    ASSERT_NEAR(h_cpu[i], h_gpu[i], kRtEps);
+  }
+#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
+}
+
+void TestPredictionWithLesserFeaturesColumnSplit(Context const *ctx) {
+  size_t constexpr kRows = 256, kTrainCols = 256, kTestCols = 4, kIters = 4;
+  auto m_train = RandomDataGenerator(kRows, kTrainCols, 0.5).GenerateDMatrix(true);
+  auto learner = LearnerForTest(ctx, m_train, kIters);
  auto m_test = RandomDataGenerator(kRows, kTestCols, 0.5).GenerateDMatrix(false);
  auto m_invalid = RandomDataGenerator(kRows, kTrainCols + 1, 0.5).GenerateDMatrix(false);

  auto constexpr kWorldSize = 2;
  RunWithInMemoryCommunicator(kWorldSize, VerifyPredictionWithLesserFeaturesColumnSplit,
-                              learner.get(), predictor_name, kRows, m_test, m_invalid);
+                              learner.get(), kRows, m_test, m_invalid);
 }

 void GBTreeModelForTest(gbm::GBTreeModel *model, uint32_t split_ind,
@@ -252,7 +260,7 @@ void GBTreeModelForTest(gbm::GBTreeModel *model, uint32_t split_ind,
  model->CommitModelGroup(std::move(trees), 0);
 }

-void TestCategoricalPrediction(std::string name, bool is_column_split) {
+void TestCategoricalPrediction(Context const* ctx, bool is_column_split) {
  size_t constexpr kCols = 10;
  PredictionCacheEntry out_predictions;

@@ -262,13 +270,10 @@ void TestCategoricalPrediction(std::string name, bool is_column_split) {
  float left_weight = 1.3f;
  float right_weight = 1.7f;

-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{});
-  gbm::GBTreeModel model(&mparam, &ctx);
+  gbm::GBTreeModel model(&mparam, ctx);
  GBTreeModelForTest(&model, split_ind, split_cat, left_weight, right_weight);

-  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
-  std::unique_ptr<Predictor> predictor{Predictor::Create(name.c_str(), &ctx)};
+  std::unique_ptr<Predictor> predictor{CreatePredictorForTest(ctx)};

  std::vector<float> row(kCols);
  row[split_ind] = split_cat;
@@ -282,7 +287,7 @@ void TestCategoricalPrediction(std::string name, bool is_column_split) {

  predictor->InitOutPredictions(m->Info(), &out_predictions.predictions, model);
  predictor->PredictBatch(m.get(), &out_predictions, model, 0);
-  auto score = mparam.BaseScore(Context::kCpuId)(0);
+  auto score = mparam.BaseScore(DeviceOrd::CPU())(0);
  ASSERT_EQ(out_predictions.predictions.Size(), 1ul);
  ASSERT_EQ(out_predictions.predictions.HostVector()[0],
            right_weight + score);  // go to right for matching cat
@@ -298,12 +303,12 @@ void TestCategoricalPrediction(std::string name, bool is_column_split) {
  ASSERT_EQ(out_predictions.predictions.HostVector()[0], left_weight + score);
 }

-void TestCategoricalPredictionColumnSplit(std::string name) {
+void TestCategoricalPredictionColumnSplit(Context const *ctx) {
  auto constexpr kWorldSize = 2;
-  RunWithInMemoryCommunicator(kWorldSize, TestCategoricalPrediction, name, true);
+  RunWithInMemoryCommunicator(kWorldSize, TestCategoricalPrediction, ctx, true);
 }

-void TestCategoricalPredictLeaf(StringView name, bool is_column_split) {
+void TestCategoricalPredictLeaf(Context const *ctx, bool is_column_split) {
  size_t constexpr kCols = 10;
  PredictionCacheEntry out_predictions;

@@ -314,14 +319,10 @@ void TestCategoricalPredictLeaf(StringView name, bool is_column_split) {
  float left_weight = 1.3f;
  float right_weight = 1.7f;

-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{});
-
-  gbm::GBTreeModel model(&mparam, &ctx);
+  gbm::GBTreeModel model(&mparam, ctx);
  GBTreeModelForTest(&model, split_ind, split_cat, left_weight, right_weight);

-  ctx.gpu_id = 0;
-  std::unique_ptr<Predictor> predictor{Predictor::Create(name.c_str(), &ctx)};
+  std::unique_ptr<Predictor> predictor{CreatePredictorForTest(ctx)};

  std::vector<float> row(kCols);
  row[split_ind] = split_cat;
@@ -346,19 +347,21 @@ void TestCategoricalPredictLeaf(StringView name, bool is_column_split) {
  ASSERT_EQ(out_predictions.predictions.HostVector()[0], 1);
 }

-void TestCategoricalPredictLeafColumnSplit(StringView name) {
+void TestCategoricalPredictLeafColumnSplit(Context const *ctx) {
  auto constexpr kWorldSize = 2;
-  RunWithInMemoryCommunicator(kWorldSize, TestCategoricalPredictLeaf, name, true);
+  RunWithInMemoryCommunicator(kWorldSize, TestCategoricalPredictLeaf, ctx, true);
 }

-void TestIterationRange(std::string name) {
+void TestIterationRange(Context const* ctx) {
  size_t constexpr kRows = 1000, kCols = 20, kClasses = 4, kForest = 3, kIters = 10;
-  auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(true, true, kClasses);
-  auto learner = LearnerForTest(dmat, kIters, kForest);
-  learner->SetParams(Args{{"predictor", name}});
+  auto dmat = RandomDataGenerator(kRows, kCols, 0)
+                  .Device(ctx->gpu_id)
+                  .GenerateDMatrix(true, true, kClasses);
+  auto learner = LearnerForTest(ctx, dmat, kIters, kForest);

  bool bound = false;
-  std::unique_ptr<Learner> sliced {learner->Slice(0, 3, 1, &bound)};
+  bst_layer_t lend{3};
+  std::unique_ptr<Learner> sliced{learner->Slice(0, lend, 1, &bound)};
  ASSERT_FALSE(bound);

  HostDeviceVector<float> out_predt_sliced;
@@ -366,11 +369,8 @@ void TestIterationRange(std::string name) {

  // margin
  {
-    sliced->Predict(dmat, true, &out_predt_sliced, 0, 0, false, false, false,
-                    false, false);
-
-    learner->Predict(dmat, true, &out_predt_ranged, 0, 3, false, false, false,
-                     false, false);
+    sliced->Predict(dmat, true, &out_predt_sliced, 0, 0, false, false, false, false, false);
+    learner->Predict(dmat, true, &out_predt_ranged, 0, lend, false, false, false, false, false);

    auto const &h_sliced = out_predt_sliced.HostVector();
    auto const &h_range = out_predt_ranged.HostVector();
@@ -380,11 +380,8 @@ void TestIterationRange(std::string name) {

  // SHAP
  {
-    sliced->Predict(dmat, false, &out_predt_sliced, 0, 0, false, false,
-                    true, false, false);
-
-    learner->Predict(dmat, false, &out_predt_ranged, 0, 3, false, false, true,
-                     false, false);
+    sliced->Predict(dmat, false, &out_predt_sliced, 0, 0, false, false, true, false, false);
+    learner->Predict(dmat, false, &out_predt_ranged, 0, lend, false, false, true, false, false);

    auto const &h_sliced = out_predt_sliced.HostVector();
    auto const &h_range = out_predt_ranged.HostVector();
@@ -394,10 +391,8 @@ void TestIterationRange(std::string name) {

  // SHAP interaction
  {
-    sliced->Predict(dmat, false, &out_predt_sliced, 0, 0, false, false,
-                    false, false, true);
-    learner->Predict(dmat, false, &out_predt_ranged, 0, 3, false, false, false,
-                     false, true);
+    sliced->Predict(dmat, false, &out_predt_sliced, 0, 0, false, false, false, false, true);
+    learner->Predict(dmat, false, &out_predt_ranged, 0, lend, false, false, false, false, true);
    auto const &h_sliced = out_predt_sliced.HostVector();
    auto const &h_range = out_predt_ranged.HostVector();
    ASSERT_EQ(h_sliced.size(), h_range.size());
@@ -406,10 +401,8 @@ void TestIterationRange(std::string name) {

  // Leaf
  {
-    sliced->Predict(dmat, false, &out_predt_sliced, 0, 0, false, true,
-                    false, false, false);
-    learner->Predict(dmat, false, &out_predt_ranged, 0, 3, false, true, false,
-                     false, false);
+    sliced->Predict(dmat, false, &out_predt_sliced, 0, 0, false, true, false, false, false);
+    learner->Predict(dmat, false, &out_predt_ranged, 0, lend, false, true, false, false, false);
    auto const &h_sliced = out_predt_sliced.HostVector();
    auto const &h_range = out_predt_ranged.HostVector();
    ASSERT_EQ(h_sliced.size(), h_range.size());
@@ -456,11 +449,12 @@ void VerifyIterationRangeColumnSplit(DMatrix *dmat, Learner *learner, Learner *s
 }
 }  // anonymous namespace

-void TestIterationRangeColumnSplit(std::string name) {
+void TestIterationRangeColumnSplit(Context const* ctx) {
  size_t constexpr kRows = 1000, kCols = 20, kClasses = 4, kForest = 3, kIters = 10;
  auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(true, true, kClasses);
-  auto learner = LearnerForTest(dmat, kIters, kForest);
-  learner->SetParams(Args{{"predictor", name}});
+  auto learner = LearnerForTest(ctx, dmat, kIters, kForest);
+
+  learner->SetParam("device", ctx->DeviceName());

  bool bound = false;
  std::unique_ptr<Learner> sliced{learner->Slice(0, 3, 1, &bound)};
@@ -488,10 +482,10 @@ void TestIterationRangeColumnSplit(std::string name) {
                              leaf_ranged, leaf_sliced);
 }

-void TestSparsePrediction(float sparsity, std::string predictor) {
+void TestSparsePrediction(Context const *ctx, float sparsity) {
  size_t constexpr kRows = 512, kCols = 128, kIters = 4;
  auto Xy = RandomDataGenerator(kRows, kCols, sparsity).GenerateDMatrix(true);
-  auto learner = LearnerForTest(Xy, kIters);
+  auto learner = LearnerForTest(ctx, Xy, kIters);

  HostDeviceVector<float> sparse_predt;

@@ -501,11 +495,14 @@ void TestSparsePrediction(float sparsity, std::string predictor) {
  learner.reset(Learner::Create({Xy}));
  learner->LoadModel(model);

-  learner->SetParam("predictor", predictor);
+  if (ctx->IsCUDA()) {
+    learner->SetParam("tree_method", "gpu_hist");
+    learner->SetParam("gpu_id", std::to_string(ctx->gpu_id));
+  }
  learner->Predict(Xy, false, &sparse_predt, 0, 0);

  HostDeviceVector<float> with_nan(kRows * kCols, std::numeric_limits<float>::quiet_NaN());
-  auto& h_with_nan = with_nan.HostVector();
+  auto &h_with_nan = with_nan.HostVector();
  for (auto const &page : Xy->GetBatches<SparsePage>()) {
    auto batch = page.GetView();
    for (size_t i = 0; i < batch.Size(); ++i) {
@@ -516,7 +513,8 @@ void TestSparsePrediction(float sparsity, std::string predictor) {
    }
  }

-  learner->SetParam("predictor", "cpu_predictor");
+  learner->SetParam("tree_method", "hist");
+  learner->SetParam("gpu_id", "-1");
  // Xcode_12.4 doesn't compile with `std::make_shared`.
  auto dense = std::shared_ptr<DMatrix>(new data::DMatrixProxy{});
  auto array_interface = GetArrayInterface(&with_nan, kRows, kCols);
@@ -527,8 +525,8 @@ void TestSparsePrediction(float sparsity, std::string predictor) {
  learner->InplacePredict(dense, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
                          &p_dense_predt, 0, 0);

-  auto const& dense_predt = *p_dense_predt;
-  if (predictor == "cpu_predictor") {
+  auto const &dense_predt = *p_dense_predt;
+  if (ctx->IsCPU()) {
    ASSERT_EQ(dense_predt.HostVector(), sparse_predt.HostVector());
  } else {
    auto const &h_dense = dense_predt.HostVector();
@@ -556,10 +554,10 @@ void VerifySparsePredictionColumnSplit(DMatrix *dmat, Learner *learner,
 }
 }  // anonymous namespace

-void TestSparsePredictionColumnSplit(float sparsity, std::string predictor) {
+void TestSparsePredictionColumnSplit(Context const* ctx, float sparsity) {
  size_t constexpr kRows = 512, kCols = 128, kIters = 4;
  auto Xy = RandomDataGenerator(kRows, kCols, sparsity).GenerateDMatrix(true);
-  auto learner = LearnerForTest(Xy, kIters);
+  auto learner = LearnerForTest(ctx, Xy, kIters);

  HostDeviceVector<float> sparse_predt;

@@ -569,7 +567,7 @@ void TestSparsePredictionColumnSplit(float sparsity, std::string predictor) {
  learner.reset(Learner::Create({Xy}));
  learner->LoadModel(model);

-  learner->SetParam("predictor", predictor);
+  learner->SetParam("device", ctx->DeviceName());
  learner->Predict(Xy, false, &sparse_predt, 0, 0);

  auto constexpr kWorldSize = 2;
--- a/tests/cpp/predictor/test_predictor.h
+++ b/tests/cpp/predictor/test_predictor.h
@@ -31,8 +31,17 @@ inline gbm::GBTreeModel CreateTestModel(LearnerModelParam const* param, Context
  return model;
 }

+inline auto CreatePredictorForTest(Context const* ctx) {
+  if (ctx->IsCPU()) {
+    return Predictor::Create("cpu_predictor", ctx);
+  } else {
+    return Predictor::Create("gpu_predictor", ctx);
+  }
+}
+
+// FIXME: add a CPU test; the predictor below is always created with a CUDA context.
 template <typename Page>
-void TestPredictionFromGradientIndex(std::string name, size_t rows, size_t cols,
+void TestPredictionFromGradientIndex(Context const* ctx, size_t rows, size_t cols,
                                     std::shared_ptr<DMatrix> p_hist) {
  constexpr size_t kClasses { 3 };

@@ -40,12 +49,10 @@ void TestPredictionFromGradientIndex(std::string name, size_t rows, size_t cols,
  auto cuda_ctx = MakeCUDACtx(0);

  std::unique_ptr<Predictor> predictor =
-      std::unique_ptr<Predictor>(Predictor::Create(name, &cuda_ctx));
+      std::unique_ptr<Predictor>(CreatePredictorForTest(&cuda_ctx));
  predictor->Configure({});

-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{});
-  gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx, kClasses);
+  gbm::GBTreeModel model = CreateTestModel(&mparam, ctx, kClasses);

  {
    auto p_precise = RandomDataGenerator(rows, cols, 0).GenerateDMatrix();
@@ -77,32 +84,33 @@ void TestPredictionFromGradientIndex(std::string name, size_t rows, size_t cols,
 }

 // p_full and p_hist should come from the same data set.
-void TestTrainingPrediction(size_t rows, size_t bins, std::string tree_method,
-                            std::shared_ptr<DMatrix> p_full,
-                            std::shared_ptr<DMatrix> p_hist);
+void TestTrainingPrediction(Context const* ctx, size_t rows, size_t bins,
+                            std::shared_ptr<DMatrix> p_full, std::shared_ptr<DMatrix> p_hist);

-void TestInplacePrediction(std::shared_ptr<DMatrix> x, std::string predictor, bst_row_t rows,
-                           bst_feature_t cols, int32_t device = -1);
+void TestInplacePrediction(Context const* ctx, std::shared_ptr<DMatrix> x, bst_row_t rows,
+                           bst_feature_t cols);

-void TestPredictionWithLesserFeatures(std::string preditor_name);
+void TestPredictionWithLesserFeatures(Context const* ctx);

-void TestPredictionWithLesserFeaturesColumnSplit(std::string preditor_name);
+void TestPredictionDeviceAccess();

-void TestCategoricalPrediction(std::string name, bool is_column_split = false);
+void TestCategoricalPrediction(Context const* ctx, bool is_column_split);

-void TestCategoricalPredictionColumnSplit(std::string name);
+void TestCategoricalPredictionColumnSplit(Context const* ctx);

-void TestCategoricalPredictLeaf(StringView name, bool is_column_split = false);
+void TestPredictionWithLesserFeaturesColumnSplit(Context const* ctx);

-void TestCategoricalPredictLeafColumnSplit(StringView name);
+void TestCategoricalPredictLeaf(Context const* ctx, bool is_column_split);

-void TestIterationRange(std::string name);
+void TestCategoricalPredictLeafColumnSplit(Context const* ctx);

-void TestIterationRangeColumnSplit(std::string name);
+void TestIterationRange(Context const* ctx);

-void TestSparsePrediction(float sparsity, std::string predictor);
+void TestIterationRangeColumnSplit(Context const* ctx);

-void TestSparsePredictionColumnSplit(float sparsity, std::string predictor);
+void TestSparsePrediction(Context const* ctx, float sparsity);
+
+void TestSparsePredictionColumnSplit(Context const* ctx, float sparsity);

 void TestVectorLeafPrediction(Context const* ctx);
 }  // namespace xgboost
--- a/tests/cpp/test_context.cc
+++ b/tests/cpp/test_context.cc
@@ -0,0 +1,31 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+#include <xgboost/base.h>
+#include <xgboost/context.h>
+
+namespace xgboost {
+TEST(Context, CPU) {
+  Context ctx;
+  ASSERT_EQ(ctx.Device(), DeviceOrd::CPU());
+  ASSERT_EQ(ctx.Ordinal(), Context::kCpuId);
+
+  std::int32_t flag{0};
+  ctx.DispatchDevice([&] { flag = -1; }, [&] { flag = 1; });
+  ASSERT_EQ(flag, -1);
+
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "oops"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "-1"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "CPU"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "CUDA"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "CPU:0"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "gpu:+0"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "gpu:0-"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "gpu:"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":gpu"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":0"}}), dmlc::Error);
+  ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ""}}), dmlc::Error);
+}
+}  // namespace xgboost
--- a/tests/cpp/test_context.cu
+++ b/tests/cpp/test_context.cu
@@ -0,0 +1,99 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+#include <xgboost/base.h>  // for Args
+#include <xgboost/context.h>
+#include <xgboost/json.h>  // for FromJson, ToJson
+
+#include <string>  // for string, to_string
+
+#include "../../src/common/common.h"  // for AllVisibleGPUs
+
+namespace xgboost {
+namespace {
+void TestCUDA(Context const& ctx, bst_d_ordinal_t ord) {
+  ASSERT_EQ(ctx.gpu_id, ord);
+  ASSERT_EQ(ctx.Device().ordinal, ord);
+  ASSERT_EQ(ctx.DeviceName(), "cuda:" + std::to_string(ord));
+  ASSERT_EQ(ctx.Ordinal(), ord);
+  ASSERT_TRUE(ctx.IsCUDA());
+  ASSERT_FALSE(ctx.IsCPU());
+  ASSERT_EQ(ctx.Device(), DeviceOrd::CUDA(ord));
+
+  Json jctx{ToJson(ctx)};
+  Context new_ctx;
+  FromJson(jctx, &new_ctx);
+  ASSERT_EQ(new_ctx.Device(), ctx.Device());
+  ASSERT_EQ(new_ctx.gpu_id, ctx.gpu_id);
+}
+}  // namespace
+
+TEST(Context, DeviceOrdinal) {
+  Context ctx;
+  auto n_vis = common::AllVisibleGPUs();
+  auto ord = n_vis - 1;
+
+  std::string device = "cuda:" + std::to_string(ord);
+  ctx.UpdateAllowUnknown(Args{{"device", device}});
+  TestCUDA(ctx, ord);
+
+  device = "cuda:" + std::to_string(1001);
+  ctx.UpdateAllowUnknown(Args{{"device", device}});
+  ord = 1001 % n_vis;
+
+  TestCUDA(ctx, ord);
+
+  std::int32_t flag{0};
+  ctx.DispatchDevice([&] { flag = -1; }, [&] { flag = 1; });
+  ASSERT_EQ(flag, 1);
+
+  Context new_ctx = ctx;
+  TestCUDA(new_ctx, ctx.Ordinal());
+
+  auto cpu_ctx = ctx.MakeCPU();
+  ASSERT_TRUE(cpu_ctx.IsCPU());
+  ASSERT_EQ(cpu_ctx.Ordinal(), Context::kCpuId);
+  ASSERT_EQ(cpu_ctx.Device(), DeviceOrd::CPU());
+
+  auto cuda_ctx = cpu_ctx.MakeCUDA(ctx.Ordinal());
+  TestCUDA(cuda_ctx, ctx.Ordinal());
+
+  cuda_ctx.UpdateAllowUnknown(Args{{"fail_on_invalid_gpu_id", "true"}});
+  ASSERT_THROW({ cuda_ctx.UpdateAllowUnknown(Args{{"device", "cuda:9999"}}); }, dmlc::Error);
+  cuda_ctx.UpdateAllowUnknown(Args{{"device", "cuda:00"}});
+  ASSERT_EQ(cuda_ctx.Ordinal(), 0);
+
+  ctx.UpdateAllowUnknown(Args{{"device", "cpu"}});
+  // Test alias
+  ctx.UpdateAllowUnknown(Args{{"device", "gpu:0"}});
+  TestCUDA(ctx, 0);
+  ctx.UpdateAllowUnknown(Args{{"device", "gpu"}});
+  TestCUDA(ctx, 0);
+
+  // Test that the thread-local memory in dmlc does not link different instances together.
+  cpu_ctx.UpdateAllowUnknown(Args{{"device", "cpu"}});
+  TestCUDA(ctx, 0);
+  ctx.UpdateAllowUnknown(Args{});
+  TestCUDA(ctx, 0);
+}
+
+TEST(Context, GPUId) {
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+  TestCUDA(ctx, 0);
+
+  auto n_vis = common::AllVisibleGPUs();
+  auto ord = n_vis - 1;
+  ctx.UpdateAllowUnknown(Args{{"gpu_id", std::to_string(ord)}});
+  TestCUDA(ctx, ord);
+
+  auto device = "cuda:" + std::to_string(1001);
+  ctx.UpdateAllowUnknown(Args{{"device", device}});
+  ord = 1001 % n_vis;
+  TestCUDA(ctx, ord);
+
+  ctx.UpdateAllowUnknown(Args{{"gpu_id", "-1"}});
+  ASSERT_EQ(ctx.Device(), DeviceOrd::CPU());
+}
+}  // namespace xgboost
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -27,7 +27,6 @@
 #include "../../src/common/io.h"                    // for LoadSequentialFile
 #include "../../src/common/linalg_op.h"             // for ElementWiseTransformHost, begin, end
 #include "../../src/common/random.h"                // for GlobalRandom
-#include "../../src/common/transform_iterator.h"    // for IndexTransformIter
 #include "dmlc/io.h"                                // for Stream
 #include "dmlc/omp.h"                               // for omp_get_max_threads
 #include "dmlc/registry.h"                          // for Registry
@@ -35,14 +34,13 @@
 #include "helpers.h"                                // for GetBaseScore, RandomDataGenerator
 #include "objective_helpers.h"                      // for MakeObjNamesForTest, ObjTestNameGenerator
 #include "xgboost/base.h"                           // for bst_float, Args, bst_feature_t, bst_int
-#include "xgboost/context.h"                        // for Context
+#include "xgboost/context.h"                        // for Context, DeviceOrd
 #include "xgboost/data.h"                           // for DMatrix, MetaInfo, DataType
 #include "xgboost/host_device_vector.h"             // for HostDeviceVector
 #include "xgboost/json.h"                           // for Json, Object, get, String, IsA, opera...
 #include "xgboost/linalg.h"                         // for Tensor, TensorView
 #include "xgboost/logging.h"                        // for ConsoleLogger
 #include "xgboost/predictor.h"                      // for PredictionCacheEntry
-#include "xgboost/span.h"                           // for Span, operator!=, SpanIterator
 #include "xgboost/string_view.h"                    // for StringView

 namespace xgboost {
@@ -58,9 +56,9 @@ TEST(Learner, Basic) {
  auto minor = XGBOOST_VER_MINOR;
  auto patch = XGBOOST_VER_PATCH;

-  static_assert(std::is_integral<decltype(major)>::value, "Wrong major version type");
-  static_assert(std::is_integral<decltype(minor)>::value, "Wrong minor version type");
-  static_assert(std::is_integral<decltype(patch)>::value, "Wrong patch version type");
+  static_assert(std::is_integral_v<decltype(major)>, "Wrong major version type");
+  static_assert(std::is_integral_v<decltype(minor)>, "Wrong minor version type");
+  static_assert(std::is_integral_v<decltype(patch)>, "Wrong patch version type");
 }

 TEST(Learner, ParameterValidation) {
@@ -92,10 +90,9 @@ TEST(Learner, CheckGroup) {
  size_t constexpr kNumRows = 17;
  bst_feature_t constexpr kNumCols = 15;

-  std::shared_ptr<DMatrix> p_mat{
-      RandomDataGenerator{kNumRows, kNumCols, 0.0f}.GenerateDMatrix()};
+  std::shared_ptr<DMatrix> p_mat{RandomDataGenerator{kNumRows, kNumCols, 0.0f}.GenerateDMatrix()};
  std::vector<bst_float> weight(kNumGroups, 1);
-  std::vector<bst_int> group(kNumGroups);
+  std::vector<bst_group_t> group(kNumGroups);
  group[0] = 2;
  group[1] = 3;
  group[2] = 7;
@@ -187,7 +184,7 @@ TEST(Learner, JsonModelIO) {
    fout.close();

    auto loaded_str = common::LoadSequentialFile(tmpdir.path + "/model.json");
-    Json loaded = Json::Load(StringView{loaded_str.c_str(), loaded_str.size()});
+    Json loaded = Json::Load(StringView{loaded_str.data(), loaded_str.size()});

    learner->LoadModel(loaded);
    learner->Configure();
@@ -218,6 +215,34 @@ TEST(Learner, JsonModelIO) {
  }
 }

+TEST(Learner, ConfigIO) {
+  bst_row_t n_samples = 128;
+  bst_feature_t n_features = 12;
+  std::shared_ptr<DMatrix> p_fmat{
+      RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true, false, 2)};
+
+  auto serialised_model_tmp = std::string{};
+  std::string eval_res_0;
+  std::string eval_res_1;
+  {
+    std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
+    learner->SetParams(Args{{"eval_metric", "ndcg"}, {"eval_metric", "map"}});
+    learner->Configure();
+    learner->UpdateOneIter(0, p_fmat);
+    eval_res_0 = learner->EvalOneIter(0, {p_fmat}, {"Train"});
+    common::MemoryBufferStream fo(&serialised_model_tmp);
+    learner->Save(&fo);
+  }
+
+  {
+    common::MemoryBufferStream fi(&serialised_model_tmp);
+    std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
+    learner->Load(&fi);
+    eval_res_1 = learner->EvalOneIter(0, {p_fmat}, {"Train"});
+  }
+  ASSERT_EQ(eval_res_0, eval_res_1);
+}
+
 // Crashes the test runner if there are race condiditions.
 //
 // Build with additional cmake flags to enable thread sanitizer
@@ -312,45 +337,36 @@ TEST(Learner, GPUConfiguration) {
    learner->SetParams({Arg{"booster", "gblinear"},
                        Arg{"updater", "gpu_coord_descent"}});
    learner->UpdateOneIter(0, p_dmat);
-    ASSERT_EQ(learner->Ctx()->gpu_id, 0);
+    ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0));
  }
  {
-    std::unique_ptr<Learner> learner {Learner::Create(mat)};
+    std::unique_ptr<Learner> learner{Learner::Create(mat)};
    learner->SetParams({Arg{"tree_method", "gpu_hist"}});
+    learner->Configure();
+    ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0));
    learner->UpdateOneIter(0, p_dmat);
-    ASSERT_EQ(learner->Ctx()->gpu_id, 0);
+    ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0));
  }
  {
    std::unique_ptr<Learner> learner {Learner::Create(mat)};
    learner->SetParams({Arg{"tree_method", "gpu_hist"},
                        Arg{"gpu_id", "-1"}});
    learner->UpdateOneIter(0, p_dmat);
-    ASSERT_EQ(learner->Ctx()->gpu_id, 0);
+    ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0));
  }
  {
    // with CPU algorithm
    std::unique_ptr<Learner> learner {Learner::Create(mat)};
    learner->SetParams({Arg{"tree_method", "hist"}});
    learner->UpdateOneIter(0, p_dmat);
-    ASSERT_EQ(learner->Ctx()->gpu_id, -1);
+    ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CPU());
  }
  {
    // with CPU algorithm, but `gpu_id` takes priority
    std::unique_ptr<Learner> learner {Learner::Create(mat)};
-    learner->SetParams({Arg{"tree_method", "hist"},
-                        Arg{"gpu_id", "0"}});
+    learner->SetParams({Arg{"tree_method", "hist"}, Arg{"gpu_id", "0"}});
    learner->UpdateOneIter(0, p_dmat);
-    ASSERT_EQ(learner->Ctx()->gpu_id, 0);
-  }
-  {
-    // With CPU algorithm but GPU Predictor, this is to simulate when
-    // XGBoost is only used for prediction, so tree method is not
-    // specified.
-    std::unique_ptr<Learner> learner {Learner::Create(mat)};
-    learner->SetParams({Arg{"tree_method", "hist"},
-                        Arg{"predictor", "gpu_predictor"}});
-    learner->UpdateOneIter(0, p_dmat);
-    ASSERT_EQ(learner->Ctx()->gpu_id, 0);
+    ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0));
  }
 }
 #endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
@@ -379,6 +395,8 @@ TEST(Learner, Seed) {
 TEST(Learner, ConstantSeed) {
  auto m = RandomDataGenerator{10, 10, 0}.GenerateDMatrix(true);
  std::unique_ptr<Learner> learner{Learner::Create({m})};
+  // Use exact as it doesn't initialize column sampler at construction, which alters the rng.
+  learner->SetParam("tree_method", "exact");
  learner->Configure();  // seed the global random

  std::uniform_real_distribution<float> dist;
--- a/tests/cpp/test_multi_target.cc
+++ b/tests/cpp/test_multi_target.cc
@@ -68,10 +68,12 @@ class TestL1MultiTarget : public ::testing::Test {
    }
  }

-  void RunTest(std::string const& tree_method, bool weight) {
+  void RunTest(Context const* ctx, std::string const& tree_method, bool weight) {
    auto p_fmat = weight ? Xyw_ : Xy_;
    std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
-    learner->SetParams(Args{{"tree_method", tree_method}, {"objective", "reg:absoluteerror"}});
+    learner->SetParams(Args{{"tree_method", tree_method},
+                            {"objective", "reg:absoluteerror"},
+                            {"device", ctx->DeviceName()}});
    learner->Configure();
    for (auto i = 0; i < 4; ++i) {
      learner->UpdateOneIter(i, p_fmat);
@@ -87,7 +89,9 @@ class TestL1MultiTarget : public ::testing::Test {
    for (bst_target_t t{0}; t < p_fmat->Info().labels.Shape(1); ++t) {
      auto t_Xy = weight ? single_w_[t] : single_[t];
      std::unique_ptr<Learner> sl{Learner::Create({t_Xy})};
-      sl->SetParams(Args{{"tree_method", tree_method}, {"objective", "reg:absoluteerror"}});
+      sl->SetParams(Args{{"tree_method", tree_method},
+                         {"objective", "reg:absoluteerror"},
+                         {"device", ctx->DeviceName()}});
      sl->Configure();
      sl->UpdateOneIter(0, t_Xy);
      Json s_config{Object{}};
@@ -104,21 +108,33 @@ class TestL1MultiTarget : public ::testing::Test {
    ASSERT_FLOAT_EQ(mean, base_score);
  }

-  void RunTest(std::string const& tree_method) {
-    this->RunTest(tree_method, false);
-    this->RunTest(tree_method, true);
+  void RunTest(Context const* ctx, std::string const& tree_method) {
+    this->RunTest(ctx, tree_method, false);
+    this->RunTest(ctx, tree_method, true);
  }
 };

-TEST_F(TestL1MultiTarget, Hist) { this->RunTest("hist"); }
+TEST_F(TestL1MultiTarget, Hist) {
+  Context ctx;
+  this->RunTest(&ctx, "hist");
+}

-TEST_F(TestL1MultiTarget, Exact) { this->RunTest("exact"); }
+TEST_F(TestL1MultiTarget, Exact) {
+  Context ctx;
+  this->RunTest(&ctx, "exact");
+}

-TEST_F(TestL1MultiTarget, Approx) { this->RunTest("approx"); }
+TEST_F(TestL1MultiTarget, Approx) {
+  Context ctx;
+  this->RunTest(&ctx, "approx");
+}

 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
-TEST_F(TestL1MultiTarget, GpuHist) { this->RunTest("gpu_hist"); }
-#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
+TEST_F(TestL1MultiTarget, GpuHist) {
+  auto ctx = MakeCUDACtx(0);
+  this->RunTest(&ctx, "hist");
+}
+#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)

 TEST(MultiStrategy, Configure) {
  auto p_fmat = RandomDataGenerator{12ul, 3ul, 0.0}.GenerateDMatrix();
--- a/tests/cpp/test_serialization.cc
+++ b/tests/cpp/test_serialization.cc
@@ -698,10 +698,6 @@ TEST_F(MultiClassesSerializationTest, GpuHist) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
-                            // Somehow rebuilding the cache can generate slightly
-                            // different result (1e-7) with CPU predictor for some
-                            // entries.
-                            {"predictor", "gpu_predictor"},
                            // Mitigate the difference caused by hardware fused multiply
                            // add to tree weight during update prediction cache.
                            {"learning_rate", "1.0"},
--- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
+++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
@@ -2,6 +2,7 @@
 * Copyright 2020-2022 by XGBoost contributors
 */
 #include <gtest/gtest.h>
+#include <thrust/host_vector.h>

 #if defined(XGBOOST_USE_CUDA)
 #include "../../../../src/tree/gpu_hist/evaluate_splits.cuh"
@@ -11,25 +12,23 @@
 #include "../../helpers.h"
 #include "../../histogram_helpers.h"
 #include "../test_evaluate_splits.h"  // TestPartitionBasedSplit
-#include <thrust/host_vector.h>

 namespace xgboost {
 namespace tree {
+
 namespace {
 auto ZeroParam() {
-  auto args = Args{{"min_child_weight", "0"},
-                   {"lambda", "0"}};
+  auto args = Args{{"min_child_weight", "0"}, {"lambda", "0"}};
  TrainParam tparam;
  tparam.UpdateAllowUnknown(args);
  return tparam;
 }
-
 }  // anonymous namespace

 inline GradientQuantiser DummyRoundingFactor() {
  thrust::device_vector<GradientPair> gpair(1);
  gpair[0] = {1000.f, 1000.f};  // Tests should not exceed sum of 1000
-  return GradientQuantiser(dh::ToSpan(gpair));
+  return {dh::ToSpan(gpair), MetaInfo()};
 }

 thrust::device_vector<GradientPairInt64> ConvertToInteger(std::vector<GradientPairPrecise> x) {
@@ -41,7 +40,6 @@ thrust::device_vector<GradientPairInt64> ConvertToInteger(std::vector<GradientPa
  return y;
 }

-
 TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) {
  thrust::device_vector<bst_feature_t> feature_set = std::vector<bst_feature_t>{0};
  GPUTrainingParam param{param_};
@@ -65,12 +63,13 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) {

  GPUHistEvaluator evaluator{param_, static_cast<bst_feature_t>(feature_set.size()), 0};

-  evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, 0);
+  evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false, 0);
  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;

  ASSERT_EQ(result.thresh, 1);
  this->CheckResult(result.loss_chg, result.findex, result.fvalue, result.is_cat,
-                    result.dir == kLeftDir, quantiser.ToFloatingPoint(result.left_sum), quantiser.ToFloatingPoint(result.right_sum));
+                    result.dir == kLeftDir, quantiser.ToFloatingPoint(result.left_sum),
+                    quantiser.ToFloatingPoint(result.right_sum));
 }

 TEST(GpuHist, PartitionBasic) {
@@ -106,7 +105,7 @@ TEST(GpuHist, PartitionBasic) {
  };

  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), 0};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0);
+  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0);

  {
    // -1.0s go right
@@ -147,7 +146,8 @@ TEST(GpuHist, PartitionBasic) {
    EXPECT_EQ(result.left_sum + result.right_sum, parent_sum);
  }
  // With 3.0/3.0 missing values
-  // Forward, first 2 categories are selected, while the last one go to left along with missing value
+  // Forward, first 2 categories are selected, while the last one goes to the left along with
+  // missing values
  {
    auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 6.0});
    auto feature_histogram = ConvertToInteger({{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}});
@@ -217,11 +217,12 @@ TEST(GpuHist, PartitionTwoFeatures) {
                                          false};

  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), 0};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0);
+  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0);

  {
    auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0});
-    auto feature_histogram = ConvertToInteger({        {-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}});
+    auto feature_histogram = ConvertToInteger(
+        {{-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}});
    EvaluateSplitInputs input{0, 0, parent_sum, dh::ToSpan(feature_set),
                              dh::ToSpan(feature_histogram)};
    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
@@ -233,7 +234,8 @@ TEST(GpuHist, PartitionTwoFeatures) {

  {
    auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0});
-    auto feature_histogram = ConvertToInteger({        {-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}});
+    auto feature_histogram = ConvertToInteger(
+        {{-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}});
    EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set),
                              dh::ToSpan(feature_histogram)};
    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
@@ -275,12 +277,12 @@ TEST(GpuHist, PartitionTwoNodes) {
                                          false};

  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), 0};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0);
+  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0);

  {
    auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0});
-    auto feature_histogram_a = ConvertToInteger({{-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0},
-                                         {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}});
+    auto feature_histogram_a = ConvertToInteger(
+        {{-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}});
    thrust::device_vector<EvaluateSplitInputs> inputs(2);
    inputs[0] = EvaluateSplitInputs{0, 0, parent_sum, dh::ToSpan(feature_set),
                                    dh::ToSpan(feature_histogram_a)};
@@ -289,8 +291,6 @@ TEST(GpuHist, PartitionTwoNodes) {
                                    dh::ToSpan(feature_histogram_b)};
    thrust::device_vector<GPUExpandEntry> results(2);
    evaluator.EvaluateSplits({0, 1}, 1, dh::ToSpan(inputs), shared_inputs, dh::ToSpan(results));
-    GPUExpandEntry result_a = results[0];
-    GPUExpandEntry result_b = results[1];
    EXPECT_EQ(std::bitset<32>(evaluator.GetHostNodeCats(0)[0]),
              std::bitset<32>("10000000000000000000000000000000"));
    EXPECT_EQ(std::bitset<32>(evaluator.GetHostNodeCats(1)[0]),
@@ -310,8 +310,7 @@ void TestEvaluateSingleSplit(bool is_categorical) {
  // Setup gradients so that second feature gets higher gain
  auto feature_histogram = ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}});

-  dh::device_vector<FeatureType> feature_types(feature_set.size(),
-                                               FeatureType::kCategorical);
+  dh::device_vector<FeatureType> feature_types(feature_set.size(), FeatureType::kCategorical);
  common::Span<FeatureType> d_feature_types;
  if (is_categorical) {
    auto max_cat = *std::max_element(cuts.cut_values_.HostVector().begin(),
@@ -330,9 +329,8 @@ void TestEvaluateSingleSplit(bool is_categorical) {
                                          cuts.min_vals_.ConstDeviceSpan(),
                                          false};

-  GPUHistEvaluator evaluator{
-      tparam, static_cast<bst_feature_t>(feature_set.size()), 0};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0);
+  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), 0};
+  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0);
  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;

  EXPECT_EQ(result.findex, 1);
@@ -344,31 +342,23 @@ void TestEvaluateSingleSplit(bool is_categorical) {
  EXPECT_EQ(result.left_sum + result.right_sum, parent_sum);
 }

-TEST(GpuHist, EvaluateSingleSplit) {
-  TestEvaluateSingleSplit(false);
-}
+TEST(GpuHist, EvaluateSingleSplit) { TestEvaluateSingleSplit(false); }

-TEST(GpuHist, EvaluateSingleCategoricalSplit) {
-  TestEvaluateSingleSplit(true);
-}
+TEST(GpuHist, EvaluateSingleCategoricalSplit) { TestEvaluateSingleSplit(true); }

 TEST(GpuHist, EvaluateSingleSplitMissing) {
  auto quantiser = DummyRoundingFactor();
-    auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{1.0, 1.5});
+  auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{1.0, 1.5});
  TrainParam tparam = ZeroParam();
  GPUTrainingParam param{tparam};

-  thrust::device_vector<bst_feature_t> feature_set =
-      std::vector<bst_feature_t>{0};
-  thrust::device_vector<uint32_t> feature_segments =
-      std::vector<bst_row_t>{0, 2};
+  thrust::device_vector<bst_feature_t> feature_set = std::vector<bst_feature_t>{0};
+  thrust::device_vector<uint32_t> feature_segments = std::vector<bst_row_t>{0, 2};
  thrust::device_vector<float> feature_values = std::vector<float>{1.0, 2.0};
  thrust::device_vector<float> feature_min_values = std::vector<float>{0.0};
  auto feature_histogram = ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}});
-  EvaluateSplitInputs input{1,0,
-                                          parent_sum,
-                                          dh::ToSpan(feature_set),
-                                          dh::ToSpan(feature_histogram)};
+  EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set),
+                            dh::ToSpan(feature_histogram)};
  EvaluateSplitSharedInputs shared_inputs{param,
                                          quantiser,
                                          {},
@@ -383,7 +373,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) {
  EXPECT_EQ(result.findex, 0);
  EXPECT_EQ(result.fvalue, 1.0);
  EXPECT_EQ(result.dir, kRightDir);
-  EXPECT_EQ(result.left_sum,quantiser.ToFixedPoint(GradientPairPrecise(-0.5, 0.5)));
+  EXPECT_EQ(result.left_sum, quantiser.ToFixedPoint(GradientPairPrecise(-0.5, 0.5)));
  EXPECT_EQ(result.right_sum, quantiser.ToFixedPoint(GradientPairPrecise(1.5, 1.0)));
 }

@@ -404,24 +394,18 @@ TEST(GpuHist, EvaluateSingleSplitEmpty) {
 // Feature 0 has a better split, but the algorithm must select feature 1
 TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
  auto quantiser = DummyRoundingFactor();
-    auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
+  auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
  TrainParam tparam = ZeroParam();
  tparam.UpdateAllowUnknown(Args{});
  GPUTrainingParam param{tparam};

-  thrust::device_vector<bst_feature_t> feature_set =
-      std::vector<bst_feature_t>{1};
-  thrust::device_vector<uint32_t> feature_segments =
-      std::vector<bst_row_t>{0, 2, 4};
-  thrust::device_vector<float> feature_values =
-      std::vector<float>{1.0, 2.0, 11.0, 12.0};
-  thrust::device_vector<float> feature_min_values =
-      std::vector<float>{0.0, 10.0};
-  auto feature_histogram = ConvertToInteger({          {-10.0, 0.5}, {10.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
-  EvaluateSplitInputs input{1,0,
-                                          parent_sum,
-                                          dh::ToSpan(feature_set),
-                                          dh::ToSpan(feature_histogram)};
+  thrust::device_vector<bst_feature_t> feature_set = std::vector<bst_feature_t>{1};
+  thrust::device_vector<uint32_t> feature_segments = std::vector<bst_row_t>{0, 2, 4};
+  thrust::device_vector<float> feature_values = std::vector<float>{1.0, 2.0, 11.0, 12.0};
+  thrust::device_vector<float> feature_min_values = std::vector<float>{0.0, 10.0};
+  auto feature_histogram = ConvertToInteger({{-10.0, 0.5}, {10.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
+  EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set),
+                            dh::ToSpan(feature_histogram)};
  EvaluateSplitSharedInputs shared_inputs{param,
                                          quantiser,
                                          {},
@@ -435,31 +419,25 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {

  EXPECT_EQ(result.findex, 1);
  EXPECT_EQ(result.fvalue, 11.0);
-  EXPECT_EQ(result.left_sum,quantiser.ToFixedPoint(GradientPairPrecise(-0.5, 0.5)));
+  EXPECT_EQ(result.left_sum, quantiser.ToFixedPoint(GradientPairPrecise(-0.5, 0.5)));
  EXPECT_EQ(result.right_sum, quantiser.ToFixedPoint(GradientPairPrecise(0.5, 0.5)));
 }

 // Features 0 and 1 have identical gain, the algorithm must select 0
 TEST(GpuHist, EvaluateSingleSplitBreakTies) {
  auto quantiser = DummyRoundingFactor();
-    auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
+  auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
  TrainParam tparam = ZeroParam();
  tparam.UpdateAllowUnknown(Args{});
  GPUTrainingParam param{tparam};

-  thrust::device_vector<bst_feature_t> feature_set =
-      std::vector<bst_feature_t>{0, 1};
-  thrust::device_vector<uint32_t> feature_segments =
-      std::vector<bst_row_t>{0, 2, 4};
-  thrust::device_vector<float> feature_values =
-      std::vector<float>{1.0, 2.0, 11.0, 12.0};
-  thrust::device_vector<float> feature_min_values =
-      std::vector<float>{0.0, 10.0};
-  auto feature_histogram = ConvertToInteger({          {-0.5, 0.5}, {0.5, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
-  EvaluateSplitInputs input{1,0,
-                                          parent_sum,
-                                          dh::ToSpan(feature_set),
-                                          dh::ToSpan(feature_histogram)};
+  thrust::device_vector<bst_feature_t> feature_set = std::vector<bst_feature_t>{0, 1};
+  thrust::device_vector<uint32_t> feature_segments = std::vector<bst_row_t>{0, 2, 4};
+  thrust::device_vector<float> feature_values = std::vector<float>{1.0, 2.0, 11.0, 12.0};
+  thrust::device_vector<float> feature_min_values = std::vector<float>{0.0, 10.0};
+  auto feature_histogram = ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
+  EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set),
+                            dh::ToSpan(feature_histogram)};
  EvaluateSplitSharedInputs shared_inputs{param,
                                          quantiser,
                                          {},
@@ -469,7 +447,7 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) {
                                          false};

  GPUHistEvaluator evaluator(tparam, feature_min_values.size(), 0);
-  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input,shared_inputs).split;
+  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;

  EXPECT_EQ(result.findex, 0);
  EXPECT_EQ(result.fvalue, 1.0);
@@ -483,41 +461,31 @@ TEST(GpuHist, EvaluateSplits) {
  tparam.UpdateAllowUnknown(Args{});
  GPUTrainingParam param{tparam};

-  thrust::device_vector<bst_feature_t> feature_set =
-      std::vector<bst_feature_t>{0, 1};
-  thrust::device_vector<uint32_t> feature_segments =
-      std::vector<bst_row_t>{0, 2, 4};
-  thrust::device_vector<float> feature_values =
-      std::vector<float>{1.0, 2.0, 11.0, 12.0};
-  thrust::device_vector<float> feature_min_values =
-      std::vector<float>{0.0, 0.0};
-  auto feature_histogram_left = ConvertToInteger({          {-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}});
-  auto feature_histogram_right = ConvertToInteger({          {-1.0, 0.5}, {1.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
-  EvaluateSplitInputs input_left{
-      1,0,
-      parent_sum,
-      dh::ToSpan(feature_set),
-      dh::ToSpan(feature_histogram_left)};
-  EvaluateSplitInputs input_right{
-      2,0,
-      parent_sum,
-      dh::ToSpan(feature_set),
-      dh::ToSpan(feature_histogram_right)};
-  EvaluateSplitSharedInputs shared_inputs{
-      param,
-      quantiser,
-      {},
-      dh::ToSpan(feature_segments),
-      dh::ToSpan(feature_values),
-      dh::ToSpan(feature_min_values),
-      false
-  };
+  thrust::device_vector<bst_feature_t> feature_set = std::vector<bst_feature_t>{0, 1};
+  thrust::device_vector<uint32_t> feature_segments = std::vector<bst_row_t>{0, 2, 4};
+  thrust::device_vector<float> feature_values = std::vector<float>{1.0, 2.0, 11.0, 12.0};
+  thrust::device_vector<float> feature_min_values = std::vector<float>{0.0, 0.0};
+  auto feature_histogram_left =
+      ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}});
+  auto feature_histogram_right =
+      ConvertToInteger({{-1.0, 0.5}, {1.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
+  EvaluateSplitInputs input_left{1, 0, parent_sum, dh::ToSpan(feature_set),
+                                 dh::ToSpan(feature_histogram_left)};
+  EvaluateSplitInputs input_right{2, 0, parent_sum, dh::ToSpan(feature_set),
+                                  dh::ToSpan(feature_histogram_right)};
+  EvaluateSplitSharedInputs shared_inputs{param,
+                                          quantiser,
+                                          {},
+                                          dh::ToSpan(feature_segments),
+                                          dh::ToSpan(feature_values),
+                                          dh::ToSpan(feature_min_values),
+                                          false};

-  GPUHistEvaluator evaluator{
-      tparam, static_cast<bst_feature_t>(feature_min_values.size()), 0};
-  dh::device_vector<EvaluateSplitInputs> inputs = std::vector<EvaluateSplitInputs>{input_left,input_right};
-  evaluator.LaunchEvaluateSplits(input_left.feature_set.size(),dh::ToSpan(inputs),shared_inputs, evaluator.GetEvaluator(),
-                           dh::ToSpan(out_splits));
+  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_min_values.size()), 0};
+  dh::device_vector<EvaluateSplitInputs> inputs =
+      std::vector<EvaluateSplitInputs>{input_left, input_right};
+  evaluator.LaunchEvaluateSplits(input_left.feature_set.size(), dh::ToSpan(inputs), shared_inputs,
+                                 evaluator.GetEvaluator(), dh::ToSpan(out_splits));

  DeviceSplitCandidate result_left = out_splits[0];
  EXPECT_EQ(result_left.findex, 1);
@@ -536,18 +504,19 @@ TEST_F(TestPartitionBasedSplit, GpuHist) {
  cuts_.cut_values_.SetDevice(0);
  cuts_.min_vals_.SetDevice(0);

-  evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, 0);
+  evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, false, 0);

  // Convert the sample histogram to fixed point
  auto quantiser = DummyRoundingFactor();
  thrust::host_vector<GradientPairInt64> h_hist;
-  for(auto e: hist_[0]){
+  for (auto e : hist_[0]) {
    h_hist.push_back(quantiser.ToFixedPoint(e));
  }
  dh::device_vector<GradientPairInt64> d_hist = h_hist;
  dh::device_vector<bst_feature_t> feature_set{std::vector<bst_feature_t>{0}};

-  EvaluateSplitInputs input{0, 0, quantiser.ToFixedPoint(total_gpair_), dh::ToSpan(feature_set), dh::ToSpan(d_hist)};
+  EvaluateSplitInputs input{0, 0, quantiser.ToFixedPoint(total_gpair_), dh::ToSpan(feature_set),
+                            dh::ToSpan(d_hist)};
  EvaluateSplitSharedInputs shared_inputs{GPUTrainingParam{param_},
                                          quantiser,
                                          dh::ToSpan(ft),
@@ -558,5 +527,65 @@ TEST_F(TestPartitionBasedSplit, GpuHist) {
  auto split = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
  ASSERT_NEAR(split.loss_chg, best_score_, 1e-2);
 }
+
+class MGPUHistTest : public BaseMGPUTest {};
+
+namespace {
+void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
+  auto rank = collective::GetRank();
+  auto quantiser = DummyRoundingFactor();
+  auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
+  TrainParam tparam = ZeroParam();
+  GPUTrainingParam param{tparam};
+
+  common::HistogramCuts cuts{rank == 0
+                                 ? MakeCutsForTest({1.0, 2.0}, {0, 2, 2}, {0.0, 0.0}, GPUIDX)
+                                 : MakeCutsForTest({11.0, 12.0}, {0, 0, 2}, {0.0, 0.0}, GPUIDX)};
+  thrust::device_vector<bst_feature_t> feature_set = std::vector<bst_feature_t>{0, 1};
+
+  // Setup gradients so that second feature gets higher gain
+  auto feature_histogram = rank == 0 ? ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}})
+                                     : ConvertToInteger({{-1.0, 0.5}, {1.0, 0.5}});
+
+  dh::device_vector<FeatureType> feature_types(feature_set.size(), FeatureType::kCategorical);
+  common::Span<FeatureType> d_feature_types;
+  if (is_categorical) {
+    auto max_cat = *std::max_element(cuts.cut_values_.HostVector().begin(),
+                                     cuts.cut_values_.HostVector().end());
+    cuts.SetCategorical(true, max_cat);
+    d_feature_types = dh::ToSpan(feature_types);
+  }
+
+  EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set),
+                            dh::ToSpan(feature_histogram)};
+  EvaluateSplitSharedInputs shared_inputs{param,
+                                          quantiser,
+                                          d_feature_types,
+                                          cuts.cut_ptrs_.ConstDeviceSpan(),
+                                          cuts.cut_values_.ConstDeviceSpan(),
+                                          cuts.min_vals_.ConstDeviceSpan(),
+                                          false};
+
+  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), GPUIDX};
+  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true, GPUIDX);
+  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+
+  EXPECT_EQ(result.findex, 1) << "rank: " << rank;
+  if (is_categorical) {
+    ASSERT_TRUE(std::isnan(result.fvalue));
+  } else {
+    EXPECT_EQ(result.fvalue, 11.0) << "rank: " << rank;
+  }
+  EXPECT_EQ(result.left_sum + result.right_sum, parent_sum) << "rank: " << rank;
+}
+}  // anonymous namespace
+
+TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleSplit) {
+  DoTest(VerifyColumnSplitEvaluateSingleSplit, false);
+}
+
+TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleCategoricalSplit) {
+  DoTest(VerifyColumnSplitEvaluateSingleSplit, true);
+}
 }  // namespace tree
 }  // namespace xgboost
--- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
+++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
@@ -43,7 +43,8 @@ void VerifySampling(size_t page_size,
    EXPECT_NE(page->n_rows, kRows);
  }

-  GradientBasedSampler sampler(&ctx, page, kRows, param, subsample, sampling_method);
+  GradientBasedSampler sampler(&ctx, kRows, param, subsample, sampling_method,
+                               !fixed_size_sampling);
  auto sample = sampler.Sample(&ctx, gpair.DeviceSpan(), dmat.get());

  if (fixed_size_sampling) {
@@ -97,7 +98,7 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) {
  auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
  EXPECT_NE(page->n_rows, kRows);

-  GradientBasedSampler sampler(&ctx, page, kRows, param, kSubsample, TrainParam::kUniform);
+  GradientBasedSampler sampler(&ctx, kRows, param, kSubsample, TrainParam::kUniform, true);
  auto sample = sampler.Sample(&ctx, gpair.DeviceSpan(), dmat.get());
  auto sampled_page = sample.page;
  EXPECT_EQ(sample.sample_rows, kRows);
@@ -145,7 +146,8 @@ TEST(GradientBasedSampler, GradientBasedSampling) {
  constexpr size_t kPageSize = 0;
  constexpr float kSubsample = 0.8;
  constexpr int kSamplingMethod = TrainParam::kGradientBased;
-  VerifySampling(kPageSize, kSubsample, kSamplingMethod);
+  constexpr bool kFixedSizeSampling = true;
+  VerifySampling(kPageSize, kSubsample, kSamplingMethod, kFixedSizeSampling);
 }

 TEST(GradientBasedSampler, GradientBasedSamplingExternalMemory) {
--- a/tests/cpp/tree/gpu_hist/test_histogram.cu
+++ b/tests/cpp/tree/gpu_hist/test_histogram.cu
@@ -44,7 +44,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) {
    FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size,
                                 sizeof(GradientPairInt64));

-    auto quantiser = GradientQuantiser(gpair.DeviceSpan());
+    auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo());
    BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
                           feature_groups.DeviceAccessor(0), gpair.DeviceSpan(), ridx, d_histogram,
                           quantiser);
@@ -64,7 +64,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) {
      dh::device_vector<GradientPairInt64> new_histogram(num_bins);
      auto d_new_histogram = dh::ToSpan(new_histogram);

-      auto quantiser = GradientQuantiser(gpair.DeviceSpan());
+      auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo());
      BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
                             feature_groups.DeviceAccessor(0), gpair.DeviceSpan(), ridx,
                             d_new_histogram, quantiser);
@@ -154,7 +154,7 @@ void TestGPUHistogramCategorical(size_t num_categories) {
  dh::device_vector<GradientPairInt64> cat_hist(num_categories);
  auto gpair = GenerateRandomGradients(kRows, 0, 2);
  gpair.SetDevice(0);
-  auto quantiser = GradientQuantiser(gpair.DeviceSpan());
+  auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo());
  /**
   * Generate hist with cat data.
   */
--- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
+++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
@@ -34,7 +34,7 @@ void TestUpdatePositionBatch() {
  std::vector<int> extra_data = {0};
  // Send the first five training instances to the right node
  // and the second 5 to the left node
-  rp.UpdatePositionBatch({0}, {1}, {2}, extra_data, [=] __device__(RowPartitioner::RowIndexT ridx, int) {
+  rp.UpdatePositionBatch({0}, {1}, {2}, extra_data, [=] __device__(RowPartitioner::RowIndexT ridx, int, int) {
    return ridx > 4;
  });
  rows = rp.GetRowsHost(1);
@@ -47,7 +47,7 @@ void TestUpdatePositionBatch() {
  }

  // Split the left node again
-  rp.UpdatePositionBatch({1}, {3}, {4}, extra_data,[=] __device__(RowPartitioner::RowIndexT ridx, int) {
+  rp.UpdatePositionBatch({1}, {3}, {4}, extra_data,[=] __device__(RowPartitioner::RowIndexT ridx, int, int) {
    return ridx < 7;
  });
  EXPECT_EQ(rp.GetRows(3).size(), 2);
@@ -61,7 +61,7 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Se
  thrust::device_vector<uint32_t> ridx_tmp(ridx_in.size());
  thrust::device_vector<bst_uint> counts(segments.size());

-  auto op = [=] __device__(auto ridx, int data) { return ridx % 2 == 0; };
+  auto op = [=] __device__(auto ridx, int split_index, int data) { return ridx % 2 == 0; };
  std::vector<int> op_data(segments.size());
  std::vector<PerNodeData<int>> h_batch_info(segments.size());
  dh::TemporaryArray<PerNodeData<int>> d_batch_info(segments.size());
@@ -84,7 +84,7 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Se
  dh::device_vector<int8_t> tmp;
  SortPositionBatch<uint32_t, decltype(op), int>(dh::ToSpan(d_batch_info), dh::ToSpan(ridx),
                                                 dh::ToSpan(ridx_tmp), dh::ToSpan(counts),
-                                                 total_rows, op, &tmp, nullptr);
+                                                 total_rows, op, &tmp);

  auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; };
  for (size_t i = 0; i < segments.size(); i++) {
--- a/tests/cpp/tree/hist/test_evaluate_splits.cc
+++ b/tests/cpp/tree/hist/test_evaluate_splits.cc
@@ -4,13 +4,13 @@
 #include "../test_evaluate_splits.h"

 #include <gtest/gtest.h>
-#include <xgboost/base.h>                               // for GradientPairPrecise, Args, Gradie...
-#include <xgboost/context.h>                            // for Context
-#include <xgboost/data.h>                               // for FeatureType, DMatrix, MetaInfo
-#include <xgboost/logging.h>                            // for CHECK_EQ
-#include <xgboost/tree_model.h>                         // for RegTree, RTreeNodeStat
+#include <xgboost/base.h>        // for GradientPairPrecise, Args, Gradie...
+#include <xgboost/context.h>     // for Context
+#include <xgboost/data.h>        // for FeatureType, DMatrix, MetaInfo
+#include <xgboost/logging.h>     // for CHECK_EQ
+#include <xgboost/tree_model.h>  // for RegTree, RTreeNodeStat

-#include <memory>                                       // for make_shared, shared_ptr, addressof
+#include <memory>  // for make_shared, shared_ptr, addressof

 #include "../../../../src/common/hist_util.h"           // for HistCollection, HistogramCuts
 #include "../../../../src/common/random.h"              // for ColumnSampler
@@ -18,6 +18,8 @@
 #include "../../../../src/data/gradient_index.h"        // for GHistIndexMatrix
 #include "../../../../src/tree/hist/evaluate_splits.h"  // for HistEvaluator
 #include "../../../../src/tree/hist/expand_entry.h"     // for CPUExpandEntry
+#include "../../../../src/tree/hist/hist_cache.h"       // for BoundedHistCollection
+#include "../../../../src/tree/hist/param.h"            // for HistMakerTrainParam
 #include "../../../../src/tree/param.h"                 // for GradStats, TrainParam
 #include "../../helpers.h"                              // for RandomDataGenerator, AllThreadsFo...

@@ -34,7 +36,7 @@ void TestEvaluateSplits(bool force_read_by_column) {
  auto dmat = RandomDataGenerator(kRows, kCols, 0).Seed(3).GenerateDMatrix();

  auto evaluator = HistEvaluator{&ctx, &param, dmat->Info(), sampler};
-  common::HistCollection hist;
+  BoundedHistCollection hist;
  std::vector<GradientPair> row_gpairs = {
      {1.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f},  {2.27f, 0.28f},
      {0.27f, 0.29f}, {0.37f, 0.39f}, {-0.47f, 0.49f}, {0.57f, 0.59f}};
@@ -48,12 +50,10 @@ void TestEvaluateSplits(bool force_read_by_column) {
  std::iota(row_indices.begin(), row_indices.end(), 0);
  row_set_collection.Init();

-  auto hist_builder = common::GHistBuilder(gmat.cut.Ptrs().back());
-  hist.Init(gmat.cut.Ptrs().back());
-  hist.AddHistRow(0);
-  hist.AllocateAllData();
-  hist_builder.template BuildHist<false>(row_gpairs, row_set_collection[0],
-                                         gmat, hist[0], force_read_by_column);
+  HistMakerTrainParam hist_param;
+  hist.Reset(gmat.cut.Ptrs().back(), hist_param.max_cached_hist_node);
+  hist.AllocateHistograms({0});
+  common::BuildHist<false>(row_gpairs, row_set_collection[0], gmat, hist[0], force_read_by_column);

  // Compute total gradient for all data points
  GradientPairPrecise total_gpair;
@@ -113,13 +113,13 @@ TEST(HistMultiEvaluator, Evaluate) {
      RandomDataGenerator{n_samples, n_features, 0.5}.Targets(n_targets).GenerateDMatrix(true);

  HistMultiEvaluator evaluator{&ctx, p_fmat->Info(), &param, sampler};
-  std::vector<common::HistCollection> histogram(n_targets);
+  HistMakerTrainParam hist_param;
+  std::vector<BoundedHistCollection> histogram(n_targets);
  linalg::Vector<GradientPairPrecise> root_sum({2}, Context::kCpuId);
  for (bst_target_t t{0}; t < n_targets; ++t) {
    auto &hist = histogram[t];
-    hist.Init(n_bins * n_features);
-    hist.AddHistRow(0);
-    hist.AllocateAllData();
+    hist.Reset(n_bins * n_features, hist_param.max_cached_hist_node);
+    hist.AllocateHistograms({0});
    auto node_hist = hist[0];
    node_hist[0] = {-0.5, 0.5};
    node_hist[1] = {2.0, 0.5};
@@ -145,7 +145,7 @@ TEST(HistMultiEvaluator, Evaluate) {

  std::vector<MultiExpandEntry> entries(1, {/*nidx=*/0, /*depth=*/0});

-  std::vector<common::HistCollection const *> ptrs;
+  std::vector<BoundedHistCollection const *> ptrs;
  std::transform(histogram.cbegin(), histogram.cend(), std::back_inserter(ptrs),
                 [](auto const &h) { return std::addressof(h); });

@@ -227,16 +227,16 @@ auto CompareOneHotAndPartition(bool onehot) {
  auto sampler = std::make_shared<common::ColumnSampler>();
  auto evaluator = HistEvaluator{&ctx, &param, dmat->Info(), sampler};
  std::vector<CPUExpandEntry> entries(1);
+  HistMakerTrainParam hist_param;

  for (auto const &gmat : dmat->GetBatches<GHistIndexMatrix>(&ctx, {32, param.sparse_threshold})) {
-    common::HistCollection hist;
+    BoundedHistCollection hist;

    entries.front().nid = 0;
    entries.front().depth = 0;

-    hist.Init(gmat.cut.TotalBins());
-    hist.AddHistRow(0);
-    hist.AllocateAllData();
+    hist.Reset(gmat.cut.TotalBins(), hist_param.max_cached_hist_node);
+    hist.AllocateHistograms({0});
    auto node_hist = hist[0];

    CHECK_EQ(node_hist.size(), n_cats);
@@ -263,10 +263,10 @@ TEST(HistEvaluator, Categorical) {
 }

 TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) {
-  common::HistCollection hist;
-  hist.Init(cuts_.TotalBins());
-  hist.AddHistRow(0);
-  hist.AllocateAllData();
+  BoundedHistCollection hist;
+  HistMakerTrainParam hist_param;
+  hist.Reset(cuts_.TotalBins(), hist_param.max_cached_hist_node);
+  hist.AllocateHistograms({0});
  auto node_hist = hist[0];
  ASSERT_EQ(node_hist.size(), feature_histogram_.size());
  std::copy(feature_histogram_.cbegin(), feature_histogram_.cend(), node_hist.begin());
--- a/tests/cpp/tree/hist/test_histogram.cc
+++ b/tests/cpp/tree/hist/test_histogram.cc
@@ -2,19 +2,40 @@
 * Copyright 2018-2023 by Contributors
 */
 #include <gtest/gtest.h>
-#include <xgboost/context.h>  // Context
+#include <xgboost/base.h>                // for bst_node_t, bst_bin_t, Gradient...
+#include <xgboost/context.h>             // for Context
+#include <xgboost/data.h>                // for BatchIterator, BatchSet, DMatrix
+#include <xgboost/host_device_vector.h>  // for HostDeviceVector
+#include <xgboost/linalg.h>              // for MakeTensorView
+#include <xgboost/logging.h>             // for Error, LogCheck_EQ, LogCheck_LT
+#include <xgboost/span.h>                // for Span, operator!=
+#include <xgboost/tree_model.h>          // for RegTree

-#include <limits>
+#include <algorithm>   // for max
+#include <cstddef>     // for size_t
+#include <cstdint>     // for int32_t, uint32_t
+#include <functional>  // for function
+#include <iterator>    // for back_inserter
+#include <limits>      // for numeric_limits
+#include <memory>      // for shared_ptr, allocator, unique_ptr
+#include <numeric>     // for iota, accumulate
+#include <vector>      // for vector

-#include "../../../../src/common/categorical.h"
-#include "../../../../src/common/row_set.h"
-#include "../../../../src/tree/hist/expand_entry.h"
-#include "../../../../src/tree/hist/histogram.h"
-#include "../../categorical_helpers.h"
-#include "../../helpers.h"
+#include "../../../../src/collective/communicator-inl.h"  // for GetRank, GetWorldSize
+#include "../../../../src/common/hist_util.h"             // for GHistRow, HistogramCuts, Sketch...
+#include "../../../../src/common/ref_resource_view.h"     // for RefResourceView
+#include "../../../../src/common/row_set.h"               // for RowSetCollection
+#include "../../../../src/common/threading_utils.h"       // for BlockedSpace2d
+#include "../../../../src/data/gradient_index.h"          // for GHistIndexMatrix
+#include "../../../../src/tree/common_row_partitioner.h"  // for CommonRowPartitioner
+#include "../../../../src/tree/hist/expand_entry.h"       // for CPUExpandEntry
+#include "../../../../src/tree/hist/hist_cache.h"         // for BoundedHistCollection
+#include "../../../../src/tree/hist/histogram.h"          // for HistogramBuilder
+#include "../../../../src/tree/hist/param.h"              // for HistMakerTrainParam
+#include "../../categorical_helpers.h"                    // for OneHotEncodeFeature
+#include "../../helpers.h"                                // for RandomDataGenerator, GenerateRa...

-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
 namespace {
 void InitRowPartitionForTest(common::RowSetCollection *row_set, size_t n_samples, size_t base_rowid = 0) {
  auto &row_indices = *row_set->Data();
@@ -26,10 +47,8 @@ void InitRowPartitionForTest(common::RowSetCollection *row_set, size_t n_samples

 void TestAddHistRows(bool is_distributed) {
  Context ctx;
-  std::vector<CPUExpandEntry> nodes_for_explicit_hist_build_;
-  std::vector<CPUExpandEntry> nodes_for_subtraction_trick_;
-  int starting_index = std::numeric_limits<int>::max();
-  int sync_count = 0;
+  std::vector<bst_node_t> nodes_to_build;
+  std::vector<bst_node_t> nodes_to_sub;

  size_t constexpr kNRows = 8, kNCols = 16;
  int32_t constexpr kMaxBins = 4;
@@ -42,26 +61,22 @@ void TestAddHistRows(bool is_distributed) {
  tree.ExpandNode(0, 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
  tree.ExpandNode(tree[0].LeftChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
  tree.ExpandNode(tree[0].RightChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
-  nodes_for_explicit_hist_build_.emplace_back(3, tree.GetDepth(3));
-  nodes_for_explicit_hist_build_.emplace_back(4, tree.GetDepth(4));
-  nodes_for_subtraction_trick_.emplace_back(5, tree.GetDepth(5));
-  nodes_for_subtraction_trick_.emplace_back(6, tree.GetDepth(6));
+  nodes_to_build.emplace_back(3);
+  nodes_to_build.emplace_back(4);
+  nodes_to_sub.emplace_back(5);
+  nodes_to_sub.emplace_back(6);

-  HistogramBuilder<CPUExpandEntry> histogram_builder;
-  histogram_builder.Reset(gmat.cut.TotalBins(), {kMaxBins, 0.5}, omp_get_max_threads(), 1,
-                          is_distributed, false);
-  histogram_builder.AddHistRows(&starting_index, &sync_count,
-                                nodes_for_explicit_hist_build_,
-                                nodes_for_subtraction_trick_, &tree);
+  HistMakerTrainParam hist_param;
+  HistogramBuilder histogram_builder;
+  histogram_builder.Reset(&ctx, gmat.cut.TotalBins(), {kMaxBins, 0.5}, is_distributed, false,
+                          &hist_param);
+  histogram_builder.AddHistRows(&tree, &nodes_to_build, &nodes_to_sub, false);

-  ASSERT_EQ(sync_count, 2);
-  ASSERT_EQ(starting_index, 3);
-
-  for (const CPUExpandEntry &node : nodes_for_explicit_hist_build_) {
-    ASSERT_EQ(histogram_builder.Histogram().RowExists(node.nid), true);
+  for (bst_node_t const &nidx : nodes_to_build) {
+    ASSERT_TRUE(histogram_builder.Histogram().HistogramExists(nidx));
  }
-  for (const CPUExpandEntry &node : nodes_for_subtraction_trick_) {
-    ASSERT_EQ(histogram_builder.Histogram().RowExists(node.nid), true);
+  for (bst_node_t const &nidx : nodes_to_sub) {
+    ASSERT_TRUE(histogram_builder.Histogram().HistogramExists(nidx));
  }
 }

@@ -72,87 +87,77 @@ TEST(CPUHistogram, AddRows) {
 }

 void TestSyncHist(bool is_distributed) {
-  size_t constexpr kNRows = 8, kNCols = 16;
-  int32_t constexpr kMaxBins = 4;
+  std::size_t constexpr kNRows = 8, kNCols = 16;
+  bst_bin_t constexpr kMaxBins = 4;
  Context ctx;

-  std::vector<CPUExpandEntry> nodes_for_explicit_hist_build_;
-  std::vector<CPUExpandEntry> nodes_for_subtraction_trick_;
-  int starting_index = std::numeric_limits<int>::max();
-  int sync_count = 0;
+  std::vector<bst_node_t> nodes_for_explicit_hist_build;  // tree node indices, not bin indices
+  std::vector<bst_node_t> nodes_for_subtraction_trick;    // tree node indices, not bin indices
  RegTree tree;

  auto p_fmat = RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix();
  auto const &gmat =
      *(p_fmat->GetBatches<GHistIndexMatrix>(&ctx, BatchParam{kMaxBins, 0.5}).begin());

-  HistogramBuilder<CPUExpandEntry> histogram;
+  HistogramBuilder histogram;
  uint32_t total_bins = gmat.cut.Ptrs().back();
-  histogram.Reset(total_bins, {kMaxBins, 0.5}, omp_get_max_threads(), 1, is_distributed, false);
+  HistMakerTrainParam hist_param;
+  histogram.Reset(&ctx, total_bins, {kMaxBins, 0.5}, is_distributed, false, &hist_param);

-  common::RowSetCollection row_set_collection_;
+  common::RowSetCollection row_set_collection;
  {
-    row_set_collection_.Clear();
-    std::vector<size_t> &row_indices = *row_set_collection_.Data();
+    row_set_collection.Clear();
+    std::vector<size_t> &row_indices = *row_set_collection.Data();
    row_indices.resize(kNRows);
    std::iota(row_indices.begin(), row_indices.end(), 0);
-    row_set_collection_.Init();
+    row_set_collection.Init();
  }

  // level 0
-  nodes_for_explicit_hist_build_.emplace_back(0, tree.GetDepth(0));
-  histogram.AddHistRows(&starting_index, &sync_count,
-                        nodes_for_explicit_hist_build_,
-                        nodes_for_subtraction_trick_, &tree);
+  nodes_for_explicit_hist_build.emplace_back(0);
+  histogram.AddHistRows(&tree, &nodes_for_explicit_hist_build, &nodes_for_subtraction_trick, false);

  tree.ExpandNode(0, 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
-  nodes_for_explicit_hist_build_.clear();
-  nodes_for_subtraction_trick_.clear();
+  nodes_for_explicit_hist_build.clear();
+  nodes_for_subtraction_trick.clear();

  // level 1
-  nodes_for_explicit_hist_build_.emplace_back(tree[0].LeftChild(), tree.GetDepth(1));
-  nodes_for_subtraction_trick_.emplace_back(tree[0].RightChild(), tree.GetDepth(2));
+  nodes_for_explicit_hist_build.emplace_back(tree[0].LeftChild());
+  nodes_for_subtraction_trick.emplace_back(tree[0].RightChild());

-  histogram.AddHistRows(&starting_index, &sync_count,
-                        nodes_for_explicit_hist_build_,
-                        nodes_for_subtraction_trick_, &tree);
+  histogram.AddHistRows(&tree, &nodes_for_explicit_hist_build, &nodes_for_subtraction_trick, false);

  tree.ExpandNode(tree[0].LeftChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
  tree.ExpandNode(tree[0].RightChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0);

-  nodes_for_explicit_hist_build_.clear();
-  nodes_for_subtraction_trick_.clear();
+  nodes_for_explicit_hist_build.clear();
+  nodes_for_subtraction_trick.clear();
  // level 2
-  nodes_for_explicit_hist_build_.emplace_back(3, tree.GetDepth(3));
-  nodes_for_subtraction_trick_.emplace_back(4, tree.GetDepth(4));
-  nodes_for_explicit_hist_build_.emplace_back(5, tree.GetDepth(5));
-  nodes_for_subtraction_trick_.emplace_back(6, tree.GetDepth(6));
+  nodes_for_explicit_hist_build.emplace_back(3);
+  nodes_for_subtraction_trick.emplace_back(4);
+  nodes_for_explicit_hist_build.emplace_back(5);
+  nodes_for_subtraction_trick.emplace_back(6);

-  histogram.AddHistRows(&starting_index, &sync_count,
-                        nodes_for_explicit_hist_build_,
-                        nodes_for_subtraction_trick_, &tree);
+  histogram.AddHistRows(&tree, &nodes_for_explicit_hist_build, &nodes_for_subtraction_trick, false);

-  const size_t n_nodes = nodes_for_explicit_hist_build_.size();
+  const size_t n_nodes = nodes_for_explicit_hist_build.size();
  ASSERT_EQ(n_nodes, 2ul);
-  row_set_collection_.AddSplit(0, tree[0].LeftChild(), tree[0].RightChild(), 4,
-                               4);
-  row_set_collection_.AddSplit(1, tree[1].LeftChild(), tree[1].RightChild(), 2,
-                               2);
-  row_set_collection_.AddSplit(2, tree[2].LeftChild(), tree[2].RightChild(), 2,
-                               2);
+  row_set_collection.AddSplit(0, tree[0].LeftChild(), tree[0].RightChild(), 4, 4);
+  row_set_collection.AddSplit(1, tree[1].LeftChild(), tree[1].RightChild(), 2, 2);
+  row_set_collection.AddSplit(2, tree[2].LeftChild(), tree[2].RightChild(), 2, 2);

  common::BlockedSpace2d space(
      n_nodes,
-      [&](size_t node) {
-        const int32_t nid = nodes_for_explicit_hist_build_[node].nid;
-        return row_set_collection_[nid].Size();
+      [&](std::size_t nidx_in_set) {
+        bst_node_t nidx = nodes_for_explicit_hist_build[nidx_in_set];
+        return row_set_collection[nidx].Size();
      },
      256);

  std::vector<common::GHistRow> target_hists(n_nodes);
-  for (size_t i = 0; i < nodes_for_explicit_hist_build_.size(); ++i) {
-    const int32_t nid = nodes_for_explicit_hist_build_[i].nid;
-    target_hists[i] = histogram.Histogram()[nid];
+  for (size_t i = 0; i < nodes_for_explicit_hist_build.size(); ++i) {
+    bst_node_t nidx = nodes_for_explicit_hist_build[i];
+    target_hists[i] = histogram.Histogram()[nidx];
  }

  // set values to specific nodes hist
@@ -176,14 +181,7 @@ void TestSyncHist(bool is_distributed) {

  histogram.Buffer().Reset(1, n_nodes, space, target_hists);
  // sync hist
-  if (is_distributed) {
-    histogram.SyncHistogramDistributed(&tree, nodes_for_explicit_hist_build_,
-                                       nodes_for_subtraction_trick_,
-                                       starting_index, sync_count);
-  } else {
-    histogram.SyncHistogramLocal(&tree, nodes_for_explicit_hist_build_,
-                                 nodes_for_subtraction_trick_);
-  }
+  histogram.SyncHistogram(&tree, nodes_for_explicit_hist_build, nodes_for_subtraction_trick);

  using GHistRowT = common::GHistRow;
  auto check_hist = [](const GHistRowT parent, const GHistRowT left, const GHistRowT right,
@@ -196,11 +194,10 @@ void TestSyncHist(bool is_distributed) {
    }
  };
  size_t node_id = 0;
-  for (const CPUExpandEntry &node : nodes_for_explicit_hist_build_) {
-    auto this_hist = histogram.Histogram()[node.nid];
-    const size_t parent_id = tree[node.nid].Parent();
-    const size_t subtraction_node_id =
-        nodes_for_subtraction_trick_[node_id].nid;
+  for (auto const &nidx : nodes_for_explicit_hist_build) {
+    auto this_hist = histogram.Histogram()[nidx];
+    const size_t parent_id = tree[nidx].Parent();
+    const size_t subtraction_node_id = nodes_for_subtraction_trick[node_id];
    auto parent_hist = histogram.Histogram()[parent_id];
    auto sibling_hist = histogram.Histogram()[subtraction_node_id];

@@ -208,11 +205,10 @@ void TestSyncHist(bool is_distributed) {
    ++node_id;
  }
  node_id = 0;
-  for (const CPUExpandEntry &node : nodes_for_subtraction_trick_) {
-    auto this_hist = histogram.Histogram()[node.nid];
-    const size_t parent_id = tree[node.nid].Parent();
-    const size_t subtraction_node_id =
-        nodes_for_explicit_hist_build_[node_id].nid;
+  for (auto const &nidx : nodes_for_subtraction_trick) {
+    auto this_hist = histogram.Histogram()[nidx];
+    const size_t parent_id = tree[nidx].Parent();
+    const size_t subtraction_node_id = nodes_for_explicit_hist_build[node_id];
    auto parent_hist = histogram.Histogram()[parent_id];
    auto sibling_hist = histogram.Histogram()[subtraction_node_id];

@@ -246,9 +242,9 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_
      {0.27f, 0.29f}, {0.37f, 0.39f}, {0.47f, 0.49f}, {0.57f, 0.59f}};

  bst_node_t nid = 0;
-  HistogramBuilder<CPUExpandEntry> histogram;
-  histogram.Reset(total_bins, {kMaxBins, 0.5}, omp_get_max_threads(), 1, is_distributed,
-                  is_col_split);
+  HistogramBuilder histogram;
+  HistMakerTrainParam hist_param;
+  histogram.Reset(&ctx, total_bins, {kMaxBins, 0.5}, is_distributed, is_col_split, &hist_param);

  RegTree tree;

@@ -260,12 +256,17 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_
  row_set_collection.Init();

  CPUExpandEntry node{RegTree::kRoot, tree.GetDepth(0)};
-  std::vector<CPUExpandEntry> nodes_for_explicit_hist_build;
-  nodes_for_explicit_hist_build.push_back(node);
+  std::vector<bst_node_t> nodes_to_build{node.nid};
+  std::vector<bst_node_t> dummy_sub;
+
+  histogram.AddHistRows(&tree, &nodes_to_build, &dummy_sub, false);
+  common::BlockedSpace2d space{
+      1, [&](std::size_t nidx_in_set) { return row_set_collection[nidx_in_set].Size(); }, 256};
  for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(&ctx, {kMaxBins, 0.5})) {
-    histogram.BuildHist(0, gidx, &tree, row_set_collection, nodes_for_explicit_hist_build, {},
-                        gpair, force_read_by_column);
+    histogram.BuildHist(0, space, gidx, row_set_collection, nodes_to_build,
+                        linalg::MakeTensorView(&ctx, gpair, gpair.size()), force_read_by_column);
  }
+  histogram.SyncHistogram(&tree, nodes_to_build, {});

  // Check if number of histogram bins is correct
  ASSERT_EQ(histogram.Histogram()[nid].size(), gmat.cut.Ptrs().back());
@@ -326,18 +327,18 @@ void ValidateCategoricalHistogram(size_t n_categories,

 void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
  size_t constexpr kRows = 340;
-  int32_t constexpr kBins = 256;
+  bst_bin_t constexpr kBins = 256;
  auto x = GenerateRandomCategoricalSingleColumn(kRows, n_categories);
  auto cat_m = GetDMatrixFromData(x, kRows, 1);
  cat_m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);
  Context ctx;

-  BatchParam batch_param{0, static_cast<int32_t>(kBins)};
+  BatchParam batch_param{0, kBins};

  RegTree tree;
-  CPUExpandEntry node{RegTree::kRoot, tree.GetDepth(0)};
-  std::vector<CPUExpandEntry> nodes_for_explicit_hist_build;
-  nodes_for_explicit_hist_build.push_back(node);
+  CPUExpandEntry node{RegTree::kRoot, tree.GetDepth(RegTree::kRoot)};
+  std::vector<bst_node_t> nodes_to_build;
+  nodes_to_build.push_back(node.nid);

  auto gpair = GenerateRandomGradients(kRows, 0, 2);

@@ -347,30 +348,41 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
  row_indices.resize(kRows);
  std::iota(row_indices.begin(), row_indices.end(), 0);
  row_set_collection.Init();
+  HistMakerTrainParam hist_param;
+  std::vector<bst_node_t> dummy_sub;
+
+  common::BlockedSpace2d space{
+      1, [&](std::size_t nidx_in_set) { return row_set_collection[nidx_in_set].Size(); }, 256};

  /**
   * Generate hist with cat data.
   */
-  HistogramBuilder<CPUExpandEntry> cat_hist;
+  HistogramBuilder cat_hist;
  for (auto const &gidx : cat_m->GetBatches<GHistIndexMatrix>(&ctx, {kBins, 0.5})) {
    auto total_bins = gidx.cut.TotalBins();
-    cat_hist.Reset(total_bins, {kBins, 0.5}, omp_get_max_threads(), 1, false, false);
-    cat_hist.BuildHist(0, gidx, &tree, row_set_collection, nodes_for_explicit_hist_build, {},
-                       gpair.HostVector(), force_read_by_column);
+    cat_hist.Reset(&ctx, total_bins, {kBins, 0.5}, false, false, &hist_param);
+    cat_hist.AddHistRows(&tree, &nodes_to_build, &dummy_sub, false);
+    cat_hist.BuildHist(0, space, gidx, row_set_collection, nodes_to_build,
+                       linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size()),
+                       force_read_by_column);
  }
+  cat_hist.SyncHistogram(&tree, nodes_to_build, {});

  /**
   * Generate hist with one hot encoded data.
   */
  auto x_encoded = OneHotEncodeFeature(x, n_categories);
  auto encode_m = GetDMatrixFromData(x_encoded, kRows, n_categories);
-  HistogramBuilder<CPUExpandEntry> onehot_hist;
+  HistogramBuilder onehot_hist;
  for (auto const &gidx : encode_m->GetBatches<GHistIndexMatrix>(&ctx, {kBins, 0.5})) {
    auto total_bins = gidx.cut.TotalBins();
-    onehot_hist.Reset(total_bins, {kBins, 0.5}, omp_get_max_threads(), 1, false, false);
-    onehot_hist.BuildHist(0, gidx, &tree, row_set_collection, nodes_for_explicit_hist_build, {},
-                          gpair.HostVector(), force_read_by_column);
+    onehot_hist.Reset(&ctx, total_bins, {kBins, 0.5}, false, false, &hist_param);
+    onehot_hist.AddHistRows(&tree, &nodes_to_build, &dummy_sub, false);
+    onehot_hist.BuildHist(0, space, gidx, row_set_collection, nodes_to_build,
+                          linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size()),
+                          force_read_by_column);
  }
+  onehot_hist.SyncHistogram(&tree, nodes_to_build, {});

  auto cat = cat_hist.Histogram()[0];
  auto onehot = onehot_hist.Histogram()[0];
@@ -397,19 +409,22 @@ void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, boo
    batch_param.hess = hess;
  }

-  std::vector<size_t> partition_size(1, 0);
-  size_t total_bins{0};
-  size_t n_samples{0};
+  std::vector<std::size_t> partition_size(1, 0);
+  bst_bin_t total_bins{0};
+  bst_row_t n_samples{0};

  auto gpair = GenerateRandomGradients(m->Info().num_row_, 0.0, 1.0);
  auto const &h_gpair = gpair.HostVector();

  RegTree tree;
-  std::vector<CPUExpandEntry> nodes;
-  nodes.emplace_back(0, tree.GetDepth(0));
+  std::vector<bst_node_t> nodes{RegTree::kRoot};
+  common::BlockedSpace2d space{
+      1, [&](std::size_t nidx_in_set) { return partition_size.at(nidx_in_set); }, 256};

  common::GHistRow multi_page;
-  HistogramBuilder<CPUExpandEntry> multi_build;
+  HistogramBuilder multi_build;
+  HistMakerTrainParam hist_param;
+  std::vector<bst_node_t> dummy_sub;
  {
    /**
     * Multi page
@@ -427,23 +442,21 @@ void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, boo
    }
    ASSERT_EQ(n_samples, m->Info().num_row_);

-    common::BlockedSpace2d space{
-        1, [&](size_t nidx_in_set) { return partition_size.at(nidx_in_set); },
-        256};
-
-    multi_build.Reset(total_bins, batch_param, ctx->Threads(), rows_set.size(), false, false);
-
-    size_t page_idx{0};
+    multi_build.Reset(ctx, total_bins, batch_param, false, false, &hist_param);
+    multi_build.AddHistRows(&tree, &nodes, &dummy_sub, false);
+    std::size_t page_idx{0};
    for (auto const &page : m->GetBatches<GHistIndexMatrix>(ctx, batch_param)) {
-      multi_build.BuildHist(page_idx, space, page, &tree, rows_set.at(page_idx), nodes, {}, h_gpair,
+      multi_build.BuildHist(page_idx, space, page, rows_set[page_idx], nodes,
+                            linalg::MakeTensorView(ctx, h_gpair, h_gpair.size()),
                            force_read_by_column);
      ++page_idx;
    }
-    ASSERT_EQ(page_idx, 2);
-    multi_page = multi_build.Histogram()[0];
+    multi_build.SyncHistogram(&tree, nodes, {});
+
+    multi_page = multi_build.Histogram()[RegTree::kRoot];
  }

-  HistogramBuilder<CPUExpandEntry> single_build;
+  HistogramBuilder single_build;
  common::GHistRow single_page;
  {
    /**
@@ -452,18 +465,24 @@ void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, boo
    common::RowSetCollection row_set_collection;
    InitRowPartitionForTest(&row_set_collection, n_samples);

-    single_build.Reset(total_bins, batch_param, ctx->Threads(), 1, false, false);
+    single_build.Reset(ctx, total_bins, batch_param, false, false, &hist_param);
    SparsePage concat;
    std::vector<float> hess(m->Info().num_row_, 1.0f);
-    for (auto const& page : m->GetBatches<SparsePage>()) {
+    for (auto const &page : m->GetBatches<SparsePage>()) {
      concat.Push(page);
    }

    auto cut = common::SketchOnDMatrix(ctx, m.get(), batch_param.max_bin, false, hess);
    GHistIndexMatrix gmat(concat, {}, cut, batch_param.max_bin, false,
                          std::numeric_limits<double>::quiet_NaN(), ctx->Threads());
-    single_build.BuildHist(0, gmat, &tree, row_set_collection, nodes, {}, h_gpair, force_read_by_column);
-    single_page = single_build.Histogram()[0];
+
+    single_build.AddHistRows(&tree, &nodes, &dummy_sub, false);
+    single_build.BuildHist(0, space, gmat, row_set_collection, nodes,
+                           linalg::MakeTensorView(ctx, h_gpair, h_gpair.size()),
+                           force_read_by_column);
+    single_build.SyncHistogram(&tree, nodes, {});
+
+    single_page = single_build.Histogram()[RegTree::kRoot];
  }

  for (size_t i = 0; i < single_page.size(); ++i) {
@@ -487,5 +506,108 @@ TEST(CPUHistogram, ExternalMemory) {
  TestHistogramExternalMemory(&ctx, {kBins, sparse_thresh}, false, false);
  TestHistogramExternalMemory(&ctx, {kBins, sparse_thresh}, false, true);
 }
-}  // namespace tree
-}  // namespace xgboost
+
+namespace {
+/**
+ * @brief Compares child-node histograms built with and without `max_cached_hist_node` set to 1.
+ *
+ * The test parameter is a tuple whose first element is forwarded as `is_distributed` and whose
+ * second element is forwarded as `is_col_split`.
+ */
+class OverflowTest : public ::testing::TestWithParam<std::tuple<bool, bool>> {
+ public:
+  /**
+   * @brief Build the root histogram, force a single split, then build the histograms of both
+   *        children and return them.
+   *
+   * @param limit          When true, `max_cached_hist_node` is set to "1" before the builder is
+   *                       reset.
+   * @param is_distributed Forwarded to MultiHistogramBuilder::Reset.
+   * @param is_col_split   When true, a plain DMatrix is generated and sliced by column using the
+   *                       collective world size and rank; otherwise a quantile DMatrix is used.
+   *
+   * @return The left child's histogram followed by the right child's histogram.
+   */
+  std::vector<GradientPairPrecise> TestOverflow(bool limit, bool is_distributed,
+                                                bool is_col_split) {
+    bst_bin_t constexpr kBins = 256;
+    Context ctx;
+    HistMakerTrainParam hist_param;
+    if (limit) {
+      hist_param.Init(Args{{"max_cached_hist_node", "1"}});
+    }
+
+    // Same generator arguments (8192, 16, 0.5) on both branches; only the DMatrix kind differs.
+    std::shared_ptr<DMatrix> Xy =
+        is_col_split ? RandomDataGenerator{8192, 16, 0.5}.GenerateDMatrix(true)
+                     : RandomDataGenerator{8192, 16, 0.5}.Bins(kBins).GenerateQuantileDMatrix(true);
+    if (is_col_split) {
+      Xy =
+          std::shared_ptr<DMatrix>{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};
+    }
+
+    double sparse_thresh{TrainParam::DftSparseThreshold()};
+    auto batch = BatchParam{kBins, sparse_thresh};
+    bst_bin_t n_total_bins{0};
+    float split_cond{0};
+    // Both values are overwritten on every iteration, so the last page visited wins.
+    for (auto const &page : Xy->GetBatches<GHistIndexMatrix>(&ctx, batch)) {
+      n_total_bins = page.cut.TotalBins();
+      // use a cut point in the second column for split
+      // NOTE(review): the index assumes the first column contributes exactly kBins cut values.
+      split_cond = page.cut.Values()[kBins + kBins / 2];
+    }
+
+    RegTree tree;
+    MultiHistogramBuilder hist_builder;
+    CHECK_EQ(Xy->Info().IsColumnSplit(), is_col_split);
+
+    hist_builder.Reset(&ctx, n_total_bins, tree.NumTargets(), batch, is_distributed,
+                       Xy->Info().IsColumnSplit(), &hist_param);
+
+    std::vector<CommonRowPartitioner> partitioners;
+    partitioners.emplace_back(&ctx, Xy->Info().num_row_, /*base_rowid=*/0,
+                              Xy->Info().IsColumnSplit());
+
+    auto gpair = GenerateRandomGradients(Xy->Info().num_row_, 0.0, 1.0);
+
+    CPUExpandEntry best;
+    hist_builder.BuildRootHist(Xy.get(), &tree, partitioners,
+                               linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size(), 1),
+                               best, batch);
+
+    // Hand-craft the split from split_cond instead of evaluating candidates, then apply it to
+    // the tree. Presumably the second argument (1) is the feature index, matching the cut
+    // chosen above -- TODO confirm.
+    best.split.Update(1.0f, 1, split_cond, false, false, GradStats{1.0, 1.0}, GradStats{1.0, 1.0});
+    tree.ExpandNode(best.nid, best.split.SplitIndex(), best.split.split_value, false,
+                    /*base_weight=*/2.0f,
+                    /*left_leaf_weight=*/1.0f, /*right_leaf_weight=*/1.0f, best.GetLossChange(),
+                    /*sum_hess=*/2.0f, best.split.left_sum.GetHess(),
+                    best.split.right_sum.GetHess());
+
+    // Re-partition the rows for the new split; both children must end up non-empty.
+    std::vector<CPUExpandEntry> valid_candidates{best};
+    for (auto const &page : Xy->GetBatches<GHistIndexMatrix>(&ctx, batch)) {
+      partitioners.front().UpdatePosition(&ctx, page, valid_candidates, &tree);
+    }
+    CHECK_NE(partitioners.front()[tree.LeftChild(best.nid)].Size(), 0);
+    CHECK_NE(partitioners.front()[tree.RightChild(best.nid)].Size(), 0);
+
+    hist_builder.BuildHistLeftRight(
+        Xy.get(), &tree, partitioners, valid_candidates,
+        linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size(), 1), batch);
+
+    // With the limit, the parent's histogram must no longer exist after the children are built;
+    // without it, the parent's histogram must still be present.
+    if (limit) {
+      CHECK(!hist_builder.Histogram(0).HistogramExists(best.nid));
+    } else {
+      CHECK(hist_builder.Histogram(0).HistogramExists(best.nid));
+    }
+
+    // Concatenate the left and right child histograms taken from Histogram(0).
+    std::vector<GradientPairPrecise> result;
+    auto hist = hist_builder.Histogram(0)[tree.LeftChild(best.nid)];
+    std::copy(hist.cbegin(), hist.cend(), std::back_inserter(result));
+    hist = hist_builder.Histogram(0)[tree.RightChild(best.nid)];
+    std::copy(hist.cbegin(), hist.cend(), std::back_inserter(result));
+
+    return result;
+  }
+
+  /**
+   * @brief Run TestOverflow once without and once with the limit, and require equal results.
+   */
+  void RunTest() {
+    auto param = GetParam();
+    auto res0 = this->TestOverflow(false, std::get<0>(param), std::get<1>(param));
+    auto res1 = this->TestOverflow(true, std::get<0>(param), std::get<1>(param));
+    ASSERT_EQ(res0, res1);
+  }
+};
+
+// Every combination of the two boolean test flags, first flag varying slowest.
+auto MakeParamsForTest() {
+  return std::vector<std::tuple<bool, bool>>{
+      {true, true},
+      {true, false},
+      {false, true},
+      {false, false},
+  };
+}
+}  // anonymous namespace
+
+TEST_P(OverflowTest, Overflow) { this->RunTest(); }
+// Instantiate the suite once for every tuple returned by MakeParamsForTest().
+INSTANTIATE_TEST_SUITE_P(CPUHistogram, OverflowTest, ::testing::ValuesIn(MakeParamsForTest()));
+}  // namespace xgboost::tree
--- a/tests/cpp/tree/test_constraints.cu
+++ b/tests/cpp/tree/test_constraints.cu
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019 XGBoost contributors
+/**
+ * Copyright 2019-2023, XGBoost contributors
 */
 #include <gtest/gtest.h>
 #include <thrust/copy.h>
@@ -59,7 +59,7 @@ void CompareBitField(LBitField64 d_field, std::set<uint32_t> positions) {
  LBitField64 h_field{ {h_field_storage.data(),
                        h_field_storage.data() + h_field_storage.size()} };

-  for (size_t i = 0; i < h_field.Size(); ++i) {
+  for (size_t i = 0; i < h_field.Capacity(); ++i) {
    if (positions.find(i) != positions.cend()) {
      ASSERT_TRUE(h_field.Check(i));
    } else {
@@ -88,7 +88,7 @@ TEST(GPUFeatureInteractionConstraint, Init) {
        {h_node_storage.data(), h_node_storage.data() +  h_node_storage.size()}
      };
      // no feature is attached to node.
-      for (size_t i = 0; i < h_node.Size(); ++i) {
+      for (size_t i = 0; i < h_node.Capacity(); ++i) {
        ASSERT_FALSE(h_node.Check(i));
      }
    }
--- a/tests/cpp/tree/test_evaluate_splits.h
+++ b/tests/cpp/tree/test_evaluate_splits.h
@@ -2,22 +2,24 @@
 * Copyright 2022-2023 by XGBoost Contributors
 */
 #include <gtest/gtest.h>
-#include <xgboost/base.h>                       // for GradientPairInternal, GradientPairPrecise
-#include <xgboost/data.h>                       // for MetaInfo
-#include <xgboost/host_device_vector.h>         // for HostDeviceVector
-#include <xgboost/span.h>                       // for operator!=, Span, SpanIterator
+#include <xgboost/base.h>                // for GradientPairInternal, GradientPairPrecise
+#include <xgboost/data.h>                // for MetaInfo
+#include <xgboost/host_device_vector.h>  // for HostDeviceVector
+#include <xgboost/span.h>                // for operator!=, Span, SpanIterator

-#include <algorithm>                            // for max, max_element, next_permutation, copy
-#include <cmath>                                // for isnan
-#include <cstddef>                              // for size_t
-#include <cstdint>                              // for int32_t, uint64_t, uint32_t
-#include <limits>                               // for numeric_limits
-#include <numeric>                              // for iota
-#include <tuple>                                // for make_tuple, tie, tuple
-#include <utility>                              // for pair
-#include <vector>                               // for vector
+#include <algorithm>  // for max, max_element, next_permutation, copy
+#include <cmath>      // for isnan
+#include <cstddef>    // for size_t
+#include <cstdint>    // for int32_t, uint64_t, uint32_t
+#include <limits>     // for numeric_limits
+#include <numeric>    // for iota
+#include <tuple>      // for make_tuple, tie, tuple
+#include <utility>    // for pair
+#include <vector>     // for vector

 #include "../../../src/common/hist_util.h"      // for HistogramCuts, HistCollection, GHistRow
+#include "../../../src/tree/hist/hist_cache.h"  // for HistogramCollection
+#include "../../../src/tree/hist/param.h"       // for HistMakerTrainParam
 #include "../../../src/tree/param.h"            // for TrainParam, GradStats
 #include "../../../src/tree/split_evaluator.h"  // for TreeEvaluator
 #include "../helpers.h"                         // for SimpleLCG, SimpleRealUniformDistribution
@@ -35,7 +37,7 @@ class TestPartitionBasedSplit : public ::testing::Test {
  MetaInfo info_;
  float best_score_{-std::numeric_limits<float>::infinity()};
  common::HistogramCuts cuts_;
-  common::HistCollection hist_;
+  BoundedHistCollection hist_;
  GradientPairPrecise total_gpair_;

  void SetUp() override {
@@ -56,9 +58,9 @@ class TestPartitionBasedSplit : public ::testing::Test {

    cuts_.min_vals_.Resize(1);

-    hist_.Init(cuts_.TotalBins());
-    hist_.AddHistRow(0);
-    hist_.AllocateAllData();
+    HistMakerTrainParam hist_param;
+    hist_.Reset(cuts_.TotalBins(), hist_param.max_cached_hist_node);
+    hist_.AllocateHistograms({0});
    auto node_hist = hist_[0];

    SimpleLCG lcg;
--- a/tests/cpp/tree/test_fit_stump.cc
+++ b/tests/cpp/tree/test_fit_stump.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2022 by XGBoost Contributors
+ * Copyright 2022-2023, XGBoost Contributors
 */
 #include <gtest/gtest.h>
 #include <xgboost/linalg.h>
@@ -8,17 +8,17 @@
 #include "../../src/tree/fit_stump.h"
 #include "../helpers.h"

-namespace xgboost {
-namespace tree {
+namespace xgboost::tree {
 namespace {
 void TestFitStump(Context const *ctx, DataSplitMode split = DataSplitMode::kRow) {
  std::size_t constexpr kRows = 16, kTargets = 2;
-  HostDeviceVector<GradientPair> gpair;
-  auto &h_gpair = gpair.HostVector();
-  h_gpair.resize(kRows * kTargets);
+  linalg::Matrix<GradientPair> gpair;
+  gpair.SetDevice(ctx->Device());
+  gpair.Reshape(kRows, kTargets);
+  auto h_gpair = gpair.HostView();
  for (std::size_t i = 0; i < kRows; ++i) {
    for (std::size_t t = 0; t < kTargets; ++t) {
-      h_gpair.at(i * kTargets + t) = GradientPair{static_cast<float>(i), 1};
+      h_gpair(i, t) = GradientPair{static_cast<float>(i), 1};
    }
  }
  linalg::Vector<float> out;
@@ -53,6 +53,4 @@ TEST(InitEstimation, FitStumpColumnSplit) {
  auto constexpr kWorldSize{3};
  RunWithInMemoryCommunicator(kWorldSize, &TestFitStump, &ctx, DataSplitMode::kCol);
 }
-
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -11,16 +11,15 @@
 #include <vector>

 #include "../../../src/common/common.h"
-#include "../../../src/data/sparse_page_source.h"
 #if defined(XGBOOST_USE_CUDA)
-#include "../../../src/tree/constraints.cuh"
+#include "../../../src/data/ellpack_page.cuh"  // for EllpackPageImpl
+#include "../../../src/data/ellpack_page.h"    // for EllpackPage
 #include "../../../src/tree/param.h"  // for TrainParam
-#include "../../../src/tree/updater_gpu_common.cuh"
 #include "../../../src/tree/updater_gpu_hist.cu"
 #elif defined(XGBOOST_USE_HIP)
-#include "../../../src/tree/constraints.hip.h"
+#include "../../../src/data/ellpack_page.hip.h"  // for EllpackPageImpl
+#include "../../../src/data/ellpack_page.h"    // for EllpackPage
 #include "../../../src/tree/param.h"  // for TrainParam
-#include "../../../src/tree/updater_gpu_common.hip.h"
 #include "../../../src/tree/updater_gpu_hist.hip"
 #endif
 #include "../filesystem.h"  // dmlc::TemporaryDirectory
@@ -103,8 +102,9 @@ void TestBuildHist(bool use_shared_memory_histograms) {
  auto page = BuildEllpackPage(kNRows, kNCols);
  BatchParam batch_param{};
  Context ctx{MakeCUDACtx(0)};
-  GPUHistMakerDevice<GradientSumT> maker(&ctx, page.get(), {}, kNRows, param, kNCols, kNCols,
-                                         batch_param);
+  auto cs = std::make_shared<common::ColumnSampler>(0);
+  GPUHistMakerDevice maker(&ctx, /*is_external_memory=*/false, {}, kNRows, param, cs, kNCols,
+                           batch_param, MetaInfo());
  xgboost::SimpleLCG gen;
  xgboost::SimpleRealUniformDistribution<bst_float> dist(0.0f, 1.0f);
  HostDeviceVector<GradientPair> gpair(kNRows);
@@ -116,10 +116,16 @@ void TestBuildHist(bool use_shared_memory_histograms) {
  gpair.SetDevice(0);

  thrust::host_vector<common::CompressedByteT> h_gidx_buffer (page->gidx_buffer.HostVector());
-  maker.row_partitioner.reset(new RowPartitioner(0, kNRows));
+  maker.row_partitioner = std::make_unique<RowPartitioner>(0, kNRows);
+
+  maker.hist.Init(0, page->Cuts().TotalBins());
  maker.hist.AllocateHistograms({0});
+
  maker.gpair = gpair.DeviceSpan();
-  maker.quantiser.reset(new GradientQuantiser(maker.gpair));
+  maker.quantiser = std::make_unique<GradientQuantiser>(maker.gpair, MetaInfo());
+  maker.page = page.get();
+
+  maker.InitFeatureGroupsOnce();

  BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
                         maker.feature_groups->DeviceAccessor(0), gpair.DeviceSpan(),
@@ -143,8 +149,8 @@ void TestBuildHist(bool use_shared_memory_histograms) {
  std::vector<GradientPairPrecise> solution = GetHostHistGpair();
  for (size_t i = 0; i < h_result.size(); ++i) {
    auto result = maker.quantiser->ToFloatingPoint(h_result[i]);
-    EXPECT_NEAR(result.GetGrad(), solution[i].GetGrad(), 0.01f);
-    EXPECT_NEAR(result.GetHess(), solution[i].GetHess(), 0.01f);
+    ASSERT_NEAR(result.GetGrad(), solution[i].GetGrad(), 0.01f);
+    ASSERT_NEAR(result.GetHess(), solution[i].GetHess(), 0.01f);
  }
 }

@@ -176,7 +182,7 @@ HistogramCutsWrapper GetHostCutMatrix () {
 inline GradientQuantiser DummyRoundingFactor() {
  thrust::device_vector<GradientPair> gpair(1);
  gpair[0] = {1000.f, 1000.f};  // Tests should not exceed sum of 1000
-  return GradientQuantiser(dh::ToSpan(gpair));
+  return {dh::ToSpan(gpair), MetaInfo()};
 }

 void TestHistogramIndexImpl() {
@@ -225,7 +231,7 @@ TEST(GpuHist, TestHistogramIndex) {
  TestHistogramIndexImpl();
 }

-void UpdateTree(Context const* ctx, HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
+void UpdateTree(Context const* ctx, linalg::Matrix<GradientPair>* gpair, DMatrix* dmat,
                size_t gpu_page_size, RegTree* tree, HostDeviceVector<bst_float>* preds,
                float subsample = 1.0f, const std::string& sampling_method = "uniform",
                int max_bin = 2) {
@@ -257,6 +263,7 @@ void UpdateTree(Context const* ctx, HostDeviceVector<GradientPair>* gpair, DMatr

  ObjInfo task{ObjInfo::kRegression};
  tree::GPUHistMaker hist_maker{ctx, &task};
+  hist_maker.Configure(Args{});

  std::vector<HostDeviceVector<bst_node_t>> position(1);
  hist_maker.Update(&param, gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
@@ -274,7 +281,8 @@ TEST(GpuHist, UniformSampling) {
  // Create an in-memory DMatrix.
  std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true));

-  auto gpair = GenerateRandomGradients(kRows);
+  linalg::Matrix<GradientPair> gpair({kRows}, Context{}.MakeCUDA().Ordinal());
+  gpair.Data()->Copy(GenerateRandomGradients(kRows));

  // Build a tree using the in-memory DMatrix.
  RegTree tree;
@@ -304,7 +312,8 @@ TEST(GpuHist, GradientBasedSampling) {
  // Create an in-memory DMatrix.
  std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true));

-  auto gpair = GenerateRandomGradients(kRows);
+  linalg::Matrix<GradientPair> gpair({kRows}, MakeCUDACtx(0).Ordinal());
+  gpair.Data()->Copy(GenerateRandomGradients(kRows));

  // Build a tree using the in-memory DMatrix.
  RegTree tree;
@@ -340,11 +349,12 @@ TEST(GpuHist, ExternalMemory) {
  // Create a single batch DMatrix.
  std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrix(kRows, kCols, 1, tmpdir.path + "/cache"));

-  auto gpair = GenerateRandomGradients(kRows);
+  Context ctx(MakeCUDACtx(0));
+  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Ordinal());
+  gpair.Data()->Copy(GenerateRandomGradients(kRows));

  // Build a tree using the in-memory DMatrix.
  RegTree tree;
-  Context ctx(MakeCUDACtx(0));
  HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
  UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
  // Build another tree using multiple ELLPACK pages.
@@ -377,12 +387,13 @@ TEST(GpuHist, ExternalMemoryWithSampling) {
  std::unique_ptr<DMatrix> dmat_ext(
      CreateSparsePageDMatrix(kRows, kCols, kRows / kPageSize, tmpdir.path + "/cache"));

-  auto gpair = GenerateRandomGradients(kRows);
+  Context ctx(MakeCUDACtx(0));
+  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Ordinal());
+  gpair.Data()->Copy(GenerateRandomGradients(kRows));

  // Build a tree using the in-memory DMatrix.
  auto rng = common::GlobalRandom();

-  Context ctx(MakeCUDACtx(0));
  RegTree tree;
  HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
  UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, kSubsample, kSamplingMethod, kRows);
@@ -408,14 +419,14 @@ TEST(GpuHist, ConfigIO) {
  std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_gpu_hist", &ctx, &task)};
  updater->Configure(Args{});

-  Json j_updater { Object() };
+  Json j_updater{Object{}};
  updater->SaveConfig(&j_updater);
-  ASSERT_TRUE(IsA<Object>(j_updater["gpu_hist_train_param"]));
+  ASSERT_TRUE(IsA<Object>(j_updater["hist_train_param"]));
  updater->LoadConfig(j_updater);

-  Json j_updater_roundtrip { Object() };
+  Json j_updater_roundtrip{Object{}};
  updater->SaveConfig(&j_updater_roundtrip);
-  ASSERT_TRUE(IsA<Object>(j_updater_roundtrip["gpu_hist_train_param"]));
+  ASSERT_TRUE(IsA<Object>(j_updater_roundtrip["hist_train_param"]));

  ASSERT_EQ(j_updater, j_updater_roundtrip);
 }
@@ -432,4 +443,54 @@ TEST(GpuHist, MaxDepth) {

  ASSERT_THROW({learner->UpdateOneIter(0, p_mat);}, dmlc::Error);
 }
+
+namespace {
+RegTree GetUpdatedTree(Context const* ctx, DMatrix* dmat) {  // One GPUHistMaker update on dmat.
+  ObjInfo task{ObjInfo::kRegression};
+  GPUHistMaker hist_maker{ctx, &task};
+  hist_maker.Configure(Args{});  // empty Args: nothing set explicitly
+
+  TrainParam param;
+  param.UpdateAllowUnknown(Args{});  // empty Args: nothing set explicitly
+
+  linalg::Matrix<GradientPair> gpair({dmat->Info().num_row_}, ctx->Ordinal());
+  gpair.Data()->Copy(GenerateRandomGradients(dmat->Info().num_row_));
+
+  std::vector<HostDeviceVector<bst_node_t>> position(1);  // size 1; one tree is passed below
+  RegTree tree;
+  hist_maker.Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
+                    {&tree});
+  return tree;  // returned by value to the caller
+}
+
+void VerifyColumnSplit(bst_row_t rows, bst_feature_t cols, RegTree const& expected_tree) {
+  Context ctx(MakeCUDACtx(GPUIDX));  // Per-rank check, passed to DoTest by the test below.
+
+  auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true);
+  auto const world_size = collective::GetWorldSize();
+  auto const rank = collective::GetRank();
+  std::unique_ptr<DMatrix> sliced{Xy->SliceCol(world_size, rank)};  // presumably this rank's share
+
+  RegTree tree = GetUpdatedTree(&ctx, sliced.get());  // update using the sliced matrix only
+
+  Json json{Object{}};
+  tree.SaveModel(&json);
+  Json expected_json{Object{}};
+  expected_tree.SaveModel(&expected_json);
+  ASSERT_EQ(json, expected_json);  // trees are compared through their saved JSON models
+}
+}  // anonymous namespace
+
+class MGPUHistTest : public BaseMGPUTest {};  // NOTE(review): DoTest presumably comes from base
+
+TEST_F(MGPUHistTest, GPUHistColumnSplit) {
+  auto constexpr kRows = 32;
+  auto constexpr kCols = 16;
+
+  Context ctx(MakeCUDACtx(0));
+  auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
+  RegTree expected_tree = GetUpdatedTree(&ctx, dmat.get());  // reference: unsliced data
+
+  DoTest(VerifyColumnSplit, kRows, kCols, expected_tree);  // presumably once per worker - confirm
+}
 }  // namespace xgboost::tree
--- a/tests/cpp/tree/test_histmaker.cc
+++ b/tests/cpp/tree/test_histmaker.cc
@@ -26,9 +26,11 @@ TEST(GrowHistMaker, InteractionConstraint) {
  auto constexpr kRows = 32;
  auto constexpr kCols = 16;
  auto p_dmat = GenerateDMatrix(kRows, kCols);
-  auto p_gradients = GenerateGradients(kRows);
-
  Context ctx;
+
+  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Ordinal());
+  gpair.Data()->Copy(GenerateRandomGradients(kRows));
+
  ObjInfo task{ObjInfo::kRegression};
  {
    // With constraints
@@ -39,7 +41,8 @@ TEST(GrowHistMaker, InteractionConstraint) {
    param.UpdateAllowUnknown(
        Args{{"interaction_constraints", "[[0, 1]]"}, {"num_feature", std::to_string(kCols)}});
    std::vector<HostDeviceVector<bst_node_t>> position(1);
-    updater->Update(&param, p_gradients.get(), p_dmat.get(), position, {&tree});
+    updater->Configure(Args{});
+    updater->Update(&param, &gpair, p_dmat.get(), position, {&tree});

    ASSERT_EQ(tree.NumExtraNodes(), 4);
    ASSERT_EQ(tree[0].SplitIndex(), 1);
@@ -55,7 +58,8 @@ TEST(GrowHistMaker, InteractionConstraint) {
    std::vector<HostDeviceVector<bst_node_t>> position(1);
    TrainParam param;
    param.Init(Args{});
-    updater->Update(&param, p_gradients.get(), p_dmat.get(), position, {&tree});
+    updater->Configure(Args{});
+    updater->Update(&param, &gpair, p_dmat.get(), position, {&tree});

    ASSERT_EQ(tree.NumExtraNodes(), 10);
    ASSERT_EQ(tree[0].SplitIndex(), 1);
@@ -68,9 +72,12 @@ TEST(GrowHistMaker, InteractionConstraint) {
 namespace {
 void VerifyColumnSplit(int32_t rows, bst_feature_t cols, bool categorical,
                       RegTree const& expected_tree) {
-  auto p_dmat = GenerateDMatrix(rows, cols, categorical);
-  auto p_gradients = GenerateGradients(rows);
  Context ctx;
+  auto p_dmat = GenerateDMatrix(rows, cols, categorical);
+  linalg::Matrix<GradientPair> gpair({rows}, ctx.Ordinal());
+  gpair.Data()->Copy(GenerateRandomGradients(rows));
+
+
  ObjInfo task{ObjInfo::kRegression};
  std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)};
  std::vector<HostDeviceVector<bst_node_t>> position(1);
@@ -81,7 +88,8 @@ void VerifyColumnSplit(int32_t rows, bst_feature_t cols, bool categorical,
  RegTree tree{1u, cols};
  TrainParam param;
  param.Init(Args{});
-  updater->Update(&param, p_gradients.get(), sliced.get(), position, {&tree});
+  updater->Configure(Args{});
+  updater->Update(&param, &gpair, sliced.get(), position, {&tree});

  Json json{Object{}};
  tree.SaveModel(&json);
@@ -97,14 +105,16 @@ void TestColumnSplit(bool categorical) {
  RegTree expected_tree{1u, kCols};
  ObjInfo task{ObjInfo::kRegression};
  {
-    auto p_dmat = GenerateDMatrix(kRows, kCols, categorical);
-    auto p_gradients = GenerateGradients(kRows);
    Context ctx;
+    auto p_dmat = GenerateDMatrix(kRows, kCols, categorical);
+    linalg::Matrix<GradientPair> gpair({kRows}, ctx.Ordinal());
+    gpair.Data()->Copy(GenerateRandomGradients(kRows));
    std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)};
    std::vector<HostDeviceVector<bst_node_t>> position(1);
    TrainParam param;
    param.Init(Args{});
-    updater->Update(&param, p_gradients.get(), p_dmat.get(), position, {&expected_tree});
+    updater->Configure(Args{});
+    updater->Update(&param, &gpair, p_dmat.get(), position, {&expected_tree});
  }

  auto constexpr kWorldSize = 2;
--- a/tests/cpp/tree/test_node_partition.cc
+++ b/tests/cpp/tree/test_node_partition.cc
@@ -6,7 +6,9 @@
 #include <xgboost/task.h>          // for ObjInfo
 #include <xgboost/tree_updater.h>  // for TreeUpdater

-#include <memory>                  // for unique_ptr
+#include <memory>  // for unique_ptr
+
+#include "../helpers.h"

 namespace xgboost {
 TEST(Updater, HasNodePosition) {
@@ -19,7 +21,7 @@ TEST(Updater, HasNodePosition) {
  ASSERT_TRUE(up->HasNodePosition());

 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
-  ctx.gpu_id = 0;
+  ctx = MakeCUDACtx(0);
  up.reset(TreeUpdater::Create("grow_gpu_hist", &ctx, &task));
  ASSERT_TRUE(up->HasNodePosition());
 #endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
--- a/Show More
+++ b/Show More