Merge branch 'master' into dev-hui

This commit is contained in:
amdsc21
2023-03-08 00:39:33 +01:00
221 changed files with 3122 additions and 1486 deletions

View File

@@ -23,10 +23,15 @@ case "${container}" in
gpu|rmm)
BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
if [[ $container == "rmm" ]]
then
BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION"
fi
;;
gpu_build_centos7|jvm_gpu_build)
BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION"
;;
*)

View File

@@ -15,7 +15,8 @@ fi
command_wrapper="tests/ci_build/ci_build.sh rmm docker --build-arg "`
`"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "`
`"RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
`"RAPIDS_VERSION_ARG=$RAPIDS_VERSION --build-arg "`
`"NCCL_VERSION_ARG=$NCCL_VERSION"
echo "--- Build libxgboost from the source"
$command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=gpu_test -DUSE_CUDA=ON \

View File

@@ -16,7 +16,8 @@ else
fi
command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg "`
`"CUDA_VERSION_ARG=$CUDA_VERSION"
`"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "`
`"NCCL_VERSION_ARG=$NCCL_VERSION"
echo "--- Build libxgboost from the source"
$command_wrapper tests/ci_build/prune_libnccl.sh

View File

@@ -14,5 +14,7 @@ else
fi
tests/ci_build/ci_build.sh jvm_gpu_build nvidia-docker \
--build-arg CUDA_VERSION_ARG=${CUDA_VERSION} tests/ci_build/build_jvm_packages.sh \
--build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \
--build-arg NCCL_VERSION_ARG=${NCCL_VERSION} \
tests/ci_build/build_jvm_packages.sh \
${SPARK_VERSION} -Duse.cuda=ON ${arch_flag}

View File

@@ -12,10 +12,10 @@ if ( $is_release_branch -eq 0 ) {
}
mkdir build
cd build
cmake .. -G"Visual Studio 15 2017 Win64" -DUSE_CUDA=ON -DCMAKE_VERBOSE_MAKEFILE=ON `
-DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_UNITY_BUILD=ON ${arch_flag}
cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DCMAKE_VERBOSE_MAKEFILE=ON `
-DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON ${arch_flag}
$msbuild = -join @(
"C:\\Program Files (x86)\\Microsoft Visual Studio\\2017\\Community\\MSBuild\\15.0"
"C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\MSBuild\\Current"
"\\Bin\\MSBuild.exe"
)
& $msbuild xgboost.sln /m /p:Configuration=Release /nodeReuse:false

View File

@@ -22,8 +22,9 @@ function set_buildkite_env_vars_in_container {
set -x
CUDA_VERSION=11.0.3
RAPIDS_VERSION=22.10
CUDA_VERSION=11.8.0
NCCL_VERSION=2.16.5-1
RAPIDS_VERSION=23.02
SPARK_VERSION=3.1.1
JDK_VERSION=8

View File

@@ -9,5 +9,6 @@ then
echo "--- Deploy JVM packages to xgboost-maven-repo S3 repo"
tests/ci_build/ci_build.sh jvm_gpu_build docker \
--build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \
--build-arg NCCL_VERSION_ARG=${NCCL_VERSION} \
tests/ci_build/deploy_jvm_packages.sh ${SPARK_VERSION}
fi

View File

@@ -2,12 +2,16 @@ import argparse
import copy
import os
import re
import sys
import boto3
import botocore
from metadata import AMI_ID, COMMON_STACK_PARAMS, STACK_PARAMS
current_dir = os.path.dirname(__file__)
sys.path.append(os.path.join(current_dir, ".."))
from common_blocks.utils import create_or_update_stack, wait
TEMPLATE_URL = "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml"
@@ -68,72 +72,7 @@ def get_full_stack_id(stack_id):
return f"buildkite-{stack_id}-autoscaling-group"
def stack_exists(args, *, stack_name):
client = boto3.client("cloudformation", region_name=args.aws_region)
waiter = client.get_waiter("stack_exists")
try:
waiter.wait(StackName=stack_name, WaiterConfig={"MaxAttempts": 1})
return True
except botocore.exceptions.WaiterError as e:
return False
def create_or_update_stack(
args, *, stack_name, template_url=None, template_body=None, params=None
):
kwargs = {
"StackName": stack_name,
"Capabilities": [
"CAPABILITY_IAM",
"CAPABILITY_NAMED_IAM",
"CAPABILITY_AUTO_EXPAND",
],
}
if template_url:
kwargs["TemplateURL"] = template_url
if template_body:
kwargs["TemplateBody"] = template_body
if params:
kwargs["Parameters"] = params
client = boto3.client("cloudformation", region_name=args.aws_region)
if stack_exists(args, stack_name=stack_name):
print(f"Stack {stack_name} already exists. Updating...")
try:
response = client.update_stack(**kwargs)
return {"StackName": stack_name, "Action": "update"}
except botocore.exceptions.ClientError as e:
if e.response["Error"]["Code"] == "ValidationError" and re.search(
"No updates are to be performed", e.response["Error"]["Message"]
):
print(f"No update was made to {stack_name}")
return {"StackName": stack_name, "Action": "noop"}
else:
raise e
else:
kwargs.update({"OnFailure": "ROLLBACK", "EnableTerminationProtection": False})
response = client.create_stack(**kwargs)
return {"StackName": stack_name, "Action": "create"}
def wait(promise):
client = boto3.client("cloudformation", region_name=args.aws_region)
stack_name = promise["StackName"]
print(f"Waiting for {stack_name}...")
if promise["Action"] == "create":
waiter = client.get_waiter("stack_create_complete")
waiter.wait(StackName=stack_name)
print(f"Finished creating stack {stack_name}")
elif promise["Action"] == "update":
waiter = client.get_waiter("stack_update_complete")
waiter.wait(StackName=stack_name)
print(f"Finished updating stack {stack_name}")
elif promise["Action"] != "noop":
raise ValueError(f"Invalid promise {promise}")
def create_agent_iam_policy(args):
def create_agent_iam_policy(args, *, client):
policy_stack_name = "buildkite-agent-iam-policy"
print(f"Creating stack {policy_stack_name} for agent IAM policy...")
with open(
@@ -142,9 +81,9 @@ def create_agent_iam_policy(args):
) as f:
policy_template = f.read()
promise = create_or_update_stack(
args, stack_name=policy_stack_name, template_body=policy_template
args, client=client, stack_name=policy_stack_name, template_body=policy_template
)
wait(promise)
wait(promise, client=client)
cf = boto3.resource("cloudformation", region_name=args.aws_region)
policy = cf.StackResource(policy_stack_name, "BuildkiteAgentManagedPolicy")
@@ -152,10 +91,10 @@ def create_agent_iam_policy(args):
def main(args):
agent_iam_policy = create_agent_iam_policy(args)
client = boto3.client("cloudformation", region_name=args.aws_region)
agent_iam_policy = create_agent_iam_policy(args, client=client)
promises = []
for stack_id in AMI_ID:
@@ -167,13 +106,17 @@ def main(args):
)
promise = create_or_update_stack(
args, stack_name=stack_id_full, template_url=TEMPLATE_URL, params=params
args,
client=client,
stack_name=stack_id_full,
template_url=TEMPLATE_URL,
params=params,
)
promises.append(promise)
print(f"CI stack {stack_id_full} is in progress in the background")
for promise in promises:
wait(promise)
wait(promise, client=client)
if __name__ == "__main__":

View File

@@ -1,27 +1,27 @@
AMI_ID = {
# Managed by XGBoost team
"linux-amd64-gpu": {
"us-west-2": "ami-00ed92bd37f77bc33",
"us-west-2": "ami-094271bed4788ddb5",
},
"linux-amd64-mgpu": {
"us-west-2": "ami-00ed92bd37f77bc33",
"us-west-2": "ami-094271bed4788ddb5",
},
"windows-gpu": {
"us-west-2": "ami-0a1a2ea551a07ad5f",
"us-west-2": "ami-0839681594a1d7627",
},
"windows-cpu": {
"us-west-2": "ami-0a1a2ea551a07ad5f",
"us-west-2": "ami-0839681594a1d7627",
},
# Managed by BuildKite
# from https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml
"linux-amd64-cpu": {
"us-west-2": "ami-075d4c25d5f0c17c1",
"us-west-2": "ami-00f2127550cf03658",
},
"pipeline-loader": {
"us-west-2": "ami-075d4c25d5f0c17c1",
"us-west-2": "ami-00f2127550cf03658",
},
"linux-arm64-cpu": {
"us-west-2": "ami-0952c6fb6db9a9891",
"us-west-2": "ami-0c5789068f4a2d1b5",
},
}

View File

@@ -0,0 +1,97 @@
import re
import boto3
import botocore
def stack_exists(args, *, stack_name):
    """Return True if a CloudFormation stack named *stack_name* exists.

    Uses the ``stack_exists`` waiter with a single attempt so the probe
    returns promptly instead of polling.

    :param args: parsed CLI namespace; only ``args.aws_region`` is read.
    :param stack_name: name of the CloudFormation stack to check.
    :return: ``True`` if the stack exists, ``False`` otherwise.
    """
    client = boto3.client("cloudformation", region_name=args.aws_region)
    waiter = client.get_waiter("stack_exists")
    try:
        waiter.wait(StackName=stack_name, WaiterConfig={"MaxAttempts": 1})
        return True
    except botocore.exceptions.WaiterError:
        # The waiter raises when the stack is not found within the single
        # attempt; treat that as "does not exist". (Dropped the unused
        # `as e` binding present in the original.)
        return False
def create_or_update_stack(
    args, *, client, stack_name, template_url=None, template_body=None, params=None
):
    """Create the stack if it is new, otherwise update it in place.

    :param args: parsed CLI namespace; only ``args.aws_region`` is read
        (via ``stack_exists``).
    :param client: a ``boto3`` CloudFormation client used for the
        create/update calls.
    :return: a "promise" dict ``{"StackName": ..., "Action": ...}``
        suitable for :func:`wait`; ``Action`` is one of ``"create"``,
        ``"update"``, or ``"noop"`` (update requested but nothing changed).
    """
    request = {
        "StackName": stack_name,
        "Capabilities": [
            "CAPABILITY_IAM",
            "CAPABILITY_NAMED_IAM",
            "CAPABILITY_AUTO_EXPAND",
        ],
    }
    # Only include the optional template/parameter fields that were given.
    if template_url:
        request["TemplateURL"] = template_url
    if template_body:
        request["TemplateBody"] = template_body
    if params:
        request["Parameters"] = params

    if not stack_exists(args, stack_name=stack_name):
        # Fresh stack: roll back on failure and leave it deletable.
        request["OnFailure"] = "ROLLBACK"
        request["EnableTerminationProtection"] = False
        client.create_stack(**request)
        return {"StackName": stack_name, "Action": "create"}

    print(f"Stack {stack_name} already exists. Updating...")
    try:
        client.update_stack(**request)
    except botocore.exceptions.ClientError as err:
        # CloudFormation reports "nothing to do" as a ValidationError;
        # translate that into a no-op promise instead of failing.
        no_change = err.response["Error"]["Code"] == "ValidationError" and re.search(
            "No updates are to be performed", err.response["Error"]["Message"]
        )
        if not no_change:
            raise
        print(f"No update was made to {stack_name}")
        return {"StackName": stack_name, "Action": "noop"}
    return {"StackName": stack_name, "Action": "update"}
def replace_stack(
    args, *, client, stack_name, template_url=None, template_body=None, params=None
):
    """Delete an existing stack and create a new stack with identical name.

    Unlike :func:`create_or_update_stack`, this forces a full re-creation,
    which is needed when a stack contains resources that cannot be updated
    in place.

    :param args: parsed CLI namespace; only ``args.aws_region`` is read
        (via ``stack_exists``).
    :param client: a ``boto3`` CloudFormation client.
    :return: a promise dict ``{"StackName": ..., "Action": "create"}``
        suitable for :func:`wait`.
    :raises ValueError: if the stack does not already exist.
    """
    if not stack_exists(args, stack_name=stack_name):
        raise ValueError(f"Stack {stack_name} does not exist")
    # Delete the old stack and block until the deletion has fully completed;
    # otherwise the subsequent create_stack() would collide on the name.
    # (Dropped the unused `r =` binding present in the original.)
    client.delete_stack(StackName=stack_name)
    delete_waiter = client.get_waiter("stack_delete_complete")
    delete_waiter.wait(StackName=stack_name)
    kwargs = {
        "StackName": stack_name,
        "Capabilities": [
            "CAPABILITY_IAM",
            "CAPABILITY_NAMED_IAM",
            "CAPABILITY_AUTO_EXPAND",
        ],
        "OnFailure": "ROLLBACK",
        "EnableTerminationProtection": False,
    }
    # Only include the optional template/parameter fields that were given.
    if template_url:
        kwargs["TemplateURL"] = template_url
    if template_body:
        kwargs["TemplateBody"] = template_body
    if params:
        kwargs["Parameters"] = params
    client.create_stack(**kwargs)
    return {"StackName": stack_name, "Action": "create"}
def wait(promise, *, client):
    """Block until the CloudFormation operation described by *promise* ends.

    :param promise: dict with ``"StackName"`` and ``"Action"`` keys, as
        returned by :func:`create_or_update_stack` or :func:`replace_stack`.
    :param client: a ``boto3`` CloudFormation client (unused for ``"noop"``).
    :raises ValueError: if the promise carries an unknown action.
    """
    stack_name = promise["StackName"]
    print(f"Waiting for {stack_name}...")
    action = promise["Action"]
    # Map each pending action to its completion waiter and past-tense verb.
    pending = {
        "create": ("stack_create_complete", "creating"),
        "update": ("stack_update_complete", "updating"),
    }
    if action in pending:
        waiter_name, verb = pending[action]
        client.get_waiter(waiter_name).wait(StackName=stack_name)
        print(f"Finished {verb} stack {stack_name}")
    elif action != "noop":
        raise ValueError(f"Invalid promise {promise}")

View File

@@ -2,6 +2,7 @@ import argparse
import copy
import json
import os
import sys
from urllib.request import urlopen
import boto3
@@ -9,6 +10,9 @@ import cfn_flip
from metadata import IMAGE_PARAMS
current_dir = os.path.dirname(__file__)
sys.path.append(os.path.join(current_dir, ".."))
from common_blocks.utils import replace_stack, wait
BUILDKITE_CF_TEMPLATE_URL = (
"https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml"
@@ -47,6 +51,9 @@ def main(args):
ami_mapping = get_ami_mapping()
client = boto3.client("cloudformation", region_name=args.aws_region)
promises = []
for stack_id in IMAGE_PARAMS:
stack_id_full = get_full_stack_id(stack_id)
print(f"Creating EC2 image builder stack {stack_id_full}...")
@@ -55,28 +62,20 @@ def main(args):
stack_id=stack_id, aws_region=args.aws_region, ami_mapping=ami_mapping
)
client = boto3.client("cloudformation", region_name=args.aws_region)
response = client.create_stack(
StackName=stack_id_full,
TemplateBody=ec2_image_pipeline_template,
Capabilities=[
"CAPABILITY_IAM",
"CAPABILITY_NAMED_IAM",
"CAPABILITY_AUTO_EXPAND",
],
OnFailure="ROLLBACK",
EnableTerminationProtection=False,
Parameters=params,
promise = replace_stack(
args,
client=client,
stack_name=stack_id_full,
template_body=ec2_image_pipeline_template,
params=params,
)
promises.append(promise)
print(
f"EC2 image builder stack {stack_id_full} is in progress in the background"
)
for stack_id in IMAGE_PARAMS:
stack_id_full = get_full_stack_id(stack_id)
waiter = client.get_waiter("stack_create_complete")
waiter.wait(StackName=stack_id_full)
print(f"EC2 image builder stack {stack_id_full} is now finished.")
for promise in promises:
wait(promise, client=client)
if __name__ == "__main__":

View File

@@ -58,7 +58,7 @@ Resources:
BootstrapComponent:
Type: AWS::ImageBuilder::Component
Properties:
Name: !Sub "${AWS::StackName}-bootstrap-component"
Name: !Join ["-", [!Ref AWS::StackName, "bootstrap-component", !Select [2, !Split ['/', !Ref AWS::StackId]]]]
Platform: !Ref InstanceOperatingSystem
Version: "1.0.0"
Description: Execute a bootstrap script.
@@ -67,7 +67,7 @@ Resources:
Recipe:
Type: AWS::ImageBuilder::ImageRecipe
Properties:
Name: !Sub "${AWS::StackName}-image"
Name: !Join ["-", [!Ref AWS::StackName, "image", !Select [2, !Split ['/', !Ref AWS::StackId]]]]
Components:
- ComponentArn: !Ref BootstrapComponent
ParentImage: !Ref BaseImageId
@@ -83,7 +83,7 @@ Resources:
Infrastructure:
Type: AWS::ImageBuilder::InfrastructureConfiguration
Properties:
Name: !Sub "${AWS::StackName}-image-pipeline-infrastructure"
Name: !Join ["-", [!Ref AWS::StackName, "image-pipeline-infrastructure", !Select [2, !Split ['/', !Ref AWS::StackId]]]]
InstanceProfileName: !Ref InstanceProfile
InstanceTypes:
- !Ref InstanceType
@@ -93,7 +93,7 @@ Resources:
Distribution:
Type: AWS::ImageBuilder::DistributionConfiguration
Properties:
Name: !Sub "${AWS::StackName}-image-pipeline-distribution-config"
Name: !Join ["-", [!Ref AWS::StackName, "image-pipeline-distribution-config", !Select [2, !Split ['/', !Ref AWS::StackId]]]]
Distributions:
- Region: !Ref AWS::Region
AmiDistributionConfiguration: {}
@@ -102,7 +102,7 @@ Resources:
Pipeline:
Type: AWS::ImageBuilder::ImagePipeline
Properties:
Name: !Sub "${AWS::StackName}-image-pipeline"
Name: !Join ["-", [!Ref AWS::StackName, "image-pipeline", !Select [2, !Split ['/', !Ref AWS::StackId]]]]
DistributionConfigurationArn: !Ref Distribution
ImageRecipeArn: !Ref Recipe
InfrastructureConfigurationArn: !Ref Infrastructure

View File

@@ -13,6 +13,6 @@ IMAGE_PARAMS = {
"BootstrapScript": "windows-gpu-bootstrap.yml",
"InstanceType": "g4dn.2xlarge",
"InstanceOperatingSystem": "Windows",
"VolumeSize": "80", # in GiBs
"VolumeSize": "120", # in GiBs
},
}

View File

@@ -15,9 +15,9 @@ phases:
choco --version
choco feature enable -n=allowGlobalConfirmation
# CMake 3.18
Write-Host '>>> Installing CMake 3.18...'
choco install cmake --version 3.18.0 --installargs "ADD_CMAKE_TO_PATH=System"
# CMake 3.25
Write-Host '>>> Installing CMake 3.25...'
choco install cmake --version 3.25.2 --installargs "ADD_CMAKE_TO_PATH=System"
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Notepad++
@@ -45,18 +45,18 @@ phases:
choco install graphviz
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Install Visual Studio Community 2017 (15.9)
Write-Host '>>> Installing Visual Studio 2017 Community (15.9)...'
choco install visualstudio2017community --version 15.9.23.0 `
# Install Visual Studio 2022 Community
Write-Host '>>> Installing Visual Studio 2022 Community...'
choco install visualstudio2022community `
--params "--wait --passive --norestart"
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
choco install visualstudio2017-workload-nativedesktop --params `
choco install visualstudio2022-workload-nativedesktop --params `
"--wait --passive --norestart --includeOptional"
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Install CUDA 11.0
Write-Host '>>> Installing CUDA 11.0...'
choco install cuda --version 11.0.3
# Install CUDA 11.8
Write-Host '>>> Installing CUDA 11.8...'
choco install cuda --version=11.8.0.52206
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Install Python packages

View File

@@ -20,4 +20,5 @@ tests/ci_build/ci_build.sh gpu nvidia-docker \
# tests/ci_build/ci_build.sh rmm nvidia-docker \
# --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
# --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION bash -c \
# --build-arg NCCL_VERSION_ARG=$NCCL_VERSION bash -c \
# "source activate gpu_test && build/testxgboost --use-rmm-pool"

View File

@@ -8,15 +8,15 @@ RUN \
yum install -y tar unzip wget xz git centos-release-scl-rh yum-utils && \
yum-config-manager --enable centos-sclo-rh-testing && \
yum update -y && \
yum install -y devtoolset-7 && \
yum install -y devtoolset-9 && \
# Python
wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-aarch64.sh && \
bash conda.sh -b -p /opt/mambaforge
ENV PATH=/opt/mambaforge/bin:$PATH
ENV CC=/opt/rh/devtoolset-7/root/usr/bin/gcc
ENV CXX=/opt/rh/devtoolset-7/root/usr/bin/c++
ENV CPP=/opt/rh/devtoolset-7/root/usr/bin/cpp
ENV CC=/opt/rh/devtoolset-9/root/usr/bin/gcc
ENV CXX=/opt/rh/devtoolset-9/root/usr/bin/c++
ENV CPP=/opt/rh/devtoolset-9/root/usr/bin/cpp
ENV GOSU_VERSION 1.10
# Create new Conda environment

View File

@@ -1,5 +1,5 @@
ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu18.04
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu20.04
ARG CUDA_VERSION_ARG
# Environment
@@ -7,21 +7,21 @@ ENV DEBIAN_FRONTEND noninteractive
# Install all basic requirements
RUN \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
apt-get update && \
apt-get install -y tar unzip wget git build-essential python3 python3-pip software-properties-common \
apt-transport-https ca-certificates gnupg-agent && \
wget -nv -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \
add-apt-repository -u 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main' && \
add-apt-repository -u 'deb http://apt.llvm.org/focal/ llvm-toolchain-focal-15 main' && \
apt-get update && \
apt-get install -y llvm-11 clang-tidy-11 clang-11 && \
wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr
apt-get install -y llvm-15 clang-tidy-15 clang-15 libomp-15-dev && \
wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr
# Set default clang-tidy version
RUN \
update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-11 100 && \
update-alternatives --install /usr/bin/clang clang /usr/bin/clang-11 100
update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-15 100 && \
update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 100
# Install Python packages
RUN \

View File

@@ -1,4 +1,4 @@
FROM ubuntu:18.04
FROM ubuntu:22.04
# Environment
ENV DEBIAN_FRONTEND noninteractive
@@ -10,18 +10,15 @@ RUN \
apt-get install -y software-properties-common && \
add-apt-repository ppa:ubuntu-toolchain-r/test && \
apt-get update && \
apt-get install -y tar unzip wget git build-essential doxygen graphviz llvm libasan2 libidn11 ninja-build gcc-8 g++-8 openjdk-8-jdk-headless && \
# CMake
wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
apt-get install -y tar unzip wget git build-essential doxygen graphviz llvm libidn12 cmake ninja-build gcc-9 g++-9 openjdk-8-jdk-headless && \
# Python
wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \
bash conda.sh -b -p /opt/mambaforge
ENV PATH=/opt/mambaforge/bin:$PATH
ENV CC=gcc-8
ENV CXX=g++-8
ENV CPP=cpp-8
ENV CC=gcc-9
ENV CXX=g++-9
ENV CPP=cpp-9
ENV GOSU_VERSION 1.10
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/

View File

@@ -22,10 +22,10 @@ ENV PATH=/opt/mambaforge/bin:$PATH
RUN \
conda install -c conda-forge mamba && \
mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
python=3.9 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
dask dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
pyspark cloudpickle cuda-python=11.7.0 && \
pyspark cloudpickle cuda-python && \
mamba clean --all && \
conda run --no-capture-output -n gpu_test pip install buildkite-test-collector

View File

@@ -1,6 +1,7 @@
ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
ARG CUDA_VERSION_ARG
ARG NCCL_VERSION_ARG
# Install all basic requirements
RUN \
@@ -9,7 +10,7 @@ RUN \
yum install -y epel-release centos-release-scl && \
yum-config-manager --enable centos-sclo-rh-testing && \
yum -y update && \
yum install -y tar unzip wget xz git which ninja-build devtoolset-8-gcc devtoolset-8-binutils devtoolset-8-gcc-c++ && \
yum install -y tar unzip wget xz git which ninja-build devtoolset-9-gcc devtoolset-9-binutils devtoolset-9-gcc-c++ && \
# Python
wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \
bash conda.sh -b -p /opt/mambaforge && \
@@ -21,7 +22,7 @@ RUN \
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
RUN \
export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
export NCCL_VERSION=2.13.4-1 && \
export NCCL_VERSION=$NCCL_VERSION_ARG && \
wget -nv -nc https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
yum -y update && \
@@ -29,9 +30,9 @@ RUN \
rm -f nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm;
ENV PATH=/opt/mambaforge/bin:/usr/local/ninja:$PATH
ENV CC=/opt/rh/devtoolset-8/root/usr/bin/gcc
ENV CXX=/opt/rh/devtoolset-8/root/usr/bin/c++
ENV CPP=/opt/rh/devtoolset-8/root/usr/bin/cpp
ENV CC=/opt/rh/devtoolset-9/root/usr/bin/gcc
ENV CXX=/opt/rh/devtoolset-9/root/usr/bin/c++
ENV CPP=/opt/rh/devtoolset-9/root/usr/bin/cpp
ENV GOSU_VERSION 1.10

View File

@@ -12,16 +12,16 @@ RUN \
yum install -y tar unzip wget xz git which ninja-build readline-devel libX11-devel libXt-devel \
xorg-x11-server-devel openssl-devel zlib-devel bzip2-devel xz-devel \
pcre-devel libcurl-devel texlive-* \
devtoolset-8-gcc devtoolset-8-binutils devtoolset-8-gcc-c++ \
devtoolset-8-gcc-gfortran devtoolset-8-libquadmath-devel \
devtoolset-8-runtime devtoolset-8-libstdc++-devel
devtoolset-9-gcc devtoolset-9-binutils devtoolset-9-gcc-c++ \
devtoolset-9-gcc-gfortran devtoolset-9-libquadmath-devel \
devtoolset-9-runtime devtoolset-9-libstdc++-devel
ENV PATH=/opt/mambaforge/bin:/usr/local/ninja:/opt/software/packages/bin:/opt/R/3.3.0/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/software/packages/lib:/opt/R/3.3.0/lib64:$LD_LIBRARY_PATH
ENV CC=/opt/rh/devtoolset-8/root/usr/bin/gcc
ENV CXX=/opt/rh/devtoolset-8/root/usr/bin/c++
ENV CPP=/opt/rh/devtoolset-8/root/usr/bin/cpp
ENV F77=/opt/rh/devtoolset-8/root/usr/bin/gfortran
ENV CC=/opt/rh/devtoolset-9/root/usr/bin/gcc
ENV CXX=/opt/rh/devtoolset-9/root/usr/bin/c++
ENV CPP=/opt/rh/devtoolset-9/root/usr/bin/cpp
ENV F77=/opt/rh/devtoolset-9/root/usr/bin/gfortran
# R 3.3.0
RUN \
@@ -36,8 +36,8 @@ RUN \
bash conda.sh -b -p /opt/mambaforge && \
/opt/mambaforge/bin/python -m pip install auditwheel awscli && \
# CMake
wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr
wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr
ENV GOSU_VERSION 1.10

View File

@@ -6,23 +6,23 @@ RUN \
yum-config-manager --enable centos-sclo-rh-testing && \
yum -y update && \
yum install -y tar unzip make bzip2 wget xz git which ninja-build java-1.8.0-openjdk-devel \
devtoolset-8-gcc devtoolset-8-binutils devtoolset-8-gcc-c++ \
devtoolset-8-runtime devtoolset-8-libstdc++-devel && \
devtoolset-9-gcc devtoolset-9-binutils devtoolset-9-gcc-c++ \
devtoolset-9-runtime devtoolset-9-libstdc++-devel && \
# Python
wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \
bash conda.sh -b -p /opt/mambaforge && \
# CMake
wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
# Maven
wget -nv -nc https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
ln -s /opt/apache-maven-3.6.1/ /opt/maven
ENV PATH=/opt/mambaforge/bin:/opt/maven/bin:$PATH
ENV CC=/opt/rh/devtoolset-8/root/usr/bin/gcc
ENV CXX=/opt/rh/devtoolset-8/root/usr/bin/c++
ENV CPP=/opt/rh/devtoolset-8/root/usr/bin/cpp
ENV CC=/opt/rh/devtoolset-9/root/usr/bin/gcc
ENV CXX=/opt/rh/devtoolset-9/root/usr/bin/c++
ENV CPP=/opt/rh/devtoolset-9/root/usr/bin/cpp
# Install Python packages
RUN \

View File

@@ -1,6 +1,7 @@
ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
ARG CUDA_VERSION_ARG
ARG NCCL_VERSION_ARG
# Install all basic requirements
RUN \
@@ -9,13 +10,13 @@ RUN \
yum install -y epel-release centos-release-scl && \
yum-config-manager --enable centos-sclo-rh-testing && \
yum -y update && \
yum install -y tar unzip wget xz git which ninja-build java-1.8.0-openjdk-devel devtoolset-8-gcc devtoolset-8-binutils devtoolset-8-gcc-c++ && \
yum install -y tar unzip wget xz git which ninja-build java-1.8.0-openjdk-devel devtoolset-9-gcc devtoolset-9-binutils devtoolset-9-gcc-c++ && \
# Python
wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \
bash conda.sh -b -p /opt/mambaforge && \
# CMake
wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
# Maven
wget -nv -nc https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
@@ -24,15 +25,15 @@ RUN \
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
RUN \
export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
export NCCL_VERSION=2.13.4-1 && \
export NCCL_VERSION=$NCCL_VERSION_ARG && \
yum-config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
yum -y update && \
yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-static-${NCCL_VERSION}+cuda${CUDA_SHORT}
ENV PATH=/opt/mambaforge/bin:/opt/maven/bin:$PATH
ENV CC=/opt/rh/devtoolset-8/root/usr/bin/gcc
ENV CXX=/opt/rh/devtoolset-8/root/usr/bin/c++
ENV CPP=/opt/rh/devtoolset-8/root/usr/bin/cpp
ENV CC=/opt/rh/devtoolset-9/root/usr/bin/gcc
ENV CXX=/opt/rh/devtoolset-9/root/usr/bin/c++
ENV CPP=/opt/rh/devtoolset-9/root/usr/bin/cpp
# Install Python packages
RUN \

View File

@@ -1,7 +1,8 @@
ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu18.04
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu20.04
ARG CUDA_VERSION_ARG
ARG RAPIDS_VERSION_ARG
ARG NCCL_VERSION_ARG
# Environment
ENV DEBIAN_FRONTEND noninteractive
@@ -19,7 +20,7 @@ RUN \
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
RUN \
export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
export NCCL_VERSION=2.13.4-1 && \
export NCCL_VERSION=$NCCL_VERSION_ARG && \
apt-get update && \
apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT}
@@ -29,7 +30,7 @@ ENV PATH=/opt/mambaforge/bin:$PATH
RUN \
conda install -c conda-forge mamba && \
mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
python=3.9 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake && \
python=3.10 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake && \
mamba clean --all
ENV GOSU_VERSION 1.10

View File

@@ -18,7 +18,7 @@ mv xgboost/ xgboost_rpack/
mkdir build
cd build
cmake .. -G"Visual Studio 15 2017 Win64" -DUSE_CUDA=ON -DR_LIB=ON -DLIBR_HOME="c:\\Program Files\\R\\R-3.6.3"
cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DR_LIB=ON -DLIBR_HOME="c:\\Program Files\\R\\R-3.6.3"
cmake --build . --config Release --parallel
cd ..

View File

@@ -3,12 +3,15 @@ import os
import subprocess
import sys
from multiprocessing import Pool, cpu_count
from typing import Dict, Tuple
from typing import Dict, Optional, Tuple
from pylint import epylint
from test_utils import PY_PACKAGE, ROOT, cd, print_time, record_time
CURDIR = os.path.normpath(os.path.abspath(os.path.dirname(__file__)))
SRCPATH = os.path.normpath(
os.path.join(CURDIR, os.path.pardir, os.path.pardir, "python-package")
)
@record_time
@@ -29,7 +32,7 @@ Please run the following command on your machine to address the formatting error
@record_time
def run_isort(rel_path: str) -> bool:
cmd = ["isort", "--check", "--profile=black", rel_path]
cmd = ["isort", f"--src={SRCPATH}", "--check", "--profile=black", rel_path]
ret = subprocess.run(cmd).returncode
if ret != 0:
subprocess.run(["isort", "--version"])
@@ -151,6 +154,7 @@ def main(args: argparse.Namespace) -> None:
"demo/guide-python/sklearn_parallel.py",
"demo/guide-python/spark_estimator_examples.py",
"demo/guide-python/individual_trees.py",
"demo/guide-python/quantile_regression.py",
# CI
"tests/ci_build/lint_python.py",
"tests/ci_build/test_r_package.py",
@@ -193,6 +197,7 @@ def main(args: argparse.Namespace) -> None:
"demo/guide-python/cat_in_the_dat.py",
"demo/guide-python/feature_weights.py",
"demo/guide-python/individual_trees.py",
"demo/guide-python/quantile_regression.py",
# tests
"tests/python/test_dt.py",
"tests/python/test_data_iterator.py",

View File

@@ -109,6 +109,10 @@ class ClangTidy(object):
continue
elif components[i] == '-rdynamic':
continue
elif components[i] == "-Xfatbin=-compress-all":
continue
elif components[i] == "-forward-unknown-to-host-compiler":
continue
elif (components[i] == '-x' and
components[i+1] == 'cu'):
# -x cu -> -x cuda

View File

@@ -267,7 +267,7 @@ TEST(CAPI, DMatrixSetFeatureName) {
}
char const* feat_types [] {"i", "q"};
static_assert(sizeof(feat_types)/ sizeof(feat_types[0]) == kCols, "");
static_assert(sizeof(feat_types) / sizeof(feat_types[0]) == kCols);
XGDMatrixSetStrFeatureInfo(handle, "feature_type", feat_types, kCols);
char const **c_out_types;
XGDMatrixGetStrFeatureInfo(handle, u8"feature_type", &out_len,

View File

@@ -0,0 +1,35 @@
/**
* Copyright 2020-2023 by XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <xgboost/context.h> // Context
#include <xgboost/span.h>
#include <algorithm> // is_sorted
#include "../../../src/common/algorithm.h"
namespace xgboost {
namespace common {
// Verify ArgSort returns the indices that would sort the input ascending:
// for inputs {3, 2, 1} that order is element 2, then 1, then 0.
TEST(Algorithm, ArgSort) {
Context ctx;
std::vector<float> inputs{3.0, 2.0, 1.0};
auto ret = ArgSort<bst_feature_t>(&ctx, inputs.cbegin(), inputs.cend());
std::vector<bst_feature_t> sol{2, 1, 0};
ASSERT_EQ(ret, sol);
}
// Exercise Sort and StableSort on a small vector and check the result is
// ascending under std::less. NOTE(review): the "nthread" arg presumably
// enables a parallel code path — confirm against Context::Init.
TEST(Algorithm, Sort) {
Context ctx;
ctx.Init(Args{{"nthread", "8"}});
std::vector<float> inputs{3.0, 1.0, 2.0};
Sort(&ctx, inputs.begin(), inputs.end(), std::less<>{});
ASSERT_TRUE(std::is_sorted(inputs.cbegin(), inputs.cend()));
// Re-seed the input so StableSort is tested from the same unsorted state.
inputs = {3.0, 1.0, 2.0};
StableSort(&ctx, inputs.begin(), inputs.end(), std::less<>{});
ASSERT_TRUE(std::is_sorted(inputs.cbegin(), inputs.cend()));
}
} // namespace common
} // namespace xgboost

View File

@@ -52,9 +52,9 @@ void TestSegmentedArgSort() {
}
}
TEST(Algorithms, SegmentedArgSort) { TestSegmentedArgSort(); }
TEST(Algorithm, SegmentedArgSort) { TestSegmentedArgSort(); }
TEST(Algorithms, ArgSort) {
TEST(Algorithm, GpuArgSort) {
Context ctx;
ctx.gpu_id = 0;
@@ -80,7 +80,7 @@ TEST(Algorithms, ArgSort) {
thrust::is_sorted(sorted_idx.begin() + 10, sorted_idx.end(), thrust::greater<size_t>{}));
}
TEST(Algorithms, SegmentedSequence) {
TEST(Algorithm, SegmentedSequence) {
dh::device_vector<std::size_t> idx(16);
dh::device_vector<std::size_t> ptr(3);
Context ctx = CreateEmptyGenericParam(0);

View File

@@ -128,7 +128,7 @@ TEST(Ryu, Regression) {
TestRyu("2E2", 200.0f);
TestRyu("3.3554432E7", 3.3554432E7f);
static_assert(1.1920929E-7f == std::numeric_limits<float>::epsilon(), "");
static_assert(1.1920929E-7f == std::numeric_limits<float>::epsilon());
TestRyu("1.1920929E-7", std::numeric_limits<float>::epsilon());
}

View File

@@ -1,14 +0,0 @@
#include <gtest/gtest.h>
#include <xgboost/span.h>
#include "../../../src/common/common.h"
namespace xgboost {
namespace common {
TEST(ArgSort, Basic) {
std::vector<float> inputs {3.0, 2.0, 1.0};
auto ret = ArgSort<bst_feature_t>(Span<float>{inputs});
std::vector<bst_feature_t> sol{2, 1, 0};
ASSERT_EQ(ret, sol);
}
} // namespace common
} // namespace xgboost

View File

@@ -43,8 +43,8 @@ TEST(GroupData, ParallelGroupBuilder) {
builder2.Push(2, Entry(0, 4), 0);
builder2.Push(2, Entry(1, 5), 0);
expected_data.emplace_back(Entry(0, 4));
expected_data.emplace_back(Entry(1, 5));
expected_data.emplace_back(0, 4);
expected_data.emplace_back(1, 5);
expected_offsets.emplace_back(6);
EXPECT_EQ(data, expected_data);

View File

@@ -143,7 +143,7 @@ void TestMixedSketch() {
size_t n_samples = 1000, n_features = 2, n_categories = 3;
std::vector<float> data(n_samples * n_features);
SimpleLCG gen;
SimpleRealUniformDistribution<float> cat_d{0.0f, float(n_categories)};
SimpleRealUniformDistribution<float> cat_d{0.0f, static_cast<float>(n_categories)};
SimpleRealUniformDistribution<float> num_d{0.0f, 3.0f};
for (size_t i = 0; i < n_samples * n_features; ++i) {
if (i % 2 == 0) {

View File

@@ -13,9 +13,9 @@ class NotCopyConstructible {
NotCopyConstructible(NotCopyConstructible&& that) = default;
};
static_assert(
!std::is_trivially_copy_constructible<NotCopyConstructible>::value, "");
!std::is_trivially_copy_constructible<NotCopyConstructible>::value);
static_assert(
!std::is_trivially_copy_assignable<NotCopyConstructible>::value, "");
!std::is_trivially_copy_assignable<NotCopyConstructible>::value);
class ForIntrusivePtrTest {
public:

View File

@@ -1,22 +1,23 @@
/*!
* Copyright 2021 by XGBoost Contributors
/**
* Copyright 2021-2023 by XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <xgboost/context.h>
#include <xgboost/host_device_vector.h>
#include <xgboost/linalg.h>
#include <numeric>
#include <cstddef> // size_t
#include <numeric> // iota
#include <vector>
#include "../../../src/common/linalg_op.h"
namespace xgboost {
namespace linalg {
namespace xgboost::linalg {
namespace {
auto kCpuId = Context::kCpuId;
}
auto MakeMatrixFromTest(HostDeviceVector<float> *storage, size_t n_rows, size_t n_cols) {
auto MakeMatrixFromTest(HostDeviceVector<float> *storage, std::size_t n_rows, std::size_t n_cols) {
storage->Resize(n_rows * n_cols);
auto &h_storage = storage->HostVector();
@@ -48,10 +49,11 @@ TEST(Linalg, VectorView) {
}
TEST(Linalg, TensorView) {
Context ctx;
std::vector<double> data(2 * 3 * 4, 0);
std::iota(data.begin(), data.end(), 0);
auto t = MakeTensorView(data, {2, 3, 4}, -1);
auto t = MakeTensorView(&ctx, data, 2, 3, 4);
ASSERT_EQ(t.Shape()[0], 2);
ASSERT_EQ(t.Shape()[1], 3);
ASSERT_EQ(t.Shape()[2], 4);
@@ -106,12 +108,12 @@ TEST(Linalg, TensorView) {
{
// Don't assign the initial dimension, tensor should be able to deduce the correct dim
// for Slice.
auto t = MakeTensorView(data, {2, 3, 4}, 0);
auto t = MakeTensorView(&ctx, data, 2, 3, 4);
auto s = t.Slice(1, 2, All());
static_assert(decltype(s)::kDimension == 1, "");
static_assert(decltype(s)::kDimension == 1);
}
{
auto t = MakeTensorView(data, {2, 3, 4}, 0);
auto t = MakeTensorView(&ctx, data, 2, 3, 4);
auto s = t.Slice(1, linalg::All(), 1);
ASSERT_EQ(s(0), 13);
ASSERT_EQ(s(1), 17);
@@ -119,9 +121,9 @@ TEST(Linalg, TensorView) {
}
{
// range slice
auto t = MakeTensorView(data, {2, 3, 4}, 0);
auto t = MakeTensorView(&ctx, data, 2, 3, 4);
auto s = t.Slice(linalg::All(), linalg::Range(1, 3), 2);
static_assert(decltype(s)::kDimension == 2, "");
static_assert(decltype(s)::kDimension == 2);
std::vector<double> sol{6, 10, 18, 22};
auto k = 0;
for (size_t i = 0; i < s.Shape(0); ++i) {
@@ -134,9 +136,9 @@ TEST(Linalg, TensorView) {
}
{
// range slice
auto t = MakeTensorView(data, {2, 3, 4}, 0);
auto t = MakeTensorView(&ctx, data, 2, 3, 4);
auto s = t.Slice(1, linalg::Range(1, 3), linalg::Range(1, 3));
static_assert(decltype(s)::kDimension == 2, "");
static_assert(decltype(s)::kDimension == 2);
std::vector<double> sol{17, 18, 21, 22};
auto k = 0;
for (size_t i = 0; i < s.Shape(0); ++i) {
@@ -149,9 +151,9 @@ TEST(Linalg, TensorView) {
}
{
// same as no slice.
auto t = MakeTensorView(data, {2, 3, 4}, 0);
auto t = MakeTensorView(&ctx, data, 2, 3, 4);
auto s = t.Slice(linalg::All(), linalg::Range(0, 3), linalg::Range(0, 4));
static_assert(decltype(s)::kDimension == 3, "");
static_assert(decltype(s)::kDimension == 3);
auto all = t.Slice(linalg::All(), linalg::All(), linalg::All());
for (size_t i = 0; i < s.Shape(0); ++i) {
for (size_t j = 0; j < s.Shape(1); ++j) {
@@ -166,7 +168,7 @@ TEST(Linalg, TensorView) {
{
// copy and move constructor.
auto t = MakeTensorView(data, {2, 3, 4}, kCpuId);
auto t = MakeTensorView(&ctx, data, 2, 3, 4);
auto from_copy = t;
auto from_move = std::move(t);
for (size_t i = 0; i < t.Shape().size(); ++i) {
@@ -177,7 +179,7 @@ TEST(Linalg, TensorView) {
{
// multiple slices
auto t = MakeTensorView(data, {2, 3, 4}, kCpuId);
auto t = MakeTensorView(&ctx, data, 2, 3, 4);
auto s_0 = t.Slice(linalg::All(), linalg::Range(0, 2), linalg::Range(1, 4));
ASSERT_FALSE(s_0.CContiguous());
auto s_1 = s_0.Slice(1, 1, linalg::Range(0, 2));
@@ -208,7 +210,7 @@ TEST(Linalg, TensorView) {
TEST(Linalg, Tensor) {
{
Tensor<float, 3> t{{2, 3, 4}, kCpuId};
Tensor<float, 3> t{{2, 3, 4}, kCpuId, Order::kC};
auto view = t.View(kCpuId);
auto const &as_const = t;
@@ -227,7 +229,7 @@ TEST(Linalg, Tensor) {
}
{
// Reshape
Tensor<float, 3> t{{2, 3, 4}, kCpuId};
Tensor<float, 3> t{{2, 3, 4}, kCpuId, Order::kC};
t.Reshape(4, 3, 2);
ASSERT_EQ(t.Size(), 24);
ASSERT_EQ(t.Shape(2), 2);
@@ -245,7 +247,7 @@ TEST(Linalg, Tensor) {
TEST(Linalg, Empty) {
{
auto t = TensorView<double, 2>{{}, {0, 3}, kCpuId};
auto t = TensorView<double, 2>{{}, {0, 3}, kCpuId, Order::kC};
for (int32_t i : {0, 1, 2}) {
auto s = t.Slice(All(), i);
ASSERT_EQ(s.Size(), 0);
@@ -254,7 +256,7 @@ TEST(Linalg, Empty) {
}
}
{
auto t = Tensor<double, 2>{{0, 3}, kCpuId};
auto t = Tensor<double, 2>{{0, 3}, kCpuId, Order::kC};
ASSERT_EQ(t.Size(), 0);
auto view = t.View(kCpuId);
@@ -269,7 +271,7 @@ TEST(Linalg, Empty) {
TEST(Linalg, ArrayInterface) {
auto cpu = kCpuId;
auto t = Tensor<double, 2>{{3, 3}, cpu};
auto t = Tensor<double, 2>{{3, 3}, cpu, Order::kC};
auto v = t.View(cpu);
std::iota(v.Values().begin(), v.Values().end(), 0);
auto arr = Json::Load(StringView{ArrayInterfaceStr(v)});
@@ -313,21 +315,48 @@ TEST(Linalg, Popc) {
}
TEST(Linalg, Stack) {
Tensor<float, 3> l{{2, 3, 4}, kCpuId};
Tensor<float, 3> l{{2, 3, 4}, kCpuId, Order::kC};
ElementWiseTransformHost(l.View(kCpuId), omp_get_max_threads(),
[=](size_t i, float) { return i; });
Tensor<float, 3> r_0{{2, 3, 4}, kCpuId};
Tensor<float, 3> r_0{{2, 3, 4}, kCpuId, Order::kC};
ElementWiseTransformHost(r_0.View(kCpuId), omp_get_max_threads(),
[=](size_t i, float) { return i; });
Stack(&l, r_0);
Tensor<float, 3> r_1{{0, 3, 4}, kCpuId};
Tensor<float, 3> r_1{{0, 3, 4}, kCpuId, Order::kC};
Stack(&l, r_1);
ASSERT_EQ(l.Shape(0), 4);
Stack(&r_1, l);
ASSERT_EQ(r_1.Shape(0), l.Shape(0));
}
} // namespace linalg
} // namespace xgboost
TEST(Linalg, FOrder) {
std::size_t constexpr kRows = 16, kCols = 3;
std::vector<float> data(kRows * kCols);
MatrixView<float> mat{data, {kRows, kCols}, Context::kCpuId, Order::kF};
float k{0};
for (std::size_t i = 0; i < kRows; ++i) {
for (std::size_t j = 0; j < kCols; ++j) {
mat(i, j) = k;
k++;
}
}
auto column = mat.Slice(linalg::All(), 1);
ASSERT_TRUE(column.FContiguous());
ASSERT_EQ(column.Stride(0), 1);
ASSERT_TRUE(column.CContiguous());
k = 1;
for (auto it = linalg::cbegin(column); it != linalg::cend(column); ++it) {
ASSERT_EQ(*it, k);
k += kCols;
}
k = 1;
auto ptr = column.Values().data();
for (auto it = ptr; it != ptr + kRows; ++it) {
ASSERT_EQ(*it, k);
k += kCols;
}
}
} // namespace xgboost::linalg

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2021-2022 by XGBoost Contributors
/**
* Copyright 2021-2023 by XGBoost Contributors
*/
#include <gtest/gtest.h>
@@ -7,8 +7,7 @@
#include "xgboost/context.h"
#include "xgboost/linalg.h"
namespace xgboost {
namespace linalg {
namespace xgboost::linalg {
namespace {
void TestElementWiseKernel() {
Tensor<float, 3> l{{2, 3, 4}, 0};
@@ -55,12 +54,14 @@ void TestElementWiseKernel() {
}
void TestSlice() {
Context ctx;
ctx.gpu_id = 1;
thrust::device_vector<double> data(2 * 3 * 4);
auto t = MakeTensorView(dh::ToSpan(data), {2, 3, 4}, 0);
auto t = MakeTensorView(&ctx, dh::ToSpan(data), 2, 3, 4);
dh::LaunchN(1, [=] __device__(size_t) {
auto s = t.Slice(linalg::All(), linalg::Range(0, 3), linalg::Range(0, 4));
auto all = t.Slice(linalg::All(), linalg::All(), linalg::All());
static_assert(decltype(s)::kDimension == 3, "");
static_assert(decltype(s)::kDimension == 3);
for (size_t i = 0; i < s.Shape(0); ++i) {
for (size_t j = 0; j < s.Shape(1); ++j) {
for (size_t k = 0; k < s.Shape(2); ++k) {
@@ -75,5 +76,4 @@ void TestSlice() {
TEST(Linalg, GPUElementWise) { TestElementWiseKernel(); }
TEST(Linalg, GPUTensorView) { TestSlice(); }
} // namespace linalg
} // namespace xgboost
} // namespace xgboost::linalg

View File

@@ -2,16 +2,18 @@
#include "../../../src/common/random.h"
#include "../helpers.h"
#include "gtest/gtest.h"
#include "xgboost/context.h" // Context
namespace xgboost {
namespace common {
TEST(ColumnSampler, Test) {
Context ctx;
int n = 128;
ColumnSampler cs;
std::vector<float> feature_weights;
// No node sampling
cs.Init(n, feature_weights, 1.0f, 0.5f, 0.5f);
cs.Init(&ctx, n, feature_weights, 1.0f, 0.5f, 0.5f);
auto set0 = cs.GetFeatureSet(0);
ASSERT_EQ(set0->Size(), 32);
@@ -24,7 +26,7 @@ TEST(ColumnSampler, Test) {
ASSERT_EQ(set2->Size(), 32);
// Node sampling
cs.Init(n, feature_weights, 0.5f, 1.0f, 0.5f);
cs.Init(&ctx, n, feature_weights, 0.5f, 1.0f, 0.5f);
auto set3 = cs.GetFeatureSet(0);
ASSERT_EQ(set3->Size(), 32);
@@ -34,24 +36,25 @@ TEST(ColumnSampler, Test) {
ASSERT_EQ(set4->Size(), 32);
// No level or node sampling, should be the same at different depth
cs.Init(n, feature_weights, 1.0f, 1.0f, 0.5f);
cs.Init(&ctx, n, feature_weights, 1.0f, 1.0f, 0.5f);
ASSERT_EQ(cs.GetFeatureSet(0)->HostVector(),
cs.GetFeatureSet(1)->HostVector());
cs.Init(n, feature_weights, 1.0f, 1.0f, 1.0f);
cs.Init(&ctx, n, feature_weights, 1.0f, 1.0f, 1.0f);
auto set5 = cs.GetFeatureSet(0);
ASSERT_EQ(set5->Size(), n);
cs.Init(n, feature_weights, 1.0f, 1.0f, 1.0f);
cs.Init(&ctx, n, feature_weights, 1.0f, 1.0f, 1.0f);
auto set6 = cs.GetFeatureSet(0);
ASSERT_EQ(set5->HostVector(), set6->HostVector());
// Should always be a minimum of one feature
cs.Init(n, feature_weights, 1e-16f, 1e-16f, 1e-16f);
cs.Init(&ctx, n, feature_weights, 1e-16f, 1e-16f, 1e-16f);
ASSERT_EQ(cs.GetFeatureSet(0)->Size(), 1);
}
// Test if different threads using the same seed produce the same result
TEST(ColumnSampler, ThreadSynchronisation) {
Context ctx;
const int64_t num_threads = 100;
int n = 128;
size_t iterations = 10;
@@ -63,7 +66,7 @@ TEST(ColumnSampler, ThreadSynchronisation) {
{
for (auto j = 0ull; j < iterations; j++) {
ColumnSampler cs(j);
cs.Init(n, feature_weights, 0.5f, 0.5f, 0.5f);
cs.Init(&ctx, n, feature_weights, 0.5f, 0.5f, 0.5f);
for (auto level = 0ull; level < levels; level++) {
auto result = cs.GetFeatureSet(level)->ConstHostVector();
#pragma omp single
@@ -80,11 +83,12 @@ TEST(ColumnSampler, ThreadSynchronisation) {
TEST(ColumnSampler, WeightedSampling) {
auto test_basic = [](int first) {
Context ctx;
std::vector<float> feature_weights(2);
feature_weights[0] = std::abs(first - 1.0f);
feature_weights[1] = first - 0.0f;
ColumnSampler cs{0};
cs.Init(2, feature_weights, 1.0, 1.0, 0.5);
cs.Init(&ctx, 2, feature_weights, 1.0, 1.0, 0.5);
auto feature_sets = cs.GetFeatureSet(0);
auto const &h_feat_set = feature_sets->HostVector();
ASSERT_EQ(h_feat_set.size(), 1);
@@ -100,7 +104,8 @@ TEST(ColumnSampler, WeightedSampling) {
SimpleRealUniformDistribution<float> dist(.0f, 12.0f);
std::generate(feature_weights.begin(), feature_weights.end(), [&]() { return dist(&rng); });
ColumnSampler cs{0};
cs.Init(kCols, feature_weights, 0.5f, 1.0f, 1.0f);
Context ctx;
cs.Init(&ctx, kCols, feature_weights, 0.5f, 1.0f, 1.0f);
std::vector<bst_feature_t> features(kCols);
std::iota(features.begin(), features.end(), 0);
std::vector<float> freq(kCols, 0);
@@ -135,7 +140,8 @@ TEST(ColumnSampler, WeightedMultiSampling) {
}
ColumnSampler cs{0};
float bytree{0.5}, bylevel{0.5}, bynode{0.5};
cs.Init(feature_weights.size(), feature_weights, bytree, bylevel, bynode);
Context ctx;
cs.Init(&ctx, feature_weights.size(), feature_weights, bytree, bylevel, bynode);
auto feature_set = cs.GetFeatureSet(0);
size_t n_sampled = kCols * bytree * bylevel * bynode;
ASSERT_EQ(feature_set->Size(), n_sampled);

View File

@@ -522,9 +522,9 @@ TEST(Span, Empty) {
TEST(SpanDeathTest, Empty) {
std::vector<float> data(1, 0);
ASSERT_TRUE(data.data());
Span<float> s{data.data(), Span<float>::index_type(0)}; // ok to define 0 size span.
// ok to define 0 size span.
Span<float> s{data.data(), static_cast<Span<float>::index_type>(0)};
EXPECT_DEATH(s[0], ""); // not ok to use it.
}
} // namespace common
} // namespace xgboost

View File

@@ -11,19 +11,20 @@
namespace xgboost {
namespace common {
TEST(Stats, Quantile) {
Context ctx;
{
linalg::Tensor<float, 1> arr({20.f, 0.f, 15.f, 50.f, 40.f, 0.f, 35.f}, {7}, Context::kCpuId);
std::vector<size_t> index{0, 2, 3, 4, 6};
auto h_arr = arr.HostView();
auto beg = MakeIndexTransformIter([&](size_t i) { return h_arr(index[i]); });
auto end = beg + index.size();
auto q = Quantile(0.40f, beg, end);
auto q = Quantile(&ctx, 0.40f, beg, end);
ASSERT_EQ(q, 26.0);
q = Quantile(0.20f, beg, end);
q = Quantile(&ctx, 0.20f, beg, end);
ASSERT_EQ(q, 16.0);
q = Quantile(0.10f, beg, end);
q = Quantile(&ctx, 0.10f, beg, end);
ASSERT_EQ(q, 15.0);
}
@@ -31,12 +32,13 @@ TEST(Stats, Quantile) {
std::vector<float> vec{1., 2., 3., 4., 5.};
auto beg = MakeIndexTransformIter([&](size_t i) { return vec[i]; });
auto end = beg + vec.size();
auto q = Quantile(0.5f, beg, end);
auto q = Quantile(&ctx, 0.5f, beg, end);
ASSERT_EQ(q, 3.);
}
}
TEST(Stats, WeightedQuantile) {
Context ctx;
linalg::Tensor<float, 1> arr({1.f, 2.f, 3.f, 4.f, 5.f}, {5}, Context::kCpuId);
linalg::Tensor<float, 1> weight({1.f, 1.f, 1.f, 1.f, 1.f}, {5}, Context::kCpuId);
@@ -47,13 +49,13 @@ TEST(Stats, WeightedQuantile) {
auto end = beg + arr.Size();
auto w = MakeIndexTransformIter([&](size_t i) { return h_weight(i); });
auto q = WeightedQuantile(0.50f, beg, end, w);
auto q = WeightedQuantile(&ctx, 0.50f, beg, end, w);
ASSERT_EQ(q, 3);
q = WeightedQuantile(0.0, beg, end, w);
q = WeightedQuantile(&ctx, 0.0, beg, end, w);
ASSERT_EQ(q, 1);
q = WeightedQuantile(1.0, beg, end, w);
q = WeightedQuantile(&ctx, 1.0, beg, end, w);
ASSERT_EQ(q, 5);
}

View File

@@ -119,13 +119,13 @@ TEST(ArrayInterface, TrivialDim) {
}
TEST(ArrayInterface, ToDType) {
static_assert(ToDType<float>::kType == ArrayInterfaceHandler::kF4, "");
static_assert(ToDType<double>::kType == ArrayInterfaceHandler::kF8, "");
static_assert(ToDType<float>::kType == ArrayInterfaceHandler::kF4);
static_assert(ToDType<double>::kType == ArrayInterfaceHandler::kF8);
static_assert(ToDType<uint32_t>::kType == ArrayInterfaceHandler::kU4, "");
static_assert(ToDType<uint64_t>::kType == ArrayInterfaceHandler::kU8, "");
static_assert(ToDType<uint32_t>::kType == ArrayInterfaceHandler::kU4);
static_assert(ToDType<uint64_t>::kType == ArrayInterfaceHandler::kU8);
static_assert(ToDType<int32_t>::kType == ArrayInterfaceHandler::kI4, "");
static_assert(ToDType<int64_t>::kType == ArrayInterfaceHandler::kI8, "");
static_assert(ToDType<int32_t>::kType == ArrayInterfaceHandler::kI4);
static_assert(ToDType<int64_t>::kType == ArrayInterfaceHandler::kI8);
}
} // namespace xgboost

View File

@@ -21,7 +21,7 @@ TEST(SparsePage, PushCSC) {
offset = {0, 1, 4};
for (size_t i = 0; i < offset.back(); ++i) {
data.emplace_back(Entry(i, 0.1f));
data.emplace_back(i, 0.1f);
}
SparsePage other;

View File

@@ -68,6 +68,30 @@ TEST(GradientIndex, FromCategoricalBasic) {
}
}
TEST(GradientIndex, FromCategoricalLarge) {
size_t constexpr kRows = 1000, kCats = 512, kCols = 1;
bst_bin_t max_bins = 8;
auto x = GenerateRandomCategoricalSingleColumn(kRows, kCats);
auto m = GetDMatrixFromData(x, kRows, 1);
Context ctx;
auto &h_ft = m->Info().feature_types.HostVector();
h_ft.resize(kCols, FeatureType::kCategorical);
BatchParam p{max_bins, 0.8};
{
GHistIndexMatrix gidx(m.get(), max_bins, p.sparse_thresh, false, AllThreadsForTest(), {});
ASSERT_TRUE(gidx.index.GetBinTypeSize() == common::kUint16BinsTypeSize);
}
{
for (auto const &page : m->GetBatches<GHistIndexMatrix>(p)) {
common::HistogramCuts cut = page.cut;
GHistIndexMatrix gidx{m->Info(), std::move(cut), max_bins};
ASSERT_EQ(gidx.MaxNumBinPerFeat(), kCats);
}
}
}
TEST(GradientIndex, PushBatch) {
size_t constexpr kRows = 64, kCols = 4;
bst_bin_t max_bins = 64;

View File

@@ -189,8 +189,8 @@ TEST(SimpleCSRSource, FromColumnarSparse) {
auto& mask = column_bitfields[0];
mask.resize(8);
for (size_t j = 0; j < mask.size(); ++j) {
mask[j] = ~0;
for (auto && j : mask) {
j = ~0;
}
// the 2^th entry of first column is invalid
// [0 0 0 0 0 1 0 0]
@@ -201,8 +201,8 @@ TEST(SimpleCSRSource, FromColumnarSparse) {
auto& mask = column_bitfields[1];
mask.resize(8);
for (size_t j = 0; j < mask.size(); ++j) {
mask[j] = ~0;
for (auto && j : mask) {
j = ~0;
}
// the 19^th entry of second column is invalid
// [~0~], [~0~], [0 0 0 0 1 0 0 0]

View File

@@ -96,7 +96,7 @@ void TestRetainPage() {
// make sure it's const and the caller can not modify the content of page.
for (auto& page : m->GetBatches<Page>()) {
static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value, "");
static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value);
}
}

View File

@@ -1,5 +1,6 @@
// Copyright by Contributors
/**
* Copyright 2019-2023 by XGBoost Contributors
*/
#include "../../../src/common/compressed_iterator.h"
#include "../../../src/data/ellpack_page.cuh"
#include "../../../src/data/sparse_page_dmatrix.h"
@@ -69,7 +70,7 @@ TEST(SparsePageDMatrix, RetainEllpackPage) {
std::vector<std::shared_ptr<EllpackPage const>> iterators;
for (auto it = begin; it != end; ++it) {
iterators.push_back(it.Page());
gidx_buffers.emplace_back(HostDeviceVector<common::CompressedByteT>{});
gidx_buffers.emplace_back();
gidx_buffers.back().Resize((*it).Impl()->gidx_buffer.Size());
gidx_buffers.back().Copy((*it).Impl()->gidx_buffer);
}
@@ -87,7 +88,7 @@ TEST(SparsePageDMatrix, RetainEllpackPage) {
// make sure it's const and the caller can not modify the content of page.
for (auto& page : m->GetBatches<EllpackPage>({0, 32})) {
static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value, "");
static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value);
}
// The above iteration clears out all references inside DMatrix.

View File

@@ -186,7 +186,7 @@ SimpleLCG::StateType SimpleLCG::operator()() {
SimpleLCG::StateType SimpleLCG::Min() const { return min(); }
SimpleLCG::StateType SimpleLCG::Max() const { return max(); }
// Make sure it's compile time constant.
static_assert(SimpleLCG::max() - SimpleLCG::min(), "");
static_assert(SimpleLCG::max() - SimpleLCG::min());
void RandomDataGenerator::GenerateDense(HostDeviceVector<float> *out) const {
xgboost::SimpleRealUniformDistribution<bst_float> dist(lower_, upper_);

View File

@@ -46,7 +46,7 @@ class GradientBooster;
template <typename Float>
Float RelError(Float l, Float r) {
static_assert(std::is_floating_point<Float>::value, "");
static_assert(std::is_floating_point<Float>::value);
return std::abs(1.0f - l / r);
}
@@ -164,7 +164,7 @@ class SimpleRealUniformDistribution {
ResultT sum_value = 0, r_k = 1;
for (size_t k = m; k != 0; --k) {
sum_value += ResultT((*rng)() - rng->Min()) * r_k;
sum_value += static_cast<ResultT>((*rng)() - rng->Min()) * r_k;
r_k *= r;
}
@@ -191,12 +191,10 @@ Json GetArrayInterface(HostDeviceVector<T> *storage, size_t rows, size_t cols) {
Json array_interface{Object()};
array_interface["data"] = std::vector<Json>(2);
if (storage->DeviceCanRead()) {
array_interface["data"][0] =
Integer(reinterpret_cast<int64_t>(storage->ConstDevicePointer()));
array_interface["data"][0] = Integer{reinterpret_cast<int64_t>(storage->ConstDevicePointer())};
array_interface["stream"] = nullptr;
} else {
array_interface["data"][0] =
Integer(reinterpret_cast<int64_t>(storage->ConstHostPointer()));
array_interface["data"][0] = Integer{reinterpret_cast<int64_t>(storage->ConstHostPointer())};
}
array_interface["data"][1] = Boolean(false);

View File

@@ -1,4 +1,6 @@
// Copyright by Contributors
/**
* Copyright 2016-2023 by XGBoost contributors
*/
#include <gtest/gtest.h>
#include <xgboost/context.h>
#include <xgboost/objective.h>
@@ -25,11 +27,14 @@ TEST(Objective, PredTransform) {
tparam.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
size_t n = 100;
for (const auto &entry :
::dmlc::Registry<::xgboost::ObjFunctionReg>::List()) {
std::unique_ptr<xgboost::ObjFunction> obj{
xgboost::ObjFunction::Create(entry->name, &tparam)};
obj->Configure(Args{{"num_class", "2"}});
for (const auto& entry : ::dmlc::Registry<::xgboost::ObjFunctionReg>::List()) {
std::unique_ptr<xgboost::ObjFunction> obj{xgboost::ObjFunction::Create(entry->name, &tparam)};
if (entry->name.find("multi") != std::string::npos) {
obj->Configure(Args{{"num_class", "2"}});
}
if (entry->name.find("quantile") != std::string::npos) {
obj->Configure(Args{{"quantile_alpha", "0.5"}});
}
HostDeviceVector<float> predts;
predts.Resize(n, 3.14f); // prediction is performed on host.
ASSERT_FALSE(predts.DeviceCanRead());

View File

@@ -0,0 +1,74 @@
/**
* Copyright 2023 by XGBoost contributors
*/
#include <gtest/gtest.h>
#include <xgboost/base.h> // Args
#include <xgboost/context.h> // Context
#include <xgboost/objective.h> // ObjFunction
#include <xgboost/span.h> // Span
#include <memory> // std::unique_ptr
#include <vector> // std::vector
#include "../helpers.h" // CheckConfigReload,CreateEmptyGenericParam,DeclareUnifiedTest
namespace xgboost {
TEST(Objective, DeclareUnifiedTest(Quantile)) {
Context ctx = CreateEmptyGenericParam(GPUIDX);
{
Args args{{"quantile_alpha", "[0.6, 0.8]"}};
std::unique_ptr<ObjFunction> obj{ObjFunction::Create("reg:quantileerror", &ctx)};
obj->Configure(args);
CheckConfigReload(obj, "reg:quantileerror");
}
Args args{{"quantile_alpha", "0.6"}};
std::unique_ptr<ObjFunction> obj{ObjFunction::Create("reg:quantileerror", &ctx)};
obj->Configure(args);
CheckConfigReload(obj, "reg:quantileerror");
std::vector<float> predts{1.0f, 2.0f, 3.0f};
std::vector<float> labels{3.0f, 2.0f, 1.0f};
std::vector<float> weights{1.0f, 1.0f, 1.0f};
std::vector<float> grad{-0.6f, 0.4f, 0.4f};
std::vector<float> hess = weights;
CheckObjFunction(obj, predts, labels, weights, grad, hess);
}
TEST(Objective, DeclareUnifiedTest(QuantileIntercept)) {
Context ctx = CreateEmptyGenericParam(GPUIDX);
Args args{{"quantile_alpha", "[0.6, 0.8]"}};
std::unique_ptr<ObjFunction> obj{ObjFunction::Create("reg:quantileerror", &ctx)};
obj->Configure(args);
MetaInfo info;
info.num_row_ = 10;
info.labels.ModifyInplace([&](HostDeviceVector<float>* data, common::Span<std::size_t> shape) {
data->SetDevice(ctx.gpu_id);
data->Resize(info.num_row_);
shape[0] = info.num_row_;
shape[1] = 1;
auto& h_labels = data->HostVector();
for (std::size_t i = 0; i < info.num_row_; ++i) {
h_labels[i] = i;
}
});
linalg::Vector<float> base_scores;
obj->InitEstimation(info, &base_scores);
ASSERT_EQ(base_scores.Size(), 1) << "Vector is not yet supported.";
// mean([5.6, 7.8])
ASSERT_NEAR(base_scores(0), 6.7, kRtEps);
for (std::size_t i = 0; i < info.num_row_; ++i) {
info.weights_.HostVector().emplace_back(info.num_row_ - i - 1.0);
}
obj->InitEstimation(info, &base_scores);
ASSERT_EQ(base_scores.Size(), 1) << "Vector is not yet supported.";
// mean([3, 5])
ASSERT_NEAR(base_scores(0), 4.0, kRtEps);
}
} // namespace xgboost

View File

@@ -0,0 +1,5 @@
/**
* Copyright 2023 XGBoost contributors
*/
// Dummy file to enable the CUDA tests.
#include "test_quantile_obj.cc"

View File

@@ -6,8 +6,9 @@
#include <xgboost/json.h>
#include <xgboost/objective.h>
#include "../../../src/common/linalg_op.h" // begin,end
#include "../../../src/common/linalg_op.h" // for begin, end
#include "../../../src/objective/adaptive.h"
#include "../../../src/tree/param.h" // for TrainParam
#include "../helpers.h"
#include "xgboost/base.h"
#include "xgboost/data.h"
@@ -157,7 +158,7 @@ TEST(Objective, DeclareUnifiedTest(PoissonRegressionGPair)) {
ObjFunction::Create("count:poisson", &ctx)
};
args.emplace_back(std::make_pair("max_delta_step", "0.1f"));
args.emplace_back("max_delta_step", "0.1f");
obj->Configure(args);
CheckObjFunction(obj,
@@ -259,7 +260,7 @@ TEST(Objective, DeclareUnifiedTest(TweedieRegressionGPair)) {
std::vector<std::pair<std::string, std::string>> args;
std::unique_ptr<ObjFunction> obj{ObjFunction::Create("reg:tweedie", &ctx)};
args.emplace_back(std::make_pair("tweedie_variance_power", "1.1f"));
args.emplace_back("tweedie_variance_power", "1.1f");
obj->Configure(args);
CheckObjFunction(obj,
@@ -408,9 +409,13 @@ TEST(Objective, DeclareUnifiedTest(AbsoluteError)) {
h_predt[i] = labels[i] + i;
}
obj->UpdateTreeLeaf(position, info, predt, 0, &tree);
ASSERT_EQ(tree[1].LeafValue(), -1);
ASSERT_EQ(tree[2].LeafValue(), -4);
tree::TrainParam param;
param.Init(Args{});
auto lr = param.learning_rate;
obj->UpdateTreeLeaf(position, info, param.learning_rate, predt, 0, &tree);
ASSERT_EQ(tree[1].LeafValue(), -1.0f * lr);
ASSERT_EQ(tree[2].LeafValue(), -4.0f * lr);
}
TEST(Objective, DeclareUnifiedTest(AbsoluteErrorLeaf)) {
@@ -428,8 +433,8 @@ TEST(Objective, DeclareUnifiedTest(AbsoluteErrorLeaf)) {
auto h_labels = info.labels.HostView().Slice(linalg::All(), t);
std::iota(linalg::begin(h_labels), linalg::end(h_labels), 0);
auto h_predt = linalg::MakeTensorView(predt.HostSpan(), {kRows, kTargets}, Context::kCpuId)
.Slice(linalg::All(), t);
auto h_predt =
linalg::MakeTensorView(&ctx, predt.HostSpan(), kRows, kTargets).Slice(linalg::All(), t);
for (size_t i = 0; i < h_predt.Size(); ++i) {
h_predt(i) = h_labels(i) + i;
}
@@ -457,11 +462,16 @@ TEST(Objective, DeclareUnifiedTest(AbsoluteErrorLeaf)) {
ASSERT_EQ(tree.GetNumLeaves(), 4);
auto empty_leaf = tree[4].LeafValue();
obj->UpdateTreeLeaf(position, info, predt, t, &tree);
ASSERT_EQ(tree[3].LeafValue(), -5);
ASSERT_EQ(tree[4].LeafValue(), empty_leaf);
ASSERT_EQ(tree[5].LeafValue(), -10);
ASSERT_EQ(tree[6].LeafValue(), -14);
tree::TrainParam param;
param.Init(Args{});
auto lr = param.learning_rate;
obj->UpdateTreeLeaf(position, info, lr, predt, t, &tree);
ASSERT_EQ(tree[3].LeafValue(), -5.0f * lr);
ASSERT_EQ(tree[4].LeafValue(), empty_leaf * lr);
ASSERT_EQ(tree[5].LeafValue(), -10.0f * lr);
ASSERT_EQ(tree[6].LeafValue(), -14.0f * lr);
}
}

View File

@@ -3,16 +3,18 @@
*/
#include <gtest/gtest.h>
#include <xgboost/cache.h>
#include <xgboost/data.h> // DMatrix
#include <xgboost/data.h> // for DMatrix
#include <cstddef> // std::size_t
#include <cstddef> // for size_t
#include <cstdint> // for uint32_t
#include <thread> // for thread
#include "helpers.h" // RandomDataGenerator
#include "helpers.h" // for RandomDataGenerator
namespace xgboost {
namespace {
struct CacheForTest {
std::size_t i;
std::size_t const i;
explicit CacheForTest(std::size_t k) : i{k} {}
};
@@ -20,7 +22,7 @@ struct CacheForTest {
TEST(DMatrixCache, Basic) {
std::size_t constexpr kRows = 2, kCols = 1, kCacheSize = 4;
DMatrixCache<CacheForTest> cache(kCacheSize);
DMatrixCache<CacheForTest> cache{kCacheSize};
auto add_cache = [&]() {
// Create a lambda function here, so that p_fmat gets deleted upon the
@@ -52,4 +54,63 @@ TEST(DMatrixCache, Basic) {
}
}
}
TEST(DMatrixCache, MultiThread) {
std::size_t constexpr kRows = 2, kCols = 1, kCacheSize = 3;
auto p_fmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
auto n = std::thread::hardware_concurrency() * 128u;
CHECK_NE(n, 0);
std::vector<std::shared_ptr<CacheForTest>> results(n);
{
DMatrixCache<CacheForTest> cache{kCacheSize};
std::vector<std::thread> tasks;
for (std::uint32_t tidx = 0; tidx < n; ++tidx) {
tasks.emplace_back([&, i = tidx]() {
cache.CacheItem(p_fmat, i);
auto p_fmat_local = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
results[i] = cache.CacheItem(p_fmat_local, i);
});
}
for (auto& t : tasks) {
t.join();
}
for (std::uint32_t tidx = 0; tidx < n; ++tidx) {
ASSERT_EQ(results[tidx]->i, tidx);
}
tasks.clear();
for (std::int32_t tidx = static_cast<std::int32_t>(n - 1); tidx >= 0; --tidx) {
tasks.emplace_back([&, i = tidx]() {
cache.CacheItem(p_fmat, i);
auto p_fmat_local = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
results[i] = cache.CacheItem(p_fmat_local, i);
});
}
for (auto& t : tasks) {
t.join();
}
for (std::uint32_t tidx = 0; tidx < n; ++tidx) {
ASSERT_EQ(results[tidx]->i, tidx);
}
}
{
DMatrixCache<CacheForTest> cache{n};
std::vector<std::thread> tasks;
for (std::uint32_t tidx = 0; tidx < n; ++tidx) {
tasks.emplace_back([&, tidx]() { results[tidx] = cache.CacheItem(p_fmat, tidx); });
}
for (auto& t : tasks) {
t.join();
}
for (std::uint32_t tidx = 0; tidx < n; ++tidx) {
ASSERT_EQ(results[tidx]->i, tidx);
}
}
}
} // namespace xgboost

View File

@@ -9,12 +9,14 @@
#include "../../../../src/tree/hist/evaluate_splits.h"
#include "../test_evaluate_splits.h"
#include "../../helpers.h"
#include "xgboost/context.h" // Context
namespace xgboost {
namespace tree {
void TestEvaluateSplits(bool force_read_by_column) {
Context ctx;
ctx.nthread = 4;
int static constexpr kRows = 8, kCols = 16;
int32_t n_threads = std::min(omp_get_max_threads(), 4);
auto sampler = std::make_shared<common::ColumnSampler>();
TrainParam param;
@@ -22,7 +24,7 @@ void TestEvaluateSplits(bool force_read_by_column) {
auto dmat = RandomDataGenerator(kRows, kCols, 0).Seed(3).GenerateDMatrix();
auto evaluator = HistEvaluator<CPUExpandEntry>{param, dmat->Info(), n_threads, sampler};
auto evaluator = HistEvaluator<CPUExpandEntry>{&ctx, &param, dmat->Info(), sampler};
common::HistCollection hist;
std::vector<GradientPair> row_gpairs = {
{1.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {2.27f, 0.28f},
@@ -86,13 +88,15 @@ TEST(HistEvaluator, Evaluate) {
}
TEST(HistEvaluator, Apply) {
Context ctx;
ctx.nthread = 4;
RegTree tree;
int static constexpr kNRows = 8, kNCols = 16;
TrainParam param;
param.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, {"reg_lambda", "0.0"}});
auto dmat = RandomDataGenerator(kNRows, kNCols, 0).Seed(3).GenerateDMatrix();
auto sampler = std::make_shared<common::ColumnSampler>();
auto evaluator_ = HistEvaluator<CPUExpandEntry>{param, dmat->Info(), 4, sampler};
auto evaluator_ = HistEvaluator<CPUExpandEntry>{&ctx, &param, dmat->Info(), sampler};
CPUExpandEntry entry{0, 0, 10.0f};
entry.split.left_sum = GradStats{0.4, 0.6f};
@@ -115,10 +119,11 @@ TEST(HistEvaluator, Apply) {
}
TEST_F(TestPartitionBasedSplit, CPUHist) {
Context ctx;
// check the evaluator is returning the optimal split
std::vector<FeatureType> ft{FeatureType::kCategorical};
auto sampler = std::make_shared<common::ColumnSampler>();
HistEvaluator<CPUExpandEntry> evaluator{param_, info_, AllThreadsForTest(), sampler};
HistEvaluator<CPUExpandEntry> evaluator{&ctx, &param_, info_, sampler};
evaluator.InitRoot(GradStats{total_gpair_});
RegTree tree;
std::vector<CPUExpandEntry> entries(1);
@@ -128,6 +133,7 @@ TEST_F(TestPartitionBasedSplit, CPUHist) {
namespace {
auto CompareOneHotAndPartition(bool onehot) {
Context ctx;
int static constexpr kRows = 128, kCols = 1;
std::vector<FeatureType> ft(kCols, FeatureType::kCategorical);
@@ -147,8 +153,7 @@ auto CompareOneHotAndPartition(bool onehot) {
RandomDataGenerator(kRows, kCols, 0).Seed(3).Type(ft).MaxCategory(n_cats).GenerateDMatrix();
auto sampler = std::make_shared<common::ColumnSampler>();
auto evaluator =
HistEvaluator<CPUExpandEntry>{param, dmat->Info(), AllThreadsForTest(), sampler};
auto evaluator = HistEvaluator<CPUExpandEntry>{&ctx, &param, dmat->Info(), sampler};
std::vector<CPUExpandEntry> entries(1);
for (auto const &gmat : dmat->GetBatches<GHistIndexMatrix>({32, param.sparse_threshold})) {
@@ -198,8 +203,8 @@ TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) {
MetaInfo info;
info.num_col_ = 1;
info.feature_types = {FeatureType::kCategorical};
auto evaluator =
HistEvaluator<CPUExpandEntry>{param_, info, AllThreadsForTest(), sampler};
Context ctx;
auto evaluator = HistEvaluator<CPUExpandEntry>{&ctx, &param_, info, sampler};
evaluator.InitRoot(GradStats{parent_sum_});
std::vector<CPUExpandEntry> entries(1);

View File

@@ -48,7 +48,7 @@ void TestAddHistRows(bool is_distributed) {
HistogramBuilder<CPUExpandEntry> histogram_builder;
histogram_builder.Reset(gmat.cut.TotalBins(), {kMaxBins, 0.5}, omp_get_max_threads(), 1,
is_distributed);
is_distributed, false);
histogram_builder.AddHistRows(&starting_index, &sync_count,
nodes_for_explicit_hist_build_,
nodes_for_subtraction_trick_, &tree);
@@ -86,7 +86,7 @@ void TestSyncHist(bool is_distributed) {
HistogramBuilder<CPUExpandEntry> histogram;
uint32_t total_bins = gmat.cut.Ptrs().back();
histogram.Reset(total_bins, {kMaxBins, 0.5}, omp_get_max_threads(), 1, is_distributed);
histogram.Reset(total_bins, {kMaxBins, 0.5}, omp_get_max_threads(), 1, is_distributed, false);
common::RowSetCollection row_set_collection_;
{
@@ -226,11 +226,14 @@ TEST(CPUHistogram, SyncHist) {
TestSyncHist(false);
}
void TestBuildHistogram(bool is_distributed, bool force_read_by_column) {
void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_col_split) {
size_t constexpr kNRows = 8, kNCols = 16;
int32_t constexpr kMaxBins = 4;
auto p_fmat =
RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix();
auto p_fmat = RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix();
if (is_col_split) {
p_fmat = std::shared_ptr<DMatrix>{
p_fmat->SliceCol(collective::GetWorldSize(), collective::GetRank())};
}
auto const &gmat = *(p_fmat->GetBatches<GHistIndexMatrix>(BatchParam{kMaxBins, 0.5}).begin());
uint32_t total_bins = gmat.cut.Ptrs().back();
@@ -241,7 +244,8 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column) {
bst_node_t nid = 0;
HistogramBuilder<CPUExpandEntry> histogram;
histogram.Reset(total_bins, {kMaxBins, 0.5}, omp_get_max_threads(), 1, is_distributed);
histogram.Reset(total_bins, {kMaxBins, 0.5}, omp_get_max_threads(), 1, is_distributed,
is_col_split);
RegTree tree;
@@ -284,11 +288,16 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column) {
}
TEST(CPUHistogram, BuildHist) {
TestBuildHistogram(true, false);
TestBuildHistogram(false, false);
TestBuildHistogram(true, true);
TestBuildHistogram(false, true);
TestBuildHistogram(true, false, false);
TestBuildHistogram(false, false, false);
TestBuildHistogram(true, true, false);
TestBuildHistogram(false, true, false);
}
TEST(CPUHistogram, BuildHistColSplit) {
auto constexpr kWorkers = 4;
RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, true, true);
RunWithInMemoryCommunicator(kWorkers, TestBuildHistogram, true, false, true);
}
namespace {
@@ -340,7 +349,7 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
HistogramBuilder<CPUExpandEntry> cat_hist;
for (auto const &gidx : cat_m->GetBatches<GHistIndexMatrix>({kBins, 0.5})) {
auto total_bins = gidx.cut.TotalBins();
cat_hist.Reset(total_bins, {kBins, 0.5}, omp_get_max_threads(), 1, false);
cat_hist.Reset(total_bins, {kBins, 0.5}, omp_get_max_threads(), 1, false, false);
cat_hist.BuildHist(0, gidx, &tree, row_set_collection,
nodes_for_explicit_hist_build, {}, gpair.HostVector(),
force_read_by_column);
@@ -354,7 +363,7 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
HistogramBuilder<CPUExpandEntry> onehot_hist;
for (auto const &gidx : encode_m->GetBatches<GHistIndexMatrix>({kBins, 0.5})) {
auto total_bins = gidx.cut.TotalBins();
onehot_hist.Reset(total_bins, {kBins, 0.5}, omp_get_max_threads(), 1, false);
onehot_hist.Reset(total_bins, {kBins, 0.5}, omp_get_max_threads(), 1, false, false);
onehot_hist.BuildHist(0, gidx, &tree, row_set_collection, nodes_for_explicit_hist_build, {},
gpair.HostVector(),
force_read_by_column);
@@ -419,7 +428,7 @@ void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx, bool fo
1, [&](size_t nidx_in_set) { return partition_size.at(nidx_in_set); },
256};
multi_build.Reset(total_bins, batch_param, ctx.Threads(), rows_set.size(), false);
multi_build.Reset(total_bins, batch_param, ctx.Threads(), rows_set.size(), false, false);
size_t page_idx{0};
for (auto const &page : m->GetBatches<GHistIndexMatrix>(batch_param)) {
@@ -440,7 +449,7 @@ void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx, bool fo
common::RowSetCollection row_set_collection;
InitRowPartitionForTest(&row_set_collection, n_samples);
single_build.Reset(total_bins, batch_param, ctx.Threads(), 1, false);
single_build.Reset(total_bins, batch_param, ctx.Threads(), 1, false, false);
SparsePage concat;
std::vector<float> hess(m->Info().num_row_, 1.0f);
for (auto const& page : m->GetBatches<SparsePage>()) {

View File

@@ -10,29 +10,36 @@
namespace xgboost {
namespace tree {
TEST(Approx, Partitioner) {
size_t n_samples = 1024, n_features = 1, base_rowid = 0;
Context ctx;
CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid};
ASSERT_EQ(partitioner.base_rowid, base_rowid);
ASSERT_EQ(partitioner.Size(), 1);
ASSERT_EQ(partitioner.Partitions()[0].Size(), n_samples);
auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
ctx.InitAllowUnknown(Args{});
std::vector<CPUExpandEntry> candidates{{0, 0, 0.4}};
namespace {
std::vector<float> GenerateHess(size_t n_samples) {
auto grad = GenerateRandomGradients(n_samples);
std::vector<float> hess(grad.Size());
std::transform(grad.HostVector().cbegin(), grad.HostVector().cend(), hess.begin(),
[](auto gpair) { return gpair.GetHess(); });
return hess;
}
} // anonymous namespace
TEST(Approx, Partitioner) {
size_t n_samples = 1024, n_features = 1, base_rowid = 0;
Context ctx;
ctx.InitAllowUnknown(Args{});
CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
ASSERT_EQ(partitioner.base_rowid, base_rowid);
ASSERT_EQ(partitioner.Size(), 1);
ASSERT_EQ(partitioner.Partitions()[0].Size(), n_samples);
auto const Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
auto hess = GenerateHess(n_samples);
std::vector<CPUExpandEntry> candidates{{0, 0, 0.4}};
for (auto const& page : Xy->GetBatches<GHistIndexMatrix>({64, hess, true})) {
bst_feature_t const split_ind = 0;
{
auto min_value = page.cut.MinValues()[split_ind];
RegTree tree;
CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid};
CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
GetSplit(&tree, min_value, &candidates);
partitioner.UpdatePosition(&ctx, page, candidates, &tree);
ASSERT_EQ(partitioner.Size(), 3);
@@ -40,7 +47,7 @@ TEST(Approx, Partitioner) {
ASSERT_EQ(partitioner[2].Size(), n_samples);
}
{
CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid};
CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
auto ptr = page.cut.Ptrs()[split_ind + 1];
float split_value = page.cut.Values().at(ptr / 2);
RegTree tree;
@@ -66,12 +73,85 @@ TEST(Approx, Partitioner) {
}
}
namespace {
void TestColumnSplitPartitioner(size_t n_samples, size_t base_rowid, std::shared_ptr<DMatrix> Xy,
std::vector<float>* hess, float min_value, float mid_value,
CommonRowPartitioner const& expected_mid_partitioner) {
auto dmat =
std::unique_ptr<DMatrix>{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};
std::vector<CPUExpandEntry> candidates{{0, 0, 0.4}};
Context ctx;
ctx.InitAllowUnknown(Args{});
for (auto const& page : dmat->GetBatches<GHistIndexMatrix>({64, *hess, true})) {
{
RegTree tree;
CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, true};
GetSplit(&tree, min_value, &candidates);
partitioner.UpdatePosition(&ctx, page, candidates, &tree);
ASSERT_EQ(partitioner.Size(), 3);
ASSERT_EQ(partitioner[1].Size(), 0);
ASSERT_EQ(partitioner[2].Size(), n_samples);
}
{
CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, true};
RegTree tree;
GetSplit(&tree, mid_value, &candidates);
partitioner.UpdatePosition(&ctx, page, candidates, &tree);
auto left_nidx = tree[RegTree::kRoot].LeftChild();
auto elem = partitioner[left_nidx];
ASSERT_LT(elem.Size(), n_samples);
ASSERT_GT(elem.Size(), 1);
auto expected_elem = expected_mid_partitioner[left_nidx];
ASSERT_EQ(elem.Size(), expected_elem.Size());
for (auto it = elem.begin, eit = expected_elem.begin; it != elem.end; ++it, ++eit) {
ASSERT_EQ(*it, *eit);
}
auto right_nidx = tree[RegTree::kRoot].RightChild();
elem = partitioner[right_nidx];
expected_elem = expected_mid_partitioner[right_nidx];
ASSERT_EQ(elem.Size(), expected_elem.Size());
for (auto it = elem.begin, eit = expected_elem.begin; it != elem.end; ++it, ++eit) {
ASSERT_EQ(*it, *eit);
}
}
}
}
} // anonymous namespace
TEST(Approx, PartitionerColSplit) {
size_t n_samples = 1024, n_features = 16, base_rowid = 0;
auto const Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
auto hess = GenerateHess(n_samples);
std::vector<CPUExpandEntry> candidates{{0, 0, 0.4}};
float min_value, mid_value;
Context ctx;
ctx.InitAllowUnknown(Args{});
CommonRowPartitioner mid_partitioner{&ctx, n_samples, base_rowid, false};
for (auto const& page : Xy->GetBatches<GHistIndexMatrix>({64, hess, true})) {
bst_feature_t const split_ind = 0;
min_value = page.cut.MinValues()[split_ind];
auto ptr = page.cut.Ptrs()[split_ind + 1];
mid_value = page.cut.Values().at(ptr / 2);
RegTree tree;
GetSplit(&tree, mid_value, &candidates);
mid_partitioner.UpdatePosition(&ctx, page, candidates, &tree);
}
auto constexpr kWorkers = 4;
RunWithInMemoryCommunicator(kWorkers, TestColumnSplitPartitioner, n_samples, base_rowid, Xy,
&hess, min_value, mid_value, mid_partitioner);
}
namespace {
void TestLeafPartition(size_t n_samples) {
size_t const n_features = 2, base_rowid = 0;
Context ctx;
common::RowSetCollection row_set;
CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid};
CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
std::vector<CPUExpandEntry> candidates{{0, 0, 0.4}};

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2022 by XGBoost Contributors
/**
* Copyright 2022-2023 by XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <xgboost/data.h>
@@ -12,8 +12,7 @@
#include "../../../src/tree/split_evaluator.h"
#include "../helpers.h"
namespace xgboost {
namespace tree {
namespace xgboost::tree {
/**
* \brief Enumerate all possible partitions for categorical split.
*/
@@ -151,5 +150,4 @@ class TestCategoricalSplitWithMissing : public testing::Test {
ASSERT_EQ(right_sum.GetHess(), parent_sum_.GetHess() - left_sum.GetHess());
}
};
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2017-2022 XGBoost contributors
/**
* Copyright 2017-2023 by XGBoost contributors
*/
#include <gtest/gtest.h>
#include <thrust/device_vector.h>
@@ -13,6 +13,7 @@
#include "../../../src/common/common.h"
#include "../../../src/data/sparse_page_source.h"
#include "../../../src/tree/constraints.cuh"
#include "../../../src/tree/param.h" // for TrainParam
#include "../../../src/tree/updater_gpu_common.cuh"
#include "../../../src/tree/updater_gpu_hist.cu"
#include "../filesystem.h" // dmlc::TemporaryDirectory
@@ -21,8 +22,7 @@
#include "xgboost/context.h"
#include "xgboost/json.h"
namespace xgboost {
namespace tree {
namespace xgboost::tree {
TEST(GpuHist, DeviceHistogram) {
// Ensures that node allocates correctly after reaching `kStopGrowingSize`.
dh::safe_cuda(cudaSetDevice(0));
@@ -83,11 +83,12 @@ void TestBuildHist(bool use_shared_memory_histograms) {
int const kNRows = 16, kNCols = 8;
TrainParam param;
std::vector<std::pair<std::string, std::string>> args {
{"max_depth", "6"},
{"max_leaves", "0"},
Args args{
{"max_depth", "6"},
{"max_leaves", "0"},
};
param.Init(args);
auto page = BuildEllpackPage(kNRows, kNCols);
BatchParam batch_param{};
Context ctx{CreateEmptyGenericParam(0)};
@@ -168,7 +169,6 @@ void TestHistogramIndexImpl() {
int constexpr kNRows = 1000, kNCols = 10;
// Build 2 matrices and build a histogram maker with that
Context ctx(CreateEmptyGenericParam(0));
tree::GPUHistMaker hist_maker{&ctx, ObjInfo{ObjInfo::kRegression}},
hist_maker_ext{&ctx, ObjInfo{ObjInfo::kRegression}};
@@ -179,15 +179,14 @@ void TestHistogramIndexImpl() {
std::unique_ptr<DMatrix> hist_maker_ext_dmat(
CreateSparsePageDMatrixWithRC(kNRows, kNCols, 128UL, true, tempdir));
std::vector<std::pair<std::string, std::string>> training_params = {
{"max_depth", "10"},
{"max_leaves", "0"}
};
Args training_params = {{"max_depth", "10"}, {"max_leaves", "0"}};
TrainParam param;
param.UpdateAllowUnknown(training_params);
hist_maker.Configure(training_params);
hist_maker.InitDataOnce(hist_maker_dmat.get());
hist_maker.InitDataOnce(&param, hist_maker_dmat.get());
hist_maker_ext.Configure(training_params);
hist_maker_ext.InitDataOnce(hist_maker_ext_dmat.get());
hist_maker_ext.InitDataOnce(&param, hist_maker_ext_dmat.get());
// Extract the device maker from the histogram makers and from that its compressed
// histogram index
@@ -237,13 +236,15 @@ void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
{"subsample", std::to_string(subsample)},
{"sampling_method", sampling_method},
};
TrainParam param;
param.UpdateAllowUnknown(args);
Context ctx(CreateEmptyGenericParam(0));
tree::GPUHistMaker hist_maker{&ctx,ObjInfo{ObjInfo::kRegression}};
hist_maker.Configure(args);
std::vector<HostDeviceVector<bst_node_t>> position(1);
hist_maker.Update(gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position}, {tree});
hist_maker.Update(&param, gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
{tree});
auto cache = linalg::VectorView<float>{preds->DeviceSpan(), {preds->Size()}, 0};
hist_maker.UpdatePredictionCache(dmat, cache);
}
@@ -391,13 +392,11 @@ TEST(GpuHist, ConfigIO) {
Json j_updater { Object() };
updater->SaveConfig(&j_updater);
ASSERT_TRUE(IsA<Object>(j_updater["gpu_hist_train_param"]));
ASSERT_TRUE(IsA<Object>(j_updater["train_param"]));
updater->LoadConfig(j_updater);
Json j_updater_roundtrip { Object() };
updater->SaveConfig(&j_updater_roundtrip);
ASSERT_TRUE(IsA<Object>(j_updater_roundtrip["gpu_hist_train_param"]));
ASSERT_TRUE(IsA<Object>(j_updater_roundtrip["train_param"]));
ASSERT_EQ(j_updater, j_updater_roundtrip);
}
@@ -414,5 +413,4 @@ TEST(GpuHist, MaxDepth) {
ASSERT_THROW({learner->UpdateOneIter(0, p_mat);}, dmlc::Error);
}
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree

View File

@@ -1,33 +1,42 @@
/**
* Copyright 2019-2023 by XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <xgboost/tree_model.h>
#include <xgboost/tree_updater.h>
#include "../../../src/tree/param.h" // for TrainParam
#include "../helpers.h"
namespace xgboost {
namespace tree {
namespace xgboost::tree {
std::shared_ptr<DMatrix> GenerateDMatrix(std::size_t rows, std::size_t cols){
return RandomDataGenerator{rows, cols, 0.6f}.Seed(3).GenerateDMatrix();
}
TEST(GrowHistMaker, InteractionConstraint) {
size_t constexpr kRows = 32;
size_t constexpr kCols = 16;
Context ctx;
auto p_dmat = RandomDataGenerator{kRows, kCols, 0.6f}.Seed(3).GenerateDMatrix();
HostDeviceVector<GradientPair> gradients (kRows);
std::vector<GradientPair>& h_gradients = gradients.HostVector();
std::unique_ptr<HostDeviceVector<GradientPair>> GenerateGradients(std::size_t rows) {
auto p_gradients = std::make_unique<HostDeviceVector<GradientPair>>(rows);
auto& h_gradients = p_gradients->HostVector();
xgboost::SimpleLCG gen;
xgboost::SimpleRealUniformDistribution<bst_float> dist(0.0f, 1.0f);
for (size_t i = 0; i < kRows; ++i) {
bst_float grad = dist(&gen);
bst_float hess = dist(&gen);
h_gradients[i] = GradientPair(grad, hess);
for (std::size_t i = 0; i < rows; ++i) {
auto grad = dist(&gen);
auto hess = dist(&gen);
h_gradients[i] = GradientPair{grad, hess};
}
return p_gradients;
}
TEST(GrowHistMaker, InteractionConstraint)
{
auto constexpr kRows = 32;
auto constexpr kCols = 16;
auto p_dmat = GenerateDMatrix(kRows, kCols);
auto p_gradients = GenerateGradients(kRows);
Context ctx;
{
// With constraints
RegTree tree;
@@ -35,11 +44,11 @@ TEST(GrowHistMaker, InteractionConstraint) {
std::unique_ptr<TreeUpdater> updater{
TreeUpdater::Create("grow_histmaker", &ctx, ObjInfo{ObjInfo::kRegression})};
updater->Configure(Args{
{"interaction_constraints", "[[0, 1]]"},
{"num_feature", std::to_string(kCols)}});
TrainParam param;
param.UpdateAllowUnknown(
Args{{"interaction_constraints", "[[0, 1]]"}, {"num_feature", std::to_string(kCols)}});
std::vector<HostDeviceVector<bst_node_t>> position(1);
updater->Update(&gradients, p_dmat.get(), position, {&tree});
updater->Update(&param, p_gradients.get(), p_dmat.get(), position, {&tree});
ASSERT_EQ(tree.NumExtraNodes(), 4);
ASSERT_EQ(tree[0].SplitIndex(), 1);
@@ -54,9 +63,10 @@ TEST(GrowHistMaker, InteractionConstraint) {
std::unique_ptr<TreeUpdater> updater{
TreeUpdater::Create("grow_histmaker", &ctx, ObjInfo{ObjInfo::kRegression})};
updater->Configure(Args{{"num_feature", std::to_string(kCols)}});
std::vector<HostDeviceVector<bst_node_t>> position(1);
updater->Update(&gradients, p_dmat.get(), position, {&tree});
TrainParam param;
param.Init(Args{});
updater->Update(&param, p_gradients.get(), p_dmat.get(), position, {&tree});
ASSERT_EQ(tree.NumExtraNodes(), 10);
ASSERT_EQ(tree[0].SplitIndex(), 1);
@@ -66,5 +76,53 @@ TEST(GrowHistMaker, InteractionConstraint) {
}
}
} // namespace tree
} // namespace xgboost
namespace {
void TestColumnSplit(int32_t rows, int32_t cols, RegTree const& expected_tree) {
auto p_dmat = GenerateDMatrix(rows, cols);
auto p_gradients = GenerateGradients(rows);
Context ctx;
std::unique_ptr<TreeUpdater> updater{
TreeUpdater::Create("grow_histmaker", &ctx, ObjInfo{ObjInfo::kRegression})};
std::vector<HostDeviceVector<bst_node_t>> position(1);
std::unique_ptr<DMatrix> sliced{
p_dmat->SliceCol(collective::GetWorldSize(), collective::GetRank())};
RegTree tree;
tree.param.num_feature = cols;
TrainParam param;
param.Init(Args{});
updater->Update(&param, p_gradients.get(), sliced.get(), position, {&tree});
EXPECT_EQ(tree.NumExtraNodes(), 10);
EXPECT_EQ(tree[0].SplitIndex(), 1);
EXPECT_NE(tree[tree[0].LeftChild()].SplitIndex(), 0);
EXPECT_NE(tree[tree[0].RightChild()].SplitIndex(), 0);
EXPECT_EQ(tree, expected_tree);
}
} // anonymous namespace
TEST(GrowHistMaker, ColumnSplit) {
auto constexpr kRows = 32;
auto constexpr kCols = 16;
RegTree expected_tree;
expected_tree.param.num_feature = kCols;
{
auto p_dmat = GenerateDMatrix(kRows, kCols);
auto p_gradients = GenerateGradients(kRows);
Context ctx;
std::unique_ptr<TreeUpdater> updater{
TreeUpdater::Create("grow_histmaker", &ctx, ObjInfo{ObjInfo::kRegression})};
std::vector<HostDeviceVector<bst_node_t>> position(1);
TrainParam param;
param.Init(Args{});
updater->Update(&param, p_gradients.get(), p_dmat.get(), position, {&expected_tree});
}
auto constexpr kWorldSize = 2;
RunWithInMemoryCommunicator(kWorldSize, TestColumnSplit, kRows, kCols, std::cref(expected_tree));
}
} // namespace xgboost::tree

View File

@@ -7,6 +7,7 @@
#include <memory>
#include "../../../src/tree/param.h" // for TrainParam
#include "../helpers.h"
namespace xgboost {
@@ -75,9 +76,11 @@ class TestPredictionCache : public ::testing::Test {
RegTree tree;
std::vector<RegTree *> trees{&tree};
auto gpair = GenerateRandomGradients(n_samples_);
updater->Configure(Args{{"max_bin", "64"}});
tree::TrainParam param;
param.UpdateAllowUnknown(Args{{"max_bin", "64"}});
std::vector<HostDeviceVector<bst_node_t>> position(1);
updater->Update(&gpair, Xy_.get(), position, trees);
updater->Update(&param, &gpair, Xy_.get(), position, trees);
HostDeviceVector<float> out_prediction_cached;
out_prediction_cached.SetDevice(ctx.gpu_id);
out_prediction_cached.Resize(n_samples_);

View File

@@ -1,28 +1,26 @@
/*!
* Copyright 2018-2019 by Contributors
/**
* Copyright 2018-2023 by XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <xgboost/data.h>
#include <xgboost/host_device_vector.h>
#include <xgboost/tree_updater.h>
#include <xgboost/learner.h>
#include <gtest/gtest.h>
#include <vector>
#include <string>
#include <memory>
#include <xgboost/tree_updater.h>
#include <memory>
#include <string>
#include <vector>
#include "../../../src/tree/param.h" // for TrainParam
#include "../helpers.h"
namespace xgboost {
namespace tree {
namespace xgboost::tree {
TEST(Updater, Prune) {
int constexpr kCols = 16;
std::vector<std::pair<std::string, std::string>> cfg;
cfg.emplace_back(std::pair<std::string, std::string>("num_feature",
std::to_string(kCols)));
cfg.emplace_back(std::pair<std::string, std::string>(
"min_split_loss", "10"));
cfg.emplace_back("num_feature", std::to_string(kCols));
cfg.emplace_back("min_split_loss", "10");
// These data are just place holders.
HostDeviceVector<GradientPair> gpair =
@@ -38,28 +36,30 @@ TEST(Updater, Prune) {
tree.param.UpdateAllowUnknown(cfg);
std::vector<RegTree*> trees {&tree};
// prepare pruner
TrainParam param;
param.UpdateAllowUnknown(cfg);
std::unique_ptr<TreeUpdater> pruner(
TreeUpdater::Create("prune", &ctx, ObjInfo{ObjInfo::kRegression}));
pruner->Configure(cfg);
// loss_chg < min_split_loss;
std::vector<HostDeviceVector<bst_node_t>> position(trees.size());
tree.ExpandNode(0, 0, 0, true, 0.0f, 0.3f, 0.4f, 0.0f, 0.0f,
/*left_sum=*/0.0f, /*right_sum=*/0.0f);
pruner->Update(&gpair, p_dmat.get(), position, trees);
pruner->Update(&param, &gpair, p_dmat.get(), position, trees);
ASSERT_EQ(tree.NumExtraNodes(), 0);
// loss_chg > min_split_loss;
tree.ExpandNode(0, 0, 0, true, 0.0f, 0.3f, 0.4f, 11.0f, 0.0f,
/*left_sum=*/0.0f, /*right_sum=*/0.0f);
pruner->Update(&gpair, p_dmat.get(), position, trees);
pruner->Update(&param, &gpair, p_dmat.get(), position, trees);
ASSERT_EQ(tree.NumExtraNodes(), 2);
// loss_chg == min_split_loss;
tree.Stat(0).loss_chg = 10;
pruner->Update(&gpair, p_dmat.get(), position, trees);
pruner->Update(&param, &gpair, p_dmat.get(), position, trees);
ASSERT_EQ(tree.NumExtraNodes(), 2);
@@ -73,20 +73,20 @@ TEST(Updater, Prune) {
0, 0.5f, true, 0.3, 0.4, 0.5,
/*loss_chg=*/19.0f, 0.0f,
/*left_sum=*/0.0f, /*right_sum=*/0.0f);
cfg.emplace_back(std::make_pair("max_depth", "1"));
pruner->Configure(cfg);
pruner->Update(&gpair, p_dmat.get(), position, trees);
cfg.emplace_back("max_depth", "1");
param.UpdateAllowUnknown(cfg);
pruner->Update(&param, &gpair, p_dmat.get(), position, trees);
ASSERT_EQ(tree.NumExtraNodes(), 2);
tree.ExpandNode(tree[0].LeftChild(),
0, 0.5f, true, 0.3, 0.4, 0.5,
/*loss_chg=*/18.0f, 0.0f,
/*left_sum=*/0.0f, /*right_sum=*/0.0f);
cfg.emplace_back(std::make_pair("min_split_loss", "0"));
pruner->Configure(cfg);
pruner->Update(&gpair, p_dmat.get(), position, trees);
cfg.emplace_back("min_split_loss", "0");
param.UpdateAllowUnknown(cfg);
pruner->Update(&param, &gpair, p_dmat.get(), position, trees);
ASSERT_EQ(tree.NumExtraNodes(), 2);
}
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree

View File

@@ -23,7 +23,7 @@ TEST(QuantileHist, Partitioner) {
Context ctx;
ctx.InitAllowUnknown(Args{});
CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid};
CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
ASSERT_EQ(partitioner.base_rowid, base_rowid);
ASSERT_EQ(partitioner.Size(), 1);
ASSERT_EQ(partitioner.Partitions()[0].Size(), n_samples);
@@ -41,7 +41,7 @@ TEST(QuantileHist, Partitioner) {
{
auto min_value = gmat.cut.MinValues()[split_ind];
RegTree tree;
CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid};
CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
GetSplit(&tree, min_value, &candidates);
partitioner.UpdatePosition<false, true>(&ctx, gmat, column_indices, candidates, &tree);
ASSERT_EQ(partitioner.Size(), 3);
@@ -49,7 +49,7 @@ TEST(QuantileHist, Partitioner) {
ASSERT_EQ(partitioner[2].Size(), n_samples);
}
{
CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid};
CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false};
auto ptr = gmat.cut.Ptrs()[split_ind + 1];
float split_value = gmat.cut.Values().at(ptr / 2);
RegTree tree;

View File

@@ -1,14 +1,15 @@
/*!
* Copyright 2018-2019 by Contributors
/**
* Copyright 2018-2013 by XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <xgboost/host_device_vector.h>
#include <xgboost/tree_updater.h>
#include <gtest/gtest.h>
#include <vector>
#include <string>
#include <memory>
#include <string>
#include <vector>
#include "../../../src/tree/param.h" // for TrainParam
#include "../helpers.h"
namespace xgboost {
@@ -43,9 +44,11 @@ TEST(Updater, Refresh) {
tree.Stat(cleft).base_weight = 1.2;
tree.Stat(cright).base_weight = 1.3;
refresher->Configure(cfg);
std::vector<HostDeviceVector<bst_node_t>> position;
refresher->Update(&gpair, p_dmat.get(), position, trees);
tree::TrainParam param;
param.UpdateAllowUnknown(cfg);
refresher->Update(&param, &gpair, p_dmat.get(), position, trees);
bst_float constexpr kEps = 1e-6;
ASSERT_NEAR(-0.183392, tree[cright].LeafValue(), kEps);

View File

@@ -1,7 +1,11 @@
/**
* Copyright 2020-2023 by XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <xgboost/tree_model.h>
#include <xgboost/tree_updater.h>
#include "../../../src/tree/param.h" // for TrainParam
#include "../helpers.h"
namespace xgboost {
@@ -21,6 +25,9 @@ class UpdaterTreeStatTest : public ::testing::Test {
}
void RunTest(std::string updater) {
tree::TrainParam param;
param.Init(Args{});
Context ctx(updater == "grow_gpu_hist" ? CreateEmptyGenericParam(0)
: CreateEmptyGenericParam(Context::kCpuId));
auto up = std::unique_ptr<TreeUpdater>{
@@ -29,7 +36,7 @@ class UpdaterTreeStatTest : public ::testing::Test {
RegTree tree;
tree.param.num_feature = kCols;
std::vector<HostDeviceVector<bst_node_t>> position(1);
up->Update(&gpairs_, p_dmat_.get(), position, {&tree});
up->Update(&param, &gpairs_, p_dmat_.get(), position, {&tree});
tree.WalkTree([&tree](bst_node_t nidx) {
if (tree[nidx].IsLeaf()) {
@@ -69,28 +76,33 @@ class UpdaterEtaTest : public ::testing::Test {
void RunTest(std::string updater) {
Context ctx(updater == "grow_gpu_hist" ? CreateEmptyGenericParam(0)
: CreateEmptyGenericParam(Context::kCpuId));
float eta = 0.4;
auto up_0 = std::unique_ptr<TreeUpdater>{
TreeUpdater::Create(updater, &ctx, ObjInfo{ObjInfo::kClassification})};
up_0->Configure(Args{{"eta", std::to_string(eta)}});
up_0->Configure(Args{});
tree::TrainParam param0;
param0.Init(Args{{"eta", std::to_string(eta)}});
auto up_1 = std::unique_ptr<TreeUpdater>{
TreeUpdater::Create(updater, &ctx, ObjInfo{ObjInfo::kClassification})};
up_1->Configure(Args{{"eta", "1.0"}});
tree::TrainParam param1;
param1.Init(Args{{"eta", "1.0"}});
for (size_t iter = 0; iter < 4; ++iter) {
RegTree tree_0;
{
tree_0.param.num_feature = kCols;
std::vector<HostDeviceVector<bst_node_t>> position(1);
up_0->Update(&gpairs_, p_dmat_.get(), position, {&tree_0});
up_0->Update(&param0, &gpairs_, p_dmat_.get(), position, {&tree_0});
}
RegTree tree_1;
{
tree_1.param.num_feature = kCols;
std::vector<HostDeviceVector<bst_node_t>> position(1);
up_1->Update(&gpairs_, p_dmat_.get(), position, {&tree_1});
up_1->Update(&param1, &gpairs_, p_dmat_.get(), position, {&tree_1});
}
tree_0.WalkTree([&](bst_node_t nidx) {
if (tree_0[nidx].IsLeaf()) {
@@ -139,17 +151,18 @@ class TestMinSplitLoss : public ::testing::Test {
// test gamma
{"gamma", std::to_string(gamma)}};
tree::TrainParam param;
param.UpdateAllowUnknown(args);
Context ctx(updater == "grow_gpu_hist" ? CreateEmptyGenericParam(0)
: CreateEmptyGenericParam(Context::kCpuId));
std::cout << ctx.gpu_id << std::endl;
auto up = std::unique_ptr<TreeUpdater>{
TreeUpdater::Create(updater, &ctx, ObjInfo{ObjInfo::kRegression})};
up->Configure(args);
up->Configure({});
RegTree tree;
std::vector<HostDeviceVector<bst_node_t>> position(1);
up->Update(&gpair_, dmat_.get(), position, {&tree});
up->Update(&param, &gpair_, dmat_.get(), position, {&tree});
auto n_nodes = tree.NumExtraNodes();
return n_nodes;

View File

@@ -42,9 +42,15 @@ class TestGPUBasicModels:
def test_custom_objective(self):
self.cpu_test_bm.run_custom_objective("gpu_hist")
def test_eta_decay_gpu_hist(self):
def test_eta_decay(self):
self.cpu_test_cb.run_eta_decay('gpu_hist')
@pytest.mark.parametrize(
"objective", ["binary:logistic", "reg:absoluteerror", "reg:quantileerror"]
)
def test_eta_decay_leaf_output(self, objective) -> None:
self.cpu_test_cb.run_eta_decay_leaf_output("gpu_hist", objective)
def test_deterministic_gpu_hist(self):
kRows = 1000
kCols = 64

View File

@@ -2,6 +2,7 @@ import sys
import pytest
from hypothesis import given, settings, strategies
from xgboost.testing import no_cupy
sys.path.append("tests/python")

View File

@@ -1,10 +1,10 @@
import sys
import pytest
from xgboost.testing.metrics import check_quantile_error
import xgboost
from xgboost import testing as tm
from xgboost.testing.metrics import check_quantile_error
sys.path.append("tests/python")
import test_eval_metrics as test_em # noqa

View File

@@ -3,10 +3,10 @@ import sys
import numpy as np
import pytest
from hypothesis import assume, given, settings, strategies
from xgboost.compat import PANDAS_INSTALLED
import xgboost as xgb
from xgboost import testing as tm
from xgboost.compat import PANDAS_INSTALLED
if PANDAS_INSTALLED:
from hypothesis.extra.pandas import column, data_frames, range_indexes
@@ -215,6 +215,7 @@ class TestGPUPredict:
def test_inplace_predict_cupy(self):
self.run_inplace_predict_cupy(0)
@pytest.mark.xfail
@pytest.mark.skipif(**tm.no_cupy())
@pytest.mark.mgpu
def test_inplace_predict_cupy_specified_device(self):

View File

@@ -4,11 +4,11 @@ from typing import Any, Dict
import numpy as np
import pytest
from hypothesis import assume, given, note, settings, strategies
from xgboost.testing.params import cat_parameter_strategy, hist_parameter_strategy
from xgboost.testing.updater import check_init_estimation
import xgboost as xgb
from xgboost import testing as tm
from xgboost.testing.params import cat_parameter_strategy, hist_parameter_strategy
from xgboost.testing.updater import check_init_estimation, check_quantile_loss
sys.path.append("tests/python")
import test_updaters as test_up
@@ -209,3 +209,38 @@ class TestGPUUpdaters:
def test_init_estimation(self) -> None:
check_init_estimation("gpu_hist")
@pytest.mark.parametrize("weighted", [True, False])
def test_quantile_loss(self, weighted: bool) -> None:
check_quantile_loss("gpu_hist", weighted)
@pytest.mark.skipif(**tm.no_pandas())
def test_issue8824(self):
    """Regression test for issue #8824.

    Column sampling by node crashed because shared pointers went out of
    scope; training with ``colsample_bynode`` must complete without error.
    """
    import pandas as pd

    df = pd.DataFrame(np.random.rand(1024, 8))
    df.columns = "x" + df.columns.astype(str)
    feature_cols = df.columns
    # Binary target derived from the row sums.
    df["y"] = df.sum(axis=1) < 4
    dtrain = xgb.DMatrix(df[feature_cols], label=df["y"])

    params = {
        "max_depth": 5,
        "learning_rate": 0.05,
        "objective": "binary:logistic",
        "tree_method": "gpu_hist",
        "colsample_bytree": 0.5,
        "colsample_bylevel": 0.5,
        "colsample_bynode": 0.5,  # Causes issues
        "reg_alpha": 0.05,
        "reg_lambda": 0.005,
        "seed": 66,
        "subsample": 0.5,
        "gamma": 0.2,
        "predictor": "auto",
        "eval_metric": "auc",
    }
    # The returned booster is not inspected; finishing training is the test.
    xgb.train(dtrain=dtrain, params=params, num_boost_round=150)

View File

@@ -8,6 +8,7 @@ import pytest
import xgboost as xgb
from xgboost import testing as tm
from xgboost.testing.ranking import run_ranking_qid_df
sys.path.append("tests/python")
import test_with_sklearn as twskl # noqa
@@ -153,3 +154,10 @@ def test_classififer():
y *= 10
with pytest.raises(ValueError, match=r"Invalid classes.*"):
clf.fit(X, y)
@pytest.mark.skipif(**tm.no_pandas())
def test_ranking_qid_df():
    # Run the shared qid-based ranking check on cuDF inputs with gpu_hist.
    import cudf

    run_ranking_qid_df(cudf, "gpu_hist")

View File

@@ -1,3 +1,4 @@
import json
import os
import tempfile
from contextlib import nullcontext
@@ -355,47 +356,125 @@ class TestCallbacks:
with warning_check:
xgb.cv(param, dtrain, num_round, callbacks=[scheduler(eta_decay)])
@pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"])
def run_eta_decay_leaf_output(self, tree_method: str, objective: str) -> None:
    """Check that learning-rate decay actually affects leaf output.

    Trains two boosters whose schedules agree for rounds 0-1 and diverge
    from round 2 onward, then compares the serialized trees: tree 2 must
    match (built before divergence takes effect on it), tree 3 must differ.
    """
    # check decay has effect on leaf output.
    num_round = 4
    scheduler = xgb.callback.LearningRateScheduler

    dpath = tm.data_dir(__file__)
    dtrain = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.train"))
    dtest = xgb.DMatrix(os.path.join(dpath, "agaricus.txt.test"))
    watchlist = [(dtest, 'eval'), (dtrain, 'train')]

    param = {
        "max_depth": 2,
        "objective": objective,
        "eval_metric": "error",
        "tree_method": tree_method,
    }
    if objective == "reg:quantileerror":
        # quantile loss needs a target quantile to be well defined
        param["quantile_alpha"] = 0.3

    def eta_decay_0(i):
        # baseline schedule: eta = num_round / (round + 1)
        return num_round / (i + 1)

    bst0 = xgb.train(
        param,
        dtrain,
        num_round,
        watchlist,
        callbacks=[scheduler(eta_decay_0)],
    )

    def eta_decay_1(i: int) -> float:
        # identical to eta_decay_0 for rounds 0 and 1, diverges afterwards
        if i > 1:
            return 5.0
        return num_round / (i + 1)

    bst1 = xgb.train(
        param,
        dtrain,
        num_round,
        watchlist,
        callbacks=[scheduler(eta_decay_1)],
    )

    bst_json0 = bst0.save_raw(raw_format="json")
    bst_json1 = bst1.save_raw(raw_format="json")

    j0 = json.loads(bst_json0)
    j1 = json.loads(bst_json1)

    # Tree 2: schedules still agreed, so the trees must be identical.
    tree_2th_0 = j0["learner"]["gradient_booster"]["model"]["trees"][2]
    tree_2th_1 = j1["learner"]["gradient_booster"]["model"]["trees"][2]
    assert tree_2th_0["base_weights"] == tree_2th_1["base_weights"]
    assert tree_2th_0["split_conditions"] == tree_2th_1["split_conditions"]

    # Tree 3: built after the schedules diverged, so it must differ.
    tree_3th_0 = j0["learner"]["gradient_booster"]["model"]["trees"][3]
    tree_3th_1 = j1["learner"]["gradient_booster"]["model"]["trees"][3]
    assert tree_3th_0["base_weights"] != tree_3th_1["base_weights"]
    assert tree_3th_0["split_conditions"] != tree_3th_1["split_conditions"]
@pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"])
def test_eta_decay(self, tree_method):
    """Run the learning-rate-decay check for each tree method.

    Fix: the parametrize list contained "approx" twice, which ran the same
    case redundantly and dropped "exact" coverage (present in the earlier
    revision of this test's decorator).
    """
    self.run_eta_decay(tree_method)
@pytest.mark.parametrize(
    "tree_method,objective",
    [
        (method, loss)
        for method in ("hist", "approx")
        for loss in (
            "binary:logistic",
            "reg:absoluteerror",
            "reg:quantileerror",
        )
    ],
)
def test_eta_decay_leaf_output(self, tree_method: str, objective: str) -> None:
    # Thin parametrized wrapper: every (tree method, objective) pair gets
    # its own test case; the real check lives in run_eta_decay_leaf_output.
    self.run_eta_decay_leaf_output(tree_method, objective)
def test_check_point(self):
    """Train with TrainingCheckPoint and verify a model file per iteration,
    first in JSON format, then as a pickle (``as_pickle=True``).

    NOTE(review): this block appears to contain both the pre- and
    post-reformat variant of each checkpoint/train/assert statement (diff
    artifact), so each phase runs twice — confirm against the intended
    revision.
    """
    from sklearn.datasets import load_breast_cancer

    X, y = load_breast_cancer(return_X_y=True)
    m = xgb.DMatrix(X, y)
    with tempfile.TemporaryDirectory() as tmpdir:
        check_point = xgb.callback.TrainingCheckPoint(directory=tmpdir,
                                                      iterations=1,
                                                      name='model')
        xgb.train({'objective': 'binary:logistic'}, m,
                  num_boost_round=10,
                  verbose_eval=False,
                  callbacks=[check_point])
        # Checkpoint every iteration; default format is JSON.
        check_point = xgb.callback.TrainingCheckPoint(
            directory=tmpdir, iterations=1, name="model"
        )
        xgb.train(
            {"objective": "binary:logistic"},
            m,
            num_boost_round=10,
            verbose_eval=False,
            callbacks=[check_point],
        )
        for i in range(1, 10):
            assert os.path.exists(
                os.path.join(tmpdir, 'model_' + str(i) + '.json'))
            assert os.path.exists(os.path.join(tmpdir, "model_" + str(i) + ".json"))

        check_point = xgb.callback.TrainingCheckPoint(directory=tmpdir,
                                                      iterations=1,
                                                      as_pickle=True,
                                                      name='model')
        xgb.train({'objective': 'binary:logistic'}, m,
                  num_boost_round=10,
                  verbose_eval=False,
                  callbacks=[check_point])
        # as_pickle=True switches the checkpoint file format to .pkl.
        check_point = xgb.callback.TrainingCheckPoint(
            directory=tmpdir, iterations=1, as_pickle=True, name="model"
        )
        xgb.train(
            {"objective": "binary:logistic"},
            m,
            num_boost_round=10,
            verbose_eval=False,
            callbacks=[check_point],
        )
        for i in range(1, 10):
            assert os.path.exists(
                os.path.join(tmpdir, 'model_' + str(i) + '.pkl'))
            assert os.path.exists(os.path.join(tmpdir, "model_" + str(i) + ".pkl"))
def test_callback_list(self):
    """Ensure a user-supplied callbacks list is not mutated across calls."""
    X, y = tm.get_california_housing()
    m = xgb.DMatrix(X, y)
    callbacks = [xgb.callback.EarlyStopping(rounds=10)]
    # NOTE(review): both the pre- and post-reformat variant of the train()
    # call appear below (diff artifact) — confirm against the intended
    # revision; only one call per loop iteration is expected.
    for i in range(4):
        xgb.train({'objective': 'reg:squarederror',
                   'eval_metric': 'rmse'}, m,
                  evals=[(m, 'Train')],
                  num_boost_round=1,
                  verbose_eval=True,
                  callbacks=callbacks)
        xgb.train(
            {"objective": "reg:squarederror", "eval_metric": "rmse"},
            m,
            evals=[(m, "Train")],
            num_boost_round=1,
            verbose_eval=True,
            callbacks=callbacks,
        )
    # train() must not have appended its internal callbacks to our list.
    assert len(callbacks) == 1

View File

@@ -4,11 +4,11 @@ import numpy as np
import pytest
from hypothesis import given, settings, strategies
from scipy.sparse import csr_matrix
from xgboost.data import SingleBatchInternalIter as SingleBatch
from xgboost.testing import IteratorForTest, make_batches, non_increasing
import xgboost as xgb
from xgboost import testing as tm
from xgboost.data import SingleBatchInternalIter as SingleBatch
from xgboost.testing import IteratorForTest, make_batches, non_increasing
pytestmark = tm.timeout(30)

View File

@@ -146,6 +146,13 @@ def test_multioutput_reg() -> None:
subprocess.check_call(cmd)
@pytest.mark.skipif(**tm.no_sklearn())
def test_quantile_reg() -> None:
    # Execute the quantile-regression demo end to end; a non-zero exit
    # status raises CalledProcessError and fails the test.
    demo = os.path.join(PYTHON_DEMO_DIR, "quantile_regression.py")
    subprocess.check_call(['python', demo])
@pytest.mark.skipif(**tm.no_ubjson())
def test_json_model() -> None:
script = os.path.join(DEMO_DIR, "json-model", "json_parser.py")

View File

@@ -6,10 +6,10 @@ import pytest
import scipy.sparse
from hypothesis import given, settings, strategies
from scipy.sparse import csr_matrix, rand
from xgboost.testing.data import np_dtypes
import xgboost as xgb
from xgboost import testing as tm
from xgboost.testing.data import np_dtypes
rng = np.random.RandomState(1)

View File

@@ -1,9 +1,9 @@
import numpy as np
import pytest
from xgboost.testing.updater import get_basescore
import xgboost as xgb
from xgboost import testing as tm
from xgboost.testing.updater import get_basescore
rng = np.random.RandomState(1994)

View File

@@ -1,9 +1,9 @@
import numpy as np
import pytest
from xgboost.testing.metrics import check_quantile_error
import xgboost as xgb
from xgboost import testing as tm
from xgboost.testing.metrics import check_quantile_error
rng = np.random.RandomState(1337)

View File

@@ -51,11 +51,8 @@ class TestPickling:
def test_model_pickling_json(self):
def check(config):
updater = config["learner"]["gradient_booster"]["updater"]
if params["tree_method"] == "exact":
subsample = updater["grow_colmaker"]["train_param"]["subsample"]
else:
subsample = updater["grow_quantile_histmaker"]["train_param"]["subsample"]
tree_param = config["learner"]["gradient_booster"]["tree_train_param"]
subsample = tree_param["subsample"]
assert float(subsample) == 0.5
params = {"nthread": 8, "tree_method": "hist", "subsample": 0.5}

View File

@@ -5,11 +5,11 @@ import numpy as np
import pandas as pd
import pytest
from scipy import sparse
from xgboost.testing.data import np_dtypes, pd_dtypes
from xgboost.testing.shared import validate_leaf_output
import xgboost as xgb
from xgboost import testing as tm
from xgboost.testing.data import np_dtypes, pd_dtypes
from xgboost.testing.shared import validate_leaf_output
def run_threaded_predict(X, rows, predict_func):

View File

@@ -4,6 +4,8 @@ import numpy as np
import pytest
from hypothesis import given, settings, strategies
from scipy import sparse
import xgboost as xgb
from xgboost.testing import (
IteratorForTest,
make_batches,
@@ -15,8 +17,6 @@ from xgboost.testing import (
)
from xgboost.testing.data import np_dtypes
import xgboost as xgb
class TestQuantileDMatrix:
def test_basic(self) -> None:

View File

@@ -5,15 +5,15 @@ from typing import Any, Dict, List
import numpy as np
import pytest
from hypothesis import given, note, settings, strategies
import xgboost as xgb
from xgboost import testing as tm
from xgboost.testing.params import (
cat_parameter_strategy,
exact_parameter_strategy,
hist_parameter_strategy,
)
from xgboost.testing.updater import check_init_estimation
import xgboost as xgb
from xgboost import testing as tm
from xgboost.testing.updater import check_init_estimation, check_quantile_loss
def train_result(param, dmat, num_rounds):
@@ -447,7 +447,8 @@ class TestTreeMethod:
{
"tree_method": tree_method,
"objective": "reg:absoluteerror",
"subsample": 0.8
"subsample": 0.8,
"eta": 1.0,
},
Xy,
num_boost_round=10,
@@ -469,3 +470,7 @@ class TestTreeMethod:
def test_init_estimation(self) -> None:
check_init_estimation("hist")
@pytest.mark.parametrize("weighted", [True, False])
def test_quantile_loss(self, weighted: bool) -> None:
    # Run the shared quantile-loss check with the CPU histogram tree method,
    # once with sample weights and once without.
    check_quantile_loss("hist", weighted)

View File

@@ -3,10 +3,10 @@ from typing import Type
import numpy as np
import pytest
from test_dmatrix import set_base_margin_info
from xgboost.testing.data import pd_arrow_dtypes, pd_dtypes
import xgboost as xgb
from xgboost import testing as tm
from xgboost.testing.data import pd_arrow_dtypes, pd_dtypes
try:
import pandas as pd

View File

@@ -8,11 +8,12 @@ from typing import Callable, Optional
import numpy as np
import pytest
from sklearn.utils.estimator_checks import parametrize_with_checks
from xgboost.testing.shared import get_feature_weights, validate_data_initialization
from xgboost.testing.updater import get_basescore
import xgboost as xgb
from xgboost import testing as tm
from xgboost.testing.ranking import run_ranking_qid_df
from xgboost.testing.shared import get_feature_weights, validate_data_initialization
from xgboost.testing.updater import get_basescore
rng = np.random.RandomState(1994)
pytestmark = [pytest.mark.skipif(**tm.no_sklearn()), tm.timeout(30)]
@@ -180,6 +181,13 @@ def test_ranking_metric() -> None:
assert results["validation_0"]["roc_auc_score"][-1] > 0.6
@pytest.mark.skipif(**tm.no_pandas())
def test_ranking_qid_df():
    # Run the shared qid-based ranking check on pandas inputs with hist.
    import pandas as pd

    run_ranking_qid_df(pd, "hist")
def test_stacking_regression():
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
@@ -1018,14 +1026,18 @@ def test_XGBClassifier_resume():
def test_constraint_parameters():
    """Check interaction constraints round-trip into the saved config.

    NOTE(review): both the pre- and post-change variant of the regressor
    construction and of the config assertion appear below (diff artifact);
    the two asserts read different config paths
    (updater/grow_colmaker/train_param vs tree_train_param) — confirm
    which one matches the installed xgboost version.
    """
    reg = xgb.XGBRegressor(interaction_constraints='[[0, 1], [2, 3, 4]]')
    reg = xgb.XGBRegressor(interaction_constraints="[[0, 1], [2, 3, 4]]")
    X = np.random.randn(10, 10)
    y = np.random.randn(10)
    reg.fit(X, y)
    config = json.loads(reg.get_booster().save_config())
    assert config['learner']['gradient_booster']['updater']['grow_colmaker'][
        'train_param']['interaction_constraints'] == '[[0, 1], [2, 3, 4]]'
    assert (
        config["learner"]["gradient_booster"]["tree_train_param"][
            "interaction_constraints"
        ]
        == "[[0, 1], [2, 3, 4]]"
    )
def test_parameter_validation():

View File

@@ -3,9 +3,8 @@ import multiprocessing
import sys
import time
import xgboost.federated
import xgboost as xgb
import xgboost.federated
SERVER_KEY = 'server-key.pem'
SERVER_CERT = 'server-cert.pem'

View File

@@ -10,10 +10,10 @@ import numpy as np
import pytest
from hypothesis import given, note, settings, strategies
from hypothesis._settings import duration
from xgboost.testing.params import hist_parameter_strategy
import xgboost as xgb
from xgboost import testing as tm
from xgboost.testing.params import hist_parameter_strategy
pytestmark = [
pytest.mark.skipif(**tm.no_dask()),
@@ -42,9 +42,9 @@ try:
from dask import array as da
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from xgboost.testing.dask import check_init_estimation
from xgboost import dask as dxgb
from xgboost.testing.dask import check_init_estimation
except ImportError:
pass

View File

@@ -12,6 +12,7 @@ pytestmark = pytest.mark.skipif(**tm.no_spark())
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import SparkSession
from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
gpu_discovery_script_path = "tests/test_distributed/test_gpu_with_spark/discover_gpu.sh"

View File

@@ -21,6 +21,9 @@ import scipy
import sklearn
from hypothesis import HealthCheck, given, note, settings
from sklearn.datasets import make_classification, make_regression
import xgboost as xgb
from xgboost import testing as tm
from xgboost.data import _is_cudf_df
from xgboost.testing.params import hist_parameter_strategy
from xgboost.testing.shared import (
@@ -29,9 +32,6 @@ from xgboost.testing.shared import (
validate_leaf_output,
)
import xgboost as xgb
from xgboost import testing as tm
pytestmark = [tm.timeout(60), pytest.mark.skipif(**tm.no_dask())]
import dask
@@ -39,6 +39,7 @@ import dask.array as da
import dask.dataframe as dd
from distributed import Client, LocalCluster
from toolz import sliding_window # dependency of dask
from xgboost.dask import DaskDMatrix
from xgboost.testing.dask import check_init_estimation

View File

@@ -8,6 +8,7 @@ from xgboost import testing as tm
pytestmark = [pytest.mark.skipif(**tm.no_spark())]
from xgboost import DMatrix, QuantileDMatrix
from xgboost.spark.data import (
_read_csr_matrix_from_unwrapped_spark_vec,
alias,
@@ -15,8 +16,6 @@ from xgboost.spark.data import (
stack_series,
)
from xgboost import DMatrix, QuantileDMatrix
def test_stack() -> None:
a = pd.DataFrame({"a": [[1, 2], [3, 4]]})

View File

@@ -8,10 +8,10 @@ from typing import Generator, Sequence, Type
import numpy as np
import pytest
from xgboost.spark.data import pred_contribs
import xgboost as xgb
from xgboost import testing as tm
from xgboost.spark.data import pred_contribs
pytestmark = [tm.timeout(60), pytest.mark.skipif(**tm.no_spark())]
@@ -23,6 +23,8 @@ from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import SparkSession
from pyspark.sql import functions as spark_sql_func
from xgboost import XGBClassifier, XGBModel, XGBRegressor
from xgboost.spark import (
SparkXGBClassifier,
SparkXGBClassifierModel,
@@ -32,8 +34,6 @@ from xgboost.spark import (
)
from xgboost.spark.core import _non_booster_params
from xgboost import XGBClassifier, XGBModel, XGBRegressor
from .utils import SparkTestCase
logging.getLogger("py4j").setLevel(logging.INFO)
@@ -730,6 +730,16 @@ class TestPySparkLocal:
train_params = py_cls._get_distributed_train_params(clf_data.cls_df_train)
assert train_params["tree_method"] == "gpu_hist"
def test_classifier_with_list_eval_metric(self, clf_data: ClfData) -> None:
    # A list-valued eval_metric must be accepted through fit and transform.
    clf = SparkXGBClassifier(eval_metric=["auc", "rmse"])
    fitted = clf.fit(clf_data.cls_df_train)
    fitted.transform(clf_data.cls_df_test).collect()
def test_classifier_with_string_eval_metric(self, clf_data: ClfData) -> None:
    # A single string eval_metric must be accepted through fit and transform.
    clf = SparkXGBClassifier(eval_metric="auc")
    fitted = clf.fit(clf_data.cls_df_train)
    fitted.transform(clf_data.cls_df_test).collect()
class XgboostLocalTest(SparkTestCase):
def setUp(self):

View File

@@ -11,6 +11,7 @@ from xgboost import testing as tm
pytestmark = pytest.mark.skipif(**tm.no_spark())
from pyspark.ml.linalg import Vectors
from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
from xgboost.spark.utils import _get_max_num_concurrent_tasks
@@ -421,10 +422,10 @@ class XgboostLocalClusterTestCase(SparkLocalClusterTestCase):
self.assertTrue(hasattr(classifier, "max_depth"))
self.assertEqual(classifier.getOrDefault(classifier.max_depth), 7)
booster_config = json.loads(model.get_booster().save_config())
max_depth = booster_config["learner"]["gradient_booster"]["updater"][
"grow_histmaker"
]["train_param"]["max_depth"]
self.assertEqual(int(max_depth), 7)
max_depth = booster_config["learner"]["gradient_booster"]["tree_train_param"][
"max_depth"
]
assert int(max_depth) == 7
def test_repartition(self):
# The following test case has a few partitioned datasets that are either

View File

@@ -13,6 +13,7 @@ from xgboost import testing as tm
pytestmark = [pytest.mark.skipif(**tm.no_spark())]
from pyspark.sql import SparkSession
from xgboost.spark.utils import _get_default_params_from_func