Compare commits
1 Commits
create-pul
...
dependabot
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4df2393d59 |
41
.github/workflows/python_tests.yml
vendored
41
.github/workflows/python_tests.yml
vendored
@@ -255,44 +255,3 @@ jobs:
|
||||
shell: bash -l {0}
|
||||
run: |
|
||||
pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_spark
|
||||
|
||||
python-system-installation-on-ubuntu:
|
||||
name: Test XGBoost Python package System Installation on ${{ matrix.os }}
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: 'true'
|
||||
|
||||
- name: Set up Python 3.8
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: 3.8
|
||||
|
||||
- name: Install ninja
|
||||
run: |
|
||||
sudo apt-get update && sudo apt-get install -y ninja-build
|
||||
|
||||
- name: Build XGBoost on Ubuntu
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -GNinja
|
||||
ninja
|
||||
|
||||
- name: Copy lib to system lib
|
||||
run: |
|
||||
cp lib/* "$(python -c 'import sys; print(sys.base_prefix)')/lib"
|
||||
|
||||
- name: Install XGBoost in Virtual Environment
|
||||
run: |
|
||||
cd python-package
|
||||
pip install virtualenv
|
||||
virtualenv venv
|
||||
source venv/bin/activate && \
|
||||
pip install -v . --config-settings use_system_libxgboost=True && \
|
||||
python -c 'import xgboost'
|
||||
|
||||
17
NEWS.md
17
NEWS.md
@@ -3,23 +3,6 @@ XGBoost Change Log
|
||||
|
||||
This file records the changes in xgboost library in reverse chronological order.
|
||||
|
||||
## 1.7.6 (2023 Jun 16)
|
||||
|
||||
This is a patch release for bug fixes. The CRAN package for the R binding is kept at 1.7.5.
|
||||
|
||||
### Bug Fixes
|
||||
* Fix distributed training with mixed dense and sparse partitions. (#9272)
|
||||
* Fix monotone constraints on CPU with large trees. (#9122)
|
||||
* [spark] Make the spark model have the same UID as its estimator (#9022)
|
||||
* Optimize prediction with `QuantileDMatrix`. (#9096)
|
||||
|
||||
### Document
|
||||
* Improve doxygen (#8959)
|
||||
* Update the cuDF pip index URL. (#9106)
|
||||
|
||||
### Maintenance
|
||||
* Fix tests with pandas 2.0. (#9014)
|
||||
|
||||
## 1.7.5 (2023 Mar 30)
|
||||
This is a patch release for bug fixes.
|
||||
|
||||
|
||||
@@ -85,18 +85,9 @@ test_that("dart prediction works", {
|
||||
rnorm(100)
|
||||
|
||||
set.seed(1994)
|
||||
booster_by_xgboost <- xgboost(
|
||||
data = d,
|
||||
label = y,
|
||||
max_depth = 2,
|
||||
booster = "dart",
|
||||
rate_drop = 0.5,
|
||||
one_drop = TRUE,
|
||||
eta = 1,
|
||||
nthread = 2,
|
||||
nrounds = nrounds,
|
||||
objective = "reg:squarederror"
|
||||
)
|
||||
booster_by_xgboost <- xgboost(data = d, label = y, max_depth = 2, booster = "dart",
|
||||
rate_drop = 0.5, one_drop = TRUE,
|
||||
eta = 1, nthread = 2, nrounds = nrounds, objective = "reg:squarederror")
|
||||
pred_by_xgboost_0 <- predict(booster_by_xgboost, newdata = d, ntreelimit = 0)
|
||||
pred_by_xgboost_1 <- predict(booster_by_xgboost, newdata = d, ntreelimit = nrounds)
|
||||
expect_true(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE)))
|
||||
@@ -106,19 +97,19 @@ test_that("dart prediction works", {
|
||||
|
||||
set.seed(1994)
|
||||
dtrain <- xgb.DMatrix(data = d, info = list(label = y))
|
||||
booster_by_train <- xgb.train(
|
||||
params = list(
|
||||
booster = "dart",
|
||||
max_depth = 2,
|
||||
eta = 1,
|
||||
rate_drop = 0.5,
|
||||
one_drop = TRUE,
|
||||
nthread = 1,
|
||||
objective = "reg:squarederror"
|
||||
),
|
||||
data = dtrain,
|
||||
nrounds = nrounds
|
||||
)
|
||||
booster_by_train <- xgb.train(params = list(
|
||||
booster = "dart",
|
||||
max_depth = 2,
|
||||
eta = 1,
|
||||
rate_drop = 0.5,
|
||||
one_drop = TRUE,
|
||||
nthread = 1,
|
||||
tree_method = "exact",
|
||||
objective = "reg:squarederror"
|
||||
),
|
||||
data = dtrain,
|
||||
nrounds = nrounds
|
||||
)
|
||||
pred_by_train_0 <- predict(booster_by_train, newdata = dtrain, ntreelimit = 0)
|
||||
pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, ntreelimit = nrounds)
|
||||
pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE)
|
||||
@@ -408,7 +399,7 @@ test_that("colsample_bytree works", {
|
||||
xgb.importance(model = bst)
|
||||
# If colsample_bytree works properly, a variety of features should be used
|
||||
# in the 100 trees
|
||||
expect_gte(nrow(xgb.importance(model = bst)), 28)
|
||||
expect_gte(nrow(xgb.importance(model = bst)), 30)
|
||||
})
|
||||
|
||||
test_that("Configuration works", {
|
||||
|
||||
@@ -76,20 +76,32 @@ test_that("Models from previous versions of XGBoost can be loaded", {
|
||||
name <- m[3]
|
||||
is_rds <- endsWith(model_file, '.rds')
|
||||
is_json <- endsWith(model_file, '.json')
|
||||
# Expect an R warning when a model is loaded from RDS and it was generated by version < 1.1.x
|
||||
if (is_rds && compareVersion(model_xgb_ver, '1.1.1.1') < 0) {
|
||||
booster <- readRDS(model_file)
|
||||
expect_warning(predict(booster, newdata = pred_data))
|
||||
booster <- readRDS(model_file)
|
||||
expect_warning(run_booster_check(booster, name))
|
||||
} else {
|
||||
if (is_rds) {
|
||||
|
||||
cpp_warning <- capture.output({
|
||||
# Expect an R warning when a model is loaded from RDS and it was generated by version < 1.1.x
|
||||
if (is_rds && compareVersion(model_xgb_ver, '1.1.1.1') < 0) {
|
||||
booster <- readRDS(model_file)
|
||||
expect_warning(predict(booster, newdata = pred_data))
|
||||
booster <- readRDS(model_file)
|
||||
expect_warning(run_booster_check(booster, name))
|
||||
} else {
|
||||
booster <- xgb.load(model_file)
|
||||
if (is_rds) {
|
||||
booster <- readRDS(model_file)
|
||||
} else {
|
||||
booster <- xgb.load(model_file)
|
||||
}
|
||||
predict(booster, newdata = pred_data)
|
||||
run_booster_check(booster, name)
|
||||
}
|
||||
predict(booster, newdata = pred_data)
|
||||
run_booster_check(booster, name)
|
||||
})
|
||||
cpp_warning <- paste0(cpp_warning, collapse = ' ')
|
||||
if (is_rds && compareVersion(model_xgb_ver, '1.1.1.1') >= 0) {
|
||||
# Expect a C++ warning when a model is loaded from RDS and it was generated by old XGBoost`
|
||||
m <- grepl(paste0('.*If you are loading a serialized model ',
|
||||
'\\(like pickle in Python, RDS in R\\).*',
|
||||
'for more details about differences between ',
|
||||
'saving model and serializing.*'), cpp_warning, perl = TRUE)
|
||||
expect_true(length(m) > 0 && all(m))
|
||||
}
|
||||
})
|
||||
})
|
||||
|
||||
@@ -13,10 +13,7 @@ test_that("updating the model works", {
|
||||
watchlist <- list(train = dtrain, test = dtest)
|
||||
|
||||
# no-subsampling
|
||||
p1 <- list(
|
||||
objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2,
|
||||
updater = "grow_colmaker,prune"
|
||||
)
|
||||
p1 <- list(objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2)
|
||||
set.seed(11)
|
||||
bst1 <- xgb.train(p1, dtrain, nrounds = 10, watchlist, verbose = 0)
|
||||
tr1 <- xgb.model.dt.tree(model = bst1)
|
||||
|
||||
@@ -18,11 +18,13 @@
|
||||
publisher={Institute of Mathematical Statistics}
|
||||
}
|
||||
|
||||
|
||||
@misc{
|
||||
Bache+Lichman:2013 ,
|
||||
author = "K. Bache and M. Lichman",
|
||||
year = "2013",
|
||||
title = "{UCI} Machine Learning Repository",
|
||||
url = "https://archive.ics.uci.edu/",
|
||||
institution = "University of California, Irvine, School of Information and Computer Sciences"
|
||||
url = "http://archive.ics.uci.edu/ml/",
|
||||
institution = "University of California, Irvine, School of Information and Computer Sciences"
|
||||
}
|
||||
|
||||
|
||||
@@ -11,43 +11,33 @@ import numpy as np
|
||||
|
||||
import xgboost as xgb
|
||||
|
||||
plt.rcParams.update({"font.size": 13})
|
||||
|
||||
plt.rcParams.update({'font.size': 13})
|
||||
|
||||
# Function to visualize censored labels
|
||||
def plot_censored_labels(
|
||||
X: np.ndarray, y_lower: np.ndarray, y_upper: np.ndarray
|
||||
) -> None:
|
||||
def replace_inf(x: np.ndarray, target_value: float) -> np.ndarray:
|
||||
def plot_censored_labels(X, y_lower, y_upper):
|
||||
def replace_inf(x, target_value):
|
||||
x[np.isinf(x)] = target_value
|
||||
return x
|
||||
|
||||
plt.plot(X, y_lower, "o", label="y_lower", color="blue")
|
||||
plt.plot(X, y_upper, "o", label="y_upper", color="fuchsia")
|
||||
plt.vlines(
|
||||
X,
|
||||
ymin=replace_inf(y_lower, 0.01),
|
||||
ymax=replace_inf(y_upper, 1000.0),
|
||||
label="Range for y",
|
||||
color="gray",
|
||||
)
|
||||
|
||||
plt.plot(X, y_lower, 'o', label='y_lower', color='blue')
|
||||
plt.plot(X, y_upper, 'o', label='y_upper', color='fuchsia')
|
||||
plt.vlines(X, ymin=replace_inf(y_lower, 0.01), ymax=replace_inf(y_upper, 1000),
|
||||
label='Range for y', color='gray')
|
||||
|
||||
# Toy data
|
||||
X = np.array([1, 2, 3, 4, 5]).reshape((-1, 1))
|
||||
INF = np.inf
|
||||
y_lower = np.array([10, 15, -INF, 30, 100])
|
||||
y_upper = np.array([INF, INF, 20, 50, INF])
|
||||
y_lower = np.array([ 10, 15, -INF, 30, 100])
|
||||
y_upper = np.array([INF, INF, 20, 50, INF])
|
||||
|
||||
# Visualize toy data
|
||||
plt.figure(figsize=(5, 4))
|
||||
plot_censored_labels(X, y_lower, y_upper)
|
||||
plt.ylim((6, 200))
|
||||
plt.legend(loc="lower right")
|
||||
plt.title("Toy data")
|
||||
plt.xlabel("Input feature")
|
||||
plt.ylabel("Label")
|
||||
plt.yscale("log")
|
||||
plt.legend(loc='lower right')
|
||||
plt.title('Toy data')
|
||||
plt.xlabel('Input feature')
|
||||
plt.ylabel('Label')
|
||||
plt.yscale('log')
|
||||
plt.tight_layout()
|
||||
plt.show(block=True)
|
||||
|
||||
@@ -56,83 +46,54 @@ grid_pts = np.linspace(0.8, 5.2, 1000).reshape((-1, 1))
|
||||
|
||||
# Train AFT model using XGBoost
|
||||
dmat = xgb.DMatrix(X)
|
||||
dmat.set_float_info("label_lower_bound", y_lower)
|
||||
dmat.set_float_info("label_upper_bound", y_upper)
|
||||
params = {"max_depth": 3, "objective": "survival:aft", "min_child_weight": 0}
|
||||
dmat.set_float_info('label_lower_bound', y_lower)
|
||||
dmat.set_float_info('label_upper_bound', y_upper)
|
||||
params = {'max_depth': 3, 'objective':'survival:aft', 'min_child_weight': 0}
|
||||
|
||||
accuracy_history = []
|
||||
def plot_intermediate_model_callback(env):
|
||||
"""Custom callback to plot intermediate models"""
|
||||
# Compute y_pred = prediction using the intermediate model, at current boosting iteration
|
||||
y_pred = env.model.predict(dmat)
|
||||
# "Accuracy" = the number of data points whose ranged label (y_lower, y_upper) includes
|
||||
# the corresponding predicted label (y_pred)
|
||||
acc = np.sum(np.logical_and(y_pred >= y_lower, y_pred <= y_upper)/len(X) * 100)
|
||||
accuracy_history.append(acc)
|
||||
|
||||
# Plot ranged labels as well as predictions by the model
|
||||
plt.subplot(5, 3, env.iteration + 1)
|
||||
plot_censored_labels(X, y_lower, y_upper)
|
||||
y_pred_grid_pts = env.model.predict(xgb.DMatrix(grid_pts))
|
||||
plt.plot(grid_pts, y_pred_grid_pts, 'r-', label='XGBoost AFT model', linewidth=4)
|
||||
plt.title('Iteration {}'.format(env.iteration), x=0.5, y=0.8)
|
||||
plt.xlim((0.8, 5.2))
|
||||
plt.ylim((1 if np.min(y_pred) < 6 else 6, 200))
|
||||
plt.yscale('log')
|
||||
|
||||
class PlotIntermediateModel(xgb.callback.TrainingCallback):
|
||||
"""Custom callback to plot intermediate models."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
|
||||
def after_iteration(
|
||||
self,
|
||||
model: xgb.Booster,
|
||||
epoch: int,
|
||||
evals_log: xgb.callback.TrainingCallback.EvalsLog,
|
||||
) -> bool:
|
||||
"""Run after training is finished."""
|
||||
# Compute y_pred = prediction using the intermediate model, at current boosting
|
||||
# iteration
|
||||
y_pred = model.predict(dmat)
|
||||
# "Accuracy" = the number of data points whose ranged label (y_lower, y_upper)
|
||||
# includes the corresponding predicted label (y_pred)
|
||||
acc = np.sum(
|
||||
np.logical_and(y_pred >= y_lower, y_pred <= y_upper) / len(X) * 100
|
||||
)
|
||||
accuracy_history.append(acc)
|
||||
|
||||
# Plot ranged labels as well as predictions by the model
|
||||
plt.subplot(5, 3, epoch + 1)
|
||||
plot_censored_labels(X, y_lower, y_upper)
|
||||
y_pred_grid_pts = model.predict(xgb.DMatrix(grid_pts))
|
||||
plt.plot(
|
||||
grid_pts, y_pred_grid_pts, "r-", label="XGBoost AFT model", linewidth=4
|
||||
)
|
||||
plt.title("Iteration {}".format(epoch), x=0.5, y=0.8)
|
||||
plt.xlim((0.8, 5.2))
|
||||
plt.ylim((1 if np.min(y_pred) < 6 else 6, 200))
|
||||
plt.yscale("log")
|
||||
return False
|
||||
|
||||
|
||||
res: xgb.callback.TrainingCallback.EvalsLog = {}
|
||||
plt.figure(figsize=(12, 13))
|
||||
bst = xgb.train(
|
||||
params,
|
||||
dmat,
|
||||
15,
|
||||
[(dmat, "train")],
|
||||
evals_result=res,
|
||||
callbacks=[PlotIntermediateModel()],
|
||||
)
|
||||
res = {}
|
||||
plt.figure(figsize=(12,13))
|
||||
bst = xgb.train(params, dmat, 15, [(dmat, 'train')], evals_result=res,
|
||||
callbacks=[plot_intermediate_model_callback])
|
||||
plt.tight_layout()
|
||||
plt.legend(
|
||||
loc="lower center",
|
||||
ncol=4,
|
||||
bbox_to_anchor=(0.5, 0),
|
||||
bbox_transform=plt.gcf().transFigure,
|
||||
)
|
||||
plt.legend(loc='lower center', ncol=4,
|
||||
bbox_to_anchor=(0.5, 0),
|
||||
bbox_transform=plt.gcf().transFigure)
|
||||
plt.tight_layout()
|
||||
|
||||
# Plot negative log likelihood over boosting iterations
|
||||
plt.figure(figsize=(8, 3))
|
||||
plt.figure(figsize=(8,3))
|
||||
plt.subplot(1, 2, 1)
|
||||
plt.plot(res["train"]["aft-nloglik"], "b-o", label="aft-nloglik")
|
||||
plt.xlabel("# Boosting Iterations")
|
||||
plt.legend(loc="best")
|
||||
plt.plot(res['train']['aft-nloglik'], 'b-o', label='aft-nloglik')
|
||||
plt.xlabel('# Boosting Iterations')
|
||||
plt.legend(loc='best')
|
||||
|
||||
# Plot "accuracy" over boosting iterations
|
||||
# "Accuracy" = the number of data points whose ranged label (y_lower, y_upper) includes
|
||||
# the corresponding predicted label (y_pred)
|
||||
plt.subplot(1, 2, 2)
|
||||
plt.plot(accuracy_history, "r-o", label="Accuracy (%)")
|
||||
plt.xlabel("# Boosting Iterations")
|
||||
plt.legend(loc="best")
|
||||
plt.plot(accuracy_history, 'r-o', label='Accuracy (%)')
|
||||
plt.xlabel('# Boosting Iterations')
|
||||
plt.legend(loc='best')
|
||||
plt.tight_layout()
|
||||
|
||||
plt.show()
|
||||
|
||||
@@ -82,10 +82,10 @@ def main(tmpdir: str) -> xgboost.Booster:
|
||||
missing = np.NaN
|
||||
Xy = xgboost.DMatrix(it, missing=missing, enable_categorical=False)
|
||||
|
||||
# Other tree methods including ``approx``, and ``gpu_hist`` are supported. GPU
|
||||
# behaves differently than CPU tree methods. See tutorial in doc for details.
|
||||
# Other tree methods including ``hist`` and ``gpu_hist`` also work, see tutorial in
|
||||
# doc for details.
|
||||
booster = xgboost.train(
|
||||
{"tree_method": "hist", "max_depth": 4},
|
||||
{"tree_method": "approx", "max_depth": 2},
|
||||
Xy,
|
||||
evals=[(Xy, "Train")],
|
||||
num_boost_round=10,
|
||||
|
||||
@@ -1,212 +0,0 @@
|
||||
"""
|
||||
Getting started with learning to rank
|
||||
=====================================
|
||||
|
||||
.. versionadded:: 2.0.0
|
||||
|
||||
This is a demonstration of using XGBoost for learning to rank tasks using the
|
||||
MSLR_10k_letor dataset. For more infomation about the dataset, please visit its
|
||||
`description page <https://www.microsoft.com/en-us/research/project/mslr/>`_.
|
||||
|
||||
This is a two-part demo, the first one contains a basic example of using XGBoost to
|
||||
train on relevance degree, and the second part simulates click data and enable the
|
||||
position debiasing training.
|
||||
|
||||
For an overview of learning to rank in XGBoost, please see
|
||||
:doc:`Learning to Rank </tutorials/learning_to_rank>`.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import pickle as pkl
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.datasets import load_svmlight_file
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost.testing.data import RelDataCV, simulate_clicks, sort_ltr_samples
|
||||
|
||||
|
||||
def load_mlsr_10k(data_path: str, cache_path: str) -> RelDataCV:
|
||||
"""Load the MSLR10k dataset from data_path and cache a pickle object in cache_path.
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
A list of tuples [(X, y, qid), ...].
|
||||
|
||||
"""
|
||||
root_path = os.path.expanduser(args.data)
|
||||
cacheroot_path = os.path.expanduser(args.cache)
|
||||
cache_path = os.path.join(cacheroot_path, "MSLR_10K_LETOR.pkl")
|
||||
|
||||
# Use only the Fold1 for demo:
|
||||
# Train, Valid, Test
|
||||
# {S1,S2,S3}, S4, S5
|
||||
fold = 1
|
||||
|
||||
if not os.path.exists(cache_path):
|
||||
fold_path = os.path.join(root_path, f"Fold{fold}")
|
||||
train_path = os.path.join(fold_path, "train.txt")
|
||||
valid_path = os.path.join(fold_path, "vali.txt")
|
||||
test_path = os.path.join(fold_path, "test.txt")
|
||||
X_train, y_train, qid_train = load_svmlight_file(
|
||||
train_path, query_id=True, dtype=np.float32
|
||||
)
|
||||
y_train = y_train.astype(np.int32)
|
||||
qid_train = qid_train.astype(np.int32)
|
||||
|
||||
X_valid, y_valid, qid_valid = load_svmlight_file(
|
||||
valid_path, query_id=True, dtype=np.float32
|
||||
)
|
||||
y_valid = y_valid.astype(np.int32)
|
||||
qid_valid = qid_valid.astype(np.int32)
|
||||
|
||||
X_test, y_test, qid_test = load_svmlight_file(
|
||||
test_path, query_id=True, dtype=np.float32
|
||||
)
|
||||
y_test = y_test.astype(np.int32)
|
||||
qid_test = qid_test.astype(np.int32)
|
||||
|
||||
data = RelDataCV(
|
||||
train=(X_train, y_train, qid_train),
|
||||
test=(X_test, y_test, qid_test),
|
||||
max_rel=4,
|
||||
)
|
||||
|
||||
with open(cache_path, "wb") as fd:
|
||||
pkl.dump(data, fd)
|
||||
|
||||
with open(cache_path, "rb") as fd:
|
||||
data = pkl.load(fd)
|
||||
|
||||
return data
|
||||
|
||||
|
||||
def ranking_demo(args: argparse.Namespace) -> None:
|
||||
"""Demonstration for learning to rank with relevance degree."""
|
||||
data = load_mlsr_10k(args.data, args.cache)
|
||||
|
||||
# Sort data according to query index
|
||||
X_train, y_train, qid_train = data.train
|
||||
sorted_idx = np.argsort(qid_train)
|
||||
X_train = X_train[sorted_idx]
|
||||
y_train = y_train[sorted_idx]
|
||||
qid_train = qid_train[sorted_idx]
|
||||
|
||||
X_test, y_test, qid_test = data.test
|
||||
sorted_idx = np.argsort(qid_test)
|
||||
X_test = X_test[sorted_idx]
|
||||
y_test = y_test[sorted_idx]
|
||||
qid_test = qid_test[sorted_idx]
|
||||
|
||||
ranker = xgb.XGBRanker(
|
||||
tree_method="gpu_hist",
|
||||
lambdarank_pair_method="topk",
|
||||
lambdarank_num_pair_per_sample=13,
|
||||
eval_metric=["ndcg@1", "ndcg@8"],
|
||||
)
|
||||
ranker.fit(
|
||||
X_train,
|
||||
y_train,
|
||||
qid=qid_train,
|
||||
eval_set=[(X_test, y_test)],
|
||||
eval_qid=[qid_test],
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
|
||||
def click_data_demo(args: argparse.Namespace) -> None:
|
||||
"""Demonstration for learning to rank with click data."""
|
||||
data = load_mlsr_10k(args.data, args.cache)
|
||||
train, test = simulate_clicks(data)
|
||||
assert test is not None
|
||||
|
||||
assert train.X.shape[0] == train.click.size
|
||||
assert test.X.shape[0] == test.click.size
|
||||
assert test.score.dtype == np.float32
|
||||
assert test.click.dtype == np.int32
|
||||
|
||||
X_train, clicks_train, y_train, qid_train = sort_ltr_samples(
|
||||
train.X,
|
||||
train.y,
|
||||
train.qid,
|
||||
train.click,
|
||||
train.pos,
|
||||
)
|
||||
X_test, clicks_test, y_test, qid_test = sort_ltr_samples(
|
||||
test.X,
|
||||
test.y,
|
||||
test.qid,
|
||||
test.click,
|
||||
test.pos,
|
||||
)
|
||||
|
||||
class ShowPosition(xgb.callback.TrainingCallback):
|
||||
def after_iteration(
|
||||
self,
|
||||
model: xgb.Booster,
|
||||
epoch: int,
|
||||
evals_log: xgb.callback.TrainingCallback.EvalsLog,
|
||||
) -> bool:
|
||||
config = json.loads(model.save_config())
|
||||
ti_plus = np.array(config["learner"]["objective"]["ti+"])
|
||||
tj_minus = np.array(config["learner"]["objective"]["tj-"])
|
||||
df = pd.DataFrame({"ti+": ti_plus, "tj-": tj_minus})
|
||||
print(df)
|
||||
return False
|
||||
|
||||
ranker = xgb.XGBRanker(
|
||||
n_estimators=512,
|
||||
tree_method="gpu_hist",
|
||||
learning_rate=0.01,
|
||||
reg_lambda=1.5,
|
||||
subsample=0.8,
|
||||
sampling_method="gradient_based",
|
||||
# LTR specific parameters
|
||||
objective="rank:ndcg",
|
||||
# - Enable bias estimation
|
||||
lambdarank_unbiased=True,
|
||||
# - normalization (1 / (norm + 1))
|
||||
lambdarank_bias_norm=1,
|
||||
# - Focus on the top 12 documents
|
||||
lambdarank_num_pair_per_sample=12,
|
||||
lambdarank_pair_method="topk",
|
||||
ndcg_exp_gain=True,
|
||||
eval_metric=["ndcg@1", "ndcg@3", "ndcg@5", "ndcg@10"],
|
||||
callbacks=[ShowPosition()],
|
||||
)
|
||||
ranker.fit(
|
||||
X_train,
|
||||
clicks_train,
|
||||
qid=qid_train,
|
||||
eval_set=[(X_test, y_test), (X_test, clicks_test)],
|
||||
eval_qid=[qid_test, qid_test],
|
||||
verbose=True,
|
||||
)
|
||||
ranker.predict(X_test)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Demonstration of learning to rank using XGBoost."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--data",
|
||||
type=str,
|
||||
help="Root directory of the MSLR-WEB10K data.",
|
||||
required=True,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cache",
|
||||
type=str,
|
||||
help="Directory for caching processed data.",
|
||||
required=True,
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
ranking_demo(args)
|
||||
click_data_demo(args)
|
||||
@@ -259,7 +259,7 @@ There are several ways to build and install the package from source:
|
||||
|
||||
import sys
|
||||
import pathlib
|
||||
libpath = pathlib.Path(sys.base_prefix).joinpath("lib", "libxgboost.so")
|
||||
libpath = pathlib.Path(sys.prefix).joinpath("lib", "libxgboost.so")
|
||||
assert libpath.exists()
|
||||
|
||||
Then pass ``use_system_libxgboost=True`` option to ``pip install``:
|
||||
|
||||
@@ -33,8 +33,6 @@ DMatrix
|
||||
.. doxygengroup:: DMatrix
|
||||
:project: xgboost
|
||||
|
||||
.. _c_streaming:
|
||||
|
||||
Streaming
|
||||
---------
|
||||
|
||||
|
||||
@@ -16,10 +16,8 @@ C++ Coding Guideline
|
||||
* Each line of text may contain up to 100 characters.
|
||||
* The use of C++ exceptions is allowed.
|
||||
|
||||
- Use C++17 features such as smart pointers, braced initializers, lambda functions, and ``std::thread``.
|
||||
- Use C++11 features such as smart pointers, braced initializers, lambda functions, and ``std::thread``.
|
||||
- Use Doxygen to document all the interface code.
|
||||
- We have some comments around symbols imported by headers, some of those are hinted by `include-what-you-use <https://include-what-you-use.org>`_. It's not required.
|
||||
- We use clang-tidy and clang-format. You can check their configuration in the root directory of the XGBoost source tree.
|
||||
- We have a series of automatic checks to ensure that all of our codebase complies with the Google style. Before submitting your pull request, you are encouraged to run the style checks on your machine. See :ref:`running_checks_locally`.
|
||||
|
||||
***********************
|
||||
|
||||
@@ -11,7 +11,7 @@ General Development Process
|
||||
---------------------------
|
||||
Everyone in the community is welcomed to send patches, documents, and propose new directions to the project. The key guideline here is to enable everyone in the community to get involved and participate the decision and development. When major changes are proposed, an RFC should be sent to allow discussion by the community. We encourage public discussion, archivable channels such as issues and discuss forum, so that everyone in the community can participate and review the process later.
|
||||
|
||||
Code reviews are one of the key ways to ensure the quality of the code. High-quality code reviews prevent technical debt for long-term and are crucial to the success of the project. A pull request needs to be reviewed before it gets merged. A committer who has the expertise of the corresponding area would moderate the pull request and then merge the code when it is ready. The corresponding committer could request multiple reviewers who are familiar with the area of the code. We encourage contributors to request code reviews themselves and help review each other's code -- remember everyone is volunteering their time to the community, high-quality code review itself costs as much as the actual code contribution, you could get your code quickly reviewed if you do others the same favor.
|
||||
Code reviews are one of the key ways to ensure the quality of the code. High-quality code reviews prevent technical debt for long-term and are crucial to the success of the project. A pull request needs to be reviewed before it gets merged. A committer who has the expertise of the corresponding area would moderate the pull request and the merge the code when it is ready. The corresponding committer could request multiple reviewers who are familiar with the area of the code. We encourage contributors to request code reviews themselves and help review each other's code -- remember everyone is volunteering their time to the community, high-quality code review itself costs as much as the actual code contribution, you could get your code quickly reviewed if you do others the same favor.
|
||||
|
||||
The community should strive to reach a consensus on technical decisions through discussion. We expect committers and PMCs to moderate technical discussions in a diplomatic way, and provide suggestions with clear technical reasoning when necessary.
|
||||
|
||||
@@ -25,11 +25,11 @@ Committers are individuals who are granted the write access to the project. A co
|
||||
- Quality of contributions: High-quality, readable code contributions indicated by pull requests that can be merged without a substantial code review. History of creating clean, maintainable code and including good test cases. Informative code reviews to help other contributors that adhere to a good standard.
|
||||
- Community involvement: active participation in the discussion forum, promote the projects via tutorials, talks and outreach. We encourage committers to collaborate broadly, e.g. do code reviews and discuss designs with community members that they do not interact physically.
|
||||
|
||||
The Project Management Committee(PMC) consists of a group of active committers that moderate the discussion, manage the project release, and proposes new committer/PMC members. Potential candidates are usually proposed via an internal discussion among PMCs, followed by a consensus approval, i.e. least 3 +1 votes, and no vetoes. Any veto must be accompanied by reasoning. PMCs should serve the community by upholding the community practices and guidelines in order to make XGBoost a better community for everyone. PMCs should strive to only nominate new candidates outside of their own organization.
|
||||
The Project Management Committee(PMC) consists group of active committers that moderate the discussion, manage the project release, and proposes new committer/PMC members. Potential candidates are usually proposed via an internal discussion among PMCs, followed by a consensus approval, i.e. least 3 +1 votes, and no vetoes. Any veto must be accompanied by reasoning. PMCs should serve the community by upholding the community practices and guidelines XGBoost a better community for everyone. PMCs should strive to only nominate new candidates outside of their own organization.
|
||||
|
||||
The PMC is in charge of the project's `continuous integration (CI) <https://en.wikipedia.org/wiki/Continuous_integration>`_ and testing infrastructure. Currently, we host our own Jenkins server at https://xgboost-ci.net. The PMC shall appoint committer(s) to manage the CI infrastructure. The PMC may accept 3rd-party donations and sponsorships that would defray the cost of the CI infrastructure. See :ref:`donation_policy`.
|
||||
|
||||
|
||||
Reviewers
|
||||
---------
|
||||
Reviewers are individuals who actively contributed to the project and are willing to participate in the code review of new contributions. We identify reviewers from active contributors. The committers should explicitly solicit reviews from reviewers. High-quality code reviews prevent technical debt for the long-term and are crucial to the success of the project. A pull request to the project has to be reviewed by at least one reviewer in order to be merged.
|
||||
Reviewers are individuals who actively contributed to the project and are willing to participate in the code review of new contributions. We identify reviewers from active contributors. The committers should explicitly solicit reviews from reviewers. High-quality code reviews prevent technical debt for long-term and are crucial to the success of the project. A pull request to the project has to be reviewed by at least one reviewer in order to be merged.
|
||||
|
||||
@@ -8,83 +8,23 @@ Documentation and Examples
|
||||
:backlinks: none
|
||||
:local:
|
||||
|
||||
*************
|
||||
Documentation
|
||||
*************
|
||||
*********
|
||||
Documents
|
||||
*********
|
||||
* Python and C documentation is built using `Sphinx <http://www.sphinx-doc.org/en/master/>`_.
|
||||
* Each document is written in `reStructuredText <http://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html>`_.
|
||||
* The documentation is the ``doc/`` directory.
|
||||
* You can build it locally using ``make html`` command.
|
||||
* You can build document locally to see the effect, by running
|
||||
|
||||
.. code-block:: bash
|
||||
.. code-block:: bash
|
||||
|
||||
make html
|
||||
make html
|
||||
|
||||
Run ``make help`` to learn about the other commands.
|
||||
|
||||
The online document is hosted by `Read the Docs <https://readthedocs.org/>`__ where the imported project is managed by `Hyunsu Cho <https://github.com/hcho3>`__ and `Jiaming Yuan <https://github.com/trivialfis>`__.
|
||||
|
||||
=========================================
|
||||
Build the Python Docs using pip and Conda
|
||||
=========================================
|
||||
|
||||
#. Create a conda environment.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
conda create -n xgboost-docs --yes python=3.10
|
||||
|
||||
.. note:: Python 3.10 is required by `xgboost_ray <https://github.com/ray-project/xgboost_ray>`__ package.
|
||||
|
||||
#. Activate the environment
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
conda activate xgboost-docs
|
||||
|
||||
#. Install required packages (in the current environment) using ``pip`` command.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install -r requirements.txt
|
||||
|
||||
.. note::
|
||||
It is currently not possible to install the required packages using ``conda``
|
||||
due to ``xgboost_ray`` being unavailable in conda channels.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
conda install --file requirements.txt --yes -c conda-forge
|
||||
|
||||
|
||||
#. (optional) Install `graphviz <https://www.graphviz.org/>`__
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
conda install graphviz --yes
|
||||
|
||||
#. Eventually, build the docs.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
make html
|
||||
|
||||
You should see the following messages in the console:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ make html
|
||||
sphinx-build -b html -d _build/doctrees . _build/html
|
||||
Running Sphinx v6.2.1
|
||||
...
|
||||
The HTML pages are in _build/html.
|
||||
|
||||
Build finished. The HTML pages are in _build/html.
|
||||
inside the ``doc/`` directory. The online document is hosted by `Read the Docs <https://readthedocs.org/>`__ where the imported project is managed by `Hyunsu Cho <https://github.com/hcho3>`__ and `Jiaming Yuan <https://github.com/trivialfis>`__.
|
||||
|
||||
********
|
||||
Examples
|
||||
********
|
||||
* Use cases and examples are in `demo <https://github.com/dmlc/xgboost/tree/master/demo>`_ directory.
|
||||
* Use cases and examples will be in `demo <https://github.com/dmlc/xgboost/tree/master/demo>`_.
|
||||
* We are super excited to hear about your story. If you have blog posts,
|
||||
tutorials, or code solutions using XGBoost, please tell us, and we will add
|
||||
a link in the example pages.
|
||||
|
||||
@@ -45,7 +45,7 @@ XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
model.set_param({"gpu_id": "0", "tree_method": "gpu_hist"})
|
||||
model.set_param({"predictor": "gpu_predictor"})
|
||||
shap_values = model.predict(dtrain, pred_contribs=True)
|
||||
shap_interaction_values = model.predict(dtrain, pred_interactions=True)
|
||||
|
||||
|
||||
@@ -23,8 +23,8 @@ Installation
|
||||
:local:
|
||||
:backlinks: none
|
||||
|
||||
Checkout the :doc:`Installation Guide </install>` for how to install the jvm package, or
|
||||
:doc:`Building from Source </build>` on how to build it from the sources.
|
||||
Checkout the :doc:`Installation Guide </install>` for how to install jvm package, or
|
||||
:doc:`Building from Source </build>` on how to build it from source.
|
||||
|
||||
********
|
||||
Contents
|
||||
|
||||
@@ -199,6 +199,18 @@ Parameters for Tree Booster
|
||||
- Maximum number of discrete bins to bucket continuous features.
|
||||
- Increasing this number improves the optimality of splits at the cost of higher computation time.
|
||||
|
||||
* ``predictor``, [default= ``auto``]
|
||||
|
||||
- The type of predictor algorithm to use. Provides the same results but allows the use of GPU or CPU.
|
||||
|
||||
- ``auto``: Configure predictor based on heuristics.
|
||||
- ``cpu_predictor``: Multicore CPU prediction algorithm.
|
||||
- ``gpu_predictor``: Prediction using GPU. Used when ``tree_method`` is ``gpu_hist``.
|
||||
When ``predictor`` is set to default value ``auto``, the ``gpu_hist`` tree method is
|
||||
able to provide GPU based prediction without copying training data to GPU memory.
|
||||
If ``gpu_predictor`` is explicitly specified, then all data is copied into GPU, only
|
||||
recommended for performing prediction tasks.
|
||||
|
||||
* ``num_parallel_tree``, [default=1]
|
||||
|
||||
- Number of parallel trees constructed during each iteration. This option is used to support boosted random forest.
|
||||
|
||||
@@ -87,6 +87,15 @@ with the native Python interface :py:meth:`xgboost.Booster.predict` and
|
||||
behavior. Also the ``save_best`` parameter from :py:obj:`xgboost.callback.EarlyStopping`
|
||||
might be useful.
|
||||
|
||||
*********
|
||||
Predictor
|
||||
*********
|
||||
|
||||
There are 2 predictors in XGBoost (3 if you have the one-api plugin enabled), namely
|
||||
``cpu_predictor`` and ``gpu_predictor``. The default option is ``auto`` so that XGBoost
|
||||
can employ some heuristics for saving GPU memory during training. They might have slight
|
||||
different outputs due to floating point errors.
|
||||
|
||||
|
||||
***********
|
||||
Base Margin
|
||||
@@ -125,6 +134,15 @@ it. Be aware that the output of in-place prediction depends on input data type,
|
||||
input is on GPU data output is :py:obj:`cupy.ndarray`, otherwise a :py:obj:`numpy.ndarray`
|
||||
is returned.
|
||||
|
||||
****************
|
||||
Categorical Data
|
||||
****************
|
||||
|
||||
Other than users performing encoding, XGBoost has experimental support for categorical
|
||||
data using ``gpu_hist`` and ``gpu_predictor``. No special operation needs to be done on
|
||||
input test data since the information about categories is encoded into the model during
|
||||
training.
|
||||
|
||||
*************
|
||||
Thread Safety
|
||||
*************
|
||||
@@ -141,6 +159,7 @@ instance we might accidentally call ``clf.set_params()`` inside a predict functi
|
||||
|
||||
def predict_fn(clf: xgb.XGBClassifier, X):
|
||||
X = preprocess(X)
|
||||
clf.set_params(predictor="gpu_predictor") # NOT safe!
|
||||
clf.set_params(n_jobs=1) # NOT safe!
|
||||
return clf.predict_proba(X, iteration_range=(0, 10))
|
||||
|
||||
|
||||
@@ -54,9 +54,6 @@ on a dask cluster:
|
||||
y = da.random.random(size=(num_obs, 1), chunks=(1000, 1))
|
||||
|
||||
dtrain = xgb.dask.DaskDMatrix(client, X, y)
|
||||
# or
|
||||
# dtrain = xgb.dask.DaskQuantileDMatrix(client, X, y)
|
||||
# `DaskQuantileDMatrix` is available for the `hist` and `gpu_hist` tree method.
|
||||
|
||||
output = xgb.dask.train(
|
||||
client,
|
||||
@@ -148,8 +145,8 @@ Also for inplace prediction:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# where X is a dask DataFrame or dask Array backed by cupy or cuDF.
|
||||
booster.set_param({"gpu_id": "0"})
|
||||
booster.set_param({'predictor': 'gpu_predictor'})
|
||||
# where X is a dask DataFrame or dask Array containing cupy or cuDF backed data.
|
||||
prediction = xgb.dask.inplace_predict(client, booster, X)
|
||||
|
||||
When input is ``da.Array`` object, output is always ``da.Array``. However, if the input
|
||||
|
||||
@@ -2,34 +2,11 @@
|
||||
Using XGBoost External Memory Version
|
||||
#####################################
|
||||
|
||||
When working with large datasets, training XGBoost models can be challenging as the entire
|
||||
dataset needs to be loaded into memory. This can be costly and sometimes
|
||||
infeasible. Starting from 1.5, users can define a custom iterator to load data in chunks
|
||||
for running XGBoost algorithms. External memory can be used for both training and
|
||||
prediction, but training is the primary use case and it will be our focus in this
|
||||
tutorial. For prediction and evaluation, users can iterate through the data themselves
|
||||
while training requires the full dataset to be loaded into the memory.
|
||||
|
||||
During training, there are two different modes for external memory support available in
|
||||
XGBoost, one for CPU-based algorithms like ``hist`` and ``approx``, another one for the
|
||||
GPU-based training algorithm. We will introduce them in the following sections.
|
||||
|
||||
.. note::
|
||||
|
||||
Training on data from external memory is not supported by the ``exact`` tree method.
|
||||
|
||||
.. note::
|
||||
|
||||
The feature is still experimental as of 2.0. The performance is not well optimized.
|
||||
|
||||
The external memory support has gone through multiple iterations and is still under heavy
|
||||
development. Like the :py:class:`~xgboost.QuantileDMatrix` with
|
||||
:py:class:`~xgboost.DataIter`, XGBoost loads data batch-by-batch using a custom iterator
|
||||
supplied by the user. However, unlike the :py:class:`~xgboost.QuantileDMatrix`, external
|
||||
memory will not concatenate the batches unless GPU is used (it uses a hybrid approach,
|
||||
more details follow). Instead, it will cache all batches on the external memory and fetch
|
||||
them on-demand. Go to the end of the document to see a comparison between
|
||||
`QuantileDMatrix` and external memory.
|
||||
XGBoost supports loading data from external memory using its built-in data parser. And
|
||||
starting from version 1.5, users can also define a custom iterator to load data in chunks.
|
||||
The feature is still experimental and not yet ready for production use. In this tutorial
|
||||
we will introduce both methods. Please note that training on data from external memory is
|
||||
not supported by ``exact`` tree method.
|
||||
|
||||
*************
|
||||
Data Iterator
|
||||
@@ -38,8 +15,8 @@ Data Iterator
|
||||
Starting from XGBoost 1.5, users can define their own data loader using Python or C
|
||||
interface. There are some examples in the ``demo`` directory for quick start. This is a
|
||||
generalized version of text input external memory, where users no longer need to prepare a
|
||||
text file that XGBoost recognizes. To enable the feature, users need to define a data
|
||||
iterator with 2 class methods: ``next`` and ``reset``, then pass it into the ``DMatrix``
|
||||
text file that XGBoost recognizes. To enable the feature, users need to define a data
|
||||
iterator with 2 class methods ``next`` and ``reset`` then pass it into ``DMatrix``
|
||||
constructor.
|
||||
|
||||
.. code-block:: python
|
||||
@@ -83,146 +60,20 @@ constructor.
|
||||
|
||||
# Other tree methods including ``hist`` and ``gpu_hist`` also work, but has some caveats
|
||||
# as noted in following sections.
|
||||
booster = xgboost.train({"tree_method": "hist"}, Xy)
|
||||
booster = xgboost.train({"tree_method": "approx"}, Xy)
|
||||
|
||||
|
||||
The above snippet is a simplified version of :ref:`sphx_glr_python_examples_external_memory.py`.
|
||||
For an example in C, please see ``demo/c-api/external-memory/``. The iterator is the
|
||||
common interface for using external memory with XGBoost, you can pass the resulting
|
||||
``DMatrix`` object for training, prediction, and evaluation.
|
||||
|
||||
It is important to set the batch size based on the memory available. A good starting point
|
||||
is to set the batch size to 10GB per batch if you have 64GB of memory. It is *not*
|
||||
recommended to set small batch sizes like 32 samples per batch, as this can seriously hurt
|
||||
performance in gradient boosting.
|
||||
|
||||
***********
|
||||
CPU Version
|
||||
***********
|
||||
|
||||
In the previous section, we demonstrated how to train a tree-based model using the
|
||||
``hist`` tree method on a CPU. This method involves iterating through data batches stored
|
||||
in a cache during tree construction. For optimal performance, we recommend using the
|
||||
``grow_policy=depthwise`` setting, which allows XGBoost to build an entire layer of tree
|
||||
nodes with only a few batch iterations. Conversely, using the ``lossguide`` policy
|
||||
requires XGBoost to iterate over the data set for each tree node, resulting in slower
|
||||
performance.
|
||||
|
||||
If external memory is used, the performance of CPU training is limited by IO
|
||||
(input/output) speed. This means that the disk IO speed primarily determines the training
|
||||
speed. During benchmarking, we used an NVMe connected to a PCIe-4 slot, other types of
|
||||
storage can be too slow for practical usage. In addition, your system may perform caching
|
||||
to reduce the overhead of file reading.
|
||||
|
||||
**********************************
|
||||
GPU Version (GPU Hist tree method)
|
||||
**********************************
|
||||
|
||||
External memory is supported by GPU algorithms (i.e. when ``tree_method`` is set to
|
||||
``gpu_hist``). However, the algorithm used for GPU is different from the one used for
|
||||
CPU. When training on a CPU, the tree method iterates through all batches from external
|
||||
memory for each step of the tree construction algorithm. On the other hand, the GPU
|
||||
algorithm uses a hybrid approach. It iterates through the data during the beginning of
|
||||
each iteration and concatenates all batches into one in GPU memory. To reduce overall
|
||||
memory usage, users can utilize subsampling. The GPU hist tree method supports
|
||||
`gradient-based sampling`, enabling users to set a low sampling rate without compromising
|
||||
accuracy.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
param = {
|
||||
...
|
||||
'subsample': 0.2,
|
||||
'sampling_method': 'gradient_based',
|
||||
}
|
||||
|
||||
For more information about the sampling algorithm and its use in external memory training,
|
||||
see `this paper <https://arxiv.org/abs/2005.09148>`_.
|
||||
|
||||
.. warning::
|
||||
|
||||
When GPU is running out of memory during iteration on external memory, user might
|
||||
receive a segfault instead of an OOM exception.
|
||||
|
||||
.. _ext_remarks:
|
||||
|
||||
*******
|
||||
Remarks
|
||||
*******
|
||||
|
||||
When using external memory with XGBoost, data is divided into smaller chunks so that only
|
||||
a fraction of it needs to be stored in memory at any given time. It's important to note
|
||||
that this method only applies to the predictor data (``X``), while other data, like labels
|
||||
and internal runtime structures are concatenated. This means that memory reduction is most
|
||||
effective when dealing with wide datasets where ``X`` is significantly larger in size
|
||||
compared to other data like ``y``, while it has little impact on slim datasets.
|
||||
|
||||
As one might expect, fetching data on-demand puts significant pressure on the storage
|
||||
device. Today's computing device can process way more data than a storage can read in a
|
||||
single unit of time. The ratio is at the order of magnitudes. A GPU is capable of processing
|
||||
hundreds of gigabytes of floating-point data in a split second. On the other hand, a
|
||||
four-lane NVMe storage connected to a PCIe-4 slot usually has about 6GB/s of data transfer
|
||||
rate. As a result, the training is likely to be severely bounded by your storage
|
||||
device. Before adopting the external memory solution, some back-of-envelope calculations
|
||||
might help you see whether it's viable. For instance, if your NVMe drive can transfer 4GB
|
||||
(a fairly practical number) of data per second and you have a 100GB of data in compressed
|
||||
XGBoost cache (which corresponds to a dense float32 numpy array with the size of 200GB,
|
||||
give or take). A tree with depth 8 needs at least 16 iterations through the data when the
|
||||
parameter is right. You need about 14 minutes to train a single tree without accounting
|
||||
for some other overheads and assume the computation overlaps with the IO. If your dataset
|
||||
happens to have TB-level size, then you might need thousands of trees to get a generalized
|
||||
model. These calculations can help you get an estimate on the expected training time.
|
||||
|
||||
However, sometimes we can ameliorate this limitation. One should also consider that the OS
|
||||
(mostly talking about the Linux kernel) can usually cache the data on host memory. It only
|
||||
evicts pages when new data comes in and there's no room left. In practice, at least some
|
||||
portion of the data can persist on the host memory throughout the entire training
|
||||
session. We are aware of this cache when optimizing the external memory fetcher. The
|
||||
compressed cache is usually smaller than the raw input data, especially when the input is
|
||||
dense without any missing value. If the host memory can fit a significant portion of this
|
||||
compressed cache, then the performance should be decent after initialization. Our
|
||||
development so far focuses on two fronts of optimization for external memory:
|
||||
|
||||
- Avoid iterating through the data whenever appropriate.
|
||||
- If the OS can cache the data, the performance should be close to in-core training.
|
||||
|
||||
Starting with XGBoost 2.0, the implementation of external memory uses ``mmap``. It is not
|
||||
tested against system errors like disconnected network devices (`SIGBUS`). In the face of
|
||||
a bus error, you will see a hard crash and need to clean up the cache files. If the
|
||||
training session might take a long time and you are using solutions like NVMe-oF, we
|
||||
recommend checkpointing your model periodically. Also, it's worth noting that most tests
|
||||
have been conducted on Linux distributions.
|
||||
|
||||
|
||||
Another important point to keep in mind is that creating the initial cache for XGBoost may
|
||||
take some time. The interface to external memory is through custom iterators, which we can
|
||||
not assume to be thread-safe. Therefore, initialization is performed sequentially. Using
|
||||
the `xgboost.config_context` with `verbosity=2` can give you some information on what
|
||||
XGBoost is doing during the wait if you don't mind the extra output.
|
||||
|
||||
*******************************
|
||||
Compared to the QuantileDMatrix
|
||||
*******************************
|
||||
|
||||
Passing an iterator to the :py:class:`~xgboost.QuantileDmatrix` enables direct
|
||||
construction of `QuantileDmatrix` with data chunks. On the other hand, if it's passed to
|
||||
:py:class:`~xgboost.DMatrix`, it instead enables the external memory feature. The
|
||||
:py:class:`~xgboost.QuantileDmatrix` concatenates the data on memory after compression and
|
||||
doesn't fetch data during training. On the other hand, the external memory `DMatrix`
|
||||
fetches data batches from external memory on-demand. Use the `QuantileDMatrix` (with
|
||||
iterator if necessary) when you can fit most of your data in memory. The training would be
|
||||
an order of magnitude faster than using external memory.
|
||||
The above snippet is a simplified version of ``demo/guide-python/external_memory.py``. For
|
||||
an example in C, please see ``demo/c-api/external-memory/``.
|
||||
|
||||
****************
|
||||
Text File Inputs
|
||||
****************
|
||||
|
||||
This is the original form of external memory support, users are encouraged to use custom
|
||||
data iterator instead. There is no big difference between using external memory version of
|
||||
text input and the in-memory version. The only difference is the filename format.
|
||||
There is no big difference between using external memory version and in-memory version.
|
||||
The only difference is the filename format.
|
||||
|
||||
The external memory version takes in the following `URI
|
||||
<https://en.wikipedia.org/wiki/Uniform_Resource_Identifier>`_ format:
|
||||
The external memory version takes in the following `URI <https://en.wikipedia.org/wiki/Uniform_Resource_Identifier>`_ format:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
@@ -240,8 +91,9 @@ To load from csv files, use the following syntax:
|
||||
|
||||
where ``label_column`` should point to the csv column acting as the label.
|
||||
|
||||
If you have a dataset stored in a file similar to ``demo/data/agaricus.txt.train`` with LIBSVM
|
||||
format, the external memory support can be enabled by:
|
||||
To provide a simple example for illustration, extracting the code from
|
||||
`demo/guide-python/external_memory.py <https://github.com/dmlc/xgboost/blob/master/demo/guide-python/external_memory.py>`_. If
|
||||
you have a dataset stored in a file similar to ``agaricus.txt.train`` with LIBSVM format, the external memory support can be enabled by:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
@@ -252,3 +104,35 @@ XGBoost will first load ``agaricus.txt.train`` in, preprocess it, then write to
|
||||
more notes about text input formats, see :doc:`/tutorials/input_format`.
|
||||
|
||||
For CLI version, simply add the cache suffix, e.g. ``"../data/agaricus.txt.train?format=libsvm#dtrain.cache"``.
|
||||
|
||||
|
||||
**********************************
|
||||
GPU Version (GPU Hist tree method)
|
||||
**********************************
|
||||
External memory is supported in GPU algorithms (i.e. when ``tree_method`` is set to ``gpu_hist``).
|
||||
|
||||
If you are still getting out-of-memory errors after enabling external memory, try subsampling the
|
||||
data to further reduce GPU memory usage:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
param = {
|
||||
...
|
||||
'subsample': 0.1,
|
||||
'sampling_method': 'gradient_based',
|
||||
}
|
||||
|
||||
For more information, see `this paper <https://arxiv.org/abs/2005.09148>`_. Internally
|
||||
the tree method still concatenate all the chunks into 1 final histogram index due to
|
||||
performance reason, but in compressed format. So its scalability has an upper bound but
|
||||
still has lower memory cost in general.
|
||||
|
||||
***********
|
||||
CPU Version
|
||||
***********
|
||||
|
||||
For CPU histogram based tree methods (``approx``, ``hist``) it's recommended to use
|
||||
``grow_policy=depthwise`` for performance reason. Iterating over data batches is slow,
|
||||
with ``depthwise`` policy XGBoost can build an entire layer of tree nodes with a few
|
||||
iterations, while with ``lossguide`` XGBoost needs to iterate over the data set for each
|
||||
tree node.
|
||||
|
||||
@@ -11,22 +11,21 @@ See `Awesome XGBoost <https://github.com/dmlc/xgboost/tree/master/demo>`_ for mo
|
||||
|
||||
model
|
||||
saving_model
|
||||
learning_to_rank
|
||||
dart
|
||||
monotonic
|
||||
feature_interaction_constraint
|
||||
aft_survival_analysis
|
||||
categorical
|
||||
multioutput
|
||||
rf
|
||||
kubernetes
|
||||
Distributed XGBoost with XGBoost4J-Spark <https://xgboost.readthedocs.io/en/latest/jvm/xgboost4j_spark_tutorial.html>
|
||||
Distributed XGBoost with XGBoost4J-Spark-GPU <https://xgboost.readthedocs.io/en/latest/jvm/xgboost4j_spark_gpu_tutorial.html>
|
||||
dask
|
||||
spark_estimator
|
||||
ray
|
||||
external_memory
|
||||
dart
|
||||
monotonic
|
||||
rf
|
||||
feature_interaction_constraint
|
||||
aft_survival_analysis
|
||||
c_api_tutorial
|
||||
input_format
|
||||
param_tuning
|
||||
external_memory
|
||||
custom_metric_obj
|
||||
categorical
|
||||
multioutput
|
||||
|
||||
@@ -1,202 +0,0 @@
|
||||
################
|
||||
Learning to Rank
|
||||
################
|
||||
|
||||
**Contents**
|
||||
|
||||
.. contents::
|
||||
:local:
|
||||
:backlinks: none
|
||||
|
||||
********
|
||||
Overview
|
||||
********
|
||||
Often in the context of information retrieval, learning-to-rank aims to train a model that arranges a set of query results into an ordered list `[1] <#references>`__. For supervised learning-to-rank, the predictors are sample documents encoded as a feature matrix, and the labels are relevance degree for each sample. Relevance degree can be multi-level (graded) or binary (relevant or not). The training samples are often grouped by their query index with each query group containing multiple query results.
|
||||
|
||||
XGBoost implements learning to rank through a set of objective functions and performance metrics. The default objective is ``rank:ndcg`` based on the ``LambdaMART`` `[2] <#references>`__ algorithm, which in turn is an adaptation of the ``LambdaRank`` `[3] <#references>`__ framework to gradient boosting trees. For a history and a summary of the algorithm, see `[5] <#references>`__. The implementation in XGBoost features deterministic GPU computation, distributed training, position debiasing and two different pair construction strategies.
|
||||
|
||||
************************************
|
||||
Training with the Pairwise Objective
|
||||
************************************
|
||||
``LambdaMART`` is a pairwise ranking model, meaning that it compares the relevance degree for every pair of samples in a query group and calculates a proxy gradient for each pair. The default objective ``rank:ndcg`` uses the surrogate gradient derived from the ``ndcg`` metric. To train an XGBoost model, we need an additional sorted array called ``qid`` for specifying the query group of input samples. An example input would look like this:
|
||||
|
||||
+-------+-----------+---------------+
|
||||
| QID | Label | Features |
|
||||
+=======+===========+===============+
|
||||
| 1 | 0 | :math:`x_1` |
|
||||
+-------+-----------+---------------+
|
||||
| 1 | 1 | :math:`x_2` |
|
||||
+-------+-----------+---------------+
|
||||
| 1 | 0 | :math:`x_3` |
|
||||
+-------+-----------+---------------+
|
||||
| 2 | 0 | :math:`x_4` |
|
||||
+-------+-----------+---------------+
|
||||
| 2 | 1 | :math:`x_5` |
|
||||
+-------+-----------+---------------+
|
||||
| 2 | 1 | :math:`x_6` |
|
||||
+-------+-----------+---------------+
|
||||
| 2 | 1 | :math:`x_7` |
|
||||
+-------+-----------+---------------+
|
||||
|
||||
Notice that the samples are sorted based on their query index in a non-decreasing order. In the above example, the first three samples belong to the first query and the next four samples belong to the second. For the sake of simplicity, we will use a synthetic binary learning-to-rank dataset in the following code snippets, with binary labels representing whether the result is relevant or not, and randomly assign the query group index to each sample. For an example that uses a real world dataset, please see :ref:`sphx_glr_python_examples_learning_to_rank.py`.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from sklearn.datasets import make_classification
|
||||
import numpy as np
|
||||
|
||||
import xgboost as xgb
|
||||
|
||||
# Make a synthetic ranking dataset for demonstration
|
||||
seed = 1994
|
||||
X, y = make_classification(random_state=seed)
|
||||
rng = np.random.default_rng(seed)
|
||||
n_query_groups = 3
|
||||
qid = rng.integers(0, 3, size=X.shape[0])
|
||||
|
||||
# Sort the inputs based on query index
|
||||
sorted_idx = np.argsort(qid)
|
||||
X = X[sorted_idx, :]
|
||||
y = y[sorted_idx]
|
||||
|
||||
The simplest way to train a ranking model is by using the scikit-learn estimator interface. Continuing the previous snippet, we can train a simple ranking model without tuning:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
ranker = xgb.XGBRanker(tree_method="hist", lambdarank_num_pair_per_sample=8, objective="rank:ndcg", lambdarank_pair_method="topk")
|
||||
ranker.fit(X, y, qid=qid)
|
||||
|
||||
Please note that, as of writing, there's no learning-to-rank interface in scikit-learn. As a result, the :py:class:`xgboost.XGBRanker` class does not fully conform the scikit-learn estimator guideline and can not be directly used with some of its utility functions. For instances, the ``auc_score`` and ``ndcg_score`` in scikit-learn don't consider query group information nor the pairwise loss. Most of the metrics are implemented as part of XGBoost, but to use scikit-learn utilities like :py:func:`sklearn.model_selection.cross_validation`, we need to make some adjustments in order to pass the ``qid`` as an additional parameter for :py:meth:`xgboost.XGBRanker.score`. Given a data frame ``X`` (either pandas or cuDF), add the column ``qid`` as follows:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
df = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
|
||||
df["qid"] = qid
|
||||
ranker.fit(df, y) # No need to pass qid as a separate argument
|
||||
|
||||
from sklearn.model_selection import StratifiedGroupKFold, cross_val_score
|
||||
# Works with cv in scikit-learn, along with HPO utilities like GridSearchCV
|
||||
kfold = StratifiedGroupKFold(shuffle=False)
|
||||
cross_val_score(ranker, df, y, cv=kfold, groups=df.qid)
|
||||
|
||||
The above snippets build a model using ``LambdaMART`` with the ``NDCG@8`` metric. The outputs of a ranker are relevance scores:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
scores = ranker.predict(X)
|
||||
sorted_idx = np.argsort(scores)[::-1]
|
||||
# Sort the relevance scores from most relevant to least relevant
|
||||
scores = scores[sorted_idx]
|
||||
|
||||
|
||||
*************
|
||||
Position Bias
|
||||
*************
|
||||
|
||||
.. versionadded:: 2.0.0
|
||||
|
||||
.. note::
|
||||
|
||||
The feature is considered experimental. This is a heated research area, and your input is much appreciated!
|
||||
|
||||
Obtaining real relevance degrees for query results is an expensive and strenuous process, requiring human labelers to label all results one by one. When such a labeling task is infeasible, we might want to train the learning-to-rank model on user click data instead, as it is relatively easy to collect. Another advantage of using click data directly is that it can reflect the most up-to-date user preferences `[1] <#references>`__. However, user clicks are often biased, as users tend to choose results that are displayed in higher positions. User clicks are also noisy, where users might accidentally click on irrelevant documents. To ameliorate these issues, XGBoost implements the ``Unbiased LambdaMART`` `[4] <#references>`__ algorithm to debias the position-dependent click data. The feature can be enabled by the ``lambdarank_unbiased`` parameter; see :ref:`ltr-param` for related options and :ref:`sphx_glr_python_examples_learning_to_rank.py` for a worked example with simulated user clicks.
|
||||
|
||||
****
|
||||
Loss
|
||||
****
|
||||
|
||||
XGBoost implements different ``LambdaMART`` objectives based on different metrics. We list them here as a reference. Other than those used as objective function, XGBoost also implements metrics like ``pre`` (for precision) for evaluation. See :doc:`parameters </parameter>` for available options and the following sections for how to choose these objectives based on the amount of effective pairs.
|
||||
|
||||
* NDCG
|
||||
|
||||
`Normalized Discounted Cumulative Gain` ``NDCG`` can be used with both binary relevance and multi-level relevance. If you are not sure about your data, this metric can be used as the default. The name for the objective is ``rank:ndcg``.
|
||||
|
||||
|
||||
* MAP
|
||||
|
||||
`Mean average precision` ``MAP`` is a binary measure. It can be used when the relevance label is 0 or 1. The name for the objective is ``rank:map``.
|
||||
|
||||
|
||||
* Pairwise
|
||||
|
||||
The `LambdaMART` algorithm scales the logistic loss with learning to rank metrics like ``NDCG`` in the hope of including ranking information into the loss function. The ``rank:pairwise`` loss is the original version of the pairwise loss, also known as the `RankNet loss` `[7] <#references>`__ or the `pairwise logistic loss`. Unlike the ``rank:map`` and the ``rank:ndcg``, no scaling is applied (:math:`|\Delta Z_{ij}| = 1`).
|
||||
|
||||
Whether scaling with a LTR metric is actually more effective is still up for debate; `[8] <#references>`__ provides a theoretical foundation for general lambda loss functions and some insights into the framework.
|
||||
|
||||
******************
|
||||
Constructing Pairs
|
||||
******************
|
||||
|
||||
There are two implemented strategies for constructing document pairs for :math:`\lambda`-gradient calculation. The first one is the ``mean`` method, another one is the ``topk`` method. The preferred strategy can be specified by the ``lambdarank_pair_method`` parameter.
|
||||
|
||||
For the ``mean`` strategy, XGBoost samples ``lambdarank_num_pair_per_sample`` pairs for each document in a query list. For example, given a list of 3 documents and ``lambdarank_num_pair_per_sample`` is set to 2, XGBoost will randomly sample 6 pairs, assuming the labels for these documents are different. On the other hand, if the pair method is set to ``topk``, XGBoost constructs about :math:`k \times |query|` number of pairs with :math:`|query|` pairs for each sample at the top :math:`k = lambdarank\_num\_pair` position. The number of pairs counted here is an approximation since we skip pairs that have the same label.
|
||||
|
||||
*********************
|
||||
Obtaining Good Result
|
||||
*********************
|
||||
|
||||
Learning to rank is a sophisticated task and an active research area. It's not trivial to train a model that generalizes well. There are multiple loss functions available in XGBoost along with a set of hyperparameters. This section contains some hints for how to choose hyperparameters as a starting point. One can further optimize the model by tuning these hyperparameters.
|
||||
|
||||
The first question would be how to choose an objective that matches the task at hand. If your input data has multi-level relevance degrees, then either ``rank:ndcg`` or ``rank:pairwise`` should be used. However, when the input has binary labels, we have multiple options based on the target metric. `[6] <#references>`__ provides some guidelines on this topic and users are encouraged to see the analysis done in their work. The choice should be based on the number of `effective pairs`, which refers to the number of pairs that can generate non-zero gradient and contribute to training. `LambdaMART` with ``MRR`` has the least amount of effective pairs as the :math:`\lambda`-gradient is only non-zero when the pair contains a non-relevant document ranked higher than the top relevant document. As a result, it's not implemented in XGBoost. Since ``NDCG`` is a multi-level metric, it usually generates more effective pairs than ``MAP``.
|
||||
|
||||
However, when there are sufficiently many effective pairs, it's shown in `[6] <#references>`__ that matching the target metric with the objective is of significance. When the target metric is ``MAP`` and you are using a large dataset that can provide a sufficient amount of effective pairs, ``rank:map`` can in theory yield higher ``MAP`` value than ``rank:ndcg``.
|
||||
|
||||
The consideration of effective pairs also applies to the choice of pair method (``lambdarank_pair_method``) and the number of pairs for each sample (``lambdarank_num_pair_per_sample``). For example, the mean-``NDCG`` considers more pairs than ``NDCG@10``, so the former generates more effective pairs and provides more granularity than the latter. Also, using the ``mean`` strategy can help the model generalize with random sampling. However, one might want to focus the training on the top :math:`k` documents instead of using all pairs, to better fit their real-world application.
|
||||
|
||||
When using the mean strategy for generating pairs, where the target metric (like ``NDCG``) is computed over the whole query list, users can specify how many pairs should be generated per each document, by setting the ``lambdarank_num_pair_per_sample``. XGBoost will randomly sample ``lambdarank_num_pair_per_sample`` pairs for each element in the query group (:math:`|pairs| = |query| \times num\_pairsample`). Often, setting it to 1 can produce reasonable results. In cases where performance is inadequate due to insufficient number of effective pairs being generated, set ``lambdarank_num_pair_per_sample`` to a higher value. As more document pairs are generated, more effective pairs will be generated as well.
|
||||
|
||||
On the other hand, if you are prioritizing the top :math:`k` documents, the ``lambdarank_num_pair_per_sample`` should be set slightly higher than :math:`k` (with a few more documents) to obtain a good training result.
|
||||
|
||||
**Summary** If you have large amount of training data:
|
||||
|
||||
* Use the target-matching objective.
|
||||
* Choose the ``topk`` strategy for generating document pairs (if it's appropriate for your application).
|
||||
|
||||
On the other hand, if you have comparatively small amount of training data:
|
||||
|
||||
* Select ``NDCG`` or the RankNet loss (``rank:pairwise``).
|
||||
* Choose the ``mean`` strategy for generating document pairs, to obtain more effective pairs.
|
||||
|
||||
For any method chosen, you can modify ``lambdarank_num_pair_per_sample`` to control the amount of pairs generated.
|
||||
|
||||
********************
|
||||
Distributed Training
|
||||
********************
|
||||
XGBoost implements distributed learning-to-rank with integration of multiple frameworks including Dask, Spark, and PySpark. The interface is similar to the single-node counterpart. Please refer to document of the respective XGBoost interface for details. Scattering a query group onto multiple workers is theoretically sound but can affect the model accuracy. For most of the use cases, the small discrepancy is not an issue, as the amount of training data is usually large when distributed training is used. As a result, users don't need to partition the data based on query groups. As long as each data partition is correctly sorted by query IDs, XGBoost can aggregate sample gradients accordingly.
|
||||
|
||||
*******************
|
||||
Reproducible Result
|
||||
*******************
|
||||
|
||||
Like any other tasks, XGBoost should generate reproducible results given the same hardware and software environments (and data partitions, if distributed interface is used). Even when the underlying environment has changed, the result should still be consistent. However, when the ``lambdarank_pair_method`` is set to ``mean``, XGBoost uses random sampling, and results may differ depending on the platform used. The random number generator used on Windows (Microsoft Visual C++) is different from the ones used on other platforms like Linux (GCC, Clang) [#f0]_, so the output varies significantly between these platforms.
|
||||
|
||||
.. [#f0] `minstd_rand` implementation is different on MSVC. The implementations from GCC and Thrust produce the same output.
|
||||
|
||||
**********
|
||||
References
|
||||
**********
|
||||
|
||||
[1] Tie-Yan Liu. 2009. "`Learning to Rank for Information Retrieval`_". Found. Trends Inf. Retr. 3, 3 (March 2009), 225–331.
|
||||
|
||||
[2] Christopher J. C. Burges, Robert Ragno, and Quoc Viet Le. 2006. "`Learning to rank with nonsmooth cost functions`_". In Proceedings of the 19th International Conference on Neural Information Processing Systems (NIPS'06). MIT Press, Cambridge, MA, USA, 193–200.
|
||||
|
||||
[3] Wu, Q., Burges, C.J.C., Svore, K.M. et al. "`Adapting boosting for information retrieval measures`_". Inf Retrieval 13, 254–270 (2010).
|
||||
|
||||
[4] Ziniu Hu, Yang Wang, Qu Peng, Hang Li. "`Unbiased LambdaMART: An Unbiased Pairwise Learning-to-Rank Algorithm`_". Proceedings of the 2019 World Wide Web Conference.
|
||||
|
||||
[5] Burges, Chris J.C. "`From RankNet to LambdaRank to LambdaMART: An Overview`_". MSR-TR-2010-82
|
||||
|
||||
[6] Pinar Donmez, Krysta M. Svore, and Christopher J.C. Burges. 2009. "`On the local optimality of LambdaRank`_". In Proceedings of the 32nd international ACM SIGIR conference on Research and development in information retrieval (SIGIR '09). Association for Computing Machinery, New York, NY, USA, 460–467.
|
||||
|
||||
[7] Chris Burges, Tal Shaked, Erin Renshaw, Ari Lazier, Matt Deeds, Nicole Hamilton, and Greg Hullender. 2005. "`Learning to rank using gradient descent`_". In Proceedings of the 22nd international conference on Machine learning (ICML '05). Association for Computing Machinery, New York, NY, USA, 89–96.
|
||||
|
||||
[8] Xuanhui Wang and Cheng Li and Nadav Golbandi and Mike Bendersky and Marc Najork. 2018. "`The LambdaLoss Framework for Ranking Metric Optimization`_". Proceedings of The 27th ACM International Conference on Information and Knowledge Management (CIKM '18).
|
||||
|
||||
.. _`Learning to Rank for Information Retrieval`: https://doi.org/10.1561/1500000016
|
||||
.. _`Learning to rank with nonsmooth cost functions`: https://dl.acm.org/doi/10.5555/2976456.2976481
|
||||
.. _`Adapting boosting for information retrieval measures`: https://doi.org/10.1007/s10791-009-9112-1
|
||||
.. _`Unbiased LambdaMART: An Unbiased Pairwise Learning-to-Rank Algorithm`: https://dl.acm.org/doi/10.1145/3308558.3313447
|
||||
.. _`From RankNet to LambdaRank to LambdaMART: An Overview`: https://www.microsoft.com/en-us/research/publication/from-ranknet-to-lambdarank-to-lambdamart-an-overview/
|
||||
.. _`On the local optimality of LambdaRank`: https://doi.org/10.1145/1571941.1572021
|
||||
.. _`Learning to rank using gradient descent`: https://doi.org/10.1145/1102351.1102363
|
||||
.. _`The LambdaLoss Framework for Ranking Metric Optimization`: https://dl.acm.org/doi/10.1145/3269206.3271784
|
||||
@@ -58,45 +58,3 @@ This can affect the training of XGBoost model, and there are two ways to improve
|
||||
|
||||
- In such a case, you cannot re-balance the dataset
|
||||
- Set parameter ``max_delta_step`` to a finite number (say 1) to help convergence
|
||||
|
||||
|
||||
*********************
|
||||
Reducing Memory Usage
|
||||
*********************
|
||||
|
||||
If you are using a HPO library like :py:class:`sklearn.model_selection.GridSearchCV`,
|
||||
please control the number of threads it can use. It's best to let XGBoost to run in
|
||||
parallel instead of asking `GridSearchCV` to run multiple experiments at the same
|
||||
time. For instance, creating a fold of data for cross validation can consume a significant
|
||||
amount of memory:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# This creates a copy of dataset. X and X_train are both in memory at the same time.
|
||||
|
||||
# This happens for every thread at the same time if you run `GridSearchCV` with
|
||||
# `n_jobs` larger than 1
|
||||
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
df = pd.DataFrame()
|
||||
# This creates a new copy of the dataframe, even if you specify the inplace parameter
|
||||
new_df = df.drop(...)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
array = np.array(...)
|
||||
# This may or may not make a copy of the data, depending on the type of the data
|
||||
array.astype(np.float32)
|
||||
|
||||
.. code-block::
|
||||
|
||||
# np by default uses double, do you actually need it?
|
||||
array = np.array(...)
|
||||
|
||||
You can find some more specific memory reduction practices scattered through the documents
|
||||
For instances: :doc:`/tutorials/dask`, :doc:`/gpu/index`. However, before going into
|
||||
these, being conscious about making data copies is a good starting point. It usually
|
||||
consumes a lot more memory than people expect.
|
||||
|
||||
@@ -173,6 +173,7 @@ Will print out something similar to (not actual output as it's too long for demo
|
||||
"gradient_booster": {
|
||||
"gbtree_train_param": {
|
||||
"num_parallel_tree": "1",
|
||||
"predictor": "gpu_predictor",
|
||||
"process_type": "default",
|
||||
"tree_method": "gpu_hist",
|
||||
"updater": "grow_gpu_hist",
|
||||
|
||||
@@ -146,7 +146,7 @@ using a list of feature names and the additional parameter ``use_gpu``:
|
||||
label_name = "class"
|
||||
|
||||
# get a list with feature column names
|
||||
feature_names = [x.name for x in train_df.schema if x.name != label_name]
|
||||
feature_names = [x.name for x in train_df.schema if x.name != label]
|
||||
|
||||
# create a xgboost pyspark regressor estimator and set use_gpu=True
|
||||
regressor = SparkXGBRegressor(
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
#include <dmlc/omp.h>
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
@@ -113,7 +112,7 @@ using bst_row_t = std::size_t; // NOLINT
|
||||
/*! \brief Type for tree node index. */
|
||||
using bst_node_t = std::int32_t; // NOLINT
|
||||
/*! \brief Type for ranking group index. */
|
||||
using bst_group_t = std::uint32_t; // NOLINT
|
||||
using bst_group_t = std::uint32_t; // NOLINT
|
||||
/**
|
||||
* \brief Type for indexing into output targets.
|
||||
*/
|
||||
@@ -126,10 +125,6 @@ using bst_layer_t = std::int32_t; // NOLINT
|
||||
* \brief Type for indexing trees.
|
||||
*/
|
||||
using bst_tree_t = std::int32_t; // NOLINT
|
||||
/**
|
||||
* @brief Ordinal of a CUDA device.
|
||||
*/
|
||||
using bst_d_ordinal_t = std::int16_t; // NOLINT
|
||||
|
||||
namespace detail {
|
||||
/*! \brief Implementation of gradient statistics pair. Template specialisation
|
||||
|
||||
@@ -1067,9 +1067,6 @@ XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle, DMatrixHandle dmat
|
||||
/**
|
||||
* \brief Inplace prediction from CPU dense matrix.
|
||||
*
|
||||
* \note If the booster is configured to run on a CUDA device, XGBoost falls back to run
|
||||
* prediction with DMatrix with a performance warning.
|
||||
*
|
||||
* \param handle Booster handle.
|
||||
* \param values JSON encoded __array_interface__ to values.
|
||||
* \param config See \ref XGBoosterPredictFromDMatrix for more info.
|
||||
@@ -1094,9 +1091,6 @@ XGB_DLL int XGBoosterPredictFromDense(BoosterHandle handle, char const *values,
|
||||
/**
|
||||
* \brief Inplace prediction from CPU CSR matrix.
|
||||
*
|
||||
* \note If the booster is configured to run on a CUDA device, XGBoost falls back to run
|
||||
* prediction with DMatrix with a performance warning.
|
||||
*
|
||||
* \param handle Booster handle.
|
||||
* \param indptr JSON encoded __array_interface__ to row pointer in CSR.
|
||||
* \param indices JSON encoded __array_interface__ to column indices in CSR.
|
||||
@@ -1122,9 +1116,6 @@ XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle, char const *indptr, ch
|
||||
/**
|
||||
* \brief Inplace prediction from CUDA Dense matrix (cupy in Python).
|
||||
*
|
||||
* \note If the booster is configured to run on a CPU, XGBoost falls back to run
|
||||
* prediction with DMatrix with a performance warning.
|
||||
*
|
||||
* \param handle Booster handle
|
||||
* \param values JSON encoded __cuda_array_interface__ to values.
|
||||
* \param config See \ref XGBoosterPredictFromDMatrix for more info.
|
||||
@@ -1146,9 +1137,6 @@ XGB_DLL int XGBoosterPredictFromCudaArray(BoosterHandle handle, char const *valu
|
||||
/**
|
||||
* \brief Inplace prediction from CUDA dense dataframe (cuDF in Python).
|
||||
*
|
||||
* \note If the booster is configured to run on a CPU, XGBoost falls back to run
|
||||
* prediction with DMatrix with a performance warning.
|
||||
*
|
||||
* \param handle Booster handle
|
||||
* \param values List of __cuda_array_interface__ for all columns encoded in JSON list.
|
||||
* \param config See \ref XGBoosterPredictFromDMatrix for more info.
|
||||
|
||||
@@ -1,80 +1,20 @@
|
||||
/**
|
||||
* Copyright 2014-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright 2014-2022 by Contributors
|
||||
* \file context.h
|
||||
*/
|
||||
#ifndef XGBOOST_CONTEXT_H_
|
||||
#define XGBOOST_CONTEXT_H_
|
||||
|
||||
#include <xgboost/base.h> // for bst_d_ordinal_t
|
||||
#include <xgboost/logging.h> // for CHECK_GE
|
||||
#include <xgboost/parameter.h> // for XGBoostParameter
|
||||
#include <xgboost/logging.h>
|
||||
#include <xgboost/parameter.h>
|
||||
|
||||
#include <cstdint> // for int16_t, int32_t, int64_t
|
||||
#include <memory> // for shared_ptr
|
||||
#include <string> // for string, to_string
|
||||
#include <type_traits> // for invoke_result_t, is_same_v
|
||||
#include <memory> // std::shared_ptr
|
||||
#include <string>
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
struct CUDAContext;
|
||||
|
||||
/**
|
||||
* @brief A type for device ordinal. The type is packed into 32-bit for efficient use in
|
||||
* viewing types like `linalg::TensorView`.
|
||||
*/
|
||||
struct DeviceOrd {
|
||||
enum Type : std::int16_t { kCPU = 0, kCUDA = 1 } device{kCPU};
|
||||
// CUDA device ordinal.
|
||||
bst_d_ordinal_t ordinal{-1};
|
||||
|
||||
[[nodiscard]] bool IsCUDA() const { return device == kCUDA; }
|
||||
[[nodiscard]] bool IsCPU() const { return device == kCPU; }
|
||||
|
||||
DeviceOrd() = default;
|
||||
constexpr DeviceOrd(Type type, bst_d_ordinal_t ord) : device{type}, ordinal{ord} {}
|
||||
|
||||
DeviceOrd(DeviceOrd const& that) = default;
|
||||
DeviceOrd& operator=(DeviceOrd const& that) = default;
|
||||
DeviceOrd(DeviceOrd&& that) = default;
|
||||
DeviceOrd& operator=(DeviceOrd&& that) = default;
|
||||
|
||||
/**
|
||||
* @brief Constructor for CPU.
|
||||
*/
|
||||
[[nodiscard]] constexpr static auto CPU() { return DeviceOrd{kCPU, -1}; }
|
||||
/**
|
||||
* @brief Constructor for CUDA device.
|
||||
*
|
||||
* @param ordinal CUDA device ordinal.
|
||||
*/
|
||||
[[nodiscard]] static auto CUDA(bst_d_ordinal_t ordinal) { return DeviceOrd{kCUDA, ordinal}; }
|
||||
|
||||
[[nodiscard]] bool operator==(DeviceOrd const& that) const {
|
||||
return device == that.device && ordinal == that.ordinal;
|
||||
}
|
||||
[[nodiscard]] bool operator!=(DeviceOrd const& that) const { return !(*this == that); }
|
||||
/**
|
||||
* @brief Get a string representation of the device and the ordinal.
|
||||
*/
|
||||
[[nodiscard]] std::string Name() const {
|
||||
switch (device) {
|
||||
case DeviceOrd::kCPU:
|
||||
return "CPU";
|
||||
case DeviceOrd::kCUDA:
|
||||
return "CUDA:" + std::to_string(ordinal);
|
||||
default: {
|
||||
LOG(FATAL) << "Unknown device.";
|
||||
return "";
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static_assert(sizeof(DeviceOrd) == sizeof(std::int32_t));
|
||||
|
||||
/**
|
||||
* @brief Runtime context for XGBoost. Contains information like threads and device.
|
||||
*/
|
||||
struct Context : public XGBoostParameter<Context> {
|
||||
public:
|
||||
// Constant representing the device ID of CPU.
|
||||
@@ -96,82 +36,33 @@ struct Context : public XGBoostParameter<Context> {
|
||||
// fail when gpu_id is invalid
|
||||
bool fail_on_invalid_gpu_id{false};
|
||||
bool validate_parameters{false};
|
||||
/**
|
||||
* @brief Configure the parameter `gpu_id'.
|
||||
|
||||
/*!
|
||||
* \brief Configure the parameter `gpu_id'.
|
||||
*
|
||||
* @param require_gpu Whether GPU is explicitly required by the user through other
|
||||
* configurations.
|
||||
* \param require_gpu Whether GPU is explicitly required from user.
|
||||
*/
|
||||
void ConfigureGpuId(bool require_gpu);
|
||||
/**
|
||||
* @brief Returns the automatically chosen number of threads based on the `nthread`
|
||||
* parameter and the system settting.
|
||||
/*!
|
||||
* Return automatically chosen threads.
|
||||
*/
|
||||
[[nodiscard]] std::int32_t Threads() const;
|
||||
/**
|
||||
* @brief Is XGBoost running on CPU?
|
||||
*/
|
||||
[[nodiscard]] bool IsCPU() const { return gpu_id == kCpuId; }
|
||||
/**
|
||||
* @brief Is XGBoost running on a CUDA device?
|
||||
*/
|
||||
[[nodiscard]] bool IsCUDA() const { return !IsCPU(); }
|
||||
/**
|
||||
* @brief Get the current device and ordinal.
|
||||
*/
|
||||
[[nodiscard]] DeviceOrd Device() const {
|
||||
return IsCPU() ? DeviceOrd::CPU() : DeviceOrd::CUDA(static_cast<bst_d_ordinal_t>(gpu_id));
|
||||
}
|
||||
/**
|
||||
* @brief Get the CUDA device ordinal. -1 if XGBoost is running on CPU.
|
||||
*/
|
||||
[[nodiscard]] bst_d_ordinal_t Ordinal() const { return this->gpu_id; }
|
||||
/**
|
||||
* @brief Name of the current device.
|
||||
*/
|
||||
[[nodiscard]] std::string DeviceName() const { return Device().Name(); }
|
||||
/**
|
||||
* @brief Get a CUDA device context for allocator and stream.
|
||||
*/
|
||||
[[nodiscard]] CUDAContext const* CUDACtx() const;
|
||||
/**
|
||||
* @brief Make a CUDA context based on the current context.
|
||||
*
|
||||
* @param ordinal The CUDA device ordinal.
|
||||
*/
|
||||
[[nodiscard]] Context MakeCUDA(std::int32_t ordinal = 0) const {
|
||||
std::int32_t Threads() const;
|
||||
|
||||
bool IsCPU() const { return gpu_id == kCpuId; }
|
||||
bool IsCUDA() const { return !IsCPU(); }
|
||||
|
||||
CUDAContext const* CUDACtx() const;
|
||||
// Make a CUDA context based on the current context.
|
||||
Context MakeCUDA(std::int32_t device = 0) const {
|
||||
Context ctx = *this;
|
||||
CHECK_GE(ordinal, 0);
|
||||
ctx.gpu_id = ordinal;
|
||||
ctx.gpu_id = device;
|
||||
return ctx;
|
||||
}
|
||||
/**
|
||||
* @brief Make a CPU context based on the current context.
|
||||
*/
|
||||
[[nodiscard]] Context MakeCPU() const {
|
||||
Context MakeCPU() const {
|
||||
Context ctx = *this;
|
||||
ctx.gpu_id = kCpuId;
|
||||
return ctx;
|
||||
}
|
||||
/**
|
||||
* @brief Call function based on the current device.
|
||||
*/
|
||||
template <typename CPUFn, typename CUDAFn>
|
||||
decltype(auto) DispatchDevice(CPUFn&& cpu_fn, CUDAFn&& cuda_fn) const {
|
||||
static_assert(std::is_same_v<std::invoke_result_t<CPUFn>, std::invoke_result_t<CUDAFn>>);
|
||||
switch (this->Device().device) {
|
||||
case DeviceOrd::kCPU:
|
||||
return cpu_fn();
|
||||
case DeviceOrd::kCUDA:
|
||||
return cuda_fn();
|
||||
default:
|
||||
// Do not use the device name as this is likely an internal error, the name
|
||||
// wouldn't be valid.
|
||||
LOG(FATAL) << "Unknown device type:" << static_cast<std::int16_t>(this->Device().device);
|
||||
break;
|
||||
}
|
||||
return std::invoke_result_t<CPUFn>();
|
||||
}
|
||||
|
||||
// declare parameters
|
||||
DMLC_DECLARE_PARAMETER(Context) {
|
||||
@@ -196,9 +87,9 @@ struct Context : public XGBoostParameter<Context> {
|
||||
}
|
||||
|
||||
private:
|
||||
// mutable for lazy cuda context initialization. This avoids initializing CUDA at load.
|
||||
// shared_ptr is used instead of unique_ptr as with unique_ptr it's difficult to define
|
||||
// p_impl while trying to hide CUDA code from the host compiler.
|
||||
// mutable for lazy initialization for cuda context to avoid initializing CUDA at load.
|
||||
// shared_ptr is used instead of unique_ptr as with unique_ptr it's difficult to define p_impl
|
||||
// while trying to hide CUDA code from host compiler.
|
||||
mutable std::shared_ptr<CUDAContext> cuctx_;
|
||||
// cached value for CFS CPU limit. (used in containerized env)
|
||||
std::int32_t cfs_cpu_count_; // NOLINT
|
||||
|
||||
@@ -149,14 +149,18 @@ class GradientBooster : public Model, public Configurable {
|
||||
* \param layer_begin Beginning of boosted tree layer used for prediction.
|
||||
* \param layer_end End of booster layer. 0 means do not limit trees.
|
||||
* \param approximate use a faster (inconsistent) approximation of SHAP values
|
||||
* \param condition condition on the condition_feature (0=no, -1=cond off, 1=cond on).
|
||||
* \param condition_feature feature to condition on (i.e. fix) during calculations
|
||||
*/
|
||||
virtual void PredictContribution(DMatrix* dmat, HostDeviceVector<float>* out_contribs,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end,
|
||||
bool approximate = false) = 0;
|
||||
virtual void PredictContribution(DMatrix* dmat,
|
||||
HostDeviceVector<bst_float>* out_contribs,
|
||||
unsigned layer_begin, unsigned layer_end,
|
||||
bool approximate = false, int condition = 0,
|
||||
unsigned condition_feature = 0) = 0;
|
||||
|
||||
virtual void PredictInteractionContributions(DMatrix* dmat, HostDeviceVector<float>* out_contribs,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end,
|
||||
bool approximate) = 0;
|
||||
virtual void PredictInteractionContributions(
|
||||
DMatrix *dmat, HostDeviceVector<bst_float> *out_contribs,
|
||||
unsigned layer_begin, unsigned layer_end, bool approximate) = 0;
|
||||
|
||||
/*!
|
||||
* \brief dump the model in the requested format
|
||||
|
||||
@@ -6,22 +6,24 @@
|
||||
*/
|
||||
#pragma once
|
||||
#include <xgboost/base.h>
|
||||
#include <xgboost/cache.h> // for DMatrixCache
|
||||
#include <xgboost/context.h> // for Context
|
||||
#include <xgboost/cache.h> // DMatrixCache
|
||||
#include <xgboost/context.h>
|
||||
#include <xgboost/data.h>
|
||||
#include <xgboost/host_device_vector.h>
|
||||
|
||||
#include <functional> // for function
|
||||
#include <memory> // for shared_ptr
|
||||
#include <functional> // std::function
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <thread> // for get_id
|
||||
#include <utility> // for make_pair
|
||||
#include <vector>
|
||||
|
||||
// Forward declarations
|
||||
namespace xgboost::gbm {
|
||||
namespace xgboost {
|
||||
namespace gbm {
|
||||
struct GBTreeModel;
|
||||
} // namespace xgboost::gbm
|
||||
} // namespace gbm
|
||||
} // namespace xgboost
|
||||
|
||||
namespace xgboost {
|
||||
/**
|
||||
|
||||
@@ -43,7 +43,7 @@
|
||||
<maven.wagon.http.retryHandler.count>5</maven.wagon.http.retryHandler.count>
|
||||
<log.capi.invocation>OFF</log.capi.invocation>
|
||||
<use.cuda>OFF</use.cuda>
|
||||
<cudf.version>23.04.0</cudf.version>
|
||||
<cudf.version>23.06.0</cudf.version>
|
||||
<spark.rapids.version>23.04.1</spark.rapids.version>
|
||||
<cudf.classifier>cuda11</cudf.classifier>
|
||||
<scalatest.version>3.2.16</scalatest.version>
|
||||
@@ -91,9 +91,6 @@
|
||||
<value>ON</value>
|
||||
</property>
|
||||
</activation>
|
||||
<properties>
|
||||
<use.cuda>ON</use.cuda>
|
||||
</properties>
|
||||
<modules>
|
||||
<module>xgboost4j-gpu</module>
|
||||
<module>xgboost4j-spark-gpu</module>
|
||||
|
||||
@@ -78,6 +78,7 @@ public class BoosterTest {
|
||||
put("num_round", round);
|
||||
put("num_workers", 1);
|
||||
put("tree_method", "gpu_hist");
|
||||
put("predictor", "gpu_predictor");
|
||||
put("max_bin", maxBin);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -281,6 +281,7 @@ object GpuPreXGBoost extends PreXGBoostProvider {
|
||||
// - predictor: Force to gpu predictor since native doesn't save predictor.
|
||||
val gpuId = if (!isLocal) XGBoost.getGPUAddrFromResources else 0
|
||||
booster.setParam("gpu_id", gpuId.toString)
|
||||
booster.setParam("predictor", "gpu_predictor")
|
||||
logger.info("GPU transform on device: " + gpuId)
|
||||
boosterFlag.isGpuParamsSet = true;
|
||||
}
|
||||
|
||||
@@ -220,7 +220,7 @@ class GpuXGBoostRegressorSuite extends GpuTestSuite {
|
||||
|
||||
test("Ranking: train with Group") {
|
||||
withGpuSparkSession(enableCsvConf()) { spark =>
|
||||
val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "rank:ndcg",
|
||||
val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "rank:pairwise",
|
||||
"num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist",
|
||||
"features_cols" -> featureNames, "label_col" -> labelName)
|
||||
val Array(trainingDf, testDf) = spark.read.option("header", "true").schema(schema)
|
||||
|
||||
@@ -25,7 +25,7 @@ private[spark] trait LearningTaskParams extends Params {
|
||||
/**
|
||||
* Specify the learning task and the corresponding learning objective.
|
||||
* options: reg:squarederror, reg:squaredlogerror, reg:logistic, binary:logistic, binary:logitraw,
|
||||
* count:poisson, multi:softmax, multi:softprob, rank:ndcg, reg:gamma.
|
||||
* count:poisson, multi:softmax, multi:softprob, rank:pairwise, reg:gamma.
|
||||
* default: reg:squarederror
|
||||
*/
|
||||
final val objective = new Param[String](this, "objective",
|
||||
|
||||
@@ -201,7 +201,7 @@ class XGBoostGeneralSuite extends AnyFunSuite with TmpFolderPerSuite with PerTes
|
||||
sc,
|
||||
buildTrainingRDD,
|
||||
List("eta" -> "1", "max_depth" -> "6",
|
||||
"objective" -> "rank:ndcg", "num_round" -> 5, "num_workers" -> numWorkers,
|
||||
"objective" -> "rank:pairwise", "num_round" -> 5, "num_workers" -> numWorkers,
|
||||
"custom_eval" -> null, "custom_obj" -> null, "use_external_memory" -> false,
|
||||
"missing" -> Float.NaN).toMap)
|
||||
|
||||
@@ -268,7 +268,7 @@ class XGBoostGeneralSuite extends AnyFunSuite with TmpFolderPerSuite with PerTes
|
||||
val training = buildDataFrameWithGroup(Ranking.train, 5)
|
||||
val Array(train, eval1, eval2) = training.randomSplit(Array(0.6, 0.2, 0.2), 0)
|
||||
val paramMap1 = Map("eta" -> "1", "max_depth" -> "6",
|
||||
"objective" -> "rank:ndcg",
|
||||
"objective" -> "rank:pairwise",
|
||||
"num_round" -> 5, "num_workers" -> numWorkers, "group_col" -> "group")
|
||||
val xgb1 = new XGBoostRegressor(paramMap1).setEvalSets(Map("eval1" -> eval1, "eval2" -> eval2))
|
||||
val model1 = xgb1.fit(train)
|
||||
@@ -281,7 +281,7 @@ class XGBoostGeneralSuite extends AnyFunSuite with TmpFolderPerSuite with PerTes
|
||||
assert(model1.summary.trainObjectiveHistory !== model1.summary.validationObjectiveHistory(1))
|
||||
|
||||
val paramMap2 = Map("eta" -> "1", "max_depth" -> "6",
|
||||
"objective" -> "rank:ndcg",
|
||||
"objective" -> "rank:pairwise",
|
||||
"num_round" -> 5, "num_workers" -> numWorkers, "group_col" -> "group",
|
||||
"eval_sets" -> Map("eval1" -> eval1, "eval2" -> eval2))
|
||||
val xgb2 = new XGBoostRegressor(paramMap2)
|
||||
|
||||
@@ -121,7 +121,7 @@ class XGBoostRegressorSuite extends AnyFunSuite with PerTest with TmpFolderPerSu
|
||||
|
||||
test("ranking: use group data") {
|
||||
val paramMap = Map("eta" -> "1", "max_depth" -> "6", "silent" -> "1",
|
||||
"objective" -> "rank:ndcg", "num_workers" -> numWorkers, "num_round" -> 5,
|
||||
"objective" -> "rank:pairwise", "num_workers" -> numWorkers, "num_round" -> 5,
|
||||
"group_col" -> "group", "tree_method" -> treeMethod)
|
||||
|
||||
val trainingDF = buildDataFrameWithGroup(Ranking.train)
|
||||
|
||||
@@ -132,8 +132,8 @@ def locate_or_build_libxgboost(
|
||||
|
||||
if build_config.use_system_libxgboost:
|
||||
# Find libxgboost from system prefix
|
||||
sys_base_prefix = pathlib.Path(sys.base_prefix).absolute().resolve()
|
||||
libxgboost_sys = sys_base_prefix / "lib" / _lib_name()
|
||||
sys_prefix = pathlib.Path(sys.prefix).absolute().resolve()
|
||||
libxgboost_sys = sys_prefix / "lib" / _lib_name()
|
||||
if not libxgboost_sys.exists():
|
||||
raise RuntimeError(
|
||||
f"use_system_libxgboost was specified but {_lib_name()} is "
|
||||
|
||||
@@ -31,7 +31,7 @@ ArrayLike = Any
|
||||
PathLike = Union[str, os.PathLike]
|
||||
CupyT = ArrayLike # maybe need a stub for cupy arrays
|
||||
NumpyOrCupy = Any
|
||||
NumpyDType = Union[str, Type[np.number]] # pylint: disable=invalid-name
|
||||
NumpyDType = Union[str, Type[np.number]]
|
||||
PandasDType = Any # real type is pandas.core.dtypes.base.ExtensionDtype
|
||||
|
||||
FloatCompatible = Union[float, np.float32, np.float64]
|
||||
|
||||
@@ -2187,25 +2187,20 @@ class Booster:
|
||||
base_margin: Any = None,
|
||||
strict_shape: bool = False,
|
||||
) -> NumpyOrCupy:
|
||||
"""Run prediction in-place when possible, Unlike :py:meth:`predict` method,
|
||||
inplace prediction does not cache the prediction result.
|
||||
"""Run prediction in-place, Unlike :py:meth:`predict` method, inplace prediction
|
||||
does not cache the prediction result.
|
||||
|
||||
Calling only ``inplace_predict`` in multiple threads is safe and lock
|
||||
free. But the safety does not hold when used in conjunction with other
|
||||
methods. E.g. you can't train the booster in one thread and perform
|
||||
prediction in the other.
|
||||
|
||||
.. note::
|
||||
|
||||
If the device ordinal of the input data doesn't match the one configured for
|
||||
the booster, data will be copied to the booster device.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
booster.set_param({"gpu_id": "0", "tree_method": "gpu_hist"})
|
||||
booster.set_param({"predictor": "gpu_predictor"})
|
||||
booster.inplace_predict(cupy_array)
|
||||
|
||||
booster.set_param({"gpu_id": "-1", "tree_method": "hist"})
|
||||
booster.set_param({"predictor": "cpu_predictor"})
|
||||
booster.inplace_predict(numpy_array)
|
||||
|
||||
.. versionadded:: 1.1.0
|
||||
@@ -2213,7 +2208,9 @@ class Booster:
|
||||
Parameters
|
||||
----------
|
||||
data :
|
||||
The input data.
|
||||
The input data, must not be a view for numpy array. Set
|
||||
``predictor`` to ``gpu_predictor`` for running prediction on CuPy
|
||||
array or CuDF DataFrame.
|
||||
iteration_range :
|
||||
See :py:meth:`predict` for details.
|
||||
predict_type :
|
||||
|
||||
@@ -27,7 +27,7 @@ def find_lib_path() -> List[str]:
|
||||
os.path.join(curr_path, os.path.pardir, os.path.pardir, "lib"),
|
||||
# use libxgboost from a system prefix, if available. This should be the last
|
||||
# option.
|
||||
os.path.join(sys.base_prefix, "lib"),
|
||||
os.path.join(sys.prefix, "lib"),
|
||||
]
|
||||
|
||||
if sys.platform == "win32":
|
||||
@@ -62,8 +62,8 @@ def find_lib_path() -> List[str]:
|
||||
+ ("\n- ".join(dll_path))
|
||||
+ "\nXGBoost Python package path: "
|
||||
+ curr_path
|
||||
+ "\nsys.base_prefix: "
|
||||
+ sys.base_prefix
|
||||
+ "\nsys.prefix: "
|
||||
+ sys.prefix
|
||||
+ "\nSee: "
|
||||
+ link
|
||||
+ " for installing XGBoost."
|
||||
|
||||
@@ -277,6 +277,9 @@ __model_doc = f"""
|
||||
Device ordinal.
|
||||
validate_parameters : Optional[bool]
|
||||
Give warnings for unknown parameter.
|
||||
predictor : Optional[str]
|
||||
Force XGBoost to use specific predictor, available choices are [cpu_predictor,
|
||||
gpu_predictor].
|
||||
enable_categorical : bool
|
||||
|
||||
.. versionadded:: 1.5.0
|
||||
@@ -378,21 +381,17 @@ __model_doc = f"""
|
||||
every **early_stopping_rounds** round(s) to continue training. Requires at
|
||||
least one item in **eval_set** in :py:meth:`fit`.
|
||||
|
||||
- If early stopping occurs, the model will have two additional attributes:
|
||||
:py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the
|
||||
:py:meth:`predict` and :py:meth:`apply` methods to determine the optimal
|
||||
number of trees during inference. If users want to access the full model
|
||||
(including trees built after early stopping), they can specify the
|
||||
`iteration_range` in these inference methods. In addition, other utilities
|
||||
like model plotting can also use the entire model.
|
||||
|
||||
- If you prefer to discard the trees after `best_iteration`, consider using the
|
||||
callback function :py:class:`xgboost.callback.EarlyStopping`.
|
||||
- The method returns the model from the last iteration, not the best one, use a
|
||||
callback :py:class:`xgboost.callback.EarlyStopping` if returning the best
|
||||
model is preferred.
|
||||
|
||||
- If there's more than one item in **eval_set**, the last entry will be used for
|
||||
early stopping. If there's more than one metric in **eval_metric**, the last
|
||||
metric will be used for early stopping.
|
||||
|
||||
- If early stopping occurs, the model will have three additional fields:
|
||||
:py:attr:`best_score`, :py:attr:`best_iteration`.
|
||||
|
||||
.. note::
|
||||
|
||||
This parameter replaces `early_stopping_rounds` in :py:meth:`fit` method.
|
||||
@@ -649,6 +648,7 @@ class XGBModel(XGBModelBase):
|
||||
importance_type: Optional[str] = None,
|
||||
gpu_id: Optional[int] = None,
|
||||
validate_parameters: Optional[bool] = None,
|
||||
predictor: Optional[str] = None,
|
||||
enable_categorical: bool = False,
|
||||
feature_types: Optional[FeatureTypes] = None,
|
||||
max_cat_to_onehot: Optional[int] = None,
|
||||
@@ -695,6 +695,7 @@ class XGBModel(XGBModelBase):
|
||||
self.importance_type = importance_type
|
||||
self.gpu_id = gpu_id
|
||||
self.validate_parameters = validate_parameters
|
||||
self.predictor = predictor
|
||||
self.enable_categorical = enable_categorical
|
||||
self.feature_types = feature_types
|
||||
self.max_cat_to_onehot = max_cat_to_onehot
|
||||
@@ -930,7 +931,8 @@ class XGBModel(XGBModelBase):
|
||||
callbacks = self.callbacks if self.callbacks is not None else callbacks
|
||||
|
||||
tree_method = params.get("tree_method", None)
|
||||
if self.enable_categorical and tree_method == "exact":
|
||||
cat_support = {"gpu_hist", "approx", "hist"}
|
||||
if self.enable_categorical and tree_method not in cat_support:
|
||||
raise ValueError(
|
||||
"Experimental support for categorical data is not implemented for"
|
||||
" current tree method yet."
|
||||
@@ -1087,7 +1089,12 @@ class XGBModel(XGBModelBase):
|
||||
return self
|
||||
|
||||
def _can_use_inplace_predict(self) -> bool:
|
||||
if self.booster != "gblinear":
|
||||
# When predictor is explicitly set, using `inplace_predict` might result into
|
||||
# error with incompatible data type.
|
||||
# Inplace predict doesn't handle as many data types as DMatrix, but it's
|
||||
# sufficient for dask interface where input is simpiler.
|
||||
predictor = self.get_xgb_params().get("predictor", None)
|
||||
if predictor in ("auto", None) and self.booster != "gblinear":
|
||||
return True
|
||||
return False
|
||||
|
||||
@@ -1113,9 +1120,9 @@ class XGBModel(XGBModelBase):
|
||||
iteration_range: Optional[Tuple[int, int]] = None,
|
||||
) -> ArrayLike:
|
||||
"""Predict with `X`. If the model is trained with early stopping, then
|
||||
:py:attr:`best_iteration` is used automatically. The estimator uses
|
||||
`inplace_predict` by default and falls back to using :py:class:`DMatrix` if
|
||||
devices between the data and the estimator don't match.
|
||||
:py:attr:`best_iteration` is used automatically. For tree models, when data is
|
||||
on GPU, like cupy array or cuDF dataframe and `predictor` is not specified, the
|
||||
prediction is run on GPU automatically, otherwise it will run on CPU.
|
||||
|
||||
.. note:: This function is only thread safe for `gbtree` and `dart`.
|
||||
|
||||
@@ -1577,9 +1584,7 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
|
||||
) -> np.ndarray:
|
||||
"""Predict the probability of each `X` example being of a given class. If the
|
||||
model is trained with early stopping, then :py:attr:`best_iteration` is used
|
||||
automatically. The estimator uses `inplace_predict` by default and falls back to
|
||||
using :py:class:`DMatrix` if devices between the data and the estimator don't
|
||||
match.
|
||||
automatically.
|
||||
|
||||
.. note:: This function is only thread safe for `gbtree` and `dart`.
|
||||
|
||||
@@ -1791,11 +1796,7 @@ def _get_qid(
|
||||
|
||||
|
||||
@xgboost_model_doc(
|
||||
"""Implementation of the Scikit-Learn API for XGBoost Ranking.
|
||||
|
||||
See :doc:`Learning to Rank </tutorials/learning_to_rank>` for an introducion.
|
||||
|
||||
""",
|
||||
"""Implementation of the Scikit-Learn API for XGBoost Ranking.""",
|
||||
["estimators", "model"],
|
||||
end_note="""
|
||||
.. note::
|
||||
@@ -1844,7 +1845,7 @@ See :doc:`Learning to Rank </tutorials/learning_to_rank>` for an introducion.
|
||||
class XGBRanker(XGBModel, XGBRankerMixIn):
|
||||
# pylint: disable=missing-docstring,too-many-arguments,invalid-name
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, *, objective: str = "rank:ndcg", **kwargs: Any):
|
||||
def __init__(self, *, objective: str = "rank:pairwise", **kwargs: Any):
|
||||
super().__init__(objective=objective, **kwargs)
|
||||
if callable(self.objective):
|
||||
raise ValueError("custom objective function not supported by XGBRanker")
|
||||
@@ -2028,7 +2029,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
|
||||
self._Booster = train(
|
||||
params,
|
||||
train_dmatrix,
|
||||
num_boost_round=self.get_num_boosting_rounds(),
|
||||
self.get_num_boosting_rounds(),
|
||||
early_stopping_rounds=early_stopping_rounds,
|
||||
evals=evals,
|
||||
evals_result=evals_result,
|
||||
|
||||
@@ -25,7 +25,6 @@ from typing import (
|
||||
Set,
|
||||
Tuple,
|
||||
TypedDict,
|
||||
TypeVar,
|
||||
Union,
|
||||
)
|
||||
|
||||
@@ -199,14 +198,14 @@ class IteratorForTest(xgb.core.DataIter):
|
||||
X: Sequence,
|
||||
y: Sequence,
|
||||
w: Optional[Sequence],
|
||||
cache: Optional[str],
|
||||
cache: Optional[str] = "./",
|
||||
) -> None:
|
||||
assert len(X) == len(y)
|
||||
self.X = X
|
||||
self.y = y
|
||||
self.w = w
|
||||
self.it = 0
|
||||
super().__init__(cache_prefix=cache)
|
||||
super().__init__(cache)
|
||||
|
||||
def next(self, input_data: Callable) -> int:
|
||||
if self.it == len(self.X):
|
||||
@@ -348,9 +347,7 @@ class TestDataset:
|
||||
if w is not None:
|
||||
weight.append(w)
|
||||
|
||||
it = IteratorForTest(
|
||||
predictor, response, weight if weight else None, cache="cache"
|
||||
)
|
||||
it = IteratorForTest(predictor, response, weight if weight else None)
|
||||
return xgb.DMatrix(it)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
@@ -712,27 +709,6 @@ def predictor_equal(lhs: xgb.DMatrix, rhs: xgb.DMatrix) -> bool:
|
||||
)
|
||||
|
||||
|
||||
M = TypeVar("M", xgb.Booster, xgb.XGBModel)
|
||||
|
||||
|
||||
def set_ordinal(ordinal: int, booster: M) -> M:
|
||||
"""Temporary solution for setting the device ordinal until we move away from
|
||||
`gpu_id`.
|
||||
|
||||
"""
|
||||
if ordinal < 0:
|
||||
params = {"gpu_id": -1, "tree_method": "hist"}
|
||||
else:
|
||||
params = {"gpu_id": ordinal, "tree_method": "gpu_hist"}
|
||||
|
||||
if isinstance(booster, xgb.Booster):
|
||||
booster.set_param(params)
|
||||
elif isinstance(booster, xgb.XGBModel):
|
||||
booster.set_params(**params)
|
||||
|
||||
return booster
|
||||
|
||||
|
||||
def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, np.float64]:
|
||||
"""Evaluation metric for xgb.train"""
|
||||
label = dtrain.get_label()
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
"""Tests for dask shared by different test modules."""
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from dask import array as da
|
||||
from dask import dataframe as dd
|
||||
from distributed import Client
|
||||
|
||||
import xgboost as xgb
|
||||
@@ -54,22 +52,3 @@ def check_init_estimation(tree_method: str, client: Client) -> None:
|
||||
"""Test init estimation."""
|
||||
check_init_estimation_reg(tree_method, client)
|
||||
check_init_estimation_clf(tree_method, client)
|
||||
|
||||
|
||||
def check_uneven_nan(client: Client, tree_method: str, n_workers: int) -> None:
|
||||
"""Issue #9271, not every worker has missing value."""
|
||||
assert n_workers >= 2
|
||||
|
||||
with client.as_current():
|
||||
clf = xgb.dask.DaskXGBClassifier(tree_method=tree_method)
|
||||
X = pd.DataFrame({"a": range(10000), "b": range(10000, 0, -1)})
|
||||
y = pd.Series([*[0] * 5000, *[1] * 5000])
|
||||
|
||||
X["a"][:3000:1000] = np.NaN
|
||||
|
||||
client.wait_for_workers(n_workers=n_workers)
|
||||
|
||||
clf.fit(
|
||||
dd.from_pandas(X, npartitions=n_workers),
|
||||
dd.from_pandas(y, npartitions=n_workers),
|
||||
)
|
||||
|
||||
@@ -1,14 +1,11 @@
|
||||
# pylint: disable=invalid-name
|
||||
"""Utilities for data generation."""
|
||||
import os
|
||||
import zipfile
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Generator, List, NamedTuple, Optional, Tuple, Union
|
||||
from typing import Any, Generator, List, Tuple, Union
|
||||
from urllib import request
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy import typing as npt
|
||||
from numpy.random import Generator as RNG
|
||||
from scipy import sparse
|
||||
|
||||
@@ -343,263 +340,3 @@ def get_mq2008(
|
||||
y_valid,
|
||||
qid_valid,
|
||||
)
|
||||
|
||||
|
||||
RelData = Tuple[sparse.csr_matrix, npt.NDArray[np.int32], npt.NDArray[np.int32]]
|
||||
|
||||
|
||||
@dataclass
|
||||
class ClickFold:
|
||||
"""A structure containing information about generated user-click data."""
|
||||
|
||||
X: sparse.csr_matrix
|
||||
y: npt.NDArray[np.int32]
|
||||
qid: npt.NDArray[np.int32]
|
||||
score: npt.NDArray[np.float32]
|
||||
click: npt.NDArray[np.int32]
|
||||
pos: npt.NDArray[np.int64]
|
||||
|
||||
|
||||
class RelDataCV(NamedTuple):
|
||||
"""Simple data struct for holding a train-test split of a learning to rank dataset."""
|
||||
|
||||
train: RelData
|
||||
test: RelData
|
||||
max_rel: int
|
||||
|
||||
def is_binary(self) -> bool:
|
||||
"""Whether the label consists of binary relevance degree."""
|
||||
return self.max_rel == 1
|
||||
|
||||
|
||||
class PBM: # pylint: disable=too-few-public-methods
|
||||
"""Simulate click data with position bias model. There are other models available in
|
||||
`ULTRA <https://github.com/ULTR-Community/ULTRA.git>`_ like the cascading model.
|
||||
|
||||
References
|
||||
----------
|
||||
Unbiased LambdaMART: An Unbiased Pairwise Learning-to-Rank Algorithm
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, eta: float) -> None:
|
||||
# click probability for each relevance degree. (from 0 to 4)
|
||||
self.click_prob = np.array([0.1, 0.16, 0.28, 0.52, 1.0])
|
||||
exam_prob = np.array(
|
||||
[0.68, 0.61, 0.48, 0.34, 0.28, 0.20, 0.11, 0.10, 0.08, 0.06]
|
||||
)
|
||||
# Observation probability, encoding positional bias for each position
|
||||
self.exam_prob = np.power(exam_prob, eta)
|
||||
|
||||
def sample_clicks_for_query(
|
||||
self, labels: npt.NDArray[np.int32], position: npt.NDArray[np.int64]
|
||||
) -> npt.NDArray[np.int32]:
|
||||
"""Sample clicks for one query based on input relevance degree and position.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
labels :
|
||||
relevance_degree
|
||||
|
||||
"""
|
||||
labels = np.array(labels, copy=True)
|
||||
|
||||
click_prob = np.zeros(labels.shape)
|
||||
# minimum
|
||||
labels[labels < 0] = 0
|
||||
# maximum
|
||||
labels[labels >= len(self.click_prob)] = -1
|
||||
click_prob = self.click_prob[labels]
|
||||
|
||||
exam_prob = np.zeros(labels.shape)
|
||||
assert position.size == labels.size
|
||||
ranks = np.array(position, copy=True)
|
||||
# maximum
|
||||
ranks[ranks >= self.exam_prob.size] = -1
|
||||
exam_prob = self.exam_prob[ranks]
|
||||
|
||||
rng = np.random.default_rng(1994)
|
||||
prob = rng.random(size=labels.shape[0], dtype=np.float32)
|
||||
|
||||
clicks: npt.NDArray[np.int32] = np.zeros(labels.shape, dtype=np.int32)
|
||||
clicks[prob < exam_prob * click_prob] = 1
|
||||
return clicks
|
||||
|
||||
|
||||
def rlencode(x: npt.NDArray[np.int32]) -> Tuple[npt.NDArray, npt.NDArray, npt.NDArray]:
|
||||
"""Run length encoding using numpy, modified from:
|
||||
https://gist.github.com/nvictus/66627b580c13068589957d6ab0919e66
|
||||
|
||||
"""
|
||||
x = np.asarray(x)
|
||||
n = x.size
|
||||
starts = np.r_[0, np.flatnonzero(~np.isclose(x[1:], x[:-1], equal_nan=True)) + 1]
|
||||
lengths = np.diff(np.r_[starts, n])
|
||||
values = x[starts]
|
||||
indptr = np.append(starts, np.array([x.size]))
|
||||
|
||||
return indptr, lengths, values
|
||||
|
||||
|
||||
def init_rank_score(
|
||||
X: sparse.csr_matrix,
|
||||
y: npt.NDArray[np.int32],
|
||||
qid: npt.NDArray[np.int32],
|
||||
sample_rate: float = 0.1,
|
||||
) -> npt.NDArray[np.float32]:
|
||||
"""We use XGBoost to generate the initial score instead of SVMRank for
|
||||
simplicity. Sample rate is set to 0.1 by default so that we can test with small
|
||||
datasets.
|
||||
|
||||
"""
|
||||
# random sample
|
||||
rng = np.random.default_rng(1994)
|
||||
n_samples = int(X.shape[0] * sample_rate)
|
||||
index = np.arange(0, X.shape[0], dtype=np.uint64)
|
||||
rng.shuffle(index)
|
||||
index = index[:n_samples]
|
||||
|
||||
X_train = X[index]
|
||||
y_train = y[index]
|
||||
qid_train = qid[index]
|
||||
|
||||
# Sort training data based on query id, required by XGBoost.
|
||||
sorted_idx = np.argsort(qid_train)
|
||||
X_train = X_train[sorted_idx]
|
||||
y_train = y_train[sorted_idx]
|
||||
qid_train = qid_train[sorted_idx]
|
||||
|
||||
ltr = xgboost.XGBRanker(objective="rank:ndcg", tree_method="hist")
|
||||
ltr.fit(X_train, y_train, qid=qid_train)
|
||||
|
||||
# Use the original order of the data.
|
||||
scores = ltr.predict(X)
|
||||
return scores
|
||||
|
||||
|
||||
def simulate_one_fold(
|
||||
fold: Tuple[sparse.csr_matrix, npt.NDArray[np.int32], npt.NDArray[np.int32]],
|
||||
scores_fold: npt.NDArray[np.float32],
|
||||
) -> ClickFold:
|
||||
"""Simulate clicks for one fold."""
|
||||
X_fold, y_fold, qid_fold = fold
|
||||
assert qid_fold.dtype == np.int32
|
||||
|
||||
qids = np.unique(qid_fold)
|
||||
|
||||
position = np.empty((y_fold.size,), dtype=np.int64)
|
||||
clicks = np.empty((y_fold.size,), dtype=np.int32)
|
||||
pbm = PBM(eta=1.0)
|
||||
|
||||
# Avoid grouping by qid as we want to preserve the original data partition by
|
||||
# the dataset authors.
|
||||
for q in qids:
|
||||
qid_mask = q == qid_fold
|
||||
qid_mask = qid_mask.reshape(qid_mask.shape[0])
|
||||
query_scores = scores_fold[qid_mask]
|
||||
# Initial rank list, scores sorted to decreasing order
|
||||
query_position = np.argsort(query_scores)[::-1]
|
||||
position[qid_mask] = query_position
|
||||
# get labels
|
||||
relevance_degrees = y_fold[qid_mask]
|
||||
query_clicks = pbm.sample_clicks_for_query(relevance_degrees, query_position)
|
||||
clicks[qid_mask] = query_clicks
|
||||
|
||||
assert X_fold.shape[0] == qid_fold.shape[0], (X_fold.shape, qid_fold.shape)
|
||||
assert X_fold.shape[0] == clicks.shape[0], (X_fold.shape, clicks.shape)
|
||||
|
||||
return ClickFold(X_fold, y_fold, qid_fold, scores_fold, clicks, position)
|
||||
|
||||
|
||||
# pylint: disable=too-many-locals
|
||||
def simulate_clicks(cv_data: RelDataCV) -> Tuple[ClickFold, Optional[ClickFold]]:
|
||||
"""Simulate click data using position biased model (PBM)."""
|
||||
X, y, qid = list(zip(cv_data.train, cv_data.test))
|
||||
|
||||
# ptr to train-test split
|
||||
indptr = np.array([0] + [v.shape[0] for v in X])
|
||||
indptr = np.cumsum(indptr)
|
||||
|
||||
assert len(indptr) == 2 + 1 # train, test
|
||||
X_full = sparse.vstack(X)
|
||||
y_full = np.concatenate(y)
|
||||
qid_full = np.concatenate(qid)
|
||||
|
||||
# Obtain initial relevance score for click simulation
|
||||
scores_full = init_rank_score(X_full, y_full, qid_full)
|
||||
# partition it back to (train, test) tuple
|
||||
scores = [scores_full[indptr[i - 1] : indptr[i]] for i in range(1, indptr.size)]
|
||||
|
||||
X_lst, y_lst, q_lst, s_lst, c_lst, p_lst = [], [], [], [], [], []
|
||||
for i in range(indptr.size - 1):
|
||||
fold = simulate_one_fold((X[i], y[i], qid[i]), scores[i])
|
||||
X_lst.append(fold.X)
|
||||
y_lst.append(fold.y)
|
||||
q_lst.append(fold.qid)
|
||||
s_lst.append(fold.score)
|
||||
c_lst.append(fold.click)
|
||||
p_lst.append(fold.pos)
|
||||
|
||||
scores_check_1 = [s_lst[i] for i in range(indptr.size - 1)]
|
||||
for i in range(2):
|
||||
assert (scores_check_1[i] == scores[i]).all()
|
||||
|
||||
if len(X_lst) == 1:
|
||||
train = ClickFold(X_lst[0], y_lst[0], q_lst[0], s_lst[0], c_lst[0], p_lst[0])
|
||||
test = None
|
||||
else:
|
||||
train, test = (
|
||||
ClickFold(X_lst[i], y_lst[i], q_lst[i], s_lst[i], c_lst[i], p_lst[i])
|
||||
for i in range(len(X_lst))
|
||||
)
|
||||
return train, test
|
||||
|
||||
|
||||
def sort_ltr_samples(
|
||||
X: sparse.csr_matrix,
|
||||
y: npt.NDArray[np.int32],
|
||||
qid: npt.NDArray[np.int32],
|
||||
clicks: npt.NDArray[np.int32],
|
||||
pos: npt.NDArray[np.int64],
|
||||
) -> Tuple[
|
||||
sparse.csr_matrix,
|
||||
npt.NDArray[np.int32],
|
||||
npt.NDArray[np.int32],
|
||||
npt.NDArray[np.int32],
|
||||
]:
|
||||
"""Sort data based on query index and position."""
|
||||
sorted_idx = np.argsort(qid)
|
||||
X = X[sorted_idx]
|
||||
clicks = clicks[sorted_idx]
|
||||
qid = qid[sorted_idx]
|
||||
pos = pos[sorted_idx]
|
||||
|
||||
indptr, _, _ = rlencode(qid)
|
||||
|
||||
for i in range(1, indptr.size):
|
||||
beg = indptr[i - 1]
|
||||
end = indptr[i]
|
||||
|
||||
assert beg < end, (beg, end)
|
||||
assert np.unique(qid[beg:end]).size == 1, (beg, end)
|
||||
|
||||
query_pos = pos[beg:end]
|
||||
assert query_pos.min() == 0, query_pos.min()
|
||||
assert query_pos.max() >= query_pos.size - 1, (
|
||||
query_pos.max(),
|
||||
query_pos.size,
|
||||
i,
|
||||
np.unique(qid[beg:end]),
|
||||
)
|
||||
sorted_idx = np.argsort(query_pos)
|
||||
|
||||
X[beg:end] = X[beg:end][sorted_idx]
|
||||
clicks[beg:end] = clicks[beg:end][sorted_idx]
|
||||
y[beg:end] = y[beg:end][sorted_idx]
|
||||
# not necessary
|
||||
qid[beg:end] = qid[beg:end][sorted_idx]
|
||||
|
||||
data = X, clicks, y, qid
|
||||
|
||||
return data
|
||||
|
||||
@@ -67,17 +67,3 @@ cat_parameter_strategy = strategies.fixed_dictionaries(
|
||||
"max_cat_threshold": strategies.integers(1, 128),
|
||||
}
|
||||
)
|
||||
|
||||
lambdarank_parameter_strategy = strategies.fixed_dictionaries(
|
||||
{
|
||||
"lambdarank_unbiased": strategies.sampled_from([True, False]),
|
||||
"lambdarank_pair_method": strategies.sampled_from(["topk", "mean"]),
|
||||
"lambdarank_num_pair_per_sample": strategies.integers(1, 8),
|
||||
"lambdarank_bias_norm": strategies.floats(0.5, 2.0),
|
||||
"objective": strategies.sampled_from(
|
||||
["rank:ndcg", "rank:map", "rank:pairwise"]
|
||||
),
|
||||
}
|
||||
).filter(
|
||||
lambda x: not (x["lambdarank_unbiased"] and x["lambdarank_pair_method"] == "mean")
|
||||
)
|
||||
|
||||
@@ -1,79 +1,73 @@
|
||||
/**
|
||||
* Copyright 2014-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright (c) 2014-2019 by Contributors
|
||||
* \file io.h
|
||||
* \brief utilities with different serializable implementations
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#ifndef RABIT_INTERNAL_IO_H_
|
||||
#define RABIT_INTERNAL_IO_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdio>
|
||||
#include <cstring> // for memcpy
|
||||
#include <limits>
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
#include <limits>
|
||||
#include "rabit/internal/utils.h"
|
||||
#include "rabit/serializable.h"
|
||||
|
||||
namespace rabit::utils {
|
||||
namespace rabit {
|
||||
namespace utils {
|
||||
/*! \brief re-use definition of dmlc::SeekStream */
|
||||
using SeekStream = dmlc::SeekStream;
|
||||
/**
|
||||
* @brief Fixed size memory buffer as a stream.
|
||||
*/
|
||||
/*! \brief fixed size memory buffer */
|
||||
struct MemoryFixSizeBuffer : public SeekStream {
|
||||
public:
|
||||
// similar to SEEK_END in libc
|
||||
static std::size_t constexpr kSeekEnd = std::numeric_limits<std::size_t>::max();
|
||||
static size_t constexpr kSeekEnd = std::numeric_limits<size_t>::max();
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief Ctor
|
||||
*
|
||||
* @param p_buffer Pointer to the source buffer with size `buffer_size`.
|
||||
* @param buffer_size Size of the source buffer
|
||||
*/
|
||||
MemoryFixSizeBuffer(void *p_buffer, std::size_t buffer_size)
|
||||
: p_buffer_(reinterpret_cast<char *>(p_buffer)), buffer_size_(buffer_size) {}
|
||||
MemoryFixSizeBuffer(void *p_buffer, size_t buffer_size)
|
||||
: p_buffer_(reinterpret_cast<char*>(p_buffer)),
|
||||
buffer_size_(buffer_size) {
|
||||
curr_ptr_ = 0;
|
||||
}
|
||||
~MemoryFixSizeBuffer() override = default;
|
||||
|
||||
std::size_t Read(void *ptr, std::size_t size) override {
|
||||
std::size_t nread = std::min(buffer_size_ - curr_ptr_, size);
|
||||
size_t Read(void *ptr, size_t size) override {
|
||||
size_t nread = std::min(buffer_size_ - curr_ptr_, size);
|
||||
if (nread != 0) std::memcpy(ptr, p_buffer_ + curr_ptr_, nread);
|
||||
curr_ptr_ += nread;
|
||||
return nread;
|
||||
}
|
||||
void Write(const void *ptr, std::size_t size) override {
|
||||
void Write(const void *ptr, size_t size) override {
|
||||
if (size == 0) return;
|
||||
CHECK_LE(curr_ptr_ + size, buffer_size_);
|
||||
utils::Assert(curr_ptr_ + size <= buffer_size_,
|
||||
"write position exceed fixed buffer size");
|
||||
std::memcpy(p_buffer_ + curr_ptr_, ptr, size);
|
||||
curr_ptr_ += size;
|
||||
}
|
||||
void Seek(std::size_t pos) override {
|
||||
void Seek(size_t pos) override {
|
||||
if (pos == kSeekEnd) {
|
||||
curr_ptr_ = buffer_size_;
|
||||
} else {
|
||||
curr_ptr_ = static_cast<std::size_t>(pos);
|
||||
curr_ptr_ = static_cast<size_t>(pos);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* @brief Current position in the buffer (stream).
|
||||
*/
|
||||
std::size_t Tell() override { return curr_ptr_; }
|
||||
[[nodiscard]] virtual bool AtEnd() const { return curr_ptr_ == buffer_size_; }
|
||||
size_t Tell() override {
|
||||
return curr_ptr_;
|
||||
}
|
||||
virtual bool AtEnd() const {
|
||||
return curr_ptr_ == buffer_size_;
|
||||
}
|
||||
|
||||
protected:
|
||||
private:
|
||||
/*! \brief in memory buffer */
|
||||
char *p_buffer_{nullptr};
|
||||
char *p_buffer_;
|
||||
/*! \brief current pointer */
|
||||
std::size_t buffer_size_{0};
|
||||
size_t buffer_size_;
|
||||
/*! \brief current pointer */
|
||||
std::size_t curr_ptr_{0};
|
||||
};
|
||||
size_t curr_ptr_;
|
||||
}; // class MemoryFixSizeBuffer
|
||||
|
||||
/*! \brief a in memory buffer that can be read and write as stream interface */
|
||||
struct MemoryBufferStream : public SeekStream {
|
||||
@@ -115,5 +109,6 @@ struct MemoryBufferStream : public SeekStream {
|
||||
/*! \brief current pointer */
|
||||
size_t curr_ptr_;
|
||||
}; // class MemoryBufferStream
|
||||
} // namespace rabit::utils
|
||||
} // namespace utils
|
||||
} // namespace rabit
|
||||
#endif // RABIT_INTERNAL_IO_H_
|
||||
|
||||
@@ -1023,6 +1023,7 @@ void InplacePredictImpl(std::shared_ptr<DMatrix> p_m, char const *c_json_config,
|
||||
const float **out_result) {
|
||||
xgboost_CHECK_C_ARG_PTR(c_json_config);
|
||||
auto config = Json::Load(StringView{c_json_config});
|
||||
CHECK_EQ(get<Integer const>(config["cache_id"]), 0) << "Cache ID is not supported yet";
|
||||
|
||||
HostDeviceVector<float> *p_predt{nullptr};
|
||||
auto type = PredictionType(RequiredArg<Integer>(config, "type", __func__));
|
||||
@@ -1041,7 +1042,6 @@ void InplacePredictImpl(std::shared_ptr<DMatrix> p_m, char const *c_json_config,
|
||||
xgboost_CHECK_C_ARG_PTR(out_dim);
|
||||
CalcPredictShape(strict_shape, type, n_samples, n_features, chunksize, learner->Groups(),
|
||||
learner->BoostedRounds(), &shape, out_dim);
|
||||
CHECK_GE(p_predt->Size(), n_samples);
|
||||
|
||||
xgboost_CHECK_C_ARG_PTR(out_result);
|
||||
xgboost_CHECK_C_ARG_PTR(out_shape);
|
||||
|
||||
@@ -92,7 +92,7 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data,
|
||||
API_END();
|
||||
}
|
||||
|
||||
int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
|
||||
int InplacePreidctCuda(BoosterHandle handle, char const *c_array_interface,
|
||||
char const *c_json_config, std::shared_ptr<DMatrix> p_m,
|
||||
xgboost::bst_ulong const **out_shape, xgboost::bst_ulong *out_dim,
|
||||
const float **out_result) {
|
||||
@@ -107,6 +107,7 @@ int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
|
||||
proxy->SetCUDAArray(c_array_interface);
|
||||
|
||||
auto config = Json::Load(StringView{c_json_config});
|
||||
CHECK_EQ(get<Integer const>(config["cache_id"]), 0) << "Cache ID is not supported yet";
|
||||
auto *learner = static_cast<Learner *>(handle);
|
||||
|
||||
HostDeviceVector<float> *p_predt{nullptr};
|
||||
@@ -117,13 +118,7 @@ int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
|
||||
RequiredArg<Integer>(config, "iteration_begin", __func__),
|
||||
RequiredArg<Integer>(config, "iteration_end", __func__));
|
||||
CHECK(p_predt);
|
||||
if (learner->Ctx()->IsCPU()) {
|
||||
// Prediction using DMatrix as fallback.
|
||||
CHECK(p_predt->HostCanRead() && !p_predt->DeviceCanRead());
|
||||
} else {
|
||||
CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
|
||||
}
|
||||
p_predt->SetDevice(proxy->DeviceIdx());
|
||||
CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
|
||||
|
||||
auto &shape = learner->GetThreadLocal().prediction_shape;
|
||||
size_t n_samples = p_m->Info().num_row_;
|
||||
@@ -151,7 +146,7 @@ XGB_DLL int XGBoosterPredictFromCudaColumnar(BoosterHandle handle, char const *c
|
||||
if (m) {
|
||||
p_m = *static_cast<std::shared_ptr<DMatrix> *>(m);
|
||||
}
|
||||
return InplacePreidctCUDA(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
|
||||
return InplacePreidctCuda(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
|
||||
out_result);
|
||||
}
|
||||
|
||||
@@ -164,6 +159,6 @@ XGB_DLL int XGBoosterPredictFromCudaArray(BoosterHandle handle, char const *c_js
|
||||
p_m = *static_cast<std::shared_ptr<DMatrix> *>(m);
|
||||
}
|
||||
xgboost_CHECK_C_ARG_PTR(out_result);
|
||||
return InplacePreidctCUDA(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
|
||||
return InplacePreidctCuda(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
|
||||
out_result);
|
||||
}
|
||||
|
||||
@@ -1,81 +0,0 @@
|
||||
/**
|
||||
* Copyright 2023 by XGBoost contributors
|
||||
*/
|
||||
#pragma once
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "communicator.h"
|
||||
#include "device_communicator.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace collective {
|
||||
|
||||
/**
|
||||
* @brief Reduce values from all processes and distribute the result back to all processes.
|
||||
* @param device ID of the device.
|
||||
* @param send_receive_buffer Buffer storing the data.
|
||||
* @param count Number of elements in the buffer.
|
||||
*/
|
||||
template <Operation op>
|
||||
inline void AllReduce(int device, std::int8_t *send_receive_buffer, size_t count) {
|
||||
Communicator::GetDevice(device)->AllReduce(send_receive_buffer, count, DataType::kInt8, op);
|
||||
}
|
||||
|
||||
template <Operation op>
|
||||
inline void AllReduce(int device, std::uint8_t *send_receive_buffer, size_t count) {
|
||||
Communicator::GetDevice(device)->AllReduce(send_receive_buffer, count, DataType::kUInt8, op);
|
||||
}
|
||||
|
||||
template <Operation op>
|
||||
inline void AllReduce(int device, std::int32_t *send_receive_buffer, size_t count) {
|
||||
Communicator::GetDevice(device)->AllReduce(send_receive_buffer, count, DataType::kInt32, op);
|
||||
}
|
||||
|
||||
template <Operation op>
|
||||
inline void AllReduce(int device, std::uint32_t *send_receive_buffer, size_t count) {
|
||||
Communicator::GetDevice(device)->AllReduce(send_receive_buffer, count, DataType::kUInt32, op);
|
||||
}
|
||||
|
||||
template <Operation op>
|
||||
inline void AllReduce(int device, std::int64_t *send_receive_buffer, size_t count) {
|
||||
Communicator::GetDevice(device)->AllReduce(send_receive_buffer, count, DataType::kInt64, op);
|
||||
}
|
||||
|
||||
template <Operation op>
|
||||
inline void AllReduce(int device, std::uint64_t *send_receive_buffer, size_t count) {
|
||||
Communicator::GetDevice(device)->AllReduce(send_receive_buffer, count, DataType::kUInt64, op);
|
||||
}
|
||||
|
||||
template <Operation op>
|
||||
inline void AllReduce(int device, float *send_receive_buffer, size_t count) {
|
||||
Communicator::GetDevice(device)->AllReduce(send_receive_buffer, count, DataType::kFloat, op);
|
||||
}
|
||||
|
||||
template <Operation op>
|
||||
inline void AllReduce(int device, double *send_receive_buffer, size_t count) {
|
||||
Communicator::GetDevice(device)->AllReduce(send_receive_buffer, count, DataType::kDouble, op);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Gather variable-length values from all processes.
|
||||
* @param device ID of the device.
|
||||
* @param send_buffer Buffer storing the input data.
|
||||
* @param length_bytes Length in bytes of the input data.
|
||||
* @param segments Size of each segment.
|
||||
* @param receive_buffer Buffer storing the output data.
|
||||
*/
|
||||
inline void AllGatherV(int device, void const *send_buffer, size_t length_bytes,
|
||||
std::vector<size_t> *segments,
|
||||
dh::caching_device_vector<char> *receive_buffer) {
|
||||
Communicator::GetDevice(device)->AllGatherV(send_buffer, length_bytes, segments, receive_buffer);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Synchronize device operations.
|
||||
* @param device ID of the device.
|
||||
*/
|
||||
inline void Synchronize(int device) { Communicator::GetDevice(device)->Synchronize(); }
|
||||
|
||||
} // namespace collective
|
||||
} // namespace xgboost
|
||||
@@ -29,21 +29,13 @@ DeviceCommunicator* Communicator::GetDevice(int device_ordinal) {
|
||||
old_device_ordinal = device_ordinal;
|
||||
old_world_size = communicator_->GetWorldSize();
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
switch (type_) {
|
||||
case CommunicatorType::kRabit:
|
||||
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
|
||||
break;
|
||||
case CommunicatorType::kFederated:
|
||||
device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal));
|
||||
break;
|
||||
case CommunicatorType::kInMemory:
|
||||
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, true));
|
||||
break;
|
||||
default:
|
||||
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
|
||||
if (type_ != CommunicatorType::kFederated) {
|
||||
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, Get()));
|
||||
} else {
|
||||
device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal, Get()));
|
||||
}
|
||||
#else
|
||||
device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal));
|
||||
device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal, Get()));
|
||||
#endif
|
||||
}
|
||||
return device_communicator_.get();
|
||||
|
||||
@@ -17,15 +17,32 @@ class DeviceCommunicator {
|
||||
virtual ~DeviceCommunicator() = default;
|
||||
|
||||
/**
|
||||
* @brief Combines values from all processes and distributes the result back to all processes.
|
||||
*
|
||||
* @brief Sum values from all processes and distribute the result back to all processes.
|
||||
* @param send_receive_buffer Buffer storing the data.
|
||||
* @param count Number of elements in the buffer.
|
||||
* @param data_type Data type stored in the buffer.
|
||||
* @param op The operation to perform.
|
||||
*/
|
||||
virtual void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
|
||||
Operation op) = 0;
|
||||
virtual void AllReduceSum(float *send_receive_buffer, size_t count) = 0;
|
||||
|
||||
/**
|
||||
* @brief Sum values from all processes and distribute the result back to all processes.
|
||||
* @param send_receive_buffer Buffer storing the data.
|
||||
* @param count Number of elements in the buffer.
|
||||
*/
|
||||
virtual void AllReduceSum(double *send_receive_buffer, size_t count) = 0;
|
||||
|
||||
/**
|
||||
* @brief Sum values from all processes and distribute the result back to all processes.
|
||||
* @param send_receive_buffer Buffer storing the data.
|
||||
* @param count Number of elements in the buffer.
|
||||
*/
|
||||
virtual void AllReduceSum(int64_t *send_receive_buffer, size_t count) = 0;
|
||||
|
||||
/**
|
||||
* @brief Sum values from all processes and distribute the result back to all processes.
|
||||
* @param send_receive_buffer Buffer storing the data.
|
||||
* @param count Number of elements in the buffer.
|
||||
*/
|
||||
virtual void AllReduceSum(uint64_t *send_receive_buffer, size_t count) = 0;
|
||||
|
||||
/**
|
||||
* @brief Gather variable-length values from all processes.
|
||||
|
||||
@@ -11,53 +11,61 @@ namespace collective {
|
||||
|
||||
class DeviceCommunicatorAdapter : public DeviceCommunicator {
|
||||
public:
|
||||
explicit DeviceCommunicatorAdapter(int device_ordinal)
|
||||
: device_ordinal_{device_ordinal}, world_size_{GetWorldSize()}, rank_{GetRank()} {
|
||||
DeviceCommunicatorAdapter(int device_ordinal, Communicator *communicator)
|
||||
: device_ordinal_{device_ordinal}, communicator_{communicator} {
|
||||
if (device_ordinal_ < 0) {
|
||||
LOG(FATAL) << "Invalid device ordinal: " << device_ordinal_;
|
||||
}
|
||||
if (communicator_ == nullptr) {
|
||||
LOG(FATAL) << "Communicator cannot be null.";
|
||||
}
|
||||
}
|
||||
|
||||
~DeviceCommunicatorAdapter() override = default;
|
||||
|
||||
void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
|
||||
Operation op) override {
|
||||
if (world_size_ == 1) {
|
||||
return;
|
||||
}
|
||||
void AllReduceSum(float *send_receive_buffer, size_t count) override {
|
||||
DoAllReduceSum<collective::DataType::kFloat>(send_receive_buffer, count);
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
auto size = count * GetTypeSize(data_type);
|
||||
host_buffer_.reserve(size);
|
||||
dh::safe_cuda(cudaMemcpy(host_buffer_.data(), send_receive_buffer, size, cudaMemcpyDefault));
|
||||
Allreduce(host_buffer_.data(), count, data_type, op);
|
||||
dh::safe_cuda(cudaMemcpy(send_receive_buffer, host_buffer_.data(), size, cudaMemcpyDefault));
|
||||
void AllReduceSum(double *send_receive_buffer, size_t count) override {
|
||||
DoAllReduceSum<collective::DataType::kDouble>(send_receive_buffer, count);
|
||||
}
|
||||
|
||||
void AllReduceSum(int64_t *send_receive_buffer, size_t count) override {
|
||||
DoAllReduceSum<collective::DataType::kInt64>(send_receive_buffer, count);
|
||||
}
|
||||
|
||||
void AllReduceSum(uint64_t *send_receive_buffer, size_t count) override {
|
||||
DoAllReduceSum<collective::DataType::kUInt64>(send_receive_buffer, count);
|
||||
}
|
||||
|
||||
void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector<std::size_t> *segments,
|
||||
dh::caching_device_vector<char> *receive_buffer) override {
|
||||
if (world_size_ == 1) {
|
||||
if (communicator_->GetWorldSize() == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
int const world_size = communicator_->GetWorldSize();
|
||||
int const rank = communicator_->GetRank();
|
||||
|
||||
segments->clear();
|
||||
segments->resize(world_size_, 0);
|
||||
segments->at(rank_) = length_bytes;
|
||||
Allreduce(segments->data(), segments->size(), DataType::kUInt64, Operation::kMax);
|
||||
segments->resize(world_size, 0);
|
||||
segments->at(rank) = length_bytes;
|
||||
communicator_->AllReduce(segments->data(), segments->size(), DataType::kUInt64,
|
||||
Operation::kMax);
|
||||
auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL);
|
||||
receive_buffer->resize(total_bytes);
|
||||
|
||||
host_buffer_.reserve(total_bytes);
|
||||
size_t offset = 0;
|
||||
for (int32_t i = 0; i < world_size_; ++i) {
|
||||
for (int32_t i = 0; i < world_size; ++i) {
|
||||
size_t as_bytes = segments->at(i);
|
||||
if (i == rank_) {
|
||||
dh::safe_cuda(cudaMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank_),
|
||||
if (i == rank) {
|
||||
dh::safe_cuda(cudaMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank),
|
||||
cudaMemcpyDefault));
|
||||
}
|
||||
Broadcast(host_buffer_.data() + offset, as_bytes, i);
|
||||
communicator_->Broadcast(host_buffer_.data() + offset, as_bytes, i);
|
||||
offset += as_bytes;
|
||||
}
|
||||
dh::safe_cuda(cudaMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
|
||||
@@ -69,9 +77,22 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
|
||||
}
|
||||
|
||||
private:
|
||||
template <collective::DataType data_type, typename T>
|
||||
void DoAllReduceSum(T *send_receive_buffer, size_t count) {
|
||||
if (communicator_->GetWorldSize() == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
auto size = count * sizeof(T);
|
||||
host_buffer_.reserve(size);
|
||||
dh::safe_cuda(cudaMemcpy(host_buffer_.data(), send_receive_buffer, size, cudaMemcpyDefault));
|
||||
communicator_->AllReduce(host_buffer_.data(), count, data_type, collective::Operation::kSum);
|
||||
dh::safe_cuda(cudaMemcpy(send_receive_buffer, host_buffer_.data(), size, cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
int const device_ordinal_;
|
||||
int const world_size_;
|
||||
int const rank_;
|
||||
Communicator *communicator_;
|
||||
/// Host buffer used to call communicator functions.
|
||||
std::vector<char> host_buffer_{};
|
||||
};
|
||||
|
||||
@@ -1,225 +0,0 @@
|
||||
/*!
|
||||
* Copyright 2023 XGBoost contributors
|
||||
*/
|
||||
#if defined(XGBOOST_USE_NCCL)
|
||||
#include "nccl_device_communicator.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace collective {
|
||||
|
||||
NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sync)
|
||||
: device_ordinal_{device_ordinal},
|
||||
needs_sync_{needs_sync},
|
||||
world_size_{GetWorldSize()},
|
||||
rank_{GetRank()} {
|
||||
if (device_ordinal_ < 0) {
|
||||
LOG(FATAL) << "Invalid device ordinal: " << device_ordinal_;
|
||||
}
|
||||
if (world_size_ == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<uint64_t> uuids(world_size_ * kUuidLength, 0);
|
||||
auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
|
||||
auto s_this_uuid = s_uuid.subspan(rank_ * kUuidLength, kUuidLength);
|
||||
GetCudaUUID(s_this_uuid);
|
||||
|
||||
// TODO(rongou): replace this with allgather.
|
||||
Allreduce(uuids.data(), uuids.size(), DataType::kUInt64, Operation::kSum);
|
||||
|
||||
std::vector<xgboost::common::Span<uint64_t, kUuidLength>> converted(world_size_);
|
||||
size_t j = 0;
|
||||
for (size_t i = 0; i < uuids.size(); i += kUuidLength) {
|
||||
converted[j] = xgboost::common::Span<uint64_t, kUuidLength>{uuids.data() + i, kUuidLength};
|
||||
j++;
|
||||
}
|
||||
|
||||
auto iter = std::unique(converted.begin(), converted.end());
|
||||
auto n_uniques = std::distance(converted.begin(), iter);
|
||||
|
||||
CHECK_EQ(n_uniques, world_size_)
|
||||
<< "Multiple processes within communication group running on same CUDA "
|
||||
<< "device is not supported. " << PrintUUID(s_this_uuid) << "\n";
|
||||
|
||||
nccl_unique_id_ = GetUniqueId();
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_));
|
||||
dh::safe_cuda(cudaStreamCreate(&cuda_stream_));
|
||||
}
|
||||
|
||||
NcclDeviceCommunicator::~NcclDeviceCommunicator() {
|
||||
if (world_size_ == 1) {
|
||||
return;
|
||||
}
|
||||
if (cuda_stream_) {
|
||||
dh::safe_cuda(cudaStreamDestroy(cuda_stream_));
|
||||
}
|
||||
if (nccl_comm_) {
|
||||
dh::safe_nccl(ncclCommDestroy(nccl_comm_));
|
||||
}
|
||||
if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
|
||||
LOG(CONSOLE) << "======== NCCL Statistics========";
|
||||
LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
|
||||
LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_ / 1048576;
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
ncclDataType_t GetNcclDataType(DataType const &data_type) {
|
||||
ncclDataType_t result{ncclInt8};
|
||||
switch (data_type) {
|
||||
case DataType::kInt8:
|
||||
result = ncclInt8;
|
||||
break;
|
||||
case DataType::kUInt8:
|
||||
result = ncclUint8;
|
||||
break;
|
||||
case DataType::kInt32:
|
||||
result = ncclInt32;
|
||||
break;
|
||||
case DataType::kUInt32:
|
||||
result = ncclUint32;
|
||||
break;
|
||||
case DataType::kInt64:
|
||||
result = ncclInt64;
|
||||
break;
|
||||
case DataType::kUInt64:
|
||||
result = ncclUint64;
|
||||
break;
|
||||
case DataType::kFloat:
|
||||
result = ncclFloat;
|
||||
break;
|
||||
case DataType::kDouble:
|
||||
result = ncclDouble;
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Unknown data type.";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
bool IsBitwiseOp(Operation const &op) {
|
||||
return op == Operation::kBitwiseAND || op == Operation::kBitwiseOR ||
|
||||
op == Operation::kBitwiseXOR;
|
||||
}
|
||||
|
||||
ncclRedOp_t GetNcclRedOp(Operation const &op) {
|
||||
ncclRedOp_t result{ncclMax};
|
||||
switch (op) {
|
||||
case Operation::kMax:
|
||||
result = ncclMax;
|
||||
break;
|
||||
case Operation::kMin:
|
||||
result = ncclMin;
|
||||
break;
|
||||
case Operation::kSum:
|
||||
result = ncclSum;
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Unsupported reduce operation.";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename Func>
|
||||
void RunBitwiseAllreduce(char *out_buffer, char const *device_buffer, Func func, int world_size,
|
||||
std::size_t size, cudaStream_t stream) {
|
||||
dh::LaunchN(size, stream, [=] __device__(std::size_t idx) {
|
||||
auto result = device_buffer[idx];
|
||||
for (auto rank = 1; rank < world_size; rank++) {
|
||||
result = func(result, device_buffer[rank * size + idx]);
|
||||
}
|
||||
out_buffer[idx] = result;
|
||||
});
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
void NcclDeviceCommunicator::BitwiseAllReduce(void *send_receive_buffer, std::size_t count,
|
||||
DataType data_type, Operation op) {
|
||||
auto const size = count * GetTypeSize(data_type);
|
||||
dh::caching_device_vector<char> buffer(size * world_size_);
|
||||
auto *device_buffer = buffer.data().get();
|
||||
|
||||
// First gather data from all the workers.
|
||||
dh::safe_nccl(ncclAllGather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type),
|
||||
nccl_comm_, cuda_stream_));
|
||||
if (needs_sync_) {
|
||||
dh::safe_cuda(cudaStreamSynchronize(cuda_stream_));
|
||||
}
|
||||
|
||||
// Then reduce locally.
|
||||
auto *out_buffer = static_cast<char *>(send_receive_buffer);
|
||||
switch (op) {
|
||||
case Operation::kBitwiseAND:
|
||||
RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_and<char>(), world_size_, size,
|
||||
cuda_stream_);
|
||||
break;
|
||||
case Operation::kBitwiseOR:
|
||||
RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_or<char>(), world_size_, size,
|
||||
cuda_stream_);
|
||||
break;
|
||||
case Operation::kBitwiseXOR:
|
||||
RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_xor<char>(), world_size_, size,
|
||||
cuda_stream_);
|
||||
break;
|
||||
default:
|
||||
LOG(FATAL) << "Not a bitwise reduce operation.";
|
||||
}
|
||||
}
|
||||
|
||||
void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t count,
|
||||
DataType data_type, Operation op) {
|
||||
if (world_size_ == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
if (IsBitwiseOp(op)) {
|
||||
BitwiseAllReduce(send_receive_buffer, count, data_type, op);
|
||||
} else {
|
||||
dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count,
|
||||
GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_,
|
||||
cuda_stream_));
|
||||
}
|
||||
allreduce_bytes_ += count * GetTypeSize(data_type);
|
||||
allreduce_calls_ += 1;
|
||||
}
|
||||
|
||||
void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_bytes,
|
||||
std::vector<std::size_t> *segments,
|
||||
dh::caching_device_vector<char> *receive_buffer) {
|
||||
if (world_size_ == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
|
||||
segments->clear();
|
||||
segments->resize(world_size_, 0);
|
||||
segments->at(rank_) = length_bytes;
|
||||
Allreduce(segments->data(), segments->size(), DataType::kUInt64, Operation::kMax);
|
||||
auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL);
|
||||
receive_buffer->resize(total_bytes);
|
||||
|
||||
size_t offset = 0;
|
||||
dh::safe_nccl(ncclGroupStart());
|
||||
for (int32_t i = 0; i < world_size_; ++i) {
|
||||
size_t as_bytes = segments->at(i);
|
||||
dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
|
||||
ncclChar, i, nccl_comm_, cuda_stream_));
|
||||
offset += as_bytes;
|
||||
}
|
||||
dh::safe_nccl(ncclGroupEnd());
|
||||
}
|
||||
|
||||
void NcclDeviceCommunicator::Synchronize() {
|
||||
if (world_size_ == 1) {
|
||||
return;
|
||||
}
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
dh::safe_cuda(cudaStreamSynchronize(cuda_stream_));
|
||||
}
|
||||
|
||||
} // namespace collective
|
||||
} // namespace xgboost
|
||||
#endif
|
||||
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright 2022-2023 XGBoost contributors
|
||||
* Copyright 2022 XGBoost contributors
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
@@ -12,26 +12,118 @@ namespace collective {
|
||||
|
||||
class NcclDeviceCommunicator : public DeviceCommunicator {
|
||||
public:
|
||||
/**
|
||||
* @brief Construct a new NCCL communicator.
|
||||
* @param device_ordinal The GPU device id.
|
||||
* @param needs_sync Whether extra CUDA stream synchronization is needed.
|
||||
*
|
||||
* In multi-GPU tests when multiple NCCL communicators are created in the same process, sometimes
|
||||
* a deadlock happens because NCCL kernels are blocking. The extra CUDA stream synchronization
|
||||
* makes sure that the NCCL kernels are caught up, thus avoiding the deadlock.
|
||||
*
|
||||
* The Rabit communicator runs with one process per GPU, so the additional synchronization is not
|
||||
* needed. The in-memory communicator is used in tests with multiple threads, each thread
|
||||
* representing a rank/worker, so the additional synchronization is needed to avoid deadlocks.
|
||||
*/
|
||||
explicit NcclDeviceCommunicator(int device_ordinal, bool needs_sync);
|
||||
~NcclDeviceCommunicator() override;
|
||||
void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
|
||||
Operation op) override;
|
||||
NcclDeviceCommunicator(int device_ordinal, Communicator *communicator)
|
||||
: device_ordinal_{device_ordinal}, communicator_{communicator} {
|
||||
if (device_ordinal_ < 0) {
|
||||
LOG(FATAL) << "Invalid device ordinal: " << device_ordinal_;
|
||||
}
|
||||
if (communicator_ == nullptr) {
|
||||
LOG(FATAL) << "Communicator cannot be null.";
|
||||
}
|
||||
|
||||
int32_t const rank = communicator_->GetRank();
|
||||
int32_t const world = communicator_->GetWorldSize();
|
||||
|
||||
if (world == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<uint64_t> uuids(world * kUuidLength, 0);
|
||||
auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
|
||||
auto s_this_uuid = s_uuid.subspan(rank * kUuidLength, kUuidLength);
|
||||
GetCudaUUID(s_this_uuid);
|
||||
|
||||
// TODO(rongou): replace this with allgather.
|
||||
communicator_->AllReduce(uuids.data(), uuids.size(), DataType::kUInt64, Operation::kSum);
|
||||
|
||||
std::vector<xgboost::common::Span<uint64_t, kUuidLength>> converted(world);
|
||||
size_t j = 0;
|
||||
for (size_t i = 0; i < uuids.size(); i += kUuidLength) {
|
||||
converted[j] = xgboost::common::Span<uint64_t, kUuidLength>{uuids.data() + i, kUuidLength};
|
||||
j++;
|
||||
}
|
||||
|
||||
auto iter = std::unique(converted.begin(), converted.end());
|
||||
auto n_uniques = std::distance(converted.begin(), iter);
|
||||
|
||||
CHECK_EQ(n_uniques, world)
|
||||
<< "Multiple processes within communication group running on same CUDA "
|
||||
<< "device is not supported. " << PrintUUID(s_this_uuid) << "\n";
|
||||
|
||||
nccl_unique_id_ = GetUniqueId();
|
||||
dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world, nccl_unique_id_, rank));
|
||||
dh::safe_cuda(cudaStreamCreate(&cuda_stream_));
|
||||
}
|
||||
|
||||
~NcclDeviceCommunicator() override {
|
||||
if (communicator_->GetWorldSize() == 1) {
|
||||
return;
|
||||
}
|
||||
if (cuda_stream_) {
|
||||
dh::safe_cuda(cudaStreamDestroy(cuda_stream_));
|
||||
}
|
||||
if (nccl_comm_) {
|
||||
dh::safe_nccl(ncclCommDestroy(nccl_comm_));
|
||||
}
|
||||
if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
|
||||
LOG(CONSOLE) << "======== NCCL Statistics========";
|
||||
LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
|
||||
LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_ / 1048576;
|
||||
}
|
||||
}
|
||||
|
||||
void AllReduceSum(float *send_receive_buffer, size_t count) override {
|
||||
DoAllReduceSum<ncclFloat>(send_receive_buffer, count);
|
||||
}
|
||||
|
||||
void AllReduceSum(double *send_receive_buffer, size_t count) override {
|
||||
DoAllReduceSum<ncclDouble>(send_receive_buffer, count);
|
||||
}
|
||||
|
||||
void AllReduceSum(int64_t *send_receive_buffer, size_t count) override {
|
||||
DoAllReduceSum<ncclInt64>(send_receive_buffer, count);
|
||||
}
|
||||
|
||||
void AllReduceSum(uint64_t *send_receive_buffer, size_t count) override {
|
||||
DoAllReduceSum<ncclUint64>(send_receive_buffer, count);
|
||||
}
|
||||
|
||||
void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector<std::size_t> *segments,
|
||||
dh::caching_device_vector<char> *receive_buffer) override;
|
||||
void Synchronize() override;
|
||||
dh::caching_device_vector<char> *receive_buffer) override {
|
||||
if (communicator_->GetWorldSize() == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
int const world_size = communicator_->GetWorldSize();
|
||||
int const rank = communicator_->GetRank();
|
||||
|
||||
segments->clear();
|
||||
segments->resize(world_size, 0);
|
||||
segments->at(rank) = length_bytes;
|
||||
communicator_->AllReduce(segments->data(), segments->size(), DataType::kUInt64,
|
||||
Operation::kMax);
|
||||
auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL);
|
||||
receive_buffer->resize(total_bytes);
|
||||
|
||||
size_t offset = 0;
|
||||
dh::safe_nccl(ncclGroupStart());
|
||||
for (int32_t i = 0; i < world_size; ++i) {
|
||||
size_t as_bytes = segments->at(i);
|
||||
dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
|
||||
ncclChar, i, nccl_comm_, cuda_stream_));
|
||||
offset += as_bytes;
|
||||
}
|
||||
dh::safe_nccl(ncclGroupEnd());
|
||||
}
|
||||
|
||||
void Synchronize() override {
|
||||
if (communicator_->GetWorldSize() == 1) {
|
||||
return;
|
||||
}
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
dh::safe_cuda(cudaStreamSynchronize(cuda_stream_));
|
||||
}
|
||||
|
||||
private:
|
||||
static constexpr std::size_t kUuidLength =
|
||||
@@ -62,20 +154,29 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
|
||||
ncclUniqueId GetUniqueId() {
|
||||
static const int kRootRank = 0;
|
||||
ncclUniqueId id;
|
||||
if (rank_ == kRootRank) {
|
||||
if (communicator_->GetRank() == kRootRank) {
|
||||
dh::safe_nccl(ncclGetUniqueId(&id));
|
||||
}
|
||||
Broadcast(static_cast<void *>(&id), sizeof(ncclUniqueId), static_cast<int>(kRootRank));
|
||||
communicator_->Broadcast(static_cast<void *>(&id), sizeof(ncclUniqueId),
|
||||
static_cast<int>(kRootRank));
|
||||
return id;
|
||||
}
|
||||
|
||||
void BitwiseAllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
|
||||
Operation op);
|
||||
template <ncclDataType_t data_type, typename T>
|
||||
void DoAllReduceSum(T *send_receive_buffer, size_t count) {
|
||||
if (communicator_->GetWorldSize() == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(device_ordinal_));
|
||||
dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count, data_type, ncclSum,
|
||||
nccl_comm_, cuda_stream_));
|
||||
allreduce_bytes_ += count * sizeof(T);
|
||||
allreduce_calls_ += 1;
|
||||
}
|
||||
|
||||
int const device_ordinal_;
|
||||
bool const needs_sync_;
|
||||
int const world_size_;
|
||||
int const rank_;
|
||||
Communicator *communicator_;
|
||||
ncclComm_t nccl_comm_{};
|
||||
cudaStream_t cuda_stream_{};
|
||||
ncclUniqueId nccl_unique_id_{};
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2019-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright 2019 by Contributors
|
||||
* \file bitfield.h
|
||||
*/
|
||||
#ifndef XGBOOST_COMMON_BITFIELD_H_
|
||||
@@ -50,17 +50,14 @@ __forceinline__ __device__ BitFieldAtomicType AtomicAnd(BitFieldAtomicType* addr
|
||||
}
|
||||
#endif // defined(__CUDACC__)
|
||||
|
||||
/**
|
||||
* @brief A non-owning type with auxiliary methods defined for manipulating bits.
|
||||
/*!
|
||||
* \brief A non-owning type with auxiliary methods defined for manipulating bits.
|
||||
*
|
||||
* @tparam VT Underlying value type, must be an unsigned integer.
|
||||
* @tparam Direction Whether the bits start from left or from right.
|
||||
* @tparam IsConst Whether the view is const.
|
||||
* \tparam Direction Whether the bits start from left or from right.
|
||||
*/
|
||||
template <typename VT, typename Direction, bool IsConst = false>
|
||||
struct BitFieldContainer {
|
||||
using value_type = std::conditional_t<IsConst, VT const, VT>; // NOLINT
|
||||
using size_type = size_t; // NOLINT
|
||||
using index_type = size_t; // NOLINT
|
||||
using pointer = value_type*; // NOLINT
|
||||
|
||||
@@ -73,9 +70,8 @@ struct BitFieldContainer {
|
||||
};
|
||||
|
||||
private:
|
||||
value_type* bits_{nullptr};
|
||||
size_type n_values_{0};
|
||||
static_assert(!std::is_signed<VT>::value, "Must use an unsiged type as the underlying storage.");
|
||||
common::Span<value_type> bits_;
|
||||
static_assert(!std::is_signed<VT>::value, "Must use unsiged type as underlying storage.");
|
||||
|
||||
public:
|
||||
XGBOOST_DEVICE static Pos ToBitPos(index_type pos) {
|
||||
@@ -90,15 +86,13 @@ struct BitFieldContainer {
|
||||
|
||||
public:
|
||||
BitFieldContainer() = default;
|
||||
XGBOOST_DEVICE explicit BitFieldContainer(common::Span<value_type> bits)
|
||||
: bits_{bits.data()}, n_values_{bits.size()} {}
|
||||
BitFieldContainer(BitFieldContainer const& other) = default;
|
||||
BitFieldContainer(BitFieldContainer&& other) = default;
|
||||
XGBOOST_DEVICE explicit BitFieldContainer(common::Span<value_type> bits) : bits_{bits} {}
|
||||
XGBOOST_DEVICE BitFieldContainer(BitFieldContainer const& other) : bits_{other.bits_} {}
|
||||
BitFieldContainer &operator=(BitFieldContainer const &that) = default;
|
||||
BitFieldContainer &operator=(BitFieldContainer &&that) = default;
|
||||
|
||||
XGBOOST_DEVICE auto Bits() { return common::Span<value_type>{bits_, NumValues()}; }
|
||||
XGBOOST_DEVICE auto Bits() const { return common::Span<value_type const>{bits_, NumValues()}; }
|
||||
XGBOOST_DEVICE common::Span<value_type> Bits() { return bits_; }
|
||||
XGBOOST_DEVICE common::Span<value_type const> Bits() const { return bits_; }
|
||||
|
||||
/*\brief Compute the size of needed memory allocation. The returned value is in terms
|
||||
* of number of elements with `BitFieldContainer::value_type'.
|
||||
@@ -109,17 +103,17 @@ struct BitFieldContainer {
|
||||
#if defined(__CUDA_ARCH__)
|
||||
__device__ BitFieldContainer& operator|=(BitFieldContainer const& rhs) {
|
||||
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
size_t min_size = min(NumValues(), rhs.NumValues());
|
||||
size_t min_size = min(bits_.size(), rhs.bits_.size());
|
||||
if (tid < min_size) {
|
||||
Data()[tid] |= rhs.Data()[tid];
|
||||
bits_[tid] |= rhs.bits_[tid];
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
#else
|
||||
BitFieldContainer& operator|=(BitFieldContainer const& rhs) {
|
||||
size_t min_size = std::min(NumValues(), rhs.NumValues());
|
||||
size_t min_size = std::min(bits_.size(), rhs.bits_.size());
|
||||
for (size_t i = 0; i < min_size; ++i) {
|
||||
Data()[i] |= rhs.Data()[i];
|
||||
bits_[i] |= rhs.bits_[i];
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
@@ -127,85 +121,75 @@ struct BitFieldContainer {
|
||||
|
||||
#if defined(__CUDA_ARCH__)
|
||||
__device__ BitFieldContainer& operator&=(BitFieldContainer const& rhs) {
|
||||
size_t min_size = min(NumValues(), rhs.NumValues());
|
||||
size_t min_size = min(bits_.size(), rhs.bits_.size());
|
||||
auto tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid < min_size) {
|
||||
Data()[tid] &= rhs.Data()[tid];
|
||||
bits_[tid] &= rhs.bits_[tid];
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
#else
|
||||
BitFieldContainer& operator&=(BitFieldContainer const& rhs) {
|
||||
size_t min_size = std::min(NumValues(), rhs.NumValues());
|
||||
size_t min_size = std::min(bits_.size(), rhs.bits_.size());
|
||||
for (size_t i = 0; i < min_size; ++i) {
|
||||
Data()[i] &= rhs.Data()[i];
|
||||
bits_[i] &= rhs.bits_[i];
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
#endif // defined(__CUDA_ARCH__)
|
||||
|
||||
#if defined(__CUDA_ARCH__)
|
||||
__device__ auto Set(index_type pos) noexcept(true) {
|
||||
__device__ auto Set(index_type pos) {
|
||||
Pos pos_v = Direction::Shift(ToBitPos(pos));
|
||||
value_type& value = Data()[pos_v.int_pos];
|
||||
value_type& value = bits_[pos_v.int_pos];
|
||||
value_type set_bit = kOne << pos_v.bit_pos;
|
||||
using Type = typename dh::detail::AtomicDispatcher<sizeof(value_type)>::Type;
|
||||
atomicOr(reinterpret_cast<Type *>(&value), set_bit);
|
||||
}
|
||||
__device__ void Clear(index_type pos) noexcept(true) {
|
||||
__device__ void Clear(index_type pos) {
|
||||
Pos pos_v = Direction::Shift(ToBitPos(pos));
|
||||
value_type& value = Data()[pos_v.int_pos];
|
||||
value_type& value = bits_[pos_v.int_pos];
|
||||
value_type clear_bit = ~(kOne << pos_v.bit_pos);
|
||||
using Type = typename dh::detail::AtomicDispatcher<sizeof(value_type)>::Type;
|
||||
atomicAnd(reinterpret_cast<Type *>(&value), clear_bit);
|
||||
}
|
||||
#else
|
||||
void Set(index_type pos) noexcept(true) {
|
||||
void Set(index_type pos) {
|
||||
Pos pos_v = Direction::Shift(ToBitPos(pos));
|
||||
value_type& value = Data()[pos_v.int_pos];
|
||||
value_type& value = bits_[pos_v.int_pos];
|
||||
value_type set_bit = kOne << pos_v.bit_pos;
|
||||
value |= set_bit;
|
||||
}
|
||||
void Clear(index_type pos) noexcept(true) {
|
||||
void Clear(index_type pos) {
|
||||
Pos pos_v = Direction::Shift(ToBitPos(pos));
|
||||
value_type& value = Data()[pos_v.int_pos];
|
||||
value_type& value = bits_[pos_v.int_pos];
|
||||
value_type clear_bit = ~(kOne << pos_v.bit_pos);
|
||||
value &= clear_bit;
|
||||
}
|
||||
#endif // defined(__CUDA_ARCH__)
|
||||
|
||||
XGBOOST_DEVICE bool Check(Pos pos_v) const noexcept(true) {
|
||||
XGBOOST_DEVICE bool Check(Pos pos_v) const {
|
||||
pos_v = Direction::Shift(pos_v);
|
||||
assert(pos_v.int_pos < NumValues());
|
||||
value_type const value = Data()[pos_v.int_pos];
|
||||
SPAN_LT(pos_v.int_pos, bits_.size());
|
||||
value_type const value = bits_[pos_v.int_pos];
|
||||
value_type const test_bit = kOne << pos_v.bit_pos;
|
||||
value_type result = test_bit & value;
|
||||
return static_cast<bool>(result);
|
||||
}
|
||||
[[nodiscard]] XGBOOST_DEVICE bool Check(index_type pos) const noexcept(true) {
|
||||
XGBOOST_DEVICE bool Check(index_type pos) const {
|
||||
Pos pos_v = ToBitPos(pos);
|
||||
return Check(pos_v);
|
||||
}
|
||||
/**
|
||||
* @brief Returns the total number of bits that can be viewed. This is equal to or
|
||||
* larger than the acutal number of valid bits.
|
||||
*/
|
||||
[[nodiscard]] XGBOOST_DEVICE size_type Capacity() const noexcept(true) {
|
||||
return kValueSize * NumValues();
|
||||
}
|
||||
/**
|
||||
* @brief Number of storage unit used in this bit field.
|
||||
*/
|
||||
[[nodiscard]] XGBOOST_DEVICE size_type NumValues() const noexcept(true) { return n_values_; }
|
||||
|
||||
XGBOOST_DEVICE pointer Data() const noexcept(true) { return bits_; }
|
||||
XGBOOST_DEVICE size_t Size() const { return kValueSize * bits_.size(); }
|
||||
|
||||
inline friend std::ostream& operator<<(std::ostream& os,
|
||||
BitFieldContainer<VT, Direction, IsConst> field) {
|
||||
os << "Bits "
|
||||
<< "storage size: " << field.NumValues() << "\n";
|
||||
for (typename common::Span<value_type>::index_type i = 0; i < field.NumValues(); ++i) {
|
||||
std::bitset<BitFieldContainer<VT, Direction, IsConst>::kValueSize> bset(field.Data()[i]);
|
||||
XGBOOST_DEVICE pointer Data() const { return bits_.data(); }
|
||||
|
||||
inline friend std::ostream &
|
||||
operator<<(std::ostream &os, BitFieldContainer<VT, Direction, IsConst> field) {
|
||||
os << "Bits " << "storage size: " << field.bits_.size() << "\n";
|
||||
for (typename common::Span<value_type>::index_type i = 0; i < field.bits_.size(); ++i) {
|
||||
std::bitset<BitFieldContainer<VT, Direction, IsConst>::kValueSize> bset(field.bits_[i]);
|
||||
os << bset << "\n";
|
||||
}
|
||||
return os;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2020-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright 2020-2022 by XGBoost Contributors
|
||||
* \file categorical.h
|
||||
*/
|
||||
#ifndef XGBOOST_COMMON_CATEGORICAL_H_
|
||||
@@ -10,6 +10,7 @@
|
||||
#include "bitfield.h"
|
||||
#include "xgboost/base.h"
|
||||
#include "xgboost/data.h"
|
||||
#include "xgboost/parameter.h"
|
||||
#include "xgboost/span.h"
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
@@ -1,27 +1,16 @@
|
||||
/**
|
||||
* Copyright 2017-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright 2017-2022 by XGBoost Contributors
|
||||
* \brief Utility for fast column-wise access
|
||||
*/
|
||||
#include "column_matrix.h"
|
||||
|
||||
#include <algorithm> // for transform
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint> // for uint64_t, uint8_t
|
||||
#include <limits> // for numeric_limits
|
||||
#include <type_traits> // for remove_reference_t
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../data/gradient_index.h" // for GHistIndexMatrix
|
||||
#include "io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
|
||||
#include "xgboost/base.h" // for bst_feaature_t
|
||||
#include "xgboost/span.h" // for Span
|
||||
|
||||
namespace xgboost::common {
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_threshold) {
|
||||
auto const nfeature = gmat.Features();
|
||||
const size_t nrow = gmat.Size();
|
||||
// identify type of each column
|
||||
type_ = common::MakeFixedVecWithMalloc(nfeature, ColumnType{});
|
||||
type_.resize(nfeature);
|
||||
|
||||
uint32_t max_val = std::numeric_limits<uint32_t>::max();
|
||||
for (bst_feature_t fid = 0; fid < nfeature; ++fid) {
|
||||
@@ -45,7 +34,7 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
|
||||
|
||||
// want to compute storage boundary for each feature
|
||||
// using variants of prefix sum scan
|
||||
feature_offsets_ = common::MakeFixedVecWithMalloc(nfeature + 1, std::size_t{0});
|
||||
feature_offsets_.resize(nfeature + 1);
|
||||
size_t accum_index = 0;
|
||||
feature_offsets_[0] = accum_index;
|
||||
for (bst_feature_t fid = 1; fid < nfeature + 1; ++fid) {
|
||||
@@ -60,11 +49,9 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
|
||||
SetTypeSize(gmat.MaxNumBinPerFeat());
|
||||
auto storage_size =
|
||||
feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
|
||||
|
||||
index_ = common::MakeFixedVecWithMalloc(storage_size, std::uint8_t{0});
|
||||
|
||||
index_.resize(storage_size, 0);
|
||||
if (!all_dense_column) {
|
||||
row_ind_ = common::MakeFixedVecWithMalloc(feature_offsets_[nfeature], std::size_t{0});
|
||||
row_ind_.resize(feature_offsets_[nfeature]);
|
||||
}
|
||||
|
||||
// store least bin id for each feature
|
||||
@@ -72,51 +59,7 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
|
||||
|
||||
any_missing_ = !gmat.IsDense();
|
||||
|
||||
missing_ = MissingIndicator{0, false};
|
||||
missing_flags_.clear();
|
||||
}
|
||||
|
||||
// IO procedures for external memory.
|
||||
bool ColumnMatrix::Read(AlignedResourceReadStream* fi, uint32_t const* index_base) {
|
||||
if (!common::ReadVec(fi, &index_)) {
|
||||
return false;
|
||||
}
|
||||
if (!common::ReadVec(fi, &type_)) {
|
||||
return false;
|
||||
}
|
||||
if (!common::ReadVec(fi, &row_ind_)) {
|
||||
return false;
|
||||
}
|
||||
if (!common::ReadVec(fi, &feature_offsets_)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!common::ReadVec(fi, &missing_.storage)) {
|
||||
return false;
|
||||
}
|
||||
missing_.InitView();
|
||||
|
||||
index_base_ = index_base;
|
||||
if (!fi->Read(&bins_type_size_)) {
|
||||
return false;
|
||||
}
|
||||
if (!fi->Read(&any_missing_)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
std::size_t ColumnMatrix::Write(AlignedFileWriteStream* fo) const {
|
||||
std::size_t bytes{0};
|
||||
|
||||
bytes += common::WriteVec(fo, index_);
|
||||
bytes += common::WriteVec(fo, type_);
|
||||
bytes += common::WriteVec(fo, row_ind_);
|
||||
bytes += common::WriteVec(fo, feature_offsets_);
|
||||
bytes += common::WriteVec(fo, missing_.storage);
|
||||
|
||||
bytes += fo->Write(bins_type_size_);
|
||||
bytes += fo->Write(any_missing_);
|
||||
|
||||
return bytes;
|
||||
}
|
||||
} // namespace xgboost::common
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2017-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright 2017-2022 by Contributors
|
||||
* \file column_matrix.h
|
||||
* \brief Utility for fast column-wise access
|
||||
* \author Philip Cho
|
||||
@@ -8,30 +8,25 @@
|
||||
#ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
|
||||
#define XGBOOST_COMMON_COLUMN_MATRIX_H_
|
||||
|
||||
#include <dmlc/endian.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint> // for uint8_t
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <utility> // for move
|
||||
#include <utility> // std::move
|
||||
#include <vector>
|
||||
|
||||
#include "../data/adapter.h"
|
||||
#include "../data/gradient_index.h"
|
||||
#include "algorithm.h"
|
||||
#include "bitfield.h" // for RBitField8
|
||||
#include "hist_util.h"
|
||||
#include "ref_resource_view.h" // for RefResourceView
|
||||
#include "xgboost/base.h" // for bst_bin_t
|
||||
#include "xgboost/span.h" // for Span
|
||||
|
||||
namespace xgboost::common {
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
class ColumnMatrix;
|
||||
class AlignedFileWriteStream;
|
||||
class AlignedResourceReadStream;
|
||||
|
||||
/*! \brief column type */
|
||||
enum ColumnType : std::uint8_t { kDenseColumn, kSparseColumn };
|
||||
enum ColumnType : uint8_t { kDenseColumn, kSparseColumn };
|
||||
|
||||
/*! \brief a column storage, to be used with ApplySplit. Note that each
|
||||
bin id is stored as index[i] + index_base.
|
||||
@@ -46,12 +41,12 @@ class Column {
|
||||
: index_(index), index_base_(least_bin_idx) {}
|
||||
virtual ~Column() = default;
|
||||
|
||||
[[nodiscard]] bst_bin_t GetGlobalBinIdx(size_t idx) const {
|
||||
bst_bin_t GetGlobalBinIdx(size_t idx) const {
|
||||
return index_base_ + static_cast<bst_bin_t>(index_[idx]);
|
||||
}
|
||||
|
||||
/* returns number of elements in column */
|
||||
[[nodiscard]] size_t Size() const { return index_.size(); }
|
||||
size_t Size() const { return index_.size(); }
|
||||
|
||||
private:
|
||||
/* bin indexes in range [0, max_bins - 1] */
|
||||
@@ -68,7 +63,7 @@ class SparseColumnIter : public Column<BinIdxT> {
|
||||
common::Span<const size_t> row_ind_;
|
||||
size_t idx_;
|
||||
|
||||
[[nodiscard]] size_t const* RowIndices() const { return row_ind_.data(); }
|
||||
size_t const* RowIndices() const { return row_ind_.data(); }
|
||||
|
||||
public:
|
||||
SparseColumnIter(common::Span<const BinIdxT> index, bst_bin_t least_bin_idx,
|
||||
@@ -86,7 +81,7 @@ class SparseColumnIter : public Column<BinIdxT> {
|
||||
SparseColumnIter(SparseColumnIter const&) = delete;
|
||||
SparseColumnIter(SparseColumnIter&&) = default;
|
||||
|
||||
[[nodiscard]] size_t GetRowIdx(size_t idx) const { return RowIndices()[idx]; }
|
||||
size_t GetRowIdx(size_t idx) const { return RowIndices()[idx]; }
|
||||
bst_bin_t operator[](size_t rid) {
|
||||
const size_t column_size = this->Size();
|
||||
if (!((idx_) < column_size)) {
|
||||
@@ -106,10 +101,6 @@ class SparseColumnIter : public Column<BinIdxT> {
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Column stored as a dense vector. It might still contain missing values as
|
||||
* indicated by the missing flags.
|
||||
*/
|
||||
template <typename BinIdxT, bool any_missing>
|
||||
class DenseColumnIter : public Column<BinIdxT> {
|
||||
public:
|
||||
@@ -118,19 +109,17 @@ class DenseColumnIter : public Column<BinIdxT> {
|
||||
private:
|
||||
using Base = Column<BinIdxT>;
|
||||
/* flags for missing values in dense columns */
|
||||
LBitField32 missing_flags_;
|
||||
std::vector<ByteType> const& missing_flags_;
|
||||
size_t feature_offset_;
|
||||
|
||||
public:
|
||||
explicit DenseColumnIter(common::Span<const BinIdxT> index, bst_bin_t index_base,
|
||||
LBitField32 missing_flags, size_t feature_offset)
|
||||
std::vector<ByteType> const& missing_flags, size_t feature_offset)
|
||||
: Base{index, index_base}, missing_flags_{missing_flags}, feature_offset_{feature_offset} {}
|
||||
DenseColumnIter(DenseColumnIter const&) = delete;
|
||||
DenseColumnIter(DenseColumnIter&&) = default;
|
||||
|
||||
[[nodiscard]] bool IsMissing(size_t ridx) const {
|
||||
return missing_flags_.Check(feature_offset_ + ridx);
|
||||
}
|
||||
bool IsMissing(size_t ridx) const { return missing_flags_[feature_offset_ + ridx]; }
|
||||
|
||||
bst_bin_t operator[](size_t ridx) const {
|
||||
if (any_missing) {
|
||||
@@ -142,54 +131,12 @@ class DenseColumnIter : public Column<BinIdxT> {
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Column major matrix for gradient index on CPU.
|
||||
*
|
||||
* This matrix contains both dense columns and sparse columns, the type of the column
|
||||
* is controlled by the sparse threshold parameter. When the number of missing values
|
||||
* in a column is below the threshold it's classified as dense column.
|
||||
* \brief Column major matrix for gradient index. This matrix contains both dense column
|
||||
* and sparse column, the type of the column is controlled by sparse threshold. When the
|
||||
* number of missing values in a column is below the threshold it's classified as dense
|
||||
* column.
|
||||
*/
|
||||
class ColumnMatrix {
|
||||
/**
|
||||
* @brief A bit set for indicating whether an element in a dense column is missing.
|
||||
*/
|
||||
struct MissingIndicator {
|
||||
LBitField32 missing;
|
||||
RefResourceView<std::uint32_t> storage;
|
||||
|
||||
MissingIndicator() = default;
|
||||
/**
|
||||
* @param n_elements Size of the bit set
|
||||
* @param init Initialize the indicator to true or false.
|
||||
*/
|
||||
MissingIndicator(std::size_t n_elements, bool init) {
|
||||
auto m_size = missing.ComputeStorageSize(n_elements);
|
||||
storage = common::MakeFixedVecWithMalloc(m_size, init ? ~std::uint32_t{0} : std::uint32_t{0});
|
||||
this->InitView();
|
||||
}
|
||||
/** @brief Set the i^th element to be a valid element (instead of missing). */
|
||||
void SetValid(typename LBitField32::index_type i) { missing.Clear(i); }
|
||||
/** @brief assign the storage to the view. */
|
||||
void InitView() {
|
||||
missing = LBitField32{Span{storage.data(), storage.size()}};
|
||||
}
|
||||
|
||||
void GrowTo(std::size_t n_elements, bool init) {
|
||||
CHECK(storage.Resource()->Type() == ResourceHandler::kMalloc)
|
||||
<< "[Internal Error]: Cannot grow the vector when external memory is used.";
|
||||
auto m_size = missing.ComputeStorageSize(n_elements);
|
||||
CHECK_GE(m_size, storage.size());
|
||||
if (m_size == storage.size()) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto new_storage =
|
||||
common::MakeFixedVecWithMalloc(m_size, init ? ~std::uint32_t{0} : std::uint32_t{0});
|
||||
std::copy_n(storage.cbegin(), storage.size(), new_storage.begin());
|
||||
storage = std::move(new_storage);
|
||||
this->InitView();
|
||||
}
|
||||
};
|
||||
|
||||
void InitStorage(GHistIndexMatrix const& gmat, double sparse_threshold);
|
||||
|
||||
template <typename ColumnBinT, typename BinT, typename RIdx>
|
||||
@@ -197,10 +144,9 @@ class ColumnMatrix {
|
||||
if (type_[fid] == kDenseColumn) {
|
||||
ColumnBinT* begin = &local_index[feature_offsets_[fid]];
|
||||
begin[rid] = bin_id - index_base_[fid];
|
||||
// not thread-safe with bit field.
|
||||
// FIXME(jiamingy): We can directly assign kMissingId to the index to avoid missing
|
||||
// flags.
|
||||
missing_.SetValid(feature_offsets_[fid] + rid);
|
||||
// not thread-safe with bool vector. FIXME(jiamingy): We can directly assign
|
||||
// kMissingId to the index to avoid missing flags.
|
||||
missing_flags_[feature_offsets_[fid] + rid] = false;
|
||||
} else {
|
||||
ColumnBinT* begin = &local_index[feature_offsets_[fid]];
|
||||
begin[num_nonzeros_[fid]] = bin_id - index_base_[fid];
|
||||
@@ -212,9 +158,7 @@ class ColumnMatrix {
|
||||
public:
|
||||
using ByteType = bool;
|
||||
// get number of features
|
||||
[[nodiscard]] bst_feature_t GetNumFeature() const {
|
||||
return static_cast<bst_feature_t>(type_.size());
|
||||
}
|
||||
bst_feature_t GetNumFeature() const { return static_cast<bst_feature_t>(type_.size()); }
|
||||
|
||||
ColumnMatrix() = default;
|
||||
ColumnMatrix(GHistIndexMatrix const& gmat, double sparse_threshold) {
|
||||
@@ -222,7 +166,7 @@ class ColumnMatrix {
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Initialize ColumnMatrix from GHistIndexMatrix with reference to the original
|
||||
* \brief Initialize ColumnMatrix from GHistIndexMatrix with reference to the original
|
||||
* SparsePage.
|
||||
*/
|
||||
void InitFromSparse(SparsePage const& page, const GHistIndexMatrix& gmat, double sparse_threshold,
|
||||
@@ -234,8 +178,8 @@ class ColumnMatrix {
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Initialize ColumnMatrix from GHistIndexMatrix without reference to actual
|
||||
* data.
|
||||
* \brief Initialize ColumnMatrix from GHistIndexMatrix without reference to actual
|
||||
* data.
|
||||
*
|
||||
* This function requires a binary search for each bin to get back the feature index
|
||||
* for those bins.
|
||||
@@ -255,7 +199,7 @@ class ColumnMatrix {
|
||||
}
|
||||
}
|
||||
|
||||
[[nodiscard]] bool IsInitialized() const { return !type_.empty(); }
|
||||
bool IsInitialized() const { return !type_.empty(); }
|
||||
|
||||
/**
|
||||
* \brief Push batch of data for Quantile DMatrix support.
|
||||
@@ -313,7 +257,7 @@ class ColumnMatrix {
|
||||
reinterpret_cast<const BinIdxType*>(&index_[feature_offset * bins_type_size_]),
|
||||
column_size};
|
||||
return std::move(DenseColumnIter<BinIdxType, any_missing>{
|
||||
bin_index, static_cast<bst_bin_t>(index_base_[fidx]), missing_.missing, feature_offset});
|
||||
bin_index, static_cast<bst_bin_t>(index_base_[fidx]), missing_flags_, feature_offset});
|
||||
}
|
||||
|
||||
// all columns are dense column and has no missing value
|
||||
@@ -321,8 +265,7 @@ class ColumnMatrix {
|
||||
template <typename RowBinIdxT>
|
||||
void SetIndexNoMissing(bst_row_t base_rowid, RowBinIdxT const* row_index, const size_t n_samples,
|
||||
const size_t n_features, int32_t n_threads) {
|
||||
missing_.GrowTo(feature_offsets_[n_features], false);
|
||||
|
||||
missing_flags_.resize(feature_offsets_[n_features], false);
|
||||
DispatchBinType(bins_type_size_, [&](auto t) {
|
||||
using ColumnBinT = decltype(t);
|
||||
auto column_index = Span<ColumnBinT>{reinterpret_cast<ColumnBinT*>(index_.data()),
|
||||
@@ -347,15 +290,9 @@ class ColumnMatrix {
|
||||
void SetIndexMixedColumns(size_t base_rowid, Batch const& batch, const GHistIndexMatrix& gmat,
|
||||
float missing) {
|
||||
auto n_features = gmat.Features();
|
||||
|
||||
missing_.GrowTo(feature_offsets_[n_features], true);
|
||||
auto const* row_index = gmat.index.data<std::uint32_t>() + gmat.row_ptr[base_rowid];
|
||||
if (num_nonzeros_.empty()) {
|
||||
num_nonzeros_ = common::MakeFixedVecWithMalloc(n_features, std::size_t{0});
|
||||
} else {
|
||||
CHECK_EQ(num_nonzeros_.size(), n_features);
|
||||
}
|
||||
|
||||
missing_flags_.resize(feature_offsets_[n_features], true);
|
||||
auto const* row_index = gmat.index.data<uint32_t>() + gmat.row_ptr[base_rowid];
|
||||
num_nonzeros_.resize(n_features, 0);
|
||||
auto is_valid = data::IsValidFunctor{missing};
|
||||
|
||||
DispatchBinType(bins_type_size_, [&](auto t) {
|
||||
@@ -384,9 +321,8 @@ class ColumnMatrix {
|
||||
*/
|
||||
void SetIndexMixedColumns(const GHistIndexMatrix& gmat) {
|
||||
auto n_features = gmat.Features();
|
||||
|
||||
missing_ = MissingIndicator{feature_offsets_[n_features], true};
|
||||
num_nonzeros_ = common::MakeFixedVecWithMalloc(n_features, std::size_t{0});
|
||||
missing_flags_.resize(feature_offsets_[n_features], true);
|
||||
num_nonzeros_.resize(n_features, 0);
|
||||
|
||||
DispatchBinType(bins_type_size_, [&](auto t) {
|
||||
using ColumnBinT = decltype(t);
|
||||
@@ -399,34 +335,106 @@ class ColumnMatrix {
|
||||
});
|
||||
}
|
||||
|
||||
[[nodiscard]] BinTypeSize GetTypeSize() const { return bins_type_size_; }
|
||||
[[nodiscard]] auto GetColumnType(bst_feature_t fidx) const { return type_[fidx]; }
|
||||
BinTypeSize GetTypeSize() const { return bins_type_size_; }
|
||||
auto GetColumnType(bst_feature_t fidx) const { return type_[fidx]; }
|
||||
|
||||
// And this returns part of state
|
||||
[[nodiscard]] bool AnyMissing() const { return any_missing_; }
|
||||
bool AnyMissing() const { return any_missing_; }
|
||||
|
||||
// IO procedures for external memory.
|
||||
[[nodiscard]] bool Read(AlignedResourceReadStream* fi, uint32_t const* index_base);
|
||||
[[nodiscard]] std::size_t Write(AlignedFileWriteStream* fo) const;
|
||||
bool Read(dmlc::SeekStream* fi, uint32_t const* index_base) {
|
||||
fi->Read(&index_);
|
||||
#if !DMLC_LITTLE_ENDIAN
|
||||
// s390x
|
||||
std::vector<std::underlying_type<ColumnType>::type> int_types;
|
||||
fi->Read(&int_types);
|
||||
type_.resize(int_types.size());
|
||||
std::transform(
|
||||
int_types.begin(), int_types.end(), type_.begin(),
|
||||
[](std::underlying_type<ColumnType>::type i) { return static_cast<ColumnType>(i); });
|
||||
#else
|
||||
fi->Read(&type_);
|
||||
#endif // !DMLC_LITTLE_ENDIAN
|
||||
|
||||
fi->Read(&row_ind_);
|
||||
fi->Read(&feature_offsets_);
|
||||
|
||||
std::vector<std::uint8_t> missing;
|
||||
fi->Read(&missing);
|
||||
missing_flags_.resize(missing.size());
|
||||
std::transform(missing.cbegin(), missing.cend(), missing_flags_.begin(),
|
||||
[](std::uint8_t flag) { return !!flag; });
|
||||
|
||||
index_base_ = index_base;
|
||||
#if !DMLC_LITTLE_ENDIAN
|
||||
std::underlying_type<BinTypeSize>::type v;
|
||||
fi->Read(&v);
|
||||
bins_type_size_ = static_cast<BinTypeSize>(v);
|
||||
#else
|
||||
fi->Read(&bins_type_size_);
|
||||
#endif
|
||||
|
||||
fi->Read(&any_missing_);
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t Write(dmlc::Stream* fo) const {
|
||||
size_t bytes{0};
|
||||
|
||||
auto write_vec = [&](auto const& vec) {
|
||||
fo->Write(vec);
|
||||
bytes += vec.size() * sizeof(typename std::remove_reference_t<decltype(vec)>::value_type) +
|
||||
sizeof(uint64_t);
|
||||
};
|
||||
write_vec(index_);
|
||||
#if !DMLC_LITTLE_ENDIAN
|
||||
// s390x
|
||||
std::vector<std::underlying_type<ColumnType>::type> int_types(type_.size());
|
||||
std::transform(type_.begin(), type_.end(), int_types.begin(), [](ColumnType t) {
|
||||
return static_cast<std::underlying_type<ColumnType>::type>(t);
|
||||
});
|
||||
write_vec(int_types);
|
||||
#else
|
||||
write_vec(type_);
|
||||
#endif // !DMLC_LITTLE_ENDIAN
|
||||
write_vec(row_ind_);
|
||||
write_vec(feature_offsets_);
|
||||
// dmlc can not handle bool vector
|
||||
std::vector<std::uint8_t> missing(missing_flags_.size());
|
||||
std::transform(missing_flags_.cbegin(), missing_flags_.cend(), missing.begin(),
|
||||
[](bool flag) { return static_cast<std::uint8_t>(flag); });
|
||||
write_vec(missing);
|
||||
|
||||
#if !DMLC_LITTLE_ENDIAN
|
||||
auto v = static_cast<std::underlying_type<BinTypeSize>::type>(bins_type_size_);
|
||||
fo->Write(v);
|
||||
#else
|
||||
fo->Write(bins_type_size_);
|
||||
#endif // DMLC_LITTLE_ENDIAN
|
||||
bytes += sizeof(bins_type_size_);
|
||||
fo->Write(any_missing_);
|
||||
bytes += sizeof(any_missing_);
|
||||
|
||||
return bytes;
|
||||
}
|
||||
|
||||
private:
|
||||
RefResourceView<std::uint8_t> index_;
|
||||
std::vector<uint8_t> index_;
|
||||
|
||||
RefResourceView<ColumnType> type_;
|
||||
/** @brief indptr of a CSC matrix. */
|
||||
RefResourceView<std::size_t> row_ind_;
|
||||
/** @brief indicate where each column's index and row_ind is stored. */
|
||||
RefResourceView<std::size_t> feature_offsets_;
|
||||
/** @brief The number of nnz of each column. */
|
||||
RefResourceView<std::size_t> num_nonzeros_;
|
||||
std::vector<ColumnType> type_;
|
||||
/* indptr of a CSC matrix. */
|
||||
std::vector<size_t> row_ind_;
|
||||
/* indicate where each column's index and row_ind is stored. */
|
||||
std::vector<size_t> feature_offsets_;
|
||||
/* The number of nnz of each column. */
|
||||
std::vector<size_t> num_nonzeros_;
|
||||
|
||||
// index_base_[fid]: least bin id for feature fid
|
||||
std::uint32_t const* index_base_;
|
||||
|
||||
MissingIndicator missing_;
|
||||
|
||||
uint32_t const* index_base_;
|
||||
std::vector<ByteType> missing_flags_;
|
||||
BinTypeSize bins_type_size_;
|
||||
bool any_missing_;
|
||||
};
|
||||
} // namespace xgboost::common
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_COMMON_COLUMN_MATRIX_H_
|
||||
|
||||
@@ -6,11 +6,6 @@
|
||||
#ifndef XGBOOST_COMMON_ERROR_MSG_H_
|
||||
#define XGBOOST_COMMON_ERROR_MSG_H_
|
||||
|
||||
#include <cinttypes> // for uint64_t
|
||||
#include <limits> // for numeric_limits
|
||||
|
||||
#include "xgboost/base.h" // for bst_feature_t
|
||||
#include "xgboost/logging.h"
|
||||
#include "xgboost/string_view.h" // for StringView
|
||||
|
||||
namespace xgboost::error {
|
||||
@@ -38,46 +33,5 @@ constexpr StringView InconsistentMaxBin() {
|
||||
return "Inconsistent `max_bin`. `max_bin` should be the same across different QuantileDMatrix, "
|
||||
"and consistent with the Booster being trained.";
|
||||
}
|
||||
|
||||
constexpr StringView UnknownDevice() { return "Unknown device type."; }
|
||||
|
||||
inline void MaxFeatureSize(std::uint64_t n_features) {
|
||||
auto max_n_features = std::numeric_limits<bst_feature_t>::max();
|
||||
CHECK_LE(n_features, max_n_features)
|
||||
<< "Unfortunately, XGBoost does not support data matrices with "
|
||||
<< std::numeric_limits<bst_feature_t>::max() << " features or greater";
|
||||
}
|
||||
|
||||
constexpr StringView InplacePredictProxy() {
|
||||
return "Inplace predict accepts only DMatrixProxy as input.";
|
||||
}
|
||||
|
||||
inline void MaxSampleSize(std::size_t n) {
|
||||
LOG(FATAL) << "Sample size too large for the current updater. Maximum number of samples:" << n
|
||||
<< ". Consider using a different updater or tree_method.";
|
||||
}
|
||||
|
||||
constexpr StringView OldSerialization() {
|
||||
return R"doc(If you are loading a serialized model (like pickle in Python, RDS in R) or
|
||||
configuration generated by an older version of XGBoost, please export the model by calling
|
||||
`Booster.save_model` from that version first, then load it back in current version. See:
|
||||
|
||||
https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html
|
||||
|
||||
for more details about differences between saving model and serializing.
|
||||
)doc";
|
||||
}
|
||||
|
||||
inline void WarnOldSerialization() {
|
||||
// Display it once is enough. Otherwise this can be really verbose in distributed
|
||||
// environments.
|
||||
static thread_local bool logged{false};
|
||||
if (logged) {
|
||||
return;
|
||||
}
|
||||
|
||||
LOG(WARNING) << OldSerialization();
|
||||
logged = true;
|
||||
}
|
||||
} // namespace xgboost::error
|
||||
#endif // XGBOOST_COMMON_ERROR_MSG_H_
|
||||
|
||||
@@ -127,76 +127,55 @@ void SortByWeight(dh::device_vector<float>* weights,
|
||||
});
|
||||
}
|
||||
|
||||
void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
|
||||
dh::device_vector<Entry>* p_sorted_entries,
|
||||
dh::device_vector<float>* p_sorted_weights,
|
||||
dh::caching_device_vector<size_t>* p_column_sizes_scan) {
|
||||
void RemoveDuplicatedCategories(
|
||||
int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
|
||||
dh::device_vector<Entry> *p_sorted_entries,
|
||||
dh::caching_device_vector<size_t> *p_column_sizes_scan) {
|
||||
info.feature_types.SetDevice(device);
|
||||
auto d_feature_types = info.feature_types.ConstDeviceSpan();
|
||||
CHECK(!d_feature_types.empty());
|
||||
auto& column_sizes_scan = *p_column_sizes_scan;
|
||||
auto& sorted_entries = *p_sorted_entries;
|
||||
auto &column_sizes_scan = *p_column_sizes_scan;
|
||||
auto &sorted_entries = *p_sorted_entries;
|
||||
// Removing duplicated entries in categorical features.
|
||||
|
||||
// We don't need to accumulate weight for duplicated entries as there's no weighted
|
||||
// sketching for categorical features, the categories are the cut values.
|
||||
dh::caching_device_vector<size_t> new_column_scan(column_sizes_scan.size());
|
||||
std::size_t n_uniques{0};
|
||||
if (p_sorted_weights) {
|
||||
using Pair = thrust::tuple<Entry, float>;
|
||||
auto d_sorted_entries = dh::ToSpan(sorted_entries);
|
||||
auto d_sorted_weights = dh::ToSpan(*p_sorted_weights);
|
||||
auto val_in_it = thrust::make_zip_iterator(d_sorted_entries.data(), d_sorted_weights.data());
|
||||
auto val_out_it = thrust::make_zip_iterator(d_sorted_entries.data(), d_sorted_weights.data());
|
||||
n_uniques = dh::SegmentedUnique(
|
||||
column_sizes_scan.data().get(), column_sizes_scan.data().get() + column_sizes_scan.size(),
|
||||
val_in_it, val_in_it + sorted_entries.size(), new_column_scan.data().get(), val_out_it,
|
||||
[=] __device__(Pair const& l, Pair const& r) {
|
||||
Entry const& le = thrust::get<0>(l);
|
||||
Entry const& re = thrust::get<0>(r);
|
||||
if (le.index == re.index && IsCat(d_feature_types, le.index)) {
|
||||
return le.fvalue == re.fvalue;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
p_sorted_weights->resize(n_uniques);
|
||||
} else {
|
||||
n_uniques = dh::SegmentedUnique(
|
||||
column_sizes_scan.data().get(), column_sizes_scan.data().get() + column_sizes_scan.size(),
|
||||
sorted_entries.begin(), sorted_entries.end(), new_column_scan.data().get(),
|
||||
sorted_entries.begin(), [=] __device__(Entry const& l, Entry const& r) {
|
||||
if (l.index == r.index) {
|
||||
if (IsCat(d_feature_types, l.index)) {
|
||||
return l.fvalue == r.fvalue;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
});
|
||||
}
|
||||
sorted_entries.resize(n_uniques);
|
||||
dh::SegmentedUnique(column_sizes_scan.data().get(),
|
||||
column_sizes_scan.data().get() + column_sizes_scan.size(),
|
||||
sorted_entries.begin(), sorted_entries.end(),
|
||||
new_column_scan.data().get(), sorted_entries.begin(),
|
||||
[=] __device__(Entry const &l, Entry const &r) {
|
||||
if (l.index == r.index) {
|
||||
if (IsCat(d_feature_types, l.index)) {
|
||||
return l.fvalue == r.fvalue;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
});
|
||||
|
||||
// Renew the column scan and cut scan based on categorical data.
|
||||
auto d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan);
|
||||
dh::caching_device_vector<SketchContainer::OffsetT> new_cuts_size(info.num_col_ + 1);
|
||||
dh::caching_device_vector<SketchContainer::OffsetT> new_cuts_size(
|
||||
info.num_col_ + 1);
|
||||
CHECK_EQ(new_column_scan.size(), new_cuts_size.size());
|
||||
dh::LaunchN(new_column_scan.size(),
|
||||
[=, d_new_cuts_size = dh::ToSpan(new_cuts_size),
|
||||
d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan),
|
||||
d_new_columns_ptr = dh::ToSpan(new_column_scan)] __device__(size_t idx) {
|
||||
d_old_column_sizes_scan[idx] = d_new_columns_ptr[idx];
|
||||
if (idx == d_new_columns_ptr.size() - 1) {
|
||||
return;
|
||||
}
|
||||
if (IsCat(d_feature_types, idx)) {
|
||||
// Cut size is the same as number of categories in input.
|
||||
d_new_cuts_size[idx] = d_new_columns_ptr[idx + 1] - d_new_columns_ptr[idx];
|
||||
} else {
|
||||
d_new_cuts_size[idx] = d_cuts_ptr[idx + 1] - d_cuts_ptr[idx];
|
||||
}
|
||||
});
|
||||
dh::LaunchN(
|
||||
new_column_scan.size(),
|
||||
[=, d_new_cuts_size = dh::ToSpan(new_cuts_size),
|
||||
d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan),
|
||||
d_new_columns_ptr = dh::ToSpan(new_column_scan)] __device__(size_t idx) {
|
||||
d_old_column_sizes_scan[idx] = d_new_columns_ptr[idx];
|
||||
if (idx == d_new_columns_ptr.size() - 1) {
|
||||
return;
|
||||
}
|
||||
if (IsCat(d_feature_types, idx)) {
|
||||
// Cut size is the same as number of categories in input.
|
||||
d_new_cuts_size[idx] =
|
||||
d_new_columns_ptr[idx + 1] - d_new_columns_ptr[idx];
|
||||
} else {
|
||||
d_new_cuts_size[idx] = d_cuts_ptr[idx + 1] - d_cuts_ptr[idx];
|
||||
}
|
||||
});
|
||||
// Turn size into ptr.
|
||||
thrust::exclusive_scan(thrust::device, new_cuts_size.cbegin(), new_cuts_size.cend(),
|
||||
d_cuts_ptr.data());
|
||||
thrust::exclusive_scan(thrust::device, new_cuts_size.cbegin(),
|
||||
new_cuts_size.cend(), d_cuts_ptr.data());
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
@@ -230,8 +209,8 @@ void ProcessBatch(int device, MetaInfo const &info, const SparsePage &page,
|
||||
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||
|
||||
if (sketch_container->HasCategorical()) {
|
||||
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, nullptr,
|
||||
&column_sizes_scan);
|
||||
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
|
||||
&sorted_entries, &column_sizes_scan);
|
||||
}
|
||||
|
||||
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
|
||||
@@ -297,8 +276,8 @@ void ProcessWeightedBatch(int device, const SparsePage& page,
|
||||
&column_sizes_scan);
|
||||
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||
if (sketch_container->HasCategorical()) {
|
||||
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, &temp_weights,
|
||||
&column_sizes_scan);
|
||||
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
|
||||
&sorted_entries, &column_sizes_scan);
|
||||
}
|
||||
|
||||
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
|
||||
|
||||
@@ -240,10 +240,10 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran
|
||||
void SortByWeight(dh::device_vector<float>* weights,
|
||||
dh::device_vector<Entry>* sorted_entries);
|
||||
|
||||
void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
|
||||
dh::device_vector<Entry>* p_sorted_entries,
|
||||
dh::device_vector<float>* p_sorted_weights,
|
||||
dh::caching_device_vector<size_t>* p_column_sizes_scan);
|
||||
void RemoveDuplicatedCategories(
|
||||
int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
|
||||
dh::device_vector<Entry> *p_sorted_entries,
|
||||
dh::caching_device_vector<size_t> *p_column_sizes_scan);
|
||||
} // namespace detail
|
||||
|
||||
// Compute sketch on DMatrix.
|
||||
@@ -275,8 +275,8 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
|
||||
|
||||
if (sketch_container->HasCategorical()) {
|
||||
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, nullptr,
|
||||
&column_sizes_scan);
|
||||
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
|
||||
&sorted_entries, &column_sizes_scan);
|
||||
}
|
||||
|
||||
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||
@@ -354,8 +354,8 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
|
||||
|
||||
if (sketch_container->HasCategorical()) {
|
||||
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, &temp_weights,
|
||||
&column_sizes_scan);
|
||||
detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
|
||||
&sorted_entries, &column_sizes_scan);
|
||||
}
|
||||
|
||||
auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
|
||||
|
||||
@@ -84,7 +84,7 @@ class HistogramCuts {
|
||||
return *this;
|
||||
}
|
||||
|
||||
[[nodiscard]] bst_bin_t FeatureBins(bst_feature_t feature) const {
|
||||
uint32_t FeatureBins(bst_feature_t feature) const {
|
||||
return cut_ptrs_.ConstHostVector().at(feature + 1) - cut_ptrs_.ConstHostVector()[feature];
|
||||
}
|
||||
|
||||
@@ -92,8 +92,8 @@ class HistogramCuts {
|
||||
std::vector<float> const& Values() const { return cut_values_.ConstHostVector(); }
|
||||
std::vector<float> const& MinValues() const { return min_vals_.ConstHostVector(); }
|
||||
|
||||
[[nodiscard]] bool HasCategorical() const { return has_categorical_; }
|
||||
[[nodiscard]] float MaxCategory() const { return max_cat_; }
|
||||
bool HasCategorical() const { return has_categorical_; }
|
||||
float MaxCategory() const { return max_cat_; }
|
||||
/**
|
||||
* \brief Set meta info about categorical features.
|
||||
*
|
||||
@@ -105,13 +105,12 @@ class HistogramCuts {
|
||||
max_cat_ = max_cat;
|
||||
}
|
||||
|
||||
[[nodiscard]] bst_bin_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); }
|
||||
size_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); }
|
||||
|
||||
// Return the index of a cut point that is strictly greater than the input
|
||||
// value, or the last available index if none exists
|
||||
[[nodiscard]] bst_bin_t SearchBin(float value, bst_feature_t column_id,
|
||||
std::vector<uint32_t> const& ptrs,
|
||||
std::vector<float> const& values) const {
|
||||
bst_bin_t SearchBin(float value, bst_feature_t column_id, std::vector<uint32_t> const& ptrs,
|
||||
std::vector<float> const& values) const {
|
||||
auto end = ptrs[column_id + 1];
|
||||
auto beg = ptrs[column_id];
|
||||
auto it = std::upper_bound(values.cbegin() + beg, values.cbegin() + end, value);
|
||||
@@ -120,20 +119,20 @@ class HistogramCuts {
|
||||
return idx;
|
||||
}
|
||||
|
||||
[[nodiscard]] bst_bin_t SearchBin(float value, bst_feature_t column_id) const {
|
||||
bst_bin_t SearchBin(float value, bst_feature_t column_id) const {
|
||||
return this->SearchBin(value, column_id, Ptrs(), Values());
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Search the bin index for numerical feature.
|
||||
*/
|
||||
[[nodiscard]] bst_bin_t SearchBin(Entry const& e) const { return SearchBin(e.fvalue, e.index); }
|
||||
bst_bin_t SearchBin(Entry const& e) const { return SearchBin(e.fvalue, e.index); }
|
||||
|
||||
/**
|
||||
* \brief Search the bin index for categorical feature.
|
||||
*/
|
||||
[[nodiscard]] bst_bin_t SearchCatBin(float value, bst_feature_t fidx,
|
||||
std::vector<uint32_t> const& ptrs,
|
||||
std::vector<float> const& vals) const {
|
||||
bst_bin_t SearchCatBin(float value, bst_feature_t fidx, std::vector<uint32_t> const& ptrs,
|
||||
std::vector<float> const& vals) const {
|
||||
auto end = ptrs.at(fidx + 1) + vals.cbegin();
|
||||
auto beg = ptrs[fidx] + vals.cbegin();
|
||||
// Truncates the value in case it's not perfectly rounded.
|
||||
@@ -144,14 +143,12 @@ class HistogramCuts {
|
||||
}
|
||||
return bin_idx;
|
||||
}
|
||||
[[nodiscard]] bst_bin_t SearchCatBin(float value, bst_feature_t fidx) const {
|
||||
bst_bin_t SearchCatBin(float value, bst_feature_t fidx) const {
|
||||
auto const& ptrs = this->Ptrs();
|
||||
auto const& vals = this->Values();
|
||||
return this->SearchCatBin(value, fidx, ptrs, vals);
|
||||
}
|
||||
[[nodiscard]] bst_bin_t SearchCatBin(Entry const& e) const {
|
||||
return SearchCatBin(e.fvalue, e.index);
|
||||
}
|
||||
bst_bin_t SearchCatBin(Entry const& e) const { return SearchCatBin(e.fvalue, e.index); }
|
||||
|
||||
/**
|
||||
* \brief Return numerical bin value given bin index.
|
||||
@@ -203,33 +200,13 @@ auto DispatchBinType(BinTypeSize type, Fn&& fn) {
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Optionally compressed gradient index. The compression works only with dense
|
||||
* \brief Optionally compressed gradient index. The compression works only with dense
|
||||
* data.
|
||||
*
|
||||
* The main body of construction code is in gradient_index.cc, this struct is only a
|
||||
* view class.
|
||||
* storage class.
|
||||
*/
|
||||
class Index {
|
||||
private:
|
||||
void SetBinTypeSize(BinTypeSize binTypeSize) {
|
||||
binTypeSize_ = binTypeSize;
|
||||
switch (binTypeSize) {
|
||||
case kUint8BinsTypeSize:
|
||||
func_ = &GetValueFromUint8;
|
||||
break;
|
||||
case kUint16BinsTypeSize:
|
||||
func_ = &GetValueFromUint16;
|
||||
break;
|
||||
case kUint32BinsTypeSize:
|
||||
func_ = &GetValueFromUint32;
|
||||
break;
|
||||
default:
|
||||
CHECK(binTypeSize == kUint8BinsTypeSize || binTypeSize == kUint16BinsTypeSize ||
|
||||
binTypeSize == kUint32BinsTypeSize);
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
struct Index {
|
||||
// Inside the compressor, bin_idx is the index for cut value across all features. By
|
||||
// subtracting it with starting pointer of each feature, we can reduce it to smaller
|
||||
// value and store it with smaller types. Usable only with dense data.
|
||||
@@ -253,24 +230,10 @@ class Index {
|
||||
}
|
||||
|
||||
Index() { SetBinTypeSize(binTypeSize_); }
|
||||
|
||||
Index(Index const& i) = delete;
|
||||
Index& operator=(Index const& i) = delete;
|
||||
Index(const Index& i) = delete;
|
||||
Index& operator=(Index i) = delete;
|
||||
Index(Index&& i) = delete;
|
||||
|
||||
/** @brief Move assignment for lazy initialization. */
|
||||
Index& operator=(Index&& i) = default;
|
||||
|
||||
/**
|
||||
* @brief Construct the index from data.
|
||||
*
|
||||
* @param data Storage for compressed histogram bin.
|
||||
* @param bin_size Number of bytes for each bin.
|
||||
*/
|
||||
Index(Span<std::uint8_t> data, BinTypeSize bin_size) : data_{data} {
|
||||
this->SetBinTypeSize(bin_size);
|
||||
}
|
||||
|
||||
Index& operator=(Index&& i) = delete;
|
||||
uint32_t operator[](size_t i) const {
|
||||
if (!bin_offset_.empty()) {
|
||||
// dense, compressed
|
||||
@@ -281,7 +244,26 @@ class Index {
|
||||
return func_(data_.data(), i);
|
||||
}
|
||||
}
|
||||
[[nodiscard]] BinTypeSize GetBinTypeSize() const { return binTypeSize_; }
|
||||
void SetBinTypeSize(BinTypeSize binTypeSize) {
|
||||
binTypeSize_ = binTypeSize;
|
||||
switch (binTypeSize) {
|
||||
case kUint8BinsTypeSize:
|
||||
func_ = &GetValueFromUint8;
|
||||
break;
|
||||
case kUint16BinsTypeSize:
|
||||
func_ = &GetValueFromUint16;
|
||||
break;
|
||||
case kUint32BinsTypeSize:
|
||||
func_ = &GetValueFromUint32;
|
||||
break;
|
||||
default:
|
||||
CHECK(binTypeSize == kUint8BinsTypeSize || binTypeSize == kUint16BinsTypeSize ||
|
||||
binTypeSize == kUint32BinsTypeSize);
|
||||
}
|
||||
}
|
||||
BinTypeSize GetBinTypeSize() const {
|
||||
return binTypeSize_;
|
||||
}
|
||||
template <typename T>
|
||||
T const* data() const { // NOLINT
|
||||
return reinterpret_cast<T const*>(data_.data());
|
||||
@@ -290,27 +272,30 @@ class Index {
|
||||
T* data() { // NOLINT
|
||||
return reinterpret_cast<T*>(data_.data());
|
||||
}
|
||||
[[nodiscard]] std::uint32_t const* Offset() const { return bin_offset_.data(); }
|
||||
[[nodiscard]] std::size_t OffsetSize() const { return bin_offset_.size(); }
|
||||
[[nodiscard]] std::size_t Size() const { return data_.size() / (binTypeSize_); }
|
||||
uint32_t const* Offset() const { return bin_offset_.data(); }
|
||||
size_t OffsetSize() const { return bin_offset_.size(); }
|
||||
size_t Size() const { return data_.size() / (binTypeSize_); }
|
||||
|
||||
void Resize(const size_t n_bytes) {
|
||||
data_.resize(n_bytes);
|
||||
}
|
||||
// set the offset used in compression, cut_ptrs is the CSC indptr in HistogramCuts
|
||||
void SetBinOffset(std::vector<uint32_t> const& cut_ptrs) {
|
||||
bin_offset_.resize(cut_ptrs.size() - 1); // resize to number of features.
|
||||
std::copy_n(cut_ptrs.begin(), bin_offset_.size(), bin_offset_.begin());
|
||||
}
|
||||
auto begin() const { // NOLINT
|
||||
return data_.data();
|
||||
std::vector<uint8_t>::const_iterator begin() const { // NOLINT
|
||||
return data_.begin();
|
||||
}
|
||||
auto end() const { // NOLINT
|
||||
return data_.data() + data_.size();
|
||||
std::vector<uint8_t>::const_iterator end() const { // NOLINT
|
||||
return data_.end();
|
||||
}
|
||||
|
||||
auto begin() { // NOLINT
|
||||
return data_.data();
|
||||
std::vector<uint8_t>::iterator begin() { // NOLINT
|
||||
return data_.begin();
|
||||
}
|
||||
auto end() { // NOLINT
|
||||
return data_.data() + data_.size();
|
||||
std::vector<uint8_t>::iterator end() { // NOLINT
|
||||
return data_.end();
|
||||
}
|
||||
|
||||
private:
|
||||
@@ -325,12 +310,12 @@ class Index {
|
||||
|
||||
using Func = uint32_t (*)(uint8_t const*, size_t);
|
||||
|
||||
Span<std::uint8_t> data_;
|
||||
std::vector<uint8_t> data_;
|
||||
// starting position of each feature inside the cut values (the indptr of the CSC cut matrix
|
||||
// HistogramCuts without the last entry.) Used for bin compression.
|
||||
std::vector<uint32_t> bin_offset_;
|
||||
|
||||
BinTypeSize binTypeSize_{kUint8BinsTypeSize};
|
||||
BinTypeSize binTypeSize_ {kUint8BinsTypeSize};
|
||||
Func func_;
|
||||
};
|
||||
|
||||
|
||||
236
src/common/io.cc
236
src/common/io.cc
@@ -1,47 +1,24 @@
|
||||
/**
|
||||
* Copyright 2019-2023, by XGBoost Contributors
|
||||
/*!
|
||||
* Copyright (c) by XGBoost Contributors 2019-2022
|
||||
*/
|
||||
#if !defined(NOMINMAX) && defined(_WIN32)
|
||||
#define NOMINMAX
|
||||
#endif // !defined(NOMINMAX)
|
||||
|
||||
#if !defined(xgboost_IS_WIN)
|
||||
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#define xgboost_IS_WIN 1
|
||||
#endif // defined(_MSC_VER) || defined(__MINGW32__)
|
||||
|
||||
#endif // !defined(xgboost_IS_WIN)
|
||||
|
||||
#if defined(__unix__) || defined(__APPLE__)
|
||||
#include <fcntl.h> // for open, O_RDONLY
|
||||
#include <sys/mman.h> // for mmap, mmap64, munmap
|
||||
#include <unistd.h> // for close, getpagesize
|
||||
#elif defined(xgboost_IS_WIN)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#if defined(__unix__)
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#endif // defined(__unix__)
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <cstdio>
|
||||
|
||||
#include <algorithm> // for copy, transform
|
||||
#include <cctype> // for tolower
|
||||
#include <cerrno> // for errno
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint> // for int32_t, uint32_t
|
||||
#include <cstring> // for memcpy
|
||||
#include <fstream> // for ifstream
|
||||
#include <iterator> // for distance
|
||||
#include <limits> // for numeric_limits
|
||||
#include <memory> // for unique_ptr
|
||||
#include <string> // for string
|
||||
#include <system_error> // for error_code, system_category
|
||||
#include <utility> // for move
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "io.h"
|
||||
#include "xgboost/collective/socket.h" // for LastError
|
||||
#include "xgboost/logging.h"
|
||||
#include "io.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
namespace xgboost::common {
|
||||
size_t PeekableInStream::Read(void* dptr, size_t size) {
|
||||
size_t nbuffer = buffer_.length() - buffer_ptr_;
|
||||
if (nbuffer == 0) return strm_->Read(dptr, size);
|
||||
@@ -117,32 +94,11 @@ void FixedSizeStream::Take(std::string* out) {
|
||||
*out = std::move(buffer_);
|
||||
}
|
||||
|
||||
namespace {
|
||||
// Get system alignment value for IO with mmap.
|
||||
std::size_t GetMmapAlignment() {
|
||||
#if defined(xgboost_IS_WIN)
|
||||
SYSTEM_INFO sys_info;
|
||||
GetSystemInfo(&sys_info);
|
||||
// During testing, `sys_info.dwPageSize` is of size 4096 while `dwAllocationGranularity` is of
|
||||
// size 65536.
|
||||
return sys_info.dwAllocationGranularity;
|
||||
#else
|
||||
return getpagesize();
|
||||
#endif
|
||||
}
|
||||
|
||||
auto SystemErrorMsg() {
|
||||
std::int32_t errsv = system::LastError();
|
||||
auto err = std::error_code{errsv, std::system_category()};
|
||||
return err.message();
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
std::string LoadSequentialFile(std::string uri, bool stream) {
|
||||
auto OpenErr = [&uri]() {
|
||||
std::string msg;
|
||||
msg = "Opening " + uri + " failed: ";
|
||||
msg += SystemErrorMsg();
|
||||
msg += strerror(errno);
|
||||
LOG(FATAL) << msg;
|
||||
};
|
||||
|
||||
@@ -199,159 +155,5 @@ std::string FileExtension(std::string fname, bool lower) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
// For some reason, NVCC 12.1 marks the function deleted if we expose it in the header.
|
||||
// NVCC 11.8 doesn't allow `noexcept(false) = default` altogether.
|
||||
ResourceHandler::~ResourceHandler() noexcept(false) {} // NOLINT
|
||||
|
||||
struct MMAPFile {
|
||||
#if defined(xgboost_IS_WIN)
|
||||
HANDLE fd{INVALID_HANDLE_VALUE};
|
||||
HANDLE file_map{INVALID_HANDLE_VALUE};
|
||||
#else
|
||||
std::int32_t fd{0};
|
||||
#endif
|
||||
std::byte* base_ptr{nullptr};
|
||||
std::size_t base_size{0};
|
||||
std::size_t delta{0};
|
||||
std::string path;
|
||||
|
||||
MMAPFile() = default;
|
||||
|
||||
#if defined(xgboost_IS_WIN)
|
||||
MMAPFile(HANDLE fd, HANDLE fm, std::byte* base_ptr, std::size_t base_size, std::size_t delta,
|
||||
std::string path)
|
||||
: fd{fd},
|
||||
file_map{fm},
|
||||
base_ptr{base_ptr},
|
||||
base_size{base_size},
|
||||
delta{delta},
|
||||
path{std::move(path)} {}
|
||||
#else
|
||||
MMAPFile(std::int32_t fd, std::byte* base_ptr, std::size_t base_size, std::size_t delta,
|
||||
std::string path)
|
||||
: fd{fd}, base_ptr{base_ptr}, base_size{base_size}, delta{delta}, path{std::move(path)} {}
|
||||
#endif
|
||||
};
|
||||
|
||||
std::unique_ptr<MMAPFile> Open(std::string path, std::size_t offset, std::size_t length) {
|
||||
if (length == 0) {
|
||||
return std::make_unique<MMAPFile>();
|
||||
}
|
||||
|
||||
#if defined(xgboost_IS_WIN)
|
||||
HANDLE fd = CreateFile(path.c_str(), GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING,
|
||||
FILE_ATTRIBUTE_NORMAL | FILE_FLAG_OVERLAPPED, nullptr);
|
||||
CHECK_NE(fd, INVALID_HANDLE_VALUE) << "Failed to open:" << path << ". " << SystemErrorMsg();
|
||||
#else
|
||||
auto fd = open(path.c_str(), O_RDONLY);
|
||||
CHECK_GE(fd, 0) << "Failed to open:" << path << ". " << SystemErrorMsg();
|
||||
#endif
|
||||
|
||||
std::byte* ptr{nullptr};
|
||||
// Round down for alignment.
|
||||
auto view_start = offset / GetMmapAlignment() * GetMmapAlignment();
|
||||
auto view_size = length + (offset - view_start);
|
||||
|
||||
#if defined(__linux__) || defined(__GLIBC__)
|
||||
int prot{PROT_READ};
|
||||
ptr = reinterpret_cast<std::byte*>(mmap64(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
|
||||
madvise(ptr, view_size, MADV_WILLNEED);
|
||||
CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg();
|
||||
auto handle =
|
||||
std::make_unique<MMAPFile>(fd, ptr, view_size, offset - view_start, std::move(path));
|
||||
#elif defined(xgboost_IS_WIN)
|
||||
auto file_size = GetFileSize(fd, nullptr);
|
||||
DWORD access = PAGE_READONLY;
|
||||
auto map_file = CreateFileMapping(fd, nullptr, access, 0, file_size, nullptr);
|
||||
access = FILE_MAP_READ;
|
||||
std::uint32_t loff = static_cast<std::uint32_t>(view_start);
|
||||
std::uint32_t hoff = view_start >> 32;
|
||||
CHECK(map_file) << "Failed to map: " << path << ". " << SystemErrorMsg();
|
||||
ptr = reinterpret_cast<std::byte*>(MapViewOfFile(map_file, access, hoff, loff, view_size));
|
||||
CHECK_NE(ptr, nullptr) << "Failed to map: " << path << ". " << SystemErrorMsg();
|
||||
auto handle = std::make_unique<MMAPFile>(fd, map_file, ptr, view_size, offset - view_start,
|
||||
std::move(path));
|
||||
#else
|
||||
CHECK_LE(offset, std::numeric_limits<off_t>::max())
|
||||
<< "File size has exceeded the limit on the current system.";
|
||||
int prot{PROT_READ};
|
||||
ptr = reinterpret_cast<std::byte*>(mmap(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
|
||||
CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg();
|
||||
auto handle =
|
||||
std::make_unique<MMAPFile>(fd, ptr, view_size, offset - view_start, std::move(path));
|
||||
#endif // defined(__linux__)
|
||||
|
||||
return handle;
|
||||
}
|
||||
|
||||
MmapResource::MmapResource(std::string path, std::size_t offset, std::size_t length)
|
||||
: ResourceHandler{kMmap}, handle_{Open(std::move(path), offset, length)}, n_{length} {}
|
||||
|
||||
MmapResource::~MmapResource() noexcept(false) {
|
||||
if (!handle_) {
|
||||
return;
|
||||
}
|
||||
#if defined(xgboost_IS_WIN)
|
||||
if (handle_->base_ptr) {
|
||||
CHECK(UnmapViewOfFile(handle_->base_ptr)) "Faled to call munmap: " << SystemErrorMsg();
|
||||
}
|
||||
if (handle_->fd != INVALID_HANDLE_VALUE) {
|
||||
CHECK(CloseHandle(handle_->fd)) << "Failed to close handle: " << SystemErrorMsg();
|
||||
}
|
||||
if (handle_->file_map != INVALID_HANDLE_VALUE) {
|
||||
CHECK(CloseHandle(handle_->file_map)) << "Failed to close mapping object: " << SystemErrorMsg();
|
||||
}
|
||||
#else
|
||||
if (handle_->base_ptr) {
|
||||
CHECK_NE(munmap(handle_->base_ptr, handle_->base_size), -1)
|
||||
<< "Faled to call munmap: " << handle_->path << ". " << SystemErrorMsg();
|
||||
}
|
||||
if (handle_->fd != 0) {
|
||||
CHECK_NE(close(handle_->fd), -1)
|
||||
<< "Faled to close: " << handle_->path << ". " << SystemErrorMsg();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
[[nodiscard]] void* MmapResource::Data() {
|
||||
if (!handle_) {
|
||||
return nullptr;
|
||||
}
|
||||
return handle_->base_ptr + handle_->delta;
|
||||
}
|
||||
|
||||
[[nodiscard]] std::size_t MmapResource::Size() const { return n_; }
|
||||
|
||||
// For some reason, NVCC 12.1 marks the function deleted if we expose it in the header.
|
||||
// NVCC 11.8 doesn't allow `noexcept(false) = default` altogether.
|
||||
AlignedResourceReadStream::~AlignedResourceReadStream() noexcept(false) {} // NOLINT
|
||||
PrivateMmapConstStream::~PrivateMmapConstStream() noexcept(false) {} // NOLINT
|
||||
|
||||
AlignedFileWriteStream::AlignedFileWriteStream(StringView path, StringView flags)
|
||||
: pimpl_{dmlc::Stream::Create(path.c_str(), flags.c_str())} {}
|
||||
|
||||
[[nodiscard]] std::size_t AlignedFileWriteStream::DoWrite(const void* ptr,
|
||||
std::size_t n_bytes) noexcept(true) {
|
||||
pimpl_->Write(ptr, n_bytes);
|
||||
return n_bytes;
|
||||
}
|
||||
|
||||
AlignedMemWriteStream::AlignedMemWriteStream(std::string* p_buf)
|
||||
: pimpl_{std::make_unique<MemoryBufferStream>(p_buf)} {}
|
||||
AlignedMemWriteStream::~AlignedMemWriteStream() = default;
|
||||
|
||||
[[nodiscard]] std::size_t AlignedMemWriteStream::DoWrite(const void* ptr,
|
||||
std::size_t n_bytes) noexcept(true) {
|
||||
this->pimpl_->Write(ptr, n_bytes);
|
||||
return n_bytes;
|
||||
}
|
||||
|
||||
[[nodiscard]] std::size_t AlignedMemWriteStream::Tell() const noexcept(true) {
|
||||
return this->pimpl_->Tell();
|
||||
}
|
||||
} // namespace xgboost::common
|
||||
|
||||
#if defined(xgboost_IS_WIN)
|
||||
#undef xgboost_IS_WIN
|
||||
#endif // defined(xgboost_IS_WIN)
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
345
src/common/io.h
345
src/common/io.h
@@ -1,32 +1,23 @@
|
||||
/**
|
||||
* Copyright 2014-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright by XGBoost Contributors 2014-2022
|
||||
* \file io.h
|
||||
* \brief general stream interface for serialization, I/O
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
|
||||
#ifndef XGBOOST_COMMON_IO_H_
|
||||
#define XGBOOST_COMMON_IO_H_
|
||||
|
||||
#include <dmlc/io.h>
|
||||
#include <rabit/rabit.h>
|
||||
|
||||
#include <algorithm> // for min
|
||||
#include <array> // for array
|
||||
#include <cstddef> // for byte, size_t
|
||||
#include <cstdlib> // for malloc, realloc, free
|
||||
#include <cstring> // for memcpy
|
||||
#include <fstream> // for ifstream
|
||||
#include <limits> // for numeric_limits
|
||||
#include <memory> // for unique_ptr
|
||||
#include <string> // for string
|
||||
#include <type_traits> // for alignment_of_v, enable_if_t
|
||||
#include <utility> // for move
|
||||
#include <vector> // for vector
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
|
||||
#include "common.h"
|
||||
#include "xgboost/string_view.h" // for StringView
|
||||
|
||||
namespace xgboost::common {
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
using MemoryFixSizeBuffer = rabit::utils::MemoryFixSizeBuffer;
|
||||
using MemoryBufferStream = rabit::utils::MemoryBufferStream;
|
||||
|
||||
@@ -65,8 +56,8 @@ class FixedSizeStream : public PeekableInStream {
|
||||
|
||||
size_t Read(void* dptr, size_t size) override;
|
||||
size_t PeekRead(void* dptr, size_t size) override;
|
||||
[[nodiscard]] std::size_t Size() const { return buffer_.size(); }
|
||||
[[nodiscard]] std::size_t Tell() const { return pointer_; }
|
||||
size_t Size() const { return buffer_.size(); }
|
||||
size_t Tell() const { return pointer_; }
|
||||
void Seek(size_t pos);
|
||||
|
||||
void Write(const void*, size_t) override {
|
||||
@@ -136,318 +127,6 @@ inline std::string ReadAll(std::string const &path) {
|
||||
return content;
|
||||
}
|
||||
|
||||
struct MMAPFile;
|
||||
|
||||
/**
|
||||
* @brief Handler for one-shot resource. Unlike `std::pmr::*`, the resource handler is
|
||||
* fixed once it's constructed. Users cannot use mutable operations like resize
|
||||
* without acquiring the specific resource first.
|
||||
*/
|
||||
class ResourceHandler {
|
||||
public:
|
||||
// RTTI
|
||||
enum Kind : std::uint8_t {
|
||||
kMalloc = 0,
|
||||
kMmap = 1,
|
||||
};
|
||||
|
||||
private:
|
||||
Kind kind_{kMalloc};
|
||||
|
||||
public:
|
||||
virtual void* Data() = 0;
|
||||
template <typename T>
|
||||
[[nodiscard]] T* DataAs() {
|
||||
return reinterpret_cast<T*>(this->Data());
|
||||
}
|
||||
|
||||
[[nodiscard]] virtual std::size_t Size() const = 0;
|
||||
[[nodiscard]] auto Type() const { return kind_; }
|
||||
|
||||
// Allow exceptions for cleaning up resource.
|
||||
virtual ~ResourceHandler() noexcept(false);
|
||||
|
||||
explicit ResourceHandler(Kind kind) : kind_{kind} {}
|
||||
// Use shared_ptr to manage a pool like resource handler. All copy and assignment
|
||||
// operators are disabled.
|
||||
ResourceHandler(ResourceHandler const& that) = delete;
|
||||
ResourceHandler& operator=(ResourceHandler const& that) = delete;
|
||||
ResourceHandler(ResourceHandler&& that) = delete;
|
||||
ResourceHandler& operator=(ResourceHandler&& that) = delete;
|
||||
/**
|
||||
* @brief Wether two resources have the same type. (both malloc or both mmap).
|
||||
*/
|
||||
[[nodiscard]] bool IsSameType(ResourceHandler const& that) const {
|
||||
return this->Type() == that.Type();
|
||||
}
|
||||
};
|
||||
|
||||
class MallocResource : public ResourceHandler {
|
||||
void* ptr_{nullptr};
|
||||
std::size_t n_{0};
|
||||
|
||||
void Clear() noexcept(true) {
|
||||
std::free(ptr_);
|
||||
ptr_ = nullptr;
|
||||
n_ = 0;
|
||||
}
|
||||
|
||||
public:
|
||||
explicit MallocResource(std::size_t n_bytes) : ResourceHandler{kMalloc} { this->Resize(n_bytes); }
|
||||
~MallocResource() noexcept(true) override { this->Clear(); }
|
||||
|
||||
void* Data() override { return ptr_; }
|
||||
[[nodiscard]] std::size_t Size() const override { return n_; }
|
||||
/**
|
||||
* @brief Resize the resource to n_bytes. Unlike std::vector::resize, it prefers realloc
|
||||
* over malloc.
|
||||
*
|
||||
* @tparam force_malloc Force the use of malloc over realloc. Used for testing.
|
||||
*
|
||||
* @param n_bytes The new size.
|
||||
*/
|
||||
template <bool force_malloc = false>
|
||||
void Resize(std::size_t n_bytes) {
|
||||
// realloc(ptr, 0) works, but is deprecated.
|
||||
if (n_bytes == 0) {
|
||||
this->Clear();
|
||||
return;
|
||||
}
|
||||
|
||||
// If realloc fails, we need to copy the data ourselves.
|
||||
bool need_copy{false};
|
||||
void* new_ptr{nullptr};
|
||||
// use realloc first, it can handle nullptr.
|
||||
if constexpr (!force_malloc) {
|
||||
new_ptr = std::realloc(ptr_, n_bytes);
|
||||
}
|
||||
// retry with malloc if realloc fails
|
||||
if (!new_ptr) {
|
||||
// ptr_ is preserved if realloc fails
|
||||
new_ptr = std::malloc(n_bytes);
|
||||
need_copy = true;
|
||||
}
|
||||
if (!new_ptr) {
|
||||
// malloc fails
|
||||
LOG(FATAL) << "bad_malloc: Failed to allocate " << n_bytes << " bytes.";
|
||||
}
|
||||
|
||||
if (need_copy) {
|
||||
std::copy_n(reinterpret_cast<std::byte*>(ptr_), n_, reinterpret_cast<std::byte*>(new_ptr));
|
||||
}
|
||||
// default initialize
|
||||
std::memset(reinterpret_cast<std::byte*>(new_ptr) + n_, '\0', n_bytes - n_);
|
||||
// free the old ptr if malloc is used.
|
||||
if (need_copy) {
|
||||
this->Clear();
|
||||
}
|
||||
|
||||
ptr_ = new_ptr;
|
||||
n_ = n_bytes;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief A class for wrapping mmap as a resource for RAII.
|
||||
*/
|
||||
class MmapResource : public ResourceHandler {
|
||||
std::unique_ptr<MMAPFile> handle_;
|
||||
std::size_t n_;
|
||||
|
||||
public:
|
||||
MmapResource(std::string path, std::size_t offset, std::size_t length);
|
||||
~MmapResource() noexcept(false) override;
|
||||
|
||||
[[nodiscard]] void* Data() override;
|
||||
[[nodiscard]] std::size_t Size() const override;
|
||||
};
|
||||
|
||||
/**
|
||||
* @param Alignment for resource read stream and aligned write stream.
|
||||
*/
|
||||
constexpr std::size_t IOAlignment() {
|
||||
// For most of the pod types in XGBoost, 8 byte is sufficient.
|
||||
return 8;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Wrap resource into a dmlc stream.
|
||||
*
|
||||
* This class is to facilitate the use of mmap. Caller can optionally use the `Read()`
|
||||
* method or the `Consume()` method. The former copies data into output, while the latter
|
||||
* makes copy only if it's a primitive type.
|
||||
*
|
||||
* Input is required to be aligned to IOAlignment().
|
||||
*/
|
||||
class AlignedResourceReadStream {
|
||||
std::shared_ptr<ResourceHandler> resource_;
|
||||
std::size_t curr_ptr_{0};
|
||||
|
||||
// Similar to SEEK_END in libc
|
||||
static std::size_t constexpr kSeekEnd = std::numeric_limits<std::size_t>::max();
|
||||
|
||||
public:
|
||||
explicit AlignedResourceReadStream(std::shared_ptr<ResourceHandler> resource)
|
||||
: resource_{std::move(resource)} {}
|
||||
|
||||
[[nodiscard]] std::shared_ptr<ResourceHandler> Share() noexcept(true) { return resource_; }
|
||||
/**
|
||||
* @brief Consume n_bytes of data, no copying is performed.
|
||||
*
|
||||
* @return A pair with the beginning pointer and the number of available bytes, which
|
||||
* may be smaller than requested.
|
||||
*/
|
||||
[[nodiscard]] auto Consume(std::size_t n_bytes) noexcept(true) {
|
||||
auto res_size = resource_->Size();
|
||||
auto data = reinterpret_cast<std::byte*>(resource_->Data());
|
||||
auto ptr = data + curr_ptr_;
|
||||
|
||||
// Move the cursor
|
||||
auto aligned_n_bytes = DivRoundUp(n_bytes, IOAlignment()) * IOAlignment();
|
||||
auto aligned_forward = std::min(res_size - curr_ptr_, aligned_n_bytes);
|
||||
std::size_t forward = std::min(res_size - curr_ptr_, n_bytes);
|
||||
|
||||
curr_ptr_ += aligned_forward;
|
||||
|
||||
return std::pair{ptr, forward};
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
[[nodiscard]] auto Consume(T* out) noexcept(false) -> std::enable_if_t<std::is_pod_v<T>, bool> {
|
||||
auto [ptr, size] = this->Consume(sizeof(T));
|
||||
if (size != sizeof(T)) {
|
||||
return false;
|
||||
}
|
||||
CHECK_EQ(reinterpret_cast<std::uintptr_t>(ptr) % std::alignment_of_v<T>, 0);
|
||||
*out = *reinterpret_cast<T*>(ptr);
|
||||
return true;
|
||||
}
|
||||
|
||||
[[nodiscard]] virtual std::size_t Tell() noexcept(true) { return curr_ptr_; }
|
||||
/**
|
||||
* @brief Read n_bytes of data, output is copied into ptr.
|
||||
*/
|
||||
[[nodiscard]] std::size_t Read(void* ptr, std::size_t n_bytes) noexcept(true) {
|
||||
auto [res_ptr, forward] = this->Consume(n_bytes);
|
||||
if (forward != 0) {
|
||||
std::memcpy(ptr, res_ptr, forward);
|
||||
}
|
||||
return forward;
|
||||
}
|
||||
/**
|
||||
* @brief Read a primitive type.
|
||||
*
|
||||
* @return Whether the read is successful.
|
||||
*/
|
||||
template <typename T>
|
||||
[[nodiscard]] auto Read(T* out) noexcept(false) -> std::enable_if_t<std::is_pod_v<T>, bool> {
|
||||
return this->Consume(out);
|
||||
}
|
||||
/**
|
||||
* @brief Read a vector.
|
||||
*
|
||||
* @return Whether the read is successful.
|
||||
*/
|
||||
template <typename T>
|
||||
[[nodiscard]] bool Read(std::vector<T>* out) noexcept(true) {
|
||||
std::uint64_t n{0};
|
||||
if (!this->Consume(&n)) {
|
||||
return false;
|
||||
}
|
||||
out->resize(n);
|
||||
|
||||
auto n_bytes = sizeof(T) * n;
|
||||
if (this->Read(out->data(), n_bytes) != n_bytes) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
virtual ~AlignedResourceReadStream() noexcept(false);
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Private mmap file as a read-only stream.
|
||||
*
|
||||
* It can calculate alignment automatically based on system page size (or allocation
|
||||
* granularity on Windows).
|
||||
*
|
||||
* The file is required to be aligned by IOAlignment().
|
||||
*/
|
||||
class PrivateMmapConstStream : public AlignedResourceReadStream {
|
||||
public:
|
||||
/**
|
||||
* @brief Construct a private mmap stream.
|
||||
*
|
||||
* @param path File path.
|
||||
* @param offset See the `offset` parameter of `mmap` for details.
|
||||
* @param length See the `length` parameter of `mmap` for details.
|
||||
*/
|
||||
explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length)
|
||||
: AlignedResourceReadStream{std::make_shared<MmapResource>(path, offset, length)} {}
|
||||
~PrivateMmapConstStream() noexcept(false) override;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Base class for write stream with alignment defined by IOAlignment().
|
||||
*/
|
||||
class AlignedWriteStream {
|
||||
protected:
|
||||
[[nodiscard]] virtual std::size_t DoWrite(const void* ptr,
|
||||
std::size_t n_bytes) noexcept(true) = 0;
|
||||
|
||||
public:
|
||||
virtual ~AlignedWriteStream() = default;
|
||||
|
||||
[[nodiscard]] std::size_t Write(const void* ptr, std::size_t n_bytes) noexcept(false) {
|
||||
auto aligned_n_bytes = DivRoundUp(n_bytes, IOAlignment()) * IOAlignment();
|
||||
auto w_n_bytes = this->DoWrite(ptr, n_bytes);
|
||||
CHECK_EQ(w_n_bytes, n_bytes);
|
||||
auto remaining = aligned_n_bytes - n_bytes;
|
||||
if (remaining > 0) {
|
||||
std::array<std::uint8_t, IOAlignment()> padding;
|
||||
std::memset(padding.data(), '\0', padding.size());
|
||||
w_n_bytes = this->DoWrite(padding.data(), remaining);
|
||||
CHECK_EQ(w_n_bytes, remaining);
|
||||
}
|
||||
return aligned_n_bytes;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
[[nodiscard]] std::enable_if_t<std::is_pod_v<T>, std::size_t> Write(T const& v) {
|
||||
return this->Write(&v, sizeof(T));
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Output stream backed by a file. Aligned to IOAlignment() bytes.
|
||||
*/
|
||||
class AlignedFileWriteStream : public AlignedWriteStream {
|
||||
std::unique_ptr<dmlc::Stream> pimpl_;
|
||||
|
||||
protected:
|
||||
[[nodiscard]] std::size_t DoWrite(const void* ptr, std::size_t n_bytes) noexcept(true) override;
|
||||
|
||||
public:
|
||||
AlignedFileWriteStream() = default;
|
||||
AlignedFileWriteStream(StringView path, StringView flags);
|
||||
~AlignedFileWriteStream() override = default;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Output stream backed by memory buffer. Aligned to IOAlignment() bytes.
|
||||
*/
|
||||
class AlignedMemWriteStream : public AlignedFileWriteStream {
|
||||
std::unique_ptr<MemoryBufferStream> pimpl_;
|
||||
|
||||
protected:
|
||||
[[nodiscard]] std::size_t DoWrite(const void* ptr, std::size_t n_bytes) noexcept(true) override;
|
||||
|
||||
public:
|
||||
explicit AlignedMemWriteStream(std::string* p_buf);
|
||||
~AlignedMemWriteStream() override;
|
||||
|
||||
[[nodiscard]] std::size_t Tell() const noexcept(true);
|
||||
};
|
||||
} // namespace xgboost::common
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_COMMON_IO_H_
|
||||
|
||||
@@ -12,7 +12,8 @@
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "../collective/communicator-inl.cuh"
|
||||
#include "../collective/communicator.h"
|
||||
#include "../collective/device_communicator.cuh"
|
||||
#include "categorical.h"
|
||||
#include "common.h"
|
||||
#include "device_helpers.cuh"
|
||||
@@ -509,6 +510,7 @@ void SketchContainer::AllReduce() {
|
||||
}
|
||||
|
||||
timer_.Start(__func__);
|
||||
auto* communicator = collective::Communicator::GetDevice(device_);
|
||||
// Reduce the overhead on syncing.
|
||||
size_t global_sum_rows = num_rows_;
|
||||
collective::Allreduce<collective::Operation::kSum>(&global_sum_rows, 1);
|
||||
@@ -529,15 +531,14 @@ void SketchContainer::AllReduce() {
|
||||
auto offset = rank * d_columns_ptr.size();
|
||||
thrust::copy(thrust::device, d_columns_ptr.data(), d_columns_ptr.data() + d_columns_ptr.size(),
|
||||
gathered_ptrs.begin() + offset);
|
||||
collective::AllReduce<collective::Operation::kSum>(device_, gathered_ptrs.data().get(),
|
||||
gathered_ptrs.size());
|
||||
communicator->AllReduceSum(gathered_ptrs.data().get(), gathered_ptrs.size());
|
||||
|
||||
// Get the data from all workers.
|
||||
std::vector<size_t> recv_lengths;
|
||||
dh::caching_device_vector<char> recvbuf;
|
||||
collective::AllGatherV(device_, this->Current().data().get(),
|
||||
dh::ToSpan(this->Current()).size_bytes(), &recv_lengths, &recvbuf);
|
||||
collective::Synchronize(device_);
|
||||
communicator->AllGatherV(this->Current().data().get(), dh::ToSpan(this->Current()).size_bytes(),
|
||||
&recv_lengths, &recvbuf);
|
||||
communicator->Synchronize();
|
||||
|
||||
// Segment the received data.
|
||||
auto s_recvbuf = dh::ToSpan(recvbuf);
|
||||
|
||||
@@ -1,158 +0,0 @@
|
||||
/**
|
||||
* Copyright 2023, XGBoost Contributors
|
||||
*/
|
||||
#ifndef XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
|
||||
#define XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
|
||||
|
||||
#include <algorithm> // for fill_n
|
||||
#include <cstdint> // for uint64_t
|
||||
#include <cstring> // for memcpy
|
||||
#include <memory> // for shared_ptr, make_shared
|
||||
#include <type_traits> // for is_reference_v, remove_reference_t, is_same_v
|
||||
#include <utility> // for swap, move
|
||||
|
||||
#include "io.h" // for ResourceHandler, AlignedResourceReadStream, MallocResource
|
||||
#include "xgboost/logging.h"
|
||||
#include "xgboost/span.h" // for Span
|
||||
|
||||
namespace xgboost::common {
|
||||
/**
|
||||
* @brief A vector-like type that holds a reference counted resource.
|
||||
*
|
||||
* The vector size is immutable after construction. This way we can swap the underlying
|
||||
* resource when needed.
|
||||
*/
|
||||
template <typename T>
|
||||
class RefResourceView {
|
||||
static_assert(!std::is_reference_v<T>);
|
||||
|
||||
public:
|
||||
using value_type = T; // NOLINT
|
||||
using size_type = std::uint64_t; // NOLINT
|
||||
|
||||
private:
|
||||
value_type* ptr_{nullptr};
|
||||
size_type size_{0};
|
||||
std::shared_ptr<common::ResourceHandler> mem_{nullptr};
|
||||
|
||||
public:
|
||||
RefResourceView(value_type* ptr, size_type n, std::shared_ptr<common::ResourceHandler> mem)
|
||||
: ptr_{ptr}, size_{n}, mem_{std::move(mem)} {
|
||||
CHECK_GE(mem_->Size(), n);
|
||||
}
|
||||
/**
|
||||
* @brief Construct a view on ptr with length n. The ptr is held by the mem resource.
|
||||
*
|
||||
* @param ptr The pointer to view.
|
||||
* @param n The length of the view.
|
||||
* @param mem The owner of the pointer.
|
||||
* @param init Initialize the view with this value.
|
||||
*/
|
||||
RefResourceView(value_type* ptr, size_type n, std::shared_ptr<common::ResourceHandler> mem,
|
||||
T const& init)
|
||||
: RefResourceView{ptr, n, mem} {
|
||||
if (n != 0) {
|
||||
std::fill_n(ptr_, n, init);
|
||||
}
|
||||
}
|
||||
|
||||
~RefResourceView() = default;
|
||||
|
||||
RefResourceView() = default;
|
||||
RefResourceView(RefResourceView const& that) = delete;
|
||||
RefResourceView(RefResourceView&& that) = delete;
|
||||
RefResourceView& operator=(RefResourceView const& that) = delete;
|
||||
/**
|
||||
* @brief We allow move assignment for lazy initialization.
|
||||
*/
|
||||
RefResourceView& operator=(RefResourceView&& that) = default;
|
||||
|
||||
[[nodiscard]] size_type size() const { return size_; } // NOLINT
|
||||
[[nodiscard]] size_type size_bytes() const { // NOLINT
|
||||
return Span{data(), size()}.size_bytes();
|
||||
}
|
||||
[[nodiscard]] value_type* data() { return ptr_; }; // NOLINT
|
||||
[[nodiscard]] value_type const* data() const { return ptr_; }; // NOLINT
|
||||
[[nodiscard]] bool empty() const { return size() == 0; } // NOLINT
|
||||
|
||||
[[nodiscard]] auto cbegin() const { return data(); } // NOLINT
|
||||
[[nodiscard]] auto begin() { return data(); } // NOLINT
|
||||
[[nodiscard]] auto begin() const { return cbegin(); } // NOLINT
|
||||
[[nodiscard]] auto cend() const { return data() + size(); } // NOLINT
|
||||
[[nodiscard]] auto end() { return data() + size(); } // NOLINT
|
||||
[[nodiscard]] auto end() const { return cend(); } // NOLINT
|
||||
|
||||
[[nodiscard]] auto const& front() const { return data()[0]; } // NOLINT
|
||||
[[nodiscard]] auto& front() { return data()[0]; } // NOLINT
|
||||
[[nodiscard]] auto const& back() const { return data()[size() - 1]; } // NOLINT
|
||||
[[nodiscard]] auto& back() { return data()[size() - 1]; } // NOLINT
|
||||
|
||||
[[nodiscard]] value_type& operator[](size_type i) { return ptr_[i]; }
|
||||
[[nodiscard]] value_type const& operator[](size_type i) const { return ptr_[i]; }
|
||||
|
||||
/**
|
||||
* @brief Get the underlying resource.
|
||||
*/
|
||||
auto Resource() const { return mem_; }
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Read a vector from stream. Accepts both `std::vector` and `RefResourceView`.
|
||||
*
|
||||
* If the output vector is a referenced counted view, no copying occur.
|
||||
*/
|
||||
template <typename Vec>
|
||||
[[nodiscard]] bool ReadVec(common::AlignedResourceReadStream* fi, Vec* vec) {
|
||||
std::uint64_t n{0};
|
||||
if (!fi->Read(&n)) {
|
||||
return false;
|
||||
}
|
||||
if (n == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
using T = typename Vec::value_type;
|
||||
auto expected_bytes = sizeof(T) * n;
|
||||
|
||||
auto [ptr, n_bytes] = fi->Consume(expected_bytes);
|
||||
if (n_bytes != expected_bytes) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<Vec, RefResourceView<T>>) {
|
||||
*vec = RefResourceView<T>{reinterpret_cast<T*>(ptr), n, fi->Share()};
|
||||
} else {
|
||||
vec->resize(n);
|
||||
std::memcpy(vec->data(), ptr, n_bytes);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Write a vector to stream. Accepts both `std::vector` and `RefResourceView`.
|
||||
*/
|
||||
template <typename Vec>
|
||||
[[nodiscard]] std::size_t WriteVec(AlignedFileWriteStream* fo, Vec const& vec) {
|
||||
std::size_t bytes{0};
|
||||
auto n = static_cast<std::uint64_t>(vec.size());
|
||||
bytes += fo->Write(n);
|
||||
if (n == 0) {
|
||||
return sizeof(n);
|
||||
}
|
||||
|
||||
using T = typename std::remove_reference_t<decltype(vec)>::value_type;
|
||||
bytes += fo->Write(vec.data(), vec.size() * sizeof(T));
|
||||
|
||||
return bytes;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Make a fixed size `RefResourceView` with malloc resource.
|
||||
*/
|
||||
template <typename T>
|
||||
[[nodiscard]] RefResourceView<T> MakeFixedVecWithMalloc(std::size_t n_elements, T const& init) {
|
||||
auto resource = std::make_shared<common::MallocResource>(n_elements * sizeof(T));
|
||||
return RefResourceView{resource->DataAs<T>(), n_elements, resource, init};
|
||||
}
|
||||
} // namespace xgboost::common
|
||||
#endif // XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
|
||||
@@ -7,7 +7,7 @@
|
||||
#include <dmlc/data.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstddef> // std::size_t
|
||||
#include <functional>
|
||||
#include <limits>
|
||||
#include <map>
|
||||
@@ -17,7 +17,6 @@
|
||||
#include <vector>
|
||||
|
||||
#include "../c_api/c_api_error.h"
|
||||
#include "../common/error_msg.h" // for MaxFeatureSize
|
||||
#include "../common/math.h"
|
||||
#include "array_interface.h"
|
||||
#include "arrow-cdi.h"
|
||||
@@ -301,9 +300,9 @@ class ArrayAdapter : public detail::SingleBatchDataIter<ArrayAdapterBatch> {
|
||||
array_interface_ = ArrayInterface<2>(get<Object const>(j));
|
||||
batch_ = ArrayAdapterBatch{array_interface_};
|
||||
}
|
||||
[[nodiscard]] ArrayAdapterBatch const& Value() const override { return batch_; }
|
||||
[[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); }
|
||||
[[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape(1); }
|
||||
ArrayAdapterBatch const& Value() const override { return batch_; }
|
||||
size_t NumRows() const { return array_interface_.Shape(0); }
|
||||
size_t NumColumns() const { return array_interface_.Shape(1); }
|
||||
|
||||
private:
|
||||
ArrayAdapterBatch batch_;
|
||||
|
||||
@@ -590,7 +590,7 @@ class ArrayInterface {
|
||||
template <std::int32_t D, typename Fn>
|
||||
void DispatchDType(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
|
||||
// Only used for cuDF at the moment.
|
||||
CHECK_EQ(array.valid.Capacity(), 0);
|
||||
CHECK_EQ(array.valid.Size(), 0);
|
||||
auto dispatch = [&](auto t) {
|
||||
using T = std::remove_const_t<decltype(t)> const;
|
||||
// Set the data size to max as we don't know the original size of a sliced array:
|
||||
|
||||
@@ -416,8 +416,7 @@ void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::Tensor<T
|
||||
p_out->Reshape(array.shape);
|
||||
return;
|
||||
}
|
||||
CHECK_EQ(array.valid.Capacity(), 0)
|
||||
<< "Meta info like label or weight can not have missing value.";
|
||||
CHECK(array.valid.Size() == 0) << "Meta info like label or weight can not have missing value.";
|
||||
if (array.is_contiguous && array.type == ToDType<T>::kType) {
|
||||
// Handle contigious
|
||||
p_out->ModifyInplace([&](HostDeviceVector<T>* data, common::Span<size_t, D> shape) {
|
||||
|
||||
@@ -33,8 +33,7 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
|
||||
p_out->Reshape(array.shape);
|
||||
return;
|
||||
}
|
||||
CHECK_EQ(array.valid.Capacity(), 0)
|
||||
<< "Meta info like label or weight can not have missing value.";
|
||||
CHECK(array.valid.Size() == 0) << "Meta info like label or weight can not have missing value.";
|
||||
auto ptr_device = SetDeviceToPtr(array.data);
|
||||
p_out->SetDevice(ptr_device);
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
#include <thrust/iterator/transform_output_iterator.h>
|
||||
|
||||
#include "../common/categorical.h"
|
||||
#include "../common/cuda_context.cuh"
|
||||
#include "../common/hist_util.cuh"
|
||||
#include "../common/random.h"
|
||||
#include "../common/transform_iterator.h" // MakeIndexTransformIter
|
||||
@@ -314,8 +313,7 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span<size_t const>
|
||||
auto d_csc_indptr = dh::ToSpan(csc_indptr);
|
||||
|
||||
auto bin_type = page.index.GetBinTypeSize();
|
||||
common::CompressedBufferWriter writer{page.cut.TotalBins() +
|
||||
static_cast<std::size_t>(1)}; // +1 for null value
|
||||
common::CompressedBufferWriter writer{page.cut.TotalBins() + 1}; // +1 for null value
|
||||
|
||||
dh::LaunchN(row_stride * page.Size(), [=] __device__(size_t idx) mutable {
|
||||
auto ridx = idx / row_stride;
|
||||
@@ -359,10 +357,8 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
|
||||
|
||||
// copy gidx
|
||||
common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer();
|
||||
dh::device_vector<size_t> row_ptr(page.row_ptr.size());
|
||||
dh::device_vector<size_t> row_ptr(page.row_ptr);
|
||||
auto d_row_ptr = dh::ToSpan(row_ptr);
|
||||
dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
|
||||
cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
|
||||
|
||||
auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
|
||||
auto null = accessor.NullValue();
|
||||
|
||||
@@ -1,59 +1,60 @@
|
||||
/**
|
||||
* Copyright 2019-2023, XGBoost contributors
|
||||
/*!
|
||||
* Copyright 2019-2021 XGBoost contributors
|
||||
*/
|
||||
#include <xgboost/data.h>
|
||||
#include <dmlc/registry.h>
|
||||
|
||||
#include <cstddef> // for size_t
|
||||
|
||||
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
|
||||
#include "../common/ref_resource_view.h" // for ReadVec, WriteVec
|
||||
#include "ellpack_page.cuh"
|
||||
#include "histogram_cut_format.h" // for ReadHistogramCuts, WriteHistogramCuts
|
||||
#include "sparse_page_writer.h" // for SparsePageFormat
|
||||
#include "sparse_page_writer.h"
|
||||
#include "histogram_cut_format.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
namespace xgboost::data {
|
||||
DMLC_REGISTRY_FILE_TAG(ellpack_page_raw_format);
|
||||
|
||||
|
||||
class EllpackPageRawFormat : public SparsePageFormat<EllpackPage> {
|
||||
public:
|
||||
bool Read(EllpackPage* page, common::AlignedResourceReadStream* fi) override {
|
||||
bool Read(EllpackPage* page, dmlc::SeekStream* fi) override {
|
||||
auto* impl = page->Impl();
|
||||
if (!ReadHistogramCuts(&impl->Cuts(), fi)) {
|
||||
return false;
|
||||
}
|
||||
if (!fi->Read(&impl->n_rows)) {
|
||||
return false;
|
||||
}
|
||||
if (!fi->Read(&impl->is_dense)) {
|
||||
return false;
|
||||
}
|
||||
if (!fi->Read(&impl->row_stride)) {
|
||||
return false;
|
||||
}
|
||||
if (!common::ReadVec(fi, &impl->gidx_buffer.HostVector())) {
|
||||
return false;
|
||||
}
|
||||
fi->Read(&impl->n_rows);
|
||||
fi->Read(&impl->is_dense);
|
||||
fi->Read(&impl->row_stride);
|
||||
fi->Read(&impl->gidx_buffer.HostVector());
|
||||
if (!fi->Read(&impl->base_rowid)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t Write(const EllpackPage& page, common::AlignedFileWriteStream* fo) override {
|
||||
std::size_t bytes{0};
|
||||
size_t Write(const EllpackPage& page, dmlc::Stream* fo) override {
|
||||
size_t bytes = 0;
|
||||
auto* impl = page.Impl();
|
||||
bytes += WriteHistogramCuts(impl->Cuts(), fo);
|
||||
bytes += fo->Write(impl->n_rows);
|
||||
bytes += fo->Write(impl->is_dense);
|
||||
bytes += fo->Write(impl->row_stride);
|
||||
fo->Write(impl->n_rows);
|
||||
bytes += sizeof(impl->n_rows);
|
||||
fo->Write(impl->is_dense);
|
||||
bytes += sizeof(impl->is_dense);
|
||||
fo->Write(impl->row_stride);
|
||||
bytes += sizeof(impl->row_stride);
|
||||
CHECK(!impl->gidx_buffer.ConstHostVector().empty());
|
||||
bytes += common::WriteVec(fo, impl->gidx_buffer.HostVector());
|
||||
bytes += fo->Write(impl->base_rowid);
|
||||
fo->Write(impl->gidx_buffer.HostVector());
|
||||
bytes += impl->gidx_buffer.ConstHostSpan().size_bytes() + sizeof(uint64_t);
|
||||
fo->Write(impl->base_rowid);
|
||||
bytes += sizeof(impl->base_rowid);
|
||||
return bytes;
|
||||
}
|
||||
};
|
||||
|
||||
XGBOOST_REGISTER_ELLPACK_PAGE_FORMAT(raw)
|
||||
.describe("Raw ELLPACK binary data format.")
|
||||
.set_body([]() { return new EllpackPageRawFormat(); });
|
||||
} // namespace xgboost::data
|
||||
.set_body([]() {
|
||||
return new EllpackPageRawFormat();
|
||||
});
|
||||
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -29,7 +29,7 @@ GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_
|
||||
cut = common::SketchOnDMatrix(ctx, p_fmat, max_bins_per_feat, sorted_sketch, hess);
|
||||
|
||||
const uint32_t nbins = cut.Ptrs().back();
|
||||
hit_count = common::MakeFixedVecWithMalloc(nbins, std::size_t{0});
|
||||
hit_count.resize(nbins, 0);
|
||||
hit_count_tloc_.resize(ctx->Threads() * nbins, 0);
|
||||
|
||||
size_t new_size = 1;
|
||||
@@ -37,7 +37,8 @@ GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_
|
||||
new_size += batch.Size();
|
||||
}
|
||||
|
||||
row_ptr = common::MakeFixedVecWithMalloc(new_size, std::size_t{0});
|
||||
row_ptr.resize(new_size);
|
||||
row_ptr[0] = 0;
|
||||
|
||||
const bool isDense = p_fmat->IsDense();
|
||||
this->isDense_ = isDense;
|
||||
@@ -60,8 +61,8 @@ GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_
|
||||
|
||||
GHistIndexMatrix::GHistIndexMatrix(MetaInfo const &info, common::HistogramCuts &&cuts,
|
||||
bst_bin_t max_bin_per_feat)
|
||||
: row_ptr{common::MakeFixedVecWithMalloc(info.num_row_ + 1, std::size_t{0})},
|
||||
hit_count{common::MakeFixedVecWithMalloc(cuts.TotalBins(), std::size_t{0})},
|
||||
: row_ptr(info.num_row_ + 1, 0),
|
||||
hit_count(cuts.TotalBins(), 0),
|
||||
cut{std::forward<common::HistogramCuts>(cuts)},
|
||||
max_numeric_bins_per_feat(max_bin_per_feat),
|
||||
isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}
|
||||
@@ -94,10 +95,12 @@ GHistIndexMatrix::GHistIndexMatrix(SparsePage const &batch, common::Span<Feature
|
||||
isDense_{isDense} {
|
||||
CHECK_GE(n_threads, 1);
|
||||
CHECK_EQ(row_ptr.size(), 0);
|
||||
row_ptr = common::MakeFixedVecWithMalloc(batch.Size() + 1, std::size_t{0});
|
||||
|
||||
// The number of threads is pegged to the batch size. If the OMP
|
||||
// block is parallelized on anything other than the batch/block size,
|
||||
// it should be reassigned
|
||||
row_ptr.resize(batch.Size() + 1, 0);
|
||||
const uint32_t nbins = cut.Ptrs().back();
|
||||
hit_count = common::MakeFixedVecWithMalloc(nbins, std::size_t{0});
|
||||
hit_count.resize(nbins, 0);
|
||||
hit_count_tloc_.resize(n_threads * nbins, 0);
|
||||
|
||||
this->PushBatch(batch, ft, n_threads);
|
||||
@@ -125,45 +128,20 @@ INSTANTIATION_PUSH(data::SparsePageAdapterBatch)
|
||||
#undef INSTANTIATION_PUSH
|
||||
|
||||
void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
|
||||
auto make_index = [this, n_index](auto t, common::BinTypeSize t_size) {
|
||||
// Must resize instead of allocating a new one. This function is called everytime a
|
||||
// new batch is pushed, and we grow the size accordingly without loosing the data the
|
||||
// previous batches.
|
||||
using T = decltype(t);
|
||||
std::size_t n_bytes = sizeof(T) * n_index;
|
||||
CHECK_GE(n_bytes, this->data.size());
|
||||
|
||||
auto resource = this->data.Resource();
|
||||
decltype(this->data) new_vec;
|
||||
if (!resource) {
|
||||
CHECK(this->data.empty());
|
||||
new_vec = common::MakeFixedVecWithMalloc(n_bytes, std::uint8_t{0});
|
||||
} else {
|
||||
CHECK(resource->Type() == common::ResourceHandler::kMalloc);
|
||||
auto malloc_resource = std::dynamic_pointer_cast<common::MallocResource>(resource);
|
||||
CHECK(malloc_resource);
|
||||
malloc_resource->Resize(n_bytes);
|
||||
|
||||
// gcc-11.3 doesn't work if DataAs is used.
|
||||
std::uint8_t *new_ptr = reinterpret_cast<std::uint8_t *>(malloc_resource->Data());
|
||||
new_vec = {new_ptr, n_bytes / sizeof(std::uint8_t), malloc_resource};
|
||||
}
|
||||
this->data = std::move(new_vec);
|
||||
this->index = common::Index{common::Span{data.data(), data.size()}, t_size};
|
||||
};
|
||||
|
||||
if ((MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) &&
|
||||
isDense) {
|
||||
// compress dense index to uint8
|
||||
make_index(std::uint8_t{}, common::kUint8BinsTypeSize);
|
||||
index.SetBinTypeSize(common::kUint8BinsTypeSize);
|
||||
index.Resize((sizeof(uint8_t)) * n_index);
|
||||
} else if ((MaxNumBinPerFeat() - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
|
||||
MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
|
||||
isDense) {
|
||||
// compress dense index to uint16
|
||||
make_index(std::uint16_t{}, common::kUint16BinsTypeSize);
|
||||
index.SetBinTypeSize(common::kUint16BinsTypeSize);
|
||||
index.Resize((sizeof(uint16_t)) * n_index);
|
||||
} else {
|
||||
// no compression
|
||||
make_index(std::uint32_t{}, common::kUint32BinsTypeSize);
|
||||
index.SetBinTypeSize(common::kUint32BinsTypeSize);
|
||||
index.Resize((sizeof(uint32_t)) * n_index);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -236,11 +214,11 @@ float GHistIndexMatrix::GetFvalue(std::vector<std::uint32_t> const &ptrs,
|
||||
return std::numeric_limits<float>::quiet_NaN();
|
||||
}
|
||||
|
||||
bool GHistIndexMatrix::ReadColumnPage(common::AlignedResourceReadStream *fi) {
|
||||
bool GHistIndexMatrix::ReadColumnPage(dmlc::SeekStream *fi) {
|
||||
return this->columns_->Read(fi, this->cut.Ptrs().data());
|
||||
}
|
||||
|
||||
std::size_t GHistIndexMatrix::WriteColumnPage(common::AlignedFileWriteStream *fo) const {
|
||||
size_t GHistIndexMatrix::WriteColumnPage(dmlc::Stream *fo) const {
|
||||
return this->columns_->Write(fo);
|
||||
}
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2022-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright 2022 by XGBoost Contributors
|
||||
*/
|
||||
#include <memory> // std::unique_ptr
|
||||
|
||||
@@ -41,9 +41,9 @@ void SetIndexData(Context const* ctx, EllpackPageImpl const* page,
|
||||
}
|
||||
|
||||
void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page,
|
||||
common::RefResourceView<std::size_t>* p_out) {
|
||||
std::vector<size_t>* p_out) {
|
||||
auto& row_ptr = *p_out;
|
||||
row_ptr = common::MakeFixedVecWithMalloc(page->Size() + 1, std::size_t{0});
|
||||
row_ptr.resize(page->Size() + 1, 0);
|
||||
if (page->is_dense) {
|
||||
std::fill(row_ptr.begin() + 1, row_ptr.end(), page->row_stride);
|
||||
} else {
|
||||
@@ -95,7 +95,7 @@ GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info,
|
||||
ctx, page, &hit_count_tloc_, [&](auto bin_idx, auto) { return bin_idx; }, this);
|
||||
}
|
||||
|
||||
this->hit_count = common::MakeFixedVecWithMalloc(n_bins_total, std::size_t{0});
|
||||
this->hit_count.resize(n_bins_total, 0);
|
||||
this->GatherHitCount(ctx->Threads(), n_bins_total);
|
||||
|
||||
// sanity checks
|
||||
|
||||
@@ -9,14 +9,13 @@
|
||||
#include <atomic> // for atomic
|
||||
#include <cinttypes> // for uint32_t
|
||||
#include <cstddef> // for size_t
|
||||
#include <memory> // for make_unique
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "../common/categorical.h"
|
||||
#include "../common/error_msg.h" // for InfInData
|
||||
#include "../common/hist_util.h"
|
||||
#include "../common/numeric.h"
|
||||
#include "../common/ref_resource_view.h" // for RefResourceView
|
||||
#include "../common/threading_utils.h"
|
||||
#include "../common/transform_iterator.h" // for MakeIndexTransformIter
|
||||
#include "adapter.h"
|
||||
@@ -26,11 +25,9 @@
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
class ColumnMatrix;
|
||||
class AlignedFileWriteStream;
|
||||
} // namespace common
|
||||
|
||||
/**
|
||||
* @brief preprocessed global index matrix, in CSR format.
|
||||
/*!
|
||||
* \brief preprocessed global index matrix, in CSR format
|
||||
*
|
||||
* Transform floating values to integer index in histogram This is a global histogram
|
||||
* index for CPU histogram. On GPU ellpack page is used.
|
||||
@@ -136,22 +133,20 @@ class GHistIndexMatrix {
|
||||
}
|
||||
|
||||
public:
|
||||
/** @brief row pointer to rows by element position */
|
||||
common::RefResourceView<std::size_t> row_ptr;
|
||||
/** @brief data storage for index. */
|
||||
common::RefResourceView<std::uint8_t> data;
|
||||
/** @brief The histogram index. */
|
||||
/*! \brief row pointer to rows by element position */
|
||||
std::vector<size_t> row_ptr;
|
||||
/*! \brief The index data */
|
||||
common::Index index;
|
||||
/** @brief hit count of each index, used for constructing the ColumnMatrix */
|
||||
common::RefResourceView<std::size_t> hit_count;
|
||||
/** @brief The corresponding cuts */
|
||||
/*! \brief hit count of each index, used for constructing the ColumnMatrix */
|
||||
std::vector<size_t> hit_count;
|
||||
/*! \brief The corresponding cuts */
|
||||
common::HistogramCuts cut;
|
||||
/** @brief max_bin for each feature. */
|
||||
/** \brief max_bin for each feature. */
|
||||
bst_bin_t max_numeric_bins_per_feat;
|
||||
/** @brief base row index for current page (used by external memory) */
|
||||
bst_row_t base_rowid{0};
|
||||
/*! \brief base row index for current page (used by external memory) */
|
||||
size_t base_rowid{0};
|
||||
|
||||
[[nodiscard]] bst_bin_t MaxNumBinPerFeat() const {
|
||||
bst_bin_t MaxNumBinPerFeat() const {
|
||||
return std::max(static_cast<bst_bin_t>(cut.MaxCategory() + 1), max_numeric_bins_per_feat);
|
||||
}
|
||||
|
||||
@@ -223,27 +218,29 @@ class GHistIndexMatrix {
|
||||
}
|
||||
}
|
||||
|
||||
[[nodiscard]] bool IsDense() const { return isDense_; }
|
||||
bool IsDense() const {
|
||||
return isDense_;
|
||||
}
|
||||
void SetDense(bool is_dense) { isDense_ = is_dense; }
|
||||
/**
|
||||
* @brief Get the local row index.
|
||||
* \brief Get the local row index.
|
||||
*/
|
||||
[[nodiscard]] std::size_t RowIdx(size_t ridx) const { return row_ptr[ridx - base_rowid]; }
|
||||
size_t RowIdx(size_t ridx) const { return row_ptr[ridx - base_rowid]; }
|
||||
|
||||
[[nodiscard]] bst_row_t Size() const { return row_ptr.empty() ? 0 : row_ptr.size() - 1; }
|
||||
[[nodiscard]] bst_feature_t Features() const { return cut.Ptrs().size() - 1; }
|
||||
bst_row_t Size() const { return row_ptr.empty() ? 0 : row_ptr.size() - 1; }
|
||||
bst_feature_t Features() const { return cut.Ptrs().size() - 1; }
|
||||
|
||||
[[nodiscard]] bool ReadColumnPage(common::AlignedResourceReadStream* fi);
|
||||
[[nodiscard]] std::size_t WriteColumnPage(common::AlignedFileWriteStream* fo) const;
|
||||
bool ReadColumnPage(dmlc::SeekStream* fi);
|
||||
size_t WriteColumnPage(dmlc::Stream* fo) const;
|
||||
|
||||
[[nodiscard]] common::ColumnMatrix const& Transpose() const;
|
||||
common::ColumnMatrix const& Transpose() const;
|
||||
|
||||
[[nodiscard]] bst_bin_t GetGindex(size_t ridx, size_t fidx) const;
|
||||
bst_bin_t GetGindex(size_t ridx, size_t fidx) const;
|
||||
|
||||
[[nodiscard]] float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const;
|
||||
[[nodiscard]] float GetFvalue(std::vector<std::uint32_t> const& ptrs,
|
||||
std::vector<float> const& values, std::vector<float> const& mins,
|
||||
bst_row_t ridx, bst_feature_t fidx, bool is_cat) const;
|
||||
float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const;
|
||||
float GetFvalue(std::vector<std::uint32_t> const& ptrs, std::vector<float> const& values,
|
||||
std::vector<float> const& mins, bst_row_t ridx, bst_feature_t fidx,
|
||||
bool is_cat) const;
|
||||
|
||||
private:
|
||||
std::unique_ptr<common::ColumnMatrix> columns_;
|
||||
@@ -297,5 +294,5 @@ void AssignColumnBinIndex(GHistIndexMatrix const& page, Fn&& assign) {
|
||||
}
|
||||
});
|
||||
}
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_DATA_GRADIENT_INDEX_H_
|
||||
|
||||
@@ -1,49 +1,38 @@
|
||||
/**
|
||||
* Copyright 2021-2023 XGBoost contributors
|
||||
/*!
|
||||
* Copyright 2021-2022 XGBoost contributors
|
||||
*/
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint> // for uint8_t
|
||||
#include <type_traits> // for underlying_type_t
|
||||
#include <vector> // for vector
|
||||
#include "sparse_page_writer.h"
|
||||
#include "gradient_index.h"
|
||||
#include "histogram_cut_format.h"
|
||||
|
||||
#include "../common/io.h" // for AlignedResourceReadStream
|
||||
#include "../common/ref_resource_view.h" // for ReadVec, WriteVec
|
||||
#include "gradient_index.h" // for GHistIndexMatrix
|
||||
#include "histogram_cut_format.h" // for ReadHistogramCuts
|
||||
#include "sparse_page_writer.h" // for SparsePageFormat
|
||||
|
||||
namespace xgboost::data {
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
|
||||
public:
|
||||
bool Read(GHistIndexMatrix* page, common::AlignedResourceReadStream* fi) override {
|
||||
CHECK(fi);
|
||||
|
||||
bool Read(GHistIndexMatrix* page, dmlc::SeekStream* fi) override {
|
||||
if (!ReadHistogramCuts(&page->cut, fi)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// indptr
|
||||
if (!common::ReadVec(fi, &page->row_ptr)) {
|
||||
fi->Read(&page->row_ptr);
|
||||
// data
|
||||
std::vector<uint8_t> data;
|
||||
if (!fi->Read(&data)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// data
|
||||
// - bin type
|
||||
page->index.Resize(data.size());
|
||||
std::copy(data.cbegin(), data.cend(), page->index.begin());
|
||||
// bin type
|
||||
// Old gcc doesn't support reading from enum.
|
||||
std::underlying_type_t<common::BinTypeSize> uint_bin_type{0};
|
||||
if (!fi->Read(&uint_bin_type)) {
|
||||
return false;
|
||||
}
|
||||
common::BinTypeSize size_type = static_cast<common::BinTypeSize>(uint_bin_type);
|
||||
// - index buffer
|
||||
if (!common::ReadVec(fi, &page->data)) {
|
||||
return false;
|
||||
}
|
||||
// - index
|
||||
page->index = common::Index{common::Span{page->data.data(), page->data.size()}, size_type};
|
||||
|
||||
common::BinTypeSize size_type =
|
||||
static_cast<common::BinTypeSize>(uint_bin_type);
|
||||
page->index.SetBinTypeSize(size_type);
|
||||
// hit count
|
||||
if (!common::ReadVec(fi, &page->hit_count)) {
|
||||
if (!fi->Read(&page->hit_count)) {
|
||||
return false;
|
||||
}
|
||||
if (!fi->Read(&page->max_numeric_bins_per_feat)) {
|
||||
@@ -61,34 +50,38 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
|
||||
page->index.SetBinOffset(page->cut.Ptrs());
|
||||
}
|
||||
|
||||
if (!page->ReadColumnPage(fi)) {
|
||||
return false;
|
||||
}
|
||||
page->ReadColumnPage(fi);
|
||||
return true;
|
||||
}
|
||||
|
||||
std::size_t Write(GHistIndexMatrix const& page, common::AlignedFileWriteStream* fo) override {
|
||||
CHECK_NE(page.index.Size(), 0) << "Empty page is not supported.";
|
||||
std::size_t bytes = 0;
|
||||
size_t Write(GHistIndexMatrix const &page, dmlc::Stream *fo) override {
|
||||
size_t bytes = 0;
|
||||
bytes += WriteHistogramCuts(page.cut, fo);
|
||||
// indptr
|
||||
bytes += common::WriteVec(fo, page.row_ptr);
|
||||
|
||||
fo->Write(page.row_ptr);
|
||||
bytes += page.row_ptr.size() * sizeof(decltype(page.row_ptr)::value_type) +
|
||||
sizeof(uint64_t);
|
||||
// data
|
||||
// - bin type
|
||||
std::underlying_type_t<common::BinTypeSize> uint_bin_type = page.index.GetBinTypeSize();
|
||||
bytes += fo->Write(uint_bin_type);
|
||||
// - index buffer
|
||||
std::vector<std::uint8_t> data(page.index.begin(), page.index.end());
|
||||
bytes += fo->Write(static_cast<std::uint64_t>(data.size()));
|
||||
bytes += fo->Write(data.data(), data.size());
|
||||
|
||||
std::vector<uint8_t> data(page.index.begin(), page.index.end());
|
||||
fo->Write(data);
|
||||
bytes += data.size() * sizeof(decltype(data)::value_type) + sizeof(uint64_t);
|
||||
// bin type
|
||||
std::underlying_type_t<common::BinTypeSize> uint_bin_type =
|
||||
page.index.GetBinTypeSize();
|
||||
fo->Write(uint_bin_type);
|
||||
bytes += sizeof(page.index.GetBinTypeSize());
|
||||
// hit count
|
||||
bytes += common::WriteVec(fo, page.hit_count);
|
||||
fo->Write(page.hit_count);
|
||||
bytes +=
|
||||
page.hit_count.size() * sizeof(decltype(page.hit_count)::value_type) +
|
||||
sizeof(uint64_t);
|
||||
// max_bins, base row, is_dense
|
||||
bytes += fo->Write(page.max_numeric_bins_per_feat);
|
||||
bytes += fo->Write(page.base_rowid);
|
||||
bytes += fo->Write(page.IsDense());
|
||||
fo->Write(page.max_numeric_bins_per_feat);
|
||||
bytes += sizeof(page.max_numeric_bins_per_feat);
|
||||
fo->Write(page.base_rowid);
|
||||
bytes += sizeof(page.base_rowid);
|
||||
fo->Write(page.IsDense());
|
||||
bytes += sizeof(page.IsDense());
|
||||
|
||||
bytes += page.WriteColumnPage(fo);
|
||||
return bytes;
|
||||
@@ -100,4 +93,6 @@ DMLC_REGISTRY_FILE_TAG(gradient_index_format);
|
||||
XGBOOST_REGISTER_GHIST_INDEX_PAGE_FORMAT(raw)
|
||||
.describe("Raw GHistIndex binary data format.")
|
||||
.set_body([]() { return new GHistIndexRawFormat(); });
|
||||
} // namespace xgboost::data
|
||||
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
/**
|
||||
* Copyright 2021-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright 2021-2022 by XGBoost Contributors
|
||||
*/
|
||||
#include "gradient_index_page_source.h"
|
||||
|
||||
namespace xgboost::data {
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
void GradientIndexPageSource::Fetch() {
|
||||
if (!this->ReadCache()) {
|
||||
if (count_ != 0 && !sync_) {
|
||||
@@ -20,4 +21,5 @@ void GradientIndexPageSource::Fetch() {
|
||||
this->WriteCache();
|
||||
}
|
||||
}
|
||||
} // namespace xgboost::data
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -1,38 +1,36 @@
|
||||
/**
|
||||
* Copyright 2021-2023, XGBoost contributors
|
||||
/*!
|
||||
* Copyright 2021 XGBoost contributors
|
||||
*/
|
||||
#ifndef XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
|
||||
#define XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
|
||||
|
||||
#include <dmlc/io.h> // for Stream
|
||||
#include "../common/hist_util.h"
|
||||
|
||||
#include <cstddef> // for size_t
|
||||
|
||||
#include "../common/hist_util.h" // for HistogramCuts
|
||||
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
|
||||
#include "../common/ref_resource_view.h" // for WriteVec, ReadVec
|
||||
|
||||
namespace xgboost::data {
|
||||
inline bool ReadHistogramCuts(common::HistogramCuts *cuts, common::AlignedResourceReadStream *fi) {
|
||||
if (!common::ReadVec(fi, &cuts->cut_values_.HostVector())) {
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
inline bool ReadHistogramCuts(common::HistogramCuts *cuts, dmlc::SeekStream *fi) {
|
||||
if (!fi->Read(&cuts->cut_values_.HostVector())) {
|
||||
return false;
|
||||
}
|
||||
if (!common::ReadVec(fi, &cuts->cut_ptrs_.HostVector())) {
|
||||
if (!fi->Read(&cuts->cut_ptrs_.HostVector())) {
|
||||
return false;
|
||||
}
|
||||
if (!common::ReadVec(fi, &cuts->min_vals_.HostVector())) {
|
||||
if (!fi->Read(&cuts->min_vals_.HostVector())) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
inline std::size_t WriteHistogramCuts(common::HistogramCuts const &cuts,
|
||||
common::AlignedFileWriteStream *fo) {
|
||||
std::size_t bytes = 0;
|
||||
bytes += common::WriteVec(fo, cuts.Values());
|
||||
bytes += common::WriteVec(fo, cuts.Ptrs());
|
||||
bytes += common::WriteVec(fo, cuts.MinValues());
|
||||
inline size_t WriteHistogramCuts(common::HistogramCuts const &cuts, dmlc::Stream *fo) {
|
||||
size_t bytes = 0;
|
||||
fo->Write(cuts.cut_values_.ConstHostVector());
|
||||
bytes += cuts.cut_values_.ConstHostSpan().size_bytes() + sizeof(uint64_t);
|
||||
fo->Write(cuts.cut_ptrs_.ConstHostVector());
|
||||
bytes += cuts.cut_ptrs_.ConstHostSpan().size_bytes() + sizeof(uint64_t);
|
||||
fo->Write(cuts.min_vals_.ConstHostVector());
|
||||
bytes += cuts.min_vals_.ConstHostSpan().size_bytes() + sizeof(uint64_t);
|
||||
return bytes;
|
||||
}
|
||||
} // namespace xgboost::data
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
|
||||
|
||||
@@ -240,9 +240,9 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p,
|
||||
* Generate gradient index.
|
||||
*/
|
||||
this->ghist_ = std::make_unique<GHistIndexMatrix>(Info(), std::move(cuts), p.max_bin);
|
||||
std::size_t rbegin = 0;
|
||||
std::size_t prev_sum = 0;
|
||||
std::size_t i = 0;
|
||||
size_t rbegin = 0;
|
||||
size_t prev_sum = 0;
|
||||
size_t i = 0;
|
||||
while (iter.Next()) {
|
||||
HostAdapterDispatch(proxy, [&](auto const& batch) {
|
||||
proxy->Info().num_nonzero_ = batch_nnz[i];
|
||||
|
||||
@@ -31,10 +31,10 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
|
||||
auto num_rows = [&]() {
|
||||
return cuda_impl::Dispatch(proxy, [](auto const& value) { return value.NumRows(); });
|
||||
return Dispatch(proxy, [](auto const& value) { return value.NumRows(); });
|
||||
};
|
||||
auto num_cols = [&]() {
|
||||
return cuda_impl::Dispatch(proxy, [](auto const& value) { return value.NumCols(); });
|
||||
return Dispatch(proxy, [](auto const& value) { return value.NumCols(); });
|
||||
};
|
||||
|
||||
size_t row_stride = 0;
|
||||
@@ -74,7 +74,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
get_device());
|
||||
auto* p_sketch = &sketch_containers.back();
|
||||
proxy->Info().weights_.SetDevice(get_device());
|
||||
cuda_impl::Dispatch(proxy, [&](auto const& value) {
|
||||
Dispatch(proxy, [&](auto const& value) {
|
||||
common::AdapterDeviceSketch(value, p.max_bin, proxy->Info(), missing, p_sketch);
|
||||
});
|
||||
}
|
||||
@@ -82,7 +82,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
accumulated_rows += batch_rows;
|
||||
dh::device_vector<size_t> row_counts(batch_rows + 1, 0);
|
||||
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
|
||||
row_stride = std::max(row_stride, cuda_impl::Dispatch(proxy, [=](auto const& value) {
|
||||
row_stride = std::max(row_stride, Dispatch(proxy, [=](auto const& value) {
|
||||
return GetRowCounts(value, row_counts_span, get_device(), missing);
|
||||
}));
|
||||
nnz += thrust::reduce(thrust::cuda::par(alloc), row_counts.begin(), row_counts.end());
|
||||
@@ -136,14 +136,14 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
|
||||
auto rows = num_rows();
|
||||
dh::device_vector<size_t> row_counts(rows + 1, 0);
|
||||
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
|
||||
cuda_impl::Dispatch(proxy, [=](auto const& value) {
|
||||
Dispatch(proxy, [=](auto const& value) {
|
||||
return GetRowCounts(value, row_counts_span, get_device(), missing);
|
||||
});
|
||||
auto is_dense = this->IsDense();
|
||||
|
||||
proxy->Info().feature_types.SetDevice(get_device());
|
||||
auto d_feature_types = proxy->Info().feature_types.ConstDeviceSpan();
|
||||
auto new_impl = cuda_impl::Dispatch(proxy, [&](auto const& value) {
|
||||
auto new_impl = Dispatch(proxy, [&](auto const& value) {
|
||||
return EllpackPageImpl(value, missing, get_device(), is_dense, row_counts_span,
|
||||
d_feature_types, row_stride, rows, cuts);
|
||||
});
|
||||
|
||||
@@ -1,13 +1,14 @@
|
||||
/**
|
||||
* Copyright 2021-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright 2021 by Contributors
|
||||
* \file proxy_dmatrix.cc
|
||||
*/
|
||||
|
||||
#include "proxy_dmatrix.h"
|
||||
|
||||
namespace xgboost::data {
|
||||
void DMatrixProxy::SetArrayData(StringView interface_str) {
|
||||
std::shared_ptr<ArrayAdapter> adapter{new ArrayAdapter{interface_str}};
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
void DMatrixProxy::SetArrayData(char const *c_interface) {
|
||||
std::shared_ptr<ArrayAdapter> adapter{new ArrayAdapter(StringView{c_interface})};
|
||||
this->batch_ = adapter;
|
||||
this->Info().num_col_ = adapter->NumColumns();
|
||||
this->Info().num_row_ = adapter->NumRows();
|
||||
@@ -24,36 +25,5 @@ void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices,
|
||||
this->Info().num_row_ = adapter->NumRows();
|
||||
this->ctx_.gpu_id = Context::kCpuId;
|
||||
}
|
||||
|
||||
namespace cuda_impl {
|
||||
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
|
||||
std::shared_ptr<DMatrixProxy> proxy, float missing);
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *, std::shared_ptr<DMatrixProxy>,
|
||||
float) {
|
||||
return nullptr;
|
||||
}
|
||||
#endif // XGBOOST_USE_CUDA
|
||||
} // namespace cuda_impl
|
||||
|
||||
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
|
||||
std::shared_ptr<DMatrixProxy> proxy,
|
||||
float missing) {
|
||||
bool type_error{false};
|
||||
std::shared_ptr<DMatrix> p_fmat{nullptr};
|
||||
if (proxy->Ctx()->IsCPU()) {
|
||||
p_fmat = data::HostAdapterDispatch<false>(
|
||||
proxy.get(),
|
||||
[&](auto const &adapter) {
|
||||
auto p_fmat =
|
||||
std::shared_ptr<DMatrix>(DMatrix::Create(adapter.get(), missing, ctx->Threads()));
|
||||
return p_fmat;
|
||||
},
|
||||
&type_error);
|
||||
} else {
|
||||
p_fmat = cuda_impl::CreateDMatrixFromProxy(ctx, proxy, missing);
|
||||
}
|
||||
|
||||
return p_fmat;
|
||||
}
|
||||
} // namespace xgboost::data
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
/**
|
||||
* Copyright 2020-2023, XGBoost contributors
|
||||
/*!
|
||||
* Copyright 2020-2022, XGBoost contributors
|
||||
*/
|
||||
#include "device_adapter.cuh"
|
||||
#include "proxy_dmatrix.cuh"
|
||||
#include "proxy_dmatrix.h"
|
||||
#include "device_adapter.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
namespace xgboost::data {
|
||||
void DMatrixProxy::FromCudaColumnar(StringView interface_str) {
|
||||
std::shared_ptr<data::CudfAdapter> adapter{new CudfAdapter{interface_str}};
|
||||
auto const& value = adapter->Value();
|
||||
@@ -30,15 +31,5 @@ void DMatrixProxy::FromCudaArray(StringView interface_str) {
|
||||
ctx_.gpu_id = dh::CurrentDevice();
|
||||
}
|
||||
}
|
||||
|
||||
namespace cuda_impl {
|
||||
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const* ctx,
|
||||
std::shared_ptr<DMatrixProxy> proxy,
|
||||
float missing) {
|
||||
return Dispatch<false>(proxy.get(), [&](auto const& adapter) {
|
||||
auto p_fmat = std::shared_ptr<DMatrix>{DMatrix::Create(adapter.get(), missing, ctx->Threads())};
|
||||
return p_fmat;
|
||||
});
|
||||
}
|
||||
} // namespace cuda_impl
|
||||
} // namespace xgboost::data
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -6,34 +6,19 @@
|
||||
#include "device_adapter.cuh"
|
||||
#include "proxy_dmatrix.h"
|
||||
|
||||
namespace xgboost::data::cuda_impl {
|
||||
template <bool get_value = true, typename Fn>
|
||||
namespace xgboost::data {
|
||||
template <typename Fn>
|
||||
decltype(auto) Dispatch(DMatrixProxy const* proxy, Fn fn) {
|
||||
if (proxy->Adapter().type() == typeid(std::shared_ptr<CupyAdapter>)) {
|
||||
if constexpr (get_value) {
|
||||
auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter())->Value();
|
||||
return fn(value);
|
||||
} else {
|
||||
auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter());
|
||||
return fn(value);
|
||||
}
|
||||
auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter())->Value();
|
||||
return fn(value);
|
||||
} else if (proxy->Adapter().type() == typeid(std::shared_ptr<CudfAdapter>)) {
|
||||
if constexpr (get_value) {
|
||||
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
|
||||
return fn(value);
|
||||
} else {
|
||||
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter());
|
||||
return fn(value);
|
||||
}
|
||||
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
|
||||
return fn(value);
|
||||
} else {
|
||||
LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
|
||||
if constexpr (get_value) {
|
||||
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
|
||||
return fn(value);
|
||||
} else {
|
||||
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter());
|
||||
return fn(value);
|
||||
}
|
||||
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
|
||||
return fn(value);
|
||||
}
|
||||
}
|
||||
} // namespace xgboost::data::cuda_impl
|
||||
} // namespace xgboost::data
|
||||
|
||||
@@ -62,7 +62,7 @@ class DMatrixProxy : public DMatrix {
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
}
|
||||
|
||||
void SetArrayData(StringView interface_str);
|
||||
void SetArrayData(char const* c_interface);
|
||||
void SetCSRData(char const* c_indptr, char const* c_indices, char const* c_values,
|
||||
bst_feature_t n_features, bool on_host);
|
||||
|
||||
@@ -114,62 +114,28 @@ inline DMatrixProxy* MakeProxy(DMatrixHandle proxy) {
|
||||
return typed;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Dispatch function call based on input type.
|
||||
*
|
||||
* @tparam get_value Whether the funciton Fn accept an adapter batch or the adapter itself.
|
||||
* @tparam Fn The type of the function to be dispatched.
|
||||
*
|
||||
* @param proxy The proxy object holding the reference to the input.
|
||||
* @param fn The function to be dispatched.
|
||||
* @param type_error[out] Set to ture if it's not null and the input data is not recognized by
|
||||
* the host.
|
||||
*
|
||||
* @return The return value of the function being dispatched.
|
||||
*/
|
||||
template <bool get_value = true, typename Fn>
|
||||
template <typename Fn>
|
||||
decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_error = nullptr) {
|
||||
if (proxy->Adapter().type() == typeid(std::shared_ptr<CSRArrayAdapter>)) {
|
||||
if constexpr (get_value) {
|
||||
auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter())->Value();
|
||||
return fn(value);
|
||||
} else {
|
||||
auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter());
|
||||
return fn(value);
|
||||
}
|
||||
auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter())->Value();
|
||||
if (type_error) {
|
||||
*type_error = false;
|
||||
}
|
||||
return fn(value);
|
||||
} else if (proxy->Adapter().type() == typeid(std::shared_ptr<ArrayAdapter>)) {
|
||||
if constexpr (get_value) {
|
||||
auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter())->Value();
|
||||
return fn(value);
|
||||
} else {
|
||||
auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter());
|
||||
return fn(value);
|
||||
}
|
||||
auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter())->Value();
|
||||
if (type_error) {
|
||||
*type_error = false;
|
||||
}
|
||||
return fn(value);
|
||||
} else {
|
||||
if (type_error) {
|
||||
*type_error = true;
|
||||
} else {
|
||||
LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
|
||||
}
|
||||
if constexpr (get_value) {
|
||||
return std::result_of_t<Fn(
|
||||
decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
|
||||
} else {
|
||||
return std::result_of_t<Fn(decltype(std::declval<std::shared_ptr<ArrayAdapter>>()))>();
|
||||
}
|
||||
return std::result_of_t<Fn(decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Create a `SimpleDMatrix` instance from a `DMatrixProxy`.
|
||||
*/
|
||||
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const* ctx,
|
||||
std::shared_ptr<DMatrixProxy> proxy, float missing);
|
||||
} // namespace xgboost::data
|
||||
#endif // XGBOOST_DATA_PROXY_DMATRIX_H_
|
||||
|
||||
@@ -21,7 +21,8 @@
|
||||
#include "xgboost/c_api.h"
|
||||
#include "xgboost/data.h"
|
||||
|
||||
namespace xgboost::data {
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
MetaInfo& SimpleDMatrix::Info() { return info_; }
|
||||
|
||||
const MetaInfo& SimpleDMatrix::Info() const { return info_; }
|
||||
@@ -96,10 +97,6 @@ BatchSet<SparsePage> SimpleDMatrix::GetRowBatches() {
|
||||
BatchSet<CSCPage> SimpleDMatrix::GetColumnBatches(Context const* ctx) {
|
||||
// column page doesn't exist, generate it
|
||||
if (!column_page_) {
|
||||
auto n = std::numeric_limits<decltype(Entry::index)>::max();
|
||||
if (this->sparse_page_->Size() > n) {
|
||||
error::MaxSampleSize(n);
|
||||
}
|
||||
column_page_.reset(new CSCPage(sparse_page_->GetTranspose(info_.num_col_, ctx->Threads())));
|
||||
}
|
||||
auto begin_iter = BatchIterator<CSCPage>(new SimpleBatchIteratorImpl<CSCPage>(column_page_));
|
||||
@@ -109,10 +106,6 @@ BatchSet<CSCPage> SimpleDMatrix::GetColumnBatches(Context const* ctx) {
|
||||
BatchSet<SortedCSCPage> SimpleDMatrix::GetSortedColumnBatches(Context const* ctx) {
|
||||
// Sorted column page doesn't exist, generate it
|
||||
if (!sorted_column_page_) {
|
||||
auto n = std::numeric_limits<decltype(Entry::index)>::max();
|
||||
if (this->sparse_page_->Size() > n) {
|
||||
error::MaxSampleSize(n);
|
||||
}
|
||||
sorted_column_page_.reset(
|
||||
new SortedCSCPage(sparse_page_->GetTranspose(info_.num_col_, ctx->Threads())));
|
||||
sorted_column_page_->SortRows(ctx->Threads());
|
||||
@@ -434,4 +427,5 @@ SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, i
|
||||
|
||||
fmat_ctx_ = ctx;
|
||||
}
|
||||
} // namespace xgboost::data
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -7,6 +7,9 @@
|
||||
#ifndef XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
|
||||
#define XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
|
||||
|
||||
#include <xgboost/data.h>
|
||||
#include <xgboost/logging.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
@@ -17,33 +20,35 @@
|
||||
#include "ellpack_page_source.h"
|
||||
#include "gradient_index_page_source.h"
|
||||
#include "sparse_page_source.h"
|
||||
#include "xgboost/data.h"
|
||||
#include "xgboost/logging.h"
|
||||
|
||||
namespace xgboost::data {
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
/**
|
||||
* \brief DMatrix used for external memory.
|
||||
*
|
||||
* The external memory is created for controlling memory usage by splitting up data into
|
||||
* multiple batches. However that doesn't mean we will actually process exactly 1 batch
|
||||
* at a time, which would be terribly slow considering that we have to loop through the
|
||||
* whole dataset for every tree split. So we use async to pre-fetch pages and let the
|
||||
* caller to decide how many batches it wants to process by returning data as a shared
|
||||
* pointer. The caller can use async function to process the data or just stage those
|
||||
* batches based on its use cases. These two optimizations might defeat the purpose of
|
||||
* splitting up dataset since if you stage all the batches then the memory usage might be
|
||||
* even worse than using a single batch. As a result, we must control how many batches can
|
||||
* be in memory at any given time.
|
||||
* multiple batches. However that doesn't mean we will actually process exact 1 batch at
|
||||
* a time, which would be terribly slow considering that we have to loop through the
|
||||
* whole dataset for every tree split. So we use async pre-fetch and let caller to decide
|
||||
* how many batches it wants to process by returning data as shared pointer. The caller
|
||||
* can use async function to process the data or just stage those batches, making the
|
||||
* decision is out of the scope for sparse page dmatrix. These 2 optimizations might
|
||||
* defeat the purpose of splitting up dataset since if you load all the batches then the
|
||||
* memory usage is even worse than using a single batch. Essentially we need to control
|
||||
* how many batches can be in memory at the same time.
|
||||
*
|
||||
* Right now the write to the cache is a sequential operation and is blocking. Reading
|
||||
* from cache on ther other hand, is async but with a hard coded limit of 3 pages as an
|
||||
* heuristic. So by sparse dmatrix itself there can be only 7 pages in main memory (might
|
||||
* be of different types) at the same time: 1 page pending for write, 3 pre-fetched sparse
|
||||
* pages, 3 pre-fetched dependent pages.
|
||||
* Right now the write to the cache is sequential operation and is blocking, reading from
|
||||
* cache is async but with a hard coded limit of 4 pages as an heuristic. So by sparse
|
||||
* dmatrix itself there can be only 9 pages in main memory (might be of different types)
|
||||
* at the same time: 1 page pending for write, 4 pre-fetched sparse pages, 4 pre-fetched
|
||||
* dependent pages. If the caller stops iteration at the middle and start again, then the
|
||||
* number of pages in memory can hit 16 due to pre-fetching, but this should be a bug in
|
||||
* caller's code (XGBoost doesn't discard a large portion of data at the end, there's not
|
||||
* sampling algo that samples only the first portion of data).
|
||||
*
|
||||
* Of course if the caller decides to retain some batches to perform parallel processing,
|
||||
* then we might load all pages in memory, which is also considered as a bug in caller's
|
||||
* code. So if the algo supports external memory, it must be careful that queue for async
|
||||
* code. So if the algo supports external memory, it must be careful that queue for async
|
||||
* call must have an upper limit.
|
||||
*
|
||||
* Another assumption we make is that the data must be immutable so caller should never
|
||||
@@ -96,7 +101,7 @@ class SparsePageDMatrix : public DMatrix {
|
||||
MetaInfo &Info() override;
|
||||
const MetaInfo &Info() const override;
|
||||
Context const *Ctx() const override { return &fmat_ctx_; }
|
||||
// The only DMatrix implementation that returns false.
|
||||
|
||||
bool SingleColBlock() const override { return false; }
|
||||
DMatrix *Slice(common::Span<int32_t const>) override {
|
||||
LOG(FATAL) << "Slicing DMatrix is not supported for external memory.";
|
||||
@@ -148,5 +153,6 @@ inline std::string MakeCache(SparsePageDMatrix *ptr, std::string format, std::st
|
||||
}
|
||||
return id;
|
||||
}
|
||||
} // namespace xgboost::data
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
|
||||
|
||||
@@ -1,57 +1,59 @@
|
||||
/**
|
||||
* Copyright 2015-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright (c) 2015-2021 by Contributors
|
||||
* \file sparse_page_raw_format.cc
|
||||
* Raw binary format of sparse page.
|
||||
*/
|
||||
#include <xgboost/data.h>
|
||||
#include <dmlc/registry.h>
|
||||
|
||||
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
|
||||
#include "../common/ref_resource_view.h" // for WriteVec
|
||||
#include "./sparse_page_writer.h"
|
||||
#include "xgboost/data.h"
|
||||
#include "xgboost/logging.h"
|
||||
#include "./sparse_page_writer.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
namespace xgboost::data {
|
||||
DMLC_REGISTRY_FILE_TAG(sparse_page_raw_format);
|
||||
|
||||
template <typename T>
|
||||
template<typename T>
|
||||
class SparsePageRawFormat : public SparsePageFormat<T> {
|
||||
public:
|
||||
bool Read(T* page, common::AlignedResourceReadStream* fi) override {
|
||||
bool Read(T* page, dmlc::SeekStream* fi) override {
|
||||
auto& offset_vec = page->offset.HostVector();
|
||||
if (!common::ReadVec(fi, &offset_vec)) {
|
||||
if (!fi->Read(&offset_vec)) {
|
||||
return false;
|
||||
}
|
||||
auto& data_vec = page->data.HostVector();
|
||||
CHECK_NE(page->offset.Size(), 0U) << "Invalid SparsePage file";
|
||||
data_vec.resize(offset_vec.back());
|
||||
if (page->data.Size() != 0) {
|
||||
if (!common::ReadVec(fi, &data_vec)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (!fi->Read(&page->base_rowid, sizeof(page->base_rowid))) {
|
||||
return false;
|
||||
size_t n_bytes = fi->Read(dmlc::BeginPtr(data_vec),
|
||||
(page->data).Size() * sizeof(Entry));
|
||||
CHECK_EQ(n_bytes, (page->data).Size() * sizeof(Entry))
|
||||
<< "Invalid SparsePage file";
|
||||
}
|
||||
fi->Read(&page->base_rowid, sizeof(page->base_rowid));
|
||||
return true;
|
||||
}
|
||||
|
||||
std::size_t Write(const T& page, common::AlignedFileWriteStream* fo) override {
|
||||
size_t Write(const T& page, dmlc::Stream* fo) override {
|
||||
const auto& offset_vec = page.offset.HostVector();
|
||||
const auto& data_vec = page.data.HostVector();
|
||||
CHECK(page.offset.Size() != 0 && offset_vec[0] == 0);
|
||||
CHECK_EQ(offset_vec.back(), page.data.Size());
|
||||
|
||||
std::size_t bytes{0};
|
||||
bytes += common::WriteVec(fo, offset_vec);
|
||||
fo->Write(offset_vec);
|
||||
auto bytes = page.MemCostBytes();
|
||||
bytes += sizeof(uint64_t);
|
||||
if (page.data.Size() != 0) {
|
||||
bytes += common::WriteVec(fo, data_vec);
|
||||
fo->Write(dmlc::BeginPtr(data_vec), page.data.Size() * sizeof(Entry));
|
||||
}
|
||||
bytes += fo->Write(&page.base_rowid, sizeof(page.base_rowid));
|
||||
fo->Write(&page.base_rowid, sizeof(page.base_rowid));
|
||||
bytes += sizeof(page.base_rowid);
|
||||
return bytes;
|
||||
}
|
||||
|
||||
private:
|
||||
/*! \brief external memory column offset */
|
||||
std::vector<size_t> disk_offset_;
|
||||
};
|
||||
|
||||
XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(raw)
|
||||
@@ -72,4 +74,5 @@ XGBOOST_REGISTER_SORTED_CSC_PAGE_FORMAT(raw)
|
||||
return new SparsePageRawFormat<SortedCSCPage>();
|
||||
});
|
||||
|
||||
} // namespace xgboost::data
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -1,31 +1,33 @@
|
||||
/**
|
||||
* Copyright 2021-2023, XGBoost contributors
|
||||
/*!
|
||||
* Copyright 2021 XGBoost contributors
|
||||
*/
|
||||
#include "../common/device_helpers.cuh" // for CurrentDevice
|
||||
#include "proxy_dmatrix.cuh" // for Dispatch, DMatrixProxy
|
||||
#include "simple_dmatrix.cuh" // for CopyToSparsePage
|
||||
#include "sparse_page_source.h"
|
||||
#include "xgboost/data.h" // for SparsePage
|
||||
#include "proxy_dmatrix.cuh"
|
||||
#include "simple_dmatrix.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
namespace xgboost::data {
|
||||
namespace detail {
|
||||
std::size_t NSamplesDevice(DMatrixProxy *proxy) {
|
||||
return cuda_impl::Dispatch(proxy, [](auto const &value) { return value.NumRows(); });
|
||||
return Dispatch(proxy, [](auto const &value) { return value.NumRows(); });
|
||||
}
|
||||
|
||||
std::size_t NFeaturesDevice(DMatrixProxy *proxy) {
|
||||
return cuda_impl::Dispatch(proxy, [](auto const &value) { return value.NumCols(); });
|
||||
return Dispatch(proxy, [](auto const &value) { return value.NumCols(); });
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) {
|
||||
void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page) {
|
||||
auto device = proxy->DeviceIdx();
|
||||
if (device < 0) {
|
||||
device = dh::CurrentDevice();
|
||||
}
|
||||
CHECK_GE(device, 0);
|
||||
|
||||
cuda_impl::Dispatch(proxy,
|
||||
[&](auto const &value) { CopyToSparsePage(value, device, missing, page); });
|
||||
Dispatch(proxy, [&](auto const &value) {
|
||||
CopyToSparsePage(value, device, missing, page);
|
||||
});
|
||||
}
|
||||
} // namespace xgboost::data
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -1,49 +1,45 @@
|
||||
/**
|
||||
* Copyright 2014-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright 2014-2022 by XGBoost Contributors
|
||||
* \file sparse_page_source.h
|
||||
*/
|
||||
#ifndef XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
|
||||
#define XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
|
||||
|
||||
#include <algorithm> // for min
|
||||
#include <atomic> // for atomic
|
||||
#include <future> // for async
|
||||
#include <algorithm> // std::min
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <future>
|
||||
#include <thread>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <mutex> // for mutex
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <utility> // for pair, move
|
||||
#include <vector>
|
||||
|
||||
#include "../common/common.h"
|
||||
#include "../common/io.h" // for PrivateMmapConstStream
|
||||
#include "../common/timer.h" // for Monitor, Timer
|
||||
#include "adapter.h"
|
||||
#include "proxy_dmatrix.h" // for DMatrixProxy
|
||||
#include "sparse_page_writer.h" // for SparsePageFormat
|
||||
#include "xgboost/base.h"
|
||||
#include "xgboost/data.h"
|
||||
|
||||
namespace xgboost::data {
|
||||
#include "adapter.h"
|
||||
#include "sparse_page_writer.h"
|
||||
#include "proxy_dmatrix.h"
|
||||
|
||||
#include "../common/common.h"
|
||||
#include "../common/timer.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
inline void TryDeleteCacheFile(const std::string& file) {
|
||||
if (std::remove(file.c_str()) != 0) {
|
||||
// Don't throw, this is called in a destructor.
|
||||
LOG(WARNING) << "Couldn't remove external memory cache file " << file
|
||||
<< "; you may want to remove it manually";
|
||||
<< "; you may want to remove it manually";
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Information about the cache including path and page offsets.
|
||||
*/
|
||||
struct Cache {
|
||||
// whether the write to the cache is complete
|
||||
bool written;
|
||||
std::string name;
|
||||
std::string format;
|
||||
// offset into binary cache file.
|
||||
std::vector<std::uint64_t> offset;
|
||||
std::vector<size_t> offset;
|
||||
|
||||
Cache(bool w, std::string n, std::string fmt)
|
||||
: written{w}, name{std::move(n)}, format{std::move(fmt)} {
|
||||
@@ -55,24 +51,11 @@ struct Cache {
|
||||
return name + format;
|
||||
}
|
||||
|
||||
[[nodiscard]] std::string ShardName() const {
|
||||
std::string ShardName() {
|
||||
return ShardName(this->name, this->format);
|
||||
}
|
||||
/**
|
||||
* @brief Record a page with size of n_bytes.
|
||||
*/
|
||||
void Push(std::size_t n_bytes) { offset.push_back(n_bytes); }
|
||||
/**
|
||||
* @brief Returns the view start and length for the i^th page.
|
||||
*/
|
||||
[[nodiscard]] auto View(std::size_t i) const {
|
||||
std::uint64_t off = offset.at(i);
|
||||
std::uint64_t len = offset.at(i + 1) - offset[i];
|
||||
return std::pair{off, len};
|
||||
}
|
||||
/**
|
||||
* @brief Call this once the write for the cache is complete.
|
||||
*/
|
||||
|
||||
// The write is completed.
|
||||
void Commit() {
|
||||
if (!written) {
|
||||
std::partial_sum(offset.begin(), offset.end(), offset.begin());
|
||||
@@ -81,7 +64,7 @@ struct Cache {
|
||||
}
|
||||
};
|
||||
|
||||
// Prevents multi-threaded call to `GetBatches`.
|
||||
// Prevents multi-threaded call.
|
||||
class TryLockGuard {
|
||||
std::mutex& lock_;
|
||||
|
||||
@@ -94,128 +77,74 @@ class TryLockGuard {
|
||||
}
|
||||
};
|
||||
|
||||
// Similar to `dmlc::OMPException`, but doesn't need the threads to be joined before rethrow
|
||||
class ExceHandler {
|
||||
std::mutex mutex_;
|
||||
std::atomic<bool> flag_{false};
|
||||
std::exception_ptr curr_exce_{nullptr};
|
||||
|
||||
public:
|
||||
template <typename Fn>
|
||||
decltype(auto) Run(Fn&& fn) noexcept(true) {
|
||||
try {
|
||||
return fn();
|
||||
} catch (dmlc::Error const& e) {
|
||||
std::lock_guard<std::mutex> guard{mutex_};
|
||||
if (!curr_exce_) {
|
||||
curr_exce_ = std::current_exception();
|
||||
}
|
||||
flag_ = true;
|
||||
} catch (std::exception const& e) {
|
||||
std::lock_guard<std::mutex> guard{mutex_};
|
||||
if (!curr_exce_) {
|
||||
curr_exce_ = std::current_exception();
|
||||
}
|
||||
flag_ = true;
|
||||
} catch (...) {
|
||||
std::lock_guard<std::mutex> guard{mutex_};
|
||||
if (!curr_exce_) {
|
||||
curr_exce_ = std::current_exception();
|
||||
}
|
||||
flag_ = true;
|
||||
}
|
||||
return std::invoke_result_t<Fn>();
|
||||
}
|
||||
|
||||
void Rethrow() noexcept(false) {
|
||||
if (flag_) {
|
||||
CHECK(curr_exce_);
|
||||
std::rethrow_exception(curr_exce_);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Base class for all page sources. Handles fetching, writing, and iteration.
|
||||
*/
|
||||
template <typename S>
|
||||
class SparsePageSourceImpl : public BatchIteratorImpl<S> {
|
||||
protected:
|
||||
// Prevents calling this iterator from multiple places(or threads).
|
||||
std::mutex single_threaded_;
|
||||
// The current page.
|
||||
|
||||
std::shared_ptr<S> page_;
|
||||
|
||||
bool at_end_ {false};
|
||||
float missing_;
|
||||
std::int32_t nthreads_;
|
||||
int nthreads_;
|
||||
bst_feature_t n_features_;
|
||||
// Index to the current page.
|
||||
std::uint32_t count_{0};
|
||||
// Total number of batches.
|
||||
std::uint32_t n_batches_{0};
|
||||
|
||||
uint32_t count_{0};
|
||||
|
||||
uint32_t n_batches_ {0};
|
||||
|
||||
std::shared_ptr<Cache> cache_info_;
|
||||
std::unique_ptr<dmlc::Stream> fo_;
|
||||
|
||||
using Ring = std::vector<std::future<std::shared_ptr<S>>>;
|
||||
// A ring storing futures to data. Since the DMatrix iterator is forward only, so we
|
||||
// can pre-fetch data in a ring.
|
||||
std::unique_ptr<Ring> ring_{new Ring};
|
||||
// Catching exception in pre-fetch threads to prevent segfault. Not always work though,
|
||||
// OOM error can be delayed due to lazy commit. On the bright side, if mmap is used then
|
||||
// OOM error should be rare.
|
||||
ExceHandler exce_;
|
||||
common::Monitor monitor_;
|
||||
|
||||
bool ReadCache() {
|
||||
CHECK(!at_end_);
|
||||
if (!cache_info_->written) {
|
||||
return false;
|
||||
}
|
||||
if (ring_->empty()) {
|
||||
if (fo_) {
|
||||
fo_.reset(); // flush the data to disk.
|
||||
ring_->resize(n_batches_);
|
||||
}
|
||||
// An heuristic for number of pre-fetched batches. We can make it part of BatchParam
|
||||
// to let user adjust number of pre-fetched batches when needed.
|
||||
uint32_t constexpr kPreFetch = 3;
|
||||
uint32_t constexpr kPreFetch = 4;
|
||||
|
||||
size_t n_prefetch_batches = std::min(kPreFetch, n_batches_);
|
||||
CHECK_GT(n_prefetch_batches, 0) << "total batches:" << n_batches_;
|
||||
std::size_t fetch_it = count_;
|
||||
size_t fetch_it = count_;
|
||||
|
||||
exce_.Rethrow();
|
||||
|
||||
for (std::size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
|
||||
for (size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
|
||||
fetch_it %= n_batches_; // ring
|
||||
if (ring_->at(fetch_it).valid()) {
|
||||
continue;
|
||||
}
|
||||
auto const* self = this; // make sure it's const
|
||||
auto const *self = this; // make sure it's const
|
||||
CHECK_LT(fetch_it, cache_info_->offset.size());
|
||||
ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self, this]() {
|
||||
ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self]() {
|
||||
common::Timer timer;
|
||||
timer.Start();
|
||||
std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
|
||||
auto n = self->cache_info_->ShardName();
|
||||
size_t offset = self->cache_info_->offset.at(fetch_it);
|
||||
std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(n.c_str())};
|
||||
fi->Seek(offset);
|
||||
CHECK_EQ(fi->Tell(), offset);
|
||||
auto page = std::make_shared<S>();
|
||||
this->exce_.Run([&] {
|
||||
std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
|
||||
auto name = self->cache_info_->ShardName();
|
||||
auto [offset, length] = self->cache_info_->View(fetch_it);
|
||||
auto fi = std::make_unique<common::PrivateMmapConstStream>(name, offset, length);
|
||||
CHECK(fmt->Read(page.get(), fi.get()));
|
||||
});
|
||||
CHECK(fmt->Read(page.get(), fi.get()));
|
||||
LOG(INFO) << "Read a page in " << timer.ElapsedSeconds() << " seconds.";
|
||||
return page;
|
||||
});
|
||||
}
|
||||
|
||||
CHECK_EQ(std::count_if(ring_->cbegin(), ring_->cend(), [](auto const& f) { return f.valid(); }),
|
||||
n_prefetch_batches)
|
||||
<< "Sparse DMatrix assumes forward iteration.";
|
||||
|
||||
monitor_.Start("Wait");
|
||||
page_ = (*ring_)[count_].get();
|
||||
CHECK(!(*ring_)[count_].valid());
|
||||
monitor_.Stop("Wait");
|
||||
|
||||
exce_.Rethrow();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -224,41 +153,29 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
|
||||
common::Timer timer;
|
||||
timer.Start();
|
||||
std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
|
||||
|
||||
auto name = cache_info_->ShardName();
|
||||
std::unique_ptr<common::AlignedFileWriteStream> fo;
|
||||
if (this->Iter() == 0) {
|
||||
fo = std::make_unique<common::AlignedFileWriteStream>(StringView{name}, "wb");
|
||||
} else {
|
||||
fo = std::make_unique<common::AlignedFileWriteStream>(StringView{name}, "ab");
|
||||
if (!fo_) {
|
||||
auto n = cache_info_->ShardName();
|
||||
fo_.reset(dmlc::Stream::Create(n.c_str(), "w"));
|
||||
}
|
||||
|
||||
auto bytes = fmt->Write(*page_, fo.get());
|
||||
|
||||
auto bytes = fmt->Write(*page_, fo_.get());
|
||||
timer.Stop();
|
||||
// Not entirely accurate, the kernels doesn't have to flush the data.
|
||||
|
||||
LOG(INFO) << static_cast<double>(bytes) / 1024.0 / 1024.0 << " MB written in "
|
||||
<< timer.ElapsedSeconds() << " seconds.";
|
||||
cache_info_->Push(bytes);
|
||||
cache_info_->offset.push_back(bytes);
|
||||
}
|
||||
|
||||
virtual void Fetch() = 0;
|
||||
|
||||
public:
|
||||
SparsePageSourceImpl(float missing, int nthreads, bst_feature_t n_features, uint32_t n_batches,
|
||||
std::shared_ptr<Cache> cache)
|
||||
: missing_{missing},
|
||||
nthreads_{nthreads},
|
||||
n_features_{n_features},
|
||||
n_batches_{n_batches},
|
||||
cache_info_{std::move(cache)} {
|
||||
monitor_.Init(typeid(S).name()); // not pretty, but works for basic profiling
|
||||
}
|
||||
SparsePageSourceImpl(float missing, int nthreads, bst_feature_t n_features,
|
||||
uint32_t n_batches, std::shared_ptr<Cache> cache)
|
||||
: missing_{missing}, nthreads_{nthreads}, n_features_{n_features},
|
||||
n_batches_{n_batches}, cache_info_{std::move(cache)} {}
|
||||
|
||||
SparsePageSourceImpl(SparsePageSourceImpl const &that) = delete;
|
||||
|
||||
~SparsePageSourceImpl() override {
|
||||
// Don't orphan the threads.
|
||||
for (auto& fu : *ring_) {
|
||||
if (fu.valid()) {
|
||||
fu.get();
|
||||
@@ -266,18 +183,18 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
|
||||
}
|
||||
}
|
||||
|
||||
[[nodiscard]] uint32_t Iter() const { return count_; }
|
||||
uint32_t Iter() const { return count_; }
|
||||
|
||||
const S &operator*() const override {
|
||||
CHECK(page_);
|
||||
return *page_;
|
||||
}
|
||||
|
||||
[[nodiscard]] std::shared_ptr<S const> Page() const override {
|
||||
std::shared_ptr<S const> Page() const override {
|
||||
return page_;
|
||||
}
|
||||
|
||||
[[nodiscard]] bool AtEnd() const override {
|
||||
bool AtEnd() const override {
|
||||
return at_end_;
|
||||
}
|
||||
|
||||
@@ -285,23 +202,20 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
|
||||
TryLockGuard guard{single_threaded_};
|
||||
at_end_ = false;
|
||||
count_ = 0;
|
||||
// Pre-fetch for the next round of iterations.
|
||||
this->Fetch();
|
||||
}
|
||||
};
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
// Push data from CUDA.
|
||||
void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page);
|
||||
#else
|
||||
inline void DevicePush(DMatrixProxy*, float, SparsePage*) { common::AssertGPUSupport(); }
|
||||
#endif
|
||||
|
||||
class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
|
||||
// This is the source from the user.
|
||||
DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext> iter_;
|
||||
DMatrixProxy* proxy_;
|
||||
std::size_t base_row_id_{0};
|
||||
size_t base_row_id_ {0};
|
||||
|
||||
void Fetch() final {
|
||||
page_ = std::make_shared<SparsePage>();
|
||||
@@ -330,7 +244,7 @@ class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
|
||||
iter_{iter}, proxy_{proxy} {
|
||||
if (!cache_info_->written) {
|
||||
iter_.Reset();
|
||||
CHECK(iter_.Next()) << "Must have at least 1 batch.";
|
||||
CHECK_EQ(iter_.Next(), 1) << "Must have at least 1 batch.";
|
||||
}
|
||||
this->Fetch();
|
||||
}
|
||||
@@ -345,7 +259,6 @@ class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
|
||||
}
|
||||
|
||||
if (at_end_) {
|
||||
CHECK_EQ(cache_info_->offset.size(), n_batches_ + 1);
|
||||
cache_info_->Commit();
|
||||
if (n_batches_ != 0) {
|
||||
CHECK_EQ(count_, n_batches_);
|
||||
@@ -458,5 +371,6 @@ class SortedCSCPageSource : public PageSourceIncMixIn<SortedCSCPage> {
|
||||
this->Fetch();
|
||||
}
|
||||
};
|
||||
} // namespace xgboost::data
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
|
||||
|
||||
@@ -1,44 +1,52 @@
|
||||
/**
|
||||
* Copyright 2014-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright (c) 2014-2019 by Contributors
|
||||
* \file sparse_page_writer.h
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#ifndef XGBOOST_DATA_SPARSE_PAGE_WRITER_H_
|
||||
#define XGBOOST_DATA_SPARSE_PAGE_WRITER_H_
|
||||
|
||||
#include <functional> // for function
|
||||
#include <string> // for string
|
||||
#include <xgboost/data.h>
|
||||
#include <dmlc/io.h>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <memory>
|
||||
#include <functional>
|
||||
|
||||
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
|
||||
#include "dmlc/io.h" // for Stream
|
||||
#include "dmlc/registry.h" // for Registry, FunctionRegEntryBase
|
||||
#include "xgboost/data.h" // for SparsePage,CSCPage,SortedCSCPage,EllpackPage ...
|
||||
#if DMLC_ENABLE_STD_THREAD
|
||||
#include <dmlc/concurrency.h>
|
||||
#include <thread>
|
||||
#endif // DMLC_ENABLE_STD_THREAD
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
namespace xgboost::data {
|
||||
template<typename T>
|
||||
struct SparsePageFormatReg;
|
||||
|
||||
/**
|
||||
* @brief Format specification of various data formats like SparsePage.
|
||||
/*!
|
||||
* \brief Format specification of SparsePage.
|
||||
*/
|
||||
template <typename T>
|
||||
template<typename T>
|
||||
class SparsePageFormat {
|
||||
public:
|
||||
/*! \brief virtual destructor */
|
||||
virtual ~SparsePageFormat() = default;
|
||||
/**
|
||||
* @brief Load all the segments into page, advance fi to end of the block.
|
||||
*
|
||||
* @param page The data to read page into.
|
||||
* @param fi the input stream of the file
|
||||
* @return true of the loading as successful, false if end of file was reached
|
||||
/*!
|
||||
* \brief Load all the segments into page, advance fi to end of the block.
|
||||
* \param page The data to read page into.
|
||||
* \param fi the input stream of the file
|
||||
* \return true of the loading as successful, false if end of file was reached
|
||||
*/
|
||||
virtual bool Read(T* page, common::AlignedResourceReadStream* fi) = 0;
|
||||
/**
|
||||
* @brief save the data to fo, when a page was written.
|
||||
*
|
||||
* @param fo output stream
|
||||
virtual bool Read(T* page, dmlc::SeekStream* fi) = 0;
|
||||
/*!
|
||||
* \brief save the data to fo, when a page was written.
|
||||
* \param fo output stream
|
||||
*/
|
||||
virtual size_t Write(const T& page, common::AlignedFileWriteStream* fo) = 0;
|
||||
virtual size_t Write(const T& page, dmlc::Stream* fo) = 0;
|
||||
};
|
||||
|
||||
/*!
|
||||
@@ -97,5 +105,6 @@ struct SparsePageFormatReg
|
||||
DMLC_REGISTRY_REGISTER(SparsePageFormatReg<GHistIndexMatrix>, \
|
||||
GHistIndexPageFmt, Name)
|
||||
|
||||
} // namespace xgboost::data
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_DATA_SPARSE_PAGE_WRITER_H_
|
||||
|
||||
@@ -172,7 +172,8 @@ class GBLinear : public GradientBooster {
|
||||
}
|
||||
|
||||
void PredictContribution(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_contribs,
|
||||
bst_layer_t layer_begin, bst_layer_t /*layer_end*/, bool) override {
|
||||
uint32_t layer_begin, uint32_t /*layer_end*/, bool, int,
|
||||
unsigned) override {
|
||||
model_.LazyInitModel();
|
||||
LinearCheckLayer(layer_begin);
|
||||
auto base_margin = p_fmat->Info().base_margin_.View(Context::kCpuId);
|
||||
@@ -209,8 +210,8 @@ class GBLinear : public GradientBooster {
|
||||
}
|
||||
}
|
||||
|
||||
void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
|
||||
bst_layer_t layer_begin, bst_layer_t /*layer_end*/,
|
||||
void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_contribs,
|
||||
unsigned layer_begin, unsigned /*layer_end*/,
|
||||
bool) override {
|
||||
LinearCheckLayer(layer_begin);
|
||||
std::vector<bst_float>& contribs = out_contribs->HostVector();
|
||||
|
||||
@@ -18,11 +18,9 @@
|
||||
#include <vector>
|
||||
|
||||
#include "../common/common.h"
|
||||
#include "../common/error_msg.h" // for UnknownDevice, WarnOldSerialization, InplacePredictProxy
|
||||
#include "../common/random.h"
|
||||
#include "../common/threading_utils.h"
|
||||
#include "../common/timer.h"
|
||||
#include "../data/proxy_dmatrix.h" // for DMatrixProxy, HostAdapterDispatch
|
||||
#include "gbtree_model.h"
|
||||
#include "xgboost/base.h"
|
||||
#include "xgboost/data.h"
|
||||
@@ -41,6 +39,7 @@ namespace xgboost::gbm {
|
||||
DMLC_REGISTRY_FILE_TAG(gbtree);
|
||||
|
||||
void GBTree::Configure(Args const& cfg) {
|
||||
this->cfg_ = cfg;
|
||||
std::string updater_seq = tparam_.updater_seq;
|
||||
tparam_.UpdateAllowUnknown(cfg);
|
||||
tree_param_.UpdateAllowUnknown(cfg);
|
||||
@@ -60,8 +59,9 @@ void GBTree::Configure(Args const& cfg) {
|
||||
cpu_predictor_->Configure(cfg);
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
auto n_gpus = common::AllVisibleGPUs();
|
||||
if (!gpu_predictor_) {
|
||||
gpu_predictor_ = std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", this->ctx_));
|
||||
if (!gpu_predictor_ && n_gpus != 0) {
|
||||
gpu_predictor_ = std::unique_ptr<Predictor>(
|
||||
Predictor::Create("gpu_predictor", this->ctx_));
|
||||
}
|
||||
if (n_gpus != 0) {
|
||||
gpu_predictor_->Configure(cfg);
|
||||
@@ -78,9 +78,10 @@ void GBTree::Configure(Args const& cfg) {
|
||||
|
||||
monitor_.Init("GBTree");
|
||||
|
||||
specified_updater_ = std::any_of(
|
||||
cfg.cbegin(), cfg.cend(),
|
||||
[](std::pair<std::string, std::string> const& arg) { return arg.first == "updater"; });
|
||||
specified_updater_ = std::any_of(cfg.cbegin(), cfg.cend(),
|
||||
[](std::pair<std::string, std::string> const& arg) {
|
||||
return arg.first == "updater";
|
||||
});
|
||||
|
||||
if (specified_updater_ && !showed_updater_warning_) {
|
||||
LOG(WARNING) << "DANGER AHEAD: You have manually specified `updater` "
|
||||
@@ -92,19 +93,12 @@ void GBTree::Configure(Args const& cfg) {
|
||||
showed_updater_warning_ = true;
|
||||
}
|
||||
|
||||
if (model_.learner_model_param->IsVectorLeaf()) {
|
||||
CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
|
||||
<< "Only the hist tree method is supported for building multi-target trees with vector "
|
||||
"leaf.";
|
||||
}
|
||||
LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
|
||||
this->ConfigureUpdaters();
|
||||
|
||||
if (updater_seq != tparam_.updater_seq) {
|
||||
updaters_.clear();
|
||||
this->InitUpdater(cfg);
|
||||
} else {
|
||||
for (auto& up : updaters_) {
|
||||
for (auto &up : updaters_) {
|
||||
up->Configure(cfg);
|
||||
}
|
||||
}
|
||||
@@ -112,6 +106,66 @@ void GBTree::Configure(Args const& cfg) {
|
||||
configured_ = true;
|
||||
}
|
||||
|
||||
// FIXME(trivialfis): This handles updaters. Because the choice of updaters depends on
|
||||
// whether external memory is used and how large is dataset. We can remove the dependency
|
||||
// on DMatrix once `hist` tree method can handle external memory so that we can make it
|
||||
// default.
|
||||
void GBTree::ConfigureWithKnownData(Args const& cfg, DMatrix* fmat) {
|
||||
CHECK(this->configured_);
|
||||
std::string updater_seq = tparam_.updater_seq;
|
||||
CHECK(tparam_.GetInitialised());
|
||||
|
||||
tparam_.UpdateAllowUnknown(cfg);
|
||||
|
||||
this->PerformTreeMethodHeuristic(fmat);
|
||||
this->ConfigureUpdaters();
|
||||
|
||||
// initialize the updaters only when needed.
|
||||
if (updater_seq != tparam_.updater_seq) {
|
||||
LOG(DEBUG) << "Using updaters: " << tparam_.updater_seq;
|
||||
this->updaters_.clear();
|
||||
this->InitUpdater(cfg);
|
||||
}
|
||||
}
|
||||
|
||||
void GBTree::PerformTreeMethodHeuristic(DMatrix* fmat) {
|
||||
if (specified_updater_) {
|
||||
// This method is disabled when `updater` parameter is explicitly
|
||||
// set, since only experts are expected to do so.
|
||||
return;
|
||||
}
|
||||
if (model_.learner_model_param->IsVectorLeaf()) {
|
||||
CHECK(tparam_.tree_method == TreeMethod::kHist)
|
||||
<< "Only the hist tree method is supported for building multi-target trees with vector "
|
||||
"leaf.";
|
||||
}
|
||||
|
||||
// tparam_ is set before calling this function.
|
||||
if (tparam_.tree_method != TreeMethod::kAuto) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (collective::IsDistributed()) {
|
||||
LOG(INFO) << "Tree method is automatically selected to be 'approx' "
|
||||
"for distributed training.";
|
||||
tparam_.tree_method = TreeMethod::kApprox;
|
||||
} else if (!fmat->SingleColBlock()) {
|
||||
LOG(INFO) << "Tree method is automatically set to 'approx' "
|
||||
"since external-memory data matrix is used.";
|
||||
tparam_.tree_method = TreeMethod::kApprox;
|
||||
} else if (fmat->Info().num_row_ >= (4UL << 20UL)) {
|
||||
/* Choose tree_method='approx' automatically for large data matrix */
|
||||
LOG(INFO) << "Tree method is automatically selected to be "
|
||||
"'approx' for faster speed. To use old behavior "
|
||||
"(exact greedy algorithm on single machine), "
|
||||
"set tree_method to 'exact'.";
|
||||
tparam_.tree_method = TreeMethod::kApprox;
|
||||
} else {
|
||||
tparam_.tree_method = TreeMethod::kExact;
|
||||
}
|
||||
LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
|
||||
}
|
||||
|
||||
void GBTree::ConfigureUpdaters() {
|
||||
if (specified_updater_) {
|
||||
return;
|
||||
@@ -119,25 +173,31 @@ void GBTree::ConfigureUpdaters() {
|
||||
// `updater` parameter was manually specified
|
||||
/* Choose updaters according to tree_method parameters */
|
||||
switch (tparam_.tree_method) {
|
||||
case TreeMethod::kAuto: // Use hist as default in 2.0
|
||||
case TreeMethod::kHist: {
|
||||
tparam_.updater_seq = "grow_quantile_histmaker";
|
||||
case TreeMethod::kAuto:
|
||||
// Use heuristic to choose between 'exact' and 'approx' This
|
||||
// choice is carried out in PerformTreeMethodHeuristic() before
|
||||
// calling this function.
|
||||
break;
|
||||
}
|
||||
case TreeMethod::kApprox:
|
||||
tparam_.updater_seq = "grow_histmaker";
|
||||
break;
|
||||
case TreeMethod::kExact:
|
||||
tparam_.updater_seq = "grow_colmaker,prune";
|
||||
break;
|
||||
case TreeMethod::kHist: {
|
||||
LOG(INFO) << "Tree method is selected to be 'hist', which uses a single updater "
|
||||
"grow_quantile_histmaker.";
|
||||
tparam_.updater_seq = "grow_quantile_histmaker";
|
||||
break;
|
||||
}
|
||||
case TreeMethod::kGPUHist: {
|
||||
common::AssertGPUSupport();
|
||||
tparam_.updater_seq = "grow_gpu_hist";
|
||||
break;
|
||||
}
|
||||
default:
|
||||
LOG(FATAL) << "Unknown tree_method (" << static_cast<int>(tparam_.tree_method)
|
||||
<< ") detected";
|
||||
LOG(FATAL) << "Unknown tree_method ("
|
||||
<< static_cast<int>(tparam_.tree_method) << ") detected";
|
||||
}
|
||||
}
|
||||
|
||||
@@ -193,6 +253,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
|
||||
PredictionCacheEntry* predt, ObjFunction const* obj) {
|
||||
TreesOneIter new_trees;
|
||||
bst_target_t const n_groups = model_.learner_model_param->OutputLength();
|
||||
ConfigureWithKnownData(this->cfg_, p_fmat);
|
||||
monitor_.Start("BoostNewTrees");
|
||||
|
||||
// Weird case that tree method is cpu-based but gpu_id is set. Ideally we should let
|
||||
@@ -375,7 +436,12 @@ void GBTree::LoadConfig(Json const& in) {
|
||||
// This would cause all trees to be pushed to trees_to_update
|
||||
// e.g. updating a model, then saving and loading it would result in an empty model
|
||||
tparam_.process_type = TreeProcessType::kDefault;
|
||||
std::int32_t const n_gpus = xgboost::common::AllVisibleGPUs();
|
||||
int32_t const n_gpus = xgboost::common::AllVisibleGPUs();
|
||||
if (n_gpus == 0 && tparam_.predictor == PredictorType::kGPUPredictor) {
|
||||
LOG(WARNING) << "Loading from a raw memory buffer on CPU only machine. "
|
||||
"Changing predictor to auto.";
|
||||
tparam_.UpdateAllowUnknown(Args{{"predictor", "auto"}});
|
||||
}
|
||||
|
||||
auto msg = StringView{
|
||||
R"(
|
||||
@@ -391,32 +457,19 @@ void GBTree::LoadConfig(Json const& in) {
|
||||
LOG(WARNING) << msg << " Changing `tree_method` to `hist`.";
|
||||
}
|
||||
|
||||
std::vector<Json> updater_seq;
|
||||
if (IsA<Object>(in["updater"])) {
|
||||
// before 2.0
|
||||
error::WarnOldSerialization();
|
||||
for (auto const& kv : get<Object const>(in["updater"])) {
|
||||
auto name = kv.first;
|
||||
auto config = kv.second;
|
||||
config["name"] = name;
|
||||
updater_seq.push_back(config);
|
||||
}
|
||||
} else {
|
||||
// after 2.0
|
||||
auto const& j_updaters = get<Array const>(in["updater"]);
|
||||
updater_seq = j_updaters;
|
||||
}
|
||||
|
||||
auto const& j_updaters = get<Object const>(in["updater"]);
|
||||
updaters_.clear();
|
||||
|
||||
for (auto const& config : updater_seq) {
|
||||
auto name = get<String>(config["name"]);
|
||||
for (auto const& kv : j_updaters) {
|
||||
auto name = kv.first;
|
||||
if (n_gpus == 0 && name == "grow_gpu_hist") {
|
||||
name = "grow_quantile_histmaker";
|
||||
LOG(WARNING) << "Changing updater from `grow_gpu_hist` to `grow_quantile_histmaker`.";
|
||||
}
|
||||
updaters_.emplace_back(TreeUpdater::Create(name, ctx_, &model_.learner_model_param->task));
|
||||
updaters_.back()->LoadConfig(config);
|
||||
std::unique_ptr<TreeUpdater> up{
|
||||
TreeUpdater::Create(name, ctx_, &model_.learner_model_param->task)};
|
||||
up->LoadConfig(kv.second);
|
||||
updaters_.push_back(std::move(up));
|
||||
}
|
||||
|
||||
specified_updater_ = get<Boolean>(in["specified_updater"]);
|
||||
@@ -438,14 +491,13 @@ void GBTree::SaveConfig(Json* p_out) const {
|
||||
// language binding doesn't need to know about the forest size.
|
||||
out["gbtree_model_param"] = ToJson(model_.param);
|
||||
|
||||
out["updater"] = Array{};
|
||||
auto& j_updaters = get<Array>(out["updater"]);
|
||||
out["updater"] = Object();
|
||||
|
||||
for (auto const& up : this->updaters_) {
|
||||
Json up_config{Object{}};
|
||||
up_config["name"] = String{up->Name()};
|
||||
up->SaveConfig(&up_config);
|
||||
j_updaters.emplace_back(up_config);
|
||||
auto& j_updaters = out["updater"];
|
||||
for (auto const& up : updaters_) {
|
||||
j_updaters[up->Name()] = Object();
|
||||
auto& j_up = j_updaters[up->Name()];
|
||||
up->SaveConfig(&j_up);
|
||||
}
|
||||
out["specified_updater"] = Boolean{specified_updater_};
|
||||
}
|
||||
@@ -515,8 +567,8 @@ void GBTree::Slice(bst_layer_t begin, bst_layer_t end, bst_layer_t step, Gradien
|
||||
out_model.param.num_parallel_tree = model_.param.num_parallel_tree;
|
||||
}
|
||||
|
||||
void GBTree::PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end) const {
|
||||
void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end) {
|
||||
CHECK(configured_);
|
||||
if (layer_end == 0) {
|
||||
layer_end = this->BoostedRounds();
|
||||
@@ -536,7 +588,7 @@ void GBTree::PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds,
|
||||
CHECK_EQ(out_preds->version, 0);
|
||||
}
|
||||
|
||||
auto const& predictor = GetPredictor(is_training, &out_preds->predictions, p_fmat);
|
||||
auto const& predictor = GetPredictor(&out_preds->predictions, p_fmat);
|
||||
if (out_preds->version == 0) {
|
||||
// out_preds->Size() can be non-zero as it's initialized here before any
|
||||
// tree is built at the 0^th iterator.
|
||||
@@ -556,83 +608,52 @@ void GBTree::PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds,
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
inline void MismatchedDevices(Context const* booster, Context const* data) {
|
||||
LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. XGBoost "
|
||||
<< "is running on: " << booster->DeviceName()
|
||||
<< ", while the input data is on: " << data->DeviceName() << ".\n"
|
||||
<< R"(Potential solutions:
|
||||
- Use a data structure that matches the device ordinal in the booster.
|
||||
- Set the device for booster before call to inplace_predict.
|
||||
)";
|
||||
}
|
||||
}; // namespace
|
||||
|
||||
void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end) {
|
||||
// dispatch to const function.
|
||||
this->PredictBatchImpl(p_fmat, out_preds, is_training, layer_begin, layer_end);
|
||||
}
|
||||
|
||||
void GBTree::InplacePredict(std::shared_ptr<DMatrix> p_m, float missing,
|
||||
PredictionCacheEntry* out_preds, bst_layer_t layer_begin,
|
||||
bst_layer_t layer_end) const {
|
||||
std::unique_ptr<Predictor> const &
|
||||
GBTree::GetPredictor(HostDeviceVector<float> const *out_pred,
|
||||
DMatrix *f_dmat) const {
|
||||
CHECK(configured_);
|
||||
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
|
||||
CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
|
||||
if (p_m->Ctx()->Device() != this->ctx_->Device()) {
|
||||
MismatchedDevices(this->ctx_, p_m->Ctx());
|
||||
CHECK_EQ(out_preds->version, 0);
|
||||
auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
|
||||
CHECK(proxy) << error::InplacePredictProxy();
|
||||
auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing);
|
||||
this->PredictBatchImpl(p_fmat.get(), out_preds, false, layer_begin, layer_end);
|
||||
return;
|
||||
}
|
||||
|
||||
bool known_type = this->ctx_->DispatchDevice(
|
||||
[&, begin = tree_begin, end = tree_end] {
|
||||
return this->cpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, begin, end);
|
||||
},
|
||||
[&, begin = tree_begin, end = tree_end] {
|
||||
return this->gpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, begin, end);
|
||||
});
|
||||
if (!known_type) {
|
||||
auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
|
||||
CHECK(proxy) << error::InplacePredictProxy();
|
||||
LOG(FATAL) << "Unknown data type for inplace prediction:" << proxy->Adapter().type().name();
|
||||
}
|
||||
}
|
||||
|
||||
[[nodiscard]] std::unique_ptr<Predictor> const& GBTree::GetPredictor(
|
||||
bool is_training, HostDeviceVector<float> const* out_pred, DMatrix* f_dmat) const {
|
||||
CHECK(configured_);
|
||||
|
||||
// Data comes from SparsePageDMatrix. Since we are loading data in pages, no need to
|
||||
// prevent data copy.
|
||||
if (f_dmat && !f_dmat->SingleColBlock()) {
|
||||
if (ctx_->IsCPU()) {
|
||||
return cpu_predictor_;
|
||||
} else {
|
||||
common::AssertGPUSupport();
|
||||
if (tparam_.predictor != PredictorType::kAuto) {
|
||||
if (tparam_.predictor == PredictorType::kGPUPredictor) {
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost.";
|
||||
CHECK(gpu_predictor_);
|
||||
return gpu_predictor_;
|
||||
#else
|
||||
common::AssertGPUSupport();
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
}
|
||||
if (tparam_.predictor == PredictorType::kOneAPIPredictor) {
|
||||
#if defined(XGBOOST_USE_ONEAPI)
|
||||
CHECK(oneapi_predictor_);
|
||||
return oneapi_predictor_;
|
||||
#else
|
||||
common::AssertOneAPISupport();
|
||||
#endif // defined(XGBOOST_USE_ONEAPI)
|
||||
}
|
||||
CHECK(cpu_predictor_);
|
||||
return cpu_predictor_;
|
||||
}
|
||||
|
||||
// Data comes from Device DMatrix.
|
||||
auto is_ellpack =
|
||||
f_dmat && f_dmat->PageExists<EllpackPage>() && !f_dmat->PageExists<SparsePage>();
|
||||
auto is_ellpack = f_dmat && f_dmat->PageExists<EllpackPage>() &&
|
||||
!f_dmat->PageExists<SparsePage>();
|
||||
// Data comes from device memory, like CuDF or CuPy.
|
||||
auto is_from_device = f_dmat && f_dmat->PageExists<SparsePage>() &&
|
||||
(*(f_dmat->GetBatches<SparsePage>().begin())).data.DeviceCanRead();
|
||||
auto is_from_device =
|
||||
f_dmat && f_dmat->PageExists<SparsePage>() &&
|
||||
(*(f_dmat->GetBatches<SparsePage>().begin())).data.DeviceCanRead();
|
||||
auto on_device = is_ellpack || is_from_device;
|
||||
|
||||
// Use GPU Predictor if data is already on device and gpu_id is set.
|
||||
if (on_device && ctx_->IsCUDA()) {
|
||||
common::AssertGPUSupport();
|
||||
if (on_device && ctx_->gpu_id >= 0) {
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost.";
|
||||
CHECK(gpu_predictor_);
|
||||
return gpu_predictor_;
|
||||
#else
|
||||
LOG(FATAL) << "Data is on CUDA device, but XGBoost is not compiled with "
|
||||
"CUDA support.";
|
||||
return cpu_predictor_;
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
}
|
||||
|
||||
// GPU_Hist by default has prediction cache calculated from quantile values,
|
||||
@@ -644,19 +665,23 @@ void GBTree::InplacePredict(std::shared_ptr<DMatrix> p_m, float missing,
|
||||
if ((out_pred && out_pred->Size() == 0) && (model_.param.num_trees != 0) &&
|
||||
// FIXME(trivialfis): Implement a better method for testing whether data
|
||||
// is on device after DMatrix refactoring is done.
|
||||
!on_device && is_training) {
|
||||
!on_device) {
|
||||
CHECK(cpu_predictor_);
|
||||
return cpu_predictor_;
|
||||
}
|
||||
|
||||
if (ctx_->IsCPU()) {
|
||||
return cpu_predictor_;
|
||||
} else {
|
||||
common::AssertGPUSupport();
|
||||
if (tparam_.tree_method == TreeMethod::kGPUHist) {
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost.";
|
||||
CHECK(gpu_predictor_);
|
||||
return gpu_predictor_;
|
||||
#else
|
||||
common::AssertGPUSupport();
|
||||
return cpu_predictor_;
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
}
|
||||
|
||||
CHECK(cpu_predictor_);
|
||||
return cpu_predictor_;
|
||||
}
|
||||
|
||||
@@ -771,7 +796,7 @@ class Dart : public GBTree {
|
||||
bool training, unsigned layer_begin,
|
||||
unsigned layer_end) const {
|
||||
CHECK(!this->model_.learner_model_param->IsVectorLeaf()) << "dart" << MTNotImplemented();
|
||||
auto& predictor = this->GetPredictor(training, &p_out_preds->predictions, p_fmat);
|
||||
auto &predictor = this->GetPredictor(&p_out_preds->predictions, p_fmat);
|
||||
CHECK(predictor);
|
||||
predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
|
||||
model_);
|
||||
@@ -835,16 +860,15 @@ class Dart : public GBTree {
|
||||
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
|
||||
auto n_groups = model_.learner_model_param->num_output_group;
|
||||
|
||||
if (ctx_->Device() != p_fmat->Ctx()->Device()) {
|
||||
MismatchedDevices(ctx_, p_fmat->Ctx());
|
||||
auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_fmat);
|
||||
CHECK(proxy) << error::InplacePredictProxy();
|
||||
auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing);
|
||||
this->PredictBatchImpl(p_fmat.get(), p_out_preds, false, layer_begin, layer_end);
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<Predictor const*> predictors {
|
||||
cpu_predictor_.get(),
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
gpu_predictor_.get()
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
};
|
||||
Predictor const* predictor{nullptr};
|
||||
StringView msg{"Unsupported data type for inplace predict."};
|
||||
|
||||
PredictionCacheEntry predts;
|
||||
if (ctx_->gpu_id != Context::kCpuId) {
|
||||
predts.predictions.SetDevice(ctx_->gpu_id);
|
||||
@@ -853,29 +877,32 @@ class Dart : public GBTree {
|
||||
|
||||
auto predict_impl = [&](size_t i) {
|
||||
predts.predictions.Fill(0);
|
||||
bool success = this->ctx_->DispatchDevice(
|
||||
[&] {
|
||||
return cpu_predictor_->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
|
||||
},
|
||||
[&] {
|
||||
return gpu_predictor_->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
|
||||
});
|
||||
CHECK(success) << msg;
|
||||
if (tparam_.predictor == PredictorType::kAuto) {
|
||||
// Try both predictor implementations
|
||||
bool success = false;
|
||||
for (auto const& p : predictors) {
|
||||
if (p && p->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1)) {
|
||||
success = true;
|
||||
predictor = p;
|
||||
break;
|
||||
}
|
||||
}
|
||||
CHECK(success) << msg;
|
||||
} else {
|
||||
predictor = this->GetPredictor().get();
|
||||
bool success = predictor->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
|
||||
CHECK(success) << msg << std::endl
|
||||
<< "Current Predictor: "
|
||||
<< (tparam_.predictor == PredictorType::kCPUPredictor ? "cpu_predictor"
|
||||
: "gpu_predictor");
|
||||
}
|
||||
};
|
||||
|
||||
// Inplace predict is not used for training, so no need to drop tree.
|
||||
for (bst_tree_t i = tree_begin; i < tree_end; ++i) {
|
||||
predict_impl(i);
|
||||
if (i == tree_begin) {
|
||||
this->ctx_->DispatchDevice(
|
||||
[&] {
|
||||
this->cpu_predictor_->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
|
||||
model_);
|
||||
},
|
||||
[&] {
|
||||
this->gpu_predictor_->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
|
||||
model_);
|
||||
});
|
||||
predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions, model_);
|
||||
}
|
||||
// Multiple the tree weight
|
||||
auto w = this->weight_drop_.at(i);
|
||||
@@ -905,24 +932,25 @@ class Dart : public GBTree {
|
||||
std::vector<bst_float> *out_preds,
|
||||
unsigned layer_begin, unsigned layer_end) override {
|
||||
DropTrees(false);
|
||||
auto &predictor = this->GetPredictor(false);
|
||||
auto &predictor = this->GetPredictor();
|
||||
uint32_t _, tree_end;
|
||||
std::tie(_, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end);
|
||||
predictor->PredictInstance(inst, out_preds, model_, tree_end);
|
||||
}
|
||||
|
||||
void PredictContribution(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_contribs,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end,
|
||||
bool approximate) override {
|
||||
void PredictContribution(DMatrix* p_fmat,
|
||||
HostDeviceVector<bst_float>* out_contribs,
|
||||
unsigned layer_begin, unsigned layer_end, bool approximate, int,
|
||||
unsigned) override {
|
||||
CHECK(configured_);
|
||||
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
|
||||
cpu_predictor_->PredictContribution(p_fmat, out_contribs, model_, tree_end, &weight_drop_,
|
||||
approximate);
|
||||
}
|
||||
|
||||
void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end,
|
||||
bool approximate) override {
|
||||
void PredictInteractionContributions(
|
||||
DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
|
||||
unsigned layer_begin, unsigned layer_end, bool approximate) override {
|
||||
CHECK(configured_);
|
||||
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
|
||||
cpu_predictor_->PredictInteractionContributions(p_fmat, out_contribs, model_, tree_end,
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
/**
|
||||
* Copyright 2021-2023, XGBoost Contributors
|
||||
/*!
|
||||
* Copyright 2021 by Contributors
|
||||
*/
|
||||
#include "../common/device_helpers.cuh"
|
||||
#include "xgboost/context.h"
|
||||
#include "xgboost/linalg.h"
|
||||
#include "xgboost/span.h"
|
||||
|
||||
namespace xgboost::gbm {
|
||||
namespace xgboost {
|
||||
namespace gbm {
|
||||
|
||||
void GPUCopyGradient(HostDeviceVector<GradientPair> const *in_gpair,
|
||||
bst_group_t n_groups, bst_group_t group_id,
|
||||
HostDeviceVector<GradientPair> *out_gpair) {
|
||||
@@ -38,4 +41,5 @@ void GPUDartInplacePredictInc(common::Span<float> out_predts, common::Span<float
|
||||
out_predts[offset] += (predts[offset] - base_score(0)) * tree_w;
|
||||
});
|
||||
}
|
||||
} // namespace xgboost::gbm
|
||||
} // namespace gbm
|
||||
} // namespace xgboost
|
||||
|
||||
121
src/gbm/gbtree.h
121
src/gbm/gbtree.h
@@ -43,18 +43,30 @@ enum class TreeProcessType : int {
|
||||
kDefault = 0,
|
||||
kUpdate = 1
|
||||
};
|
||||
|
||||
enum class PredictorType : int {
|
||||
kAuto = 0,
|
||||
kCPUPredictor,
|
||||
kGPUPredictor,
|
||||
kOneAPIPredictor
|
||||
};
|
||||
} // namespace xgboost
|
||||
|
||||
DECLARE_FIELD_ENUM_CLASS(xgboost::TreeMethod);
|
||||
DECLARE_FIELD_ENUM_CLASS(xgboost::TreeProcessType);
|
||||
DECLARE_FIELD_ENUM_CLASS(xgboost::PredictorType);
|
||||
|
||||
namespace xgboost {
|
||||
namespace gbm {
|
||||
|
||||
namespace xgboost::gbm {
|
||||
/*! \brief training parameters */
|
||||
struct GBTreeTrainParam : public XGBoostParameter<GBTreeTrainParam> {
|
||||
/*! \brief tree updater sequence */
|
||||
std::string updater_seq;
|
||||
/*! \brief type of boosting process to run */
|
||||
TreeProcessType process_type;
|
||||
// predictor type
|
||||
PredictorType predictor;
|
||||
// tree construction method
|
||||
TreeMethod tree_method;
|
||||
// declare parameters
|
||||
@@ -69,6 +81,13 @@ struct GBTreeTrainParam : public XGBoostParameter<GBTreeTrainParam> {
|
||||
.describe("Whether to run the normal boosting process that creates new trees,"\
|
||||
" or to update the trees in an existing model.");
|
||||
DMLC_DECLARE_ALIAS(updater_seq, updater);
|
||||
DMLC_DECLARE_FIELD(predictor)
|
||||
.set_default(PredictorType::kAuto)
|
||||
.add_enum("auto", PredictorType::kAuto)
|
||||
.add_enum("cpu_predictor", PredictorType::kCPUPredictor)
|
||||
.add_enum("gpu_predictor", PredictorType::kGPUPredictor)
|
||||
.add_enum("oneapi_predictor", PredictorType::kOneAPIPredictor)
|
||||
.describe("Predictor algorithm type");
|
||||
DMLC_DECLARE_FIELD(tree_method)
|
||||
.set_default(TreeMethod::kAuto)
|
||||
.add_enum("auto", TreeMethod::kAuto)
|
||||
@@ -173,8 +192,12 @@ class GBTree : public GradientBooster {
|
||||
: GradientBooster{ctx}, model_(booster_config, ctx_) {}
|
||||
|
||||
void Configure(const Args& cfg) override;
|
||||
// Revise `tree_method` and `updater` parameters after seeing the training
|
||||
// data matrix, only useful when tree_method is auto.
|
||||
void PerformTreeMethodHeuristic(DMatrix* fmat);
|
||||
/*! \brief Map `tree_method` parameter to `updater` parameter */
|
||||
void ConfigureUpdaters();
|
||||
void ConfigureWithKnownData(Args const& cfg, DMatrix* fmat);
|
||||
|
||||
/**
|
||||
* \brief Optionally update the leaf value.
|
||||
@@ -189,11 +212,21 @@ class GBTree : public GradientBooster {
|
||||
void DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
|
||||
PredictionCacheEntry* predt, ObjFunction const* obj) override;
|
||||
|
||||
[[nodiscard]] bool UseGPU() const override { return tparam_.tree_method == TreeMethod::kGPUHist; }
|
||||
bool UseGPU() const override {
|
||||
return
|
||||
tparam_.predictor == PredictorType::kGPUPredictor ||
|
||||
tparam_.tree_method == TreeMethod::kGPUHist;
|
||||
}
|
||||
|
||||
[[nodiscard]] GBTreeTrainParam const& GetTrainParam() const { return tparam_; }
|
||||
GBTreeTrainParam const& GetTrainParam() const {
|
||||
return tparam_;
|
||||
}
|
||||
|
||||
void Load(dmlc::Stream* fi) override {
|
||||
model_.Load(fi);
|
||||
this->cfg_.clear();
|
||||
}
|
||||
|
||||
void Load(dmlc::Stream* fi) override { model_.Load(fi); }
|
||||
void Save(dmlc::Stream* fo) const override {
|
||||
model_.Save(fo);
|
||||
}
|
||||
@@ -213,14 +246,39 @@ class GBTree : public GradientBooster {
|
||||
return !model_.trees.empty() || !model_.trees_to_update.empty();
|
||||
}
|
||||
|
||||
void PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end) const;
|
||||
|
||||
void PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool training,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end) override;
|
||||
|
||||
void InplacePredict(std::shared_ptr<DMatrix> p_m, float missing, PredictionCacheEntry* out_preds,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end) const override;
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end) const override {
|
||||
CHECK(configured_);
|
||||
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
|
||||
CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
|
||||
std::vector<Predictor const *> predictors{
|
||||
cpu_predictor_.get(),
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
gpu_predictor_.get()
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
};
|
||||
StringView msg{"Unsupported data type for inplace predict."};
|
||||
if (tparam_.predictor == PredictorType::kAuto) {
|
||||
// Try both predictor implementations
|
||||
for (auto const &p : predictors) {
|
||||
if (p && p->InplacePredict(p_m, model_, missing, out_preds, tree_begin, tree_end)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
LOG(FATAL) << msg;
|
||||
} else {
|
||||
bool success = this->GetPredictor()->InplacePredict(p_m, model_, missing, out_preds,
|
||||
tree_begin, tree_end);
|
||||
CHECK(success) << msg << std::endl
|
||||
<< "Current Predictor: "
|
||||
<< (tparam_.predictor == PredictorType::kCPUPredictor
|
||||
? "cpu_predictor"
|
||||
: "gpu_predictor");
|
||||
}
|
||||
}
|
||||
|
||||
void FeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
|
||||
std::vector<bst_feature_t>* features,
|
||||
@@ -301,29 +359,32 @@ class GBTree : public GradientBooster {
|
||||
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
|
||||
CHECK_EQ(tree_begin, 0) << "Predict leaf supports only iteration end: (0, "
|
||||
"n_iteration), use model slicing instead.";
|
||||
this->GetPredictor(false)->PredictLeaf(p_fmat, out_preds, model_, tree_end);
|
||||
this->GetPredictor()->PredictLeaf(p_fmat, out_preds, model_, tree_end);
|
||||
}
|
||||
|
||||
void PredictContribution(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end,
|
||||
bool approximate) override {
|
||||
void PredictContribution(DMatrix* p_fmat,
|
||||
HostDeviceVector<bst_float>* out_contribs,
|
||||
uint32_t layer_begin, uint32_t layer_end, bool approximate,
|
||||
int, unsigned) override {
|
||||
CHECK(configured_);
|
||||
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
|
||||
CHECK_EQ(tree_begin, 0) << "Predict contribution supports only iteration end: (0, "
|
||||
"n_iteration), using model slicing instead.";
|
||||
this->GetPredictor(false)->PredictContribution(p_fmat, out_contribs, model_, tree_end, nullptr,
|
||||
approximate);
|
||||
CHECK_EQ(tree_begin, 0)
|
||||
<< "Predict contribution supports only iteration end: (0, "
|
||||
"n_iteration), using model slicing instead.";
|
||||
this->GetPredictor()->PredictContribution(
|
||||
p_fmat, out_contribs, model_, tree_end, nullptr, approximate);
|
||||
}
|
||||
|
||||
void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
|
||||
bst_layer_t layer_begin, bst_layer_t layer_end,
|
||||
bool approximate) override {
|
||||
void PredictInteractionContributions(
|
||||
DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
|
||||
uint32_t layer_begin, uint32_t layer_end, bool approximate) override {
|
||||
CHECK(configured_);
|
||||
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
|
||||
CHECK_EQ(tree_begin, 0) << "Predict interaction contribution supports only iteration end: (0, "
|
||||
"n_iteration), using model slicing instead.";
|
||||
this->GetPredictor(false)->PredictInteractionContributions(p_fmat, out_contribs, model_,
|
||||
tree_end, nullptr, approximate);
|
||||
CHECK_EQ(tree_begin, 0)
|
||||
<< "Predict interaction contribution supports only iteration end: (0, "
|
||||
"n_iteration), using model slicing instead.";
|
||||
this->GetPredictor()->PredictInteractionContributions(
|
||||
p_fmat, out_contribs, model_, tree_end, nullptr, approximate);
|
||||
}
|
||||
|
||||
[[nodiscard]] std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
|
||||
@@ -339,9 +400,8 @@ class GBTree : public GradientBooster {
|
||||
std::vector<HostDeviceVector<bst_node_t>>* out_position,
|
||||
std::vector<std::unique_ptr<RegTree>>* ret);
|
||||
|
||||
[[nodiscard]] std::unique_ptr<Predictor> const& GetPredictor(
|
||||
bool is_training, HostDeviceVector<float> const* out_pred = nullptr,
|
||||
DMatrix* f_dmat = nullptr) const;
|
||||
std::unique_ptr<Predictor> const& GetPredictor(HostDeviceVector<float> const* out_pred = nullptr,
|
||||
DMatrix* f_dmat = nullptr) const;
|
||||
|
||||
// commit new trees all at once
|
||||
virtual void CommitModel(TreesOneIter&& new_trees);
|
||||
@@ -356,17 +416,22 @@ class GBTree : public GradientBooster {
|
||||
bool showed_updater_warning_ {false};
|
||||
bool specified_updater_ {false};
|
||||
bool configured_ {false};
|
||||
// configurations for tree
|
||||
Args cfg_;
|
||||
// the updaters that can be applied to each of tree
|
||||
std::vector<std::unique_ptr<TreeUpdater>> updaters_;
|
||||
// Predictors
|
||||
std::unique_ptr<Predictor> cpu_predictor_;
|
||||
std::unique_ptr<Predictor> gpu_predictor_{nullptr};
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
std::unique_ptr<Predictor> gpu_predictor_;
|
||||
#endif // defined(XGBOOST_USE_CUDA)
|
||||
#if defined(XGBOOST_USE_ONEAPI)
|
||||
std::unique_ptr<Predictor> oneapi_predictor_;
|
||||
#endif // defined(XGBOOST_USE_ONEAPI)
|
||||
common::Monitor monitor_;
|
||||
};
|
||||
|
||||
} // namespace xgboost::gbm
|
||||
} // namespace gbm
|
||||
} // namespace xgboost
|
||||
|
||||
#endif // XGBOOST_GBM_GBTREE_H_
|
||||
|
||||
@@ -40,7 +40,6 @@
|
||||
#include "common/api_entry.h" // for XGBAPIThreadLocalEntry
|
||||
#include "common/charconv.h" // for to_chars, to_chars_result, NumericLimits, from_...
|
||||
#include "common/common.h" // for ToString, Split
|
||||
#include "common/error_msg.h" // for MaxFeatureSize, WarnOldSerialization
|
||||
#include "common/io.h" // for PeekableInStream, ReadAll, FixedSizeStream, Mem...
|
||||
#include "common/observer.h" // for TrainingObserver
|
||||
#include "common/random.h" // for GlobalRandom
|
||||
@@ -357,6 +356,21 @@ DMLC_REGISTER_PARAMETER(LearnerTrainParam);
|
||||
using LearnerAPIThreadLocalStore =
|
||||
dmlc::ThreadLocalStore<std::map<Learner const *, XGBAPIThreadLocalEntry>>;
|
||||
|
||||
namespace {
|
||||
StringView ModelMsg() {
|
||||
return StringView{
|
||||
R"doc(
|
||||
If you are loading a serialized model (like pickle in Python, RDS in R) generated by
|
||||
older XGBoost, please export the model by calling `Booster.save_model` from that version
|
||||
first, then load it back in current version. See:
|
||||
|
||||
https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
|
||||
|
||||
for more details about differences between saving model and serializing.
|
||||
)doc"};
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
class LearnerConfiguration : public Learner {
|
||||
private:
|
||||
std::mutex config_lock_;
|
||||
@@ -516,7 +530,7 @@ class LearnerConfiguration : public Learner {
|
||||
}
|
||||
|
||||
if (!Version::Same(origin_version)) {
|
||||
error::WarnOldSerialization();
|
||||
LOG(WARNING) << ModelMsg();
|
||||
return; // skip configuration if version is not matched
|
||||
}
|
||||
|
||||
@@ -547,7 +561,7 @@ class LearnerConfiguration : public Learner {
|
||||
for (size_t i = 0; i < n_metrics; ++i) {
|
||||
auto old_serialization = IsA<String>(j_metrics[i]);
|
||||
if (old_serialization) {
|
||||
error::WarnOldSerialization();
|
||||
LOG(WARNING) << ModelMsg();
|
||||
metric_names_[i] = get<String>(j_metrics[i]);
|
||||
} else {
|
||||
metric_names_[i] = get<String>(j_metrics[i]["name"]);
|
||||
@@ -749,7 +763,9 @@ class LearnerConfiguration : public Learner {
|
||||
CHECK(matrix.first.ptr);
|
||||
CHECK(!matrix.second.ref.expired());
|
||||
const uint64_t num_col = matrix.first.ptr->Info().num_col_;
|
||||
error::MaxFeatureSize(num_col);
|
||||
CHECK_LE(num_col, static_cast<uint64_t>(std::numeric_limits<unsigned>::max()))
|
||||
<< "Unfortunately, XGBoost does not support data matrices with "
|
||||
<< std::numeric_limits<unsigned>::max() << " features or greater";
|
||||
num_feature = std::max(num_feature, static_cast<uint32_t>(num_col));
|
||||
}
|
||||
|
||||
@@ -1158,7 +1174,7 @@ class LearnerIO : public LearnerConfiguration {
|
||||
Json memory_snapshot;
|
||||
if (header[1] == '"') {
|
||||
memory_snapshot = Json::Load(StringView{buffer});
|
||||
error::WarnOldSerialization();
|
||||
LOG(WARNING) << ModelMsg();
|
||||
} else if (std::isalpha(header[1])) {
|
||||
memory_snapshot = Json::Load(StringView{buffer}, std::ios::binary);
|
||||
} else {
|
||||
@@ -1177,7 +1193,7 @@ class LearnerIO : public LearnerConfiguration {
|
||||
header.resize(serialisation_header_.size());
|
||||
CHECK_EQ(fp.Read(&header[0], header.size()), serialisation_header_.size());
|
||||
// Avoid printing the content in loaded header, which might be random binary code.
|
||||
CHECK(header == serialisation_header_) << error::OldSerialization();
|
||||
CHECK(header == serialisation_header_) << ModelMsg();
|
||||
int64_t sz {-1};
|
||||
CHECK_EQ(fp.Read(&sz, sizeof(sz)), sizeof(sz));
|
||||
if (!DMLC_IO_NO_ENDIAN_SWAP) {
|
||||
@@ -1397,8 +1413,6 @@ class LearnerImpl : public LearnerIO {
|
||||
this->CheckModelInitialized();
|
||||
|
||||
auto& out_predictions = this->GetThreadLocal().prediction_entry;
|
||||
out_predictions.version = 0;
|
||||
|
||||
this->gbm_->InplacePredict(p_m, missing, &out_predictions, iteration_begin, iteration_end);
|
||||
if (type == PredictionType::kValue) {
|
||||
obj_->PredTransform(&out_predictions.predictions);
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
|
||||
#include "../collective/communicator-inl.cuh"
|
||||
#include "../collective/device_communicator.cuh"
|
||||
#include "../common/algorithm.cuh" // SegmentedArgSort
|
||||
#include "../common/optional_weight.h" // OptionalWeights
|
||||
#include "../common/threading_utils.cuh" // UnravelTrapeziodIdx,SegmentedTrapezoidThreads
|
||||
@@ -205,7 +205,8 @@ double ScaleClasses(common::Span<double> results, common::Span<double> local_are
|
||||
if (collective::IsDistributed()) {
|
||||
int32_t device = dh::CurrentDevice();
|
||||
CHECK_EQ(dh::CudaGetPointerDevice(results.data()), device);
|
||||
collective::AllReduce<collective::Operation::kSum>(device, results.data(), results.size());
|
||||
auto* communicator = collective::Communicator::GetDevice(device);
|
||||
communicator->AllReduceSum(results.data(), results.size());
|
||||
}
|
||||
auto reduce_in = dh::MakeTransformIterator<Pair>(
|
||||
thrust::make_counting_iterator(0), [=] XGBOOST_DEVICE(size_t i) {
|
||||
|
||||
@@ -483,13 +483,9 @@ class QuantileError : public MetricNoCache {
|
||||
|
||||
const char* Name() const override { return "quantile"; }
|
||||
void LoadConfig(Json const& in) override {
|
||||
auto const& obj = get<Object const>(in);
|
||||
auto it = obj.find("quantile_loss_param");
|
||||
if (it != obj.cend()) {
|
||||
FromJson(it->second, ¶m_);
|
||||
auto const& name = get<String const>(in["name"]);
|
||||
CHECK_EQ(name, "quantile");
|
||||
}
|
||||
auto const& name = get<String const>(in["name"]);
|
||||
CHECK_EQ(name, "quantile");
|
||||
FromJson(in["quantile_loss_param"], ¶m_);
|
||||
}
|
||||
void SaveConfig(Json* p_out) const override {
|
||||
auto& out = *p_out;
|
||||
|
||||
@@ -577,8 +577,8 @@ void LambdaRankUpdatePositionBias(Context const* ctx, linalg::VectorView<double
|
||||
if (lj(0) >= Eps64()) {
|
||||
tj_minus(i) = std::pow(lj(i) / lj(0), regularizer);
|
||||
}
|
||||
assert(!isinf(ti_plus(i)));
|
||||
assert(!isinf(tj_minus(i)));
|
||||
assert(!std::isinf(ti_plus(i)));
|
||||
assert(!std::isinf(tj_minus(i)));
|
||||
});
|
||||
}
|
||||
} // namespace cuda_impl
|
||||
|
||||
@@ -16,7 +16,6 @@
|
||||
#include "../common/bitfield.h" // for RBitField8
|
||||
#include "../common/categorical.h" // for IsCat, Decision
|
||||
#include "../common/common.h" // for DivRoundUp
|
||||
#include "../common/error_msg.h" // for InplacePredictProxy
|
||||
#include "../common/math.h" // for CheckNAN
|
||||
#include "../common/threading_utils.h" // for ParallelFor
|
||||
#include "../data/adapter.h" // for ArrayAdapter, CSRAdapter, CSRArrayAdapter
|
||||
@@ -468,6 +467,7 @@ class ColumnSplitHelper {
|
||||
void MaskOneTree(RegTree::FVec const &feat, std::size_t tree_id, std::size_t row_id) {
|
||||
auto const &tree = *model_.trees[tree_id];
|
||||
auto const &cats = tree.GetCategoriesMatrix();
|
||||
auto const has_categorical = tree.HasCategoricalSplit();
|
||||
bst_node_t n_nodes = tree.GetNodes().size();
|
||||
|
||||
for (bst_node_t nid = 0; nid < n_nodes; nid++) {
|
||||
@@ -484,10 +484,16 @@ class ColumnSplitHelper {
|
||||
}
|
||||
|
||||
auto const fvalue = feat.GetFvalue(split_index);
|
||||
auto const decision = tree.HasCategoricalSplit()
|
||||
? GetDecision<true>(node, nid, fvalue, cats)
|
||||
: GetDecision<false>(node, nid, fvalue, cats);
|
||||
if (decision) {
|
||||
if (has_categorical && common::IsCat(cats.split_type, nid)) {
|
||||
auto const node_categories =
|
||||
cats.categories.subspan(cats.node_ptr[nid].beg, cats.node_ptr[nid].size);
|
||||
if (!common::Decision(node_categories, fvalue)) {
|
||||
decision_bits_.Set(bit_index);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (fvalue >= node.SplitCond()) {
|
||||
decision_bits_.Set(bit_index);
|
||||
}
|
||||
}
|
||||
@@ -505,7 +511,7 @@ class ColumnSplitHelper {
|
||||
if (missing_bits_.Check(bit_index)) {
|
||||
return node.DefaultChild();
|
||||
} else {
|
||||
return node.LeftChild() + !decision_bits_.Check(bit_index);
|
||||
return node.LeftChild() + decision_bits_.Check(bit_index);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -742,7 +748,7 @@ class CPUPredictor : public Predictor {
|
||||
PredictionCacheEntry *out_preds, uint32_t tree_begin,
|
||||
unsigned tree_end) const override {
|
||||
auto proxy = dynamic_cast<data::DMatrixProxy *>(p_m.get());
|
||||
CHECK(proxy)<< error::InplacePredictProxy();
|
||||
CHECK(proxy)<< "Inplace predict accepts only DMatrixProxy as input.";
|
||||
CHECK(!p_m->Info().IsColumnSplit())
|
||||
<< "Inplace predict support for column-wise data split is not yet implemented.";
|
||||
auto x = proxy->Adapter();
|
||||
@@ -884,8 +890,9 @@ class CPUPredictor : public Predictor {
|
||||
for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
|
||||
auto page = batch.GetView();
|
||||
// parallel over local batch
|
||||
common::ParallelFor(batch.Size(), n_threads, [&](auto i) {
|
||||
auto row_idx = batch.base_rowid + i;
|
||||
const auto nsize = static_cast<bst_omp_uint>(batch.Size());
|
||||
common::ParallelFor(nsize, n_threads, [&](bst_omp_uint i) {
|
||||
auto row_idx = static_cast<size_t>(batch.base_rowid + i);
|
||||
RegTree::FVec &feats = feat_vecs[omp_get_thread_num()];
|
||||
if (feats.Size() == 0) {
|
||||
feats.Init(num_feature);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user