Bump scala-collection-compat_2.12 from 2.10.0 to 2.11.0 in /jvm-packages

Bumps [scala-collection-compat_2.12](https://github.com/scala/scala-collection-compat) from 2.10.0 to 2.11.0. - [Release notes](https://github.com/scala/scala-collection-compat/releases) - [Commits](https://github.com/scala/scala-collection-compat/compare/v2.10.0...v2.11.0) --- updated-dependencies: - dependency-name: org.scala-lang.modules:scala-collection-compat_2.12 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com>
2023-06-16 03:02:09 +00:00
141 changed files with 2334 additions and 4730 deletions
--- a/.github/workflows/python_tests.yml
+++ b/.github/workflows/python_tests.yml
@@ -255,44 +255,3 @@ jobs:
      shell: bash -l {0}
      run: |
        pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_spark
-
-  python-system-installation-on-ubuntu:
-    name: Test XGBoost Python package System Installation on ${{ matrix.os }}
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [ubuntu-latest]
-
-    steps:
-      - uses: actions/checkout@v2
-        with:
-          submodules: 'true'
-
-      - name: Set up Python 3.8
-        uses: actions/setup-python@v4
-        with:
-          python-version: 3.8
-
-      - name: Install ninja
-        run: |
-          sudo apt-get update && sudo apt-get install -y ninja-build
-
-      - name: Build XGBoost on Ubuntu
-        run: |
-          mkdir build
-          cd build
-          cmake .. -GNinja
-          ninja
-
-      - name: Copy lib to system lib
-        run: |
-          cp lib/* "$(python -c 'import sys; print(sys.base_prefix)')/lib"
-
-      - name: Install XGBoost in Virtual Environment
-        run: |
-          cd python-package
-          pip install virtualenv
-          virtualenv venv
-          source venv/bin/activate && \
-            pip install -v . --config-settings use_system_libxgboost=True && \
-            python -c 'import xgboost'
--- a/NEWS.md
+++ b/NEWS.md
@@ -3,23 +3,6 @@ XGBoost Change Log

 This file records the changes in xgboost library in reverse chronological order.

-## 1.7.6 (2023 Jun 16)
-
-This is a patch release for bug fixes. The CRAN package for the R binding is kept at 1.7.5.
-
-### Bug Fixes
-* Fix distributed training with mixed dense and sparse partitions. (#9272)
-* Fix monotone constraints on CPU with large trees. (#9122)
-* [spark] Make the spark model have the same UID as its estimator (#9022)
-* Optimize prediction with `QuantileDMatrix`. (#9096)
-
-### Document
-* Improve doxygen (#8959)
-* Update the cuDF pip index URL. (#9106)
-
-### Maintenance
-* Fix tests with pandas 2.0. (#9014)
-
 ## 1.7.5 (2023 Mar 30)
 This is a patch release for bug fixes.

--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -85,18 +85,9 @@ test_that("dart prediction works", {
    rnorm(100)

  set.seed(1994)
-  booster_by_xgboost <- xgboost(
-    data = d,
-    label = y,
-    max_depth = 2,
-    booster = "dart",
-    rate_drop = 0.5,
-    one_drop = TRUE,
-    eta = 1,
-    nthread = 2,
-    nrounds = nrounds,
-    objective = "reg:squarederror"
-  )
+  booster_by_xgboost <- xgboost(data = d, label = y, max_depth = 2, booster = "dart",
+                                rate_drop = 0.5, one_drop = TRUE,
+                                eta = 1, nthread = 2, nrounds = nrounds, objective = "reg:squarederror")
  pred_by_xgboost_0 <- predict(booster_by_xgboost, newdata = d, ntreelimit = 0)
  pred_by_xgboost_1 <- predict(booster_by_xgboost, newdata = d, ntreelimit = nrounds)
  expect_true(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE)))
@@ -106,19 +97,19 @@ test_that("dart prediction works", {

  set.seed(1994)
  dtrain <- xgb.DMatrix(data = d, info = list(label = y))
-  booster_by_train <- xgb.train(
-    params = list(
-      booster = "dart",
-      max_depth = 2,
-      eta = 1,
-      rate_drop = 0.5,
-      one_drop = TRUE,
-      nthread = 1,
-      objective = "reg:squarederror"
-    ),
-    data = dtrain,
-    nrounds = nrounds
-  )
+  booster_by_train <- xgb.train(params = list(
+                                    booster = "dart",
+                                    max_depth = 2,
+                                    eta = 1,
+                                    rate_drop = 0.5,
+                                    one_drop = TRUE,
+                                    nthread = 1,
+                                    tree_method = "exact",
+                                    objective = "reg:squarederror"
+                                ),
+                                data = dtrain,
+                                nrounds = nrounds
+                                )
  pred_by_train_0 <- predict(booster_by_train, newdata = dtrain, ntreelimit = 0)
  pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, ntreelimit = nrounds)
  pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE)
@@ -408,7 +399,7 @@ test_that("colsample_bytree works", {
  xgb.importance(model = bst)
  # If colsample_bytree works properly, a variety of features should be used
  # in the 100 trees
-  expect_gte(nrow(xgb.importance(model = bst)), 28)
+  expect_gte(nrow(xgb.importance(model = bst)), 30)
 })

 test_that("Configuration works", {
--- a/R-package/tests/testthat/test_model_compatibility.R
+++ b/R-package/tests/testthat/test_model_compatibility.R
@@ -76,20 +76,32 @@ test_that("Models from previous versions of XGBoost can be loaded", {
    name <- m[3]
    is_rds <- endsWith(model_file, '.rds')
    is_json <- endsWith(model_file, '.json')
-    # Expect an R warning when a model is loaded from RDS and it was generated by version < 1.1.x
-    if (is_rds && compareVersion(model_xgb_ver, '1.1.1.1') < 0) {
-      booster <- readRDS(model_file)
-      expect_warning(predict(booster, newdata = pred_data))
-      booster <- readRDS(model_file)
-      expect_warning(run_booster_check(booster, name))
-    } else {
-      if (is_rds) {
+
+    cpp_warning <- capture.output({
+      # Expect an R warning when a model is loaded from RDS and it was generated by version < 1.1.x
+      if (is_rds && compareVersion(model_xgb_ver, '1.1.1.1') < 0) {
        booster <- readRDS(model_file)
+        expect_warning(predict(booster, newdata = pred_data))
+        booster <- readRDS(model_file)
+        expect_warning(run_booster_check(booster, name))
      } else {
-        booster <- xgb.load(model_file)
+        if (is_rds) {
+          booster <- readRDS(model_file)
+        } else {
+          booster <- xgb.load(model_file)
+        }
+        predict(booster, newdata = pred_data)
+        run_booster_check(booster, name)
      }
-      predict(booster, newdata = pred_data)
-      run_booster_check(booster, name)
+    })
+    cpp_warning <- paste0(cpp_warning, collapse = ' ')
+    if (is_rds && compareVersion(model_xgb_ver, '1.1.1.1') >= 0) {
+      # Expect a C++ warning when a model is loaded from RDS and it was generated by old XGBoost
+      m <- grepl(paste0('.*If you are loading a serialized model ',
+                        '\\(like pickle in Python, RDS in R\\).*',
+                        'for more details about differences between ',
+                        'saving model and serializing.*'), cpp_warning, perl = TRUE)
+      expect_true(length(m) > 0 && all(m))
    }
  })
 })
--- a/R-package/tests/testthat/test_update.R
+++ b/R-package/tests/testthat/test_update.R
@@ -13,10 +13,7 @@ test_that("updating the model works", {
  watchlist <- list(train = dtrain, test = dtest)

  # no-subsampling
-  p1 <- list(
-    objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2,
-    updater = "grow_colmaker,prune"
-  )
+  p1 <- list(objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2)
  set.seed(11)
  bst1 <- xgb.train(p1, dtrain, nrounds = 10, watchlist, verbose = 0)
  tr1 <- xgb.model.dt.tree(model = bst1)
--- a/R-package/vignettes/xgboost.bib
+++ b/R-package/vignettes/xgboost.bib
@@ -18,11 +18,13 @@
  publisher={Institute of Mathematical Statistics}
 }

+
@misc{
    Bache+Lichman:2013 ,
    author = "K. Bache and M. Lichman",
    year = "2013",
    title = "{UCI} Machine Learning Repository",
-    url = "https://archive.ics.uci.edu/",
-    institution = "University of California, Irvine, School of Information and Computer Sciences"
+    url = "http://archive.ics.uci.edu/ml/",
+    institution = "University of California, Irvine, School of Information and Computer Sciences" 
 }
+
--- a/demo/aft_survival/aft_survival_viz_demo.py
+++ b/demo/aft_survival/aft_survival_viz_demo.py
@@ -11,43 +11,33 @@ import numpy as np

 import xgboost as xgb

-plt.rcParams.update({"font.size": 13})
-
+plt.rcParams.update({'font.size': 13})

 # Function to visualize censored labels
-def plot_censored_labels(
-    X: np.ndarray, y_lower: np.ndarray, y_upper: np.ndarray
-) -> None:
-    def replace_inf(x: np.ndarray, target_value: float) -> np.ndarray:
+def plot_censored_labels(X, y_lower, y_upper):
+    def replace_inf(x, target_value):
        x[np.isinf(x)] = target_value
        return x
-
-    plt.plot(X, y_lower, "o", label="y_lower", color="blue")
-    plt.plot(X, y_upper, "o", label="y_upper", color="fuchsia")
-    plt.vlines(
-        X,
-        ymin=replace_inf(y_lower, 0.01),
-        ymax=replace_inf(y_upper, 1000.0),
-        label="Range for y",
-        color="gray",
-    )
-
+    plt.plot(X, y_lower, 'o', label='y_lower', color='blue')
+    plt.plot(X, y_upper, 'o', label='y_upper', color='fuchsia')
+    plt.vlines(X, ymin=replace_inf(y_lower, 0.01), ymax=replace_inf(y_upper, 1000),
+               label='Range for y', color='gray')

 # Toy data
 X = np.array([1, 2, 3, 4, 5]).reshape((-1, 1))
 INF = np.inf
-y_lower = np.array([10, 15, -INF, 30, 100])
-y_upper = np.array([INF, INF, 20, 50, INF])
+y_lower = np.array([ 10,  15, -INF, 30, 100])
+y_upper = np.array([INF, INF,   20, 50, INF])

 # Visualize toy data
 plt.figure(figsize=(5, 4))
 plot_censored_labels(X, y_lower, y_upper)
 plt.ylim((6, 200))
-plt.legend(loc="lower right")
-plt.title("Toy data")
-plt.xlabel("Input feature")
-plt.ylabel("Label")
-plt.yscale("log")
+plt.legend(loc='lower right')
+plt.title('Toy data')
+plt.xlabel('Input feature')
+plt.ylabel('Label')
+plt.yscale('log')
 plt.tight_layout()
 plt.show(block=True)

@@ -56,83 +46,54 @@ grid_pts = np.linspace(0.8, 5.2, 1000).reshape((-1, 1))

 # Train AFT model using XGBoost
 dmat = xgb.DMatrix(X)
-dmat.set_float_info("label_lower_bound", y_lower)
-dmat.set_float_info("label_upper_bound", y_upper)
-params = {"max_depth": 3, "objective": "survival:aft", "min_child_weight": 0}
+dmat.set_float_info('label_lower_bound', y_lower)
+dmat.set_float_info('label_upper_bound', y_upper)
+params = {'max_depth': 3, 'objective':'survival:aft', 'min_child_weight': 0}

 accuracy_history = []
+def plot_intermediate_model_callback(env):
+    """Custom callback to plot intermediate models"""
+    # Compute y_pred = prediction using the intermediate model, at current boosting iteration
+    y_pred = env.model.predict(dmat)
+    # "Accuracy" = the number of data points whose ranged label (y_lower, y_upper) includes
+    #              the corresponding predicted label (y_pred)
+    acc = np.sum(np.logical_and(y_pred >= y_lower, y_pred <= y_upper)/len(X) * 100)
+    accuracy_history.append(acc)

+    # Plot ranged labels as well as predictions by the model
+    plt.subplot(5, 3, env.iteration + 1)
+    plot_censored_labels(X, y_lower, y_upper)
+    y_pred_grid_pts = env.model.predict(xgb.DMatrix(grid_pts))
+    plt.plot(grid_pts, y_pred_grid_pts, 'r-', label='XGBoost AFT model', linewidth=4)
+    plt.title('Iteration {}'.format(env.iteration), x=0.5, y=0.8)
+    plt.xlim((0.8, 5.2))
+    plt.ylim((1 if np.min(y_pred) < 6 else 6, 200))
+    plt.yscale('log')

-class PlotIntermediateModel(xgb.callback.TrainingCallback):
-    """Custom callback to plot intermediate models."""
-
-    def __init__(self) -> None:
-        super().__init__()
-
-    def after_iteration(
-        self,
-        model: xgb.Booster,
-        epoch: int,
-        evals_log: xgb.callback.TrainingCallback.EvalsLog,
-    ) -> bool:
-        """Run after training is finished."""
-        # Compute y_pred = prediction using the intermediate model, at current boosting
-        # iteration
-        y_pred = model.predict(dmat)
-        # "Accuracy" = the number of data points whose ranged label (y_lower, y_upper)
-        #              includes the corresponding predicted label (y_pred)
-        acc = np.sum(
-            np.logical_and(y_pred >= y_lower, y_pred <= y_upper) / len(X) * 100
-        )
-        accuracy_history.append(acc)
-
-        # Plot ranged labels as well as predictions by the model
-        plt.subplot(5, 3, epoch + 1)
-        plot_censored_labels(X, y_lower, y_upper)
-        y_pred_grid_pts = model.predict(xgb.DMatrix(grid_pts))
-        plt.plot(
-            grid_pts, y_pred_grid_pts, "r-", label="XGBoost AFT model", linewidth=4
-        )
-        plt.title("Iteration {}".format(epoch), x=0.5, y=0.8)
-        plt.xlim((0.8, 5.2))
-        plt.ylim((1 if np.min(y_pred) < 6 else 6, 200))
-        plt.yscale("log")
-        return False
-
-
-res: xgb.callback.TrainingCallback.EvalsLog = {}
-plt.figure(figsize=(12, 13))
-bst = xgb.train(
-    params,
-    dmat,
-    15,
-    [(dmat, "train")],
-    evals_result=res,
-    callbacks=[PlotIntermediateModel()],
-)
+res = {}
+plt.figure(figsize=(12,13))
+bst = xgb.train(params, dmat, 15, [(dmat, 'train')], evals_result=res,
+                callbacks=[plot_intermediate_model_callback])
 plt.tight_layout()
-plt.legend(
-    loc="lower center",
-    ncol=4,
-    bbox_to_anchor=(0.5, 0),
-    bbox_transform=plt.gcf().transFigure,
-)
+plt.legend(loc='lower center', ncol=4,
+           bbox_to_anchor=(0.5, 0),
+           bbox_transform=plt.gcf().transFigure)
 plt.tight_layout()

 # Plot negative log likelihood over boosting iterations
-plt.figure(figsize=(8, 3))
+plt.figure(figsize=(8,3))
 plt.subplot(1, 2, 1)
-plt.plot(res["train"]["aft-nloglik"], "b-o", label="aft-nloglik")
-plt.xlabel("# Boosting Iterations")
-plt.legend(loc="best")
+plt.plot(res['train']['aft-nloglik'], 'b-o', label='aft-nloglik')
+plt.xlabel('# Boosting Iterations')
+plt.legend(loc='best')

 # Plot "accuracy" over boosting iterations
 # "Accuracy" = the number of data points whose ranged label (y_lower, y_upper) includes
 #              the corresponding predicted label (y_pred)
 plt.subplot(1, 2, 2)
-plt.plot(accuracy_history, "r-o", label="Accuracy (%)")
-plt.xlabel("# Boosting Iterations")
-plt.legend(loc="best")
+plt.plot(accuracy_history, 'r-o', label='Accuracy (%)')
+plt.xlabel('# Boosting Iterations')
+plt.legend(loc='best')
 plt.tight_layout()

 plt.show()
--- a/demo/guide-python/external_memory.py
+++ b/demo/guide-python/external_memory.py
@@ -82,10 +82,10 @@ def main(tmpdir: str) -> xgboost.Booster:
    missing = np.NaN
    Xy = xgboost.DMatrix(it, missing=missing, enable_categorical=False)

-    # Other tree methods including ``approx``, and ``gpu_hist`` are supported. GPU
-    # behaves differently than CPU tree methods. See tutorial in doc for details.
+    # Other tree methods including ``hist`` and ``gpu_hist`` also work, see tutorial in
+    # doc for details.
    booster = xgboost.train(
-        {"tree_method": "hist", "max_depth": 4},
+        {"tree_method": "approx", "max_depth": 2},
        Xy,
        evals=[(Xy, "Train")],
        num_boost_round=10,
--- a/doc/build.rst
+++ b/doc/build.rst
@@ -259,7 +259,7 @@ There are several ways to build and install the package from source:

    import sys
    import pathlib
-    libpath = pathlib.Path(sys.base_prefix).joinpath("lib", "libxgboost.so")
+    libpath = pathlib.Path(sys.prefix).joinpath("lib", "libxgboost.so")
    assert libpath.exists()

  Then pass ``use_system_libxgboost=True`` option to ``pip install``:
--- a/doc/c.rst
+++ b/doc/c.rst
@@ -33,8 +33,6 @@ DMatrix
 .. doxygengroup:: DMatrix
   :project: xgboost

-.. _c_streaming:
-
 Streaming
 ---------

--- a/doc/gpu/index.rst
+++ b/doc/gpu/index.rst
@@ -45,7 +45,7 @@ XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as

 .. code-block:: python

-  model.set_param({"gpu_id": "0", "tree_method": "gpu_hist"})
+  model.set_param({"predictor": "gpu_predictor"})
  shap_values = model.predict(dtrain, pred_contribs=True)
  shap_interaction_values = model.predict(dtrain, pred_interactions=True)

--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -199,6 +199,18 @@ Parameters for Tree Booster
  - Maximum number of discrete bins to bucket continuous features.
  - Increasing this number improves the optimality of splits at the cost of higher computation time.

+* ``predictor``, [default= ``auto``]
+
+  - The type of predictor algorithm to use. Provides the same results but allows the use of GPU or CPU.
+
+    - ``auto``: Configure predictor based on heuristics.
+    - ``cpu_predictor``: Multicore CPU prediction algorithm.
+    - ``gpu_predictor``: Prediction using GPU.  Used when ``tree_method`` is ``gpu_hist``.
+      When ``predictor`` is set to default value ``auto``, the ``gpu_hist`` tree method is
+      able to provide GPU based prediction without copying training data to GPU memory.
+      If ``gpu_predictor`` is explicitly specified, then all data is copied into GPU, only
+      recommended for performing prediction tasks.
+
 * ``num_parallel_tree``, [default=1]

  - Number of parallel trees constructed during each iteration. This option is used to support boosted random forest.
--- a/doc/prediction.rst
+++ b/doc/prediction.rst
@@ -87,6 +87,15 @@ with the native Python interface :py:meth:`xgboost.Booster.predict` and
 behavior.  Also the ``save_best`` parameter from :py:obj:`xgboost.callback.EarlyStopping`
 might be useful.

+*********
+Predictor
+*********
+
+There are 2 predictors in XGBoost (3 if you have the one-api plugin enabled), namely
+``cpu_predictor`` and ``gpu_predictor``.  The default option is ``auto`` so that XGBoost
+can employ some heuristics for saving GPU memory during training.  They might have slightly
+different outputs due to floating point errors.
+

 ***********
 Base Margin
@@ -125,6 +134,15 @@ it.  Be aware that the output of in-place prediction depends on input data type,
 input is on GPU data output is :py:obj:`cupy.ndarray`, otherwise a :py:obj:`numpy.ndarray`
 is returned.

+****************
+Categorical Data
+****************
+
+Other than users performing encoding, XGBoost has experimental support for categorical
+data using ``gpu_hist`` and ``gpu_predictor``.  No special operation needs to be done on
+input test data since the information about categories is encoded into the model during
+training.
+
 *************
 Thread Safety
 *************
@@ -141,6 +159,7 @@ instance we might accidentally call ``clf.set_params()`` inside a predict functi

    def predict_fn(clf: xgb.XGBClassifier, X):
        X = preprocess(X)
+        clf.set_params(predictor="gpu_predictor")  # NOT safe!
        clf.set_params(n_jobs=1)  # NOT safe!
        return clf.predict_proba(X, iteration_range=(0, 10))

--- a/doc/tutorials/dask.rst
+++ b/doc/tutorials/dask.rst
@@ -54,9 +54,6 @@ on a dask cluster:
        y = da.random.random(size=(num_obs, 1), chunks=(1000, 1))

        dtrain = xgb.dask.DaskDMatrix(client, X, y)
-        # or
-        # dtrain = xgb.dask.DaskQuantileDMatrix(client, X, y)
-        # `DaskQuantileDMatrix` is available for the `hist` and `gpu_hist` tree method.

        output = xgb.dask.train(
            client,
@@ -148,8 +145,8 @@ Also for inplace prediction:

 .. code-block:: python

-  # where X is a dask DataFrame or dask Array backed by cupy or cuDF.
-  booster.set_param({"gpu_id": "0"})
+  booster.set_param({'predictor': 'gpu_predictor'})
+  # where X is a dask DataFrame or dask Array containing cupy or cuDF backed data.
  prediction = xgb.dask.inplace_predict(client, booster, X)

 When input is ``da.Array`` object, output is always ``da.Array``.  However, if the input
--- a/doc/tutorials/external_memory.rst
+++ b/doc/tutorials/external_memory.rst
@@ -2,34 +2,11 @@
 Using XGBoost External Memory Version
 #####################################

-When working with large datasets, training XGBoost models can be challenging as the entire
-dataset needs to be loaded into memory. This can be costly and sometimes
-infeasible. Staring from 1.5, users can define a custom iterator to load data in chunks
-for running XGBoost algorithms. External memory can be used for both training and
-prediction, but training is the primary use case and it will be our focus in this
-tutorial. For prediction and evaluation, users can iterate through the data themseleves
-while training requires the full dataset to be loaded into the memory.
-
-During training, there are two different modes for external memory support available in
-XGBoost, one for CPU-based algorithms like ``hist`` and ``approx``, another one for the
-GPU-based training algorithm. We will introduce them in the following sections.
-
-.. note::
-
-   Training on data from external memory is not supported by the ``exact`` tree method.
-
-.. note::
-
-   The feature is still experimental as of 2.0. The performance is not well optimized.
-
-The external memory support has gone through multiple iterations and is still under heavy
-development. Like the :py:class:`~xgboost.QuantileDMatrix` with
-:py:class:`~xgboost.DataIter`, XGBoost loads data batch-by-batch using a custom iterator
-supplied by the user. However, unlike the :py:class:`~xgboost.QuantileDMatrix`, external
-memory will not concatenate the batches unless GPU is used (it uses a hybrid approach,
-more details follow). Instead, it will cache all batches on the external memory and fetch
-them on-demand.  Go to the end of the document to see a comparison between
-`QuantileDMatrix` and external memory.
+XGBoost supports loading data from external memory using the builtin data parser.  And
+starting from version 1.5, users can also define a custom iterator to load data in chunks.
+The feature is still experimental and not yet ready for production use.  In this tutorial
+we will introduce both methods.  Please note that training on data from external memory is
+not supported by the ``exact`` tree method.

 *************
 Data Iterator
@@ -38,8 +15,8 @@ Data Iterator
 Starting from XGBoost 1.5, users can define their own data loader using Python or C
 interface.  There are some examples in the ``demo`` directory for quick start.  This is a
 generalized version of text input external memory, where users no longer need to prepare a
-text file that XGBoost recognizes.  To enable the feature, users need to define a data
-iterator with 2 class methods: ``next`` and ``reset``, then pass it into the ``DMatrix``
+text file that XGBoost recognizes.  To enable the feature, users need to define a data
+iterator with 2 class methods: ``next`` and ``reset``, then pass it into the ``DMatrix``
 constructor.

 .. code-block:: python
@@ -83,146 +60,20 @@ constructor.

  # Other tree methods including ``hist`` and ``gpu_hist`` also work, but has some caveats
  # as noted in following sections.
-  booster = xgboost.train({"tree_method": "hist"}, Xy)
+  booster = xgboost.train({"tree_method": "approx"}, Xy)


-The above snippet is a simplified version of :ref:`sphx_glr_python_examples_external_memory.py`.
-For an example in C, please see ``demo/c-api/external-memory/``. The iterator is the
-common interface for using external memory with XGBoost, you can pass the resulting
-``DMatrix`` object for training, prediction, and evaluation.
-
-It is important to set the batch size based on the memory available. A good starting point
-is to set the batch size to 10GB per batch if you have 64GB of memory. It is *not*
-recommended to set small batch sizes like 32 samples per batch, as this can seriously hurt
-performance in gradient boosting.
-
-***********
-CPU Version
-***********
-
-In the previous section, we demonstrated how to train a tree-based model using the
-``hist`` tree method on a CPU. This method involves iterating through data batches stored
-in a cache during tree construction. For optimal performance, we recommend using the
-``grow_policy=depthwise`` setting, which allows XGBoost to build an entire layer of tree
-nodes with only a few batch iterations. Conversely, using the ``lossguide`` policy
-requires XGBoost to iterate over the data set for each tree node, resulting in slower
-performance.
-
-If external memory is used, the performance of CPU training is limited by IO
-(input/output) speed. This means that the disk IO speed primarily determines the training
-speed. During benchmarking, we used an NVMe connected to a PCIe-4 slot, other types of
-storage can be too slow for practical usage. In addition, your system may perform caching
-to reduce the overhead of file reading.
-
-**********************************
-GPU Version (GPU Hist tree method)
-**********************************
-
-External memory is supported by GPU algorithms (i.e. when ``tree_method`` is set to
-``gpu_hist``). However, the algorithm used for GPU is different from the one used for
-CPU. When training on a CPU, the tree method iterates through all batches from external
-memory for each step of the tree construction algorithm. On the other hand, the GPU
-algorithm uses a hybrid approach. It iterates through the data during the beginning of
-each iteration and concatenates all batches into one in GPU memory. To reduce overall
-memory usage, users can utilize subsampling. The GPU hist tree method supports
-`gradient-based sampling`, enabling users to set a low sampling rate without compromising
-accuracy.
-
-.. code-block:: python
-
-  param = {
-    ...
-    'subsample': 0.2,
-    'sampling_method': 'gradient_based',
-  }
-
-For more information about the sampling algorithm and its use in external memory training,
-see `this paper <https://arxiv.org/abs/2005.09148>`_.
-
-.. warning::
-
-   When GPU is running out of memory during iteration on external memory, user might
-   recieve a segfault instead of an OOM exception.
-
-.. _ext_remarks:
-
-*******
-Remarks
-*******
-
-When using external memory with XBGoost, data is divided into smaller chunks so that only
-a fraction of it needs to be stored in memory at any given time. It's important to note
-that this method only applies to the predictor data (``X``), while other data, like labels
-and internal runtime structures are concatenated. This means that memory reduction is most
-effective when dealing with wide datasets where ``X`` is significantly larger in size
-compared to other data like ``y``, while it has little impact on slim datasets.
-
-As one might expect, fetching data on-demand puts significant pressure on the storage
-device. Today's computing device can process way more data than a storage can read in a
-single unit of time. The ratio is at order of magnitudes. An GPU is capable of processing
-hundred of Gigabytes of floating-point data in a split second. On the other hand, a
-four-lane NVMe storage connected to a PCIe-4 slot usually has about 6GB/s of data transfer
-rate. As a result, the training is likely to be severely bounded by your storage
-device. Before adopting the external memory solution, some back-of-envelop calculations
-might help you see whether it's viable. For instance, if your NVMe drive can transfer 4GB
-(a fairly practical number) of data per second and you have a 100GB of data in compressed
-XGBoost cache (which corresponds to a dense float32 numpy array with the size of 200GB,
-give or take). A tree with depth 8 needs at least 16 iterations through the data when the
-parameter is right. You need about 14 minutes to train a single tree without accounting
-for some other overheads and assume the computation overlaps with the IO. If your dataset
-happens to have TB-level size, then you might need thousands of trees to get a generalized
-model. These calculations can help you get an estimate on the expected training time.
-
-However, sometimes we can ameliorate this limitation. One should also consider that the OS
-(mostly talking about the Linux kernel) can usually cache the data on host memory. It only
-evicts pages when new data comes in and there's no room left. In practice, at least some
-portion of the data can persist on the host memory throughout the entire training
-session. We are aware of this cache when optimizing the external memory fetcher. The
-compressed cache is usually smaller than the raw input data, especially when the input is
-dense without any missing value. If the host memory can fit a significant portion of this
-compressed cache, then the performance should be decent after initialization. Our
-development so far focus on two fronts of optimization for external memory:
-
- Avoid iterating through the data whenever appropriate.
- If the OS can cache the data, the performance should be close to in-core training.
-
-Starting with XGBoost 2.0, the implementation of external memory uses ``mmap``. It is not
-tested against system errors like disconnected network devices (`SIGBUS`). In the face of
-a bus error, you will see a hard crash and need to clean up the cache files. If the
-training session might take a long time and you are using solutions like NVMe-oF, we
-recommend checkpointing your model periodically. Also, it's worth noting that most tests
-have been conducted on Linux distributions.
-
-
-Another important point to keep in mind is that creating the initial cache for XGBoost may
-take some time. The interface to external memory is through custom iterators, which we can
-not assume to be thread-safe. Therefore, initialization is performed sequentially. Using
-the `xgboost.config_context` with `verbosity=2` can give you some information on what
-XGBoost is doing during the wait if you don't mind the extra output.
-
-*******************************
-Compared to the QuantileDMatrix
-*******************************
-
-Passing an iterator to the :py:class:`~xgboost.QuantileDmatrix` enables direct
-construction of `QuantileDmatrix` with data chunks. On the other hand, if it's passed to
-:py:class:`~xgboost.DMatrix`, it instead enables the external memory feature. The
-:py:class:`~xgboost.QuantileDmatrix` concatenates the data on memory after compression and
-doesn't fetch data during training. On the other hand, the external memory `DMatrix`
-fetches data batches from external memory on-demand.  Use the `QuantileDMatrix` (with
-iterator if necessary) when you can fit most of your data in memory. The training would be
-an order of magnitute faster than using external memory.
+The above snippet is a simplified version of ``demo/guide-python/external_memory.py``.  For
+an example in C, please see ``demo/c-api/external-memory/``.

 ****************
 Text File Inputs
 ****************

-This is the original form of external memory support, users are encouraged to use custom
-data iterator instead. There is no big difference between using external memory version of
-text input and the in-memory version.  The only difference is the filename format.
+There is no big difference between using external memory version and in-memory version.
+The only difference is the filename format.

-The external memory version takes in the following `URI
-<https://en.wikipedia.org/wiki/Uniform_Resource_Identifier>`_ format:
+The external memory version takes in the following `URI <https://en.wikipedia.org/wiki/Uniform_Resource_Identifier>`_ format:

 .. code-block:: none

@@ -240,8 +91,9 @@ To load from csv files, use the following syntax:

 where ``label_column`` should point to the csv column acting as the label.

-If you have a dataset stored in a file similar to ``demo/data/agaricus.txt.train`` with LIBSVM
-format, the external memory support can be enabled by:
+As a simple illustration, consider the following example extracted from
+`demo/guide-python/external_memory.py <https://github.com/dmlc/xgboost/blob/master/demo/guide-python/external_memory.py>`_. If
+you have a dataset stored in a file similar to ``agaricus.txt.train`` in LIBSVM format, external memory support can be enabled by:

 .. code-block:: python

@@ -252,3 +104,35 @@ XGBoost will first load ``agaricus.txt.train`` in, preprocess it, then write to
 more notes about text input formats, see :doc:`/tutorials/input_format`.

 For CLI version, simply add the cache suffix, e.g. ``"../data/agaricus.txt.train?format=libsvm#dtrain.cache"``.
+
+
+**********************************
+GPU Version (GPU Hist tree method)
+**********************************
+External memory is supported in GPU algorithms (i.e. when ``tree_method`` is set to ``gpu_hist``).
+
+If you are still getting out-of-memory errors after enabling external memory, try subsampling the
+data to further reduce GPU memory usage:
+
+.. code-block:: python
+
+  param = {
+    ...
+    'subsample': 0.1,
+    'sampling_method': 'gradient_based',
+  }
+
+For more information, see `this paper <https://arxiv.org/abs/2005.09148>`_.  Internally
+the tree method still concatenates all the chunks into 1 final histogram index due to
+performance reasons, but in compressed format.  So its scalability has an upper bound but
+still has lower memory cost in general.
+
+***********
+CPU Version
+***********
+
+For CPU histogram based tree methods (``approx``, ``hist``) it's recommended to use
+``grow_policy=depthwise`` for performance reasons.  Iterating over data batches is slow;
+with the ``depthwise`` policy XGBoost can build an entire layer of tree nodes with a few
+iterations, while with ``lossguide`` XGBoost needs to iterate over the data set for each
+tree node.
--- a/doc/tutorials/index.rst
+++ b/doc/tutorials/index.rst
@@ -11,22 +11,22 @@ See `Awesome XGBoost <https://github.com/dmlc/xgboost/tree/master/demo>`_ for mo

  model
  saving_model
-  learning_to_rank
-  dart
-  monotonic
-  feature_interaction_constraint
-  aft_survival_analysis
-  categorical
-  multioutput
-  rf
  kubernetes
  Distributed XGBoost with XGBoost4J-Spark <https://xgboost.readthedocs.io/en/latest/jvm/xgboost4j_spark_tutorial.html>
  Distributed XGBoost with XGBoost4J-Spark-GPU <https://xgboost.readthedocs.io/en/latest/jvm/xgboost4j_spark_gpu_tutorial.html>
  dask
  spark_estimator
  ray
-  external_memory
+  dart
+  monotonic
+  rf
+  feature_interaction_constraint
+  learning_to_rank
+  aft_survival_analysis
  c_api_tutorial
  input_format
  param_tuning
+  external_memory
  custom_metric_obj
+  categorical
+  multioutput
--- a/doc/tutorials/learning_to_rank.rst
+++ b/doc/tutorials/learning_to_rank.rst
@@ -48,9 +48,8 @@ Notice that the samples are sorted based on their query index in a non-decreasin
  import xgboost as xgb

  # Make a synthetic ranking dataset for demonstration
-  seed = 1994 
-  X, y = make_classification(random_state=seed)
-  rng = np.random.default_rng(seed)
+  X, y = make_classification(random_state=1994)
+  rng = np.random.default_rng(1994)
  n_query_groups = 3
  qid = rng.integers(0, 3, size=X.shape[0])

--- a/doc/tutorials/param_tuning.rst
+++ b/doc/tutorials/param_tuning.rst
@@ -58,45 +58,3 @@ This can affect the training of XGBoost model, and there are two ways to improve

  - In such a case, you cannot re-balance the dataset
  - Set parameter ``max_delta_step`` to a finite number (say 1) to help convergence
-
-
-*********************
-Reducing Memory Usage
-*********************
-
-If you are using a HPO library like :py:class:`sklearn.model_selection.GridSearchCV`,
-please control the number of threads it can use. It's best to let XGBoost to run in
-parallel instead of asking `GridSearchCV` to run multiple experiments at the same
-time. For instance, creating a fold of data for cross validation can consume a significant
-amount of memory:
-
-.. code-block:: python
-
-    # This creates a copy of dataset. X and X_train are both in memory at the same time.
-
-    # This happens for every thread at the same time if you run `GridSearchCV` with
-    # `n_jobs` larger than 1
-
-    X_train, X_test, y_train, y_test = train_test_split(X, y)
-
-.. code-block:: python
-
-    df = pd.DataFrame()
-    # This creates a new copy of the dataframe, even if you specify the inplace parameter
-    new_df = df.drop(...)
-
-.. code-block:: python
-
-    array = np.array(...)
-    # This may or may not make a copy of the data, depending on the type of the data
-    array.astype(np.float32)
-
-.. code-block::
-
-    # np by default uses double, do you actually need it?
-    array = np.array(...)
-
-You can find some more specific memory reduction practices scattered through the documents
-For instances: :doc:`/tutorials/dask`, :doc:`/gpu/index`. However, before going into
-these, being conscious about making data copies is a good starting point. It usually
-consumes a lot more memory than people expect.
--- a/doc/tutorials/saving_model.rst
+++ b/doc/tutorials/saving_model.rst
@@ -173,6 +173,7 @@ Will print out something similar to (not actual output as it's too long for demo
        "gradient_booster": {
          "gbtree_train_param": {
            "num_parallel_tree": "1",
+            "predictor": "gpu_predictor",
            "process_type": "default",
            "tree_method": "gpu_hist",
            "updater": "grow_gpu_hist",
--- a/include/xgboost/base.h
+++ b/include/xgboost/base.h
@@ -10,7 +10,6 @@
 #include <dmlc/omp.h>

 #include <cmath>
-#include <cstdint>
 #include <iostream>
 #include <string>
 #include <utility>
@@ -113,7 +112,7 @@ using bst_row_t = std::size_t;   // NOLINT
 /*! \brief Type for tree node index. */
 using bst_node_t = std::int32_t;      // NOLINT
 /*! \brief Type for ranking group index. */
-using bst_group_t = std::uint32_t;  // NOLINT
+using bst_group_t = std::uint32_t;      // NOLINT
 /**
 * \brief Type for indexing into output targets.
 */
@@ -126,10 +125,6 @@ using bst_layer_t = std::int32_t;  // NOLINT
 * \brief Type for indexing trees.
 */
 using bst_tree_t = std::int32_t;  // NOLINT
-/**
- * @brief Ordinal of a CUDA device.
- */
-using bst_d_ordinal_t = std::int16_t;  // NOLINT

 namespace detail {
 /*! \brief Implementation of gradient statistics pair. Template specialisation
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -1067,9 +1067,6 @@ XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle, DMatrixHandle dmat
 /**
 * \brief Inplace prediction from CPU dense matrix.
 *
- * \note If the booster is configured to run on a CUDA device, XGBoost falls back to run
- *       prediction with DMatrix with a performance warning.
- *
 * \param handle        Booster handle.
 * \param values        JSON encoded __array_interface__ to values.
 * \param config        See \ref XGBoosterPredictFromDMatrix for more info.
@@ -1094,9 +1091,6 @@ XGB_DLL int XGBoosterPredictFromDense(BoosterHandle handle, char const *values,
 /**
 * \brief Inplace prediction from CPU CSR matrix.
 *
- * \note If the booster is configured to run on a CUDA device, XGBoost falls back to run
- *       prediction with DMatrix with a performance warning.
- *
 * \param handle        Booster handle.
 * \param indptr        JSON encoded __array_interface__ to row pointer in CSR.
 * \param indices       JSON encoded __array_interface__ to column indices in CSR.
@@ -1122,9 +1116,6 @@ XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle, char const *indptr, ch
 /**
 * \brief Inplace prediction from CUDA Dense matrix (cupy in Python).
 *
- * \note If the booster is configured to run on a CPU, XGBoost falls back to run
- *       prediction with DMatrix with a performance warning.
- *
 * \param handle        Booster handle
 * \param values        JSON encoded __cuda_array_interface__ to values.
 * \param config        See \ref XGBoosterPredictFromDMatrix for more info.
@@ -1146,9 +1137,6 @@ XGB_DLL int XGBoosterPredictFromCudaArray(BoosterHandle handle, char const *valu
 /**
 * \brief Inplace prediction from CUDA dense dataframe (cuDF in Python).
 *
- * \note If the booster is configured to run on a CPU, XGBoost falls back to run
- *       prediction with DMatrix with a performance warning.
- *
 * \param handle        Booster handle
 * \param values        List of __cuda_array_interface__ for all columns encoded in JSON list.
 * \param config        See \ref XGBoosterPredictFromDMatrix for more info.
--- a/include/xgboost/context.h
+++ b/include/xgboost/context.h
@@ -1,80 +1,20 @@
-/**
- * Copyright 2014-2023, XGBoost Contributors
+/*!
+ * Copyright 2014-2022 by Contributors
 * \file context.h
 */
 #ifndef XGBOOST_CONTEXT_H_
 #define XGBOOST_CONTEXT_H_

-#include <xgboost/base.h>       // for bst_d_ordinal_t
-#include <xgboost/logging.h>    // for CHECK_GE
-#include <xgboost/parameter.h>  // for XGBoostParameter
+#include <xgboost/logging.h>
+#include <xgboost/parameter.h>

-#include <cstdint>      // for int16_t, int32_t, int64_t
-#include <memory>       // for shared_ptr
-#include <string>       // for string, to_string
-#include <type_traits>  // for invoke_result_t, is_same_v
+#include <memory>  // std::shared_ptr
+#include <string>

 namespace xgboost {

 struct CUDAContext;

-/**
- * @brief A type for device ordinal. The type is packed into 32-bit for efficient use in
- *        viewing types like `linalg::TensorView`.
- */
-struct DeviceOrd {
-  enum Type : std::int16_t { kCPU = 0, kCUDA = 1 } device{kCPU};
-  // CUDA device ordinal.
-  bst_d_ordinal_t ordinal{-1};
-
-  [[nodiscard]] bool IsCUDA() const { return device == kCUDA; }
-  [[nodiscard]] bool IsCPU() const { return device == kCPU; }
-
-  DeviceOrd() = default;
-  constexpr DeviceOrd(Type type, bst_d_ordinal_t ord) : device{type}, ordinal{ord} {}
-
-  DeviceOrd(DeviceOrd const& that) = default;
-  DeviceOrd& operator=(DeviceOrd const& that) = default;
-  DeviceOrd(DeviceOrd&& that) = default;
-  DeviceOrd& operator=(DeviceOrd&& that) = default;
-
-  /**
-   * @brief Constructor for CPU.
-   */
-  [[nodiscard]] constexpr static auto CPU() { return DeviceOrd{kCPU, -1}; }
-  /**
-   * @brief Constructor for CUDA device.
-   *
-   * @param ordinal CUDA device ordinal.
-   */
-  [[nodiscard]] static auto CUDA(bst_d_ordinal_t ordinal) { return DeviceOrd{kCUDA, ordinal}; }
-
-  [[nodiscard]] bool operator==(DeviceOrd const& that) const {
-    return device == that.device && ordinal == that.ordinal;
-  }
-  [[nodiscard]] bool operator!=(DeviceOrd const& that) const { return !(*this == that); }
-  /**
-   * @brief Get a string representation of the device and the ordinal.
-   */
-  [[nodiscard]] std::string Name() const {
-    switch (device) {
-      case DeviceOrd::kCPU:
-        return "CPU";
-      case DeviceOrd::kCUDA:
-        return "CUDA:" + std::to_string(ordinal);
-      default: {
-        LOG(FATAL) << "Unknown device.";
-        return "";
-      }
-    }
-  }
-};
-
-static_assert(sizeof(DeviceOrd) == sizeof(std::int32_t));
-
-/**
- * @brief Runtime context for XGBoost. Contains information like threads and device.
- */
 struct Context : public XGBoostParameter<Context> {
 public:
  // Constant representing the device ID of CPU.
@@ -96,82 +36,33 @@ struct Context : public XGBoostParameter<Context> {
  // fail when gpu_id is invalid
  bool fail_on_invalid_gpu_id{false};
  bool validate_parameters{false};
-  /**
-   * @brief Configure the parameter `gpu_id'.
+
+  /*!
+   * \brief Configure the parameter `gpu_id'.
   *
-   * @param require_gpu Whether GPU is explicitly required by the user through other
-   *                    configurations.
+   * \param require_gpu  Whether GPU is explicitly required from user.
   */
  void ConfigureGpuId(bool require_gpu);
-  /**
-   * @brief Returns the automatically chosen number of threads based on the `nthread`
-   *        parameter and the system settting.
+  /*!
+   * Return automatically chosen threads.
   */
-  [[nodiscard]] std::int32_t Threads() const;
-  /**
-   * @brief Is XGBoost running on CPU?
-   */
-  [[nodiscard]] bool IsCPU() const { return gpu_id == kCpuId; }
-  /**
-   * @brief Is XGBoost running on a CUDA device?
-   */
-  [[nodiscard]] bool IsCUDA() const { return !IsCPU(); }
-  /**
-   * @brief Get the current device and ordinal.
-   */
-  [[nodiscard]] DeviceOrd Device() const {
-    return IsCPU() ? DeviceOrd::CPU() : DeviceOrd::CUDA(static_cast<bst_d_ordinal_t>(gpu_id));
-  }
-  /**
-   * @brief Get the CUDA device ordinal. -1 if XGBoost is running on CPU.
-   */
-  [[nodiscard]] bst_d_ordinal_t Ordinal() const { return this->gpu_id; }
-  /**
-   * @brief Name of the current device.
-   */
-  [[nodiscard]] std::string DeviceName() const { return Device().Name(); }
-  /**
-   * @brief Get a CUDA device context for allocator and stream.
-   */
-  [[nodiscard]] CUDAContext const* CUDACtx() const;
-  /**
-   * @brief Make a CUDA context based on the current context.
-   *
-   * @param ordinal The CUDA device ordinal.
-   */
-  [[nodiscard]] Context MakeCUDA(std::int32_t ordinal = 0) const {
+  std::int32_t Threads() const;
+
+  bool IsCPU() const { return gpu_id == kCpuId; }
+  bool IsCUDA() const { return !IsCPU(); }
+
+  CUDAContext const* CUDACtx() const;
+  // Make a CUDA context based on the current context.
+  Context MakeCUDA(std::int32_t device = 0) const {
    Context ctx = *this;
-    CHECK_GE(ordinal, 0);
-    ctx.gpu_id = ordinal;
+    ctx.gpu_id = device;
    return ctx;
  }
-  /**
-   * @brief Make a CPU context based on the current context.
-   */
-  [[nodiscard]] Context MakeCPU() const {
+  Context MakeCPU() const {
    Context ctx = *this;
    ctx.gpu_id = kCpuId;
    return ctx;
  }
-  /**
-   * @brief Call function based on the current device.
-   */
-  template <typename CPUFn, typename CUDAFn>
-  decltype(auto) DispatchDevice(CPUFn&& cpu_fn, CUDAFn&& cuda_fn) const {
-    static_assert(std::is_same_v<std::invoke_result_t<CPUFn>, std::invoke_result_t<CUDAFn>>);
-    switch (this->Device().device) {
-      case DeviceOrd::kCPU:
-        return cpu_fn();
-      case DeviceOrd::kCUDA:
-        return cuda_fn();
-      default:
-        // Do not use the device name as this is likely an internal error, the name
-        // wouldn't be valid.
-        LOG(FATAL) << "Unknown device type:" << static_cast<std::int16_t>(this->Device().device);
-        break;
-    }
-    return std::invoke_result_t<CPUFn>();
-  }

  // declare parameters
  DMLC_DECLARE_PARAMETER(Context) {
@@ -196,9 +87,9 @@ struct Context : public XGBoostParameter<Context> {
  }

 private:
-  // mutable for lazy cuda context initialization. This avoids initializing CUDA at load.
-  // shared_ptr is used instead of unique_ptr as with unique_ptr it's difficult to define
-  // p_impl while trying to hide CUDA code from the host compiler.
+  // mutable for lazy initialization for cuda context to avoid initializing CUDA at load.
+  // shared_ptr is used instead of unique_ptr as with unique_ptr it's difficult to define p_impl
+  // while trying to hide CUDA code from host compiler.
  mutable std::shared_ptr<CUDAContext> cuctx_;
  // cached value for CFS CPU limit. (used in containerized env)
  std::int32_t cfs_cpu_count_;  // NOLINT
--- a/include/xgboost/gbm.h
+++ b/include/xgboost/gbm.h
@@ -149,14 +149,18 @@ class GradientBooster : public Model, public Configurable {
   * \param layer_begin Beginning of boosted tree layer used for prediction.
   * \param layer_end   End of booster layer. 0 means do not limit trees.
   * \param approximate use a faster (inconsistent) approximation of SHAP values
+   * \param condition condition on the condition_feature (0=no, -1=cond off, 1=cond on).
+   * \param condition_feature feature to condition on (i.e. fix) during calculations
   */
-  virtual void PredictContribution(DMatrix* dmat, HostDeviceVector<float>* out_contribs,
-                                   bst_layer_t layer_begin, bst_layer_t layer_end,
-                                   bool approximate = false) = 0;
+  virtual void PredictContribution(DMatrix* dmat,
+                                   HostDeviceVector<bst_float>* out_contribs,
+                                   unsigned layer_begin, unsigned layer_end,
+                                   bool approximate = false, int condition = 0,
+                                   unsigned condition_feature = 0) = 0;

-  virtual void PredictInteractionContributions(DMatrix* dmat, HostDeviceVector<float>* out_contribs,
-                                               bst_layer_t layer_begin, bst_layer_t layer_end,
-                                               bool approximate) = 0;
+  virtual void PredictInteractionContributions(
+      DMatrix *dmat, HostDeviceVector<bst_float> *out_contribs,
+      unsigned layer_begin, unsigned layer_end, bool approximate) = 0;

  /*!
   * \brief dump the model in the requested format
--- a/include/xgboost/predictor.h
+++ b/include/xgboost/predictor.h
@@ -6,22 +6,24 @@
 */
 #pragma once
 #include <xgboost/base.h>
-#include <xgboost/cache.h>    // for DMatrixCache
-#include <xgboost/context.h>  // for Context
+#include <xgboost/cache.h>  // DMatrixCache
 #include <xgboost/context.h>
 #include <xgboost/data.h>
 #include <xgboost/host_device_vector.h>

-#include <functional>  // for function
-#include <memory>      // for shared_ptr
+#include <functional>  // std::function
+#include <memory>
 #include <string>
+#include <thread>   // for get_id
 #include <utility>  // for make_pair
 #include <vector>

 // Forward declarations
-namespace xgboost::gbm {
+namespace xgboost {
+namespace gbm {
 struct GBTreeModel;
-}  // namespace xgboost::gbm
+}  // namespace gbm
+}  // namespace xgboost

 namespace xgboost {
 /**
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -47,7 +47,7 @@
        <spark.rapids.version>23.04.1</spark.rapids.version>
        <cudf.classifier>cuda11</cudf.classifier>
        <scalatest.version>3.2.16</scalatest.version>
-        <scala-collection-compat.version>2.10.0</scala-collection-compat.version>
+        <scala-collection-compat.version>2.11.0</scala-collection-compat.version>
      </properties>
    <repositories>
        <repository>
@@ -91,9 +91,6 @@
                    <value>ON</value>
                </property>
            </activation>
-            <properties>
-               <use.cuda>ON</use.cuda>
-            </properties>
            <modules>
                <module>xgboost4j-gpu</module>
                <module>xgboost4j-spark-gpu</module>
--- a/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java
+++ b/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java
@@ -78,6 +78,7 @@ public class BoosterTest {
        put("num_round", round);
        put("num_workers", 1);
        put("tree_method", "gpu_hist");
+        put("predictor", "gpu_predictor");
        put("max_bin", maxBin);
      }
    };
--- a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala
+++ b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala
@@ -281,6 +281,7 @@ object GpuPreXGBoost extends PreXGBoostProvider {
            // - predictor: Force to gpu predictor since native doesn't save predictor.
            val gpuId = if (!isLocal) XGBoost.getGPUAddrFromResources else 0
            booster.setParam("gpu_id", gpuId.toString)
+            booster.setParam("predictor", "gpu_predictor")
            logger.info("GPU transform on device: " + gpuId)
            boosterFlag.isGpuParamsSet = true;
          }
--- a/python-package/packager/nativelib.py
+++ b/python-package/packager/nativelib.py
@@ -132,8 +132,8 @@ def locate_or_build_libxgboost(

    if build_config.use_system_libxgboost:
        # Find libxgboost from system prefix
-        sys_base_prefix = pathlib.Path(sys.base_prefix).absolute().resolve()
-        libxgboost_sys = sys_base_prefix / "lib" / _lib_name()
+        sys_prefix = pathlib.Path(sys.prefix).absolute().resolve()
+        libxgboost_sys = sys_prefix / "lib" / _lib_name()
        if not libxgboost_sys.exists():
            raise RuntimeError(
                f"use_system_libxgboost was specified but {_lib_name()} is "
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -2187,25 +2187,20 @@ class Booster:
        base_margin: Any = None,
        strict_shape: bool = False,
    ) -> NumpyOrCupy:
-        """Run prediction in-place when possible, Unlike :py:meth:`predict` method,
-        inplace prediction does not cache the prediction result.
+        """Run prediction in-place. Unlike the :py:meth:`predict` method, inplace prediction
+        does not cache the prediction result.

        Calling only ``inplace_predict`` in multiple threads is safe and lock
        free.  But the safety does not hold when used in conjunction with other
        methods. E.g. you can't train the booster in one thread and perform
        prediction in the other.

-        .. note::
-
-            If the device ordinal of the input data doesn't match the one configured for
-            the booster, data will be copied to the booster device.
-
        .. code-block:: python

-            booster.set_param({"gpu_id": "0", "tree_method": "gpu_hist"})
+            booster.set_param({"predictor": "gpu_predictor"})
            booster.inplace_predict(cupy_array)

-            booster.set_param({"gpu_id": "-1", "tree_method": "hist"})
+            booster.set_param({"predictor": "cpu_predictor"})
            booster.inplace_predict(numpy_array)

        .. versionadded:: 1.1.0
@@ -2213,7 +2208,9 @@ class Booster:
        Parameters
        ----------
        data :
-            The input data.
+            The input data, must not be a view for numpy array.  Set
+            ``predictor`` to ``gpu_predictor`` for running prediction on CuPy
+            array or CuDF DataFrame.
        iteration_range :
            See :py:meth:`predict` for details.
        predict_type :
--- a/python-package/xgboost/libpath.py
+++ b/python-package/xgboost/libpath.py
@@ -27,7 +27,7 @@ def find_lib_path() -> List[str]:
        os.path.join(curr_path, os.path.pardir, os.path.pardir, "lib"),
        # use libxgboost from a system prefix, if available.  This should be the last
        # option.
-        os.path.join(sys.base_prefix, "lib"),
+        os.path.join(sys.prefix, "lib"),
    ]

    if sys.platform == "win32":
@@ -62,8 +62,8 @@ def find_lib_path() -> List[str]:
            + ("\n- ".join(dll_path))
            + "\nXGBoost Python package path: "
            + curr_path
-            + "\nsys.base_prefix: "
-            + sys.base_prefix
+            + "\nsys.prefix: "
+            + sys.prefix
            + "\nSee: "
            + link
            + " for installing XGBoost."
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -277,6 +277,9 @@ __model_doc = f"""
        Device ordinal.
    validate_parameters : Optional[bool]
        Give warnings for unknown parameter.
+    predictor : Optional[str]
+        Force XGBoost to use a specific predictor, available choices are [cpu_predictor,
+        gpu_predictor].
    enable_categorical : bool

        .. versionadded:: 1.5.0
@@ -378,21 +381,17 @@ __model_doc = f"""
          every **early_stopping_rounds** round(s) to continue training.  Requires at
          least one item in **eval_set** in :py:meth:`fit`.

-        - If early stopping occurs, the model will have two additional attributes:
-          :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the
-          :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal
-          number of trees during inference. If users want to access the full model
-          (including trees built after early stopping), they can specify the
-          `iteration_range` in these inference methods. In addition, other utilities
-          like model plotting can also use the entire model.
-
-        - If you prefer to discard the trees after `best_iteration`, consider using the
-          callback function :py:class:`xgboost.callback.EarlyStopping`.
+        - The method returns the model from the last iteration, not the best one, use a
+          callback :py:class:`xgboost.callback.EarlyStopping` if returning the best
+          model is preferred.

        - If there's more than one item in **eval_set**, the last entry will be used for
          early stopping.  If there's more than one metric in **eval_metric**, the last
          metric will be used for early stopping.

+        - If early stopping occurs, the model will have two additional fields:
+          :py:attr:`best_score` and :py:attr:`best_iteration`.
+
        .. note::

            This parameter replaces `early_stopping_rounds` in :py:meth:`fit` method.
@@ -649,6 +648,7 @@ class XGBModel(XGBModelBase):
        importance_type: Optional[str] = None,
        gpu_id: Optional[int] = None,
        validate_parameters: Optional[bool] = None,
+        predictor: Optional[str] = None,
        enable_categorical: bool = False,
        feature_types: Optional[FeatureTypes] = None,
        max_cat_to_onehot: Optional[int] = None,
@@ -695,6 +695,7 @@ class XGBModel(XGBModelBase):
        self.importance_type = importance_type
        self.gpu_id = gpu_id
        self.validate_parameters = validate_parameters
+        self.predictor = predictor
        self.enable_categorical = enable_categorical
        self.feature_types = feature_types
        self.max_cat_to_onehot = max_cat_to_onehot
@@ -930,7 +931,8 @@ class XGBModel(XGBModelBase):
        callbacks = self.callbacks if self.callbacks is not None else callbacks

        tree_method = params.get("tree_method", None)
-        if self.enable_categorical and tree_method == "exact":
+        cat_support = {"gpu_hist", "approx", "hist"}
+        if self.enable_categorical and tree_method not in cat_support:
            raise ValueError(
                "Experimental support for categorical data is not implemented for"
                " current tree method yet."
@@ -1087,7 +1089,12 @@ class XGBModel(XGBModelBase):
            return self

    def _can_use_inplace_predict(self) -> bool:
-        if self.booster != "gblinear":
+        # When predictor is explicitly set, using `inplace_predict` might result in an
+        # error with incompatible data type.
+        # Inplace predict doesn't handle as many data types as DMatrix, but it's
+        # sufficient for dask interface where input is simpler.
+        predictor = self.get_xgb_params().get("predictor", None)
+        if predictor in ("auto", None) and self.booster != "gblinear":
            return True
        return False

@@ -1113,9 +1120,9 @@ class XGBModel(XGBModelBase):
        iteration_range: Optional[Tuple[int, int]] = None,
    ) -> ArrayLike:
        """Predict with `X`.  If the model is trained with early stopping, then
-        :py:attr:`best_iteration` is used automatically. The estimator uses
-        `inplace_predict` by default and falls back to using :py:class:`DMatrix` if
-        devices between the data and the estimator don't match.
+        :py:attr:`best_iteration` is used automatically.  For tree models, when data is
+        on GPU, like cupy array or cuDF dataframe and `predictor` is not specified, the
+        prediction is run on GPU automatically, otherwise it will run on CPU.

        .. note:: This function is only thread safe for `gbtree` and `dart`.

@@ -1577,9 +1584,7 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
    ) -> np.ndarray:
        """Predict the probability of each `X` example being of a given class. If the
        model is trained with early stopping, then :py:attr:`best_iteration` is used
-        automatically. The estimator uses `inplace_predict` by default and falls back to
-        using :py:class:`DMatrix` if devices between the data and the estimator don't
-        match.
+        automatically.

        .. note:: This function is only thread safe for `gbtree` and `dart`.

--- a/python-package/xgboost/testing/init.py
+++ b/python-package/xgboost/testing/init.py
@@ -25,7 +25,6 @@ from typing import (
    Set,
    Tuple,
    TypedDict,
-    TypeVar,
    Union,
 )

@@ -199,14 +198,14 @@ class IteratorForTest(xgb.core.DataIter):
        X: Sequence,
        y: Sequence,
        w: Optional[Sequence],
-        cache: Optional[str],
+        cache: Optional[str] = "./",
    ) -> None:
        assert len(X) == len(y)
        self.X = X
        self.y = y
        self.w = w
        self.it = 0
-        super().__init__(cache_prefix=cache)
+        super().__init__(cache)

    def next(self, input_data: Callable) -> int:
        if self.it == len(self.X):
@@ -348,9 +347,7 @@ class TestDataset:
            if w is not None:
                weight.append(w)

-        it = IteratorForTest(
-            predictor, response, weight if weight else None, cache="cache"
-        )
+        it = IteratorForTest(predictor, response, weight if weight else None)
        return xgb.DMatrix(it)

    def __repr__(self) -> str:
@@ -712,27 +709,6 @@ def predictor_equal(lhs: xgb.DMatrix, rhs: xgb.DMatrix) -> bool:
    )


-M = TypeVar("M", xgb.Booster, xgb.XGBModel)
-
-
-def set_ordinal(ordinal: int, booster: M) -> M:
-    """Temporary solution for setting the device ordinal until we move away from
-    `gpu_id`.
-
-    """
-    if ordinal < 0:
-        params = {"gpu_id": -1, "tree_method": "hist"}
-    else:
-        params = {"gpu_id": ordinal, "tree_method": "gpu_hist"}
-
-    if isinstance(booster, xgb.Booster):
-        booster.set_param(params)
-    elif isinstance(booster, xgb.XGBModel):
-        booster.set_params(**params)
-
-    return booster
-
-
 def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, np.float64]:
    """Evaluation metric for xgb.train"""
    label = dtrain.get_label()
--- a/rabit/include/rabit/internal/io.h
+++ b/rabit/include/rabit/internal/io.h
@@ -1,79 +1,73 @@
-/**
- *  Copyright 2014-2023, XGBoost Contributors
+/*!
+ *  Copyright (c) 2014-2019 by Contributors
 * \file io.h
 * \brief utilities with different serializable implementations
 * \author Tianqi Chen
 */
 #ifndef RABIT_INTERNAL_IO_H_
 #define RABIT_INTERNAL_IO_H_
-
-#include <algorithm>
-#include <cstddef>  // for size_t
 #include <cstdio>
-#include <cstring>  // for memcpy
-#include <limits>
-#include <numeric>
-#include <string>
 #include <vector>
-
+#include <cstring>
+#include <string>
+#include <algorithm>
+#include <numeric>
+#include <limits>
 #include "rabit/internal/utils.h"
 #include "rabit/serializable.h"

-namespace rabit::utils {
+namespace rabit {
+namespace utils {
 /*! \brief re-use definition of dmlc::SeekStream */
 using SeekStream = dmlc::SeekStream;
-/**
- * @brief Fixed size memory buffer as a stream.
- */
+/*! \brief fixed size memory buffer */
 struct MemoryFixSizeBuffer : public SeekStream {
 public:
  // similar to SEEK_END in libc
-  static std::size_t constexpr kSeekEnd = std::numeric_limits<std::size_t>::max();
+  static size_t constexpr kSeekEnd = std::numeric_limits<size_t>::max();

 public:
-  /**
-   * @brief Ctor
-   *
-   * @param p_buffer Pointer to the source buffer with size `buffer_size`.
-   * @param buffer_size Size of the source buffer
-   */
-  MemoryFixSizeBuffer(void *p_buffer, std::size_t buffer_size)
-      : p_buffer_(reinterpret_cast<char *>(p_buffer)), buffer_size_(buffer_size) {}
+  MemoryFixSizeBuffer(void *p_buffer, size_t buffer_size)
+      : p_buffer_(reinterpret_cast<char*>(p_buffer)),
+        buffer_size_(buffer_size) {
+    curr_ptr_ = 0;
+  }
  ~MemoryFixSizeBuffer() override = default;
-
-  std::size_t Read(void *ptr, std::size_t size) override {
-    std::size_t nread = std::min(buffer_size_ - curr_ptr_, size);
+  size_t Read(void *ptr, size_t size) override {
+    size_t nread = std::min(buffer_size_ - curr_ptr_, size);
    if (nread != 0) std::memcpy(ptr, p_buffer_ + curr_ptr_, nread);
    curr_ptr_ += nread;
    return nread;
  }
-  void Write(const void *ptr, std::size_t size) override {
+  void Write(const void *ptr, size_t size) override {
    if (size == 0) return;
-    CHECK_LE(curr_ptr_ + size, buffer_size_);
+    utils::Assert(curr_ptr_ + size <=  buffer_size_,
+                  "write position exceed fixed buffer size");
    std::memcpy(p_buffer_ + curr_ptr_, ptr, size);
    curr_ptr_ += size;
  }
-  void Seek(std::size_t pos) override {
+  void Seek(size_t pos) override {
    if (pos == kSeekEnd) {
      curr_ptr_ = buffer_size_;
    } else {
-      curr_ptr_ = static_cast<std::size_t>(pos);
+      curr_ptr_ = static_cast<size_t>(pos);
    }
  }
-  /**
-   * @brief Current position in the buffer (stream).
-   */
-  std::size_t Tell() override { return curr_ptr_; }
-  [[nodiscard]] virtual bool AtEnd() const { return curr_ptr_ == buffer_size_; }
+  size_t Tell() override {
+    return curr_ptr_;
+  }
+  virtual bool AtEnd() const {
+    return curr_ptr_ == buffer_size_;
+  }

- protected:
+ private:
  /*! \brief in memory buffer */
-  char *p_buffer_{nullptr};
+  char *p_buffer_;
  /*! \brief current pointer */
-  std::size_t buffer_size_{0};
+  size_t buffer_size_;
  /*! \brief current pointer */
-  std::size_t curr_ptr_{0};
-};
+  size_t curr_ptr_;
+};  // class MemoryFixSizeBuffer

 /*! \brief a in memory buffer that can be read and write as stream interface */
 struct MemoryBufferStream : public SeekStream {
@@ -115,5 +109,6 @@ struct MemoryBufferStream : public SeekStream {
  /*! \brief current pointer */
  size_t curr_ptr_;
 };  // class MemoryBufferStream
-}  // namespace rabit::utils
+}  // namespace utils
+}  // namespace rabit
 #endif  // RABIT_INTERNAL_IO_H_
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -1023,6 +1023,7 @@ void InplacePredictImpl(std::shared_ptr<DMatrix> p_m, char const *c_json_config,
                        const float **out_result) {
  xgboost_CHECK_C_ARG_PTR(c_json_config);
  auto config = Json::Load(StringView{c_json_config});
+  CHECK_EQ(get<Integer const>(config["cache_id"]), 0) << "Cache ID is not supported yet";

  HostDeviceVector<float> *p_predt{nullptr};
  auto type = PredictionType(RequiredArg<Integer>(config, "type", __func__));
@@ -1041,7 +1042,6 @@ void InplacePredictImpl(std::shared_ptr<DMatrix> p_m, char const *c_json_config,
  xgboost_CHECK_C_ARG_PTR(out_dim);
  CalcPredictShape(strict_shape, type, n_samples, n_features, chunksize, learner->Groups(),
                   learner->BoostedRounds(), &shape, out_dim);
-  CHECK_GE(p_predt->Size(), n_samples);

  xgboost_CHECK_C_ARG_PTR(out_result);
  xgboost_CHECK_C_ARG_PTR(out_shape);
--- a/src/c_api/c_api.cu
+++ b/src/c_api/c_api.cu
@@ -92,7 +92,7 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data,
  API_END();
 }

-int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
+int InplacePreidctCuda(BoosterHandle handle, char const *c_array_interface,
                       char const *c_json_config, std::shared_ptr<DMatrix> p_m,
                       xgboost::bst_ulong const **out_shape, xgboost::bst_ulong *out_dim,
                       const float **out_result) {
@@ -107,6 +107,7 @@ int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
  proxy->SetCUDAArray(c_array_interface);

  auto config = Json::Load(StringView{c_json_config});
+  CHECK_EQ(get<Integer const>(config["cache_id"]), 0) << "Cache ID is not supported yet";
  auto *learner = static_cast<Learner *>(handle);

  HostDeviceVector<float> *p_predt{nullptr};
@@ -117,13 +118,7 @@ int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
                          RequiredArg<Integer>(config, "iteration_begin", __func__),
                          RequiredArg<Integer>(config, "iteration_end", __func__));
  CHECK(p_predt);
-  if (learner->Ctx()->IsCPU()) {
-    // Prediction using DMatrix as fallback.
-    CHECK(p_predt->HostCanRead() && !p_predt->DeviceCanRead());
-  } else {
-    CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
-  }
-  p_predt->SetDevice(proxy->DeviceIdx());
+  CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());

  auto &shape = learner->GetThreadLocal().prediction_shape;
  size_t n_samples = p_m->Info().num_row_;
@@ -151,7 +146,7 @@ XGB_DLL int XGBoosterPredictFromCudaColumnar(BoosterHandle handle, char const *c
  if (m) {
    p_m = *static_cast<std::shared_ptr<DMatrix> *>(m);
  }
-  return InplacePreidctCUDA(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
+  return InplacePreidctCuda(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
                            out_result);
 }

@@ -164,6 +159,6 @@ XGB_DLL int XGBoosterPredictFromCudaArray(BoosterHandle handle, char const *c_js
    p_m = *static_cast<std::shared_ptr<DMatrix> *>(m);
  }
  xgboost_CHECK_C_ARG_PTR(out_result);
-  return InplacePreidctCUDA(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
+  return InplacePreidctCuda(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
                            out_result);
 }
--- a/src/collective/communicator.cu
+++ b/src/collective/communicator.cu
@@ -29,21 +29,13 @@ DeviceCommunicator* Communicator::GetDevice(int device_ordinal) {
    old_device_ordinal = device_ordinal;
    old_world_size = communicator_->GetWorldSize();
 #ifdef XGBOOST_USE_NCCL
-    switch (type_) {
-      case CommunicatorType::kRabit:
-        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
-        break;
-      case CommunicatorType::kFederated:
-        device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal));
-        break;
-      case CommunicatorType::kInMemory:
-        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, true));
-        break;
-      default:
-        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
+    if (type_ != CommunicatorType::kFederated) {
+      device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, Get()));
+    } else {
+      device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal, Get()));
    }
 #else
-    device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal));
+    device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal, Get()));
 #endif
  }
  return device_communicator_.get();
--- a/src/collective/device_communicator_adapter.cuh
+++ b/src/collective/device_communicator_adapter.cuh
@@ -11,18 +11,21 @@ namespace collective {

 class DeviceCommunicatorAdapter : public DeviceCommunicator {
 public:
-  explicit DeviceCommunicatorAdapter(int device_ordinal)
-      : device_ordinal_{device_ordinal}, world_size_{GetWorldSize()}, rank_{GetRank()} {
+  DeviceCommunicatorAdapter(int device_ordinal, Communicator *communicator)
+      : device_ordinal_{device_ordinal}, communicator_{communicator} {
    if (device_ordinal_ < 0) {
      LOG(FATAL) << "Invalid device ordinal: " << device_ordinal_;
    }
+    if (communicator_ == nullptr) {
+      LOG(FATAL) << "Communicator cannot be null.";
+    }
  }

  ~DeviceCommunicatorAdapter() override = default;

  void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
                 Operation op) override {
-    if (world_size_ == 1) {
+    if (communicator_->GetWorldSize() == 1) {
      return;
    }

@@ -30,34 +33,37 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
    auto size = count * GetTypeSize(data_type);
    host_buffer_.reserve(size);
    dh::safe_cuda(cudaMemcpy(host_buffer_.data(), send_receive_buffer, size, cudaMemcpyDefault));
-    Allreduce(host_buffer_.data(), count, data_type, op);
+    communicator_->AllReduce(host_buffer_.data(), count, data_type, op);
    dh::safe_cuda(cudaMemcpy(send_receive_buffer, host_buffer_.data(), size, cudaMemcpyDefault));
  }

  void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector<std::size_t> *segments,
                  dh::caching_device_vector<char> *receive_buffer) override {
-    if (world_size_ == 1) {
+    if (communicator_->GetWorldSize() == 1) {
      return;
    }

    dh::safe_cuda(cudaSetDevice(device_ordinal_));
+    int const world_size = communicator_->GetWorldSize();
+    int const rank = communicator_->GetRank();

    segments->clear();
-    segments->resize(world_size_, 0);
-    segments->at(rank_) = length_bytes;
-    Allreduce(segments->data(), segments->size(), DataType::kUInt64, Operation::kMax);
+    segments->resize(world_size, 0);
+    segments->at(rank) = length_bytes;
+    communicator_->AllReduce(segments->data(), segments->size(), DataType::kUInt64,
+                             Operation::kMax);
    auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL);
    receive_buffer->resize(total_bytes);

    host_buffer_.reserve(total_bytes);
    size_t offset = 0;
-    for (int32_t i = 0; i < world_size_; ++i) {
+    for (int32_t i = 0; i < world_size; ++i) {
      size_t as_bytes = segments->at(i);
-      if (i == rank_) {
-        dh::safe_cuda(cudaMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank_),
+      if (i == rank) {
+        dh::safe_cuda(cudaMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank),
                                 cudaMemcpyDefault));
      }
-      Broadcast(host_buffer_.data() + offset, as_bytes, i);
+      communicator_->Broadcast(host_buffer_.data() + offset, as_bytes, i);
      offset += as_bytes;
    }
    dh::safe_cuda(cudaMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
@@ -70,8 +76,7 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {

 private:
  int const device_ordinal_;
-  int const world_size_;
-  int const rank_;
+  Communicator *communicator_;
  /// Host buffer used to call communicator functions.
  std::vector<char> host_buffer_{};
 };
--- a/src/collective/nccl_device_communicator.cu
+++ b/src/collective/nccl_device_communicator.cu
@@ -1,225 +0,0 @@
-/*!
- * Copyright 2023 XGBoost contributors
- */
-#if defined(XGBOOST_USE_NCCL)
-#include "nccl_device_communicator.cuh"
-
-namespace xgboost {
-namespace collective {
-
-NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sync)
-    : device_ordinal_{device_ordinal},
-      needs_sync_{needs_sync},
-      world_size_{GetWorldSize()},
-      rank_{GetRank()} {
-  if (device_ordinal_ < 0) {
-    LOG(FATAL) << "Invalid device ordinal: " << device_ordinal_;
-  }
-  if (world_size_ == 1) {
-    return;
-  }
-
-  std::vector<uint64_t> uuids(world_size_ * kUuidLength, 0);
-  auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
-  auto s_this_uuid = s_uuid.subspan(rank_ * kUuidLength, kUuidLength);
-  GetCudaUUID(s_this_uuid);
-
-  // TODO(rongou): replace this with allgather.
-  Allreduce(uuids.data(), uuids.size(), DataType::kUInt64, Operation::kSum);
-
-  std::vector<xgboost::common::Span<uint64_t, kUuidLength>> converted(world_size_);
-  size_t j = 0;
-  for (size_t i = 0; i < uuids.size(); i += kUuidLength) {
-    converted[j] = xgboost::common::Span<uint64_t, kUuidLength>{uuids.data() + i, kUuidLength};
-    j++;
-  }
-
-  auto iter = std::unique(converted.begin(), converted.end());
-  auto n_uniques = std::distance(converted.begin(), iter);
-
-  CHECK_EQ(n_uniques, world_size_)
-      << "Multiple processes within communication group running on same CUDA "
-      << "device is not supported. " << PrintUUID(s_this_uuid) << "\n";
-
-  nccl_unique_id_ = GetUniqueId();
-  dh::safe_cuda(cudaSetDevice(device_ordinal_));
-  dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_));
-  dh::safe_cuda(cudaStreamCreate(&cuda_stream_));
-}
-
-NcclDeviceCommunicator::~NcclDeviceCommunicator() {
-  if (world_size_ == 1) {
-    return;
-  }
-  if (cuda_stream_) {
-    dh::safe_cuda(cudaStreamDestroy(cuda_stream_));
-  }
-  if (nccl_comm_) {
-    dh::safe_nccl(ncclCommDestroy(nccl_comm_));
-  }
-  if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
-    LOG(CONSOLE) << "======== NCCL Statistics========";
-    LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
-    LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_ / 1048576;
-  }
-}
-
-namespace {
-ncclDataType_t GetNcclDataType(DataType const &data_type) {
-  ncclDataType_t result{ncclInt8};
-  switch (data_type) {
-    case DataType::kInt8:
-      result = ncclInt8;
-      break;
-    case DataType::kUInt8:
-      result = ncclUint8;
-      break;
-    case DataType::kInt32:
-      result = ncclInt32;
-      break;
-    case DataType::kUInt32:
-      result = ncclUint32;
-      break;
-    case DataType::kInt64:
-      result = ncclInt64;
-      break;
-    case DataType::kUInt64:
-      result = ncclUint64;
-      break;
-    case DataType::kFloat:
-      result = ncclFloat;
-      break;
-    case DataType::kDouble:
-      result = ncclDouble;
-      break;
-    default:
-      LOG(FATAL) << "Unknown data type.";
-  }
-  return result;
-}
-
-bool IsBitwiseOp(Operation const &op) {
-  return op == Operation::kBitwiseAND || op == Operation::kBitwiseOR ||
-         op == Operation::kBitwiseXOR;
-}
-
-ncclRedOp_t GetNcclRedOp(Operation const &op) {
-  ncclRedOp_t result{ncclMax};
-  switch (op) {
-    case Operation::kMax:
-      result = ncclMax;
-      break;
-    case Operation::kMin:
-      result = ncclMin;
-      break;
-    case Operation::kSum:
-      result = ncclSum;
-      break;
-    default:
-      LOG(FATAL) << "Unsupported reduce operation.";
-  }
-  return result;
-}
-
-template <typename Func>
-void RunBitwiseAllreduce(char *out_buffer, char const *device_buffer, Func func, int world_size,
-                         std::size_t size, cudaStream_t stream) {
-  dh::LaunchN(size, stream, [=] __device__(std::size_t idx) {
-    auto result = device_buffer[idx];
-    for (auto rank = 1; rank < world_size; rank++) {
-      result = func(result, device_buffer[rank * size + idx]);
-    }
-    out_buffer[idx] = result;
-  });
-}
-}  // anonymous namespace
-
-void NcclDeviceCommunicator::BitwiseAllReduce(void *send_receive_buffer, std::size_t count,
-                                              DataType data_type, Operation op) {
-  auto const size = count * GetTypeSize(data_type);
-  dh::caching_device_vector<char> buffer(size * world_size_);
-  auto *device_buffer = buffer.data().get();
-
-  // First gather data from all the workers.
-  dh::safe_nccl(ncclAllGather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type),
-                              nccl_comm_, cuda_stream_));
-  if (needs_sync_) {
-    dh::safe_cuda(cudaStreamSynchronize(cuda_stream_));
-  }
-
-  // Then reduce locally.
-  auto *out_buffer = static_cast<char *>(send_receive_buffer);
-  switch (op) {
-    case Operation::kBitwiseAND:
-      RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_and<char>(), world_size_, size,
-                          cuda_stream_);
-      break;
-    case Operation::kBitwiseOR:
-      RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_or<char>(), world_size_, size,
-                          cuda_stream_);
-      break;
-    case Operation::kBitwiseXOR:
-      RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_xor<char>(), world_size_, size,
-                          cuda_stream_);
-      break;
-    default:
-      LOG(FATAL) << "Not a bitwise reduce operation.";
-  }
-}
-
-void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t count,
-                                       DataType data_type, Operation op) {
-  if (world_size_ == 1) {
-    return;
-  }
-
-  dh::safe_cuda(cudaSetDevice(device_ordinal_));
-  if (IsBitwiseOp(op)) {
-    BitwiseAllReduce(send_receive_buffer, count, data_type, op);
-  } else {
-    dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count,
-                                GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_,
-                                cuda_stream_));
-  }
-  allreduce_bytes_ += count * GetTypeSize(data_type);
-  allreduce_calls_ += 1;
-}
-
-void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_bytes,
-                                        std::vector<std::size_t> *segments,
-                                        dh::caching_device_vector<char> *receive_buffer) {
-  if (world_size_ == 1) {
-    return;
-  }
-
-  dh::safe_cuda(cudaSetDevice(device_ordinal_));
-
-  segments->clear();
-  segments->resize(world_size_, 0);
-  segments->at(rank_) = length_bytes;
-  Allreduce(segments->data(), segments->size(), DataType::kUInt64, Operation::kMax);
-  auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL);
-  receive_buffer->resize(total_bytes);
-
-  size_t offset = 0;
-  dh::safe_nccl(ncclGroupStart());
-  for (int32_t i = 0; i < world_size_; ++i) {
-    size_t as_bytes = segments->at(i);
-    dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
-                                ncclChar, i, nccl_comm_, cuda_stream_));
-    offset += as_bytes;
-  }
-  dh::safe_nccl(ncclGroupEnd());
-}
-
-void NcclDeviceCommunicator::Synchronize() {
-  if (world_size_ == 1) {
-    return;
-  }
-  dh::safe_cuda(cudaSetDevice(device_ordinal_));
-  dh::safe_cuda(cudaStreamSynchronize(cuda_stream_));
-}
-
-}  // namespace collective
-}  // namespace xgboost
-#endif
--- a/src/collective/nccl_device_communicator.cuh
+++ b/src/collective/nccl_device_communicator.cuh
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2022-2023 XGBoost contributors
+ * Copyright 2022 XGBoost contributors
 */
 #pragma once

@@ -12,26 +12,116 @@ namespace collective {

 class NcclDeviceCommunicator : public DeviceCommunicator {
 public:
-  /**
-   * @brief Construct a new NCCL communicator.
-   * @param device_ordinal The GPU device id.
-   * @param needs_sync Whether extra CUDA stream synchronization is needed.
-   *
-   * In multi-GPU tests when multiple NCCL communicators are created in the same process, sometimes
-   * a deadlock happens because NCCL kernels are blocking. The extra CUDA stream synchronization
-   * makes sure that the NCCL kernels are caught up, thus avoiding the deadlock.
-   *
-   * The Rabit communicator runs with one process per GPU, so the additional synchronization is not
-   * needed. The in-memory communicator is used in tests with multiple threads, each thread
-   * representing a rank/worker, so the additional synchronization is needed to avoid deadlocks.
-   */
-  explicit NcclDeviceCommunicator(int device_ordinal, bool needs_sync);
-  ~NcclDeviceCommunicator() override;
+  NcclDeviceCommunicator(int device_ordinal, Communicator *communicator)
+      : device_ordinal_{device_ordinal}, communicator_{communicator} {
+    if (device_ordinal_ < 0) {
+      LOG(FATAL) << "Invalid device ordinal: " << device_ordinal_;
+    }
+    if (communicator_ == nullptr) {
+      LOG(FATAL) << "Communicator cannot be null.";
+    }
+
+    int32_t const rank = communicator_->GetRank();
+    int32_t const world = communicator_->GetWorldSize();
+
+    if (world == 1) {
+      return;
+    }
+
+    std::vector<uint64_t> uuids(world * kUuidLength, 0);
+    auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
+    auto s_this_uuid = s_uuid.subspan(rank * kUuidLength, kUuidLength);
+    GetCudaUUID(s_this_uuid);
+
+    // TODO(rongou): replace this with allgather.
+    communicator_->AllReduce(uuids.data(), uuids.size(), DataType::kUInt64, Operation::kSum);
+
+    std::vector<xgboost::common::Span<uint64_t, kUuidLength>> converted(world);
+    size_t j = 0;
+    for (size_t i = 0; i < uuids.size(); i += kUuidLength) {
+      converted[j] = xgboost::common::Span<uint64_t, kUuidLength>{uuids.data() + i, kUuidLength};
+      j++;
+    }
+
+    auto iter = std::unique(converted.begin(), converted.end());
+    auto n_uniques = std::distance(converted.begin(), iter);
+
+    CHECK_EQ(n_uniques, world)
+        << "Multiple processes within communication group running on same CUDA "
+        << "device is not supported. " << PrintUUID(s_this_uuid) << "\n";
+
+    nccl_unique_id_ = GetUniqueId();
+    dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world, nccl_unique_id_, rank));
+    dh::safe_cuda(cudaStreamCreate(&cuda_stream_));
+  }
+
+  ~NcclDeviceCommunicator() override {
+    if (communicator_->GetWorldSize() == 1) {
+      return;
+    }
+    if (cuda_stream_) {
+      dh::safe_cuda(cudaStreamDestroy(cuda_stream_));
+    }
+    if (nccl_comm_) {
+      dh::safe_nccl(ncclCommDestroy(nccl_comm_));
+    }
+    if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
+      LOG(CONSOLE) << "======== NCCL Statistics========";
+      LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
+      LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_ / 1048576;
+    }
+  }
+
  void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
-                 Operation op) override;
+                 Operation op) override {
+    if (communicator_->GetWorldSize() == 1) {
+      return;
+    }
+
+    dh::safe_cuda(cudaSetDevice(device_ordinal_));
+    dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count,
+                                GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_,
+                                cuda_stream_));
+    allreduce_bytes_ += count * GetTypeSize(data_type);
+    allreduce_calls_ += 1;
+  }
+
  void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector<std::size_t> *segments,
-                  dh::caching_device_vector<char> *receive_buffer) override;
-  void Synchronize() override;
+                  dh::caching_device_vector<char> *receive_buffer) override {
+    if (communicator_->GetWorldSize() == 1) {
+      return;
+    }
+
+    dh::safe_cuda(cudaSetDevice(device_ordinal_));
+    int const world_size = communicator_->GetWorldSize();
+    int const rank = communicator_->GetRank();
+
+    segments->clear();
+    segments->resize(world_size, 0);
+    segments->at(rank) = length_bytes;
+    communicator_->AllReduce(segments->data(), segments->size(), DataType::kUInt64,
+                             Operation::kMax);
+    auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL);
+    receive_buffer->resize(total_bytes);
+
+    size_t offset = 0;
+    dh::safe_nccl(ncclGroupStart());
+    for (int32_t i = 0; i < world_size; ++i) {
+      size_t as_bytes = segments->at(i);
+      dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
+                                  ncclChar, i, nccl_comm_, cuda_stream_));
+      offset += as_bytes;
+    }
+    dh::safe_nccl(ncclGroupEnd());
+  }
+
+  void Synchronize() override {
+    if (communicator_->GetWorldSize() == 1) {
+      return;
+    }
+    dh::safe_cuda(cudaSetDevice(device_ordinal_));
+    dh::safe_cuda(cudaStreamSynchronize(cuda_stream_));
+  }

 private:
  static constexpr std::size_t kUuidLength =
@@ -62,20 +152,71 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
  ncclUniqueId GetUniqueId() {
    static const int kRootRank = 0;
    ncclUniqueId id;
-    if (rank_ == kRootRank) {
+    if (communicator_->GetRank() == kRootRank) {
      dh::safe_nccl(ncclGetUniqueId(&id));
    }
-    Broadcast(static_cast<void *>(&id), sizeof(ncclUniqueId), static_cast<int>(kRootRank));
+    communicator_->Broadcast(static_cast<void *>(&id), sizeof(ncclUniqueId),
+                             static_cast<int>(kRootRank));
    return id;
  }

-  void BitwiseAllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
-                        Operation op);
+  static ncclDataType_t GetNcclDataType(DataType const &data_type) {
+    ncclDataType_t result;
+    switch (data_type) {
+      case DataType::kInt8:
+        result = ncclInt8;
+        break;
+      case DataType::kUInt8:
+        result = ncclUint8;
+        break;
+      case DataType::kInt32:
+        result = ncclInt32;
+        break;
+      case DataType::kUInt32:
+        result = ncclUint32;
+        break;
+      case DataType::kInt64:
+        result = ncclInt64;
+        break;
+      case DataType::kUInt64:
+        result = ncclUint64;
+        break;
+      case DataType::kFloat:
+        result = ncclFloat;
+        break;
+      case DataType::kDouble:
+        result = ncclDouble;
+        break;
+      default:
+        LOG(FATAL) << "Unknown data type.";
+    }
+    return result;
+  }
+
+  static ncclRedOp_t GetNcclRedOp(Operation const &op) {
+    ncclRedOp_t result;
+    switch (op) {
+      case Operation::kMax:
+        result = ncclMax;
+        break;
+      case Operation::kMin:
+        result = ncclMin;
+        break;
+      case Operation::kSum:
+        result = ncclSum;
+        break;
+      case Operation::kBitwiseAND:
+      case Operation::kBitwiseOR:
+      case Operation::kBitwiseXOR:
+        LOG(FATAL) << "Not implemented yet.";
+      default:
+        LOG(FATAL) << "Unknown reduce operation.";
+    }
+    return result;
+  }

  int const device_ordinal_;
-  bool const needs_sync_;
-  int const world_size_;
-  int const rank_;
+  Communicator *communicator_;
  ncclComm_t nccl_comm_{};
  cudaStream_t cuda_stream_{};
  ncclUniqueId nccl_unique_id_{};
--- a/src/common/bitfield.h
+++ b/src/common/bitfield.h
@@ -1,5 +1,5 @@
-/**
- * Copyright 2019-2023, XGBoost Contributors
+/*!
+ * Copyright 2019 by Contributors
 * \file bitfield.h
 */
 #ifndef XGBOOST_COMMON_BITFIELD_H_
@@ -50,17 +50,14 @@ __forceinline__ __device__ BitFieldAtomicType AtomicAnd(BitFieldAtomicType* addr
 }
 #endif  // defined(__CUDACC__)

-/**
- * @brief A non-owning type with auxiliary methods defined for manipulating bits.
+/*!
+ * \brief A non-owning type with auxiliary methods defined for manipulating bits.
 *
- * @tparam VT        Underlying value type, must be an unsigned integer.
- * @tparam Direction Whether the bits start from left or from right.
- * @tparam IsConst   Whether the view is const.
+ * \tparam Direction Whether the bits start from left or from right.
 */
 template <typename VT, typename Direction, bool IsConst = false>
 struct BitFieldContainer {
  using value_type = std::conditional_t<IsConst, VT const, VT>;  // NOLINT
-  using size_type = size_t;                                      // NOLINT
  using index_type = size_t;                                     // NOLINT
  using pointer = value_type*;                                   // NOLINT

@@ -73,9 +70,8 @@ struct BitFieldContainer {
  };

 private:
-  value_type* bits_{nullptr};
-  size_type n_values_{0};
-  static_assert(!std::is_signed<VT>::value, "Must use an unsiged type as the underlying storage.");
+  common::Span<value_type> bits_;
+  static_assert(!std::is_signed<VT>::value, "Must use unsiged type as underlying storage.");

 public:
  XGBOOST_DEVICE static Pos ToBitPos(index_type pos) {
@@ -90,15 +86,13 @@ struct BitFieldContainer {

 public:
  BitFieldContainer() = default;
-  XGBOOST_DEVICE explicit BitFieldContainer(common::Span<value_type> bits)
-      : bits_{bits.data()}, n_values_{bits.size()} {}
-  BitFieldContainer(BitFieldContainer const& other) = default;
-  BitFieldContainer(BitFieldContainer&& other) = default;
+  XGBOOST_DEVICE explicit BitFieldContainer(common::Span<value_type> bits) : bits_{bits} {}
+  XGBOOST_DEVICE BitFieldContainer(BitFieldContainer const& other) : bits_{other.bits_} {}
  BitFieldContainer &operator=(BitFieldContainer const &that) = default;
  BitFieldContainer &operator=(BitFieldContainer &&that) = default;

-  XGBOOST_DEVICE auto Bits() { return common::Span<value_type>{bits_, NumValues()}; }
-  XGBOOST_DEVICE auto Bits() const { return common::Span<value_type const>{bits_, NumValues()}; }
+  XGBOOST_DEVICE common::Span<value_type>       Bits()       { return bits_; }
+  XGBOOST_DEVICE common::Span<value_type const> Bits() const { return bits_; }

  /*\brief Compute the size of needed memory allocation.  The returned value is in terms
   *       of number of elements with `BitFieldContainer::value_type'.
@@ -109,17 +103,17 @@ struct BitFieldContainer {
 #if defined(__CUDA_ARCH__)
  __device__ BitFieldContainer& operator|=(BitFieldContainer const& rhs) {
    auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-    size_t min_size = min(NumValues(), rhs.NumValues());
+    size_t min_size = min(bits_.size(), rhs.bits_.size());
    if (tid < min_size) {
-      Data()[tid] |= rhs.Data()[tid];
+      bits_[tid] |= rhs.bits_[tid];
    }
    return *this;
  }
 #else
  BitFieldContainer& operator|=(BitFieldContainer const& rhs) {
-    size_t min_size = std::min(NumValues(), rhs.NumValues());
+    size_t min_size = std::min(bits_.size(), rhs.bits_.size());
    for (size_t i = 0; i < min_size; ++i) {
-      Data()[i] |= rhs.Data()[i];
+      bits_[i] |= rhs.bits_[i];
    }
    return *this;
  }
@@ -127,85 +121,75 @@ struct BitFieldContainer {

 #if defined(__CUDA_ARCH__)
  __device__ BitFieldContainer& operator&=(BitFieldContainer const& rhs) {
-    size_t min_size = min(NumValues(), rhs.NumValues());
+    size_t min_size = min(bits_.size(), rhs.bits_.size());
    auto tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < min_size) {
-      Data()[tid] &= rhs.Data()[tid];
+      bits_[tid] &= rhs.bits_[tid];
    }
    return *this;
  }
 #else
  BitFieldContainer& operator&=(BitFieldContainer const& rhs) {
-    size_t min_size = std::min(NumValues(), rhs.NumValues());
+    size_t min_size = std::min(bits_.size(), rhs.bits_.size());
    for (size_t i = 0; i < min_size; ++i) {
-      Data()[i] &= rhs.Data()[i];
+      bits_[i] &= rhs.bits_[i];
    }
    return *this;
  }
 #endif  // defined(__CUDA_ARCH__)

 #if defined(__CUDA_ARCH__)
-  __device__ auto Set(index_type pos) noexcept(true) {
+  __device__ auto Set(index_type pos) {
    Pos pos_v = Direction::Shift(ToBitPos(pos));
-    value_type& value = Data()[pos_v.int_pos];
+    value_type& value = bits_[pos_v.int_pos];
    value_type set_bit = kOne << pos_v.bit_pos;
    using Type = typename dh::detail::AtomicDispatcher<sizeof(value_type)>::Type;
    atomicOr(reinterpret_cast<Type *>(&value), set_bit);
  }
-  __device__ void Clear(index_type pos) noexcept(true) {
+  __device__ void Clear(index_type pos) {
    Pos pos_v = Direction::Shift(ToBitPos(pos));
-    value_type& value = Data()[pos_v.int_pos];
+    value_type& value = bits_[pos_v.int_pos];
    value_type clear_bit = ~(kOne << pos_v.bit_pos);
    using Type = typename dh::detail::AtomicDispatcher<sizeof(value_type)>::Type;
    atomicAnd(reinterpret_cast<Type *>(&value), clear_bit);
  }
 #else
-  void Set(index_type pos) noexcept(true) {
+  void Set(index_type pos) {
    Pos pos_v = Direction::Shift(ToBitPos(pos));
-    value_type& value = Data()[pos_v.int_pos];
+    value_type& value = bits_[pos_v.int_pos];
    value_type set_bit = kOne << pos_v.bit_pos;
    value |= set_bit;
  }
-  void Clear(index_type pos) noexcept(true) {
+  void Clear(index_type pos) {
    Pos pos_v = Direction::Shift(ToBitPos(pos));
-    value_type& value = Data()[pos_v.int_pos];
+    value_type& value = bits_[pos_v.int_pos];
    value_type clear_bit = ~(kOne << pos_v.bit_pos);
    value &= clear_bit;
  }
 #endif  // defined(__CUDA_ARCH__)

-  XGBOOST_DEVICE bool Check(Pos pos_v) const noexcept(true) {
+  XGBOOST_DEVICE bool Check(Pos pos_v) const {
    pos_v = Direction::Shift(pos_v);
-    assert(pos_v.int_pos < NumValues());
-    value_type const value = Data()[pos_v.int_pos];
+    SPAN_LT(pos_v.int_pos, bits_.size());
+    value_type const value = bits_[pos_v.int_pos];
    value_type const test_bit = kOne << pos_v.bit_pos;
    value_type result = test_bit & value;
    return static_cast<bool>(result);
  }
-  [[nodiscard]] XGBOOST_DEVICE bool Check(index_type pos) const noexcept(true) {
+  XGBOOST_DEVICE bool Check(index_type pos) const {
    Pos pos_v = ToBitPos(pos);
    return Check(pos_v);
  }
-  /**
-   * @brief Returns the total number of bits that can be viewed. This is equal to or
-   *        larger than the acutal number of valid bits.
-   */
-  [[nodiscard]] XGBOOST_DEVICE size_type Capacity() const noexcept(true) {
-    return kValueSize * NumValues();
-  }
-  /**
-   * @brief Number of storage unit used in this bit field.
-   */
-  [[nodiscard]] XGBOOST_DEVICE size_type NumValues() const noexcept(true) { return n_values_; }

-  XGBOOST_DEVICE pointer Data() const noexcept(true) { return bits_; }
+  XGBOOST_DEVICE size_t Size() const { return kValueSize * bits_.size(); }

-  inline friend std::ostream& operator<<(std::ostream& os,
-                                         BitFieldContainer<VT, Direction, IsConst> field) {
-    os << "Bits "
-       << "storage size: " << field.NumValues() << "\n";
-    for (typename common::Span<value_type>::index_type i = 0; i < field.NumValues(); ++i) {
-      std::bitset<BitFieldContainer<VT, Direction, IsConst>::kValueSize> bset(field.Data()[i]);
+  XGBOOST_DEVICE pointer Data() const { return bits_.data(); }
+
+  inline friend std::ostream &
+  operator<<(std::ostream &os, BitFieldContainer<VT, Direction, IsConst> field) {
+    os << "Bits " << "storage size: " << field.bits_.size() << "\n";
+    for (typename common::Span<value_type>::index_type i = 0; i < field.bits_.size(); ++i) {
+      std::bitset<BitFieldContainer<VT, Direction, IsConst>::kValueSize> bset(field.bits_[i]);
      os << bset << "\n";
    }
    return os;
--- a/src/common/categorical.h
+++ b/src/common/categorical.h
@@ -1,5 +1,5 @@
-/**
- * Copyright 2020-2023, XGBoost Contributors
+/*!
+ * Copyright 2020-2022 by XGBoost Contributors
 * \file categorical.h
 */
 #ifndef XGBOOST_COMMON_CATEGORICAL_H_
@@ -10,6 +10,7 @@
 #include "bitfield.h"
 #include "xgboost/base.h"
 #include "xgboost/data.h"
+#include "xgboost/parameter.h"
 #include "xgboost/span.h"

 namespace xgboost {
--- a/src/common/column_matrix.cc
+++ b/src/common/column_matrix.cc
@@ -1,27 +1,16 @@
-/**
- * Copyright 2017-2023, XGBoost Contributors
+/*!
+ * Copyright 2017-2022 by XGBoost Contributors
 * \brief Utility for fast column-wise access
 */
 #include "column_matrix.h"

-#include <algorithm>    // for transform
-#include <cstddef>      // for size_t
-#include <cstdint>      // for uint64_t, uint8_t
-#include <limits>       // for numeric_limits
-#include <type_traits>  // for remove_reference_t
-#include <vector>       // for vector
-
-#include "../data/gradient_index.h"  // for GHistIndexMatrix
-#include "io.h"                      // for AlignedResourceReadStream, AlignedFileWriteStream
-#include "xgboost/base.h"            // for bst_feaature_t
-#include "xgboost/span.h"            // for Span
-
-namespace xgboost::common {
+namespace xgboost {
+namespace common {
 void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_threshold) {
  auto const nfeature = gmat.Features();
  const size_t nrow = gmat.Size();
  // identify type of each column
-  type_ = common::MakeFixedVecWithMalloc(nfeature, ColumnType{});
+  type_.resize(nfeature);

  uint32_t max_val = std::numeric_limits<uint32_t>::max();
  for (bst_feature_t fid = 0; fid < nfeature; ++fid) {
@@ -45,7 +34,7 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres

  // want to compute storage boundary for each feature
  // using variants of prefix sum scan
-  feature_offsets_ = common::MakeFixedVecWithMalloc(nfeature + 1, std::size_t{0});
+  feature_offsets_.resize(nfeature + 1);
  size_t accum_index = 0;
  feature_offsets_[0] = accum_index;
  for (bst_feature_t fid = 1; fid < nfeature + 1; ++fid) {
@@ -60,11 +49,9 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
  SetTypeSize(gmat.MaxNumBinPerFeat());
  auto storage_size =
      feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
-
-  index_ = common::MakeFixedVecWithMalloc(storage_size, std::uint8_t{0});
-
+  index_.resize(storage_size, 0);
  if (!all_dense_column) {
-    row_ind_ = common::MakeFixedVecWithMalloc(feature_offsets_[nfeature], std::size_t{0});
+    row_ind_.resize(feature_offsets_[nfeature]);
  }

  // store least bin id for each feature
@@ -72,51 +59,7 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres

  any_missing_ = !gmat.IsDense();

-  missing_ = MissingIndicator{0, false};
+  missing_flags_.clear();
 }
-
-// IO procedures for external memory.
-bool ColumnMatrix::Read(AlignedResourceReadStream* fi, uint32_t const* index_base) {
-  if (!common::ReadVec(fi, &index_)) {
-    return false;
-  }
-  if (!common::ReadVec(fi, &type_)) {
-    return false;
-  }
-  if (!common::ReadVec(fi, &row_ind_)) {
-    return false;
-  }
-  if (!common::ReadVec(fi, &feature_offsets_)) {
-    return false;
-  }
-
-  if (!common::ReadVec(fi, &missing_.storage)) {
-    return false;
-  }
-  missing_.InitView();
-
-  index_base_ = index_base;
-  if (!fi->Read(&bins_type_size_)) {
-    return false;
-  }
-  if (!fi->Read(&any_missing_)) {
-    return false;
-  }
-  return true;
-}
-
-std::size_t ColumnMatrix::Write(AlignedFileWriteStream* fo) const {
-  std::size_t bytes{0};
-
-  bytes += common::WriteVec(fo, index_);
-  bytes += common::WriteVec(fo, type_);
-  bytes += common::WriteVec(fo, row_ind_);
-  bytes += common::WriteVec(fo, feature_offsets_);
-  bytes += common::WriteVec(fo, missing_.storage);
-
-  bytes += fo->Write(bins_type_size_);
-  bytes += fo->Write(any_missing_);
-
-  return bytes;
-}
-}  // namespace xgboost::common
+}  // namespace common
+}  // namespace xgboost
--- a/src/common/column_matrix.h
+++ b/src/common/column_matrix.h
@@ -1,5 +1,5 @@
-/**
- * Copyright 2017-2023, XGBoost Contributors
+/*!
+ * Copyright 2017-2022 by Contributors
 * \file column_matrix.h
 * \brief Utility for fast column-wise access
 * \author Philip Cho
@@ -8,30 +8,25 @@
 #ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
 #define XGBOOST_COMMON_COLUMN_MATRIX_H_

+#include <dmlc/endian.h>
+
 #include <algorithm>
-#include <cstddef>  // for size_t
-#include <cstdint>  // for uint8_t
 #include <limits>
 #include <memory>
-#include <utility>  // for move
+#include <utility>  // std::move
 #include <vector>

 #include "../data/adapter.h"
 #include "../data/gradient_index.h"
 #include "algorithm.h"
-#include "bitfield.h"  // for RBitField8
 #include "hist_util.h"
-#include "ref_resource_view.h"  // for RefResourceView
-#include "xgboost/base.h"       // for bst_bin_t
-#include "xgboost/span.h"       // for Span

-namespace xgboost::common {
+namespace xgboost {
+namespace common {
+
 class ColumnMatrix;
-class AlignedFileWriteStream;
-class AlignedResourceReadStream;
-
 /*! \brief column type */
-enum ColumnType : std::uint8_t { kDenseColumn, kSparseColumn };
+enum ColumnType : uint8_t { kDenseColumn, kSparseColumn };

 /*! \brief a column storage, to be used with ApplySplit. Note that each
    bin id is stored as index[i] + index_base.
@@ -46,12 +41,12 @@ class Column {
      : index_(index), index_base_(least_bin_idx) {}
  virtual ~Column() = default;

-  [[nodiscard]] bst_bin_t GetGlobalBinIdx(size_t idx) const {
+  bst_bin_t GetGlobalBinIdx(size_t idx) const {
    return index_base_ + static_cast<bst_bin_t>(index_[idx]);
  }

  /* returns number of elements in column */
-  [[nodiscard]] size_t Size() const { return index_.size(); }
+  size_t Size() const { return index_.size(); }

 private:
  /* bin indexes in range [0, max_bins - 1] */
@@ -68,7 +63,7 @@ class SparseColumnIter : public Column<BinIdxT> {
  common::Span<const size_t> row_ind_;
  size_t idx_;

-  [[nodiscard]] size_t const* RowIndices() const { return row_ind_.data(); }
+  size_t const* RowIndices() const { return row_ind_.data(); }

 public:
  SparseColumnIter(common::Span<const BinIdxT> index, bst_bin_t least_bin_idx,
@@ -86,7 +81,7 @@ class SparseColumnIter : public Column<BinIdxT> {
  SparseColumnIter(SparseColumnIter const&) = delete;
  SparseColumnIter(SparseColumnIter&&) = default;

-  [[nodiscard]] size_t GetRowIdx(size_t idx) const { return RowIndices()[idx]; }
+  size_t GetRowIdx(size_t idx) const { return RowIndices()[idx]; }
  bst_bin_t operator[](size_t rid) {
    const size_t column_size = this->Size();
    if (!((idx_) < column_size)) {
@@ -106,10 +101,6 @@ class SparseColumnIter : public Column<BinIdxT> {
  }
 };

-/**
- * @brief Column stored as a dense vector. It might still contain missing values as
- *        indicated by the missing flags.
- */
 template <typename BinIdxT, bool any_missing>
 class DenseColumnIter : public Column<BinIdxT> {
 public:
@@ -118,19 +109,17 @@ class DenseColumnIter : public Column<BinIdxT> {
 private:
  using Base = Column<BinIdxT>;
  /* flags for missing values in dense columns */
-  LBitField32 missing_flags_;
+  std::vector<ByteType> const& missing_flags_;
  size_t feature_offset_;

 public:
  explicit DenseColumnIter(common::Span<const BinIdxT> index, bst_bin_t index_base,
-                           LBitField32 missing_flags, size_t feature_offset)
+                           std::vector<ByteType> const& missing_flags, size_t feature_offset)
      : Base{index, index_base}, missing_flags_{missing_flags}, feature_offset_{feature_offset} {}
  DenseColumnIter(DenseColumnIter const&) = delete;
  DenseColumnIter(DenseColumnIter&&) = default;

-  [[nodiscard]] bool IsMissing(size_t ridx) const {
-    return missing_flags_.Check(feature_offset_ + ridx);
-  }
+  bool IsMissing(size_t ridx) const { return missing_flags_[feature_offset_ + ridx]; }

  bst_bin_t operator[](size_t ridx) const {
    if (any_missing) {
@@ -142,54 +131,12 @@ class DenseColumnIter : public Column<BinIdxT> {
 };

 /**
- * @brief Column major matrix for gradient index on CPU.
- *
- *    This matrix contains both dense columns and sparse columns, the type of the column
- *    is controlled by the sparse threshold parameter. When the number of missing values
- *    in a column is below the threshold it's classified as dense column.
+ * \brief Column major matrix for gradient index. This matrix contains both dense column
+ * and sparse column, the type of the column is controlled by sparse threshold. When the
+ * number of missing values in a column is below the threshold it's classified as dense
+ * column.
 */
 class ColumnMatrix {
-  /**
-   * @brief A bit set for indicating whether an element in a dense column is missing.
-   */
-  struct MissingIndicator {
-    LBitField32 missing;
-    RefResourceView<std::uint32_t> storage;
-
-    MissingIndicator() = default;
-    /**
-     * @param n_elements Size of the bit set
-     * @param init       Initialize the indicator to true or false.
-     */
-    MissingIndicator(std::size_t n_elements, bool init) {
-      auto m_size = missing.ComputeStorageSize(n_elements);
-      storage = common::MakeFixedVecWithMalloc(m_size, init ? ~std::uint32_t{0} : std::uint32_t{0});
-      this->InitView();
-    }
-    /** @brief Set the i^th element to be a valid element (instead of missing). */
-    void SetValid(typename LBitField32::index_type i) { missing.Clear(i); }
-    /** @brief assign the storage to the view. */
-    void InitView() {
-      missing = LBitField32{Span{storage.data(), storage.size()}};
-    }
-
-    void GrowTo(std::size_t n_elements, bool init) {
-      CHECK(storage.Resource()->Type() == ResourceHandler::kMalloc)
-          << "[Internal Error]: Cannot grow the vector when external memory is used.";
-      auto m_size = missing.ComputeStorageSize(n_elements);
-      CHECK_GE(m_size, storage.size());
-      if (m_size == storage.size()) {
-        return;
-      }
-
-      auto new_storage =
-          common::MakeFixedVecWithMalloc(m_size, init ? ~std::uint32_t{0} : std::uint32_t{0});
-      std::copy_n(storage.cbegin(), storage.size(), new_storage.begin());
-      storage = std::move(new_storage);
-      this->InitView();
-    }
-  };
-
  void InitStorage(GHistIndexMatrix const& gmat, double sparse_threshold);

  template <typename ColumnBinT, typename BinT, typename RIdx>
@@ -197,10 +144,9 @@ class ColumnMatrix {
    if (type_[fid] == kDenseColumn) {
      ColumnBinT* begin = &local_index[feature_offsets_[fid]];
      begin[rid] = bin_id - index_base_[fid];
-      // not thread-safe with bit field.
-      // FIXME(jiamingy): We can directly assign kMissingId to the index to avoid missing
-      // flags.
-      missing_.SetValid(feature_offsets_[fid] + rid);
+      // not thread-safe with bool vector.  FIXME(jiamingy): We can directly assign
+      // kMissingId to the index to avoid missing flags.
+      missing_flags_[feature_offsets_[fid] + rid] = false;
    } else {
      ColumnBinT* begin = &local_index[feature_offsets_[fid]];
      begin[num_nonzeros_[fid]] = bin_id - index_base_[fid];
@@ -212,9 +158,7 @@ class ColumnMatrix {
 public:
  using ByteType = bool;
  // get number of features
-  [[nodiscard]] bst_feature_t GetNumFeature() const {
-    return static_cast<bst_feature_t>(type_.size());
-  }
+  bst_feature_t GetNumFeature() const { return static_cast<bst_feature_t>(type_.size()); }

  ColumnMatrix() = default;
  ColumnMatrix(GHistIndexMatrix const& gmat, double sparse_threshold) {
@@ -222,7 +166,7 @@ class ColumnMatrix {
  }

  /**
-   * @brief Initialize ColumnMatrix from GHistIndexMatrix with reference to the original
+   * \brief Initialize ColumnMatrix from GHistIndexMatrix with reference to the original
   *        SparsePage.
   */
  void InitFromSparse(SparsePage const& page, const GHistIndexMatrix& gmat, double sparse_threshold,
@@ -234,8 +178,8 @@ class ColumnMatrix {
  }

  /**
-   * @brief Initialize ColumnMatrix from GHistIndexMatrix without reference to actual
-   *        data.
+   * \brief Initialize ColumnMatrix from GHistIndexMatrix without reference to actual
+   * data.
   *
   *    This function requires a binary search for each bin to get back the feature index
   *    for those bins.
@@ -255,7 +199,7 @@ class ColumnMatrix {
    }
  }

-  [[nodiscard]] bool IsInitialized() const { return !type_.empty(); }
+  bool IsInitialized() const { return !type_.empty(); }

  /**
   * \brief Push batch of data for Quantile DMatrix support.
@@ -313,7 +257,7 @@ class ColumnMatrix {
        reinterpret_cast<const BinIdxType*>(&index_[feature_offset * bins_type_size_]),
        column_size};
    return std::move(DenseColumnIter<BinIdxType, any_missing>{
-        bin_index, static_cast<bst_bin_t>(index_base_[fidx]), missing_.missing, feature_offset});
+        bin_index, static_cast<bst_bin_t>(index_base_[fidx]), missing_flags_, feature_offset});
  }

  // all columns are dense column and has no missing value
@@ -321,8 +265,7 @@ class ColumnMatrix {
  template <typename RowBinIdxT>
  void SetIndexNoMissing(bst_row_t base_rowid, RowBinIdxT const* row_index, const size_t n_samples,
                         const size_t n_features, int32_t n_threads) {
-    missing_.GrowTo(feature_offsets_[n_features], false);
-
+    missing_flags_.resize(feature_offsets_[n_features], false);
    DispatchBinType(bins_type_size_, [&](auto t) {
      using ColumnBinT = decltype(t);
      auto column_index = Span<ColumnBinT>{reinterpret_cast<ColumnBinT*>(index_.data()),
@@ -347,15 +290,9 @@ class ColumnMatrix {
  void SetIndexMixedColumns(size_t base_rowid, Batch const& batch, const GHistIndexMatrix& gmat,
                            float missing) {
    auto n_features = gmat.Features();
-
-    missing_.GrowTo(feature_offsets_[n_features], true);
-    auto const* row_index = gmat.index.data<std::uint32_t>() + gmat.row_ptr[base_rowid];
-    if (num_nonzeros_.empty()) {
-      num_nonzeros_ = common::MakeFixedVecWithMalloc(n_features, std::size_t{0});
-    } else {
-      CHECK_EQ(num_nonzeros_.size(), n_features);
-    }
-
+    missing_flags_.resize(feature_offsets_[n_features], true);
+    auto const* row_index = gmat.index.data<uint32_t>() + gmat.row_ptr[base_rowid];
+    num_nonzeros_.resize(n_features, 0);
    auto is_valid = data::IsValidFunctor{missing};

    DispatchBinType(bins_type_size_, [&](auto t) {
@@ -384,9 +321,8 @@ class ColumnMatrix {
   */
  void SetIndexMixedColumns(const GHistIndexMatrix& gmat) {
    auto n_features = gmat.Features();
-
-    missing_ = MissingIndicator{feature_offsets_[n_features], true};
-    num_nonzeros_ = common::MakeFixedVecWithMalloc(n_features, std::size_t{0});
+    missing_flags_.resize(feature_offsets_[n_features], true);
+    num_nonzeros_.resize(n_features, 0);

    DispatchBinType(bins_type_size_, [&](auto t) {
      using ColumnBinT = decltype(t);
@@ -399,34 +335,106 @@ class ColumnMatrix {
    });
  }

-  [[nodiscard]] BinTypeSize GetTypeSize() const { return bins_type_size_; }
-  [[nodiscard]] auto GetColumnType(bst_feature_t fidx) const { return type_[fidx]; }
+  BinTypeSize GetTypeSize() const { return bins_type_size_; }
+  auto GetColumnType(bst_feature_t fidx) const { return type_[fidx]; }

  // And this returns part of state
-  [[nodiscard]] bool AnyMissing() const { return any_missing_; }
+  bool AnyMissing() const { return any_missing_; }

  // IO procedures for external memory.
-  [[nodiscard]] bool Read(AlignedResourceReadStream* fi, uint32_t const* index_base);
-  [[nodiscard]] std::size_t Write(AlignedFileWriteStream* fo) const;
+  bool Read(dmlc::SeekStream* fi, uint32_t const* index_base) {
+    fi->Read(&index_);
+#if !DMLC_LITTLE_ENDIAN
+    // s390x
+    std::vector<std::underlying_type<ColumnType>::type> int_types;
+    fi->Read(&int_types);
+    type_.resize(int_types.size());
+    std::transform(
+        int_types.begin(), int_types.end(), type_.begin(),
+        [](std::underlying_type<ColumnType>::type i) { return static_cast<ColumnType>(i); });
+#else
+    fi->Read(&type_);
+#endif  // !DMLC_LITTLE_ENDIAN
+
+    fi->Read(&row_ind_);
+    fi->Read(&feature_offsets_);
+
+    std::vector<std::uint8_t> missing;
+    fi->Read(&missing);
+    missing_flags_.resize(missing.size());
+    std::transform(missing.cbegin(), missing.cend(), missing_flags_.begin(),
+                   [](std::uint8_t flag) { return !!flag; });
+
+    index_base_ = index_base;
+#if !DMLC_LITTLE_ENDIAN
+    std::underlying_type<BinTypeSize>::type v;
+    fi->Read(&v);
+    bins_type_size_ = static_cast<BinTypeSize>(v);
+#else
+    fi->Read(&bins_type_size_);
+#endif
+
+    fi->Read(&any_missing_);
+    return true;
+  }
+
+  size_t Write(dmlc::Stream* fo) const {
+    size_t bytes{0};
+
+    auto write_vec = [&](auto const& vec) {
+      fo->Write(vec);
+      bytes += vec.size() * sizeof(typename std::remove_reference_t<decltype(vec)>::value_type) +
+               sizeof(uint64_t);
+    };
+    write_vec(index_);
+#if !DMLC_LITTLE_ENDIAN
+    // s390x
+    std::vector<std::underlying_type<ColumnType>::type> int_types(type_.size());
+    std::transform(type_.begin(), type_.end(), int_types.begin(), [](ColumnType t) {
+      return static_cast<std::underlying_type<ColumnType>::type>(t);
+    });
+    write_vec(int_types);
+#else
+    write_vec(type_);
+#endif  // !DMLC_LITTLE_ENDIAN
+    write_vec(row_ind_);
+    write_vec(feature_offsets_);
+    // dmlc can not handle bool vector
+    std::vector<std::uint8_t> missing(missing_flags_.size());
+    std::transform(missing_flags_.cbegin(), missing_flags_.cend(), missing.begin(),
+                   [](bool flag) { return static_cast<std::uint8_t>(flag); });
+    write_vec(missing);
+
+#if !DMLC_LITTLE_ENDIAN
+    auto v = static_cast<std::underlying_type<BinTypeSize>::type>(bins_type_size_);
+    fo->Write(v);
+#else
+    fo->Write(bins_type_size_);
+#endif  // DMLC_LITTLE_ENDIAN
+    bytes += sizeof(bins_type_size_);
+    fo->Write(any_missing_);
+    bytes += sizeof(any_missing_);
+
+    return bytes;
+  }

 private:
-  RefResourceView<std::uint8_t> index_;
+  std::vector<uint8_t> index_;

-  RefResourceView<ColumnType> type_;
-  /** @brief indptr of a CSC matrix. */
-  RefResourceView<std::size_t> row_ind_;
-  /** @brief indicate where each column's index and row_ind is stored. */
-  RefResourceView<std::size_t> feature_offsets_;
-  /** @brief The number of nnz of each column. */
-  RefResourceView<std::size_t> num_nonzeros_;
+  std::vector<ColumnType> type_;
+  /* indptr of a CSC matrix. */
+  std::vector<size_t> row_ind_;
+  /* indicate where each column's index and row_ind is stored. */
+  std::vector<size_t> feature_offsets_;
+  /* The number of nnz of each column. */
+  std::vector<size_t> num_nonzeros_;

  // index_base_[fid]: least bin id for feature fid
-  std::uint32_t const* index_base_;
-
-  MissingIndicator missing_;
-
+  uint32_t const* index_base_;
+  std::vector<ByteType> missing_flags_;
  BinTypeSize bins_type_size_;
  bool any_missing_;
 };
-}  // namespace xgboost::common
+}  // namespace common
+}  // namespace xgboost
 #endif  // XGBOOST_COMMON_COLUMN_MATRIX_H_
--- a/src/common/error_msg.h
+++ b/src/common/error_msg.h
@@ -6,11 +6,6 @@
 #ifndef XGBOOST_COMMON_ERROR_MSG_H_
 #define XGBOOST_COMMON_ERROR_MSG_H_

-#include <cinttypes>  // for uint64_t
-#include <limits>     // for numeric_limits
-
-#include "xgboost/base.h"  // for bst_feature_t
-#include "xgboost/logging.h"
 #include "xgboost/string_view.h"  // for StringView

 namespace xgboost::error {
@@ -38,46 +33,5 @@ constexpr StringView InconsistentMaxBin() {
  return "Inconsistent `max_bin`. `max_bin` should be the same across different QuantileDMatrix, "
         "and consistent with the Booster being trained.";
 }
-
-constexpr StringView UnknownDevice() { return "Unknown device type."; }
-
-inline void MaxFeatureSize(std::uint64_t n_features) {
-  auto max_n_features = std::numeric_limits<bst_feature_t>::max();
-  CHECK_LE(n_features, max_n_features)
-      << "Unfortunately, XGBoost does not support data matrices with "
-      << std::numeric_limits<bst_feature_t>::max() << " features or greater";
-}
-
-constexpr StringView InplacePredictProxy() {
-  return "Inplace predict accepts only DMatrixProxy as input.";
-}
-
-inline void MaxSampleSize(std::size_t n) {
-  LOG(FATAL) << "Sample size too large for the current updater. Maximum number of samples:" << n
-             << ". Consider using a different updater or tree_method.";
-}
-
-constexpr StringView OldSerialization() {
-  return R"doc(If you are loading a serialized model (like pickle in Python, RDS in R) or
-configuration generated by an older version of XGBoost, please export the model by calling
-`Booster.save_model` from that version first, then load it back in current version. See:
-
-    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html
-
-for more details about differences between saving model and serializing.
-)doc";
-}
-
-inline void WarnOldSerialization() {
-  // Display it once is enough. Otherwise this can be really verbose in distributed
-  // environments.
-  static thread_local bool logged{false};
-  if (logged) {
-    return;
-  }
-
-  LOG(WARNING) << OldSerialization();
-  logged = true;
-}
 }  // namespace xgboost::error
 #endif  // XGBOOST_COMMON_ERROR_MSG_H_
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@@ -127,76 +127,55 @@ void SortByWeight(dh::device_vector<float>* weights,
                                });
 }

-void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
-                                dh::device_vector<Entry>* p_sorted_entries,
-                                dh::device_vector<float>* p_sorted_weights,
-                                dh::caching_device_vector<size_t>* p_column_sizes_scan) {
+void RemoveDuplicatedCategories(
+    int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
+    dh::device_vector<Entry> *p_sorted_entries,
+    dh::caching_device_vector<size_t> *p_column_sizes_scan) {
  info.feature_types.SetDevice(device);
  auto d_feature_types = info.feature_types.ConstDeviceSpan();
  CHECK(!d_feature_types.empty());
-  auto& column_sizes_scan = *p_column_sizes_scan;
-  auto& sorted_entries = *p_sorted_entries;
+  auto &column_sizes_scan = *p_column_sizes_scan;
+  auto &sorted_entries = *p_sorted_entries;
  // Removing duplicated entries in categorical features.
-
-  // We don't need to accumulate weight for duplicated entries as there's no weighted
-  // sketching for categorical features, the categories are the cut values.
  dh::caching_device_vector<size_t> new_column_scan(column_sizes_scan.size());
-  std::size_t n_uniques{0};
-  if (p_sorted_weights) {
-    using Pair = thrust::tuple<Entry, float>;
-    auto d_sorted_entries = dh::ToSpan(sorted_entries);
-    auto d_sorted_weights = dh::ToSpan(*p_sorted_weights);
-    auto val_in_it = thrust::make_zip_iterator(d_sorted_entries.data(), d_sorted_weights.data());
-    auto val_out_it = thrust::make_zip_iterator(d_sorted_entries.data(), d_sorted_weights.data());
-    n_uniques = dh::SegmentedUnique(
-        column_sizes_scan.data().get(), column_sizes_scan.data().get() + column_sizes_scan.size(),
-        val_in_it, val_in_it + sorted_entries.size(), new_column_scan.data().get(), val_out_it,
-        [=] __device__(Pair const& l, Pair const& r) {
-          Entry const& le = thrust::get<0>(l);
-          Entry const& re = thrust::get<0>(r);
-          if (le.index == re.index && IsCat(d_feature_types, le.index)) {
-            return le.fvalue == re.fvalue;
-          }
-          return false;
-        });
-    p_sorted_weights->resize(n_uniques);
-  } else {
-    n_uniques = dh::SegmentedUnique(
-        column_sizes_scan.data().get(), column_sizes_scan.data().get() + column_sizes_scan.size(),
-        sorted_entries.begin(), sorted_entries.end(), new_column_scan.data().get(),
-        sorted_entries.begin(), [=] __device__(Entry const& l, Entry const& r) {
-          if (l.index == r.index) {
-            if (IsCat(d_feature_types, l.index)) {
-              return l.fvalue == r.fvalue;
-            }
-          }
-          return false;
-        });
-  }
-  sorted_entries.resize(n_uniques);
+  dh::SegmentedUnique(column_sizes_scan.data().get(),
+                      column_sizes_scan.data().get() + column_sizes_scan.size(),
+                      sorted_entries.begin(), sorted_entries.end(),
+                      new_column_scan.data().get(), sorted_entries.begin(),
+                      [=] __device__(Entry const &l, Entry const &r) {
+                        if (l.index == r.index) {
+                          if (IsCat(d_feature_types, l.index)) {
+                            return l.fvalue == r.fvalue;
+                          }
+                        }
+                        return false;
+                      });

  // Renew the column scan and cut scan based on categorical data.
  auto d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan);
-  dh::caching_device_vector<SketchContainer::OffsetT> new_cuts_size(info.num_col_ + 1);
+  dh::caching_device_vector<SketchContainer::OffsetT> new_cuts_size(
+      info.num_col_ + 1);
  CHECK_EQ(new_column_scan.size(), new_cuts_size.size());
-  dh::LaunchN(new_column_scan.size(),
-              [=, d_new_cuts_size = dh::ToSpan(new_cuts_size),
-               d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan),
-               d_new_columns_ptr = dh::ToSpan(new_column_scan)] __device__(size_t idx) {
-                d_old_column_sizes_scan[idx] = d_new_columns_ptr[idx];
-                if (idx == d_new_columns_ptr.size() - 1) {
-                  return;
-                }
-                if (IsCat(d_feature_types, idx)) {
-                  // Cut size is the same as number of categories in input.
-                  d_new_cuts_size[idx] = d_new_columns_ptr[idx + 1] - d_new_columns_ptr[idx];
-                } else {
-                  d_new_cuts_size[idx] = d_cuts_ptr[idx + 1] - d_cuts_ptr[idx];
-                }
-              });
+  dh::LaunchN(
+      new_column_scan.size(),
+      [=, d_new_cuts_size = dh::ToSpan(new_cuts_size),
+       d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan),
+       d_new_columns_ptr = dh::ToSpan(new_column_scan)] __device__(size_t idx) {
+        d_old_column_sizes_scan[idx] = d_new_columns_ptr[idx];
+        if (idx == d_new_columns_ptr.size() - 1) {
+          return;
+        }
+        if (IsCat(d_feature_types, idx)) {
+          // Cut size is the same as number of categories in input.
+          d_new_cuts_size[idx] =
+              d_new_columns_ptr[idx + 1] - d_new_columns_ptr[idx];
+        } else {
+          d_new_cuts_size[idx] = d_cuts_ptr[idx + 1] - d_cuts_ptr[idx];
+        }
+      });
  // Turn size into ptr.
-  thrust::exclusive_scan(thrust::device, new_cuts_size.cbegin(), new_cuts_size.cend(),
-                         d_cuts_ptr.data());
+  thrust::exclusive_scan(thrust::device, new_cuts_size.cbegin(),
+                         new_cuts_size.cend(), d_cuts_ptr.data());
 }
 }  // namespace detail

@@ -230,8 +209,8 @@ void ProcessBatch(int device, MetaInfo const &info, const SparsePage &page,
  auto d_cuts_ptr = cuts_ptr.DeviceSpan();

  if (sketch_container->HasCategorical()) {
-    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, nullptr,
-                                       &column_sizes_scan);
+    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
+                                       &sorted_entries, &column_sizes_scan);
  }

  auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
@@ -297,8 +276,8 @@ void ProcessWeightedBatch(int device, const SparsePage& page,
                             &column_sizes_scan);
  auto d_cuts_ptr = cuts_ptr.DeviceSpan();
  if (sketch_container->HasCategorical()) {
-    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, &temp_weights,
-                                       &column_sizes_scan);
+    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
+                                       &sorted_entries, &column_sizes_scan);
  }

  auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
--- a/src/common/hist_util.cuh
+++ b/src/common/hist_util.cuh
@@ -240,10 +240,10 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran
 void SortByWeight(dh::device_vector<float>* weights,
                  dh::device_vector<Entry>* sorted_entries);

-void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
-                                dh::device_vector<Entry>* p_sorted_entries,
-                                dh::device_vector<float>* p_sorted_weights,
-                                dh::caching_device_vector<size_t>* p_column_sizes_scan);
+void RemoveDuplicatedCategories(
+    int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
+    dh::device_vector<Entry> *p_sorted_entries,
+    dh::caching_device_vector<size_t> *p_column_sizes_scan);
 }  // namespace detail

 // Compute sketch on DMatrix.
@@ -275,8 +275,8 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,

  if (sketch_container->HasCategorical()) {
    auto d_cuts_ptr = cuts_ptr.DeviceSpan();
-    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, nullptr,
-                                       &column_sizes_scan);
+    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
+                                       &sorted_entries, &column_sizes_scan);
  }

  auto d_cuts_ptr = cuts_ptr.DeviceSpan();
@@ -354,8 +354,8 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,

  if (sketch_container->HasCategorical()) {
    auto d_cuts_ptr = cuts_ptr.DeviceSpan();
-    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, &temp_weights,
-                                       &column_sizes_scan);
+    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
+                                       &sorted_entries, &column_sizes_scan);
  }

  auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -84,7 +84,7 @@ class HistogramCuts {
    return *this;
  }

-  [[nodiscard]] bst_bin_t FeatureBins(bst_feature_t feature) const {
+  uint32_t FeatureBins(bst_feature_t feature) const {
    return cut_ptrs_.ConstHostVector().at(feature + 1) - cut_ptrs_.ConstHostVector()[feature];
  }

@@ -92,8 +92,8 @@ class HistogramCuts {
  std::vector<float>    const& Values()    const { return cut_values_.ConstHostVector(); }
  std::vector<float>    const& MinValues() const { return min_vals_.ConstHostVector();   }

-  [[nodiscard]] bool HasCategorical() const { return has_categorical_; }
-  [[nodiscard]] float MaxCategory() const { return max_cat_; }
+  bool HasCategorical() const { return has_categorical_; }
+  float MaxCategory() const { return max_cat_; }
  /**
   * \brief Set meta info about categorical features.
   *
@@ -105,13 +105,12 @@ class HistogramCuts {
    max_cat_ = max_cat;
  }

-  [[nodiscard]] bst_bin_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); }
+  size_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); }

  // Return the index of a cut point that is strictly greater than the input
  // value, or the last available index if none exists
-  [[nodiscard]] bst_bin_t SearchBin(float value, bst_feature_t column_id,
-                                    std::vector<uint32_t> const& ptrs,
-                                    std::vector<float> const& values) const {
+  bst_bin_t SearchBin(float value, bst_feature_t column_id, std::vector<uint32_t> const& ptrs,
+                      std::vector<float> const& values) const {
    auto end = ptrs[column_id + 1];
    auto beg = ptrs[column_id];
    auto it = std::upper_bound(values.cbegin() + beg, values.cbegin() + end, value);
@@ -120,20 +119,20 @@ class HistogramCuts {
    return idx;
  }

-  [[nodiscard]] bst_bin_t SearchBin(float value, bst_feature_t column_id) const {
+  bst_bin_t SearchBin(float value, bst_feature_t column_id) const {
    return this->SearchBin(value, column_id, Ptrs(), Values());
  }
+
  /**
   * \brief Search the bin index for numerical feature.
   */
-  [[nodiscard]] bst_bin_t SearchBin(Entry const& e) const { return SearchBin(e.fvalue, e.index); }
+  bst_bin_t SearchBin(Entry const& e) const { return SearchBin(e.fvalue, e.index); }

  /**
   * \brief Search the bin index for categorical feature.
   */
-  [[nodiscard]] bst_bin_t SearchCatBin(float value, bst_feature_t fidx,
-                                       std::vector<uint32_t> const& ptrs,
-                                       std::vector<float> const& vals) const {
+  bst_bin_t SearchCatBin(float value, bst_feature_t fidx, std::vector<uint32_t> const& ptrs,
+                         std::vector<float> const& vals) const {
    auto end = ptrs.at(fidx + 1) + vals.cbegin();
    auto beg = ptrs[fidx] + vals.cbegin();
    // Truncates the value in case it's not perfectly rounded.
@@ -144,14 +143,12 @@ class HistogramCuts {
    }
    return bin_idx;
  }
-  [[nodiscard]] bst_bin_t SearchCatBin(float value, bst_feature_t fidx) const {
+  bst_bin_t SearchCatBin(float value, bst_feature_t fidx) const {
    auto const& ptrs = this->Ptrs();
    auto const& vals = this->Values();
    return this->SearchCatBin(value, fidx, ptrs, vals);
  }
-  [[nodiscard]] bst_bin_t SearchCatBin(Entry const& e) const {
-    return SearchCatBin(e.fvalue, e.index);
-  }
+  bst_bin_t SearchCatBin(Entry const& e) const { return SearchCatBin(e.fvalue, e.index); }

  /**
   * \brief Return numerical bin value given bin index.
@@ -203,33 +200,13 @@ auto DispatchBinType(BinTypeSize type, Fn&& fn) {
 }

 /**
- * @brief Optionally compressed gradient index. The compression works only with dense
+ * \brief Optionally compressed gradient index. The compression works only with dense
 *        data.
 *
 *   The main body of construction code is in gradient_index.cc, this struct is only a
- *   view class.
+ *   storage class.
 */
-class Index {
- private:
-  void SetBinTypeSize(BinTypeSize binTypeSize) {
-    binTypeSize_ = binTypeSize;
-    switch (binTypeSize) {
-      case kUint8BinsTypeSize:
-        func_ = &GetValueFromUint8;
-        break;
-      case kUint16BinsTypeSize:
-        func_ = &GetValueFromUint16;
-        break;
-      case kUint32BinsTypeSize:
-        func_ = &GetValueFromUint32;
-        break;
-      default:
-        CHECK(binTypeSize == kUint8BinsTypeSize || binTypeSize == kUint16BinsTypeSize ||
-              binTypeSize == kUint32BinsTypeSize);
-    }
-  }
-
- public:
+struct Index {
  // Inside the compressor, bin_idx is the index for cut value across all features. By
  // subtracting it with starting pointer of each feature, we can reduce it to smaller
  // value and store it with smaller types. Usable only with dense data.
@@ -253,24 +230,10 @@ class Index {
  }

  Index() { SetBinTypeSize(binTypeSize_); }
-
-  Index(Index const& i) = delete;
-  Index& operator=(Index const& i) = delete;
+  Index(const Index& i) = delete;
+  Index& operator=(Index i) = delete;
  Index(Index&& i) = delete;
-
-  /** @brief Move assignment for lazy initialization. */
-  Index& operator=(Index&& i) = default;
-
-  /**
-   * @brief Construct the index from data.
-   *
-   * @param data     Storage for compressed histogram bin.
-   * @param bin_size Number of bytes for each bin.
-   */
-  Index(Span<std::uint8_t> data, BinTypeSize bin_size) : data_{data} {
-    this->SetBinTypeSize(bin_size);
-  }
-
+  Index& operator=(Index&& i) = delete;
  uint32_t operator[](size_t i) const {
    if (!bin_offset_.empty()) {
      // dense, compressed
@@ -281,7 +244,26 @@ class Index {
      return func_(data_.data(), i);
    }
  }
-  [[nodiscard]] BinTypeSize GetBinTypeSize() const { return binTypeSize_; }
+  void SetBinTypeSize(BinTypeSize binTypeSize) {
+    binTypeSize_ = binTypeSize;
+    switch (binTypeSize) {
+      case kUint8BinsTypeSize:
+        func_ = &GetValueFromUint8;
+        break;
+      case kUint16BinsTypeSize:
+        func_ = &GetValueFromUint16;
+        break;
+      case kUint32BinsTypeSize:
+        func_ = &GetValueFromUint32;
+        break;
+      default:
+        CHECK(binTypeSize == kUint8BinsTypeSize || binTypeSize == kUint16BinsTypeSize ||
+              binTypeSize == kUint32BinsTypeSize);
+    }
+  }
+  BinTypeSize GetBinTypeSize() const {
+    return binTypeSize_;
+  }
  template <typename T>
  T const* data() const {  // NOLINT
    return reinterpret_cast<T const*>(data_.data());
@@ -290,27 +272,30 @@ class Index {
  T* data() {  // NOLINT
    return reinterpret_cast<T*>(data_.data());
  }
-  [[nodiscard]] std::uint32_t const* Offset() const { return bin_offset_.data(); }
-  [[nodiscard]] std::size_t OffsetSize() const { return bin_offset_.size(); }
-  [[nodiscard]] std::size_t Size() const { return data_.size() / (binTypeSize_); }
+  uint32_t const* Offset() const { return bin_offset_.data(); }
+  size_t OffsetSize() const { return bin_offset_.size(); }
+  size_t Size() const { return data_.size() / (binTypeSize_); }

+  void Resize(const size_t n_bytes) {
+    data_.resize(n_bytes);
+  }
  // set the offset used in compression, cut_ptrs is the CSC indptr in HistogramCuts
  void SetBinOffset(std::vector<uint32_t> const& cut_ptrs) {
    bin_offset_.resize(cut_ptrs.size() - 1);  // resize to number of features.
    std::copy_n(cut_ptrs.begin(), bin_offset_.size(), bin_offset_.begin());
  }
-  auto begin() const {  // NOLINT
-    return data_.data();
+  std::vector<uint8_t>::const_iterator begin() const {  // NOLINT
+    return data_.begin();
  }
-  auto end() const {  // NOLINT
-    return data_.data() + data_.size();
+  std::vector<uint8_t>::const_iterator end() const {  // NOLINT
+    return data_.end();
  }

-  auto begin() {  // NOLINT
-    return data_.data();
+  std::vector<uint8_t>::iterator begin() {  // NOLINT
+    return data_.begin();
  }
-  auto end() {  // NOLINT
-    return data_.data() + data_.size();
+  std::vector<uint8_t>::iterator end() {  // NOLINT
+    return data_.end();
  }

 private:
@@ -325,12 +310,12 @@ class Index {

  using Func = uint32_t (*)(uint8_t const*, size_t);

-  Span<std::uint8_t> data_;
+  std::vector<uint8_t> data_;
  // starting position of each feature inside the cut values (the indptr of the CSC cut matrix
  // HistogramCuts without the last entry.) Used for bin compression.
  std::vector<uint32_t> bin_offset_;

-  BinTypeSize binTypeSize_{kUint8BinsTypeSize};
+  BinTypeSize binTypeSize_ {kUint8BinsTypeSize};
  Func func_;
 };

--- a/src/common/io.cc
+++ b/src/common/io.cc
@@ -1,47 +1,24 @@
-/**
- * Copyright 2019-2023, by XGBoost Contributors
+/*!
+ * Copyright (c) by XGBoost Contributors 2019-2022
 */
-#if !defined(NOMINMAX) && defined(_WIN32)
-#define NOMINMAX
-#endif  // !defined(NOMINMAX)
-
-#if !defined(xgboost_IS_WIN)
-
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#define xgboost_IS_WIN 1
-#endif  // defined(_MSC_VER) || defined(__MINGW32__)
-
-#endif  // !defined(xgboost_IS_WIN)
-
-#if defined(__unix__) || defined(__APPLE__)
-#include <fcntl.h>     // for open, O_RDONLY
-#include <sys/mman.h>  // for mmap, mmap64, munmap
-#include <unistd.h>    // for close, getpagesize
-#elif defined(xgboost_IS_WIN)
-#define WIN32_LEAN_AND_MEAN
-#include <windows.h>
+#if defined(__unix__)
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
 #endif  // defined(__unix__)
+#include <algorithm>
+#include <fstream>
+#include <string>
+#include <memory>
+#include <utility>
+#include <cstdio>

-#include <algorithm>     // for copy, transform
-#include <cctype>        // for tolower
-#include <cerrno>        // for errno
-#include <cstddef>       // for size_t
-#include <cstdint>       // for int32_t, uint32_t
-#include <cstring>       // for memcpy
-#include <fstream>       // for ifstream
-#include <iterator>      // for distance
-#include <limits>        // for numeric_limits
-#include <memory>        // for unique_ptr
-#include <string>        // for string
-#include <system_error>  // for error_code, system_category
-#include <utility>       // for move
-#include <vector>        // for vector
-
-#include "io.h"
-#include "xgboost/collective/socket.h"  // for LastError
 #include "xgboost/logging.h"
+#include "io.h"
+
+namespace xgboost {
+namespace common {

-namespace xgboost::common {
 size_t PeekableInStream::Read(void* dptr, size_t size) {
  size_t nbuffer = buffer_.length() - buffer_ptr_;
  if (nbuffer == 0) return strm_->Read(dptr, size);
@@ -117,32 +94,11 @@ void FixedSizeStream::Take(std::string* out) {
  *out = std::move(buffer_);
 }

-namespace {
-// Get system alignment value for IO with mmap.
-std::size_t GetMmapAlignment() {
-#if defined(xgboost_IS_WIN)
-  SYSTEM_INFO sys_info;
-  GetSystemInfo(&sys_info);
-  // During testing, `sys_info.dwPageSize` is of size 4096 while `dwAllocationGranularity` is of
-  // size 65536.
-  return sys_info.dwAllocationGranularity;
-#else
-  return getpagesize();
-#endif
-}
-
-auto SystemErrorMsg() {
-  std::int32_t errsv = system::LastError();
-  auto err = std::error_code{errsv, std::system_category()};
-  return err.message();
-}
-}  // anonymous namespace
-
 std::string LoadSequentialFile(std::string uri, bool stream) {
  auto OpenErr = [&uri]() {
    std::string msg;
    msg = "Opening " + uri + " failed: ";
-    msg += SystemErrorMsg();
+    msg += strerror(errno);
    LOG(FATAL) << msg;
  };

@@ -199,159 +155,5 @@ std::string FileExtension(std::string fname, bool lower) {
    return "";
  }
 }
-
-// For some reason, NVCC 12.1 marks the function deleted if we expose it in the header.
-// NVCC 11.8 doesn't allow `noexcept(false) = default` altogether.
-ResourceHandler::~ResourceHandler() noexcept(false) {}  // NOLINT
-
-struct MMAPFile {
-#if defined(xgboost_IS_WIN)
-  HANDLE fd{INVALID_HANDLE_VALUE};
-  HANDLE file_map{INVALID_HANDLE_VALUE};
-#else
-  std::int32_t fd{0};
-#endif
-  std::byte* base_ptr{nullptr};
-  std::size_t base_size{0};
-  std::size_t delta{0};
-  std::string path;
-
-  MMAPFile() = default;
-
-#if defined(xgboost_IS_WIN)
-  MMAPFile(HANDLE fd, HANDLE fm, std::byte* base_ptr, std::size_t base_size, std::size_t delta,
-           std::string path)
-      : fd{fd},
-        file_map{fm},
-        base_ptr{base_ptr},
-        base_size{base_size},
-        delta{delta},
-        path{std::move(path)} {}
-#else
-  MMAPFile(std::int32_t fd, std::byte* base_ptr, std::size_t base_size, std::size_t delta,
-           std::string path)
-      : fd{fd}, base_ptr{base_ptr}, base_size{base_size}, delta{delta}, path{std::move(path)} {}
-#endif
-};
-
-std::unique_ptr<MMAPFile> Open(std::string path, std::size_t offset, std::size_t length) {
-  if (length == 0) {
-    return std::make_unique<MMAPFile>();
-  }
-
-#if defined(xgboost_IS_WIN)
-  HANDLE fd = CreateFile(path.c_str(), GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING,
-                         FILE_ATTRIBUTE_NORMAL | FILE_FLAG_OVERLAPPED, nullptr);
-  CHECK_NE(fd, INVALID_HANDLE_VALUE) << "Failed to open:" << path << ". " << SystemErrorMsg();
-#else
-  auto fd = open(path.c_str(), O_RDONLY);
-  CHECK_GE(fd, 0) << "Failed to open:" << path << ". " << SystemErrorMsg();
-#endif
-
-  std::byte* ptr{nullptr};
-  // Round down for alignment.
-  auto view_start = offset / GetMmapAlignment() * GetMmapAlignment();
-  auto view_size = length + (offset - view_start);
-
-#if defined(__linux__) || defined(__GLIBC__)
-  int prot{PROT_READ};
-  ptr = reinterpret_cast<std::byte*>(mmap64(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
-  madvise(ptr, view_size, MADV_WILLNEED);
-  CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg();
-  auto handle =
-      std::make_unique<MMAPFile>(fd, ptr, view_size, offset - view_start, std::move(path));
-#elif defined(xgboost_IS_WIN)
-  auto file_size = GetFileSize(fd, nullptr);
-  DWORD access = PAGE_READONLY;
-  auto map_file = CreateFileMapping(fd, nullptr, access, 0, file_size, nullptr);
-  access = FILE_MAP_READ;
-  std::uint32_t loff = static_cast<std::uint32_t>(view_start);
-  std::uint32_t hoff = view_start >> 32;
-  CHECK(map_file) << "Failed to map: " << path << ". " << SystemErrorMsg();
-  ptr = reinterpret_cast<std::byte*>(MapViewOfFile(map_file, access, hoff, loff, view_size));
-  CHECK_NE(ptr, nullptr) << "Failed to map: " << path << ". " << SystemErrorMsg();
-  auto handle = std::make_unique<MMAPFile>(fd, map_file, ptr, view_size, offset - view_start,
-                                           std::move(path));
-#else
-  CHECK_LE(offset, std::numeric_limits<off_t>::max())
-      << "File size has exceeded the limit on the current system.";
-  int prot{PROT_READ};
-  ptr = reinterpret_cast<std::byte*>(mmap(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
-  CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg();
-  auto handle =
-      std::make_unique<MMAPFile>(fd, ptr, view_size, offset - view_start, std::move(path));
-#endif  // defined(__linux__)
-
-  return handle;
-}
-
-MmapResource::MmapResource(std::string path, std::size_t offset, std::size_t length)
-    : ResourceHandler{kMmap}, handle_{Open(std::move(path), offset, length)}, n_{length} {}
-
-MmapResource::~MmapResource() noexcept(false) {
-  if (!handle_) {
-    return;
-  }
-#if defined(xgboost_IS_WIN)
-  if (handle_->base_ptr) {
-    CHECK(UnmapViewOfFile(handle_->base_ptr)) "Faled to call munmap: " << SystemErrorMsg();
-  }
-  if (handle_->fd != INVALID_HANDLE_VALUE) {
-    CHECK(CloseHandle(handle_->fd)) << "Failed to close handle: " << SystemErrorMsg();
-  }
-  if (handle_->file_map != INVALID_HANDLE_VALUE) {
-    CHECK(CloseHandle(handle_->file_map)) << "Failed to close mapping object: " << SystemErrorMsg();
-  }
-#else
-  if (handle_->base_ptr) {
-    CHECK_NE(munmap(handle_->base_ptr, handle_->base_size), -1)
-        << "Faled to call munmap: " << handle_->path << ". " << SystemErrorMsg();
-  }
-  if (handle_->fd != 0) {
-    CHECK_NE(close(handle_->fd), -1)
-        << "Faled to close: " << handle_->path << ". " << SystemErrorMsg();
-  }
-#endif
-}
-
-[[nodiscard]] void* MmapResource::Data() {
-  if (!handle_) {
-    return nullptr;
-  }
-  return handle_->base_ptr + handle_->delta;
-}
-
-[[nodiscard]] std::size_t MmapResource::Size() const { return n_; }
-
-// For some reason, NVCC 12.1 marks the function deleted if we expose it in the header.
-// NVCC 11.8 doesn't allow `noexcept(false) = default` altogether.
-AlignedResourceReadStream::~AlignedResourceReadStream() noexcept(false) {}  // NOLINT
-PrivateMmapConstStream::~PrivateMmapConstStream() noexcept(false) {}        // NOLINT
-
-AlignedFileWriteStream::AlignedFileWriteStream(StringView path, StringView flags)
-    : pimpl_{dmlc::Stream::Create(path.c_str(), flags.c_str())} {}
-
-[[nodiscard]] std::size_t AlignedFileWriteStream::DoWrite(const void* ptr,
-                                                          std::size_t n_bytes) noexcept(true) {
-  pimpl_->Write(ptr, n_bytes);
-  return n_bytes;
-}
-
-AlignedMemWriteStream::AlignedMemWriteStream(std::string* p_buf)
-    : pimpl_{std::make_unique<MemoryBufferStream>(p_buf)} {}
-AlignedMemWriteStream::~AlignedMemWriteStream() = default;
-
-[[nodiscard]] std::size_t AlignedMemWriteStream::DoWrite(const void* ptr,
-                                                         std::size_t n_bytes) noexcept(true) {
-  this->pimpl_->Write(ptr, n_bytes);
-  return n_bytes;
-}
-
-[[nodiscard]] std::size_t AlignedMemWriteStream::Tell() const noexcept(true) {
-  return this->pimpl_->Tell();
-}
-}  // namespace xgboost::common
-
-#if defined(xgboost_IS_WIN)
-#undef xgboost_IS_WIN
-#endif  // defined(xgboost_IS_WIN)
+}  // namespace common
+}  // namespace xgboost
--- a/src/common/io.h
+++ b/src/common/io.h
@@ -1,32 +1,23 @@
-/**
- * Copyright 2014-2023, XGBoost Contributors
+/*!
+ * Copyright by XGBoost Contributors 2014-2022
 * \file io.h
 * \brief general stream interface for serialization, I/O
 * \author Tianqi Chen
 */
+
 #ifndef XGBOOST_COMMON_IO_H_
 #define XGBOOST_COMMON_IO_H_

 #include <dmlc/io.h>
 #include <rabit/rabit.h>
-
-#include <algorithm>    // for min
-#include <array>        // for array
-#include <cstddef>      // for byte, size_t
-#include <cstdlib>      // for malloc, realloc, free
-#include <cstring>      // for memcpy
-#include <fstream>      // for ifstream
-#include <limits>       // for numeric_limits
-#include <memory>       // for unique_ptr
-#include <string>       // for string
-#include <type_traits>  // for alignment_of_v, enable_if_t
-#include <utility>      // for move
-#include <vector>       // for vector
+#include <string>
+#include <cstring>
+#include <fstream>

 #include "common.h"
-#include "xgboost/string_view.h"  // for StringView

-namespace xgboost::common {
+namespace xgboost {
+namespace common {
 using MemoryFixSizeBuffer = rabit::utils::MemoryFixSizeBuffer;
 using MemoryBufferStream = rabit::utils::MemoryBufferStream;

@@ -65,8 +56,8 @@ class FixedSizeStream : public PeekableInStream {

  size_t Read(void* dptr, size_t size) override;
  size_t PeekRead(void* dptr, size_t size) override;
-  [[nodiscard]] std::size_t Size() const { return buffer_.size(); }
-  [[nodiscard]] std::size_t Tell() const { return pointer_; }
+  size_t Size() const { return buffer_.size(); }
+  size_t Tell() const { return pointer_; }
  void Seek(size_t pos);

  void Write(const void*, size_t) override {
@@ -136,318 +127,6 @@ inline std::string ReadAll(std::string const &path) {
  return content;
 }

-struct MMAPFile;
-
-/**
- * @brief Handler for one-shot resource. Unlike `std::pmr::*`, the resource handler is
- *        fixed once it's constructed. Users cannot use mutable operations like resize
- *        without acquiring the specific resource first.
- */
-class ResourceHandler {
- public:
-  // RTTI
-  enum Kind : std::uint8_t {
-    kMalloc = 0,
-    kMmap = 1,
-  };
-
- private:
-  Kind kind_{kMalloc};
-
- public:
-  virtual void* Data() = 0;
-  template <typename T>
-  [[nodiscard]] T* DataAs() {
-    return reinterpret_cast<T*>(this->Data());
-  }
-
-  [[nodiscard]] virtual std::size_t Size() const = 0;
-  [[nodiscard]] auto Type() const { return kind_; }
-
-  // Allow exceptions for cleaning up resource.
-  virtual ~ResourceHandler() noexcept(false);
-
-  explicit ResourceHandler(Kind kind) : kind_{kind} {}
-  // Use shared_ptr to manage a pool like resource handler. All copy and assignment
-  // operators are disabled.
-  ResourceHandler(ResourceHandler const& that) = delete;
-  ResourceHandler& operator=(ResourceHandler const& that) = delete;
-  ResourceHandler(ResourceHandler&& that) = delete;
-  ResourceHandler& operator=(ResourceHandler&& that) = delete;
-  /**
-   * @brief Wether two resources have the same type. (both malloc or both mmap).
-   */
-  [[nodiscard]] bool IsSameType(ResourceHandler const& that) const {
-    return this->Type() == that.Type();
-  }
-};
-
-class MallocResource : public ResourceHandler {
-  void* ptr_{nullptr};
-  std::size_t n_{0};
-
-  void Clear() noexcept(true) {
-    std::free(ptr_);
-    ptr_ = nullptr;
-    n_ = 0;
-  }
-
- public:
-  explicit MallocResource(std::size_t n_bytes) : ResourceHandler{kMalloc} { this->Resize(n_bytes); }
-  ~MallocResource() noexcept(true) override { this->Clear(); }
-
-  void* Data() override { return ptr_; }
-  [[nodiscard]] std::size_t Size() const override { return n_; }
-  /**
-   * @brief Resize the resource to n_bytes. Unlike std::vector::resize, it prefers realloc
-   *        over malloc.
-   *
-   * @tparam force_malloc Force the use of malloc over realloc. Used for testing.
-   *
-   * @param n_bytes The new size.
-   */
-  template <bool force_malloc = false>
-  void Resize(std::size_t n_bytes) {
-    // realloc(ptr, 0) works, but is deprecated.
-    if (n_bytes == 0) {
-      this->Clear();
-      return;
-    }
-
-    // If realloc fails, we need to copy the data ourselves.
-    bool need_copy{false};
-    void* new_ptr{nullptr};
-    // use realloc first, it can handle nullptr.
-    if constexpr (!force_malloc) {
-      new_ptr = std::realloc(ptr_, n_bytes);
-    }
-    // retry with malloc if realloc fails
-    if (!new_ptr) {
-      // ptr_ is preserved if realloc fails
-      new_ptr = std::malloc(n_bytes);
-      need_copy = true;
-    }
-    if (!new_ptr) {
-      // malloc fails
-      LOG(FATAL) << "bad_malloc: Failed to allocate " << n_bytes << " bytes.";
-    }
-
-    if (need_copy) {
-      std::copy_n(reinterpret_cast<std::byte*>(ptr_), n_, reinterpret_cast<std::byte*>(new_ptr));
-    }
-    // default initialize
-    std::memset(reinterpret_cast<std::byte*>(new_ptr) + n_, '\0', n_bytes - n_);
-    // free the old ptr if malloc is used.
-    if (need_copy) {
-      this->Clear();
-    }
-
-    ptr_ = new_ptr;
-    n_ = n_bytes;
-  }
-};
-
-/**
- * @brief A class for wrapping mmap as a resource for RAII.
- */
-class MmapResource : public ResourceHandler {
-  std::unique_ptr<MMAPFile> handle_;
-  std::size_t n_;
-
- public:
-  MmapResource(std::string path, std::size_t offset, std::size_t length);
-  ~MmapResource() noexcept(false) override;
-
-  [[nodiscard]] void* Data() override;
-  [[nodiscard]] std::size_t Size() const override;
-};
-
-/**
- * @param Alignment for resource read stream and aligned write stream.
- */
-constexpr std::size_t IOAlignment() {
-  // For most of the pod types in XGBoost, 8 byte is sufficient.
-  return 8;
-}
-
-/**
- * @brief Wrap resource into a dmlc stream.
- *
- *  This class is to facilitate the use of mmap. Caller can optionally use the `Read()`
- *  method or the `Consume()` method. The former copies data into output, while the latter
- *  makes copy only if it's a primitive type.
- *
- *  Input is required to be aligned to IOAlignment().
- */
-class AlignedResourceReadStream {
-  std::shared_ptr<ResourceHandler> resource_;
-  std::size_t curr_ptr_{0};
-
-  // Similar to SEEK_END in libc
-  static std::size_t constexpr kSeekEnd = std::numeric_limits<std::size_t>::max();
-
- public:
-  explicit AlignedResourceReadStream(std::shared_ptr<ResourceHandler> resource)
-      : resource_{std::move(resource)} {}
-
-  [[nodiscard]] std::shared_ptr<ResourceHandler> Share() noexcept(true) { return resource_; }
-  /**
-   * @brief Consume n_bytes of data, no copying is performed.
-   *
-   * @return A pair with the beginning pointer and the number of available bytes, which
-   *         may be smaller than requested.
-   */
-  [[nodiscard]] auto Consume(std::size_t n_bytes) noexcept(true) {
-    auto res_size = resource_->Size();
-    auto data = reinterpret_cast<std::byte*>(resource_->Data());
-    auto ptr = data + curr_ptr_;
-
-    // Move the cursor
-    auto aligned_n_bytes = DivRoundUp(n_bytes, IOAlignment()) * IOAlignment();
-    auto aligned_forward = std::min(res_size - curr_ptr_, aligned_n_bytes);
-    std::size_t forward = std::min(res_size - curr_ptr_, n_bytes);
-
-    curr_ptr_ += aligned_forward;
-
-    return std::pair{ptr, forward};
-  }
-
-  template <typename T>
-  [[nodiscard]] auto Consume(T* out) noexcept(false) -> std::enable_if_t<std::is_pod_v<T>, bool> {
-    auto [ptr, size] = this->Consume(sizeof(T));
-    if (size != sizeof(T)) {
-      return false;
-    }
-    CHECK_EQ(reinterpret_cast<std::uintptr_t>(ptr) % std::alignment_of_v<T>, 0);
-    *out = *reinterpret_cast<T*>(ptr);
-    return true;
-  }
-
-  [[nodiscard]] virtual std::size_t Tell() noexcept(true) { return curr_ptr_; }
-  /**
-   * @brief Read n_bytes of data, output is copied into ptr.
-   */
-  [[nodiscard]] std::size_t Read(void* ptr, std::size_t n_bytes) noexcept(true) {
-    auto [res_ptr, forward] = this->Consume(n_bytes);
-    if (forward != 0) {
-      std::memcpy(ptr, res_ptr, forward);
-    }
-    return forward;
-  }
-  /**
-   * @brief Read a primitive type.
-   *
-   * @return Whether the read is successful.
-   */
-  template <typename T>
-  [[nodiscard]] auto Read(T* out) noexcept(false) -> std::enable_if_t<std::is_pod_v<T>, bool> {
-    return this->Consume(out);
-  }
-  /**
-   * @brief Read a vector.
-   *
-   * @return Whether the read is successful.
-   */
-  template <typename T>
-  [[nodiscard]] bool Read(std::vector<T>* out) noexcept(true) {
-    std::uint64_t n{0};
-    if (!this->Consume(&n)) {
-      return false;
-    }
-    out->resize(n);
-
-    auto n_bytes = sizeof(T) * n;
-    if (this->Read(out->data(), n_bytes) != n_bytes) {
-      return false;
-    }
-    return true;
-  }
-
-  virtual ~AlignedResourceReadStream() noexcept(false);
-};
-
-/**
- * @brief Private mmap file as a read-only stream.
- *
- *  It can calculate alignment automatically based on system page size (or allocation
- *  granularity on Windows).
- *
- *  The file is required to be aligned by IOAlignment().
- */
-class PrivateMmapConstStream : public AlignedResourceReadStream {
- public:
-  /**
-   * @brief Construct a private mmap stream.
-   *
-   * @param path      File path.
-   * @param offset    See the `offset` parameter of `mmap` for details.
-   * @param length    See the `length` parameter of `mmap` for details.
-   */
-  explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length)
-      : AlignedResourceReadStream{std::make_shared<MmapResource>(path, offset, length)} {}
-  ~PrivateMmapConstStream() noexcept(false) override;
-};
-
-/**
- * @brief Base class for write stream with alignment defined by IOAlignment().
- */
-class AlignedWriteStream {
- protected:
-  [[nodiscard]] virtual std::size_t DoWrite(const void* ptr,
-                                            std::size_t n_bytes) noexcept(true) = 0;
-
- public:
-  virtual ~AlignedWriteStream() = default;
-
-  [[nodiscard]] std::size_t Write(const void* ptr, std::size_t n_bytes) noexcept(false) {
-    auto aligned_n_bytes = DivRoundUp(n_bytes, IOAlignment()) * IOAlignment();
-    auto w_n_bytes = this->DoWrite(ptr, n_bytes);
-    CHECK_EQ(w_n_bytes, n_bytes);
-    auto remaining = aligned_n_bytes - n_bytes;
-    if (remaining > 0) {
-      std::array<std::uint8_t, IOAlignment()> padding;
-      std::memset(padding.data(), '\0', padding.size());
-      w_n_bytes = this->DoWrite(padding.data(), remaining);
-      CHECK_EQ(w_n_bytes, remaining);
-    }
-    return aligned_n_bytes;
-  }
-
-  template <typename T>
-  [[nodiscard]] std::enable_if_t<std::is_pod_v<T>, std::size_t> Write(T const& v) {
-    return this->Write(&v, sizeof(T));
-  }
-};
-
-/**
- * @brief Output stream backed by a file. Aligned to IOAlignment() bytes.
- */
-class AlignedFileWriteStream : public AlignedWriteStream {
-  std::unique_ptr<dmlc::Stream> pimpl_;
-
- protected:
-  [[nodiscard]] std::size_t DoWrite(const void* ptr, std::size_t n_bytes) noexcept(true) override;
-
- public:
-  AlignedFileWriteStream() = default;
-  AlignedFileWriteStream(StringView path, StringView flags);
-  ~AlignedFileWriteStream() override = default;
-};
-
-/**
- * @brief Output stream backed by memory buffer. Aligned to IOAlignment() bytes.
- */
-class AlignedMemWriteStream : public AlignedFileWriteStream {
-  std::unique_ptr<MemoryBufferStream> pimpl_;
-
- protected:
-  [[nodiscard]] std::size_t DoWrite(const void* ptr, std::size_t n_bytes) noexcept(true) override;
-
- public:
-  explicit AlignedMemWriteStream(std::string* p_buf);
-  ~AlignedMemWriteStream() override;
-
-  [[nodiscard]] std::size_t Tell() const noexcept(true);
-};
-}  // namespace xgboost::common
+}  // namespace common
+}  // namespace xgboost
 #endif  // XGBOOST_COMMON_IO_H_
--- a/src/common/ref_resource_view.h
+++ b/src/common/ref_resource_view.h
@@ -1,158 +0,0 @@
-/**
- * Copyright 2023, XGBoost Contributors
- */
-#ifndef XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
-#define XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
-
-#include <algorithm>    // for fill_n
-#include <cstdint>      // for uint64_t
-#include <cstring>      // for memcpy
-#include <memory>       // for shared_ptr, make_shared
-#include <type_traits>  // for is_reference_v, remove_reference_t, is_same_v
-#include <utility>      // for swap, move
-
-#include "io.h"  // for ResourceHandler, AlignedResourceReadStream, MallocResource
-#include "xgboost/logging.h"
-#include "xgboost/span.h"  // for Span
-
-namespace xgboost::common {
-/**
- * @brief A vector-like type that holds a reference counted resource.
- *
- *    The vector size is immutable after construction. This way we can swap the underlying
- *    resource when needed.
- */
-template <typename T>
-class RefResourceView {
-  static_assert(!std::is_reference_v<T>);
-
- public:
-  using value_type = T;             // NOLINT
-  using size_type = std::uint64_t;  // NOLINT
-
- private:
-  value_type* ptr_{nullptr};
-  size_type size_{0};
-  std::shared_ptr<common::ResourceHandler> mem_{nullptr};
-
- public:
-  RefResourceView(value_type* ptr, size_type n, std::shared_ptr<common::ResourceHandler> mem)
-      : ptr_{ptr}, size_{n}, mem_{std::move(mem)} {
-    CHECK_GE(mem_->Size(), n);
-  }
-  /**
-   * @brief Construct a view on ptr with length n. The ptr is held by the mem resource.
-   *
-   * @param ptr  The pointer to view.
-   * @param n    The length of the view.
-   * @param mem  The owner of the pointer.
-   * @param init Initialize the view with this value.
-   */
-  RefResourceView(value_type* ptr, size_type n, std::shared_ptr<common::ResourceHandler> mem,
-                  T const& init)
-      : RefResourceView{ptr, n, mem} {
-    if (n != 0) {
-      std::fill_n(ptr_, n, init);
-    }
-  }
-
-  ~RefResourceView() = default;
-
-  RefResourceView() = default;
-  RefResourceView(RefResourceView const& that) = delete;
-  RefResourceView(RefResourceView&& that) = delete;
-  RefResourceView& operator=(RefResourceView const& that) = delete;
-  /**
-   * @brief We allow move assignment for lazy initialization.
-   */
-  RefResourceView& operator=(RefResourceView&& that) = default;
-
-  [[nodiscard]] size_type size() const { return size_; }  // NOLINT
-  [[nodiscard]] size_type size_bytes() const {            // NOLINT
-    return Span{data(), size()}.size_bytes();
-  }
-  [[nodiscard]] value_type* data() { return ptr_; };              // NOLINT
-  [[nodiscard]] value_type const* data() const { return ptr_; };  // NOLINT
-  [[nodiscard]] bool empty() const { return size() == 0; }        // NOLINT
-
-  [[nodiscard]] auto cbegin() const { return data(); }         // NOLINT
-  [[nodiscard]] auto begin() { return data(); }                // NOLINT
-  [[nodiscard]] auto begin() const { return cbegin(); }        // NOLINT
-  [[nodiscard]] auto cend() const { return data() + size(); }  // NOLINT
-  [[nodiscard]] auto end() { return data() + size(); }         // NOLINT
-  [[nodiscard]] auto end() const { return cend(); }            // NOLINT
-
-  [[nodiscard]] auto const& front() const { return data()[0]; }          // NOLINT
-  [[nodiscard]] auto& front() { return data()[0]; }                      // NOLINT
-  [[nodiscard]] auto const& back() const { return data()[size() - 1]; }  // NOLINT
-  [[nodiscard]] auto& back() { return data()[size() - 1]; }              // NOLINT
-
-  [[nodiscard]] value_type& operator[](size_type i) { return ptr_[i]; }
-  [[nodiscard]] value_type const& operator[](size_type i) const { return ptr_[i]; }
-
-  /**
-   * @brief Get the underlying resource.
-   */
-  auto Resource() const { return mem_; }
-};
-
-/**
- * @brief Read a vector from stream. Accepts both `std::vector` and `RefResourceView`.
- *
- *  If the output vector is a referenced counted view, no copying occur.
- */
-template <typename Vec>
-[[nodiscard]] bool ReadVec(common::AlignedResourceReadStream* fi, Vec* vec) {
-  std::uint64_t n{0};
-  if (!fi->Read(&n)) {
-    return false;
-  }
-  if (n == 0) {
-    return true;
-  }
-
-  using T = typename Vec::value_type;
-  auto expected_bytes = sizeof(T) * n;
-
-  auto [ptr, n_bytes] = fi->Consume(expected_bytes);
-  if (n_bytes != expected_bytes) {
-    return false;
-  }
-
-  if constexpr (std::is_same_v<Vec, RefResourceView<T>>) {
-    *vec = RefResourceView<T>{reinterpret_cast<T*>(ptr), n, fi->Share()};
-  } else {
-    vec->resize(n);
-    std::memcpy(vec->data(), ptr, n_bytes);
-  }
-  return true;
-}
-
-/**
- * @brief Write a vector to stream. Accepts both `std::vector` and `RefResourceView`.
- */
-template <typename Vec>
-[[nodiscard]] std::size_t WriteVec(AlignedFileWriteStream* fo, Vec const& vec) {
-  std::size_t bytes{0};
-  auto n = static_cast<std::uint64_t>(vec.size());
-  bytes += fo->Write(n);
-  if (n == 0) {
-    return sizeof(n);
-  }
-
-  using T = typename std::remove_reference_t<decltype(vec)>::value_type;
-  bytes += fo->Write(vec.data(), vec.size() * sizeof(T));
-
-  return bytes;
-}
-
-/**
- * @brief Make a fixed size `RefResourceView` with malloc resource.
- */
-template <typename T>
-[[nodiscard]] RefResourceView<T> MakeFixedVecWithMalloc(std::size_t n_elements, T const& init) {
-  auto resource = std::make_shared<common::MallocResource>(n_elements * sizeof(T));
-  return RefResourceView{resource->DataAs<T>(), n_elements, resource, init};
-}
-}  // namespace xgboost::common
-#endif  // XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
--- a/src/data/adapter.h
+++ b/src/data/adapter.h
@@ -7,7 +7,7 @@
 #include <dmlc/data.h>

 #include <algorithm>
-#include <cstddef>  // for size_t
+#include <cstddef>  // std::size_t
 #include <functional>
 #include <limits>
 #include <map>
@@ -17,7 +17,6 @@
 #include <vector>

 #include "../c_api/c_api_error.h"
-#include "../common/error_msg.h"  // for MaxFeatureSize
 #include "../common/math.h"
 #include "array_interface.h"
 #include "arrow-cdi.h"
@@ -301,9 +300,9 @@ class ArrayAdapter : public detail::SingleBatchDataIter<ArrayAdapterBatch> {
    array_interface_ = ArrayInterface<2>(get<Object const>(j));
    batch_ = ArrayAdapterBatch{array_interface_};
  }
-  [[nodiscard]] ArrayAdapterBatch const& Value() const override { return batch_; }
-  [[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); }
-  [[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape(1); }
+  ArrayAdapterBatch const& Value() const override { return batch_; }
+  size_t NumRows() const { return array_interface_.Shape(0); }
+  size_t NumColumns() const { return array_interface_.Shape(1); }

 private:
  ArrayAdapterBatch batch_;
--- a/src/data/array_interface.h
+++ b/src/data/array_interface.h
@@ -590,7 +590,7 @@ class ArrayInterface {
 template <std::int32_t D, typename Fn>
 void DispatchDType(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
  // Only used for cuDF at the moment.
-  CHECK_EQ(array.valid.Capacity(), 0);
+  CHECK_EQ(array.valid.Size(), 0);
  auto dispatch = [&](auto t) {
    using T = std::remove_const_t<decltype(t)> const;
    // Set the data size to max as we don't know the original size of a sliced array:
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -416,8 +416,7 @@ void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::Tensor<T
    p_out->Reshape(array.shape);
    return;
  }
-  CHECK_EQ(array.valid.Capacity(), 0)
-      << "Meta info like label or weight can not have missing value.";
+  CHECK(array.valid.Size() == 0) << "Meta info like label or weight can not have missing value.";
  if (array.is_contiguous && array.type == ToDType<T>::kType) {
    // Handle contigious
    p_out->ModifyInplace([&](HostDeviceVector<T>* data, common::Span<size_t, D> shape) {
--- a/src/data/data.cu
+++ b/src/data/data.cu
@@ -33,8 +33,7 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
    p_out->Reshape(array.shape);
    return;
  }
-  CHECK_EQ(array.valid.Capacity(), 0)
-      << "Meta info like label or weight can not have missing value.";
+  CHECK(array.valid.Size() == 0) << "Meta info like label or weight can not have missing value.";
  auto ptr_device = SetDeviceToPtr(array.data);
  p_out->SetDevice(ptr_device);

--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -5,7 +5,6 @@
 #include <thrust/iterator/transform_output_iterator.h>

 #include "../common/categorical.h"
-#include "../common/cuda_context.cuh"
 #include "../common/hist_util.cuh"
 #include "../common/random.h"
 #include "../common/transform_iterator.h"  // MakeIndexTransformIter
@@ -314,8 +313,7 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span<size_t const>
  auto d_csc_indptr = dh::ToSpan(csc_indptr);

  auto bin_type = page.index.GetBinTypeSize();
-  common::CompressedBufferWriter writer{page.cut.TotalBins() +
-                                        static_cast<std::size_t>(1)};  // +1 for null value
+  common::CompressedBufferWriter writer{page.cut.TotalBins() + 1};  // +1 for null value

  dh::LaunchN(row_stride * page.Size(), [=] __device__(size_t idx) mutable {
    auto ridx = idx / row_stride;
@@ -359,10 +357,8 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag

  // copy gidx
  common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer();
-  dh::device_vector<size_t> row_ptr(page.row_ptr.size());
+  dh::device_vector<size_t> row_ptr(page.row_ptr);
  auto d_row_ptr = dh::ToSpan(row_ptr);
-  dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
-                                cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));

  auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
  auto null = accessor.NullValue();
--- a/src/data/ellpack_page_raw_format.cu
+++ b/src/data/ellpack_page_raw_format.cu
@@ -1,59 +1,60 @@
-/**
- * Copyright 2019-2023, XGBoost contributors
+/*!
+ * Copyright 2019-2021 XGBoost contributors
 */
+#include <xgboost/data.h>
 #include <dmlc/registry.h>

-#include <cstddef>  // for size_t
-
-#include "../common/io.h"                 // for AlignedResourceReadStream, AlignedFileWriteStream
-#include "../common/ref_resource_view.h"  // for ReadVec, WriteVec
 #include "ellpack_page.cuh"
-#include "histogram_cut_format.h"  // for ReadHistogramCuts, WriteHistogramCuts
-#include "sparse_page_writer.h"    // for SparsePageFormat
+#include "sparse_page_writer.h"
+#include "histogram_cut_format.h"
+
+namespace xgboost {
+namespace data {

-namespace xgboost::data {
 DMLC_REGISTRY_FILE_TAG(ellpack_page_raw_format);

+
 class EllpackPageRawFormat : public SparsePageFormat<EllpackPage> {
 public:
-  bool Read(EllpackPage* page, common::AlignedResourceReadStream* fi) override {
+  bool Read(EllpackPage* page, dmlc::SeekStream* fi) override {
    auto* impl = page->Impl();
    if (!ReadHistogramCuts(&impl->Cuts(), fi)) {
      return false;
    }
-    if (!fi->Read(&impl->n_rows)) {
-      return false;
-    }
-    if (!fi->Read(&impl->is_dense)) {
-      return false;
-    }
-    if (!fi->Read(&impl->row_stride)) {
-      return false;
-    }
-    if (!common::ReadVec(fi, &impl->gidx_buffer.HostVector())) {
-      return false;
-    }
+    fi->Read(&impl->n_rows);
+    fi->Read(&impl->is_dense);
+    fi->Read(&impl->row_stride);
+    fi->Read(&impl->gidx_buffer.HostVector());
    if (!fi->Read(&impl->base_rowid)) {
      return false;
    }
    return true;
  }

-  size_t Write(const EllpackPage& page, common::AlignedFileWriteStream* fo) override {
-    std::size_t bytes{0};
+  size_t Write(const EllpackPage& page, dmlc::Stream* fo) override {
+    size_t bytes = 0;
    auto* impl = page.Impl();
    bytes += WriteHistogramCuts(impl->Cuts(), fo);
-    bytes += fo->Write(impl->n_rows);
-    bytes += fo->Write(impl->is_dense);
-    bytes += fo->Write(impl->row_stride);
+    fo->Write(impl->n_rows);
+    bytes += sizeof(impl->n_rows);
+    fo->Write(impl->is_dense);
+    bytes += sizeof(impl->is_dense);
+    fo->Write(impl->row_stride);
+    bytes += sizeof(impl->row_stride);
    CHECK(!impl->gidx_buffer.ConstHostVector().empty());
-    bytes += common::WriteVec(fo, impl->gidx_buffer.HostVector());
-    bytes += fo->Write(impl->base_rowid);
+    fo->Write(impl->gidx_buffer.HostVector());
+    bytes += impl->gidx_buffer.ConstHostSpan().size_bytes() + sizeof(uint64_t);
+    fo->Write(impl->base_rowid);
+    bytes += sizeof(impl->base_rowid);
    return bytes;
  }
 };

 XGBOOST_REGISTER_ELLPACK_PAGE_FORMAT(raw)
    .describe("Raw ELLPACK binary data format.")
-    .set_body([]() { return new EllpackPageRawFormat(); });
-}  // namespace xgboost::data
+    .set_body([]() {
+      return new EllpackPageRawFormat();
+    });
+
+}  // namespace data
+}  // namespace xgboost
--- a/src/data/gradient_index.cc
+++ b/src/data/gradient_index.cc
@@ -29,7 +29,7 @@ GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_
  cut = common::SketchOnDMatrix(ctx, p_fmat, max_bins_per_feat, sorted_sketch, hess);

  const uint32_t nbins = cut.Ptrs().back();
-  hit_count = common::MakeFixedVecWithMalloc(nbins, std::size_t{0});
+  hit_count.resize(nbins, 0);
  hit_count_tloc_.resize(ctx->Threads() * nbins, 0);

  size_t new_size = 1;
@@ -37,7 +37,8 @@ GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_
    new_size += batch.Size();
  }

-  row_ptr = common::MakeFixedVecWithMalloc(new_size, std::size_t{0});
+  row_ptr.resize(new_size);
+  row_ptr[0] = 0;

  const bool isDense = p_fmat->IsDense();
  this->isDense_ = isDense;
@@ -60,8 +61,8 @@ GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_

 GHistIndexMatrix::GHistIndexMatrix(MetaInfo const &info, common::HistogramCuts &&cuts,
                                   bst_bin_t max_bin_per_feat)
-    : row_ptr{common::MakeFixedVecWithMalloc(info.num_row_ + 1, std::size_t{0})},
-      hit_count{common::MakeFixedVecWithMalloc(cuts.TotalBins(), std::size_t{0})},
+    : row_ptr(info.num_row_ + 1, 0),
+      hit_count(cuts.TotalBins(), 0),
      cut{std::forward<common::HistogramCuts>(cuts)},
      max_numeric_bins_per_feat(max_bin_per_feat),
      isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}
@@ -94,10 +95,12 @@ GHistIndexMatrix::GHistIndexMatrix(SparsePage const &batch, common::Span<Feature
      isDense_{isDense} {
  CHECK_GE(n_threads, 1);
  CHECK_EQ(row_ptr.size(), 0);
-  row_ptr = common::MakeFixedVecWithMalloc(batch.Size() + 1, std::size_t{0});
-
+  // The number of threads is pegged to the batch size. If the OMP
+  // block is parallelized on anything other than the batch/block size,
+  // it should be reassigned
+  row_ptr.resize(batch.Size() + 1, 0);
  const uint32_t nbins = cut.Ptrs().back();
-  hit_count = common::MakeFixedVecWithMalloc(nbins, std::size_t{0});
+  hit_count.resize(nbins, 0);
  hit_count_tloc_.resize(n_threads * nbins, 0);

  this->PushBatch(batch, ft, n_threads);
@@ -125,45 +128,20 @@ INSTANTIATION_PUSH(data::SparsePageAdapterBatch)
 #undef INSTANTIATION_PUSH

 void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
-  auto make_index = [this, n_index](auto t, common::BinTypeSize t_size) {
-    // Must resize instead of allocating a new one. This function is called everytime a
-    // new batch is pushed, and we grow the size accordingly without loosing the data the
-    // previous batches.
-    using T = decltype(t);
-    std::size_t n_bytes = sizeof(T) * n_index;
-    CHECK_GE(n_bytes, this->data.size());
-
-    auto resource = this->data.Resource();
-    decltype(this->data) new_vec;
-    if (!resource) {
-      CHECK(this->data.empty());
-      new_vec = common::MakeFixedVecWithMalloc(n_bytes, std::uint8_t{0});
-    } else {
-      CHECK(resource->Type() == common::ResourceHandler::kMalloc);
-      auto malloc_resource = std::dynamic_pointer_cast<common::MallocResource>(resource);
-      CHECK(malloc_resource);
-      malloc_resource->Resize(n_bytes);
-
-      // gcc-11.3 doesn't work if DataAs is used.
-      std::uint8_t *new_ptr = reinterpret_cast<std::uint8_t *>(malloc_resource->Data());
-      new_vec = {new_ptr, n_bytes / sizeof(std::uint8_t), malloc_resource};
-    }
-    this->data = std::move(new_vec);
-    this->index = common::Index{common::Span{data.data(), data.size()}, t_size};
-  };
-
  if ((MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) &&
      isDense) {
    // compress dense index to uint8
-    make_index(std::uint8_t{}, common::kUint8BinsTypeSize);
+    index.SetBinTypeSize(common::kUint8BinsTypeSize);
+    index.Resize((sizeof(uint8_t)) * n_index);
  } else if ((MaxNumBinPerFeat() - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
              MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
             isDense) {
    // compress dense index to uint16
-    make_index(std::uint16_t{}, common::kUint16BinsTypeSize);
+    index.SetBinTypeSize(common::kUint16BinsTypeSize);
+    index.Resize((sizeof(uint16_t)) * n_index);
  } else {
-    // no compression
-    make_index(std::uint32_t{}, common::kUint32BinsTypeSize);
+    index.SetBinTypeSize(common::kUint32BinsTypeSize);
+    index.Resize((sizeof(uint32_t)) * n_index);
  }
 }

@@ -236,11 +214,11 @@ float GHistIndexMatrix::GetFvalue(std::vector<std::uint32_t> const &ptrs,
  return std::numeric_limits<float>::quiet_NaN();
 }

-bool GHistIndexMatrix::ReadColumnPage(common::AlignedResourceReadStream *fi) {
+bool GHistIndexMatrix::ReadColumnPage(dmlc::SeekStream *fi) {
  return this->columns_->Read(fi, this->cut.Ptrs().data());
 }

-std::size_t GHistIndexMatrix::WriteColumnPage(common::AlignedFileWriteStream *fo) const {
+size_t GHistIndexMatrix::WriteColumnPage(dmlc::Stream *fo) const {
  return this->columns_->Write(fo);
 }
 }  // namespace xgboost
--- a/src/data/gradient_index.cu
+++ b/src/data/gradient_index.cu
@@ -1,5 +1,5 @@
-/**
- * Copyright 2022-2023, XGBoost Contributors
+/*!
+ * Copyright 2022 by XGBoost Contributors
 */
 #include <memory>  // std::unique_ptr

@@ -41,9 +41,9 @@ void SetIndexData(Context const* ctx, EllpackPageImpl const* page,
 }

 void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page,
-                          common::RefResourceView<std::size_t>* p_out) {
+                          std::vector<size_t>* p_out) {
  auto& row_ptr = *p_out;
-  row_ptr = common::MakeFixedVecWithMalloc(page->Size() + 1, std::size_t{0});
+  row_ptr.resize(page->Size() + 1, 0);
  if (page->is_dense) {
    std::fill(row_ptr.begin() + 1, row_ptr.end(), page->row_stride);
  } else {
@@ -95,7 +95,7 @@ GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info,
        ctx, page, &hit_count_tloc_, [&](auto bin_idx, auto) { return bin_idx; }, this);
  }

-  this->hit_count = common::MakeFixedVecWithMalloc(n_bins_total, std::size_t{0});
+  this->hit_count.resize(n_bins_total, 0);
  this->GatherHitCount(ctx->Threads(), n_bins_total);

  // sanity checks
--- a/src/data/gradient_index.h
+++ b/src/data/gradient_index.h
@@ -9,14 +9,13 @@
 #include <atomic>     // for atomic
 #include <cinttypes>  // for uint32_t
 #include <cstddef>    // for size_t
-#include <memory>     // for make_unique
+#include <memory>
 #include <vector>

 #include "../common/categorical.h"
 #include "../common/error_msg.h"  // for InfInData
 #include "../common/hist_util.h"
 #include "../common/numeric.h"
-#include "../common/ref_resource_view.h"  // for RefResourceView
 #include "../common/threading_utils.h"
 #include "../common/transform_iterator.h"  // for MakeIndexTransformIter
 #include "adapter.h"
@@ -26,11 +25,9 @@
 namespace xgboost {
 namespace common {
 class ColumnMatrix;
-class AlignedFileWriteStream;
 }  // namespace common
-
-/**
- * @brief preprocessed global index matrix, in CSR format.
+/*!
+ * \brief preprocessed global index matrix, in CSR format
 *
 *  Transform floating values to integer index in histogram This is a global histogram
 *  index for CPU histogram.  On GPU ellpack page is used.
@@ -136,22 +133,20 @@ class GHistIndexMatrix {
  }

 public:
-  /** @brief row pointer to rows by element position */
-  common::RefResourceView<std::size_t> row_ptr;
-  /** @brief data storage for index. */
-  common::RefResourceView<std::uint8_t> data;
-  /** @brief The histogram index. */
+  /*! \brief row pointer to rows by element position */
+  std::vector<size_t> row_ptr;
+  /*! \brief The index data */
  common::Index index;
-  /** @brief hit count of each index, used for constructing the ColumnMatrix */
-  common::RefResourceView<std::size_t> hit_count;
-  /** @brief The corresponding cuts */
+  /*! \brief hit count of each index, used for constructing the ColumnMatrix */
+  std::vector<size_t> hit_count;
+  /*! \brief The corresponding cuts */
  common::HistogramCuts cut;
-  /** @brief max_bin for each feature. */
+  /** \brief max_bin for each feature. */
  bst_bin_t max_numeric_bins_per_feat;
-  /** @brief base row index for current page (used by external memory) */
-  bst_row_t base_rowid{0};
+  /*! \brief base row index for current page (used by external memory) */
+  size_t base_rowid{0};

-  [[nodiscard]] bst_bin_t MaxNumBinPerFeat() const {
+  bst_bin_t MaxNumBinPerFeat() const {
    return std::max(static_cast<bst_bin_t>(cut.MaxCategory() + 1), max_numeric_bins_per_feat);
  }

@@ -223,27 +218,29 @@ class GHistIndexMatrix {
    }
  }

-  [[nodiscard]] bool IsDense() const { return isDense_; }
+  bool IsDense() const {
+    return isDense_;
+  }
  void SetDense(bool is_dense) { isDense_ = is_dense; }
  /**
-   * @brief Get the local row index.
+   * \brief Get the local row index.
   */
-  [[nodiscard]] std::size_t RowIdx(size_t ridx) const { return row_ptr[ridx - base_rowid]; }
+  size_t RowIdx(size_t ridx) const { return row_ptr[ridx - base_rowid]; }

-  [[nodiscard]] bst_row_t Size() const { return row_ptr.empty() ? 0 : row_ptr.size() - 1; }
-  [[nodiscard]] bst_feature_t Features() const { return cut.Ptrs().size() - 1; }
+  bst_row_t Size() const { return row_ptr.empty() ? 0 : row_ptr.size() - 1; }
+  bst_feature_t Features() const { return cut.Ptrs().size() - 1; }

-  [[nodiscard]] bool ReadColumnPage(common::AlignedResourceReadStream* fi);
-  [[nodiscard]] std::size_t WriteColumnPage(common::AlignedFileWriteStream* fo) const;
+  bool ReadColumnPage(dmlc::SeekStream* fi);
+  size_t WriteColumnPage(dmlc::Stream* fo) const;

-  [[nodiscard]] common::ColumnMatrix const& Transpose() const;
+  common::ColumnMatrix const& Transpose() const;

-  [[nodiscard]] bst_bin_t GetGindex(size_t ridx, size_t fidx) const;
+  bst_bin_t GetGindex(size_t ridx, size_t fidx) const;

-  [[nodiscard]] float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const;
-  [[nodiscard]] float GetFvalue(std::vector<std::uint32_t> const& ptrs,
-                                std::vector<float> const& values, std::vector<float> const& mins,
-                                bst_row_t ridx, bst_feature_t fidx, bool is_cat) const;
+  float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const;
+  float GetFvalue(std::vector<std::uint32_t> const& ptrs, std::vector<float> const& values,
+                  std::vector<float> const& mins, bst_row_t ridx, bst_feature_t fidx,
+                  bool is_cat) const;

 private:
  std::unique_ptr<common::ColumnMatrix> columns_;
@@ -297,5 +294,5 @@ void AssignColumnBinIndex(GHistIndexMatrix const& page, Fn&& assign) {
    }
  });
 }
-}  // namespace xgboost
+}      // namespace xgboost
 #endif  // XGBOOST_DATA_GRADIENT_INDEX_H_
--- a/src/data/gradient_index_format.cc
+++ b/src/data/gradient_index_format.cc
@@ -1,49 +1,38 @@
-/**
- * Copyright 2021-2023 XGBoost contributors
+/*!
+ * Copyright 2021-2022 XGBoost contributors
 */
-#include <cstddef>      // for size_t
-#include <cstdint>      // for uint8_t
-#include <type_traits>  // for underlying_type_t
-#include <vector>       // for vector
+#include "sparse_page_writer.h"
+#include "gradient_index.h"
+#include "histogram_cut_format.h"

-#include "../common/io.h"                 // for AlignedResourceReadStream
-#include "../common/ref_resource_view.h"  // for ReadVec, WriteVec
-#include "gradient_index.h"               // for GHistIndexMatrix
-#include "histogram_cut_format.h"         // for ReadHistogramCuts
-#include "sparse_page_writer.h"           // for SparsePageFormat
-
-namespace xgboost::data {
+namespace xgboost {
+namespace data {
 class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
 public:
-  bool Read(GHistIndexMatrix* page, common::AlignedResourceReadStream* fi) override {
-    CHECK(fi);
-
+  bool Read(GHistIndexMatrix* page, dmlc::SeekStream* fi) override {
    if (!ReadHistogramCuts(&page->cut, fi)) {
      return false;
    }
-
    // indptr
-    if (!common::ReadVec(fi, &page->row_ptr)) {
+    fi->Read(&page->row_ptr);
+    // data
+    std::vector<uint8_t> data;
+    if (!fi->Read(&data)) {
      return false;
    }
-
-    // data
-    // - bin type
+    page->index.Resize(data.size());
+    std::copy(data.cbegin(), data.cend(), page->index.begin());
+    // bin type
    // Old gcc doesn't support reading from enum.
    std::underlying_type_t<common::BinTypeSize> uint_bin_type{0};
    if (!fi->Read(&uint_bin_type)) {
      return false;
    }
-    common::BinTypeSize size_type = static_cast<common::BinTypeSize>(uint_bin_type);
-    // - index buffer
-    if (!common::ReadVec(fi, &page->data)) {
-      return false;
-    }
-    // - index
-    page->index = common::Index{common::Span{page->data.data(), page->data.size()}, size_type};
-
+    common::BinTypeSize size_type =
+        static_cast<common::BinTypeSize>(uint_bin_type);
+    page->index.SetBinTypeSize(size_type);
    // hit count
-    if (!common::ReadVec(fi, &page->hit_count)) {
+    if (!fi->Read(&page->hit_count)) {
      return false;
    }
    if (!fi->Read(&page->max_numeric_bins_per_feat)) {
@@ -61,34 +50,38 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
      page->index.SetBinOffset(page->cut.Ptrs());
    }

-    if (!page->ReadColumnPage(fi)) {
-      return false;
-    }
+    page->ReadColumnPage(fi);
    return true;
  }

-  std::size_t Write(GHistIndexMatrix const& page, common::AlignedFileWriteStream* fo) override {
-    CHECK_NE(page.index.Size(), 0) << "Empty page is not supported.";
-    std::size_t bytes = 0;
+  size_t Write(GHistIndexMatrix const &page, dmlc::Stream *fo) override {
+    size_t bytes = 0;
    bytes += WriteHistogramCuts(page.cut, fo);
    // indptr
-    bytes += common::WriteVec(fo, page.row_ptr);
-
+    fo->Write(page.row_ptr);
+    bytes += page.row_ptr.size() * sizeof(decltype(page.row_ptr)::value_type) +
+             sizeof(uint64_t);
    // data
-    // - bin type
-    std::underlying_type_t<common::BinTypeSize> uint_bin_type = page.index.GetBinTypeSize();
-    bytes += fo->Write(uint_bin_type);
-    // - index buffer
-    std::vector<std::uint8_t> data(page.index.begin(), page.index.end());
-    bytes += fo->Write(static_cast<std::uint64_t>(data.size()));
-    bytes += fo->Write(data.data(), data.size());
-
+    std::vector<uint8_t> data(page.index.begin(), page.index.end());
+    fo->Write(data);
+    bytes += data.size() * sizeof(decltype(data)::value_type) + sizeof(uint64_t);
+    // bin type
+    std::underlying_type_t<common::BinTypeSize> uint_bin_type =
+        page.index.GetBinTypeSize();
+    fo->Write(uint_bin_type);
+    bytes += sizeof(page.index.GetBinTypeSize());
    // hit count
-    bytes += common::WriteVec(fo, page.hit_count);
+    fo->Write(page.hit_count);
+    bytes +=
+        page.hit_count.size() * sizeof(decltype(page.hit_count)::value_type) +
+        sizeof(uint64_t);
    // max_bins, base row, is_dense
-    bytes += fo->Write(page.max_numeric_bins_per_feat);
-    bytes += fo->Write(page.base_rowid);
-    bytes += fo->Write(page.IsDense());
+    fo->Write(page.max_numeric_bins_per_feat);
+    bytes += sizeof(page.max_numeric_bins_per_feat);
+    fo->Write(page.base_rowid);
+    bytes += sizeof(page.base_rowid);
+    fo->Write(page.IsDense());
+    bytes += sizeof(page.IsDense());

    bytes += page.WriteColumnPage(fo);
    return bytes;
@@ -100,4 +93,6 @@ DMLC_REGISTRY_FILE_TAG(gradient_index_format);
 XGBOOST_REGISTER_GHIST_INDEX_PAGE_FORMAT(raw)
    .describe("Raw GHistIndex binary data format.")
    .set_body([]() { return new GHistIndexRawFormat(); });
-}  // namespace xgboost::data
+
+}  // namespace data
+}  // namespace xgboost
--- a/src/data/gradient_index_page_source.cc
+++ b/src/data/gradient_index_page_source.cc
@@ -1,9 +1,10 @@
-/**
- * Copyright 2021-2023, XGBoost Contributors
+/*!
+ * Copyright 2021-2022 by XGBoost Contributors
 */
 #include "gradient_index_page_source.h"

-namespace xgboost::data {
+namespace xgboost {
+namespace data {
 void GradientIndexPageSource::Fetch() {
  if (!this->ReadCache()) {
    if (count_ != 0 && !sync_) {
@@ -20,4 +21,5 @@ void GradientIndexPageSource::Fetch() {
    this->WriteCache();
  }
 }
-}  // namespace xgboost::data
+}  // namespace data
+}  // namespace xgboost
--- a/src/data/histogram_cut_format.h
+++ b/src/data/histogram_cut_format.h
@@ -1,38 +1,36 @@
-/**
- * Copyright 2021-2023, XGBoost contributors
+/*!
+ * Copyright 2021 XGBoost contributors
 */
 #ifndef XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
 #define XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_

-#include <dmlc/io.h>  // for Stream
+#include "../common/hist_util.h"

-#include <cstddef>  // for size_t
-
-#include "../common/hist_util.h"          // for HistogramCuts
-#include "../common/io.h"                 // for AlignedResourceReadStream, AlignedFileWriteStream
-#include "../common/ref_resource_view.h"  // for WriteVec, ReadVec
-
-namespace xgboost::data {
-inline bool ReadHistogramCuts(common::HistogramCuts *cuts, common::AlignedResourceReadStream *fi) {
-  if (!common::ReadVec(fi, &cuts->cut_values_.HostVector())) {
+namespace xgboost {
+namespace data {
+inline bool ReadHistogramCuts(common::HistogramCuts *cuts, dmlc::SeekStream *fi) {
+  if (!fi->Read(&cuts->cut_values_.HostVector())) {
    return false;
  }
-  if (!common::ReadVec(fi, &cuts->cut_ptrs_.HostVector())) {
+  if (!fi->Read(&cuts->cut_ptrs_.HostVector())) {
    return false;
  }
-  if (!common::ReadVec(fi, &cuts->min_vals_.HostVector())) {
+  if (!fi->Read(&cuts->min_vals_.HostVector())) {
    return false;
  }
  return true;
 }

-inline std::size_t WriteHistogramCuts(common::HistogramCuts const &cuts,
-                                      common::AlignedFileWriteStream *fo) {
-  std::size_t bytes = 0;
-  bytes += common::WriteVec(fo, cuts.Values());
-  bytes += common::WriteVec(fo, cuts.Ptrs());
-  bytes += common::WriteVec(fo, cuts.MinValues());
+inline size_t WriteHistogramCuts(common::HistogramCuts const &cuts, dmlc::Stream *fo) {
+  size_t bytes = 0;
+  fo->Write(cuts.cut_values_.ConstHostVector());
+  bytes += cuts.cut_values_.ConstHostSpan().size_bytes() + sizeof(uint64_t);
+  fo->Write(cuts.cut_ptrs_.ConstHostVector());
+  bytes += cuts.cut_ptrs_.ConstHostSpan().size_bytes() + sizeof(uint64_t);
+  fo->Write(cuts.min_vals_.ConstHostVector());
+  bytes += cuts.min_vals_.ConstHostSpan().size_bytes() + sizeof(uint64_t);
  return bytes;
 }
-}  // namespace xgboost::data
+}  // namespace data
+}  // namespace xgboost
 #endif  // XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
--- a/src/data/iterative_dmatrix.cc
+++ b/src/data/iterative_dmatrix.cc
@@ -240,9 +240,9 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p,
   * Generate gradient index.
   */
  this->ghist_ = std::make_unique<GHistIndexMatrix>(Info(), std::move(cuts), p.max_bin);
-  std::size_t rbegin = 0;
-  std::size_t prev_sum = 0;
-  std::size_t i = 0;
+  size_t rbegin = 0;
+  size_t prev_sum = 0;
+  size_t i = 0;
  while (iter.Next()) {
    HostAdapterDispatch(proxy, [&](auto const& batch) {
      proxy->Info().num_nonzero_ = batch_nnz[i];
--- a/src/data/iterative_dmatrix.cu
+++ b/src/data/iterative_dmatrix.cu
@@ -31,10 +31,10 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
  dh::XGBCachingDeviceAllocator<char> alloc;

  auto num_rows = [&]() {
-    return cuda_impl::Dispatch(proxy, [](auto const& value) { return value.NumRows(); });
+    return Dispatch(proxy, [](auto const& value) { return value.NumRows(); });
  };
  auto num_cols = [&]() {
-    return cuda_impl::Dispatch(proxy, [](auto const& value) { return value.NumCols(); });
+    return Dispatch(proxy, [](auto const& value) { return value.NumCols(); });
  };

  size_t row_stride = 0;
@@ -74,7 +74,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
                                     get_device());
      auto* p_sketch = &sketch_containers.back();
      proxy->Info().weights_.SetDevice(get_device());
-      cuda_impl::Dispatch(proxy, [&](auto const& value) {
+      Dispatch(proxy, [&](auto const& value) {
        common::AdapterDeviceSketch(value, p.max_bin, proxy->Info(), missing, p_sketch);
      });
    }
@@ -82,7 +82,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
    accumulated_rows += batch_rows;
    dh::device_vector<size_t> row_counts(batch_rows + 1, 0);
    common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
-    row_stride = std::max(row_stride, cuda_impl::Dispatch(proxy, [=](auto const& value) {
+    row_stride = std::max(row_stride, Dispatch(proxy, [=](auto const& value) {
                            return GetRowCounts(value, row_counts_span, get_device(), missing);
                          }));
    nnz += thrust::reduce(thrust::cuda::par(alloc), row_counts.begin(), row_counts.end());
@@ -136,14 +136,14 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
    auto rows = num_rows();
    dh::device_vector<size_t> row_counts(rows + 1, 0);
    common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
-    cuda_impl::Dispatch(proxy, [=](auto const& value) {
+    Dispatch(proxy, [=](auto const& value) {
      return GetRowCounts(value, row_counts_span, get_device(), missing);
    });
    auto is_dense = this->IsDense();

    proxy->Info().feature_types.SetDevice(get_device());
    auto d_feature_types = proxy->Info().feature_types.ConstDeviceSpan();
-    auto new_impl = cuda_impl::Dispatch(proxy, [&](auto const& value) {
+    auto new_impl = Dispatch(proxy, [&](auto const& value) {
      return EllpackPageImpl(value, missing, get_device(), is_dense, row_counts_span,
                             d_feature_types, row_stride, rows, cuts);
    });
--- a/src/data/proxy_dmatrix.cc
+++ b/src/data/proxy_dmatrix.cc
@@ -1,13 +1,14 @@
-/**
- * Copyright 2021-2023, XGBoost Contributors
+/*!
+ * Copyright 2021 by Contributors
 * \file proxy_dmatrix.cc
 */

 #include "proxy_dmatrix.h"

-namespace xgboost::data {
-void DMatrixProxy::SetArrayData(StringView interface_str) {
-  std::shared_ptr<ArrayAdapter> adapter{new ArrayAdapter{interface_str}};
+namespace xgboost {
+namespace data {
+void DMatrixProxy::SetArrayData(char const *c_interface) {
+  std::shared_ptr<ArrayAdapter> adapter{new ArrayAdapter(StringView{c_interface})};
  this->batch_ = adapter;
  this->Info().num_col_ = adapter->NumColumns();
  this->Info().num_row_ = adapter->NumRows();
@@ -24,36 +25,5 @@ void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices,
  this->Info().num_row_ = adapter->NumRows();
  this->ctx_.gpu_id = Context::kCpuId;
 }
-
-namespace cuda_impl {
-std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
-                                                std::shared_ptr<DMatrixProxy> proxy, float missing);
-#if !defined(XGBOOST_USE_CUDA)
-std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *, std::shared_ptr<DMatrixProxy>,
-                                                float) {
-  return nullptr;
-}
-#endif  // XGBOOST_USE_CUDA
-}  // namespace cuda_impl
-
-std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
-                                                std::shared_ptr<DMatrixProxy> proxy,
-                                                float missing) {
-  bool type_error{false};
-  std::shared_ptr<DMatrix> p_fmat{nullptr};
-  if (proxy->Ctx()->IsCPU()) {
-    p_fmat = data::HostAdapterDispatch<false>(
-        proxy.get(),
-        [&](auto const &adapter) {
-          auto p_fmat =
-              std::shared_ptr<DMatrix>(DMatrix::Create(adapter.get(), missing, ctx->Threads()));
-          return p_fmat;
-        },
-        &type_error);
-  } else {
-    p_fmat = cuda_impl::CreateDMatrixFromProxy(ctx, proxy, missing);
-  }
-
-  return p_fmat;
-}
-}  // namespace xgboost::data
+}  // namespace data
+}  // namespace xgboost
--- a/src/data/proxy_dmatrix.cu
+++ b/src/data/proxy_dmatrix.cu
@@ -1,11 +1,12 @@
-/**
- * Copyright 2020-2023, XGBoost contributors
+/*!
+ * Copyright 2020-2022, XGBoost contributors
 */
-#include "device_adapter.cuh"
-#include "proxy_dmatrix.cuh"
 #include "proxy_dmatrix.h"
+#include "device_adapter.cuh"
+
+namespace xgboost {
+namespace data {

-namespace xgboost::data {
 void DMatrixProxy::FromCudaColumnar(StringView interface_str) {
  std::shared_ptr<data::CudfAdapter> adapter{new CudfAdapter{interface_str}};
  auto const& value = adapter->Value();
@@ -30,15 +31,5 @@ void DMatrixProxy::FromCudaArray(StringView interface_str) {
    ctx_.gpu_id = dh::CurrentDevice();
  }
 }
-
-namespace cuda_impl {
-std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const* ctx,
-                                                std::shared_ptr<DMatrixProxy> proxy,
-                                                float missing) {
-  return Dispatch<false>(proxy.get(), [&](auto const& adapter) {
-    auto p_fmat = std::shared_ptr<DMatrix>{DMatrix::Create(adapter.get(), missing, ctx->Threads())};
-    return p_fmat;
-  });
-}
-}  // namespace cuda_impl
-}  // namespace xgboost::data
+}  // namespace data
+}  // namespace xgboost
--- a/src/data/proxy_dmatrix.cuh
+++ b/src/data/proxy_dmatrix.cuh
@@ -6,34 +6,19 @@
 #include "device_adapter.cuh"
 #include "proxy_dmatrix.h"

-namespace xgboost::data::cuda_impl {
-template <bool get_value = true, typename Fn>
+namespace xgboost::data {
+template <typename Fn>
 decltype(auto) Dispatch(DMatrixProxy const* proxy, Fn fn) {
  if (proxy->Adapter().type() == typeid(std::shared_ptr<CupyAdapter>)) {
-    if constexpr (get_value) {
-      auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter())->Value();
-      return fn(value);
-    } else {
-      auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter());
-      return fn(value);
-    }
+    auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter())->Value();
+    return fn(value);
  } else if (proxy->Adapter().type() == typeid(std::shared_ptr<CudfAdapter>)) {
-    if constexpr (get_value) {
-      auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
-      return fn(value);
-    } else {
-      auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter());
-      return fn(value);
-    }
+    auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
+    return fn(value);
  } else {
    LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
-    if constexpr (get_value) {
-      auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
-      return fn(value);
-    } else {
-      auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter());
-      return fn(value);
-    }
+    auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
+    return fn(value);
  }
 }
-}  // namespace xgboost::data::cuda_impl
+}  // namespace xgboost::data
--- a/src/data/proxy_dmatrix.h
+++ b/src/data/proxy_dmatrix.h
@@ -62,7 +62,7 @@ class DMatrixProxy : public DMatrix {
 #endif  // defined(XGBOOST_USE_CUDA)
  }

-  void SetArrayData(StringView interface_str);
+  void SetArrayData(char const* c_interface);
  void SetCSRData(char const* c_indptr, char const* c_indices, char const* c_values,
                  bst_feature_t n_features, bool on_host);

@@ -114,62 +114,28 @@ inline DMatrixProxy* MakeProxy(DMatrixHandle proxy) {
  return typed;
 }

-/**
- * @brief Dispatch function call based on input type.
- *
- * @tparam get_value Whether the funciton Fn accept an adapter batch or the adapter itself.
- * @tparam Fn        The type of the function to be dispatched.
- *
- * @param proxy The proxy object holding the reference to the input.
- * @param fn    The function to be dispatched.
- * @param type_error[out] Set to ture if it's not null and the input data is not recognized by
- *                        the host.
- *
- * @return The return value of the function being dispatched.
- */
-template <bool get_value = true, typename Fn>
+template <typename Fn>
 decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_error = nullptr) {
  if (proxy->Adapter().type() == typeid(std::shared_ptr<CSRArrayAdapter>)) {
-    if constexpr (get_value) {
-      auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter())->Value();
-      return fn(value);
-    } else {
-      auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter());
-      return fn(value);
-    }
+    auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter())->Value();
    if (type_error) {
      *type_error = false;
    }
+    return fn(value);
  } else if (proxy->Adapter().type() == typeid(std::shared_ptr<ArrayAdapter>)) {
-    if constexpr (get_value) {
-      auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter())->Value();
-      return fn(value);
-    } else {
-      auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter());
-      return fn(value);
-    }
+    auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter())->Value();
    if (type_error) {
      *type_error = false;
    }
+    return fn(value);
  } else {
    if (type_error) {
      *type_error = true;
    } else {
      LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
    }
-    if constexpr (get_value) {
-      return std::result_of_t<Fn(
-          decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
-    } else {
-      return std::result_of_t<Fn(decltype(std::declval<std::shared_ptr<ArrayAdapter>>()))>();
-    }
+    return std::result_of_t<Fn(decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
  }
 }
-
-/**
- * @brief Create a `SimpleDMatrix` instance from a `DMatrixProxy`.
- */
-std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const* ctx,
-                                                std::shared_ptr<DMatrixProxy> proxy, float missing);
 }  // namespace xgboost::data
 #endif  // XGBOOST_DATA_PROXY_DMATRIX_H_
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -21,7 +21,8 @@
 #include "xgboost/c_api.h"
 #include "xgboost/data.h"

-namespace xgboost::data {
+namespace xgboost {
+namespace data {
 MetaInfo& SimpleDMatrix::Info() { return info_; }

 const MetaInfo& SimpleDMatrix::Info() const { return info_; }
@@ -96,10 +97,6 @@ BatchSet<SparsePage> SimpleDMatrix::GetRowBatches() {
 BatchSet<CSCPage> SimpleDMatrix::GetColumnBatches(Context const* ctx) {
  // column page doesn't exist, generate it
  if (!column_page_) {
-    auto n = std::numeric_limits<decltype(Entry::index)>::max();
-    if (this->sparse_page_->Size() > n) {
-      error::MaxSampleSize(n);
-    }
    column_page_.reset(new CSCPage(sparse_page_->GetTranspose(info_.num_col_, ctx->Threads())));
  }
  auto begin_iter = BatchIterator<CSCPage>(new SimpleBatchIteratorImpl<CSCPage>(column_page_));
@@ -109,10 +106,6 @@ BatchSet<CSCPage> SimpleDMatrix::GetColumnBatches(Context const* ctx) {
 BatchSet<SortedCSCPage> SimpleDMatrix::GetSortedColumnBatches(Context const* ctx) {
  // Sorted column page doesn't exist, generate it
  if (!sorted_column_page_) {
-    auto n = std::numeric_limits<decltype(Entry::index)>::max();
-    if (this->sparse_page_->Size() > n) {
-      error::MaxSampleSize(n);
-    }
    sorted_column_page_.reset(
        new SortedCSCPage(sparse_page_->GetTranspose(info_.num_col_, ctx->Threads())));
    sorted_column_page_->SortRows(ctx->Threads());
@@ -434,4 +427,5 @@ SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, i

  fmat_ctx_ = ctx;
 }
-}  // namespace xgboost::data
+}  // namespace data
+}  // namespace xgboost
--- a/src/data/sparse_page_dmatrix.h
+++ b/src/data/sparse_page_dmatrix.h
@@ -7,6 +7,9 @@
 #ifndef XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
 #define XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_

+#include <xgboost/data.h>
+#include <xgboost/logging.h>
+
 #include <algorithm>
 #include <map>
 #include <memory>
@@ -17,33 +20,35 @@
 #include "ellpack_page_source.h"
 #include "gradient_index_page_source.h"
 #include "sparse_page_source.h"
-#include "xgboost/data.h"
-#include "xgboost/logging.h"

-namespace xgboost::data {
+namespace xgboost {
+namespace data {
 /**
 * \brief DMatrix used for external memory.
 *
 * The external memory is created for controlling memory usage by splitting up data into
- * multiple batches.  However that doesn't mean we will actually process exactly 1 batch
- * at a time, which would be terribly slow considering that we have to loop through the
- * whole dataset for every tree split.  So we use async to pre-fetch pages and let the
- * caller to decide how many batches it wants to process by returning data as a shared
- * pointer. The caller can use async function to process the data or just stage those
- * batches based on its use cases. These two optimizations might defeat the purpose of
- * splitting up dataset since if you stage all the batches then the memory usage might be
- * even worse than using a single batch. As a result, we must control how many batches can
- * be in memory at any given time.
+ * multiple batches.  However, that doesn't mean we will actually process exactly 1 batch
+ * at a time, which would be terribly slow considering that we have to loop through the
+ * whole dataset for every tree split.  So we use async pre-fetch and let the caller decide
+ * how many batches it wants to process by returning data as a shared pointer.  The caller
+ * can use an async function to process the data or just stage those batches; making that
+ * decision is out of scope for the sparse page dmatrix.  These 2 optimizations might
+ * defeat the purpose of splitting up the dataset, since if you load all the batches then
+ * the memory usage is even worse than using a single batch.  Essentially we need to
+ * control how many batches can be in memory at the same time.
 *
- * Right now the write to the cache is a sequential operation and is blocking. Reading
- * from cache on ther other hand, is async but with a hard coded limit of 3 pages as an
- * heuristic.  So by sparse dmatrix itself there can be only 7 pages in main memory (might
- * be of different types) at the same time: 1 page pending for write, 3 pre-fetched sparse
- * pages, 3 pre-fetched dependent pages.
+ * Right now the write to the cache is a sequential, blocking operation, while reading from
+ * the cache is async but with a hard-coded limit of 4 pages as a heuristic.  So by sparse
+ * dmatrix itself there can be only 9 pages in main memory (might be of different types)
+ * at the same time: 1 page pending for write, 4 pre-fetched sparse pages, 4 pre-fetched
+ * dependent pages.  If the caller stops iteration in the middle and starts again, then the
+ * number of pages in memory can hit 16 due to pre-fetching, but this should be a bug in the
+ * caller's code (XGBoost doesn't discard a large portion of data at the end; there's no
+ * sampling algo that samples only the first portion of data).
 *
 * Of course if the caller decides to retain some batches to perform parallel processing,
 * then we might load all pages in memory, which is also considered as a bug in caller's
- * code. So if the algo supports external memory, it must be careful that queue for async
+ * code.  So if the algo supports external memory, it must be careful that queue for async
 * call must have an upper limit.
 *
 * Another assumption we make is that the data must be immutable so caller should never
@@ -96,7 +101,7 @@ class SparsePageDMatrix : public DMatrix {
  MetaInfo &Info() override;
  const MetaInfo &Info() const override;
  Context const *Ctx() const override { return &fmat_ctx_; }
-  // The only DMatrix implementation that returns false.
+
  bool SingleColBlock() const override { return false; }
  DMatrix *Slice(common::Span<int32_t const>) override {
    LOG(FATAL) << "Slicing DMatrix is not supported for external memory.";
@@ -148,5 +153,6 @@ inline std::string MakeCache(SparsePageDMatrix *ptr, std::string format, std::st
  }
  return id;
 }
-}  // namespace xgboost::data
+}  // namespace data
+}  // namespace xgboost
 #endif  // XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_
--- a/src/data/sparse_page_raw_format.cc
+++ b/src/data/sparse_page_raw_format.cc
@@ -1,57 +1,59 @@
-/**
- * Copyright 2015-2023, XGBoost Contributors
+/*!
+ * Copyright (c) 2015-2021 by Contributors
 * \file sparse_page_raw_format.cc
 *  Raw binary format of sparse page.
 */
+#include <xgboost/data.h>
 #include <dmlc/registry.h>

-#include "../common/io.h"                 // for AlignedResourceReadStream, AlignedFileWriteStream
-#include "../common/ref_resource_view.h"  // for WriteVec
-#include "./sparse_page_writer.h"
-#include "xgboost/data.h"
 #include "xgboost/logging.h"
+#include "./sparse_page_writer.h"
+
+namespace xgboost {
+namespace data {

-namespace xgboost::data {
 DMLC_REGISTRY_FILE_TAG(sparse_page_raw_format);

-template <typename T>
+template<typename T>
 class SparsePageRawFormat : public SparsePageFormat<T> {
 public:
-  bool Read(T* page, common::AlignedResourceReadStream* fi) override {
+  bool Read(T* page, dmlc::SeekStream* fi) override {
    auto& offset_vec = page->offset.HostVector();
-    if (!common::ReadVec(fi, &offset_vec)) {
+    if (!fi->Read(&offset_vec)) {
      return false;
    }
    auto& data_vec = page->data.HostVector();
    CHECK_NE(page->offset.Size(), 0U) << "Invalid SparsePage file";
    data_vec.resize(offset_vec.back());
    if (page->data.Size() != 0) {
-      if (!common::ReadVec(fi, &data_vec)) {
-        return false;
-      }
-    }
-    if (!fi->Read(&page->base_rowid, sizeof(page->base_rowid))) {
-      return false;
+      size_t n_bytes = fi->Read(dmlc::BeginPtr(data_vec),
+                                (page->data).Size() * sizeof(Entry));
+      CHECK_EQ(n_bytes, (page->data).Size() * sizeof(Entry))
+          << "Invalid SparsePage file";
    }
+    fi->Read(&page->base_rowid, sizeof(page->base_rowid));
    return true;
  }

-  std::size_t Write(const T& page, common::AlignedFileWriteStream* fo) override {
+  size_t Write(const T& page, dmlc::Stream* fo) override {
    const auto& offset_vec = page.offset.HostVector();
    const auto& data_vec = page.data.HostVector();
    CHECK(page.offset.Size() != 0 && offset_vec[0] == 0);
    CHECK_EQ(offset_vec.back(), page.data.Size());
-
-    std::size_t bytes{0};
-    bytes += common::WriteVec(fo, offset_vec);
+    fo->Write(offset_vec);
+    auto bytes = page.MemCostBytes();
+    bytes += sizeof(uint64_t);
    if (page.data.Size() != 0) {
-      bytes += common::WriteVec(fo, data_vec);
+      fo->Write(dmlc::BeginPtr(data_vec), page.data.Size() * sizeof(Entry));
    }
-    bytes += fo->Write(&page.base_rowid, sizeof(page.base_rowid));
+    fo->Write(&page.base_rowid, sizeof(page.base_rowid));
+    bytes += sizeof(page.base_rowid);
    return bytes;
  }

 private:
+  /*! \brief external memory column offset */
+  std::vector<size_t> disk_offset_;
 };

 XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(raw)
@@ -72,4 +74,5 @@ XGBOOST_REGISTER_SORTED_CSC_PAGE_FORMAT(raw)
    return new SparsePageRawFormat<SortedCSCPage>();
  });

-}  // namespace xgboost::data
+}  // namespace data
+}  // namespace xgboost
--- a/src/data/sparse_page_source.cu
+++ b/src/data/sparse_page_source.cu
@@ -1,31 +1,33 @@
-/**
- * Copyright 2021-2023, XGBoost contributors
+/*!
+ * Copyright 2021 XGBoost contributors
 */
-#include "../common/device_helpers.cuh"  // for CurrentDevice
-#include "proxy_dmatrix.cuh"             // for Dispatch, DMatrixProxy
-#include "simple_dmatrix.cuh"            // for CopyToSparsePage
 #include "sparse_page_source.h"
-#include "xgboost/data.h"  // for SparsePage
+#include "proxy_dmatrix.cuh"
+#include "simple_dmatrix.cuh"
+
+namespace xgboost {
+namespace data {

-namespace xgboost::data {
 namespace detail {
 std::size_t NSamplesDevice(DMatrixProxy *proxy) {
-  return cuda_impl::Dispatch(proxy, [](auto const &value) { return value.NumRows(); });
+  return Dispatch(proxy, [](auto const &value) { return value.NumRows(); });
 }

 std::size_t NFeaturesDevice(DMatrixProxy *proxy) {
-  return cuda_impl::Dispatch(proxy, [](auto const &value) { return value.NumCols(); });
+  return Dispatch(proxy, [](auto const &value) { return value.NumCols(); });
 }
 }  // namespace detail

-void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) {
+void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page) {
  auto device = proxy->DeviceIdx();
  if (device < 0) {
    device = dh::CurrentDevice();
  }
  CHECK_GE(device, 0);

-  cuda_impl::Dispatch(proxy,
-                      [&](auto const &value) { CopyToSparsePage(value, device, missing, page); });
+  Dispatch(proxy, [&](auto const &value) {
+    CopyToSparsePage(value, device, missing, page);
+  });
 }
-}  // namespace xgboost::data
+}  // namespace data
+}  // namespace xgboost
--- a/src/data/sparse_page_source.h
+++ b/src/data/sparse_page_source.h
@@ -1,49 +1,45 @@
-/**
- *  Copyright 2014-2023, XGBoost Contributors
+/*!
+ *  Copyright 2014-2022 by XGBoost Contributors
 * \file sparse_page_source.h
 */
 #ifndef XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
 #define XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_

-#include <algorithm>  // for min
-#include <atomic>     // for atomic
-#include <future>     // for async
+#include <algorithm>  // std::min
+#include <string>
+#include <utility>
+#include <vector>
+#include <future>
+#include <thread>
 #include <map>
 #include <memory>
-#include <mutex>  // for mutex
-#include <string>
-#include <thread>
-#include <utility>  // for pair, move
-#include <vector>

-#include "../common/common.h"
-#include "../common/io.h"     // for PrivateMmapConstStream
-#include "../common/timer.h"  // for Monitor, Timer
-#include "adapter.h"
-#include "proxy_dmatrix.h"       // for DMatrixProxy
-#include "sparse_page_writer.h"  // for SparsePageFormat
 #include "xgboost/base.h"
 #include "xgboost/data.h"

-namespace xgboost::data {
+#include "adapter.h"
+#include "sparse_page_writer.h"
+#include "proxy_dmatrix.h"
+
+#include "../common/common.h"
+#include "../common/timer.h"
+
+namespace xgboost {
+namespace data {
 inline void TryDeleteCacheFile(const std::string& file) {
  if (std::remove(file.c_str()) != 0) {
-    // Don't throw, this is called in a destructor.
    LOG(WARNING) << "Couldn't remove external memory cache file " << file
-                 << "; you may want to remove it manually";
+              << "; you may want to remove it manually";
  }
 }

-/**
- * @brief Information about the cache including path and page offsets.
- */
 struct Cache {
  // whether the write to the cache is complete
  bool written;
  std::string name;
  std::string format;
  // offset into binary cache file.
-  std::vector<std::uint64_t> offset;
+  std::vector<size_t> offset;

  Cache(bool w, std::string n, std::string fmt)
      : written{w}, name{std::move(n)}, format{std::move(fmt)} {
@@ -55,24 +51,11 @@ struct Cache {
    return name + format;
  }

-  [[nodiscard]] std::string ShardName() const {
+  std::string ShardName() {
    return ShardName(this->name, this->format);
  }
-  /**
-   * @brief Record a page with size of n_bytes.
-   */
-  void Push(std::size_t n_bytes) { offset.push_back(n_bytes); }
-  /**
-   * @brief Returns the view start and length for the i^th page.
-   */
-  [[nodiscard]] auto View(std::size_t i) const {
-    std::uint64_t off = offset.at(i);
-    std::uint64_t len = offset.at(i + 1) - offset[i];
-    return std::pair{off, len};
-  }
-  /**
-   * @brief Call this once the write for the cache is complete.
-   */
+
+  // The write is completed.
  void Commit() {
    if (!written) {
      std::partial_sum(offset.begin(), offset.end(), offset.begin());
@@ -81,7 +64,7 @@ struct Cache {
  }
 };

-// Prevents multi-threaded call to `GetBatches`.
+// Prevents multi-threaded call.
 class TryLockGuard {
  std::mutex& lock_;

@@ -94,128 +77,74 @@ class TryLockGuard {
  }
 };

-// Similar to `dmlc::OMPException`, but doesn't need the threads to be joined before rethrow
-class ExceHandler {
-  std::mutex mutex_;
-  std::atomic<bool> flag_{false};
-  std::exception_ptr curr_exce_{nullptr};
-
- public:
-  template <typename Fn>
-  decltype(auto) Run(Fn&& fn) noexcept(true) {
-    try {
-      return fn();
-    } catch (dmlc::Error const& e) {
-      std::lock_guard<std::mutex> guard{mutex_};
-      if (!curr_exce_) {
-        curr_exce_ = std::current_exception();
-      }
-      flag_ = true;
-    } catch (std::exception const& e) {
-      std::lock_guard<std::mutex> guard{mutex_};
-      if (!curr_exce_) {
-        curr_exce_ = std::current_exception();
-      }
-      flag_ = true;
-    } catch (...) {
-      std::lock_guard<std::mutex> guard{mutex_};
-      if (!curr_exce_) {
-        curr_exce_ = std::current_exception();
-      }
-      flag_ = true;
-    }
-    return std::invoke_result_t<Fn>();
-  }
-
-  void Rethrow() noexcept(false) {
-    if (flag_) {
-      CHECK(curr_exce_);
-      std::rethrow_exception(curr_exce_);
-    }
-  }
-};
-
-/**
- * @brief Base class for all page sources. Handles fetching, writing, and iteration.
- */
 template <typename S>
 class SparsePageSourceImpl : public BatchIteratorImpl<S> {
 protected:
  // Prevents calling this iterator from multiple places(or threads).
  std::mutex single_threaded_;
-  // The current page.
+
  std::shared_ptr<S> page_;

  bool at_end_ {false};
  float missing_;
-  std::int32_t nthreads_;
+  int nthreads_;
  bst_feature_t n_features_;
-  // Index to the current page.
-  std::uint32_t count_{0};
-  // Total number of batches.
-  std::uint32_t n_batches_{0};
+
+  uint32_t count_{0};
+
+  uint32_t n_batches_ {0};

  std::shared_ptr<Cache> cache_info_;
+  std::unique_ptr<dmlc::Stream> fo_;

  using Ring = std::vector<std::future<std::shared_ptr<S>>>;
  // A ring storing futures to data.  Since the DMatrix iterator is forward only, so we
  // can pre-fetch data in a ring.
  std::unique_ptr<Ring> ring_{new Ring};
-  // Catching exception in pre-fetch threads to prevent segfault. Not always work though,
-  // OOM error can be delayed due to lazy commit. On the bright side, if mmap is used then
-  // OOM error should be rare.
-  ExceHandler exce_;
-  common::Monitor monitor_;

  bool ReadCache() {
    CHECK(!at_end_);
    if (!cache_info_->written) {
      return false;
    }
-    if (ring_->empty()) {
+    if (fo_) {
+      fo_.reset();  // flush the data to disk.
      ring_->resize(n_batches_);
    }
    // An heuristic for number of pre-fetched batches.  We can make it part of BatchParam
    // to let user adjust number of pre-fetched batches when needed.
-    uint32_t constexpr kPreFetch = 3;
+    uint32_t constexpr kPreFetch = 4;

    size_t n_prefetch_batches = std::min(kPreFetch, n_batches_);
    CHECK_GT(n_prefetch_batches, 0) << "total batches:" << n_batches_;
-    std::size_t fetch_it = count_;
+    size_t fetch_it = count_;

-    exce_.Rethrow();
-
-    for (std::size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
+    for (size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
      fetch_it %= n_batches_;  // ring
      if (ring_->at(fetch_it).valid()) {
        continue;
      }
-      auto const* self = this;  // make sure it's const
+      auto const *self = this;  // make sure it's const
      CHECK_LT(fetch_it, cache_info_->offset.size());
-      ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self, this]() {
+      ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self]() {
+        common::Timer timer;
+        timer.Start();
+        std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
+        auto n = self->cache_info_->ShardName();
+        size_t offset = self->cache_info_->offset.at(fetch_it);
+        std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(n.c_str())};
+        fi->Seek(offset);
+        CHECK_EQ(fi->Tell(), offset);
        auto page = std::make_shared<S>();
-        this->exce_.Run([&] {
-          std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
-          auto name = self->cache_info_->ShardName();
-          auto [offset, length] = self->cache_info_->View(fetch_it);
-          auto fi = std::make_unique<common::PrivateMmapConstStream>(name, offset, length);
-          CHECK(fmt->Read(page.get(), fi.get()));
-        });
+        CHECK(fmt->Read(page.get(), fi.get()));
+        LOG(INFO) << "Read a page in " << timer.ElapsedSeconds() << " seconds.";
        return page;
      });
    }
-
    CHECK_EQ(std::count_if(ring_->cbegin(), ring_->cend(), [](auto const& f) { return f.valid(); }),
             n_prefetch_batches)
        << "Sparse DMatrix assumes forward iteration.";
-
-    monitor_.Start("Wait");
    page_ = (*ring_)[count_].get();
-    CHECK(!(*ring_)[count_].valid());
-    monitor_.Stop("Wait");
-
-    exce_.Rethrow();
-
    return true;
  }

@@ -224,41 +153,29 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
    common::Timer timer;
    timer.Start();
    std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
-
-    auto name = cache_info_->ShardName();
-    std::unique_ptr<common::AlignedFileWriteStream> fo;
-    if (this->Iter() == 0) {
-      fo = std::make_unique<common::AlignedFileWriteStream>(StringView{name}, "wb");
-    } else {
-      fo = std::make_unique<common::AlignedFileWriteStream>(StringView{name}, "ab");
+    if (!fo_) {
+      auto n = cache_info_->ShardName();
+      fo_.reset(dmlc::Stream::Create(n.c_str(), "w"));
    }
-
-    auto bytes = fmt->Write(*page_, fo.get());
-
+    auto bytes = fmt->Write(*page_, fo_.get());
    timer.Stop();
-    // Not entirely accurate, the kernels doesn't have to flush the data.
+
    LOG(INFO) << static_cast<double>(bytes) / 1024.0 / 1024.0 << " MB written in "
              << timer.ElapsedSeconds() << " seconds.";
-    cache_info_->Push(bytes);
+    cache_info_->offset.push_back(bytes);
  }

  virtual void Fetch() = 0;

 public:
-  SparsePageSourceImpl(float missing, int nthreads, bst_feature_t n_features, uint32_t n_batches,
-                       std::shared_ptr<Cache> cache)
-      : missing_{missing},
-        nthreads_{nthreads},
-        n_features_{n_features},
-        n_batches_{n_batches},
-        cache_info_{std::move(cache)} {
-    monitor_.Init(typeid(S).name());  // not pretty, but works for basic profiling
-  }
+  SparsePageSourceImpl(float missing, int nthreads, bst_feature_t n_features,
+                       uint32_t n_batches, std::shared_ptr<Cache> cache)
+      : missing_{missing}, nthreads_{nthreads}, n_features_{n_features},
+        n_batches_{n_batches}, cache_info_{std::move(cache)} {}

  SparsePageSourceImpl(SparsePageSourceImpl const &that) = delete;

  ~SparsePageSourceImpl() override {
-    // Don't orphan the threads.
    for (auto& fu : *ring_) {
      if (fu.valid()) {
        fu.get();
@@ -266,18 +183,18 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
    }
  }

-  [[nodiscard]] uint32_t Iter() const { return count_; }
+  uint32_t Iter() const { return count_; }

  const S &operator*() const override {
    CHECK(page_);
    return *page_;
  }

-  [[nodiscard]] std::shared_ptr<S const> Page() const override {
+  std::shared_ptr<S const> Page() const override {
    return page_;
  }

-  [[nodiscard]] bool AtEnd() const override {
+  bool AtEnd() const override {
    return at_end_;
  }

@@ -285,23 +202,20 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
    TryLockGuard guard{single_threaded_};
    at_end_ = false;
    count_ = 0;
-    // Pre-fetch for the next round of iterations.
    this->Fetch();
  }
 };

 #if defined(XGBOOST_USE_CUDA)
-// Push data from CUDA.
 void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page);
 #else
 inline void DevicePush(DMatrixProxy*, float, SparsePage*) { common::AssertGPUSupport(); }
 #endif

 class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
-  // This is the source from the user.
  DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext> iter_;
  DMatrixProxy* proxy_;
-  std::size_t base_row_id_{0};
+  size_t base_row_id_ {0};

  void Fetch() final {
    page_ = std::make_shared<SparsePage>();
@@ -330,7 +244,7 @@ class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
        iter_{iter}, proxy_{proxy} {
    if (!cache_info_->written) {
      iter_.Reset();
-      CHECK(iter_.Next()) << "Must have at least 1 batch.";
+      CHECK_EQ(iter_.Next(), 1) << "Must have at least 1 batch.";
    }
    this->Fetch();
  }
@@ -345,7 +259,6 @@ class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
    }

    if (at_end_) {
-      CHECK_EQ(cache_info_->offset.size(), n_batches_ + 1);
      cache_info_->Commit();
      if (n_batches_ != 0) {
        CHECK_EQ(count_, n_batches_);
@@ -458,5 +371,6 @@ class SortedCSCPageSource : public PageSourceIncMixIn<SortedCSCPage> {
    this->Fetch();
  }
 };
-}  // namespace xgboost::data
+}  // namespace data
+}  // namespace xgboost
 #endif  // XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
--- a/src/data/sparse_page_writer.h
+++ b/src/data/sparse_page_writer.h
@@ -1,44 +1,52 @@
-/**
- * Copyright 2014-2023, XGBoost Contributors
+/*!
+ * Copyright (c) 2014-2019 by Contributors
 * \file sparse_page_writer.h
 * \author Tianqi Chen
 */
 #ifndef XGBOOST_DATA_SPARSE_PAGE_WRITER_H_
 #define XGBOOST_DATA_SPARSE_PAGE_WRITER_H_

-#include <functional>  // for function
-#include <string>      // for string
+#include <xgboost/data.h>
+#include <dmlc/io.h>
+#include <vector>
+#include <algorithm>
+#include <cstring>
+#include <string>
+#include <utility>
+#include <memory>
+#include <functional>

-#include "../common/io.h"   // for AlignedResourceReadStream, AlignedFileWriteStream
-#include "dmlc/io.h"        // for Stream
-#include "dmlc/registry.h"  // for Registry, FunctionRegEntryBase
-#include "xgboost/data.h"   // for SparsePage,CSCPage,SortedCSCPage,EllpackPage ...
+#if DMLC_ENABLE_STD_THREAD
+#include <dmlc/concurrency.h>
+#include <thread>
+#endif  // DMLC_ENABLE_STD_THREAD
+
+namespace xgboost {
+namespace data {

-namespace xgboost::data {
 template<typename T>
 struct SparsePageFormatReg;

-/**
- * @brief Format specification of various data formats like SparsePage.
+/*!
+ * \brief Format specification of SparsePage.
 */
-template <typename T>
+template<typename T>
 class SparsePageFormat {
 public:
+  /*! \brief virtual destructor */
  virtual ~SparsePageFormat() = default;
-  /**
-   * @brief Load all the segments into page, advance fi to end of the block.
-   *
-   * @param page The data to read page into.
-   * @param fi the input stream of the file
-   * @return true of the loading as successful, false if end of file was reached
+  /*!
+   * \brief Load all the segments into page, advance fi to end of the block.
+   * \param page The data to read page into.
+   * \param fi the input stream of the file
+   * \return true if the loading was successful, false if end of file was reached
   */
-  virtual bool Read(T* page, common::AlignedResourceReadStream* fi) = 0;
-  /**
-   * @brief save the data to fo, when a page was written.
-   *
-   * @param fo output stream
+  virtual bool Read(T* page, dmlc::SeekStream* fi) = 0;
+  /*!
+   * \brief save the data to fo, when a page was written.
+   * \param fo output stream
   */
-  virtual size_t Write(const T& page, common::AlignedFileWriteStream* fo) = 0;
+  virtual size_t Write(const T& page, dmlc::Stream* fo) = 0;
 };

 /*!
@@ -97,5 +105,6 @@ struct SparsePageFormatReg
  DMLC_REGISTRY_REGISTER(SparsePageFormatReg<GHistIndexMatrix>,                \
                         GHistIndexPageFmt, Name)

-}  // namespace xgboost::data
+}  // namespace data
+}  // namespace xgboost
 #endif  // XGBOOST_DATA_SPARSE_PAGE_WRITER_H_
--- a/src/gbm/gblinear.cc
+++ b/src/gbm/gblinear.cc
@@ -172,7 +172,8 @@ class GBLinear : public GradientBooster {
  }

  void PredictContribution(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_contribs,
-                           bst_layer_t layer_begin, bst_layer_t /*layer_end*/, bool) override {
+                           uint32_t layer_begin, uint32_t /*layer_end*/, bool, int,
+                           unsigned) override {
    model_.LazyInitModel();
    LinearCheckLayer(layer_begin);
    auto base_margin = p_fmat->Info().base_margin_.View(Context::kCpuId);
@@ -209,8 +210,8 @@ class GBLinear : public GradientBooster {
    }
  }

-  void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
-                                       bst_layer_t layer_begin, bst_layer_t /*layer_end*/,
+  void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_contribs,
+                                       unsigned layer_begin, unsigned /*layer_end*/,
                                       bool) override {
    LinearCheckLayer(layer_begin);
    std::vector<bst_float>& contribs = out_contribs->HostVector();
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -18,11 +18,9 @@
 #include <vector>

 #include "../common/common.h"
-#include "../common/error_msg.h"  // for UnknownDevice, WarnOldSerialization, InplacePredictProxy
 #include "../common/random.h"
 #include "../common/threading_utils.h"
 #include "../common/timer.h"
-#include "../data/proxy_dmatrix.h"  // for DMatrixProxy, HostAdapterDispatch
 #include "gbtree_model.h"
 #include "xgboost/base.h"
 #include "xgboost/data.h"
@@ -41,6 +39,7 @@ namespace xgboost::gbm {
 DMLC_REGISTRY_FILE_TAG(gbtree);

 void GBTree::Configure(Args const& cfg) {
+  this->cfg_ = cfg;
  std::string updater_seq = tparam_.updater_seq;
  tparam_.UpdateAllowUnknown(cfg);
  tree_param_.UpdateAllowUnknown(cfg);
@@ -60,8 +59,9 @@ void GBTree::Configure(Args const& cfg) {
  cpu_predictor_->Configure(cfg);
 #if defined(XGBOOST_USE_CUDA)
  auto n_gpus = common::AllVisibleGPUs();
-  if (!gpu_predictor_) {
-    gpu_predictor_ = std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", this->ctx_));
+  if (!gpu_predictor_ && n_gpus != 0) {
+    gpu_predictor_ = std::unique_ptr<Predictor>(
+        Predictor::Create("gpu_predictor", this->ctx_));
  }
  if (n_gpus != 0) {
    gpu_predictor_->Configure(cfg);
@@ -78,9 +78,10 @@ void GBTree::Configure(Args const& cfg) {

  monitor_.Init("GBTree");

-  specified_updater_ = std::any_of(
-      cfg.cbegin(), cfg.cend(),
-      [](std::pair<std::string, std::string> const& arg) { return arg.first == "updater"; });
+  specified_updater_ = std::any_of(cfg.cbegin(), cfg.cend(),
+                   [](std::pair<std::string, std::string> const& arg) {
+                     return arg.first == "updater";
+                   });

  if (specified_updater_ && !showed_updater_warning_) {
    LOG(WARNING) << "DANGER AHEAD: You have manually specified `updater` "
@@ -92,19 +93,12 @@ void GBTree::Configure(Args const& cfg) {
    showed_updater_warning_ = true;
  }

-  if (model_.learner_model_param->IsVectorLeaf()) {
-    CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
-        << "Only the hist tree method is supported for building multi-target trees with vector "
-           "leaf.";
-  }
-  LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
  this->ConfigureUpdaters();
-
  if (updater_seq != tparam_.updater_seq) {
    updaters_.clear();
    this->InitUpdater(cfg);
  } else {
-    for (auto& up : updaters_) {
+    for (auto &up : updaters_) {
      up->Configure(cfg);
    }
  }
@@ -112,6 +106,66 @@ void GBTree::Configure(Args const& cfg) {
  configured_ = true;
 }

+// FIXME(trivialfis): This handles updaters, because the choice of updaters depends on
+// whether external memory is used and how large the dataset is.  We can remove the
+// dependency on DMatrix once the `hist` tree method can handle external memory, so that
+// we can make it the default.
+void GBTree::ConfigureWithKnownData(Args const& cfg, DMatrix* fmat) {
+  CHECK(this->configured_);
+  std::string updater_seq = tparam_.updater_seq;
+  CHECK(tparam_.GetInitialised());
+
+  tparam_.UpdateAllowUnknown(cfg);
+
+  this->PerformTreeMethodHeuristic(fmat);
+  this->ConfigureUpdaters();
+
+  // initialize the updaters only when needed.
+  if (updater_seq != tparam_.updater_seq) {
+    LOG(DEBUG) << "Using updaters: " << tparam_.updater_seq;
+    this->updaters_.clear();
+    this->InitUpdater(cfg);
+  }
+}
+
+void GBTree::PerformTreeMethodHeuristic(DMatrix* fmat) {
+  if (specified_updater_) {
+    // This method is disabled when `updater` parameter is explicitly
+    // set, since only experts are expected to do so.
+    return;
+  }
+  if (model_.learner_model_param->IsVectorLeaf()) {
+    CHECK(tparam_.tree_method == TreeMethod::kHist)
+        << "Only the hist tree method is supported for building multi-target trees with vector "
+           "leaf.";
+  }
+
+  // tparam_ is set before calling this function.
+  if (tparam_.tree_method != TreeMethod::kAuto) {
+    return;
+  }
+
+  if (collective::IsDistributed()) {
+    LOG(INFO) << "Tree method is automatically selected to be 'approx' "
+                 "for distributed training.";
+    tparam_.tree_method = TreeMethod::kApprox;
+  } else if (!fmat->SingleColBlock()) {
+    LOG(INFO) << "Tree method is automatically set to 'approx' "
+                 "since external-memory data matrix is used.";
+    tparam_.tree_method = TreeMethod::kApprox;
+  } else if (fmat->Info().num_row_ >= (4UL << 20UL)) {
+    /* Choose tree_method='approx' automatically for large data matrix */
+    LOG(INFO) << "Tree method is automatically selected to be "
+                 "'approx' for faster speed. To use old behavior "
+                 "(exact greedy algorithm on single machine), "
+                 "set tree_method to 'exact'.";
+    tparam_.tree_method = TreeMethod::kApprox;
+  } else {
+    tparam_.tree_method = TreeMethod::kExact;
+  }
+  LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
+}
+
 void GBTree::ConfigureUpdaters() {
  if (specified_updater_) {
    return;
@@ -119,25 +173,31 @@ void GBTree::ConfigureUpdaters() {
  // `updater` parameter was manually specified
  /* Choose updaters according to tree_method parameters */
  switch (tparam_.tree_method) {
-    case TreeMethod::kAuto:  // Use hist as default in 2.0
-    case TreeMethod::kHist: {
-      tparam_.updater_seq = "grow_quantile_histmaker";
+    case TreeMethod::kAuto:
+      // Use a heuristic to choose between 'exact' and 'approx'.  This
+      // choice is carried out in PerformTreeMethodHeuristic() before
+      // calling this function.
      break;
-    }
    case TreeMethod::kApprox:
      tparam_.updater_seq = "grow_histmaker";
      break;
    case TreeMethod::kExact:
      tparam_.updater_seq = "grow_colmaker,prune";
      break;
+    case TreeMethod::kHist: {
+      LOG(INFO) << "Tree method is selected to be 'hist', which uses a single updater "
+                   "grow_quantile_histmaker.";
+      tparam_.updater_seq = "grow_quantile_histmaker";
+      break;
+    }
    case TreeMethod::kGPUHist: {
      common::AssertGPUSupport();
      tparam_.updater_seq = "grow_gpu_hist";
      break;
    }
    default:
-      LOG(FATAL) << "Unknown tree_method (" << static_cast<int>(tparam_.tree_method)
-                 << ") detected";
+      LOG(FATAL) << "Unknown tree_method ("
+                 << static_cast<int>(tparam_.tree_method) << ") detected";
  }
 }

@@ -193,6 +253,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
                     PredictionCacheEntry* predt, ObjFunction const* obj) {
  TreesOneIter new_trees;
  bst_target_t const n_groups = model_.learner_model_param->OutputLength();
+  ConfigureWithKnownData(this->cfg_, p_fmat);
  monitor_.Start("BoostNewTrees");

  // Weird case that tree method is cpu-based but gpu_id is set.  Ideally we should let
@@ -375,7 +436,12 @@ void GBTree::LoadConfig(Json const& in) {
  // This would cause all trees to be pushed to trees_to_update
  // e.g. updating a model, then saving and loading it would result in an empty model
  tparam_.process_type = TreeProcessType::kDefault;
-  std::int32_t const n_gpus = xgboost::common::AllVisibleGPUs();
+  int32_t const n_gpus = xgboost::common::AllVisibleGPUs();
+  if (n_gpus == 0 && tparam_.predictor == PredictorType::kGPUPredictor) {
+    LOG(WARNING) << "Loading from a raw memory buffer on CPU only machine.  "
+                    "Changing predictor to auto.";
+    tparam_.UpdateAllowUnknown(Args{{"predictor", "auto"}});
+  }

  auto msg = StringView{
      R"(
@@ -391,32 +457,19 @@ void GBTree::LoadConfig(Json const& in) {
    LOG(WARNING) << msg << "  Changing `tree_method` to `hist`.";
  }

-  std::vector<Json> updater_seq;
-  if (IsA<Object>(in["updater"])) {
-    // before 2.0
-    error::WarnOldSerialization();
-    for (auto const& kv : get<Object const>(in["updater"])) {
-      auto name = kv.first;
-      auto config = kv.second;
-      config["name"] = name;
-      updater_seq.push_back(config);
-    }
-  } else {
-    // after 2.0
-    auto const& j_updaters = get<Array const>(in["updater"]);
-    updater_seq = j_updaters;
-  }
-
+  auto const& j_updaters = get<Object const>(in["updater"]);
  updaters_.clear();

-  for (auto const& config : updater_seq) {
-    auto name = get<String>(config["name"]);
+  for (auto const& kv : j_updaters) {
+    auto name = kv.first;
    if (n_gpus == 0 && name == "grow_gpu_hist") {
      name = "grow_quantile_histmaker";
      LOG(WARNING) << "Changing updater from `grow_gpu_hist` to `grow_quantile_histmaker`.";
    }
-    updaters_.emplace_back(TreeUpdater::Create(name, ctx_, &model_.learner_model_param->task));
-    updaters_.back()->LoadConfig(config);
+    std::unique_ptr<TreeUpdater> up{
+        TreeUpdater::Create(name, ctx_, &model_.learner_model_param->task)};
+    up->LoadConfig(kv.second);
+    updaters_.push_back(std::move(up));
  }

  specified_updater_ = get<Boolean>(in["specified_updater"]);
@@ -438,14 +491,13 @@ void GBTree::SaveConfig(Json* p_out) const {
  // language binding doesn't need to know about the forest size.
  out["gbtree_model_param"] = ToJson(model_.param);

-  out["updater"] = Array{};
-  auto& j_updaters = get<Array>(out["updater"]);
+  out["updater"] = Object();

-  for (auto const& up : this->updaters_) {
-    Json up_config{Object{}};
-    up_config["name"] = String{up->Name()};
-    up->SaveConfig(&up_config);
-    j_updaters.emplace_back(up_config);
+  auto& j_updaters = out["updater"];
+  for (auto const& up : updaters_) {
+    j_updaters[up->Name()] = Object();
+    auto& j_up = j_updaters[up->Name()];
+    up->SaveConfig(&j_up);
  }
  out["specified_updater"] = Boolean{specified_updater_};
 }
@@ -515,8 +567,8 @@ void GBTree::Slice(bst_layer_t begin, bst_layer_t end, bst_layer_t step, Gradien
  out_model.param.num_parallel_tree = model_.param.num_parallel_tree;
 }

-void GBTree::PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
-                              bst_layer_t layer_begin, bst_layer_t layer_end) const {
+void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool,
+                          bst_layer_t layer_begin, bst_layer_t layer_end) {
  CHECK(configured_);
  if (layer_end == 0) {
    layer_end = this->BoostedRounds();
@@ -536,7 +588,7 @@ void GBTree::PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds,
    CHECK_EQ(out_preds->version, 0);
  }

-  auto const& predictor = GetPredictor(is_training, &out_preds->predictions, p_fmat);
+  auto const& predictor = GetPredictor(&out_preds->predictions, p_fmat);
  if (out_preds->version == 0) {
    // out_preds->Size() can be non-zero as it's initialized here before any
    // tree is built at the 0^th iterator.
@@ -556,83 +608,52 @@ void GBTree::PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds,
  }
 }

-namespace {
-inline void MismatchedDevices(Context const* booster, Context const* data) {
-  LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. XGBoost "
-               << "is running on: " << booster->DeviceName()
-               << ", while the input data is on: " << data->DeviceName() << ".\n"
-               << R"(Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.
-)";
-}
-};  // namespace
-
-void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
-                          bst_layer_t layer_begin, bst_layer_t layer_end) {
-  // dispatch to const function.
-  this->PredictBatchImpl(p_fmat, out_preds, is_training, layer_begin, layer_end);
-}
-
-void GBTree::InplacePredict(std::shared_ptr<DMatrix> p_m, float missing,
-                            PredictionCacheEntry* out_preds, bst_layer_t layer_begin,
-                            bst_layer_t layer_end) const {
+std::unique_ptr<Predictor> const &
+GBTree::GetPredictor(HostDeviceVector<float> const *out_pred,
+                     DMatrix *f_dmat) const {
  CHECK(configured_);
-  auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
-  CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
-  if (p_m->Ctx()->Device() != this->ctx_->Device()) {
-    MismatchedDevices(this->ctx_, p_m->Ctx());
-    CHECK_EQ(out_preds->version, 0);
-    auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
-    CHECK(proxy) << error::InplacePredictProxy();
-    auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing);
-    this->PredictBatchImpl(p_fmat.get(), out_preds, false, layer_begin, layer_end);
-    return;
-  }
-
-  bool known_type = this->ctx_->DispatchDevice(
-      [&, begin = tree_begin, end = tree_end] {
-        return this->cpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, begin, end);
-      },
-      [&, begin = tree_begin, end = tree_end] {
-        return this->gpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, begin, end);
-      });
-  if (!known_type) {
-    auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
-    CHECK(proxy) << error::InplacePredictProxy();
-    LOG(FATAL) << "Unknown data type for inplace prediction:" << proxy->Adapter().type().name();
-  }
-}
-
-[[nodiscard]] std::unique_ptr<Predictor> const& GBTree::GetPredictor(
-    bool is_training, HostDeviceVector<float> const* out_pred, DMatrix* f_dmat) const {
-  CHECK(configured_);
-
-  // Data comes from SparsePageDMatrix. Since we are loading data in pages, no need to
-  // prevent data copy.
-  if (f_dmat && !f_dmat->SingleColBlock()) {
-    if (ctx_->IsCPU()) {
-      return cpu_predictor_;
-    } else {
-      common::AssertGPUSupport();
+  if (tparam_.predictor != PredictorType::kAuto) {
+    if (tparam_.predictor == PredictorType::kGPUPredictor) {
+#if defined(XGBOOST_USE_CUDA)
+      CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost.";
      CHECK(gpu_predictor_);
      return gpu_predictor_;
+#else
+      common::AssertGPUSupport();
+#endif  // defined(XGBOOST_USE_CUDA)
    }
+    if (tparam_.predictor == PredictorType::kOneAPIPredictor) {
+#if defined(XGBOOST_USE_ONEAPI)
+      CHECK(oneapi_predictor_);
+      return oneapi_predictor_;
+#else
+      common::AssertOneAPISupport();
+#endif  // defined(XGBOOST_USE_ONEAPI)
+    }
+    CHECK(cpu_predictor_);
+    return cpu_predictor_;
  }

  // Data comes from Device DMatrix.
-  auto is_ellpack =
-      f_dmat && f_dmat->PageExists<EllpackPage>() && !f_dmat->PageExists<SparsePage>();
+  auto is_ellpack = f_dmat && f_dmat->PageExists<EllpackPage>() &&
+                    !f_dmat->PageExists<SparsePage>();
  // Data comes from device memory, like CuDF or CuPy.
-  auto is_from_device = f_dmat && f_dmat->PageExists<SparsePage>() &&
-                        (*(f_dmat->GetBatches<SparsePage>().begin())).data.DeviceCanRead();
+  auto is_from_device =
+      f_dmat && f_dmat->PageExists<SparsePage>() &&
+      (*(f_dmat->GetBatches<SparsePage>().begin())).data.DeviceCanRead();
  auto on_device = is_ellpack || is_from_device;

  // Use GPU Predictor if data is already on device and gpu_id is set.
-  if (on_device && ctx_->IsCUDA()) {
-    common::AssertGPUSupport();
+  if (on_device && ctx_->gpu_id >= 0) {
+#if defined(XGBOOST_USE_CUDA)
+    CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost.";
    CHECK(gpu_predictor_);
    return gpu_predictor_;
+#else
+    LOG(FATAL) << "Data is on CUDA device, but XGBoost is not compiled with "
+                  "CUDA support.";
+    return cpu_predictor_;
+#endif  // defined(XGBOOST_USE_CUDA)
  }

  // GPU_Hist by default has prediction cache calculated from quantile values,
@@ -644,19 +665,23 @@ void GBTree::InplacePredict(std::shared_ptr<DMatrix> p_m, float missing,
  if ((out_pred && out_pred->Size() == 0) && (model_.param.num_trees != 0) &&
      // FIXME(trivialfis): Implement a better method for testing whether data
      // is on device after DMatrix refactoring is done.
-      !on_device && is_training) {
+      !on_device) {
    CHECK(cpu_predictor_);
    return cpu_predictor_;
  }

-  if (ctx_->IsCPU()) {
-    return cpu_predictor_;
-  } else {
-    common::AssertGPUSupport();
+  if (tparam_.tree_method == TreeMethod::kGPUHist) {
+#if defined(XGBOOST_USE_CUDA)
+    CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost.";
    CHECK(gpu_predictor_);
    return gpu_predictor_;
+#else
+    common::AssertGPUSupport();
+    return cpu_predictor_;
+#endif  // defined(XGBOOST_USE_CUDA)
  }

+  CHECK(cpu_predictor_);
  return cpu_predictor_;
 }

@@ -771,7 +796,7 @@ class Dart : public GBTree {
                        bool training, unsigned layer_begin,
                        unsigned layer_end) const {
    CHECK(!this->model_.learner_model_param->IsVectorLeaf()) << "dart" << MTNotImplemented();
-    auto& predictor = this->GetPredictor(training, &p_out_preds->predictions, p_fmat);
+    auto &predictor = this->GetPredictor(&p_out_preds->predictions, p_fmat);
    CHECK(predictor);
    predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
                                  model_);
@@ -835,16 +860,15 @@ class Dart : public GBTree {
    auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
    auto n_groups = model_.learner_model_param->num_output_group;

-    if (ctx_->Device() != p_fmat->Ctx()->Device()) {
-      MismatchedDevices(ctx_, p_fmat->Ctx());
-      auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_fmat);
-      CHECK(proxy) << error::InplacePredictProxy();
-      auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing);
-      this->PredictBatchImpl(p_fmat.get(), p_out_preds, false, layer_begin, layer_end);
-      return;
-    }
-
+    std::vector<Predictor const*> predictors {
+      cpu_predictor_.get(),
+#if defined(XGBOOST_USE_CUDA)
+      gpu_predictor_.get()
+#endif  // defined(XGBOOST_USE_CUDA)
+    };
+    Predictor const* predictor{nullptr};
    StringView msg{"Unsupported data type for inplace predict."};
+
    PredictionCacheEntry predts;
    if (ctx_->gpu_id != Context::kCpuId) {
      predts.predictions.SetDevice(ctx_->gpu_id);
@@ -853,29 +877,32 @@ class Dart : public GBTree {

    auto predict_impl = [&](size_t i) {
      predts.predictions.Fill(0);
-      bool success = this->ctx_->DispatchDevice(
-          [&] {
-            return cpu_predictor_->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
-          },
-          [&] {
-            return gpu_predictor_->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
-          });
-      CHECK(success) << msg;
+      if (tparam_.predictor == PredictorType::kAuto) {
+        // Try both predictor implementations
+        bool success = false;
+        for (auto const& p : predictors) {
+          if (p && p->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1)) {
+            success = true;
+            predictor = p;
+            break;
+          }
+        }
+        CHECK(success) << msg;
+      } else {
+        predictor = this->GetPredictor().get();
+        bool success = predictor->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
+        CHECK(success) << msg << std::endl
+                       << "Current Predictor: "
+                       << (tparam_.predictor == PredictorType::kCPUPredictor ? "cpu_predictor"
+                                                                             : "gpu_predictor");
+      }
    };

    // Inplace predict is not used for training, so no need to drop tree.
    for (bst_tree_t i = tree_begin; i < tree_end; ++i) {
      predict_impl(i);
      if (i == tree_begin) {
-        this->ctx_->DispatchDevice(
-            [&] {
-              this->cpu_predictor_->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
-                                                       model_);
-            },
-            [&] {
-              this->gpu_predictor_->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
-                                                       model_);
-            });
+        predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions, model_);
      }
      // Multiple the tree weight
      auto w = this->weight_drop_.at(i);
@@ -905,24 +932,25 @@ class Dart : public GBTree {
                       std::vector<bst_float> *out_preds,
                       unsigned layer_begin, unsigned layer_end) override {
    DropTrees(false);
-    auto &predictor = this->GetPredictor(false);
+    auto &predictor = this->GetPredictor();
    uint32_t _, tree_end;
    std::tie(_, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end);
    predictor->PredictInstance(inst, out_preds, model_, tree_end);
  }

-  void PredictContribution(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_contribs,
-                           bst_layer_t layer_begin, bst_layer_t layer_end,
-                           bool approximate) override {
+  void PredictContribution(DMatrix* p_fmat,
+                           HostDeviceVector<bst_float>* out_contribs,
+                           unsigned layer_begin, unsigned layer_end, bool approximate, int,
+                           unsigned) override {
    CHECK(configured_);
    auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
    cpu_predictor_->PredictContribution(p_fmat, out_contribs, model_, tree_end, &weight_drop_,
                                        approximate);
  }

-  void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
-                                       bst_layer_t layer_begin, bst_layer_t layer_end,
-                                       bool approximate) override {
+  void PredictInteractionContributions(
+      DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
+      unsigned layer_begin, unsigned layer_end, bool approximate) override {
    CHECK(configured_);
    auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
    cpu_predictor_->PredictInteractionContributions(p_fmat, out_contribs, model_, tree_end,
--- a/src/gbm/gbtree.cu
+++ b/src/gbm/gbtree.cu
@@ -1,11 +1,14 @@
-/**
- * Copyright 2021-2023, XGBoost Contributors
+/*!
+ * Copyright 2021 by Contributors
 */
 #include "../common/device_helpers.cuh"
+#include "xgboost/context.h"
 #include "xgboost/linalg.h"
 #include "xgboost/span.h"

-namespace xgboost::gbm {
+namespace xgboost {
+namespace gbm {
+
 void GPUCopyGradient(HostDeviceVector<GradientPair> const *in_gpair,
                     bst_group_t n_groups, bst_group_t group_id,
                     HostDeviceVector<GradientPair> *out_gpair) {
@@ -38,4 +41,5 @@ void GPUDartInplacePredictInc(common::Span<float> out_predts, common::Span<float
    out_predts[offset] += (predts[offset] - base_score(0)) * tree_w;
  });
 }
-}  // namespace xgboost::gbm
+}  // namespace gbm
+}  // namespace xgboost
--- a/src/gbm/gbtree.h
+++ b/src/gbm/gbtree.h
@@ -43,18 +43,30 @@ enum class TreeProcessType : int {
  kDefault = 0,
  kUpdate = 1
 };
+
+enum class PredictorType : int {
+  kAuto = 0,
+  kCPUPredictor,
+  kGPUPredictor,
+  kOneAPIPredictor
+};
 }  // namespace xgboost

 DECLARE_FIELD_ENUM_CLASS(xgboost::TreeMethod);
 DECLARE_FIELD_ENUM_CLASS(xgboost::TreeProcessType);
+DECLARE_FIELD_ENUM_CLASS(xgboost::PredictorType);
+
+namespace xgboost {
+namespace gbm {

-namespace xgboost::gbm {
 /*! \brief training parameters */
 struct GBTreeTrainParam : public XGBoostParameter<GBTreeTrainParam> {
  /*! \brief tree updater sequence */
  std::string updater_seq;
  /*! \brief type of boosting process to run */
  TreeProcessType process_type;
+  // predictor type
+  PredictorType predictor;
  // tree construction method
  TreeMethod tree_method;
  // declare parameters
@@ -69,6 +81,13 @@ struct GBTreeTrainParam : public XGBoostParameter<GBTreeTrainParam> {
        .describe("Whether to run the normal boosting process that creates new trees,"\
                  " or to update the trees in an existing model.");
    DMLC_DECLARE_ALIAS(updater_seq, updater);
+    DMLC_DECLARE_FIELD(predictor)
+        .set_default(PredictorType::kAuto)
+        .add_enum("auto", PredictorType::kAuto)
+        .add_enum("cpu_predictor", PredictorType::kCPUPredictor)
+        .add_enum("gpu_predictor", PredictorType::kGPUPredictor)
+        .add_enum("oneapi_predictor", PredictorType::kOneAPIPredictor)
+        .describe("Predictor algorithm type");
    DMLC_DECLARE_FIELD(tree_method)
        .set_default(TreeMethod::kAuto)
        .add_enum("auto",      TreeMethod::kAuto)
@@ -173,8 +192,12 @@ class GBTree : public GradientBooster {
      : GradientBooster{ctx}, model_(booster_config, ctx_) {}

  void Configure(const Args& cfg) override;
+  // Revise the `tree_method` and `updater` parameters after seeing the training
+  // data matrix.  Only useful when `tree_method` is auto.
+  void PerformTreeMethodHeuristic(DMatrix* fmat);
  /*! \brief Map `tree_method` parameter to `updater` parameter */
  void ConfigureUpdaters();
+  void ConfigureWithKnownData(Args const& cfg, DMatrix* fmat);

  /**
   * \brief Optionally update the leaf value.
@@ -189,11 +212,21 @@ class GBTree : public GradientBooster {
  void DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
               PredictionCacheEntry* predt, ObjFunction const* obj) override;

-  [[nodiscard]] bool UseGPU() const override { return tparam_.tree_method == TreeMethod::kGPUHist; }
+  bool UseGPU() const override {  // true if either setting below selects the GPU
+    return
+        tparam_.predictor == PredictorType::kGPUPredictor ||  // predictor enum is the GPU one
+        tparam_.tree_method == TreeMethod::kGPUHist;          // tree method enum is kGPUHist
+  }

-  [[nodiscard]] GBTreeTrainParam const& GetTrainParam() const { return tparam_; }
+  GBTreeTrainParam const& GetTrainParam() const {  // read-only access to the training parameters
+    return tparam_;
+  }
+
+  void Load(dmlc::Stream* fi) override {  // read the model from stream `fi`
+    model_.Load(fi);
+    this->cfg_.clear();  // discard stored config args (presumably stale after a load; confirm)
+  }

-  void Load(dmlc::Stream* fi) override { model_.Load(fi); }
  void Save(dmlc::Stream* fo) const override {
    model_.Save(fo);
  }
@@ -213,14 +246,39 @@ class GBTree : public GradientBooster {
    return !model_.trees.empty() || !model_.trees_to_update.empty();
  }

-  void PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
-                        bst_layer_t layer_begin, bst_layer_t layer_end) const;
-
  void PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool training,
                    bst_layer_t layer_begin, bst_layer_t layer_end) override;

  void InplacePredict(std::shared_ptr<DMatrix> p_m, float missing, PredictionCacheEntry* out_preds,
-                      bst_layer_t layer_begin, bst_layer_t layer_end) const override;
+                      bst_layer_t layer_begin, bst_layer_t layer_end) const override {
+    CHECK(configured_);  // abort unless the configured_ flag is set
+    auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
+    CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
+    std::vector<Predictor const *> predictors{  // candidates, tried in this order for kAuto
+      cpu_predictor_.get(),
+#if defined(XGBOOST_USE_CUDA)
+      gpu_predictor_.get()  // only compiled in for CUDA builds
+#endif  // defined(XGBOOST_USE_CUDA)
+    };
+    StringView msg{"Unsupported data type for inplace predict."};
+    if (tparam_.predictor == PredictorType::kAuto) {
+      // Try each candidate (null entries are skipped); the first one that succeeds wins.
+      for (auto const &p : predictors) {
+        if (p && p->InplacePredict(p_m, model_, missing, out_preds, tree_begin, tree_end)) {
+          return;
+        }
+      }
+      LOG(FATAL) << msg;  // no candidate accepted the input
+    } else {  // an explicit predictor was requested: use it and name it on failure
+      bool success = this->GetPredictor()->InplacePredict(p_m, model_, missing, out_preds,
+                                                          tree_begin, tree_end);
+      CHECK(success) << msg << std::endl
+                     << "Current Predictor: "
+                     << (tparam_.predictor == PredictorType::kCPUPredictor ? "cpu_predictor"
+                         : tparam_.predictor == PredictorType::kGPUPredictor
+                             ? "gpu_predictor" : "oneapi_predictor");  // kAuto handled above
+    }
+  }

  void FeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
                    std::vector<bst_feature_t>* features,
@@ -301,29 +359,32 @@ class GBTree : public GradientBooster {
    auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
    CHECK_EQ(tree_begin, 0) << "Predict leaf supports only iteration end: (0, "
                               "n_iteration), use model slicing instead.";
-    this->GetPredictor(false)->PredictLeaf(p_fmat, out_preds, model_, tree_end);
+    this->GetPredictor()->PredictLeaf(p_fmat, out_preds, model_, tree_end);
  }

-  void PredictContribution(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
-                           bst_layer_t layer_begin, bst_layer_t layer_end,
-                           bool approximate) override {
+  void PredictContribution(DMatrix* p_fmat,
+                           HostDeviceVector<bst_float>* out_contribs,
+                           uint32_t layer_begin, uint32_t layer_end, bool approximate,
+                           int, unsigned) override {  // last two parameters are unnamed and unused
    CHECK(configured_);
    auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
-    CHECK_EQ(tree_begin, 0) << "Predict contribution supports only iteration end: (0, "
-                               "n_iteration), using model slicing instead.";
-    this->GetPredictor(false)->PredictContribution(p_fmat, out_contribs, model_, tree_end, nullptr,
-                                                   approximate);
+    CHECK_EQ(tree_begin, 0)  // the requested range must begin at the first tree
+        << "Predict contribution supports only iteration end: (0, "
+           "n_iteration), using model slicing instead.";
+    this->GetPredictor()->PredictContribution(  // delegate to the selected predictor
+        p_fmat, out_contribs, model_, tree_end, nullptr, approximate);
  }

-  void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
-                                       bst_layer_t layer_begin, bst_layer_t layer_end,
-                                       bool approximate) override {
+  void PredictInteractionContributions(
+      DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
+      uint32_t layer_begin, uint32_t layer_end, bool approximate) override {
    CHECK(configured_);
    auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
-    CHECK_EQ(tree_begin, 0) << "Predict interaction contribution supports only iteration end: (0, "
-                               "n_iteration), using model slicing instead.";
-    this->GetPredictor(false)->PredictInteractionContributions(p_fmat, out_contribs, model_,
-                                                               tree_end, nullptr, approximate);
+    CHECK_EQ(tree_begin, 0)  // the requested range must begin at the first tree
+        << "Predict interaction contribution supports only iteration end: (0, "
+           "n_iteration), using model slicing instead.";
+    this->GetPredictor()->PredictInteractionContributions(  // delegate to the chosen predictor
+        p_fmat, out_contribs, model_, tree_end, nullptr, approximate);
  }

  [[nodiscard]] std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
@@ -339,9 +400,8 @@ class GBTree : public GradientBooster {
                     std::vector<HostDeviceVector<bst_node_t>>* out_position,
                     std::vector<std::unique_ptr<RegTree>>* ret);

-  [[nodiscard]] std::unique_ptr<Predictor> const& GetPredictor(
-      bool is_training, HostDeviceVector<float> const* out_pred = nullptr,
-      DMatrix* f_dmat = nullptr) const;
+  std::unique_ptr<Predictor> const& GetPredictor(HostDeviceVector<float> const* out_pred = nullptr,
+                                                 DMatrix* f_dmat = nullptr) const;

  // commit new trees all at once
  virtual void CommitModel(TreesOneIter&& new_trees);
@@ -356,17 +416,22 @@ class GBTree : public GradientBooster {
  bool showed_updater_warning_ {false};
  bool specified_updater_   {false};
  bool configured_ {false};
+  // Configuration arguments stored on the booster; Load() clears them.
+  Args cfg_;
  // the updaters that can be applied to each of tree
  std::vector<std::unique_ptr<TreeUpdater>> updaters_;
  // Predictors
  std::unique_ptr<Predictor> cpu_predictor_;
-  std::unique_ptr<Predictor> gpu_predictor_{nullptr};
+#if defined(XGBOOST_USE_CUDA)
+  std::unique_ptr<Predictor> gpu_predictor_;
+#endif  // defined(XGBOOST_USE_CUDA)
 #if defined(XGBOOST_USE_ONEAPI)
  std::unique_ptr<Predictor> oneapi_predictor_;
 #endif  // defined(XGBOOST_USE_ONEAPI)
  common::Monitor monitor_;
 };

-}  // namespace xgboost::gbm
+}  // namespace gbm
+}  // namespace xgboost

 #endif  // XGBOOST_GBM_GBTREE_H_
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -40,7 +40,6 @@
 #include "common/api_entry.h"             // for XGBAPIThreadLocalEntry
 #include "common/charconv.h"              // for to_chars, to_chars_result, NumericLimits, from_...
 #include "common/common.h"                // for ToString, Split
-#include "common/error_msg.h"             // for MaxFeatureSize, WarnOldSerialization
 #include "common/io.h"                    // for PeekableInStream, ReadAll, FixedSizeStream, Mem...
 #include "common/observer.h"              // for TrainingObserver
 #include "common/random.h"                // for GlobalRandom
@@ -357,6 +356,21 @@ DMLC_REGISTER_PARAMETER(LearnerTrainParam);
 using LearnerAPIThreadLocalStore =
    dmlc::ThreadLocalStore<std::map<Learner const *, XGBAPIThreadLocalEntry>>;

+namespace {
+StringView ModelMsg() {  // advice text for models serialized by an older XGBoost
+  return StringView{
+      R"doc(
  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.
+)doc"};
+}
+}  // anonymous namespace
+
 class LearnerConfiguration : public Learner {
 private:
  std::mutex config_lock_;
@@ -516,7 +530,7 @@ class LearnerConfiguration : public Learner {
    }

    if (!Version::Same(origin_version)) {
-      error::WarnOldSerialization();
+      LOG(WARNING) << ModelMsg();
      return;  // skip configuration if version is not matched
    }

@@ -547,7 +561,7 @@ class LearnerConfiguration : public Learner {
    for (size_t i = 0; i < n_metrics; ++i) {
      auto old_serialization = IsA<String>(j_metrics[i]);
      if (old_serialization) {
-        error::WarnOldSerialization();
+        LOG(WARNING) << ModelMsg();
        metric_names_[i] = get<String>(j_metrics[i]);
      } else {
        metric_names_[i] = get<String>(j_metrics[i]["name"]);
@@ -749,7 +763,9 @@ class LearnerConfiguration : public Learner {
        CHECK(matrix.first.ptr);
        CHECK(!matrix.second.ref.expired());
        const uint64_t num_col = matrix.first.ptr->Info().num_col_;
-        error::MaxFeatureSize(num_col);
+        CHECK_LE(num_col, static_cast<uint64_t>(std::numeric_limits<unsigned>::max()))
+            << "Unfortunately, XGBoost does not support data matrices with "
+            << std::numeric_limits<unsigned>::max() << " features or greater";
        num_feature = std::max(num_feature, static_cast<uint32_t>(num_col));
      }

@@ -1158,7 +1174,7 @@ class LearnerIO : public LearnerConfiguration {
      Json memory_snapshot;
      if (header[1] == '"') {
        memory_snapshot = Json::Load(StringView{buffer});
-        error::WarnOldSerialization();
+        LOG(WARNING) << ModelMsg();
      } else if (std::isalpha(header[1])) {
        memory_snapshot = Json::Load(StringView{buffer}, std::ios::binary);
      } else {
@@ -1177,7 +1193,7 @@ class LearnerIO : public LearnerConfiguration {
      header.resize(serialisation_header_.size());
      CHECK_EQ(fp.Read(&header[0], header.size()), serialisation_header_.size());
      // Avoid printing the content in loaded header, which might be random binary code.
-      CHECK(header == serialisation_header_) << error::OldSerialization();
+      CHECK(header == serialisation_header_) << ModelMsg();
      int64_t sz {-1};
      CHECK_EQ(fp.Read(&sz, sizeof(sz)), sizeof(sz));
      if (!DMLC_IO_NO_ENDIAN_SWAP) {
@@ -1397,8 +1413,6 @@ class LearnerImpl : public LearnerIO {
    this->CheckModelInitialized();

    auto& out_predictions = this->GetThreadLocal().prediction_entry;
-    out_predictions.version = 0;
-
    this->gbm_->InplacePredict(p_m, missing, &out_predictions, iteration_begin, iteration_end);
    if (type == PredictionType::kValue) {
      obj_->PredTransform(&out_predictions.predictions);
--- a/src/objective/lambdarank_obj.cu
+++ b/src/objective/lambdarank_obj.cu
@@ -577,8 +577,8 @@ void LambdaRankUpdatePositionBias(Context const* ctx, linalg::VectorView<double
                       if (lj(0) >= Eps64()) {
                         tj_minus(i) = std::pow(lj(i) / lj(0), regularizer);
                       }
-                       assert(!isinf(ti_plus(i)));
-                       assert(!isinf(tj_minus(i)));
+                       assert(!std::isinf(ti_plus(i)));
+                       assert(!std::isinf(tj_minus(i)));
                     });
 }
 }  // namespace cuda_impl
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -16,7 +16,6 @@
 #include "../common/bitfield.h"               // for RBitField8
 #include "../common/categorical.h"            // for IsCat, Decision
 #include "../common/common.h"                 // for DivRoundUp
-#include "../common/error_msg.h"              // for InplacePredictProxy
 #include "../common/math.h"                   // for CheckNAN
 #include "../common/threading_utils.h"        // for ParallelFor
 #include "../data/adapter.h"                  // for ArrayAdapter, CSRAdapter, CSRArrayAdapter
@@ -468,6 +467,7 @@ class ColumnSplitHelper {
  void MaskOneTree(RegTree::FVec const &feat, std::size_t tree_id, std::size_t row_id) {
    auto const &tree = *model_.trees[tree_id];
    auto const &cats = tree.GetCategoriesMatrix();
+    auto const has_categorical = tree.HasCategoricalSplit();
    bst_node_t n_nodes = tree.GetNodes().size();

    for (bst_node_t nid = 0; nid < n_nodes; nid++) {
@@ -484,10 +484,16 @@ class ColumnSplitHelper {
      }

      auto const fvalue = feat.GetFvalue(split_index);
-      auto const decision = tree.HasCategoricalSplit()
-                                ? GetDecision<true>(node, nid, fvalue, cats)
-                                : GetDecision<false>(node, nid, fvalue, cats);
-      if (decision) {
+      if (has_categorical && common::IsCat(cats.split_type, nid)) {
+        auto const node_categories =
+            cats.categories.subspan(cats.node_ptr[nid].beg, cats.node_ptr[nid].size);
+        if (!common::Decision(node_categories, fvalue)) {
+          decision_bits_.Set(bit_index);
+        }
+        continue;
+      }
+
+      if (fvalue >= node.SplitCond()) {
        decision_bits_.Set(bit_index);
      }
    }
@@ -505,7 +511,7 @@ class ColumnSplitHelper {
    if (missing_bits_.Check(bit_index)) {
      return node.DefaultChild();
    } else {
-      return node.LeftChild() + !decision_bits_.Check(bit_index);
+      return node.LeftChild() + decision_bits_.Check(bit_index);
    }
  }

@@ -742,7 +748,7 @@ class CPUPredictor : public Predictor {
                      PredictionCacheEntry *out_preds, uint32_t tree_begin,
                      unsigned tree_end) const override {
    auto proxy = dynamic_cast<data::DMatrixProxy *>(p_m.get());
-    CHECK(proxy)<< error::InplacePredictProxy();
+    CHECK(proxy)<< "Inplace predict accepts only DMatrixProxy as input.";
    CHECK(!p_m->Info().IsColumnSplit())
        << "Inplace predict support for column-wise data split is not yet implemented.";
    auto x = proxy->Adapter();
@@ -884,8 +890,9 @@ class CPUPredictor : public Predictor {
    for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
      auto page = batch.GetView();
      // parallel over local batch
-      common::ParallelFor(batch.Size(), n_threads, [&](auto i) {
-        auto row_idx = batch.base_rowid + i;
+      const auto nsize = static_cast<bst_omp_uint>(batch.Size());
+      common::ParallelFor(nsize, n_threads, [&](bst_omp_uint i) {
+        auto row_idx = static_cast<size_t>(batch.base_rowid + i);
        RegTree::FVec &feats = feat_vecs[omp_get_thread_num()];
        if (feats.Size() == 0) {
          feats.Init(num_feature);
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -11,13 +11,10 @@
 #include <any>  // for any, any_cast
 #include <memory>

-#include "../collective/communicator-inl.cuh"
 #include "../common/bitfield.h"
 #include "../common/categorical.h"
 #include "../common/common.h"
-#include "../common/cuda_context.cuh"  // for CUDAContext
 #include "../common/device_helpers.cuh"
-#include "../common/error_msg.h"  // for InplacePredictProxy
 #include "../data/device_adapter.cuh"
 #include "../data/ellpack_page.cuh"
 #include "../data/proxy_dmatrix.h"
@@ -113,11 +110,13 @@ struct SparsePageLoader {
  bool use_shared;
  SparsePageView data;
  float* smem;
+  size_t entry_start;

  __device__ SparsePageLoader(SparsePageView data, bool use_shared, bst_feature_t num_features,
                              bst_row_t num_rows, size_t entry_start, float)
      : use_shared(use_shared),
-        data(data) {
+        data(data),
+        entry_start(entry_start) {
    extern __shared__ float _smem[];
    smem = _smem;
    // Copy instances
@@ -440,7 +439,7 @@ struct ShapSplitCondition {
    if (isnan(x)) {
      return is_missing_branch;
    }
-    if (categories.Capacity() != 0) {
+    if (categories.Size() != 0) {
      auto cat = static_cast<uint32_t>(x);
      return categories.Check(cat);
    } else {
@@ -455,7 +454,7 @@ struct ShapSplitCondition {
    if (l.Data() == r.Data()) {
      return l;
    }
-    if (l.Capacity() > r.Capacity()) {
+    if (l.Size() > r.Size()) {
      thrust::swap(l, r);
    }
    for (size_t i = 0; i < r.Bits().size(); ++i) {
@@ -467,7 +466,7 @@ struct ShapSplitCondition {
  // Combine two split conditions on the same feature
  XGBOOST_DEVICE void Merge(ShapSplitCondition other) {
    // Combine duplicate features
-    if (categories.Capacity() != 0 || other.categories.Capacity() != 0) {
+    if (categories.Size() != 0 || other.categories.Size() != 0) {
      categories = Intersect(categories, other.categories);
    } else {
      feature_lower_bound = max(feature_lower_bound, other.feature_lower_bound);
@@ -623,199 +622,6 @@ size_t SharedMemoryBytes(size_t cols, size_t max_shared_memory_bytes) {
  }
  return shared_memory_bytes;
 }
-
-using BitVector = LBitField64;
-
-__global__ void MaskBitVectorKernel(
-    SparsePageView data, common::Span<RegTree::Node const> d_nodes,
-    common::Span<std::size_t const> d_tree_segments, common::Span<int const> d_tree_group,
-    common::Span<FeatureType const> d_tree_split_types,
-    common::Span<std::uint32_t const> d_cat_tree_segments,
-    common::Span<RegTree::CategoricalSplitMatrix::Segment const> d_cat_node_segments,
-    common::Span<std::uint32_t const> d_categories, BitVector decision_bits, BitVector missing_bits,
-    std::size_t tree_begin, std::size_t tree_end, std::size_t num_features, std::size_t num_rows,
-    std::size_t entry_start, std::size_t num_nodes, bool use_shared, float missing) {
-  auto const row_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (row_idx >= num_rows) {
-    return;
-  }
-  SparsePageLoader loader(data, use_shared, num_features, num_rows, entry_start, missing);
-
-  std::size_t tree_offset = 0;
-  for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
-    TreeView d_tree{tree_begin,          tree_idx,           d_nodes,
-                    d_tree_segments,     d_tree_split_types, d_cat_tree_segments,
-                    d_cat_node_segments, d_categories};
-    auto const tree_nodes = d_tree.d_tree.size();
-    for (auto nid = 0; nid < tree_nodes; nid++) {
-      auto const& node = d_tree.d_tree[nid];
-      if (node.IsDeleted() || node.IsLeaf()) {
-          continue;
-      }
-      auto const fvalue = loader.GetElement(row_idx, node.SplitIndex());
-      auto const is_missing = common::CheckNAN(fvalue);
-      auto const bit_index = row_idx * num_nodes + tree_offset + nid;
-      if (is_missing) {
-          missing_bits.Set(bit_index);
-      } else {
-          auto const decision = d_tree.HasCategoricalSplit()
-                                    ? GetDecision<true>(node, nid, fvalue, d_tree.cats)
-                                    : GetDecision<false>(node, nid, fvalue, d_tree.cats);
-          if (decision) {
-            decision_bits.Set(bit_index);
-          }
-      }
-    }
-    tree_offset += tree_nodes;
-  }
-}
-
-__device__ float GetLeafWeightByBitVector(bst_row_t ridx, TreeView const& tree,
-                                          BitVector const& decision_bits,
-                                          BitVector const& missing_bits, std::size_t num_nodes,
-                                          std::size_t tree_offset) {
-  bst_node_t nidx = 0;
-  RegTree::Node n = tree.d_tree[nidx];
-  while (!n.IsLeaf()) {
-    auto const bit_index = ridx * num_nodes + tree_offset + nidx;
-    if (missing_bits.Check(bit_index)) {
-      nidx = n.DefaultChild();
-    } else {
-      nidx = n.LeftChild() + !decision_bits.Check(bit_index);
-    }
-    n = tree.d_tree[nidx];
-  }
-  return tree.d_tree[nidx].LeafValue();
-}
-
-__global__ void PredictByBitVectorKernel(
-    common::Span<RegTree::Node const> d_nodes, common::Span<float> d_out_predictions,
-    common::Span<std::size_t const> d_tree_segments, common::Span<int const> d_tree_group,
-    common::Span<FeatureType const> d_tree_split_types,
-    common::Span<std::uint32_t const> d_cat_tree_segments,
-    common::Span<RegTree::CategoricalSplitMatrix::Segment const> d_cat_node_segments,
-    common::Span<std::uint32_t const> d_categories, BitVector decision_bits, BitVector missing_bits,
-    std::size_t tree_begin, std::size_t tree_end, std::size_t num_rows, std::size_t num_nodes,
-    std::uint32_t num_group) {
-  auto const row_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (row_idx >= num_rows) {
-    return;
-  }
-
-  std::size_t tree_offset = 0;
-  if (num_group == 1) {
-    float sum = 0;
-    for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
-      TreeView d_tree{tree_begin,          tree_idx,           d_nodes,
-                      d_tree_segments,     d_tree_split_types, d_cat_tree_segments,
-                      d_cat_node_segments, d_categories};
-      sum += GetLeafWeightByBitVector(row_idx, d_tree, decision_bits, missing_bits, num_nodes,
-                                      tree_offset);
-      tree_offset += d_tree.d_tree.size();
-    }
-    d_out_predictions[row_idx] += sum;
-  } else {
-    for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
-      auto const tree_group = d_tree_group[tree_idx];
-      TreeView d_tree{tree_begin,          tree_idx,           d_nodes,
-                      d_tree_segments,     d_tree_split_types, d_cat_tree_segments,
-                      d_cat_node_segments, d_categories};
-      bst_uint out_prediction_idx = row_idx * num_group + tree_group;
-      d_out_predictions[out_prediction_idx] += GetLeafWeightByBitVector(
-          row_idx, d_tree, decision_bits, missing_bits, num_nodes, tree_offset);
-      tree_offset += d_tree.d_tree.size();
-    }
-  }
-}
-
-class ColumnSplitHelper {
- public:
-  explicit ColumnSplitHelper(Context const* ctx) : ctx_{ctx} {}
-
-  void PredictBatch(DMatrix* dmat, HostDeviceVector<float>* out_preds,
-                    gbm::GBTreeModel const& model, DeviceModel const& d_model) const {
-    CHECK(dmat->PageExists<SparsePage>()) << "Column split for external memory is not support.";
-    PredictDMatrix(dmat, out_preds, d_model, model.learner_model_param->num_feature,
-                   model.learner_model_param->num_output_group);
-  }
-
- private:
-  using BitType = BitVector::value_type;
-
-  void PredictDMatrix(DMatrix* dmat, HostDeviceVector<float>* out_preds, DeviceModel const& model,
-                      bst_feature_t num_features, std::uint32_t num_group) const {
-    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-    dh::caching_device_vector<BitType> decision_storage{};
-    dh::caching_device_vector<BitType> missing_storage{};
-
-    auto constexpr kBlockThreads = 128;
-    auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->gpu_id);
-    auto const shared_memory_bytes =
-        SharedMemoryBytes<kBlockThreads>(num_features, max_shared_memory_bytes);
-    auto const use_shared = shared_memory_bytes != 0;
-
-    auto const num_nodes = model.nodes.Size();
-    std::size_t batch_offset = 0;
-    for (auto const& batch : dmat->GetBatches<SparsePage>()) {
-      auto const num_rows = batch.Size();
-      ResizeBitVectors(&decision_storage, &missing_storage, num_rows * num_nodes);
-      BitVector decision_bits{dh::ToSpan(decision_storage)};
-      BitVector missing_bits{dh::ToSpan(missing_storage)};
-
-      batch.offset.SetDevice(ctx_->gpu_id);
-      batch.data.SetDevice(ctx_->gpu_id);
-      std::size_t entry_start = 0;
-      SparsePageView data(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), num_features);
-
-      auto const grid = static_cast<uint32_t>(common::DivRoundUp(num_rows, kBlockThreads));
-      dh::LaunchKernel {grid, kBlockThreads, shared_memory_bytes, ctx_->CUDACtx()->Stream()} (
-          MaskBitVectorKernel, data, model.nodes.ConstDeviceSpan(),
-          model.tree_segments.ConstDeviceSpan(), model.tree_group.ConstDeviceSpan(),
-          model.split_types.ConstDeviceSpan(), model.categories_tree_segments.ConstDeviceSpan(),
-          model.categories_node_segments.ConstDeviceSpan(), model.categories.ConstDeviceSpan(),
-          decision_bits, missing_bits, model.tree_beg_, model.tree_end_, num_features, num_rows,
-          entry_start, num_nodes, use_shared, nan(""));
-
-      AllReduceBitVectors(&decision_storage, &missing_storage);
-
-      dh::LaunchKernel {grid, kBlockThreads, 0, ctx_->CUDACtx()->Stream()} (
-          PredictByBitVectorKernel, model.nodes.ConstDeviceSpan(),
-          out_preds->DeviceSpan().subspan(batch_offset), model.tree_segments.ConstDeviceSpan(),
-          model.tree_group.ConstDeviceSpan(), model.split_types.ConstDeviceSpan(),
-          model.categories_tree_segments.ConstDeviceSpan(),
-          model.categories_node_segments.ConstDeviceSpan(), model.categories.ConstDeviceSpan(),
-          decision_bits, missing_bits, model.tree_beg_, model.tree_end_, num_rows, num_nodes,
-          num_group);
-
-      batch_offset += batch.Size() * num_group;
-    }
-  }
-
-  void AllReduceBitVectors(dh::caching_device_vector<BitType>* decision_storage,
-                           dh::caching_device_vector<BitType>* missing_storage) const {
-    collective::AllReduce<collective::Operation::kBitwiseOR>(
-        ctx_->gpu_id, decision_storage->data().get(), decision_storage->size());
-    collective::AllReduce<collective::Operation::kBitwiseAND>(
-        ctx_->gpu_id, missing_storage->data().get(), missing_storage->size());
-    collective::Synchronize(ctx_->gpu_id);
-  }
-
-  void ResizeBitVectors(dh::caching_device_vector<BitType>* decision_storage,
-                               dh::caching_device_vector<BitType>* missing_storage,
-                               std::size_t total_bits) const {
-    auto const size = BitVector::ComputeStorageSize(total_bits);
-    if (decision_storage->size() < size) {
-      decision_storage->resize(size);
-    }
-    thrust::fill(ctx_->CUDACtx()->CTP(), decision_storage->begin(), decision_storage->end(), 0);
-    if (missing_storage->size() < size) {
-      missing_storage->resize(size);
-    }
-    thrust::fill(ctx_->CUDACtx()->CTP(), missing_storage->begin(), missing_storage->end(), 0);
-  }
-
-  Context const* ctx_;
-};
 }  // anonymous namespace

 class GPUPredictor : public xgboost::Predictor {
@@ -891,11 +697,6 @@ class GPUPredictor : public xgboost::Predictor {
    DeviceModel d_model;
    d_model.Init(model, tree_begin, tree_end, ctx_->gpu_id);

-    if (dmat->Info().IsColumnSplit()) {
-      column_split_helper_.PredictBatch(dmat, out_preds, model, d_model);
-      return;
-    }
-
    if (dmat->PageExists<SparsePage>()) {
      size_t batch_offset = 0;
      for (auto &batch : dmat->GetBatches<SparsePage>()) {
@@ -919,8 +720,7 @@ class GPUPredictor : public xgboost::Predictor {
  }

 public:
-  explicit GPUPredictor(Context const* ctx)
-      : Predictor::Predictor{ctx}, column_split_helper_{ctx} {}
+  explicit GPUPredictor(Context const* ctx) : Predictor::Predictor{ctx} {}  // ctx goes to the base

  ~GPUPredictor() override {
    if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) {
@@ -990,7 +790,7 @@ class GPUPredictor : public xgboost::Predictor {
                      PredictionCacheEntry* out_preds, uint32_t tree_begin,
                      unsigned tree_end) const override {
    auto proxy = dynamic_cast<data::DMatrixProxy*>(p_m.get());
-    CHECK(proxy) << error::InplacePredictProxy();
+    CHECK(proxy)<< "Inplace predict accepts only DMatrixProxy as input.";
    auto x = proxy->Adapter();
    if (x.type() == typeid(std::shared_ptr<data::CupyAdapter>)) {
      this->DispatchedInplacePredict<data::CupyAdapter,
@@ -1219,8 +1019,6 @@ class GPUPredictor : public xgboost::Predictor {
    }
    return 0;
  }
-
-  ColumnSplitHelper column_split_helper_;
 };

 XGBOOST_REGISTER_PREDICTOR(GPUPredictor, "gpu_predictor")
--- a/src/predictor/predict_fn.h
+++ b/src/predictor/predict_fn.h
@@ -7,18 +7,6 @@
 #include "xgboost/tree_model.h"

 namespace xgboost::predictor {
-/** @brief Whether it should traverse to the left branch of a tree. */
-template <bool has_categorical>
-XGBOOST_DEVICE bool GetDecision(RegTree::Node const &node, bst_node_t nid, float fvalue,
-                                RegTree::CategoricalSplitMatrix const &cats) {
-  if (has_categorical && common::IsCat(cats.split_type, nid)) {
-    auto node_categories = cats.categories.subspan(cats.node_ptr[nid].beg, cats.node_ptr[nid].size);
-    return common::Decision(node_categories, fvalue);
-  } else {
-    return fvalue < node.SplitCond();
-  }
-}
-
 template <bool has_missing, bool has_categorical>
 inline XGBOOST_DEVICE bst_node_t GetNextNode(const RegTree::Node &node, const bst_node_t nid,
                                             float fvalue, bool is_missing,
@@ -26,7 +14,13 @@ inline XGBOOST_DEVICE bst_node_t GetNextNode(const RegTree::Node &node, const bs
  if (has_missing && is_missing) {
    return node.DefaultChild();
  } else {
-    return node.LeftChild() + !GetDecision<has_categorical>(node, nid, fvalue, cats);
+    if (has_categorical && common::IsCat(cats.split_type, nid)) {
+      auto node_categories =
+          cats.categories.subspan(cats.node_ptr[nid].beg, cats.node_ptr[nid].size);
+      return common::Decision(node_categories, fvalue) ? node.LeftChild() : node.RightChild();
+    } else {
+      return node.LeftChild() + !(fvalue < node.SplitCond());
+    }
  }
 }

--- a/src/tree/constraints.cu
+++ b/src/tree/constraints.cu
@@ -1,5 +1,5 @@
-/**
- * Copyright 2019-2023, XGBoost contributors
+/*!
+ * Copyright 2019 XGBoost contributors
 */
 #include <thrust/copy.h>
 #include <thrust/device_vector.h>
@@ -140,20 +140,20 @@ void FeatureInteractionConstraintDevice::Reset() {
 __global__ void ClearBuffersKernel(
    LBitField64 result_buffer_output, LBitField64 result_buffer_input) {
  auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < result_buffer_output.Capacity()) {
+  if (tid < result_buffer_output.Size()) {  // skip threads past the output buffer's Size()
    result_buffer_output.Clear(tid);
  }
-  if (tid < result_buffer_input.Capacity()) {
+  if (tid < result_buffer_input.Size()) {  // same guard, checked separately for the input buffer
    result_buffer_input.Clear(tid);
  }
 }

 void FeatureInteractionConstraintDevice::ClearBuffers() {
-  CHECK_EQ(output_buffer_bits_.Capacity(), input_buffer_bits_.Capacity());
-  CHECK_LE(feature_buffer_.Capacity(), output_buffer_bits_.Capacity());
+  CHECK_EQ(output_buffer_bits_.Size(), input_buffer_bits_.Size());
+  CHECK_LE(feature_buffer_.Size(), output_buffer_bits_.Size());
  uint32_t constexpr kBlockThreads = 256;
  auto const n_grids = static_cast<uint32_t>(
-      common::DivRoundUp(input_buffer_bits_.Capacity(), kBlockThreads));
+      common::DivRoundUp(input_buffer_bits_.Size(), kBlockThreads));
  dh::LaunchKernel {n_grids, kBlockThreads} (
      ClearBuffersKernel,
      output_buffer_bits_, input_buffer_bits_);
@@ -207,11 +207,11 @@ common::Span<bst_feature_t> FeatureInteractionConstraintDevice::Query(
  ClearBuffers();

  LBitField64 node_constraints = s_node_constraints_[nid];
-  CHECK_EQ(input_buffer_bits_.Capacity(), output_buffer_bits_.Capacity());
+  CHECK_EQ(input_buffer_bits_.Size(), output_buffer_bits_.Size());

  uint32_t constexpr kBlockThreads = 256;
  auto n_grids = static_cast<uint32_t>(
-      common::DivRoundUp(output_buffer_bits_.Capacity(), kBlockThreads));
+      common::DivRoundUp(output_buffer_bits_.Size(), kBlockThreads));
  dh::LaunchKernel {n_grids, kBlockThreads} (
      SetInputBufferKernel,
      feature_list, input_buffer_bits_);
@@ -274,13 +274,13 @@ __global__ void InteractionConstraintSplitKernel(LBitField64 feature,
                                                 LBitField64 left,
                                                 LBitField64 right) {
  auto tid = threadIdx.x + blockDim.x * blockIdx.x;
-  if (tid > node.Capacity()) {
+  if (tid > node.Size()) {
    return;
  }
  // enable constraints from feature
  node |= feature;
  // clear the buffer after use
-  if (tid < feature.Capacity()) {
+  if (tid < feature.Size()) {
    feature.Clear(tid);
  }

@@ -323,7 +323,7 @@ void FeatureInteractionConstraintDevice::Split(
      s_sets_, s_sets_ptr_);

  uint32_t constexpr kBlockThreads = 256;
-  auto n_grids = static_cast<uint32_t>(common::DivRoundUp(node.Capacity(), kBlockThreads));
+  auto n_grids = static_cast<uint32_t>(common::DivRoundUp(node.Size(), kBlockThreads));

  dh::LaunchKernel {n_grids, kBlockThreads} (
      InteractionConstraintSplitKernel,
--- a/src/tree/gpu_hist/gradient_based_sampler.cu
+++ b/src/tree/gpu_hist/gradient_based_sampler.cu
@@ -146,30 +146,27 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t, Gra
  CombineGradientPair combine_;
 };

-NoSampling::NoSampling(BatchParam batch_param) : batch_param_(std::move(batch_param)) {}
+NoSampling::NoSampling(EllpackPageImpl const* page) : page_(page) {}

-GradientBasedSample NoSampling::Sample(Context const* ctx, common::Span<GradientPair> gpair,
+GradientBasedSample NoSampling::Sample(Context const*, common::Span<GradientPair> gpair,
                                       DMatrix* dmat) {
-  auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
-  return {dmat->Info().num_row_, page, gpair};
+  return {dmat->Info().num_row_, page_, gpair};
 }

-ExternalMemoryNoSampling::ExternalMemoryNoSampling(BatchParam batch_param)
-    : batch_param_{std::move(batch_param)} {}
+ExternalMemoryNoSampling::ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page,
+                                                   size_t n_rows, BatchParam batch_param)
+    : batch_param_{std::move(batch_param)},
+      page_(new EllpackPageImpl(ctx->gpu_id, page->Cuts(), page->is_dense, page->row_stride,
+                                n_rows)) {}

 GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
                                                     common::Span<GradientPair> gpair,
                                                     DMatrix* dmat) {
  if (!page_concatenated_) {
    // Concatenate all the external memory ELLPACK pages into a single in-memory page.
-    page_.reset(nullptr);
    size_t offset = 0;
    for (auto& batch : dmat->GetBatches<EllpackPage>(ctx, batch_param_)) {
      auto page = batch.Impl();
-      if (!page_) {
-        page_ = std::make_unique<EllpackPageImpl>(ctx->gpu_id, page->Cuts(), page->is_dense,
-                                                  page->row_stride, dmat->Info().num_row_);
-      }
      size_t num_elements = page_->Copy(ctx->gpu_id, page, offset);
      offset += num_elements;
    }
@@ -178,8 +175,8 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
  return {dmat->Info().num_row_, page_.get(), gpair};
 }

-UniformSampling::UniformSampling(BatchParam batch_param, float subsample)
-    : batch_param_{std::move(batch_param)}, subsample_(subsample) {}
+UniformSampling::UniformSampling(EllpackPageImpl const* page, float subsample)
+    : page_(page), subsample_(subsample) {}

 GradientBasedSample UniformSampling::Sample(Context const* ctx, common::Span<GradientPair> gpair,
                                            DMatrix* dmat) {
@@ -188,8 +185,7 @@ GradientBasedSample UniformSampling::Sample(Context const* ctx, common::Span<Gra
  thrust::replace_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
                     thrust::counting_iterator<std::size_t>(0),
                     BernoulliTrial(common::GlobalRandom()(), subsample_), GradientPair());
-  auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
-  return {dmat->Info().num_row_, page, gpair};
+  return {dmat->Info().num_row_, page_, gpair};
 }

 ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(size_t n_rows,
@@ -240,10 +236,12 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
  return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
 }

-GradientBasedSampling::GradientBasedSampling(std::size_t n_rows, BatchParam batch_param,
+GradientBasedSampling::GradientBasedSampling(EllpackPageImpl const* page,
+                                             size_t n_rows,
+                                             const BatchParam&,
                                             float subsample)
-    : subsample_(subsample),
-      batch_param_{std::move(batch_param)},
+    : page_(page),
+      subsample_(subsample),
      threshold_(n_rows + 1, 0.0f),
      grad_sum_(n_rows, 0.0f) {}

@@ -254,19 +252,18 @@ GradientBasedSample GradientBasedSampling::Sample(Context const* ctx,
  size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
      gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_);

-  auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
-
  // Perform Poisson sampling in place.
  thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
                    thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
                    PoissonSampling(dh::ToSpan(threshold_), threshold_index,
                                    RandomWeight(common::GlobalRandom()())));
-  return {n_rows, page, gpair};
+  return {n_rows, page_, gpair};
 }

-ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(size_t n_rows,
-                                                                         BatchParam batch_param,
-                                                                         float subsample)
+ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(
+    size_t n_rows,
+    BatchParam batch_param,
+    float subsample)
    : batch_param_(std::move(batch_param)),
      subsample_(subsample),
      threshold_(n_rows + 1, 0.0f),
@@ -276,15 +273,16 @@ ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(size_t
 GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* ctx,
                                                                common::Span<GradientPair> gpair,
                                                                DMatrix* dmat) {
-  auto cuctx = ctx->CUDACtx();
-  bst_row_t n_rows = dmat->Info().num_row_;
+  size_t n_rows = dmat->Info().num_row_;
  size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
      gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_);

  // Perform Poisson sampling in place.
-  thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
-                    thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
-                    PoissonSampling(dh::ToSpan(threshold_), threshold_index,
+  thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
+                    thrust::counting_iterator<size_t>(0),
+                    dh::tbegin(gpair),
+                    PoissonSampling(dh::ToSpan(threshold_),
+                                    threshold_index,
                                    RandomWeight(common::GlobalRandom()())));

  // Count the sampled rows.
@@ -292,15 +290,16 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c

  // Compact gradient pairs.
  gpair_.resize(sample_rows);
-  thrust::copy_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero());
+  thrust::copy_if(dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero());

  // Index the sample rows.
-  thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
-                    IsNonZero());
-  thrust::exclusive_scan(cuctx->CTP(), sample_row_index_.begin(), sample_row_index_.end(),
-                         sample_row_index_.begin());
-  thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
-                    sample_row_index_.begin(), ClearEmptyRows());
+  thrust::transform(dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(), IsNonZero());
+  thrust::exclusive_scan(sample_row_index_.begin(), sample_row_index_.end(),
+    sample_row_index_.begin());
+  thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
+                    sample_row_index_.begin(),
+                    sample_row_index_.begin(),
+                    ClearEmptyRows());

  auto batch_iterator = dmat->GetBatches<EllpackPage>(ctx, batch_param_);
  auto first_page = (*batch_iterator.begin()).Impl();
@@ -318,13 +317,13 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
  return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
 }

-GradientBasedSampler::GradientBasedSampler(Context const* /*ctx*/, size_t n_rows,
-                                           const BatchParam& batch_param, float subsample,
-                                           int sampling_method, bool is_external_memory) {
-  // The ctx is kept here for future development of stream-based operations.
+GradientBasedSampler::GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page,
+                                           size_t n_rows, const BatchParam& batch_param,
+                                           float subsample, int sampling_method) {
  monitor_.Init("gradient_based_sampler");

  bool is_sampling = subsample < 1.0;
+  bool is_external_memory = page->n_rows != n_rows;

  if (is_sampling) {
    switch (sampling_method) {
@@ -332,24 +331,24 @@ GradientBasedSampler::GradientBasedSampler(Context const* /*ctx*/, size_t n_rows
        if (is_external_memory) {
          strategy_.reset(new ExternalMemoryUniformSampling(n_rows, batch_param, subsample));
        } else {
-          strategy_.reset(new UniformSampling(batch_param, subsample));
+          strategy_.reset(new UniformSampling(page, subsample));
        }
        break;
      case TrainParam::kGradientBased:
        if (is_external_memory) {
-          strategy_.reset(new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
+          strategy_.reset(
+              new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
        } else {
-          strategy_.reset(new GradientBasedSampling(n_rows, batch_param, subsample));
+          strategy_.reset(new GradientBasedSampling(page, n_rows, batch_param, subsample));
        }
        break;
-      default:
-        LOG(FATAL) << "unknown sampling method";
+      default:LOG(FATAL) << "unknown sampling method";
    }
  } else {
    if (is_external_memory) {
-      strategy_.reset(new ExternalMemoryNoSampling(batch_param));
+      strategy_.reset(new ExternalMemoryNoSampling(ctx, page, n_rows, batch_param));
    } else {
-      strategy_.reset(new NoSampling(batch_param));
+      strategy_.reset(new NoSampling(page));
    }
  }
 }
@@ -363,11 +362,11 @@ GradientBasedSample GradientBasedSampler::Sample(Context const* ctx,
  return sample;
 }

-size_t GradientBasedSampler::CalculateThresholdIndex(common::Span<GradientPair> gpair,
-                                                     common::Span<float> threshold,
-                                                     common::Span<float> grad_sum,
-                                                     size_t sample_rows) {
-  thrust::fill(dh::tend(threshold) - 1, dh::tend(threshold), std::numeric_limits<float>::max());
+size_t GradientBasedSampler::CalculateThresholdIndex(
+    common::Span<GradientPair> gpair, common::Span<float> threshold,
+    common::Span<float> grad_sum, size_t sample_rows) {
+  thrust::fill(dh::tend(threshold) - 1, dh::tend(threshold),
+               std::numeric_limits<float>::max());
  thrust::transform(dh::tbegin(gpair), dh::tend(gpair), dh::tbegin(threshold),
                    CombineGradientPair());
  thrust::sort(dh::tbegin(threshold), dh::tend(threshold) - 1);
@@ -380,5 +379,6 @@ size_t GradientBasedSampler::CalculateThresholdIndex(common::Span<GradientPair>
      thrust::min_element(dh::tbegin(grad_sum), dh::tend(grad_sum));
  return thrust::distance(dh::tbegin(grad_sum), min) + 1;
 }
+
 };  // namespace tree
 };  // namespace xgboost
--- a/src/tree/gpu_hist/gradient_based_sampler.cuh
+++ b/src/tree/gpu_hist/gradient_based_sampler.cuh
@@ -1,5 +1,5 @@
-/**
- * Copyright 2019-2023, XGBoost Contributors
+/*!
+ * Copyright 2019 by XGBoost Contributors
 */
 #pragma once
 #include <xgboost/base.h>
@@ -32,36 +32,37 @@ class SamplingStrategy {
 /*! \brief No sampling in in-memory mode. */
 class NoSampling : public SamplingStrategy {
 public:
-  explicit NoSampling(BatchParam batch_param);
+  explicit NoSampling(EllpackPageImpl const* page);
  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
                             DMatrix* dmat) override;

 private:
-  BatchParam batch_param_;
+  EllpackPageImpl const* page_;
 };

 /*! \brief No sampling in external memory mode. */
 class ExternalMemoryNoSampling : public SamplingStrategy {
 public:
-  explicit ExternalMemoryNoSampling(BatchParam batch_param);
+  ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
+                           BatchParam batch_param);
  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
                             DMatrix* dmat) override;

 private:
  BatchParam batch_param_;
-  std::unique_ptr<EllpackPageImpl> page_{nullptr};
+  std::unique_ptr<EllpackPageImpl> page_;
  bool page_concatenated_{false};
 };

 /*! \brief Uniform sampling in in-memory mode. */
 class UniformSampling : public SamplingStrategy {
 public:
-  UniformSampling(BatchParam batch_param, float subsample);
+  UniformSampling(EllpackPageImpl const* page, float subsample);
  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
                             DMatrix* dmat) override;

 private:
-  BatchParam batch_param_;
+  EllpackPageImpl const* page_;
  float subsample_;
 };

@@ -83,12 +84,13 @@ class ExternalMemoryUniformSampling : public SamplingStrategy {
 /*! \brief Gradient-based sampling in in-memory mode.. */
 class GradientBasedSampling : public SamplingStrategy {
 public:
-  GradientBasedSampling(std::size_t n_rows, BatchParam batch_param, float subsample);
+  GradientBasedSampling(EllpackPageImpl const* page, size_t n_rows, const BatchParam& batch_param,
+                        float subsample);
  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
                             DMatrix* dmat) override;

 private:
-  BatchParam batch_param_;
+  EllpackPageImpl const* page_;
  float subsample_;
  dh::caching_device_vector<float> threshold_;
  dh::caching_device_vector<float> grad_sum_;
@@ -104,11 +106,11 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
 private:
  BatchParam batch_param_;
  float subsample_;
-  dh::device_vector<float> threshold_;
-  dh::device_vector<float> grad_sum_;
+  dh::caching_device_vector<float> threshold_;
+  dh::caching_device_vector<float> grad_sum_;
  std::unique_ptr<EllpackPageImpl> page_;
  dh::device_vector<GradientPair> gpair_;
-  dh::device_vector<size_t> sample_row_index_;
+  dh::caching_device_vector<size_t> sample_row_index_;
 };

 /*! \brief Draw a sample of rows from a DMatrix.
@@ -122,8 +124,8 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
 */
 class GradientBasedSampler {
 public:
-  GradientBasedSampler(Context const* ctx, size_t n_rows, const BatchParam& batch_param,
-                       float subsample, int sampling_method, bool is_external_memory);
+  GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
+                       const BatchParam& batch_param, float subsample, int sampling_method);

  /*! \brief Sample from a DMatrix based on the given gradient pairs. */
  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair, DMatrix* dmat);
--- a/src/tree/tree_model.cc
+++ b/src/tree/tree_model.cc
@@ -213,7 +213,7 @@ std::vector<bst_cat_t> GetSplitCategories(RegTree const &tree, int32_t nidx) {
  auto split = common::KCatBitField{csr.categories.subspan(seg.beg, seg.size)};

  std::vector<bst_cat_t> cats;
-  for (size_t i = 0; i < split.Capacity(); ++i) {
+  for (size_t i = 0; i < split.Size(); ++i) {
    if (split.Check(i)) {
      cats.push_back(static_cast<bst_cat_t>(i));
    }
@@ -1004,7 +1004,7 @@ void RegTree::SaveCategoricalSplit(Json* p_out) const {
      auto segment = split_categories_segments_[i];
      auto node_categories = this->GetSplitCategories().subspan(segment.beg, segment.size);
      common::KCatBitField const cat_bits(node_categories);
-      for (size_t i = 0; i < cat_bits.Capacity(); ++i) {
+      for (size_t i = 0; i < cat_bits.Size(); ++i) {
        if (cat_bits.Check(i)) {
          categories.GetArray().emplace_back(i);
        }
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -176,7 +176,7 @@ struct GPUHistMakerDevice {
  Context const* ctx_;

 public:
-  EllpackPageImpl const* page{nullptr};
+  EllpackPageImpl const* page;
  common::Span<FeatureType const> feature_types;
  BatchParam batch_param;

@@ -205,39 +205,41 @@ struct GPUHistMakerDevice {

  std::unique_ptr<FeatureGroups> feature_groups;

-  GPUHistMakerDevice(Context const* ctx, bool is_external_memory,
-                     common::Span<FeatureType const> _feature_types, bst_row_t _n_rows,
+
+  GPUHistMakerDevice(Context const* ctx, EllpackPageImpl const* _page,
+                     common::Span<FeatureType const> _feature_types, bst_uint _n_rows,
                     TrainParam _param, uint32_t column_sampler_seed, uint32_t n_features,
                     BatchParam _batch_param)
      : evaluator_{_param, n_features, ctx->gpu_id},
        ctx_(ctx),
+        page(_page),
        feature_types{_feature_types},
        param(std::move(_param)),
        column_sampler(column_sampler_seed),
        interaction_constraints(param, n_features),
        batch_param(std::move(_batch_param)) {
-    sampler.reset(new GradientBasedSampler(ctx, _n_rows, batch_param, param.subsample,
-                                           param.sampling_method, is_external_memory));
+    sampler.reset(new GradientBasedSampler(ctx, page, _n_rows, batch_param, param.subsample,
+                                           param.sampling_method));
    if (!param.monotone_constraints.empty()) {
      // Copy assigning an empty vector causes an exception in MSVC debug builds
      monotone_constraints = param.monotone_constraints;
    }

+    // Init histogram
+    hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
    monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id));
+    feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense,
+                                           dh::MaxSharedMemoryOptin(ctx_->gpu_id),
+                                           sizeof(GradientSumT)));
  }

-  ~GPUHistMakerDevice() = default;
-
-  void InitFeatureGroupsOnce() {
-    if (!feature_groups) {
-      CHECK(page);
-      feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense,
-                                             dh::MaxSharedMemoryOptin(ctx_->gpu_id),
-                                             sizeof(GradientSumT)));
-    }
+  ~GPUHistMakerDevice() {  // NOLINT
+    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
  }

  // Reset values for each update iteration
+  // Note that the column sampler must be passed by value because it is not
+  // thread safe
  void Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* dmat, int64_t num_columns) {
    auto const& info = dmat->Info();
    this->column_sampler.Init(ctx_, num_columns, info.feature_weights.HostVector(),
@@ -245,30 +247,26 @@ struct GPUHistMakerDevice {
                              param.colsample_bytree);
    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));

+    this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
+                           ctx_->gpu_id);
+
    this->interaction_constraints.Reset();

    if (d_gpair.size() != dh_gpair->Size()) {
      d_gpair.resize(dh_gpair->Size());
    }
-    dh::safe_cuda(cudaMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
-                                  dh_gpair->Size() * sizeof(GradientPair),
-                                  cudaMemcpyDeviceToDevice));
+    dh::safe_cuda(cudaMemcpyAsync(
+        d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
+        dh_gpair->Size() * sizeof(GradientPair), cudaMemcpyDeviceToDevice));
    auto sample = sampler->Sample(ctx_, dh::ToSpan(d_gpair), dmat);
    page = sample.page;
    gpair = sample.gpair;

-    this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, ctx_->gpu_id);
-
    quantiser.reset(new GradientQuantiser(this->gpair));

    row_partitioner.reset();  // Release the device memory first before reallocating
-    row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, sample.sample_rows));
-
-    // Init histogram
-    hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
+    row_partitioner.reset(new RowPartitioner(ctx_->gpu_id,  sample.sample_rows));
    hist.Reset();
-
-    this->InitFeatureGroupsOnce();
  }

  GPUExpandEntry EvaluateRootSplit(GradientPairInt64 root_sum) {
@@ -810,11 +808,12 @@ class GPUHistMaker : public TreeUpdater {
    collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);

    auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
+    auto page = (*dmat->GetBatches<EllpackPage>(ctx_, batch_param).begin()).Impl();
    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
    info_->feature_types.SetDevice(ctx_->gpu_id);
    maker.reset(new GPUHistMakerDevice<GradientSumT>(
-        ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_,
-        *param, column_sampling_seed, info_->num_col_, batch_param));
+        ctx_, page, info_->feature_types.ConstDeviceSpan(), info_->num_row_, *param,
+        column_sampling_seed, info_->num_col_, batch_param));

    p_last_fmat_ = dmat;
    initialised_ = true;
--- a/tests/buildkite/conftest.sh
+++ b/tests/buildkite/conftest.sh
@@ -24,7 +24,7 @@ set -x

 CUDA_VERSION=11.8.0
 NCCL_VERSION=2.16.5-1
-RAPIDS_VERSION=23.08
+RAPIDS_VERSION=23.06
 SPARK_VERSION=3.4.0
 JDK_VERSION=8

--- a/tests/ci_build/lint_python.py
+++ b/tests/ci_build/lint_python.py
@@ -23,11 +23,7 @@ class LintersPaths:
        "tests/python/test_predict.py",
        "tests/python/test_quantile_dmatrix.py",
        "tests/python/test_tree_regularization.py",
-        "tests/python/test_shap.py",
        "tests/python-gpu/test_gpu_data_iterator.py",
-        "tests/python-gpu/test_gpu_prediction.py",
-        "tests/python-gpu/load_pickle.py",
-        "tests/python-gpu/test_gpu_pickling.py",
        "tests/test_distributed/test_with_spark/",
        "tests/test_distributed/test_gpu_with_spark/",
        # demo
@@ -41,7 +37,6 @@ class LintersPaths:
        "demo/guide-python/quantile_regression.py",
        "demo/guide-python/multioutput_regression.py",
        "demo/guide-python/learning_to_rank.py",
-        "demo/aft_survival/aft_survival_viz_demo.py",
        # CI
        "tests/ci_build/lint_python.py",
        "tests/ci_build/test_r_package.py",
@@ -71,7 +66,6 @@ class LintersPaths:
        "tests/python/test_dt.py",
        "tests/python/test_data_iterator.py",
        "tests/python-gpu/test_gpu_data_iterator.py",
-        "tests/python-gpu/load_pickle.py",
        "tests/test_distributed/test_with_spark/test_data.py",
        "tests/test_distributed/test_gpu_with_spark/test_data.py",
        "tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py",
@@ -84,7 +78,6 @@ class LintersPaths:
        "demo/guide-python/quantile_regression.py",
        "demo/guide-python/multioutput_regression.py",
        "demo/guide-python/learning_to_rank.py",
-        "demo/aft_survival/aft_survival_viz_demo.py",
        # CI
        "tests/ci_build/lint_python.py",
        "tests/ci_build/test_r_package.py",
@@ -121,13 +114,7 @@ def run_black(rel_path: str, fix: bool) -> bool:
@cd(PY_PACKAGE)
 def run_isort(rel_path: str, fix: bool) -> bool:
    # Isort gets confused when trying to find the config file, so specified explicitly.
-    cmd = [
-        "isort",
-        "--settings-path",
-        PY_PACKAGE,
-        f"--src={PY_PACKAGE}",
-        os.path.join(ROOT, rel_path),
-    ]
+    cmd = ["isort", "--settings-path", PY_PACKAGE, os.path.join(ROOT, rel_path)]
    if not fix:
        cmd += ["--check"]

--- a/tests/cpp/collective/test_nccl_device_communicator.cu
+++ b/tests/cpp/collective/test_nccl_device_communicator.cu
@@ -5,18 +5,21 @@

 #include <gtest/gtest.h>

-#include <bitset>
 #include <string>  // for string

-#include "../../../src/collective/communicator-inl.cuh"
 #include "../../../src/collective/nccl_device_communicator.cuh"
-#include "../helpers.h"
+#include "../../../src/collective/communicator-inl.cuh"

 namespace xgboost {
 namespace collective {

 TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidDeviceOrdinal) {
-  auto construct = []() { NcclDeviceCommunicator comm{-1, false}; };
+  auto construct = []() { NcclDeviceCommunicator comm{-1, nullptr}; };
+  EXPECT_THROW(construct(), dmlc::Error);
+}
+
+TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidCommunicator) {
+  auto construct = []() { NcclDeviceCommunicator comm{0, nullptr}; };
  EXPECT_THROW(construct(), dmlc::Error);
 }

@@ -28,69 +31,6 @@ TEST(NcclDeviceCommunicatorSimpleTest, SystemError) {
    ASSERT_TRUE(str.find("environment variables") != std::string::npos);
  }
 }
-
-namespace {
-void VerifyAllReduceBitwiseAND() {
-  auto const rank = collective::GetRank();
-  std::bitset<64> original{};
-  original[rank] = true;
-  HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank);
-  collective::AllReduce<collective::Operation::kBitwiseAND>(rank, buffer.DevicePointer(), 1);
-  collective::Synchronize(rank);
-  EXPECT_EQ(buffer.HostVector()[0], 0ULL);
-}
-}  // anonymous namespace
-
-TEST(NcclDeviceCommunicator, MGPUAllReduceBitwiseAND) {
-  auto const n_gpus = common::AllVisibleGPUs();
-  if (n_gpus <= 1) {
-    GTEST_SKIP() << "Skipping MGPUAllReduceBitwiseAND test with # GPUs = " << n_gpus;
-  }
-  RunWithInMemoryCommunicator(n_gpus, VerifyAllReduceBitwiseAND);
-}
-
-namespace {
-void VerifyAllReduceBitwiseOR() {
-  auto const world_size = collective::GetWorldSize();
-  auto const rank = collective::GetRank();
-  std::bitset<64> original{};
-  original[rank] = true;
-  HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank);
-  collective::AllReduce<collective::Operation::kBitwiseOR>(rank, buffer.DevicePointer(), 1);
-  collective::Synchronize(rank);
-  EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1);
-}
-}  // anonymous namespace
-
-TEST(NcclDeviceCommunicator, MGPUAllReduceBitwiseOR) {
-  auto const n_gpus = common::AllVisibleGPUs();
-  if (n_gpus <= 1) {
-    GTEST_SKIP() << "Skipping MGPUAllReduceBitwiseOR test with # GPUs = " << n_gpus;
-  }
-  RunWithInMemoryCommunicator(n_gpus, VerifyAllReduceBitwiseOR);
-}
-
-namespace {
-void VerifyAllReduceBitwiseXOR() {
-  auto const world_size = collective::GetWorldSize();
-  auto const rank = collective::GetRank();
-  std::bitset<64> original{~0ULL};
-  original[rank] = false;
-  HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank);
-  collective::AllReduce<collective::Operation::kBitwiseXOR>(rank, buffer.DevicePointer(), 1);
-  collective::Synchronize(rank);
-  EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1);
-}
-}  // anonymous namespace
-
-TEST(NcclDeviceCommunicator, MGPUAllReduceBitwiseXOR) {
-  auto const n_gpus = common::AllVisibleGPUs();
-  if (n_gpus <= 1) {
-    GTEST_SKIP() << "Skipping MGPUAllReduceBitwiseXOR test with # GPUs = " << n_gpus;
-  }
-  RunWithInMemoryCommunicator(n_gpus, VerifyAllReduceBitwiseXOR);
-}
-
 }  // namespace collective
 }  // namespace xgboost

--- a/tests/cpp/common/test_bitfield.cc
+++ b/tests/cpp/common/test_bitfield.cc
@@ -1,5 +1,5 @@
-/**
- * Copyright 2019-2023, XGBoost contributors
+/*!
+ * Copyright 2019 XGBoost contributors
 */
 #include <gtest/gtest.h>
 #include "../../../src/common/bitfield.h"
@@ -14,7 +14,7 @@ TEST(BitField, Check) {
                static_cast<typename common::Span<LBitField64::value_type>::index_type>(
                    storage.size())});
    size_t true_bit = 190;
-    for (size_t i = true_bit + 1; i < bits.Capacity(); ++i) {
+    for (size_t i = true_bit + 1; i < bits.Size(); ++i) {
      ASSERT_FALSE(bits.Check(i));
    }
    ASSERT_TRUE(bits.Check(true_bit));
@@ -34,7 +34,7 @@ TEST(BitField, Check) {
      ASSERT_FALSE(bits.Check(i));
    }
    ASSERT_TRUE(bits.Check(true_bit));
-    for (size_t i = true_bit + 1; i < bits.Capacity(); ++i) {
+    for (size_t i = true_bit + 1; i < bits.Size(); ++i) {
      ASSERT_FALSE(bits.Check(i));
    }
  }
--- a/tests/cpp/common/test_bitfield.cu
+++ b/tests/cpp/common/test_bitfield.cu
@@ -1,5 +1,5 @@
-/**
- * Copyright 2019-2023, XGBoost contributors
+/*!
+ * Copyright 2019 XGBoost contributors
 */
 #include <gtest/gtest.h>
 #include <thrust/copy.h>
@@ -12,7 +12,7 @@ namespace xgboost {

 __global__ void TestSetKernel(LBitField64 bits) {
  auto tid = threadIdx.x + blockIdx.x * blockDim.x;
-  if (tid < bits.Capacity()) {
+  if (tid < bits.Size()) {
    bits.Set(tid);
  }
 }
@@ -36,16 +36,20 @@ TEST(BitField, GPUSet) {

  std::vector<LBitField64::value_type> h_storage(storage.size());
  thrust::copy(storage.begin(), storage.end(), h_storage.begin());
-  LBitField64 outputs{
-      common::Span<LBitField64::value_type>{h_storage.data(), h_storage.data() + h_storage.size()}};
+
+  LBitField64 outputs {
+    common::Span<LBitField64::value_type>{h_storage.data(),
+                                       h_storage.data() + h_storage.size()}};
  for (size_t i = 0; i < kBits; ++i) {
    ASSERT_TRUE(outputs.Check(i));
  }
 }

-namespace {
-template <bool is_and, typename Op>
-void TestGPULogic(Op op) {
+__global__ void TestOrKernel(LBitField64 lhs, LBitField64 rhs) {
+  lhs |= rhs;
+}
+
+TEST(BitField, GPUAnd) {
  uint32_t constexpr kBits = 128;
  dh::device_vector<LBitField64::value_type> lhs_storage(kBits);
  dh::device_vector<LBitField64::value_type> rhs_storage(kBits);
@@ -53,32 +57,13 @@ void TestGPULogic(Op op) {
  auto rhs = LBitField64(dh::ToSpan(rhs_storage));
  thrust::fill(lhs_storage.begin(), lhs_storage.end(), 0UL);
  thrust::fill(rhs_storage.begin(), rhs_storage.end(), ~static_cast<LBitField64::value_type>(0UL));
-  dh::LaunchN(kBits, [=] __device__(auto) mutable { op(lhs, rhs); });
+  TestOrKernel<<<1, kBits>>>(lhs, rhs);

  std::vector<LBitField64::value_type> h_storage(lhs_storage.size());
  thrust::copy(lhs_storage.begin(), lhs_storage.end(), h_storage.begin());
-  LBitField64 outputs{{h_storage.data(), h_storage.data() + h_storage.size()}};
-  if (is_and) {
-    for (size_t i = 0; i < kBits; ++i) {
-      ASSERT_FALSE(outputs.Check(i));
-    }
-  } else {
-    for (size_t i = 0; i < kBits; ++i) {
-      ASSERT_TRUE(outputs.Check(i));
-    }
+  LBitField64 outputs {{h_storage.data(), h_storage.data() + h_storage.size()}};
+  for (size_t i = 0; i < kBits; ++i) {
+    ASSERT_TRUE(outputs.Check(i));
  }
 }
-
-void TestGPUAnd() {
-  TestGPULogic<true>([] XGBOOST_DEVICE(LBitField64 & lhs, LBitField64 const& rhs) { lhs &= rhs; });
-}
-
-void TestGPUOr() {
-  TestGPULogic<false>([] XGBOOST_DEVICE(LBitField64 & lhs, LBitField64 const& rhs) { lhs |= rhs; });
-}
-}  // namespace
-
-TEST(BitField, GPUAnd) { TestGPUAnd(); }
-
-TEST(BitField, GPUOr) { TestGPUOr(); }
-}  // namespace xgboost
+}  // namespace xgboost
--- a/tests/cpp/common/test_column_matrix.cc
+++ b/tests/cpp/common/test_column_matrix.cc
@@ -83,9 +83,7 @@ template <typename BinIdxType>
 void CheckColumWithMissingValue(const DenseColumnIter<BinIdxType, true>& col,
                                const GHistIndexMatrix& gmat) {
  for (auto i = 0ull; i < col.Size(); i++) {
-    if (col.IsMissing(i)) {
-      continue;
-    }
+    if (col.IsMissing(i)) continue;
    EXPECT_EQ(gmat.index[gmat.row_ptr[i]], col.GetGlobalBinIdx(i));
  }
 }
--- a/tests/cpp/common/test_hist_util.cu
+++ b/tests/cpp/common/test_hist_util.cu
@@ -143,14 +143,11 @@ TEST(HistUtil, DeviceSketchCategoricalFeatures) {

 void TestMixedSketch() {
  size_t n_samples = 1000, n_features = 2, n_categories = 3;
-  bst_bin_t n_bins = 64;
-
  std::vector<float> data(n_samples * n_features);
  SimpleLCG gen;
  SimpleRealUniformDistribution<float> cat_d{0.0f, static_cast<float>(n_categories)};
  SimpleRealUniformDistribution<float> num_d{0.0f, 3.0f};
  for (size_t i = 0; i < n_samples * n_features; ++i) {
-    // two features, row major. The first column is numeric and the second is categorical.
    if (i % 2 == 0) {
      data[i] = std::floor(cat_d(&gen));
    } else {
@@ -162,75 +159,12 @@ void TestMixedSketch() {
  m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);
  m->Info().feature_types.HostVector().push_back(FeatureType::kNumerical);

-  auto cuts = DeviceSketch(0, m.get(), n_bins);
-  ASSERT_EQ(cuts.Values().size(), n_bins + n_categories);
+  auto cuts = DeviceSketch(0, m.get(), 64);
+  ASSERT_EQ(cuts.Values().size(), 64 + n_categories);
 }

-TEST(HistUtil, DeviceSketchMixedFeatures) { TestMixedSketch(); }
-
-TEST(HistUtil, RemoveDuplicatedCategories) {
-  bst_row_t n_samples = 512;
-  bst_feature_t n_features = 3;
-  bst_cat_t n_categories = 5;
-
-  auto ctx = MakeCUDACtx(0);
-  SimpleLCG rng;
-  SimpleRealUniformDistribution<float> cat_d{0.0f, static_cast<float>(n_categories)};
-
-  dh::device_vector<Entry> sorted_entries(n_samples * n_features);
-  for (std::size_t i = 0; i < n_samples; ++i) {
-    for (bst_feature_t j = 0; j < n_features; ++j) {
-      float fvalue{0.0f};
-      // The second column is categorical
-      if (j == 1) {
-        fvalue = std::floor(cat_d(&rng));
-      } else {
-        fvalue = i;
-      }
-      sorted_entries[i * n_features + j] = Entry{j, fvalue};
-    }
-  }
-
-  MetaInfo info;
-  info.num_col_ = n_features;
-  info.num_row_ = n_samples;
-  info.feature_types.HostVector() = std::vector<FeatureType>{
-      FeatureType::kNumerical, FeatureType::kCategorical, FeatureType::kNumerical};
-  ASSERT_EQ(info.feature_types.Size(), n_features);
-
-  HostDeviceVector<bst_row_t> cuts_ptr{0, n_samples, n_samples * 2, n_samples * 3};
-  cuts_ptr.SetDevice(0);
-
-  dh::device_vector<float> weight(n_samples * n_features, 0);
-  dh::Iota(dh::ToSpan(weight));
-
-  dh::caching_device_vector<bst_row_t> columns_ptr(4);
-  for (std::size_t i = 0; i < columns_ptr.size(); ++i) {
-    columns_ptr[i] = i * n_samples;
-  }
-  // sort into column major
-  thrust::sort_by_key(sorted_entries.begin(), sorted_entries.end(), weight.begin(),
-                      detail::EntryCompareOp());
-
-  detail::RemoveDuplicatedCategories(ctx.gpu_id, info, cuts_ptr.DeviceSpan(), &sorted_entries,
-                                     &weight, &columns_ptr);
-
-  auto const& h_cptr = cuts_ptr.ConstHostVector();
-  ASSERT_EQ(h_cptr.back(), n_samples * 2 + n_categories);
-  // check numerical
-  for (std::size_t i = 0; i < n_samples; ++i) {
-    ASSERT_EQ(weight[i], i * 3);
-  }
-  auto beg = n_samples + n_categories;
-  for (std::size_t i = 0; i < n_samples; ++i) {
-    ASSERT_EQ(weight[i + beg], i * 3 + 2);
-  }
-  // check categorical
-  beg = n_samples;
-  for (std::size_t i = 0; i < n_categories; ++i) {
-    // all from the second column
-    ASSERT_EQ(static_cast<bst_feature_t>(weight[i + beg]) % n_features, 1);
-  }
+TEST(HistUtil, DeviceSketchMixedFeatures) {
+  TestMixedSketch();
 }

 TEST(HistUtil, DeviceSketchMultipleColumns) {
--- a/tests/cpp/common/test_io.cc
+++ b/tests/cpp/common/test_io.cc
@@ -1,16 +1,16 @@
-/**
- * Copyright 2019-2023, XGBoost Contributors
+/*!
+ * Copyright (c) by XGBoost Contributors 2019
 */
 #include <gtest/gtest.h>

-#include <cstddef>  // for size_t
-#include <fstream>  // for ofstream
+#include <fstream>

 #include "../../../src/common/io.h"
-#include "../filesystem.h"  // dmlc::TemporaryDirectory
 #include "../helpers.h"
+#include "../filesystem.h"  // dmlc::TemporaryDirectory

-namespace xgboost::common {
+namespace xgboost {
+namespace common {
 TEST(MemoryFixSizeBuffer, Seek) {
  size_t constexpr kSize { 64 };
  std::vector<int32_t> memory( kSize );
@@ -89,123 +89,5 @@ TEST(IO, LoadSequentialFile) {

  ASSERT_THROW(LoadSequentialFile("non-exist", true), dmlc::Error);
 }
-
-TEST(IO, Resource) {
-  {
-    // test malloc basic
-    std::size_t n = 128;
-    std::shared_ptr<ResourceHandler> resource = std::make_shared<MallocResource>(n);
-    ASSERT_EQ(resource->Size(), n);
-    ASSERT_EQ(resource->Type(), ResourceHandler::kMalloc);
-  }
-
-  // test malloc resize
-  auto test_malloc_resize = [](bool force_malloc) {
-    std::size_t n = 64;
-    std::shared_ptr<ResourceHandler> resource = std::make_shared<MallocResource>(n);
-    auto ptr = reinterpret_cast<std::uint8_t *>(resource->Data());
-    std::iota(ptr, ptr + n, 0);
-
-    auto malloc_resource = std::dynamic_pointer_cast<MallocResource>(resource);
-    ASSERT_TRUE(malloc_resource);
-    if (force_malloc) {
-      malloc_resource->Resize<true>(n * 2);
-    } else {
-      malloc_resource->Resize<false>(n * 2);
-    }
-    for (std::size_t i = 0; i < n; ++i) {
-      ASSERT_EQ(malloc_resource->DataAs<std::uint8_t>()[i], i) << force_malloc;
-    }
-    for (std::size_t i = n; i < 2 * n; ++i) {
-      ASSERT_EQ(malloc_resource->DataAs<std::uint8_t>()[i], 0);
-    }
-  };
-  test_malloc_resize(true);
-  test_malloc_resize(false);
-
-  {
-    // test mmap
-    dmlc::TemporaryDirectory tmpdir;
-    auto path = tmpdir.path + "/testfile";
-
-    std::ofstream fout(path, std::ios::binary);
-    double val{1.0};
-    fout.write(reinterpret_cast<char const *>(&val), sizeof(val));
-    fout << 1.0 << std::endl;
-    fout.close();
-
-    auto resource = std::make_shared<MmapResource>(path, 0, sizeof(double));
-    ASSERT_EQ(resource->Size(), sizeof(double));
-    ASSERT_EQ(resource->Type(), ResourceHandler::kMmap);
-    ASSERT_EQ(resource->DataAs<double>()[0], val);
-  }
-}
-
-TEST(IO, PrivateMmapStream) {
-  dmlc::TemporaryDirectory tempdir;
-  auto path = tempdir.path + "/testfile";
-
-  // The page size on Linux is usually set to 4096, while the allocation granularity on
-  // the Windows machine where this test is writted is 65536. We span the test to cover
-  // all of them.
-  std::size_t n_batches{64};
-  std::size_t multiplier{2048};
-
-  std::vector<std::vector<std::int32_t>> batches;
-  std::vector<std::size_t> offset{0ul};
-
-  using T = std::int32_t;
-
-  {
-    std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
-    for (std::size_t i = 0; i < n_batches; ++i) {
-      std::size_t size = (i + 1) * multiplier;
-      std::vector<T> data(size, 0);
-      std::iota(data.begin(), data.end(), i * i);
-
-      fo->Write(static_cast<std::uint64_t>(data.size()));
-      fo->Write(data.data(), data.size() * sizeof(T));
-
-      std::size_t bytes = sizeof(std::uint64_t) + data.size() * sizeof(T);
-      offset.push_back(bytes);
-
-      batches.emplace_back(std::move(data));
-    }
-  }
-
-  // Turn size info offset
-  std::partial_sum(offset.begin(), offset.end(), offset.begin());
-
-  // Test read
-  for (std::size_t i = 0; i < n_batches; ++i) {
-    std::size_t off = offset[i];
-    std::size_t n = offset.at(i + 1) - offset[i];
-    auto fi{std::make_unique<PrivateMmapConstStream>(path, off, n)};
-    std::vector<T> data;
-
-    std::uint64_t size{0};
-    ASSERT_TRUE(fi->Read(&size));
-    ASSERT_EQ(fi->Tell(), sizeof(size));
-    data.resize(size);
-
-    ASSERT_EQ(fi->Read(data.data(), size * sizeof(T)), size * sizeof(T));
-    ASSERT_EQ(data, batches[i]);
-  }
-
-  // Test consume
-  for (std::size_t i = 0; i < n_batches; ++i) {
-    std::size_t off = offset[i];
-    std::size_t n = offset.at(i + 1) - offset[i];
-    std::unique_ptr<AlignedResourceReadStream> fi{std::make_unique<PrivateMmapConstStream>(path, off, n)};
-    std::vector<T> data;
-
-    std::uint64_t size{0};
-    ASSERT_TRUE(fi->Consume(&size));
-    ASSERT_EQ(fi->Tell(), sizeof(size));
-    data.resize(size);
-
-    ASSERT_EQ(fi->Read(data.data(), size * sizeof(T)), sizeof(T) * size);
-    ASSERT_EQ(data, batches[i]);
-  }
-}
-}  // namespace xgboost::common
+}  // namespace common
+}  // namespace xgboost
--- a/tests/cpp/common/test_json.cc
+++ b/tests/cpp/common/test_json.cc
@@ -41,6 +41,7 @@ std::string GetModelStr() {
    "num_class": "0",
    "num_feature": "10",
    "objective": "reg:linear",
+    "predictor": "gpu_predictor",
    "tree_method": "gpu_hist",
    "updater": "grow_gpu_hist"
  },
--- a/tests/cpp/common/test_ref_resource_view.cc
+++ b/tests/cpp/common/test_ref_resource_view.cc
@@ -1,108 +0,0 @@
-/**
- * Copyright 2023, XGBoost Contributors
- */
-#include <gtest/gtest.h>
-
-#include <cstddef>  // for size_t
-#include <memory>   // for make_shared, make_unique
-#include <numeric>  // for iota
-#include <vector>   // for vector
-
-#include "../../../src/common/ref_resource_view.h"
-#include "dmlc/filesystem.h"  // for TemporaryDirectory
-
-namespace xgboost::common {
-TEST(RefResourceView, Basic) {
-  std::size_t n_bytes = 1024;
-  auto mem = std::make_shared<MallocResource>(n_bytes);
-  {
-    RefResourceView view{reinterpret_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem};
-
-    RefResourceView kview{reinterpret_cast<float const*>(mem->Data()), mem->Size() / sizeof(float),
-                          mem};
-    ASSERT_EQ(mem.use_count(), 3);
-    ASSERT_EQ(view.size(), n_bytes / sizeof(1024));
-    ASSERT_EQ(kview.size(), n_bytes / sizeof(1024));
-  }
-  {
-    RefResourceView view{reinterpret_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem,
-                         1.5f};
-    for (auto v : view) {
-      ASSERT_EQ(v, 1.5f);
-    }
-    std::iota(view.begin(), view.end(), 0.0f);
-    ASSERT_EQ(view.front(), 0.0f);
-    ASSERT_EQ(view.back(), static_cast<float>(view.size() - 1));
-
-    view.front() = 1.0f;
-    view.back() = 2.0f;
-    ASSERT_EQ(view.front(), 1.0f);
-    ASSERT_EQ(view.back(), 2.0f);
-  }
-  ASSERT_EQ(mem.use_count(), 1);
-}
-
-TEST(RefResourceView, IO) {
-  dmlc::TemporaryDirectory tmpdir;
-  auto path = tmpdir.path + "/testfile";
-  auto data = MakeFixedVecWithMalloc(123, std::size_t{1});
-
-  {
-    auto fo = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
-    ASSERT_EQ(fo->Write(data.data(), data.size_bytes()), data.size_bytes());
-  }
-  {
-    auto fo = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
-    ASSERT_EQ(WriteVec(fo.get(), data),
-              data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type));
-  }
-  {
-    auto fi = std::make_unique<PrivateMmapConstStream>(
-        path, 0, data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type));
-    auto read = MakeFixedVecWithMalloc(123, std::size_t{1});
-    ASSERT_TRUE(ReadVec(fi.get(), &read));
-    for (auto v : read) {
-      ASSERT_EQ(v, 1ul);
-    }
-  }
-}
-
-TEST(RefResourceView, IOAligned) {
-  dmlc::TemporaryDirectory tmpdir;
-  auto path = tmpdir.path + "/testfile";
-  auto data = MakeFixedVecWithMalloc(123, 1.0f);
-
-  {
-    auto fo = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
-    // + sizeof(float) for alignment
-    ASSERT_EQ(WriteVec(fo.get(), data),
-              data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type) + sizeof(float));
-  }
-  {
-    auto fi = std::make_unique<PrivateMmapConstStream>(
-        path, 0, data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type));
-    // wrong type, float vs. double
-    auto read = MakeFixedVecWithMalloc(123, 2.0);
-    ASSERT_FALSE(ReadVec(fi.get(), &read));
-  }
-  {
-    auto fi = std::make_unique<PrivateMmapConstStream>(
-        path, 0, data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type));
-    auto read = MakeFixedVecWithMalloc(123, 2.0f);
-    ASSERT_TRUE(ReadVec(fi.get(), &read));
-    for (auto v : read) {
-      ASSERT_EQ(v, 1ul);
-    }
-  }
-  {
-    // Test std::vector
-    std::vector<float> data(123);
-    std::iota(data.begin(), data.end(), 0.0f);
-    auto fo = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
-    // + sizeof(float) for alignment
-    ASSERT_EQ(WriteVec(fo.get(), data), data.size() * sizeof(float) +
-                                            sizeof(RefResourceView<std::size_t>::size_type) +
-                                            sizeof(float));
-  }
-}
-}  // namespace xgboost::common
--- a/tests/cpp/data/test_ellpack_page_raw_format.cu
+++ b/tests/cpp/data/test_ellpack_page_raw_format.cu
@@ -4,14 +4,14 @@
 #include <gtest/gtest.h>
 #include <xgboost/data.h>

-#include "../../../src/common/io.h"  // for PrivateMmapConstStream, AlignedResourceReadStream...
 #include "../../../src/data/ellpack_page.cuh"
 #include "../../../src/data/sparse_page_source.h"
 #include "../../../src/tree/param.h"  // TrainParam
 #include "../filesystem.h"            // dmlc::TemporaryDirectory
 #include "../helpers.h"

-namespace xgboost::data {
+namespace xgboost {
+namespace data {
 TEST(EllpackPageRawFormat, IO) {
  Context ctx{MakeCUDACtx(0)};
  auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
@@ -22,17 +22,15 @@ TEST(EllpackPageRawFormat, IO) {
  dmlc::TemporaryDirectory tmpdir;
  std::string path = tmpdir.path + "/ellpack.page";

-  std::size_t n_bytes{0};
  {
-    auto fo = std::make_unique<common::AlignedFileWriteStream>(StringView{path}, "wb");
+    std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
    for (auto const &ellpack : m->GetBatches<EllpackPage>(&ctx, param)) {
-      n_bytes += format->Write(ellpack, fo.get());
+      format->Write(ellpack, fo.get());
    }
  }

  EllpackPage page;
-  std::unique_ptr<common::AlignedResourceReadStream> fi{
-      std::make_unique<common::PrivateMmapConstStream>(path.c_str(), 0, n_bytes)};
+  std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(path.c_str())};
  format->Read(&page, fi.get());

  for (auto const &ellpack : m->GetBatches<EllpackPage>(&ctx, param)) {
@@ -46,4 +44,5 @@ TEST(EllpackPageRawFormat, IO) {
    ASSERT_EQ(loaded->gidx_buffer.HostVector(), orig->gidx_buffer.HostVector());
  }
 }
-}  // namespace xgboost::data
+}  // namespace data
+}  // namespace xgboost
--- a/tests/cpp/data/test_gradient_index.cc
+++ b/tests/cpp/data/test_gradient_index.cc
@@ -26,32 +26,28 @@
 #include "xgboost/context.h"                    // for Context
 #include "xgboost/host_device_vector.h"         // for HostDeviceVector

-namespace xgboost::data {
-TEST(GradientIndex, ExternalMemoryBaseRowID) {
+namespace xgboost {
+namespace data {
+TEST(GradientIndex, ExternalMemory) {
  Context ctx;
-  auto p_fmat = RandomDataGenerator{4096, 256, 0.5}
-                    .Device(ctx.gpu_id)
-                    .Batches(8)
-                    .GenerateSparsePageDMatrix("cache", true);
-
+  std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(10000);
  std::vector<size_t> base_rowids;
-  std::vector<float> hessian(p_fmat->Info().num_row_, 1);
-  for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(&ctx, {64, hessian, true})) {
+  std::vector<float> hessian(dmat->Info().num_row_, 1);
+  for (auto const &page : dmat->GetBatches<GHistIndexMatrix>(&ctx, {64, hessian, true})) {
    base_rowids.push_back(page.base_rowid);
  }
-
-  std::size_t i = 0;
-  for (auto const &page : p_fmat->GetBatches<SparsePage>()) {
+  size_t i = 0;
+  for (auto const &page : dmat->GetBatches<SparsePage>()) {
    ASSERT_EQ(base_rowids[i], page.base_rowid);
    ++i;
  }

  base_rowids.clear();
-  for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(&ctx, {64, hessian, false})) {
+  for (auto const &page : dmat->GetBatches<GHistIndexMatrix>(&ctx, {64, hessian, false})) {
    base_rowids.push_back(page.base_rowid);
  }
  i = 0;
-  for (auto const &page : p_fmat->GetBatches<SparsePage>()) {
+  for (auto const &page : dmat->GetBatches<SparsePage>()) {
    ASSERT_EQ(base_rowids[i], page.base_rowid);
    ++i;
  }
@@ -175,7 +171,7 @@ class GHistIndexMatrixTest : public testing::TestWithParam<std::tuple<float, flo
    gpu_ctx.gpu_id = 0;
    for (auto const &page : Xy->GetBatches<EllpackPage>(
             &gpu_ctx, BatchParam{kBins, tree::TrainParam::DftSparseThreshold()})) {
-      from_ellpack = std::make_unique<GHistIndexMatrix>(&ctx, Xy->Info(), page, p);
+      from_ellpack.reset(new GHistIndexMatrix{&ctx, Xy->Info(), page, p});
    }

    for (auto const &from_sparse_page : Xy->GetBatches<GHistIndexMatrix>(&ctx, p)) {
@@ -203,15 +199,13 @@ class GHistIndexMatrixTest : public testing::TestWithParam<std::tuple<float, flo

      std::string from_sparse_buf;
      {
-        common::AlignedMemWriteStream fo{&from_sparse_buf};
-        auto n_bytes = columns_from_sparse.Write(&fo);
-        ASSERT_EQ(fo.Tell(), n_bytes);
+        common::MemoryBufferStream fo{&from_sparse_buf};
+        columns_from_sparse.Write(&fo);
      }
      std::string from_ellpack_buf;
      {
-        common::AlignedMemWriteStream fo{&from_ellpack_buf};
-        auto n_bytes = columns_from_sparse.Write(&fo);
-        ASSERT_EQ(fo.Tell(), n_bytes);
+        common::MemoryBufferStream fo{&from_ellpack_buf};
+        columns_from_sparse.Write(&fo);
      }
      ASSERT_EQ(from_sparse_buf, from_ellpack_buf);
    }
@@ -235,4 +229,5 @@ INSTANTIATE_TEST_SUITE_P(GHistIndexMatrix, GHistIndexMatrixTest,
                                         std::make_tuple(.6f, .4)));  // dense columns

 #endif  // defined(XGBOOST_USE_CUDA)
-}  // namespace xgboost::data
+}  // namespace data
+}  // namespace xgboost
--- a/Show More
+++ b/Show More