[CI] Update RAPIDS to latest stable

Fix NCCL test hang (#9367 )
Preserve order of saved updaters config. (#9355 )
2023-08-10 20:02:16 +00:00 · 2023-07-07 11:21:35 +08:00 · 2023-07-05 20:20:07 +08:00 · 2023-07-05 16:49:25 +08:00 · 2023-07-05 15:17:22 +08:00 · 2023-07-05 05:46:17 +08:00
118 changed files with 3722 additions and 1844 deletions
--- a/.github/workflows/python_tests.yml
+++ b/.github/workflows/python_tests.yml
@@ -255,3 +255,44 @@ jobs:
      shell: bash -l {0}
      run: |
        pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_spark
+
+  python-system-installation-on-ubuntu:
+    name: Test XGBoost Python package System Installation on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          submodules: 'true'
+
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.8
+
+      - name: Install ninja
+        run: |
+          sudo apt-get update && sudo apt-get install -y ninja-build
+
+      - name: Build XGBoost on Ubuntu
+        run: |
+          mkdir build
+          cd build
+          cmake .. -GNinja
+          ninja
+
+      - name: Copy lib to system lib
+        run: |
+          cp lib/* "$(python -c 'import sys; print(sys.base_prefix)')/lib"
+
+      - name: Install XGBoost in Virtual Environment
+        run: |
+          cd python-package
+          pip install virtualenv
+          virtualenv venv
+          source venv/bin/activate && \
+            pip install -v . --config-settings use_system_libxgboost=True && \
+            python -c 'import xgboost'
--- a/NEWS.md
+++ b/NEWS.md
@@ -3,6 +3,23 @@ XGBoost Change Log

 This file records the changes in xgboost library in reverse chronological order.

+## 1.7.6 (2023 Jun 16)
+
+This is a patch release for bug fixes. The CRAN package for the R binding is kept at 1.7.5.
+
+### Bug Fixes
+* Fix distributed training with mixed dense and sparse partitions. (#9272)
+* Fix monotone constraints on CPU with large trees. (#9122)
+* [spark] Make the spark model have the same UID as its estimator (#9022)
+* Optimize prediction with `QuantileDMatrix`. (#9096)
+
+### Document
+* Improve doxygen (#8959)
+* Update the cuDF pip index URL. (#9106)
+
+### Maintenance
+* Fix tests with pandas 2.0. (#9014)
+
 ## 1.7.5 (2023 Mar 30)
 This is a patch release for bug fixes.

--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -85,9 +85,18 @@ test_that("dart prediction works", {
    rnorm(100)

  set.seed(1994)
-  booster_by_xgboost <- xgboost(data = d, label = y, max_depth = 2, booster = "dart",
-                                rate_drop = 0.5, one_drop = TRUE,
-                                eta = 1, nthread = 2, nrounds = nrounds, objective = "reg:squarederror")
+  booster_by_xgboost <- xgboost(
+    data = d,
+    label = y,
+    max_depth = 2,
+    booster = "dart",
+    rate_drop = 0.5,
+    one_drop = TRUE,
+    eta = 1,
+    nthread = 2,
+    nrounds = nrounds,
+    objective = "reg:squarederror"
+  )
  pred_by_xgboost_0 <- predict(booster_by_xgboost, newdata = d, ntreelimit = 0)
  pred_by_xgboost_1 <- predict(booster_by_xgboost, newdata = d, ntreelimit = nrounds)
  expect_true(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE)))
@@ -97,19 +106,19 @@ test_that("dart prediction works", {

  set.seed(1994)
  dtrain <- xgb.DMatrix(data = d, info = list(label = y))
-  booster_by_train <- xgb.train(params = list(
-                                    booster = "dart",
-                                    max_depth = 2,
-                                    eta = 1,
-                                    rate_drop = 0.5,
-                                    one_drop = TRUE,
-                                    nthread = 1,
-                                    tree_method = "exact",
-                                    objective = "reg:squarederror"
-                                ),
-                                data = dtrain,
-                                nrounds = nrounds
-                                )
+  booster_by_train <- xgb.train(
+    params = list(
+      booster = "dart",
+      max_depth = 2,
+      eta = 1,
+      rate_drop = 0.5,
+      one_drop = TRUE,
+      nthread = 1,
+      objective = "reg:squarederror"
+    ),
+    data = dtrain,
+    nrounds = nrounds
+  )
  pred_by_train_0 <- predict(booster_by_train, newdata = dtrain, ntreelimit = 0)
  pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, ntreelimit = nrounds)
  pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE)
@@ -399,7 +408,7 @@ test_that("colsample_bytree works", {
  xgb.importance(model = bst)
  # If colsample_bytree works properly, a variety of features should be used
  # in the 100 trees
-  expect_gte(nrow(xgb.importance(model = bst)), 30)
+  expect_gte(nrow(xgb.importance(model = bst)), 28)
 })

 test_that("Configuration works", {
--- a/R-package/tests/testthat/test_model_compatibility.R
+++ b/R-package/tests/testthat/test_model_compatibility.R
@@ -76,32 +76,20 @@ test_that("Models from previous versions of XGBoost can be loaded", {
    name <- m[3]
    is_rds <- endsWith(model_file, '.rds')
    is_json <- endsWith(model_file, '.json')
-
-    cpp_warning <- capture.output({
-      # Expect an R warning when a model is loaded from RDS and it was generated by version < 1.1.x
-      if (is_rds && compareVersion(model_xgb_ver, '1.1.1.1') < 0) {
+    # Expect an R warning when a model is loaded from RDS and it was generated by version < 1.1.x
+    if (is_rds && compareVersion(model_xgb_ver, '1.1.1.1') < 0) {
+      booster <- readRDS(model_file)
+      expect_warning(predict(booster, newdata = pred_data))
+      booster <- readRDS(model_file)
+      expect_warning(run_booster_check(booster, name))
+    } else {
+      if (is_rds) {
        booster <- readRDS(model_file)
-        expect_warning(predict(booster, newdata = pred_data))
-        booster <- readRDS(model_file)
-        expect_warning(run_booster_check(booster, name))
      } else {
-        if (is_rds) {
-          booster <- readRDS(model_file)
-        } else {
-          booster <- xgb.load(model_file)
-        }
-        predict(booster, newdata = pred_data)
-        run_booster_check(booster, name)
+        booster <- xgb.load(model_file)
      }
-    })
-    cpp_warning <- paste0(cpp_warning, collapse = ' ')
-    if (is_rds && compareVersion(model_xgb_ver, '1.1.1.1') >= 0) {
-      # Expect a C++ warning when a model is loaded from RDS and it was generated by old XGBoost`
-      m <- grepl(paste0('.*If you are loading a serialized model ',
-                        '\\(like pickle in Python, RDS in R\\).*',
-                        'for more details about differences between ',
-                        'saving model and serializing.*'), cpp_warning, perl = TRUE)
-      expect_true(length(m) > 0 && all(m))
+      predict(booster, newdata = pred_data)
+      run_booster_check(booster, name)
    }
  })
 })
--- a/R-package/tests/testthat/test_update.R
+++ b/R-package/tests/testthat/test_update.R
@@ -13,7 +13,10 @@ test_that("updating the model works", {
  watchlist <- list(train = dtrain, test = dtest)

  # no-subsampling
-  p1 <- list(objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2)
+  p1 <- list(
+    objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2,
+    updater = "grow_colmaker,prune"
+  )
  set.seed(11)
  bst1 <- xgb.train(p1, dtrain, nrounds = 10, watchlist, verbose = 0)
  tr1 <- xgb.model.dt.tree(model = bst1)
--- a/R-package/vignettes/xgboost.bib
+++ b/R-package/vignettes/xgboost.bib
@@ -18,13 +18,11 @@
  publisher={Institute of Mathematical Statistics}
 }

-
@misc{
    Bache+Lichman:2013 ,
    author = "K. Bache and M. Lichman",
    year = "2013",
    title = "{UCI} Machine Learning Repository",
-    url = "http://archive.ics.uci.edu/ml/",
-    institution = "University of California, Irvine, School of Information and Computer Sciences" 
+    url = "https://archive.ics.uci.edu/",
+    institution = "University of California, Irvine, School of Information and Computer Sciences"
 }
-
--- a/demo/aft_survival/aft_survival_viz_demo.py
+++ b/demo/aft_survival/aft_survival_viz_demo.py
@@ -11,33 +11,43 @@ import numpy as np

 import xgboost as xgb

-plt.rcParams.update({'font.size': 13})
+plt.rcParams.update({"font.size": 13})
+

 # Function to visualize censored labels
-def plot_censored_labels(X, y_lower, y_upper):
-    def replace_inf(x, target_value):
+def plot_censored_labels(
+    X: np.ndarray, y_lower: np.ndarray, y_upper: np.ndarray
+) -> None:
+    def replace_inf(x: np.ndarray, target_value: float) -> np.ndarray:
        x[np.isinf(x)] = target_value
        return x
-    plt.plot(X, y_lower, 'o', label='y_lower', color='blue')
-    plt.plot(X, y_upper, 'o', label='y_upper', color='fuchsia')
-    plt.vlines(X, ymin=replace_inf(y_lower, 0.01), ymax=replace_inf(y_upper, 1000),
-               label='Range for y', color='gray')
+
+    plt.plot(X, y_lower, "o", label="y_lower", color="blue")
+    plt.plot(X, y_upper, "o", label="y_upper", color="fuchsia")
+    plt.vlines(
+        X,
+        ymin=replace_inf(y_lower, 0.01),
+        ymax=replace_inf(y_upper, 1000.0),
+        label="Range for y",
+        color="gray",
+    )
+

 # Toy data
 X = np.array([1, 2, 3, 4, 5]).reshape((-1, 1))
 INF = np.inf
-y_lower = np.array([ 10,  15, -INF, 30, 100])
-y_upper = np.array([INF, INF,   20, 50, INF])
+y_lower = np.array([10, 15, -INF, 30, 100])
+y_upper = np.array([INF, INF, 20, 50, INF])

 # Visualize toy data
 plt.figure(figsize=(5, 4))
 plot_censored_labels(X, y_lower, y_upper)
 plt.ylim((6, 200))
-plt.legend(loc='lower right')
-plt.title('Toy data')
-plt.xlabel('Input feature')
-plt.ylabel('Label')
-plt.yscale('log')
+plt.legend(loc="lower right")
+plt.title("Toy data")
+plt.xlabel("Input feature")
+plt.ylabel("Label")
+plt.yscale("log")
 plt.tight_layout()
 plt.show(block=True)

@@ -46,54 +56,83 @@ grid_pts = np.linspace(0.8, 5.2, 1000).reshape((-1, 1))

 # Train AFT model using XGBoost
 dmat = xgb.DMatrix(X)
-dmat.set_float_info('label_lower_bound', y_lower)
-dmat.set_float_info('label_upper_bound', y_upper)
-params = {'max_depth': 3, 'objective':'survival:aft', 'min_child_weight': 0}
+dmat.set_float_info("label_lower_bound", y_lower)
+dmat.set_float_info("label_upper_bound", y_upper)
+params = {"max_depth": 3, "objective": "survival:aft", "min_child_weight": 0}

 accuracy_history = []
-def plot_intermediate_model_callback(env):
-    """Custom callback to plot intermediate models"""
-    # Compute y_pred = prediction using the intermediate model, at current boosting iteration
-    y_pred = env.model.predict(dmat)
-    # "Accuracy" = the number of data points whose ranged label (y_lower, y_upper) includes
-    #              the corresponding predicted label (y_pred)
-    acc = np.sum(np.logical_and(y_pred >= y_lower, y_pred <= y_upper)/len(X) * 100)
-    accuracy_history.append(acc)

-    # Plot ranged labels as well as predictions by the model
-    plt.subplot(5, 3, env.iteration + 1)
-    plot_censored_labels(X, y_lower, y_upper)
-    y_pred_grid_pts = env.model.predict(xgb.DMatrix(grid_pts))
-    plt.plot(grid_pts, y_pred_grid_pts, 'r-', label='XGBoost AFT model', linewidth=4)
-    plt.title('Iteration {}'.format(env.iteration), x=0.5, y=0.8)
-    plt.xlim((0.8, 5.2))
-    plt.ylim((1 if np.min(y_pred) < 6 else 6, 200))
-    plt.yscale('log')

-res = {}
-plt.figure(figsize=(12,13))
-bst = xgb.train(params, dmat, 15, [(dmat, 'train')], evals_result=res,
-                callbacks=[plot_intermediate_model_callback])
+class PlotIntermediateModel(xgb.callback.TrainingCallback):
+    """Custom callback to plot intermediate models."""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def after_iteration(
+        self,
+        model: xgb.Booster,
+        epoch: int,
+        evals_log: xgb.callback.TrainingCallback.EvalsLog,
+    ) -> bool:
+        """Run after training is finished."""
+        # Compute y_pred = prediction using the intermediate model, at current boosting
+        # iteration
+        y_pred = model.predict(dmat)
+        # "Accuracy" = the number of data points whose ranged label (y_lower, y_upper)
+        #              includes the corresponding predicted label (y_pred)
+        acc = np.sum(
+            np.logical_and(y_pred >= y_lower, y_pred <= y_upper) / len(X) * 100
+        )
+        accuracy_history.append(acc)
+
+        # Plot ranged labels as well as predictions by the model
+        plt.subplot(5, 3, epoch + 1)
+        plot_censored_labels(X, y_lower, y_upper)
+        y_pred_grid_pts = model.predict(xgb.DMatrix(grid_pts))
+        plt.plot(
+            grid_pts, y_pred_grid_pts, "r-", label="XGBoost AFT model", linewidth=4
+        )
+        plt.title("Iteration {}".format(epoch), x=0.5, y=0.8)
+        plt.xlim((0.8, 5.2))
+        plt.ylim((1 if np.min(y_pred) < 6 else 6, 200))
+        plt.yscale("log")
+        return False
+
+
+res: xgb.callback.TrainingCallback.EvalsLog = {}
+plt.figure(figsize=(12, 13))
+bst = xgb.train(
+    params,
+    dmat,
+    15,
+    [(dmat, "train")],
+    evals_result=res,
+    callbacks=[PlotIntermediateModel()],
+)
 plt.tight_layout()
-plt.legend(loc='lower center', ncol=4,
-           bbox_to_anchor=(0.5, 0),
-           bbox_transform=plt.gcf().transFigure)
+plt.legend(
+    loc="lower center",
+    ncol=4,
+    bbox_to_anchor=(0.5, 0),
+    bbox_transform=plt.gcf().transFigure,
+)
 plt.tight_layout()

 # Plot negative log likelihood over boosting iterations
-plt.figure(figsize=(8,3))
+plt.figure(figsize=(8, 3))
 plt.subplot(1, 2, 1)
-plt.plot(res['train']['aft-nloglik'], 'b-o', label='aft-nloglik')
-plt.xlabel('# Boosting Iterations')
-plt.legend(loc='best')
+plt.plot(res["train"]["aft-nloglik"], "b-o", label="aft-nloglik")
+plt.xlabel("# Boosting Iterations")
+plt.legend(loc="best")

 # Plot "accuracy" over boosting iterations
 # "Accuracy" = the number of data points whose ranged label (y_lower, y_upper) includes
 #              the corresponding predicted label (y_pred)
 plt.subplot(1, 2, 2)
-plt.plot(accuracy_history, 'r-o', label='Accuracy (%)')
-plt.xlabel('# Boosting Iterations')
-plt.legend(loc='best')
+plt.plot(accuracy_history, "r-o", label="Accuracy (%)")
+plt.xlabel("# Boosting Iterations")
+plt.legend(loc="best")
 plt.tight_layout()

 plt.show()
--- a/doc/build.rst
+++ b/doc/build.rst
@@ -259,7 +259,7 @@ There are several ways to build and install the package from source:

    import sys
    import pathlib
-    libpath = pathlib.Path(sys.prefix).joinpath("lib", "libxgboost.so")
+    libpath = pathlib.Path(sys.base_prefix).joinpath("lib", "libxgboost.so")
    assert libpath.exists()

  Then pass ``use_system_libxgboost=True`` option to ``pip install``:
--- a/doc/c.rst
+++ b/doc/c.rst
@@ -33,6 +33,8 @@ DMatrix
 .. doxygengroup:: DMatrix
   :project: xgboost

+.. _c_streaming:
+
 Streaming
 ---------

--- a/doc/gpu/index.rst
+++ b/doc/gpu/index.rst
@@ -45,7 +45,7 @@ XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as

 .. code-block:: python

-  model.set_param({"predictor": "gpu_predictor"})
+  model.set_param({"gpu_id": "0", "tree_method": "gpu_hist"})
  shap_values = model.predict(dtrain, pred_contribs=True)
  shap_interaction_values = model.predict(dtrain, pred_interactions=True)

--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -199,18 +199,6 @@ Parameters for Tree Booster
  - Maximum number of discrete bins to bucket continuous features.
  - Increasing this number improves the optimality of splits at the cost of higher computation time.

-* ``predictor``, [default= ``auto``]
-
-  - The type of predictor algorithm to use. Provides the same results but allows the use of GPU or CPU.
-
-    - ``auto``: Configure predictor based on heuristics.
-    - ``cpu_predictor``: Multicore CPU prediction algorithm.
-    - ``gpu_predictor``: Prediction using GPU.  Used when ``tree_method`` is ``gpu_hist``.
-      When ``predictor`` is set to default value ``auto``, the ``gpu_hist`` tree method is
-      able to provide GPU based prediction without copying training data to GPU memory.
-      If ``gpu_predictor`` is explicitly specified, then all data is copied into GPU, only
-      recommended for performing prediction tasks.
-
 * ``num_parallel_tree``, [default=1]

  - Number of parallel trees constructed during each iteration. This option is used to support boosted random forest.
--- a/doc/prediction.rst
+++ b/doc/prediction.rst
@@ -87,15 +87,6 @@ with the native Python interface :py:meth:`xgboost.Booster.predict` and
 behavior.  Also the ``save_best`` parameter from :py:obj:`xgboost.callback.EarlyStopping`
 might be useful.

-*********
-Predictor
-*********
-
-There are 2 predictors in XGBoost (3 if you have the one-api plugin enabled), namely
-``cpu_predictor`` and ``gpu_predictor``.  The default option is ``auto`` so that XGBoost
-can employ some heuristics for saving GPU memory during training.  They might have slight
-different outputs due to floating point errors.
-

 ***********
 Base Margin
@@ -134,15 +125,6 @@ it.  Be aware that the output of in-place prediction depends on input data type,
 input is on GPU data output is :py:obj:`cupy.ndarray`, otherwise a :py:obj:`numpy.ndarray`
 is returned.

-****************
-Categorical Data
-****************
-
-Other than users performing encoding, XGBoost has experimental support for categorical
-data using ``gpu_hist`` and ``gpu_predictor``.  No special operation needs to be done on
-input test data since the information about categories is encoded into the model during
-training.
-
 *************
 Thread Safety
 *************
@@ -159,7 +141,6 @@ instance we might accidentally call ``clf.set_params()`` inside a predict functi

    def predict_fn(clf: xgb.XGBClassifier, X):
        X = preprocess(X)
-        clf.set_params(predictor="gpu_predictor")  # NOT safe!
        clf.set_params(n_jobs=1)  # NOT safe!
        return clf.predict_proba(X, iteration_range=(0, 10))

--- a/doc/tutorials/dask.rst
+++ b/doc/tutorials/dask.rst
@@ -54,6 +54,9 @@ on a dask cluster:
        y = da.random.random(size=(num_obs, 1), chunks=(1000, 1))

        dtrain = xgb.dask.DaskDMatrix(client, X, y)
+        # or
+        # dtrain = xgb.dask.DaskQuantileDMatrix(client, X, y)
+        # `DaskQuantileDMatrix` is available for the `hist` and `gpu_hist` tree method.

        output = xgb.dask.train(
            client,
@@ -145,8 +148,8 @@ Also for inplace prediction:

 .. code-block:: python

-  booster.set_param({'predictor': 'gpu_predictor'})
-  # where X is a dask DataFrame or dask Array containing cupy or cuDF backed data.
+  # where X is a dask DataFrame or dask Array backed by cupy or cuDF.
+  booster.set_param({"gpu_id": "0"})
  prediction = xgb.dask.inplace_predict(client, booster, X)

 When input is ``da.Array`` object, output is always ``da.Array``.  However, if the input
--- a/doc/tutorials/external_memory.rst
+++ b/doc/tutorials/external_memory.rst
@@ -22,6 +22,15 @@ GPU-based training algorithm. We will introduce them in the following sections.

   The feature is still experimental as of 2.0. The performance is not well optimized.

+The external memory support has gone through multiple iterations and is still under heavy
+development. Like the :py:class:`~xgboost.QuantileDMatrix` with
+:py:class:`~xgboost.DataIter`, XGBoost loads data batch-by-batch using a custom iterator
+supplied by the user. However, unlike the :py:class:`~xgboost.QuantileDMatrix`, external
+memory will not concatenate the batches unless GPU is used (it uses a hybrid approach,
+more details follow). Instead, it will cache all batches on the external memory and fetch
+them on-demand.  Go to the end of the document to see a comparison between
+`QuantileDMatrix` and external memory.
+
 *************
 Data Iterator
 *************
@@ -113,10 +122,11 @@ External memory is supported by GPU algorithms (i.e. when ``tree_method`` is set
 ``gpu_hist``). However, the algorithm used for GPU is different from the one used for
 CPU. When training on a CPU, the tree method iterates through all batches from external
 memory for each step of the tree construction algorithm. On the other hand, the GPU
-algorithm concatenates all batches into one and stores it in GPU memory. To reduce overall
-memory usage, users can utilize subsampling. The good news is that the GPU hist tree
-method supports gradient-based sampling, enabling users to set a low sampling rate without
-compromising accuracy.
+algorithm uses a hybrid approach. It iterates through the data during the beginning of
+each iteration and concatenates all batches into one in GPU memory. To reduce overall
+memory usage, users can utilize subsampling. The GPU hist tree method supports
+`gradient-based sampling`, enabling users to set a low sampling rate without compromising
+accuracy.

 .. code-block:: python

@@ -134,6 +144,8 @@ see `this paper <https://arxiv.org/abs/2005.09148>`_.
   When GPU is running out of memory during iteration on external memory, user might
   recieve a segfault instead of an OOM exception.

+.. _ext_remarks:
+
 *******
 Remarks
 *******
@@ -142,17 +154,64 @@ When using external memory with XBGoost, data is divided into smaller chunks so
 a fraction of it needs to be stored in memory at any given time. It's important to note
 that this method only applies to the predictor data (``X``), while other data, like labels
 and internal runtime structures are concatenated. This means that memory reduction is most
-effective when dealing with wide datasets where ``X`` is larger compared to other data
-like ``y``, while it has little impact on slim datasets.
+effective when dealing with wide datasets where ``X`` is significantly larger in size
+compared to other data like ``y``, while it has little impact on slim datasets.
+
+As one might expect, fetching data on-demand puts significant pressure on the storage
+device. Today's computing device can process way more data than a storage can read in a
+single unit of time. The ratio is at order of magnitudes. An GPU is capable of processing
+hundred of Gigabytes of floating-point data in a split second. On the other hand, a
+four-lane NVMe storage connected to a PCIe-4 slot usually has about 6GB/s of data transfer
+rate. As a result, the training is likely to be severely bounded by your storage
+device. Before adopting the external memory solution, some back-of-envelop calculations
+might help you see whether it's viable. For instance, if your NVMe drive can transfer 4GB
+(a fairly practical number) of data per second and you have a 100GB of data in compressed
+XGBoost cache (which corresponds to a dense float32 numpy array with the size of 200GB,
+give or take). A tree with depth 8 needs at least 16 iterations through the data when the
+parameter is right. You need about 14 minutes to train a single tree without accounting
+for some other overheads and assume the computation overlaps with the IO. If your dataset
+happens to have TB-level size, then you might need thousands of trees to get a generalized
+model. These calculations can help you get an estimate on the expected training time.
+
+However, sometimes we can ameliorate this limitation. One should also consider that the OS
+(mostly talking about the Linux kernel) can usually cache the data on host memory. It only
+evicts pages when new data comes in and there's no room left. In practice, at least some
+portion of the data can persist on the host memory throughout the entire training
+session. We are aware of this cache when optimizing the external memory fetcher. The
+compressed cache is usually smaller than the raw input data, especially when the input is
+dense without any missing value. If the host memory can fit a significant portion of this
+compressed cache, then the performance should be decent after initialization. Our
+development so far focus on two fronts of optimization for external memory:
+
+- Avoid iterating through the data whenever appropriate.
+- If the OS can cache the data, the performance should be close to in-core training.

 Starting with XGBoost 2.0, the implementation of external memory uses ``mmap``. It is not
-yet tested against system errors like disconnected network devices (`SIGBUS`). Also, it's
-worth noting that most tests have been conducted on Linux distributions.
+tested against system errors like disconnected network devices (`SIGBUS`). In the face of
+a bus error, you will see a hard crash and need to clean up the cache files. If the
+training session might take a long time and you are using solutions like NVMe-oF, we
+recommend checkpointing your model periodically. Also, it's worth noting that most tests
+have been conducted on Linux distributions.
+

 Another important point to keep in mind is that creating the initial cache for XGBoost may
-take some time. The interface to external memory is through custom iterators, which may or
-may not be thread-safe. Therefore, initialization is performed sequentially.
+take some time. The interface to external memory is through custom iterators, which we can
+not assume to be thread-safe. Therefore, initialization is performed sequentially. Using
+the `xgboost.config_context` with `verbosity=2` can give you some information on what
+XGBoost is doing during the wait if you don't mind the extra output.

+*******************************
+Compared to the QuantileDMatrix
+*******************************
+
+Passing an iterator to the :py:class:`~xgboost.QuantileDmatrix` enables direct
+construction of `QuantileDmatrix` with data chunks. On the other hand, if it's passed to
+:py:class:`~xgboost.DMatrix`, it instead enables the external memory feature. The
+:py:class:`~xgboost.QuantileDmatrix` concatenates the data on memory after compression and
+doesn't fetch data during training. On the other hand, the external memory `DMatrix`
+fetches data batches from external memory on-demand.  Use the `QuantileDMatrix` (with
+iterator if necessary) when you can fit most of your data in memory. The training would be
+an order of magnitute faster than using external memory.

 ****************
 Text File Inputs
--- a/doc/tutorials/index.rst
+++ b/doc/tutorials/index.rst
@@ -11,22 +11,22 @@ See `Awesome XGBoost <https://github.com/dmlc/xgboost/tree/master/demo>`_ for mo

  model
  saving_model
+  learning_to_rank
+  dart
+  monotonic
+  feature_interaction_constraint
+  aft_survival_analysis
+  categorical
+  multioutput
+  rf
  kubernetes
  Distributed XGBoost with XGBoost4J-Spark <https://xgboost.readthedocs.io/en/latest/jvm/xgboost4j_spark_tutorial.html>
  Distributed XGBoost with XGBoost4J-Spark-GPU <https://xgboost.readthedocs.io/en/latest/jvm/xgboost4j_spark_gpu_tutorial.html>
  dask
  spark_estimator
  ray
-  dart
-  monotonic
-  rf
-  feature_interaction_constraint
-  learning_to_rank
-  aft_survival_analysis
+  external_memory
  c_api_tutorial
  input_format
  param_tuning
-  external_memory
  custom_metric_obj
-  categorical
-  multioutput
--- a/doc/tutorials/learning_to_rank.rst
+++ b/doc/tutorials/learning_to_rank.rst
@@ -48,8 +48,9 @@ Notice that the samples are sorted based on their query index in a non-decreasin
  import xgboost as xgb

  # Make a synthetic ranking dataset for demonstration
-  X, y = make_classification(random_state=rng)
-  rng = np.random.default_rng(1994)
+  seed = 1994 
+  X, y = make_classification(random_state=seed)
+  rng = np.random.default_rng(seed)
  n_query_groups = 3
  qid = rng.integers(0, 3, size=X.shape[0])

--- a/doc/tutorials/param_tuning.rst
+++ b/doc/tutorials/param_tuning.rst
@@ -58,3 +58,45 @@ This can affect the training of XGBoost model, and there are two ways to improve

  - In such a case, you cannot re-balance the dataset
  - Set parameter ``max_delta_step`` to a finite number (say 1) to help convergence
+
+
+*********************
+Reducing Memory Usage
+*********************
+
+If you are using a HPO library like :py:class:`sklearn.model_selection.GridSearchCV`,
+please control the number of threads it can use. It's best to let XGBoost to run in
+parallel instead of asking `GridSearchCV` to run multiple experiments at the same
+time. For instance, creating a fold of data for cross validation can consume a significant
+amount of memory:
+
+.. code-block:: python
+
+    # This creates a copy of dataset. X and X_train are both in memory at the same time.
+
+    # This happens for every thread at the same time if you run `GridSearchCV` with
+    # `n_jobs` larger than 1
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y)
+
+.. code-block:: python
+
+    df = pd.DataFrame()
+    # This creates a new copy of the dataframe, even if you specify the inplace parameter
+    new_df = df.drop(...)
+
+.. code-block:: python
+
+    array = np.array(...)
+    # This may or may not make a copy of the data, depending on the type of the data
+    array.astype(np.float32)
+
+.. code-block::
+
+    # np by default uses double, do you actually need it?
+    array = np.array(...)
+
+You can find some more specific memory reduction practices scattered through the documents
+For instances: :doc:`/tutorials/dask`, :doc:`/gpu/index`. However, before going into
+these, being conscious about making data copies is a good starting point. It usually
+consumes a lot more memory than people expect.
--- a/doc/tutorials/saving_model.rst
+++ b/doc/tutorials/saving_model.rst
@@ -173,7 +173,6 @@ Will print out something similar to (not actual output as it's too long for demo
        "gradient_booster": {
          "gbtree_train_param": {
            "num_parallel_tree": "1",
-            "predictor": "gpu_predictor",
            "process_type": "default",
            "tree_method": "gpu_hist",
            "updater": "grow_gpu_hist",
--- a/include/xgboost/base.h
+++ b/include/xgboost/base.h
@@ -10,6 +10,7 @@
 #include <dmlc/omp.h>

 #include <cmath>
+#include <cstdint>
 #include <iostream>
 #include <string>
 #include <utility>
@@ -112,7 +113,7 @@ using bst_row_t = std::size_t;   // NOLINT
 /*! \brief Type for tree node index. */
 using bst_node_t = std::int32_t;      // NOLINT
 /*! \brief Type for ranking group index. */
-using bst_group_t = std::uint32_t;      // NOLINT
+using bst_group_t = std::uint32_t;  // NOLINT
 /**
 * \brief Type for indexing into output targets.
 */
@@ -125,6 +126,10 @@ using bst_layer_t = std::int32_t;  // NOLINT
 * \brief Type for indexing trees.
 */
 using bst_tree_t = std::int32_t;  // NOLINT
+/**
+ * @brief Ordinal of a CUDA device.
+ */
+using bst_d_ordinal_t = std::int16_t;  // NOLINT

 namespace detail {
 /*! \brief Implementation of gradient statistics pair. Template specialisation
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -1067,6 +1067,9 @@ XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle, DMatrixHandle dmat
 /**
 * \brief Inplace prediction from CPU dense matrix.
 *
+ * \note If the booster is configured to run on a CUDA device, XGBoost falls back to run
+ *       prediction with DMatrix with a performance warning.
+ *
 * \param handle        Booster handle.
 * \param values        JSON encoded __array_interface__ to values.
 * \param config        See \ref XGBoosterPredictFromDMatrix for more info.
@@ -1091,6 +1094,9 @@ XGB_DLL int XGBoosterPredictFromDense(BoosterHandle handle, char const *values,
 /**
 * \brief Inplace prediction from CPU CSR matrix.
 *
+ * \note If the booster is configured to run on a CUDA device, XGBoost falls back to run
+ *       prediction with DMatrix with a performance warning.
+ *
 * \param handle        Booster handle.
 * \param indptr        JSON encoded __array_interface__ to row pointer in CSR.
 * \param indices       JSON encoded __array_interface__ to column indices in CSR.
@@ -1116,6 +1122,9 @@ XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle, char const *indptr, ch
 /**
 * \brief Inplace prediction from CUDA Dense matrix (cupy in Python).
 *
+ * \note If the booster is configured to run on a CPU, XGBoost falls back to run
+ *       prediction with DMatrix with a performance warning.
+ *
 * \param handle        Booster handle
 * \param values        JSON encoded __cuda_array_interface__ to values.
 * \param config        See \ref XGBoosterPredictFromDMatrix for more info.
@@ -1137,6 +1146,9 @@ XGB_DLL int XGBoosterPredictFromCudaArray(BoosterHandle handle, char const *valu
 /**
 * \brief Inplace prediction from CUDA dense dataframe (cuDF in Python).
 *
+ * \note If the booster is configured to run on a CPU, XGBoost falls back to run
+ *       prediction with DMatrix with a performance warning.
+ *
 * \param handle        Booster handle
 * \param values        List of __cuda_array_interface__ for all columns encoded in JSON list.
 * \param config        See \ref XGBoosterPredictFromDMatrix for more info.
--- a/include/xgboost/context.h
+++ b/include/xgboost/context.h
@@ -1,20 +1,80 @@
-/*!
- * Copyright 2014-2022 by Contributors
+/**
+ * Copyright 2014-2023, XGBoost Contributors
 * \file context.h
 */
 #ifndef XGBOOST_CONTEXT_H_
 #define XGBOOST_CONTEXT_H_

-#include <xgboost/logging.h>
-#include <xgboost/parameter.h>
+#include <xgboost/base.h>       // for bst_d_ordinal_t
+#include <xgboost/logging.h>    // for CHECK_GE
+#include <xgboost/parameter.h>  // for XGBoostParameter

-#include <memory>  // std::shared_ptr
-#include <string>
+#include <cstdint>      // for int16_t, int32_t, int64_t
+#include <memory>       // for shared_ptr
+#include <string>       // for string, to_string
+#include <type_traits>  // for invoke_result_t, is_same_v

 namespace xgboost {

 struct CUDAContext;

+/**
+ * @brief A type for device ordinal. The type is packed into 32-bit for efficient use in
+ *        viewing types like `linalg::TensorView`.
+ */
+struct DeviceOrd {
+  enum Type : std::int16_t { kCPU = 0, kCUDA = 1 } device{kCPU};
+  // CUDA device ordinal.
+  bst_d_ordinal_t ordinal{-1};
+
+  [[nodiscard]] bool IsCUDA() const { return device == kCUDA; }
+  [[nodiscard]] bool IsCPU() const { return device == kCPU; }
+
+  DeviceOrd() = default;
+  constexpr DeviceOrd(Type type, bst_d_ordinal_t ord) : device{type}, ordinal{ord} {}
+
+  DeviceOrd(DeviceOrd const& that) = default;
+  DeviceOrd& operator=(DeviceOrd const& that) = default;
+  DeviceOrd(DeviceOrd&& that) = default;
+  DeviceOrd& operator=(DeviceOrd&& that) = default;
+
+  /**
+   * @brief Constructor for CPU.
+   */
+  [[nodiscard]] constexpr static auto CPU() { return DeviceOrd{kCPU, -1}; }
+  /**
+   * @brief Constructor for CUDA device.
+   *
+   * @param ordinal CUDA device ordinal.
+   */
+  [[nodiscard]] static auto CUDA(bst_d_ordinal_t ordinal) { return DeviceOrd{kCUDA, ordinal}; }
+
+  [[nodiscard]] bool operator==(DeviceOrd const& that) const {
+    return device == that.device && ordinal == that.ordinal;
+  }
+  [[nodiscard]] bool operator!=(DeviceOrd const& that) const { return !(*this == that); }
+  /**
+   * @brief Get a string representation of the device and the ordinal.
+   */
+  [[nodiscard]] std::string Name() const {
+    switch (device) {
+      case DeviceOrd::kCPU:
+        return "CPU";
+      case DeviceOrd::kCUDA:
+        return "CUDA:" + std::to_string(ordinal);
+      default: {
+        LOG(FATAL) << "Unknown device.";
+        return "";
+      }
+    }
+  }
+};
+
+static_assert(sizeof(DeviceOrd) == sizeof(std::int32_t));
+
+/**
+ * @brief Runtime context for XGBoost. Contains information like threads and device.
+ */
 struct Context : public XGBoostParameter<Context> {
 public:
  // Constant representing the device ID of CPU.
@@ -36,33 +96,82 @@ struct Context : public XGBoostParameter<Context> {
  // fail when gpu_id is invalid
  bool fail_on_invalid_gpu_id{false};
  bool validate_parameters{false};
-
-  /*!
-   * \brief Configure the parameter `gpu_id'.
+  /**
+   * @brief Configure the parameter `gpu_id'.
   *
-   * \param require_gpu  Whether GPU is explicitly required from user.
+   * @param require_gpu Whether GPU is explicitly required by the user through other
+   *                    configurations.
   */
  void ConfigureGpuId(bool require_gpu);
-  /*!
-   * Return automatically chosen threads.
+  /**
+   * @brief Returns the automatically chosen number of threads based on the `nthread`
+   *        parameter and the system settting.
   */
-  std::int32_t Threads() const;
-
-  bool IsCPU() const { return gpu_id == kCpuId; }
-  bool IsCUDA() const { return !IsCPU(); }
-
-  CUDAContext const* CUDACtx() const;
-  // Make a CUDA context based on the current context.
-  Context MakeCUDA(std::int32_t device = 0) const {
+  [[nodiscard]] std::int32_t Threads() const;
+  /**
+   * @brief Is XGBoost running on CPU?
+   */
+  [[nodiscard]] bool IsCPU() const { return gpu_id == kCpuId; }
+  /**
+   * @brief Is XGBoost running on a CUDA device?
+   */
+  [[nodiscard]] bool IsCUDA() const { return !IsCPU(); }
+  /**
+   * @brief Get the current device and ordinal.
+   */
+  [[nodiscard]] DeviceOrd Device() const {
+    return IsCPU() ? DeviceOrd::CPU() : DeviceOrd::CUDA(static_cast<bst_d_ordinal_t>(gpu_id));
+  }
+  /**
+   * @brief Get the CUDA device ordinal. -1 if XGBoost is running on CPU.
+   */
+  [[nodiscard]] bst_d_ordinal_t Ordinal() const { return this->gpu_id; }
+  /**
+   * @brief Name of the current device.
+   */
+  [[nodiscard]] std::string DeviceName() const { return Device().Name(); }
+  /**
+   * @brief Get a CUDA device context for allocator and stream.
+   */
+  [[nodiscard]] CUDAContext const* CUDACtx() const;
+  /**
+   * @brief Make a CUDA context based on the current context.
+   *
+   * @param ordinal The CUDA device ordinal.
+   */
+  [[nodiscard]] Context MakeCUDA(std::int32_t ordinal = 0) const {
    Context ctx = *this;
-    ctx.gpu_id = device;
+    CHECK_GE(ordinal, 0);
+    ctx.gpu_id = ordinal;
    return ctx;
  }
-  Context MakeCPU() const {
+  /**
+   * @brief Make a CPU context based on the current context.
+   */
+  [[nodiscard]] Context MakeCPU() const {
    Context ctx = *this;
    ctx.gpu_id = kCpuId;
    return ctx;
  }
+  /**
+   * @brief Call function based on the current device.
+   */
+  template <typename CPUFn, typename CUDAFn>
+  decltype(auto) DispatchDevice(CPUFn&& cpu_fn, CUDAFn&& cuda_fn) const {
+    static_assert(std::is_same_v<std::invoke_result_t<CPUFn>, std::invoke_result_t<CUDAFn>>);
+    switch (this->Device().device) {
+      case DeviceOrd::kCPU:
+        return cpu_fn();
+      case DeviceOrd::kCUDA:
+        return cuda_fn();
+      default:
+        // Do not use the device name as this is likely an internal error, the name
+        // wouldn't be valid.
+        LOG(FATAL) << "Unknown device type:" << static_cast<std::int16_t>(this->Device().device);
+        break;
+    }
+    return std::invoke_result_t<CPUFn>();
+  }

  // declare parameters
  DMLC_DECLARE_PARAMETER(Context) {
@@ -87,9 +196,9 @@ struct Context : public XGBoostParameter<Context> {
  }

 private:
-  // mutable for lazy initialization for cuda context to avoid initializing CUDA at load.
-  // shared_ptr is used instead of unique_ptr as with unique_ptr it's difficult to define p_impl
-  // while trying to hide CUDA code from host compiler.
+  // mutable for lazy cuda context initialization. This avoids initializing CUDA at load.
+  // shared_ptr is used instead of unique_ptr as with unique_ptr it's difficult to define
+  // p_impl while trying to hide CUDA code from the host compiler.
  mutable std::shared_ptr<CUDAContext> cuctx_;
  // cached value for CFS CPU limit. (used in containerized env)
  std::int32_t cfs_cpu_count_;  // NOLINT
--- a/include/xgboost/gbm.h
+++ b/include/xgboost/gbm.h
@@ -149,18 +149,14 @@ class GradientBooster : public Model, public Configurable {
   * \param layer_begin Beginning of boosted tree layer used for prediction.
   * \param layer_end   End of booster layer. 0 means do not limit trees.
   * \param approximate use a faster (inconsistent) approximation of SHAP values
-   * \param condition condition on the condition_feature (0=no, -1=cond off, 1=cond on).
-   * \param condition_feature feature to condition on (i.e. fix) during calculations
   */
-  virtual void PredictContribution(DMatrix* dmat,
-                                   HostDeviceVector<bst_float>* out_contribs,
-                                   unsigned layer_begin, unsigned layer_end,
-                                   bool approximate = false, int condition = 0,
-                                   unsigned condition_feature = 0) = 0;
+  virtual void PredictContribution(DMatrix* dmat, HostDeviceVector<float>* out_contribs,
+                                   bst_layer_t layer_begin, bst_layer_t layer_end,
+                                   bool approximate = false) = 0;

-  virtual void PredictInteractionContributions(
-      DMatrix *dmat, HostDeviceVector<bst_float> *out_contribs,
-      unsigned layer_begin, unsigned layer_end, bool approximate) = 0;
+  virtual void PredictInteractionContributions(DMatrix* dmat, HostDeviceVector<float>* out_contribs,
+                                               bst_layer_t layer_begin, bst_layer_t layer_end,
+                                               bool approximate) = 0;

  /*!
   * \brief dump the model in the requested format
--- a/include/xgboost/predictor.h
+++ b/include/xgboost/predictor.h
@@ -6,24 +6,22 @@
 */
 #pragma once
 #include <xgboost/base.h>
-#include <xgboost/cache.h>  // DMatrixCache
+#include <xgboost/cache.h>    // for DMatrixCache
+#include <xgboost/context.h>  // for Context
 #include <xgboost/context.h>
 #include <xgboost/data.h>
 #include <xgboost/host_device_vector.h>

-#include <functional>  // std::function
-#include <memory>
+#include <functional>  // for function
+#include <memory>      // for shared_ptr
 #include <string>
-#include <thread>   // for get_id
 #include <utility>  // for make_pair
 #include <vector>

 // Forward declarations
-namespace xgboost {
-namespace gbm {
+namespace xgboost::gbm {
 struct GBTreeModel;
-}  // namespace gbm
-}  // namespace xgboost
+}  // namespace xgboost::gbm

 namespace xgboost {
 /**
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -91,6 +91,9 @@
                    <value>ON</value>
                </property>
            </activation>
+            <properties>
+               <use.cuda>ON</use.cuda>
+            </properties>
            <modules>
                <module>xgboost4j-gpu</module>
                <module>xgboost4j-spark-gpu</module>
--- a/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java
+++ b/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java
@@ -78,7 +78,6 @@ public class BoosterTest {
        put("num_round", round);
        put("num_workers", 1);
        put("tree_method", "gpu_hist");
-        put("predictor", "gpu_predictor");
        put("max_bin", maxBin);
      }
    };
--- a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala
+++ b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala
@@ -281,7 +281,6 @@ object GpuPreXGBoost extends PreXGBoostProvider {
            // - predictor: Force to gpu predictor since native doesn't save predictor.
            val gpuId = if (!isLocal) XGBoost.getGPUAddrFromResources else 0
            booster.setParam("gpu_id", gpuId.toString)
-            booster.setParam("predictor", "gpu_predictor")
            logger.info("GPU transform on device: " + gpuId)
            boosterFlag.isGpuParamsSet = true;
          }
--- a/python-package/packager/nativelib.py
+++ b/python-package/packager/nativelib.py
@@ -132,8 +132,8 @@ def locate_or_build_libxgboost(

    if build_config.use_system_libxgboost:
        # Find libxgboost from system prefix
-        sys_prefix = pathlib.Path(sys.prefix).absolute().resolve()
-        libxgboost_sys = sys_prefix / "lib" / _lib_name()
+        sys_base_prefix = pathlib.Path(sys.base_prefix).absolute().resolve()
+        libxgboost_sys = sys_base_prefix / "lib" / _lib_name()
        if not libxgboost_sys.exists():
            raise RuntimeError(
                f"use_system_libxgboost was specified but {_lib_name()} is "
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -2187,20 +2187,25 @@ class Booster:
        base_margin: Any = None,
        strict_shape: bool = False,
    ) -> NumpyOrCupy:
-        """Run prediction in-place, Unlike :py:meth:`predict` method, inplace prediction
-        does not cache the prediction result.
+        """Run prediction in-place when possible, Unlike :py:meth:`predict` method,
+        inplace prediction does not cache the prediction result.

        Calling only ``inplace_predict`` in multiple threads is safe and lock
        free.  But the safety does not hold when used in conjunction with other
        methods. E.g. you can't train the booster in one thread and perform
        prediction in the other.

+        .. note::
+
+            If the device ordinal of the input data doesn't match the one configured for
+            the booster, data will be copied to the booster device.
+
        .. code-block:: python

-            booster.set_param({"predictor": "gpu_predictor"})
+            booster.set_param({"gpu_id": "0", "tree_method": "gpu_hist"})
            booster.inplace_predict(cupy_array)

-            booster.set_param({"predictor": "cpu_predictor"})
+            booster.set_param({"gpu_id": "-1", "tree_method": "hist"})
            booster.inplace_predict(numpy_array)

        .. versionadded:: 1.1.0
@@ -2208,9 +2213,7 @@ class Booster:
        Parameters
        ----------
        data :
-            The input data, must not be a view for numpy array.  Set
-            ``predictor`` to ``gpu_predictor`` for running prediction on CuPy
-            array or CuDF DataFrame.
+            The input data.
        iteration_range :
            See :py:meth:`predict` for details.
        predict_type :
--- a/python-package/xgboost/libpath.py
+++ b/python-package/xgboost/libpath.py
@@ -27,7 +27,7 @@ def find_lib_path() -> List[str]:
        os.path.join(curr_path, os.path.pardir, os.path.pardir, "lib"),
        # use libxgboost from a system prefix, if available.  This should be the last
        # option.
-        os.path.join(sys.prefix, "lib"),
+        os.path.join(sys.base_prefix, "lib"),
    ]

    if sys.platform == "win32":
@@ -62,8 +62,8 @@ def find_lib_path() -> List[str]:
            + ("\n- ".join(dll_path))
            + "\nXGBoost Python package path: "
            + curr_path
-            + "\nsys.prefix: "
-            + sys.prefix
+            + "\nsys.base_prefix: "
+            + sys.base_prefix
            + "\nSee: "
            + link
            + " for installing XGBoost."
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -277,9 +277,6 @@ __model_doc = f"""
        Device ordinal.
    validate_parameters : Optional[bool]
        Give warnings for unknown parameter.
-    predictor : Optional[str]
-        Force XGBoost to use specific predictor, available choices are [cpu_predictor,
-        gpu_predictor].
    enable_categorical : bool

        .. versionadded:: 1.5.0
@@ -652,7 +649,6 @@ class XGBModel(XGBModelBase):
        importance_type: Optional[str] = None,
        gpu_id: Optional[int] = None,
        validate_parameters: Optional[bool] = None,
-        predictor: Optional[str] = None,
        enable_categorical: bool = False,
        feature_types: Optional[FeatureTypes] = None,
        max_cat_to_onehot: Optional[int] = None,
@@ -699,7 +695,6 @@ class XGBModel(XGBModelBase):
        self.importance_type = importance_type
        self.gpu_id = gpu_id
        self.validate_parameters = validate_parameters
-        self.predictor = predictor
        self.enable_categorical = enable_categorical
        self.feature_types = feature_types
        self.max_cat_to_onehot = max_cat_to_onehot
@@ -935,8 +930,7 @@ class XGBModel(XGBModelBase):
        callbacks = self.callbacks if self.callbacks is not None else callbacks

        tree_method = params.get("tree_method", None)
-        cat_support = {"gpu_hist", "approx", "hist"}
-        if self.enable_categorical and tree_method not in cat_support:
+        if self.enable_categorical and tree_method == "exact":
            raise ValueError(
                "Experimental support for categorical data is not implemented for"
                " current tree method yet."
@@ -1093,12 +1087,7 @@ class XGBModel(XGBModelBase):
            return self

    def _can_use_inplace_predict(self) -> bool:
-        # When predictor is explicitly set, using `inplace_predict` might result into
-        # error with incompatible data type.
-        # Inplace predict doesn't handle as many data types as DMatrix, but it's
-        # sufficient for dask interface where input is simpiler.
-        predictor = self.get_xgb_params().get("predictor", None)
-        if predictor in ("auto", None) and self.booster != "gblinear":
+        if self.booster != "gblinear":
            return True
        return False

@@ -1124,9 +1113,9 @@ class XGBModel(XGBModelBase):
        iteration_range: Optional[Tuple[int, int]] = None,
    ) -> ArrayLike:
        """Predict with `X`.  If the model is trained with early stopping, then
-        :py:attr:`best_iteration` is used automatically.  For tree models, when data is
-        on GPU, like cupy array or cuDF dataframe and `predictor` is not specified, the
-        prediction is run on GPU automatically, otherwise it will run on CPU.
+        :py:attr:`best_iteration` is used automatically. The estimator uses
+        `inplace_predict` by default and falls back to using :py:class:`DMatrix` if
+        devices between the data and the estimator don't match.

        .. note:: This function is only thread safe for `gbtree` and `dart`.

@@ -1588,7 +1577,9 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
    ) -> np.ndarray:
        """Predict the probability of each `X` example being of a given class. If the
        model is trained with early stopping, then :py:attr:`best_iteration` is used
-        automatically.
+        automatically. The estimator uses `inplace_predict` by default and falls back to
+        using :py:class:`DMatrix` if devices between the data and the estimator don't
+        match.

        .. note:: This function is only thread safe for `gbtree` and `dart`.

--- a/python-package/xgboost/testing/init.py
+++ b/python-package/xgboost/testing/init.py
@@ -25,6 +25,7 @@ from typing import (
    Set,
    Tuple,
    TypedDict,
+    TypeVar,
    Union,
 )

@@ -711,6 +712,27 @@ def predictor_equal(lhs: xgb.DMatrix, rhs: xgb.DMatrix) -> bool:
    )


+M = TypeVar("M", xgb.Booster, xgb.XGBModel)
+
+
+def set_ordinal(ordinal: int, booster: M) -> M:
+    """Temporary solution for setting the device ordinal until we move away from
+    `gpu_id`.
+
+    """
+    if ordinal < 0:
+        params = {"gpu_id": -1, "tree_method": "hist"}
+    else:
+        params = {"gpu_id": ordinal, "tree_method": "gpu_hist"}
+
+    if isinstance(booster, xgb.Booster):
+        booster.set_param(params)
+    elif isinstance(booster, xgb.XGBModel):
+        booster.set_params(**params)
+
+    return booster
+
+
 def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, np.float64]:
    """Evaluation metric for xgb.train"""
    label = dtrain.get_label()
--- a/rabit/include/rabit/internal/io.h
+++ b/rabit/include/rabit/internal/io.h
@@ -19,8 +19,7 @@
 #include "rabit/internal/utils.h"
 #include "rabit/serializable.h"

-namespace rabit {
-namespace utils {
+namespace rabit::utils {
 /*! \brief re-use definition of dmlc::SeekStream */
 using SeekStream = dmlc::SeekStream;
 /**
@@ -31,9 +30,6 @@ struct MemoryFixSizeBuffer : public SeekStream {
  // similar to SEEK_END in libc
  static std::size_t constexpr kSeekEnd = std::numeric_limits<std::size_t>::max();

- protected:
-  MemoryFixSizeBuffer() = default;
-
 public:
  /**
   * @brief Ctor
@@ -68,7 +64,7 @@ struct MemoryFixSizeBuffer : public SeekStream {
   * @brief Current position in the buffer (stream).
   */
  std::size_t Tell() override { return curr_ptr_; }
-  virtual bool AtEnd() const { return curr_ptr_ == buffer_size_; }
+  [[nodiscard]] virtual bool AtEnd() const { return curr_ptr_ == buffer_size_; }

 protected:
  /*! \brief in memory buffer */
@@ -119,6 +115,5 @@ struct MemoryBufferStream : public SeekStream {
  /*! \brief current pointer */
  size_t curr_ptr_;
 };  // class MemoryBufferStream
-}  // namespace utils
-}  // namespace rabit
+}  // namespace rabit::utils
 #endif  // RABIT_INTERNAL_IO_H_
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -1023,7 +1023,6 @@ void InplacePredictImpl(std::shared_ptr<DMatrix> p_m, char const *c_json_config,
                        const float **out_result) {
  xgboost_CHECK_C_ARG_PTR(c_json_config);
  auto config = Json::Load(StringView{c_json_config});
-  CHECK_EQ(get<Integer const>(config["cache_id"]), 0) << "Cache ID is not supported yet";

  HostDeviceVector<float> *p_predt{nullptr};
  auto type = PredictionType(RequiredArg<Integer>(config, "type", __func__));
@@ -1042,6 +1041,7 @@ void InplacePredictImpl(std::shared_ptr<DMatrix> p_m, char const *c_json_config,
  xgboost_CHECK_C_ARG_PTR(out_dim);
  CalcPredictShape(strict_shape, type, n_samples, n_features, chunksize, learner->Groups(),
                   learner->BoostedRounds(), &shape, out_dim);
+  CHECK_GE(p_predt->Size(), n_samples);

  xgboost_CHECK_C_ARG_PTR(out_result);
  xgboost_CHECK_C_ARG_PTR(out_shape);
--- a/src/c_api/c_api.cu
+++ b/src/c_api/c_api.cu
@@ -92,7 +92,7 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data,
  API_END();
 }

-int InplacePreidctCuda(BoosterHandle handle, char const *c_array_interface,
+int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
                       char const *c_json_config, std::shared_ptr<DMatrix> p_m,
                       xgboost::bst_ulong const **out_shape, xgboost::bst_ulong *out_dim,
                       const float **out_result) {
@@ -107,7 +107,6 @@ int InplacePreidctCuda(BoosterHandle handle, char const *c_array_interface,
  proxy->SetCUDAArray(c_array_interface);

  auto config = Json::Load(StringView{c_json_config});
-  CHECK_EQ(get<Integer const>(config["cache_id"]), 0) << "Cache ID is not supported yet";
  auto *learner = static_cast<Learner *>(handle);

  HostDeviceVector<float> *p_predt{nullptr};
@@ -118,7 +117,13 @@ int InplacePreidctCuda(BoosterHandle handle, char const *c_array_interface,
                          RequiredArg<Integer>(config, "iteration_begin", __func__),
                          RequiredArg<Integer>(config, "iteration_end", __func__));
  CHECK(p_predt);
-  CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
+  if (learner->Ctx()->IsCPU()) {
+    // Prediction using DMatrix as fallback.
+    CHECK(p_predt->HostCanRead() && !p_predt->DeviceCanRead());
+  } else {
+    CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
+  }
+  p_predt->SetDevice(proxy->DeviceIdx());

  auto &shape = learner->GetThreadLocal().prediction_shape;
  size_t n_samples = p_m->Info().num_row_;
@@ -146,7 +151,7 @@ XGB_DLL int XGBoosterPredictFromCudaColumnar(BoosterHandle handle, char const *c
  if (m) {
    p_m = *static_cast<std::shared_ptr<DMatrix> *>(m);
  }
-  return InplacePreidctCuda(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
+  return InplacePreidctCUDA(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
                            out_result);
 }

@@ -159,6 +164,6 @@ XGB_DLL int XGBoosterPredictFromCudaArray(BoosterHandle handle, char const *c_js
    p_m = *static_cast<std::shared_ptr<DMatrix> *>(m);
  }
  xgboost_CHECK_C_ARG_PTR(out_result);
-  return InplacePreidctCuda(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
+  return InplacePreidctCUDA(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
                            out_result);
 }
--- a/src/collective/communicator.cu
+++ b/src/collective/communicator.cu
@@ -29,13 +29,21 @@ DeviceCommunicator* Communicator::GetDevice(int device_ordinal) {
    old_device_ordinal = device_ordinal;
    old_world_size = communicator_->GetWorldSize();
 #ifdef XGBOOST_USE_NCCL
-    if (type_ != CommunicatorType::kFederated) {
-      device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, Get()));
-    } else {
-      device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal, Get()));
+    switch (type_) {
+      case CommunicatorType::kRabit:
+        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
+        break;
+      case CommunicatorType::kFederated:
+        device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal));
+        break;
+      case CommunicatorType::kInMemory:
+        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, true));
+        break;
+      default:
+        device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
    }
 #else
-    device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal, Get()));
+    device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal));
 #endif
  }
  return device_communicator_.get();
--- a/src/collective/device_communicator_adapter.cuh
+++ b/src/collective/device_communicator_adapter.cuh
@@ -11,21 +11,18 @@ namespace collective {

 class DeviceCommunicatorAdapter : public DeviceCommunicator {
 public:
-  DeviceCommunicatorAdapter(int device_ordinal, Communicator *communicator)
-      : device_ordinal_{device_ordinal}, communicator_{communicator} {
+  explicit DeviceCommunicatorAdapter(int device_ordinal)
+      : device_ordinal_{device_ordinal}, world_size_{GetWorldSize()}, rank_{GetRank()} {
    if (device_ordinal_ < 0) {
      LOG(FATAL) << "Invalid device ordinal: " << device_ordinal_;
    }
-    if (communicator_ == nullptr) {
-      LOG(FATAL) << "Communicator cannot be null.";
-    }
  }

  ~DeviceCommunicatorAdapter() override = default;

  void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
                 Operation op) override {
-    if (communicator_->GetWorldSize() == 1) {
+    if (world_size_ == 1) {
      return;
    }

@@ -33,37 +30,34 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {
    auto size = count * GetTypeSize(data_type);
    host_buffer_.reserve(size);
    dh::safe_cuda(cudaMemcpy(host_buffer_.data(), send_receive_buffer, size, cudaMemcpyDefault));
-    communicator_->AllReduce(host_buffer_.data(), count, data_type, op);
+    Allreduce(host_buffer_.data(), count, data_type, op);
    dh::safe_cuda(cudaMemcpy(send_receive_buffer, host_buffer_.data(), size, cudaMemcpyDefault));
  }

  void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector<std::size_t> *segments,
                  dh::caching_device_vector<char> *receive_buffer) override {
-    if (communicator_->GetWorldSize() == 1) {
+    if (world_size_ == 1) {
      return;
    }

    dh::safe_cuda(cudaSetDevice(device_ordinal_));
-    int const world_size = communicator_->GetWorldSize();
-    int const rank = communicator_->GetRank();

    segments->clear();
-    segments->resize(world_size, 0);
-    segments->at(rank) = length_bytes;
-    communicator_->AllReduce(segments->data(), segments->size(), DataType::kUInt64,
-                             Operation::kMax);
+    segments->resize(world_size_, 0);
+    segments->at(rank_) = length_bytes;
+    Allreduce(segments->data(), segments->size(), DataType::kUInt64, Operation::kMax);
    auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL);
    receive_buffer->resize(total_bytes);

    host_buffer_.reserve(total_bytes);
    size_t offset = 0;
-    for (int32_t i = 0; i < world_size; ++i) {
+    for (int32_t i = 0; i < world_size_; ++i) {
      size_t as_bytes = segments->at(i);
-      if (i == rank) {
-        dh::safe_cuda(cudaMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank),
+      if (i == rank_) {
+        dh::safe_cuda(cudaMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank_),
                                 cudaMemcpyDefault));
      }
-      communicator_->Broadcast(host_buffer_.data() + offset, as_bytes, i);
+      Broadcast(host_buffer_.data() + offset, as_bytes, i);
      offset += as_bytes;
    }
    dh::safe_cuda(cudaMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes,
@@ -76,7 +70,8 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator {

 private:
  int const device_ordinal_;
-  Communicator *communicator_;
+  int const world_size_;
+  int const rank_;
  /// Host buffer used to call communicator functions.
  std::vector<char> host_buffer_{};
 };
--- a/src/collective/nccl_device_communicator.cu
+++ b/src/collective/nccl_device_communicator.cu
@@ -7,31 +7,27 @@
 namespace xgboost {
 namespace collective {

-NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, Communicator *communicator)
-    : device_ordinal_{device_ordinal}, communicator_{communicator} {
+NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sync)
+    : device_ordinal_{device_ordinal},
+      needs_sync_{needs_sync},
+      world_size_{GetWorldSize()},
+      rank_{GetRank()} {
  if (device_ordinal_ < 0) {
    LOG(FATAL) << "Invalid device ordinal: " << device_ordinal_;
  }
-  if (communicator_ == nullptr) {
-    LOG(FATAL) << "Communicator cannot be null.";
-  }
-
-  int32_t const rank = communicator_->GetRank();
-  int32_t const world = communicator_->GetWorldSize();
-
-  if (world == 1) {
+  if (world_size_ == 1) {
    return;
  }

-  std::vector<uint64_t> uuids(world * kUuidLength, 0);
+  std::vector<uint64_t> uuids(world_size_ * kUuidLength, 0);
  auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
-  auto s_this_uuid = s_uuid.subspan(rank * kUuidLength, kUuidLength);
+  auto s_this_uuid = s_uuid.subspan(rank_ * kUuidLength, kUuidLength);
  GetCudaUUID(s_this_uuid);

  // TODO(rongou): replace this with allgather.
-  communicator_->AllReduce(uuids.data(), uuids.size(), DataType::kUInt64, Operation::kSum);
+  Allreduce(uuids.data(), uuids.size(), DataType::kUInt64, Operation::kSum);

-  std::vector<xgboost::common::Span<uint64_t, kUuidLength>> converted(world);
+  std::vector<xgboost::common::Span<uint64_t, kUuidLength>> converted(world_size_);
  size_t j = 0;
  for (size_t i = 0; i < uuids.size(); i += kUuidLength) {
    converted[j] = xgboost::common::Span<uint64_t, kUuidLength>{uuids.data() + i, kUuidLength};
@@ -41,18 +37,18 @@ NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, Communicator
  auto iter = std::unique(converted.begin(), converted.end());
  auto n_uniques = std::distance(converted.begin(), iter);

-  CHECK_EQ(n_uniques, world)
+  CHECK_EQ(n_uniques, world_size_)
      << "Multiple processes within communication group running on same CUDA "
      << "device is not supported. " << PrintUUID(s_this_uuid) << "\n";

  nccl_unique_id_ = GetUniqueId();
  dh::safe_cuda(cudaSetDevice(device_ordinal_));
-  dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world, nccl_unique_id_, rank));
+  dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_));
  dh::safe_cuda(cudaStreamCreate(&cuda_stream_));
 }

 NcclDeviceCommunicator::~NcclDeviceCommunicator() {
-  if (communicator_->GetWorldSize() == 1) {
+  if (world_size_ == 1) {
    return;
  }
  if (cuda_stream_) {
@@ -129,38 +125,41 @@ template <typename Func>
 void RunBitwiseAllreduce(char *out_buffer, char const *device_buffer, Func func, int world_size,
                         std::size_t size, cudaStream_t stream) {
  dh::LaunchN(size, stream, [=] __device__(std::size_t idx) {
-    out_buffer[idx] = device_buffer[idx];
+    auto result = device_buffer[idx];
    for (auto rank = 1; rank < world_size; rank++) {
-      out_buffer[idx] = func(out_buffer[idx], device_buffer[rank * size + idx]);
+      result = func(result, device_buffer[rank * size + idx]);
    }
+    out_buffer[idx] = result;
  });
 }
 }  // anonymous namespace

 void NcclDeviceCommunicator::BitwiseAllReduce(void *send_receive_buffer, std::size_t count,
                                              DataType data_type, Operation op) {
-  auto const world_size = communicator_->GetWorldSize();
  auto const size = count * GetTypeSize(data_type);
-  dh::caching_device_vector<char> buffer(size * world_size);
+  dh::caching_device_vector<char> buffer(size * world_size_);
  auto *device_buffer = buffer.data().get();

  // First gather data from all the workers.
  dh::safe_nccl(ncclAllGather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type),
                              nccl_comm_, cuda_stream_));
+  if (needs_sync_) {
+    dh::safe_cuda(cudaStreamSynchronize(cuda_stream_));
+  }

  // Then reduce locally.
  auto *out_buffer = static_cast<char *>(send_receive_buffer);
  switch (op) {
    case Operation::kBitwiseAND:
-      RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_and<char>(), world_size, size,
+      RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_and<char>(), world_size_, size,
                          cuda_stream_);
      break;
    case Operation::kBitwiseOR:
-      RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_or<char>(), world_size, size,
+      RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_or<char>(), world_size_, size,
                          cuda_stream_);
      break;
    case Operation::kBitwiseXOR:
-      RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_xor<char>(), world_size, size,
+      RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_xor<char>(), world_size_, size,
                          cuda_stream_);
      break;
    default:
@@ -170,7 +169,7 @@ void NcclDeviceCommunicator::BitwiseAllReduce(void *send_receive_buffer, std::si

 void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t count,
                                       DataType data_type, Operation op) {
-  if (communicator_->GetWorldSize() == 1) {
+  if (world_size_ == 1) {
    return;
  }

@@ -189,24 +188,22 @@ void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t co
 void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_bytes,
                                        std::vector<std::size_t> *segments,
                                        dh::caching_device_vector<char> *receive_buffer) {
-  if (communicator_->GetWorldSize() == 1) {
+  if (world_size_ == 1) {
    return;
  }

  dh::safe_cuda(cudaSetDevice(device_ordinal_));
-  int const world_size = communicator_->GetWorldSize();
-  int const rank = communicator_->GetRank();

  segments->clear();
-  segments->resize(world_size, 0);
-  segments->at(rank) = length_bytes;
-  communicator_->AllReduce(segments->data(), segments->size(), DataType::kUInt64, Operation::kMax);
+  segments->resize(world_size_, 0);
+  segments->at(rank_) = length_bytes;
+  Allreduce(segments->data(), segments->size(), DataType::kUInt64, Operation::kMax);
  auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL);
  receive_buffer->resize(total_bytes);

  size_t offset = 0;
  dh::safe_nccl(ncclGroupStart());
-  for (int32_t i = 0; i < world_size; ++i) {
+  for (int32_t i = 0; i < world_size_; ++i) {
    size_t as_bytes = segments->at(i);
    dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
                                ncclChar, i, nccl_comm_, cuda_stream_));
@@ -216,7 +213,7 @@ void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_b
 }

 void NcclDeviceCommunicator::Synchronize() {
-  if (communicator_->GetWorldSize() == 1) {
+  if (world_size_ == 1) {
    return;
  }
  dh::safe_cuda(cudaSetDevice(device_ordinal_));
--- a/src/collective/nccl_device_communicator.cuh
+++ b/src/collective/nccl_device_communicator.cuh
@@ -12,7 +12,20 @@ namespace collective {

 class NcclDeviceCommunicator : public DeviceCommunicator {
 public:
-  NcclDeviceCommunicator(int device_ordinal, Communicator *communicator);
+  /**
+   * @brief Construct a new NCCL communicator.
+   * @param device_ordinal The GPU device id.
+   * @param needs_sync Whether extra CUDA stream synchronization is needed.
+   *
+   * In multi-GPU tests when multiple NCCL communicators are created in the same process, sometimes
+   * a deadlock happens because NCCL kernels are blocking. The extra CUDA stream synchronization
+   * makes sure that the NCCL kernels are caught up, thus avoiding the deadlock.
+   *
+   * The Rabit communicator runs with one process per GPU, so the additional synchronization is not
+   * needed. The in-memory communicator is used in tests with multiple threads, each thread
+   * representing a rank/worker, so the additional synchronization is needed to avoid deadlocks.
+   */
+  explicit NcclDeviceCommunicator(int device_ordinal, bool needs_sync);
  ~NcclDeviceCommunicator() override;
  void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
                 Operation op) override;
@@ -49,11 +62,10 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
  ncclUniqueId GetUniqueId() {
    static const int kRootRank = 0;
    ncclUniqueId id;
-    if (communicator_->GetRank() == kRootRank) {
+    if (rank_ == kRootRank) {
      dh::safe_nccl(ncclGetUniqueId(&id));
    }
-    communicator_->Broadcast(static_cast<void *>(&id), sizeof(ncclUniqueId),
-                             static_cast<int>(kRootRank));
+    Broadcast(static_cast<void *>(&id), sizeof(ncclUniqueId), static_cast<int>(kRootRank));
    return id;
  }

@@ -61,7 +73,9 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
                        Operation op);

  int const device_ordinal_;
-  Communicator *communicator_;
+  bool const needs_sync_;
+  int const world_size_;
+  int const rank_;
  ncclComm_t nccl_comm_{};
  cudaStream_t cuda_stream_{};
  ncclUniqueId nccl_unique_id_{};
--- a/src/common/column_matrix.cc
+++ b/src/common/column_matrix.cc
@@ -1,16 +1,27 @@
-/*!
- * Copyright 2017-2022 by XGBoost Contributors
+/**
+ * Copyright 2017-2023, XGBoost Contributors
 * \brief Utility for fast column-wise access
 */
 #include "column_matrix.h"

-namespace xgboost {
-namespace common {
+#include <algorithm>    // for transform
+#include <cstddef>      // for size_t
+#include <cstdint>      // for uint64_t, uint8_t
+#include <limits>       // for numeric_limits
+#include <type_traits>  // for remove_reference_t
+#include <vector>       // for vector
+
+#include "../data/gradient_index.h"  // for GHistIndexMatrix
+#include "io.h"                      // for AlignedResourceReadStream, AlignedFileWriteStream
+#include "xgboost/base.h"            // for bst_feaature_t
+#include "xgboost/span.h"            // for Span
+
+namespace xgboost::common {
 void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_threshold) {
  auto const nfeature = gmat.Features();
  const size_t nrow = gmat.Size();
  // identify type of each column
-  type_.resize(nfeature);
+  type_ = common::MakeFixedVecWithMalloc(nfeature, ColumnType{});

  uint32_t max_val = std::numeric_limits<uint32_t>::max();
  for (bst_feature_t fid = 0; fid < nfeature; ++fid) {
@@ -34,7 +45,7 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres

  // want to compute storage boundary for each feature
  // using variants of prefix sum scan
-  feature_offsets_.resize(nfeature + 1);
+  feature_offsets_ = common::MakeFixedVecWithMalloc(nfeature + 1, std::size_t{0});
  size_t accum_index = 0;
  feature_offsets_[0] = accum_index;
  for (bst_feature_t fid = 1; fid < nfeature + 1; ++fid) {
@@ -49,9 +60,11 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
  SetTypeSize(gmat.MaxNumBinPerFeat());
  auto storage_size =
      feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
-  index_.resize(storage_size, 0);
+
+  index_ = common::MakeFixedVecWithMalloc(storage_size, std::uint8_t{0});
+
  if (!all_dense_column) {
-    row_ind_.resize(feature_offsets_[nfeature]);
+    row_ind_ = common::MakeFixedVecWithMalloc(feature_offsets_[nfeature], std::size_t{0});
  }

  // store least bin id for each feature
@@ -59,7 +72,51 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres

  any_missing_ = !gmat.IsDense();

-  missing_flags_.clear();
+  missing_ = MissingIndicator{0, false};
 }
-}  // namespace common
-}  // namespace xgboost
+
+// IO procedures for external memory.
+bool ColumnMatrix::Read(AlignedResourceReadStream* fi, uint32_t const* index_base) {
+  if (!common::ReadVec(fi, &index_)) {
+    return false;
+  }
+  if (!common::ReadVec(fi, &type_)) {
+    return false;
+  }
+  if (!common::ReadVec(fi, &row_ind_)) {
+    return false;
+  }
+  if (!common::ReadVec(fi, &feature_offsets_)) {
+    return false;
+  }
+
+  if (!common::ReadVec(fi, &missing_.storage)) {
+    return false;
+  }
+  missing_.InitView();
+
+  index_base_ = index_base;
+  if (!fi->Read(&bins_type_size_)) {
+    return false;
+  }
+  if (!fi->Read(&any_missing_)) {
+    return false;
+  }
+  return true;
+}
+
+std::size_t ColumnMatrix::Write(AlignedFileWriteStream* fo) const {
+  std::size_t bytes{0};
+
+  bytes += common::WriteVec(fo, index_);
+  bytes += common::WriteVec(fo, type_);
+  bytes += common::WriteVec(fo, row_ind_);
+  bytes += common::WriteVec(fo, feature_offsets_);
+  bytes += common::WriteVec(fo, missing_.storage);
+
+  bytes += fo->Write(bins_type_size_);
+  bytes += fo->Write(any_missing_);
+
+  return bytes;
+}
+}  // namespace xgboost::common
--- a/src/common/column_matrix.h
+++ b/src/common/column_matrix.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2017-2022 by Contributors
+/**
+ * Copyright 2017-2023, XGBoost Contributors
 * \file column_matrix.h
 * \brief Utility for fast column-wise access
 * \author Philip Cho
@@ -8,25 +8,30 @@
 #ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
 #define XGBOOST_COMMON_COLUMN_MATRIX_H_

-#include <dmlc/endian.h>
-
 #include <algorithm>
+#include <cstddef>  // for size_t
+#include <cstdint>  // for uint8_t
 #include <limits>
 #include <memory>
-#include <utility>  // std::move
+#include <utility>  // for move
 #include <vector>

 #include "../data/adapter.h"
 #include "../data/gradient_index.h"
 #include "algorithm.h"
+#include "bitfield.h"  // for RBitField8
 #include "hist_util.h"
+#include "ref_resource_view.h"  // for RefResourceView
+#include "xgboost/base.h"       // for bst_bin_t
+#include "xgboost/span.h"       // for Span

-namespace xgboost {
-namespace common {
-
+namespace xgboost::common {
 class ColumnMatrix;
+class AlignedFileWriteStream;
+class AlignedResourceReadStream;
+
 /*! \brief column type */
-enum ColumnType : uint8_t { kDenseColumn, kSparseColumn };
+enum ColumnType : std::uint8_t { kDenseColumn, kSparseColumn };

 /*! \brief a column storage, to be used with ApplySplit. Note that each
    bin id is stored as index[i] + index_base.
@@ -41,12 +46,12 @@ class Column {
      : index_(index), index_base_(least_bin_idx) {}
  virtual ~Column() = default;

-  bst_bin_t GetGlobalBinIdx(size_t idx) const {
+  [[nodiscard]] bst_bin_t GetGlobalBinIdx(size_t idx) const {
    return index_base_ + static_cast<bst_bin_t>(index_[idx]);
  }

  /* returns number of elements in column */
-  size_t Size() const { return index_.size(); }
+  [[nodiscard]] size_t Size() const { return index_.size(); }

 private:
  /* bin indexes in range [0, max_bins - 1] */
@@ -63,7 +68,7 @@ class SparseColumnIter : public Column<BinIdxT> {
  common::Span<const size_t> row_ind_;
  size_t idx_;

-  size_t const* RowIndices() const { return row_ind_.data(); }
+  [[nodiscard]] size_t const* RowIndices() const { return row_ind_.data(); }

 public:
  SparseColumnIter(common::Span<const BinIdxT> index, bst_bin_t least_bin_idx,
@@ -81,7 +86,7 @@ class SparseColumnIter : public Column<BinIdxT> {
  SparseColumnIter(SparseColumnIter const&) = delete;
  SparseColumnIter(SparseColumnIter&&) = default;

-  size_t GetRowIdx(size_t idx) const { return RowIndices()[idx]; }
+  [[nodiscard]] size_t GetRowIdx(size_t idx) const { return RowIndices()[idx]; }
  bst_bin_t operator[](size_t rid) {
    const size_t column_size = this->Size();
    if (!((idx_) < column_size)) {
@@ -101,6 +106,10 @@ class SparseColumnIter : public Column<BinIdxT> {
  }
 };

+/**
+ * @brief Column stored as a dense vector. It might still contain missing values as
+ *        indicated by the missing flags.
+ */
 template <typename BinIdxT, bool any_missing>
 class DenseColumnIter : public Column<BinIdxT> {
 public:
@@ -109,17 +118,19 @@ class DenseColumnIter : public Column<BinIdxT> {
 private:
  using Base = Column<BinIdxT>;
  /* flags for missing values in dense columns */
-  std::vector<ByteType> const& missing_flags_;
+  LBitField32 missing_flags_;
  size_t feature_offset_;

 public:
  explicit DenseColumnIter(common::Span<const BinIdxT> index, bst_bin_t index_base,
-                           std::vector<ByteType> const& missing_flags, size_t feature_offset)
+                           LBitField32 missing_flags, size_t feature_offset)
      : Base{index, index_base}, missing_flags_{missing_flags}, feature_offset_{feature_offset} {}
  DenseColumnIter(DenseColumnIter const&) = delete;
  DenseColumnIter(DenseColumnIter&&) = default;

-  bool IsMissing(size_t ridx) const { return missing_flags_[feature_offset_ + ridx]; }
+  [[nodiscard]] bool IsMissing(size_t ridx) const {
+    return missing_flags_.Check(feature_offset_ + ridx);
+  }

  bst_bin_t operator[](size_t ridx) const {
    if (any_missing) {
@@ -131,12 +142,54 @@ class DenseColumnIter : public Column<BinIdxT> {
 };

 /**
- * \brief Column major matrix for gradient index. This matrix contains both dense column
- * and sparse column, the type of the column is controlled by sparse threshold. When the
- * number of missing values in a column is below the threshold it's classified as dense
- * column.
+ * @brief Column major matrix for gradient index on CPU.
+ *
+ *    This matrix contains both dense columns and sparse columns, the type of the column
+ *    is controlled by the sparse threshold parameter. When the number of missing values
+ *    in a column is below the threshold it's classified as dense column.
 */
 class ColumnMatrix {
+  /**
+   * @brief A bit set for indicating whether an element in a dense column is missing.
+   */
+  struct MissingIndicator {
+    LBitField32 missing;
+    RefResourceView<std::uint32_t> storage;
+
+    MissingIndicator() = default;
+    /**
+     * @param n_elements Size of the bit set
+     * @param init       Initialize the indicator to true or false.
+     */
+    MissingIndicator(std::size_t n_elements, bool init) {
+      auto m_size = missing.ComputeStorageSize(n_elements);
+      storage = common::MakeFixedVecWithMalloc(m_size, init ? ~std::uint32_t{0} : std::uint32_t{0});
+      this->InitView();
+    }
+    /** @brief Set the i^th element to be a valid element (instead of missing). */
+    void SetValid(typename LBitField32::index_type i) { missing.Clear(i); }
+    /** @brief assign the storage to the view. */
+    void InitView() {
+      missing = LBitField32{Span{storage.data(), storage.size()}};
+    }
+
+    void GrowTo(std::size_t n_elements, bool init) {
+      CHECK(storage.Resource()->Type() == ResourceHandler::kMalloc)
+          << "[Internal Error]: Cannot grow the vector when external memory is used.";
+      auto m_size = missing.ComputeStorageSize(n_elements);
+      CHECK_GE(m_size, storage.size());
+      if (m_size == storage.size()) {
+        return;
+      }
+
+      auto new_storage =
+          common::MakeFixedVecWithMalloc(m_size, init ? ~std::uint32_t{0} : std::uint32_t{0});
+      std::copy_n(storage.cbegin(), storage.size(), new_storage.begin());
+      storage = std::move(new_storage);
+      this->InitView();
+    }
+  };
+
  void InitStorage(GHistIndexMatrix const& gmat, double sparse_threshold);

  template <typename ColumnBinT, typename BinT, typename RIdx>
@@ -144,9 +197,10 @@ class ColumnMatrix {
    if (type_[fid] == kDenseColumn) {
      ColumnBinT* begin = &local_index[feature_offsets_[fid]];
      begin[rid] = bin_id - index_base_[fid];
-      // not thread-safe with bool vector.  FIXME(jiamingy): We can directly assign
-      // kMissingId to the index to avoid missing flags.
-      missing_flags_[feature_offsets_[fid] + rid] = false;
+      // not thread-safe with bit field.
+      // FIXME(jiamingy): We can directly assign kMissingId to the index to avoid missing
+      // flags.
+      missing_.SetValid(feature_offsets_[fid] + rid);
    } else {
      ColumnBinT* begin = &local_index[feature_offsets_[fid]];
      begin[num_nonzeros_[fid]] = bin_id - index_base_[fid];
@@ -158,7 +212,9 @@ class ColumnMatrix {
 public:
  using ByteType = bool;
  // get number of features
-  bst_feature_t GetNumFeature() const { return static_cast<bst_feature_t>(type_.size()); }
+  [[nodiscard]] bst_feature_t GetNumFeature() const {
+    return static_cast<bst_feature_t>(type_.size());
+  }

  ColumnMatrix() = default;
  ColumnMatrix(GHistIndexMatrix const& gmat, double sparse_threshold) {
@@ -166,7 +222,7 @@ class ColumnMatrix {
  }

  /**
-   * \brief Initialize ColumnMatrix from GHistIndexMatrix with reference to the original
+   * @brief Initialize ColumnMatrix from GHistIndexMatrix with reference to the original
   *        SparsePage.
   */
  void InitFromSparse(SparsePage const& page, const GHistIndexMatrix& gmat, double sparse_threshold,
@@ -178,8 +234,8 @@ class ColumnMatrix {
  }

  /**
-   * \brief Initialize ColumnMatrix from GHistIndexMatrix without reference to actual
-   * data.
+   * @brief Initialize ColumnMatrix from GHistIndexMatrix without reference to actual
+   *        data.
   *
   *    This function requires a binary search for each bin to get back the feature index
   *    for those bins.
@@ -199,7 +255,7 @@ class ColumnMatrix {
    }
  }

-  bool IsInitialized() const { return !type_.empty(); }
+  [[nodiscard]] bool IsInitialized() const { return !type_.empty(); }

  /**
   * \brief Push batch of data for Quantile DMatrix support.
@@ -257,7 +313,7 @@ class ColumnMatrix {
        reinterpret_cast<const BinIdxType*>(&index_[feature_offset * bins_type_size_]),
        column_size};
    return std::move(DenseColumnIter<BinIdxType, any_missing>{
-        bin_index, static_cast<bst_bin_t>(index_base_[fidx]), missing_flags_, feature_offset});
+        bin_index, static_cast<bst_bin_t>(index_base_[fidx]), missing_.missing, feature_offset});
  }

  // all columns are dense column and has no missing value
@@ -265,7 +321,8 @@ class ColumnMatrix {
  template <typename RowBinIdxT>
  void SetIndexNoMissing(bst_row_t base_rowid, RowBinIdxT const* row_index, const size_t n_samples,
                         const size_t n_features, int32_t n_threads) {
-    missing_flags_.resize(feature_offsets_[n_features], false);
+    missing_.GrowTo(feature_offsets_[n_features], false);
+
    DispatchBinType(bins_type_size_, [&](auto t) {
      using ColumnBinT = decltype(t);
      auto column_index = Span<ColumnBinT>{reinterpret_cast<ColumnBinT*>(index_.data()),
@@ -290,9 +347,15 @@ class ColumnMatrix {
  void SetIndexMixedColumns(size_t base_rowid, Batch const& batch, const GHistIndexMatrix& gmat,
                            float missing) {
    auto n_features = gmat.Features();
-    missing_flags_.resize(feature_offsets_[n_features], true);
-    auto const* row_index = gmat.index.data<uint32_t>() + gmat.row_ptr[base_rowid];
-    num_nonzeros_.resize(n_features, 0);
+
+    missing_.GrowTo(feature_offsets_[n_features], true);
+    auto const* row_index = gmat.index.data<std::uint32_t>() + gmat.row_ptr[base_rowid];
+    if (num_nonzeros_.empty()) {
+      num_nonzeros_ = common::MakeFixedVecWithMalloc(n_features, std::size_t{0});
+    } else {
+      CHECK_EQ(num_nonzeros_.size(), n_features);
+    }
+
    auto is_valid = data::IsValidFunctor{missing};

    DispatchBinType(bins_type_size_, [&](auto t) {
@@ -321,8 +384,9 @@ class ColumnMatrix {
   */
  void SetIndexMixedColumns(const GHistIndexMatrix& gmat) {
    auto n_features = gmat.Features();
-    missing_flags_.resize(feature_offsets_[n_features], true);
-    num_nonzeros_.resize(n_features, 0);
+
+    missing_ = MissingIndicator{feature_offsets_[n_features], true};
+    num_nonzeros_ = common::MakeFixedVecWithMalloc(n_features, std::size_t{0});

    DispatchBinType(bins_type_size_, [&](auto t) {
      using ColumnBinT = decltype(t);
@@ -335,106 +399,34 @@ class ColumnMatrix {
    });
  }

-  BinTypeSize GetTypeSize() const { return bins_type_size_; }
-  auto GetColumnType(bst_feature_t fidx) const { return type_[fidx]; }
+  [[nodiscard]] BinTypeSize GetTypeSize() const { return bins_type_size_; }
+  [[nodiscard]] auto GetColumnType(bst_feature_t fidx) const { return type_[fidx]; }

  // And this returns part of state
-  bool AnyMissing() const { return any_missing_; }
+  [[nodiscard]] bool AnyMissing() const { return any_missing_; }

  // IO procedures for external memory.
-  bool Read(dmlc::SeekStream* fi, uint32_t const* index_base) {
-    fi->Read(&index_);
-#if !DMLC_LITTLE_ENDIAN
-    // s390x
-    std::vector<std::underlying_type<ColumnType>::type> int_types;
-    fi->Read(&int_types);
-    type_.resize(int_types.size());
-    std::transform(
-        int_types.begin(), int_types.end(), type_.begin(),
-        [](std::underlying_type<ColumnType>::type i) { return static_cast<ColumnType>(i); });
-#else
-    fi->Read(&type_);
-#endif  // !DMLC_LITTLE_ENDIAN
-
-    fi->Read(&row_ind_);
-    fi->Read(&feature_offsets_);
-
-    std::vector<std::uint8_t> missing;
-    fi->Read(&missing);
-    missing_flags_.resize(missing.size());
-    std::transform(missing.cbegin(), missing.cend(), missing_flags_.begin(),
-                   [](std::uint8_t flag) { return !!flag; });
-
-    index_base_ = index_base;
-#if !DMLC_LITTLE_ENDIAN
-    std::underlying_type<BinTypeSize>::type v;
-    fi->Read(&v);
-    bins_type_size_ = static_cast<BinTypeSize>(v);
-#else
-    fi->Read(&bins_type_size_);
-#endif
-
-    fi->Read(&any_missing_);
-    return true;
-  }
-
-  size_t Write(dmlc::Stream* fo) const {
-    size_t bytes{0};
-
-    auto write_vec = [&](auto const& vec) {
-      fo->Write(vec);
-      bytes += vec.size() * sizeof(typename std::remove_reference_t<decltype(vec)>::value_type) +
-               sizeof(uint64_t);
-    };
-    write_vec(index_);
-#if !DMLC_LITTLE_ENDIAN
-    // s390x
-    std::vector<std::underlying_type<ColumnType>::type> int_types(type_.size());
-    std::transform(type_.begin(), type_.end(), int_types.begin(), [](ColumnType t) {
-      return static_cast<std::underlying_type<ColumnType>::type>(t);
-    });
-    write_vec(int_types);
-#else
-    write_vec(type_);
-#endif  // !DMLC_LITTLE_ENDIAN
-    write_vec(row_ind_);
-    write_vec(feature_offsets_);
-    // dmlc can not handle bool vector
-    std::vector<std::uint8_t> missing(missing_flags_.size());
-    std::transform(missing_flags_.cbegin(), missing_flags_.cend(), missing.begin(),
-                   [](bool flag) { return static_cast<std::uint8_t>(flag); });
-    write_vec(missing);
-
-#if !DMLC_LITTLE_ENDIAN
-    auto v = static_cast<std::underlying_type<BinTypeSize>::type>(bins_type_size_);
-    fo->Write(v);
-#else
-    fo->Write(bins_type_size_);
-#endif  // DMLC_LITTLE_ENDIAN
-    bytes += sizeof(bins_type_size_);
-    fo->Write(any_missing_);
-    bytes += sizeof(any_missing_);
-
-    return bytes;
-  }
+  [[nodiscard]] bool Read(AlignedResourceReadStream* fi, uint32_t const* index_base);
+  [[nodiscard]] std::size_t Write(AlignedFileWriteStream* fo) const;

 private:
-  std::vector<uint8_t> index_;
+  RefResourceView<std::uint8_t> index_;

-  std::vector<ColumnType> type_;
-  /* indptr of a CSC matrix. */
-  std::vector<size_t> row_ind_;
-  /* indicate where each column's index and row_ind is stored. */
-  std::vector<size_t> feature_offsets_;
-  /* The number of nnz of each column. */
-  std::vector<size_t> num_nonzeros_;
+  RefResourceView<ColumnType> type_;
+  /** @brief indptr of a CSC matrix. */
+  RefResourceView<std::size_t> row_ind_;
+  /** @brief indicate where each column's index and row_ind is stored. */
+  RefResourceView<std::size_t> feature_offsets_;
+  /** @brief The number of nnz of each column. */
+  RefResourceView<std::size_t> num_nonzeros_;

  // index_base_[fid]: least bin id for feature fid
-  uint32_t const* index_base_;
-  std::vector<ByteType> missing_flags_;
+  std::uint32_t const* index_base_;
+
+  MissingIndicator missing_;
+
  BinTypeSize bins_type_size_;
  bool any_missing_;
 };
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common
 #endif  // XGBOOST_COMMON_COLUMN_MATRIX_H_
--- a/src/common/error_msg.h
+++ b/src/common/error_msg.h
@@ -6,6 +6,11 @@
 #ifndef XGBOOST_COMMON_ERROR_MSG_H_
 #define XGBOOST_COMMON_ERROR_MSG_H_

+#include <cinttypes>  // for uint64_t
+#include <limits>     // for numeric_limits
+
+#include "xgboost/base.h"  // for bst_feature_t
+#include "xgboost/logging.h"
 #include "xgboost/string_view.h"  // for StringView

 namespace xgboost::error {
@@ -33,5 +38,46 @@ constexpr StringView InconsistentMaxBin() {
  return "Inconsistent `max_bin`. `max_bin` should be the same across different QuantileDMatrix, "
         "and consistent with the Booster being trained.";
 }
+
+constexpr StringView UnknownDevice() { return "Unknown device type."; }
+
+inline void MaxFeatureSize(std::uint64_t n_features) {
+  auto max_n_features = std::numeric_limits<bst_feature_t>::max();
+  CHECK_LE(n_features, max_n_features)
+      << "Unfortunately, XGBoost does not support data matrices with "
+      << std::numeric_limits<bst_feature_t>::max() << " features or greater";
+}
+
+constexpr StringView InplacePredictProxy() {
+  return "Inplace predict accepts only DMatrixProxy as input.";
+}
+
+inline void MaxSampleSize(std::size_t n) {
+  LOG(FATAL) << "Sample size too large for the current updater. Maximum number of samples:" << n
+             << ". Consider using a different updater or tree_method.";
+}
+
+constexpr StringView OldSerialization() {
+  return R"doc(If you are loading a serialized model (like pickle in Python, RDS in R) or
+configuration generated by an older version of XGBoost, please export the model by calling
+`Booster.save_model` from that version first, then load it back in current version. See:
+
+    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html
+
+for more details about differences between saving model and serializing.
+)doc";
+}
+
+inline void WarnOldSerialization() {
+  // Display it once is enough. Otherwise this can be really verbose in distributed
+  // environments.
+  static thread_local bool logged{false};
+  if (logged) {
+    return;
+  }
+
+  LOG(WARNING) << OldSerialization();
+  logged = true;
+}
 }  // namespace xgboost::error
 #endif  // XGBOOST_COMMON_ERROR_MSG_H_
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@@ -127,55 +127,76 @@ void SortByWeight(dh::device_vector<float>* weights,
                                });
 }

-void RemoveDuplicatedCategories(
-    int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
-    dh::device_vector<Entry> *p_sorted_entries,
-    dh::caching_device_vector<size_t> *p_column_sizes_scan) {
+void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
+                                dh::device_vector<Entry>* p_sorted_entries,
+                                dh::device_vector<float>* p_sorted_weights,
+                                dh::caching_device_vector<size_t>* p_column_sizes_scan) {
  info.feature_types.SetDevice(device);
  auto d_feature_types = info.feature_types.ConstDeviceSpan();
  CHECK(!d_feature_types.empty());
-  auto &column_sizes_scan = *p_column_sizes_scan;
-  auto &sorted_entries = *p_sorted_entries;
+  auto& column_sizes_scan = *p_column_sizes_scan;
+  auto& sorted_entries = *p_sorted_entries;
  // Removing duplicated entries in categorical features.
+
+  // We don't need to accumulate weight for duplicated entries as there's no weighted
+  // sketching for categorical features, the categories are the cut values.
  dh::caching_device_vector<size_t> new_column_scan(column_sizes_scan.size());
-  dh::SegmentedUnique(column_sizes_scan.data().get(),
-                      column_sizes_scan.data().get() + column_sizes_scan.size(),
-                      sorted_entries.begin(), sorted_entries.end(),
-                      new_column_scan.data().get(), sorted_entries.begin(),
-                      [=] __device__(Entry const &l, Entry const &r) {
-                        if (l.index == r.index) {
-                          if (IsCat(d_feature_types, l.index)) {
-                            return l.fvalue == r.fvalue;
-                          }
-                        }
-                        return false;
-                      });
+  std::size_t n_uniques{0};
+  if (p_sorted_weights) {
+    using Pair = thrust::tuple<Entry, float>;
+    auto d_sorted_entries = dh::ToSpan(sorted_entries);
+    auto d_sorted_weights = dh::ToSpan(*p_sorted_weights);
+    auto val_in_it = thrust::make_zip_iterator(d_sorted_entries.data(), d_sorted_weights.data());
+    auto val_out_it = thrust::make_zip_iterator(d_sorted_entries.data(), d_sorted_weights.data());
+    n_uniques = dh::SegmentedUnique(
+        column_sizes_scan.data().get(), column_sizes_scan.data().get() + column_sizes_scan.size(),
+        val_in_it, val_in_it + sorted_entries.size(), new_column_scan.data().get(), val_out_it,
+        [=] __device__(Pair const& l, Pair const& r) {
+          Entry const& le = thrust::get<0>(l);
+          Entry const& re = thrust::get<0>(r);
+          if (le.index == re.index && IsCat(d_feature_types, le.index)) {
+            return le.fvalue == re.fvalue;
+          }
+          return false;
+        });
+    p_sorted_weights->resize(n_uniques);
+  } else {
+    n_uniques = dh::SegmentedUnique(
+        column_sizes_scan.data().get(), column_sizes_scan.data().get() + column_sizes_scan.size(),
+        sorted_entries.begin(), sorted_entries.end(), new_column_scan.data().get(),
+        sorted_entries.begin(), [=] __device__(Entry const& l, Entry const& r) {
+          if (l.index == r.index) {
+            if (IsCat(d_feature_types, l.index)) {
+              return l.fvalue == r.fvalue;
+            }
+          }
+          return false;
+        });
+  }
+  sorted_entries.resize(n_uniques);

  // Renew the column scan and cut scan based on categorical data.
  auto d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan);
-  dh::caching_device_vector<SketchContainer::OffsetT> new_cuts_size(
-      info.num_col_ + 1);
+  dh::caching_device_vector<SketchContainer::OffsetT> new_cuts_size(info.num_col_ + 1);
  CHECK_EQ(new_column_scan.size(), new_cuts_size.size());
-  dh::LaunchN(
-      new_column_scan.size(),
-      [=, d_new_cuts_size = dh::ToSpan(new_cuts_size),
-       d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan),
-       d_new_columns_ptr = dh::ToSpan(new_column_scan)] __device__(size_t idx) {
-        d_old_column_sizes_scan[idx] = d_new_columns_ptr[idx];
-        if (idx == d_new_columns_ptr.size() - 1) {
-          return;
-        }
-        if (IsCat(d_feature_types, idx)) {
-          // Cut size is the same as number of categories in input.
-          d_new_cuts_size[idx] =
-              d_new_columns_ptr[idx + 1] - d_new_columns_ptr[idx];
-        } else {
-          d_new_cuts_size[idx] = d_cuts_ptr[idx + 1] - d_cuts_ptr[idx];
-        }
-      });
+  dh::LaunchN(new_column_scan.size(),
+              [=, d_new_cuts_size = dh::ToSpan(new_cuts_size),
+               d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan),
+               d_new_columns_ptr = dh::ToSpan(new_column_scan)] __device__(size_t idx) {
+                d_old_column_sizes_scan[idx] = d_new_columns_ptr[idx];
+                if (idx == d_new_columns_ptr.size() - 1) {
+                  return;
+                }
+                if (IsCat(d_feature_types, idx)) {
+                  // Cut size is the same as number of categories in input.
+                  d_new_cuts_size[idx] = d_new_columns_ptr[idx + 1] - d_new_columns_ptr[idx];
+                } else {
+                  d_new_cuts_size[idx] = d_cuts_ptr[idx + 1] - d_cuts_ptr[idx];
+                }
+              });
  // Turn size into ptr.
-  thrust::exclusive_scan(thrust::device, new_cuts_size.cbegin(),
-                         new_cuts_size.cend(), d_cuts_ptr.data());
+  thrust::exclusive_scan(thrust::device, new_cuts_size.cbegin(), new_cuts_size.cend(),
+                         d_cuts_ptr.data());
 }
 }  // namespace detail

@@ -209,8 +230,8 @@ void ProcessBatch(int device, MetaInfo const &info, const SparsePage &page,
  auto d_cuts_ptr = cuts_ptr.DeviceSpan();

  if (sketch_container->HasCategorical()) {
-    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
-                                       &sorted_entries, &column_sizes_scan);
+    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, nullptr,
+                                       &column_sizes_scan);
  }

  auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
@@ -276,8 +297,8 @@ void ProcessWeightedBatch(int device, const SparsePage& page,
                             &column_sizes_scan);
  auto d_cuts_ptr = cuts_ptr.DeviceSpan();
  if (sketch_container->HasCategorical()) {
-    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
-                                       &sorted_entries, &column_sizes_scan);
+    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, &temp_weights,
+                                       &column_sizes_scan);
  }

  auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
--- a/src/common/hist_util.cuh
+++ b/src/common/hist_util.cuh
@@ -240,10 +240,10 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran
 void SortByWeight(dh::device_vector<float>* weights,
                  dh::device_vector<Entry>* sorted_entries);

-void RemoveDuplicatedCategories(
-    int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
-    dh::device_vector<Entry> *p_sorted_entries,
-    dh::caching_device_vector<size_t> *p_column_sizes_scan);
+void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
+                                dh::device_vector<Entry>* p_sorted_entries,
+                                dh::device_vector<float>* p_sorted_weights,
+                                dh::caching_device_vector<size_t>* p_column_sizes_scan);
 }  // namespace detail

 // Compute sketch on DMatrix.
@@ -275,8 +275,8 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,

  if (sketch_container->HasCategorical()) {
    auto d_cuts_ptr = cuts_ptr.DeviceSpan();
-    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
-                                       &sorted_entries, &column_sizes_scan);
+    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, nullptr,
+                                       &column_sizes_scan);
  }

  auto d_cuts_ptr = cuts_ptr.DeviceSpan();
@@ -354,8 +354,8 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,

  if (sketch_container->HasCategorical()) {
    auto d_cuts_ptr = cuts_ptr.DeviceSpan();
-    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
-                                       &sorted_entries, &column_sizes_scan);
+    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, &temp_weights,
+                                       &column_sizes_scan);
  }

  auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -203,13 +203,33 @@ auto DispatchBinType(BinTypeSize type, Fn&& fn) {
 }

 /**
- * \brief Optionally compressed gradient index. The compression works only with dense
+ * @brief Optionally compressed gradient index. The compression works only with dense
 *        data.
 *
 *   The main body of construction code is in gradient_index.cc, this struct is only a
- *   storage class.
+ *   view class.
 */
-struct Index {
+class Index {
+ private:
+  void SetBinTypeSize(BinTypeSize binTypeSize) {
+    binTypeSize_ = binTypeSize;
+    switch (binTypeSize) {
+      case kUint8BinsTypeSize:
+        func_ = &GetValueFromUint8;
+        break;
+      case kUint16BinsTypeSize:
+        func_ = &GetValueFromUint16;
+        break;
+      case kUint32BinsTypeSize:
+        func_ = &GetValueFromUint32;
+        break;
+      default:
+        CHECK(binTypeSize == kUint8BinsTypeSize || binTypeSize == kUint16BinsTypeSize ||
+              binTypeSize == kUint32BinsTypeSize);
+    }
+  }
+
+ public:
  // Inside the compressor, bin_idx is the index for cut value across all features. By
  // subtracting it with starting pointer of each feature, we can reduce it to smaller
  // value and store it with smaller types. Usable only with dense data.
@@ -233,10 +253,24 @@ struct Index {
  }

  Index() { SetBinTypeSize(binTypeSize_); }
-  Index(const Index& i) = delete;
-  Index& operator=(Index i) = delete;
+
+  Index(Index const& i) = delete;
+  Index& operator=(Index const& i) = delete;
  Index(Index&& i) = delete;
-  Index& operator=(Index&& i) = delete;
+
+  /** @brief Move assignment for lazy initialization. */
+  Index& operator=(Index&& i) = default;
+
+  /**
+   * @brief Construct the index from data.
+   *
+   * @param data     Storage for compressed histogram bin.
+   * @param bin_size Number of bytes for each bin.
+   */
+  Index(Span<std::uint8_t> data, BinTypeSize bin_size) : data_{data} {
+    this->SetBinTypeSize(bin_size);
+  }
+
  uint32_t operator[](size_t i) const {
    if (!bin_offset_.empty()) {
      // dense, compressed
@@ -247,26 +281,7 @@ struct Index {
      return func_(data_.data(), i);
    }
  }
-  void SetBinTypeSize(BinTypeSize binTypeSize) {
-    binTypeSize_ = binTypeSize;
-    switch (binTypeSize) {
-      case kUint8BinsTypeSize:
-        func_ = &GetValueFromUint8;
-        break;
-      case kUint16BinsTypeSize:
-        func_ = &GetValueFromUint16;
-        break;
-      case kUint32BinsTypeSize:
-        func_ = &GetValueFromUint32;
-        break;
-      default:
-        CHECK(binTypeSize == kUint8BinsTypeSize || binTypeSize == kUint16BinsTypeSize ||
-              binTypeSize == kUint32BinsTypeSize);
-    }
-  }
-  BinTypeSize GetBinTypeSize() const {
-    return binTypeSize_;
-  }
+  [[nodiscard]] BinTypeSize GetBinTypeSize() const { return binTypeSize_; }
  template <typename T>
  T const* data() const {  // NOLINT
    return reinterpret_cast<T const*>(data_.data());
@@ -275,30 +290,27 @@ struct Index {
  T* data() {  // NOLINT
    return reinterpret_cast<T*>(data_.data());
  }
-  uint32_t const* Offset() const { return bin_offset_.data(); }
-  size_t OffsetSize() const { return bin_offset_.size(); }
-  size_t Size() const { return data_.size() / (binTypeSize_); }
+  [[nodiscard]] std::uint32_t const* Offset() const { return bin_offset_.data(); }
+  [[nodiscard]] std::size_t OffsetSize() const { return bin_offset_.size(); }
+  [[nodiscard]] std::size_t Size() const { return data_.size() / (binTypeSize_); }

-  void Resize(const size_t n_bytes) {
-    data_.resize(n_bytes);
-  }
  // set the offset used in compression, cut_ptrs is the CSC indptr in HistogramCuts
  void SetBinOffset(std::vector<uint32_t> const& cut_ptrs) {
    bin_offset_.resize(cut_ptrs.size() - 1);  // resize to number of features.
    std::copy_n(cut_ptrs.begin(), bin_offset_.size(), bin_offset_.begin());
  }
-  std::vector<uint8_t>::const_iterator begin() const {  // NOLINT
-    return data_.begin();
+  auto begin() const {  // NOLINT
+    return data_.data();
  }
-  std::vector<uint8_t>::const_iterator end() const {  // NOLINT
-    return data_.end();
+  auto end() const {  // NOLINT
+    return data_.data() + data_.size();
  }

-  std::vector<uint8_t>::iterator begin() {  // NOLINT
-    return data_.begin();
+  auto begin() {  // NOLINT
+    return data_.data();
  }
-  std::vector<uint8_t>::iterator end() {  // NOLINT
-    return data_.end();
+  auto end() {  // NOLINT
+    return data_.data() + data_.size();
  }

 private:
@@ -313,12 +325,12 @@ struct Index {

  using Func = uint32_t (*)(uint8_t const*, size_t);

-  std::vector<uint8_t> data_;
+  Span<std::uint8_t> data_;
  // starting position of each feature inside the cut values (the indptr of the CSC cut matrix
  // HistogramCuts without the last entry.) Used for bin compression.
  std::vector<uint32_t> bin_offset_;

-  BinTypeSize binTypeSize_ {kUint8BinsTypeSize};
+  BinTypeSize binTypeSize_{kUint8BinsTypeSize};
  Func func_;
 };

--- a/src/common/io.cc
+++ b/src/common/io.cc
@@ -200,21 +200,43 @@ std::string FileExtension(std::string fname, bool lower) {
  }
 }

-struct PrivateMmapConstStream::MMAPFile {
+// For some reason, NVCC 12.1 marks the function deleted if we expose it in the header.
+// NVCC 11.8 doesn't allow `noexcept(false) = default` altogether.
+ResourceHandler::~ResourceHandler() noexcept(false) {}  // NOLINT
+
+struct MMAPFile {
 #if defined(xgboost_IS_WIN)
  HANDLE fd{INVALID_HANDLE_VALUE};
  HANDLE file_map{INVALID_HANDLE_VALUE};
 #else
  std::int32_t fd{0};
 #endif
-  char* base_ptr{nullptr};
+  std::byte* base_ptr{nullptr};
  std::size_t base_size{0};
+  std::size_t delta{0};
  std::string path;
+
+  MMAPFile() = default;
+
+#if defined(xgboost_IS_WIN)
+  MMAPFile(HANDLE fd, HANDLE fm, std::byte* base_ptr, std::size_t base_size, std::size_t delta,
+           std::string path)
+      : fd{fd},
+        file_map{fm},
+        base_ptr{base_ptr},
+        base_size{base_size},
+        delta{delta},
+        path{std::move(path)} {}
+#else
+  MMAPFile(std::int32_t fd, std::byte* base_ptr, std::size_t base_size, std::size_t delta,
+           std::string path)
+      : fd{fd}, base_ptr{base_ptr}, base_size{base_size}, delta{delta}, path{std::move(path)} {}
+#endif
 };

-char* PrivateMmapConstStream::Open(std::string path, std::size_t offset, std::size_t length) {
+std::unique_ptr<MMAPFile> Open(std::string path, std::size_t offset, std::size_t length) {
  if (length == 0) {
-    return nullptr;
+    return std::make_unique<MMAPFile>();
  }

 #if defined(xgboost_IS_WIN)
@@ -226,16 +248,18 @@ char* PrivateMmapConstStream::Open(std::string path, std::size_t offset, std::si
  CHECK_GE(fd, 0) << "Failed to open:" << path << ". " << SystemErrorMsg();
 #endif

-  char* ptr{nullptr};
+  std::byte* ptr{nullptr};
  // Round down for alignment.
  auto view_start = offset / GetMmapAlignment() * GetMmapAlignment();
  auto view_size = length + (offset - view_start);

 #if defined(__linux__) || defined(__GLIBC__)
  int prot{PROT_READ};
-  ptr = reinterpret_cast<char*>(mmap64(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
+  ptr = reinterpret_cast<std::byte*>(mmap64(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
+  madvise(ptr, view_size, MADV_WILLNEED);
  CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg();
-  handle_.reset(new MMAPFile{fd, ptr, view_size, std::move(path)});
+  auto handle =
+      std::make_unique<MMAPFile>(fd, ptr, view_size, offset - view_start, std::move(path));
 #elif defined(xgboost_IS_WIN)
  auto file_size = GetFileSize(fd, nullptr);
  DWORD access = PAGE_READONLY;
@@ -244,33 +268,32 @@ char* PrivateMmapConstStream::Open(std::string path, std::size_t offset, std::si
  std::uint32_t loff = static_cast<std::uint32_t>(view_start);
  std::uint32_t hoff = view_start >> 32;
  CHECK(map_file) << "Failed to map: " << path << ". " << SystemErrorMsg();
-  ptr = reinterpret_cast<char*>(MapViewOfFile(map_file, access, hoff, loff, view_size));
+  ptr = reinterpret_cast<std::byte*>(MapViewOfFile(map_file, access, hoff, loff, view_size));
  CHECK_NE(ptr, nullptr) << "Failed to map: " << path << ". " << SystemErrorMsg();
-  handle_.reset(new MMAPFile{fd, map_file, ptr, view_size, std::move(path)});
+  auto handle = std::make_unique<MMAPFile>(fd, map_file, ptr, view_size, offset - view_start,
+                                           std::move(path));
 #else
  CHECK_LE(offset, std::numeric_limits<off_t>::max())
      << "File size has exceeded the limit on the current system.";
  int prot{PROT_READ};
-  ptr = reinterpret_cast<char*>(mmap(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
+  ptr = reinterpret_cast<std::byte*>(mmap(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
  CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg();
-  handle_.reset(new MMAPFile{fd, ptr, view_size, std::move(path)});
+  auto handle =
+      std::make_unique<MMAPFile>(fd, ptr, view_size, offset - view_start, std::move(path));
 #endif  // defined(__linux__)

-  ptr += (offset - view_start);
-  return ptr;
+  return handle;
 }

-PrivateMmapConstStream::PrivateMmapConstStream(std::string path, std::size_t offset,
-                                               std::size_t length)
-    : MemoryFixSizeBuffer{}, handle_{nullptr} {
-  this->p_buffer_ = Open(std::move(path), offset, length);
-  this->buffer_size_ = length;
-}
+MmapResource::MmapResource(std::string path, std::size_t offset, std::size_t length)
+    : ResourceHandler{kMmap}, handle_{Open(std::move(path), offset, length)}, n_{length} {}

-PrivateMmapConstStream::~PrivateMmapConstStream() {
-  CHECK(handle_);
+MmapResource::~MmapResource() noexcept(false) {
+  if (!handle_) {
+    return;
+  }
 #if defined(xgboost_IS_WIN)
-  if (p_buffer_) {
+  if (handle_->base_ptr) {
    CHECK(UnmapViewOfFile(handle_->base_ptr)) "Faled to call munmap: " << SystemErrorMsg();
  }
  if (handle_->fd != INVALID_HANDLE_VALUE) {
@@ -290,6 +313,43 @@ PrivateMmapConstStream::~PrivateMmapConstStream() {
  }
 #endif
 }
+
+[[nodiscard]] void* MmapResource::Data() {
+  if (!handle_) {
+    return nullptr;
+  }
+  return handle_->base_ptr + handle_->delta;
+}
+
+[[nodiscard]] std::size_t MmapResource::Size() const { return n_; }
+
+// For some reason, NVCC 12.1 marks the function deleted if we expose it in the header.
+// NVCC 11.8 doesn't allow `noexcept(false) = default` altogether.
+AlignedResourceReadStream::~AlignedResourceReadStream() noexcept(false) {}  // NOLINT
+PrivateMmapConstStream::~PrivateMmapConstStream() noexcept(false) {}        // NOLINT
+
+AlignedFileWriteStream::AlignedFileWriteStream(StringView path, StringView flags)
+    : pimpl_{dmlc::Stream::Create(path.c_str(), flags.c_str())} {}
+
+[[nodiscard]] std::size_t AlignedFileWriteStream::DoWrite(const void* ptr,
+                                                          std::size_t n_bytes) noexcept(true) {
+  pimpl_->Write(ptr, n_bytes);
+  return n_bytes;
+}
+
+AlignedMemWriteStream::AlignedMemWriteStream(std::string* p_buf)
+    : pimpl_{std::make_unique<MemoryBufferStream>(p_buf)} {}
+AlignedMemWriteStream::~AlignedMemWriteStream() = default;
+
+[[nodiscard]] std::size_t AlignedMemWriteStream::DoWrite(const void* ptr,
+                                                         std::size_t n_bytes) noexcept(true) {
+  this->pimpl_->Write(ptr, n_bytes);
+  return n_bytes;
+}
+
+[[nodiscard]] std::size_t AlignedMemWriteStream::Tell() const noexcept(true) {
+  return this->pimpl_->Tell();
+}
 }  // namespace xgboost::common

 #if defined(xgboost_IS_WIN)
--- a/src/common/io.h
+++ b/src/common/io.h
@@ -4,22 +4,29 @@
 * \brief general stream interface for serialization, I/O
 * \author Tianqi Chen
 */
-
 #ifndef XGBOOST_COMMON_IO_H_
 #define XGBOOST_COMMON_IO_H_

 #include <dmlc/io.h>
 #include <rabit/rabit.h>

-#include <cstring>
-#include <fstream>
-#include <memory>  // for unique_ptr
-#include <string>  // for string
+#include <algorithm>    // for min
+#include <array>        // for array
+#include <cstddef>      // for byte, size_t
+#include <cstdlib>      // for malloc, realloc, free
+#include <cstring>      // for memcpy
+#include <fstream>      // for ifstream
+#include <limits>       // for numeric_limits
+#include <memory>       // for unique_ptr
+#include <string>       // for string
+#include <type_traits>  // for alignment_of_v, enable_if_t
+#include <utility>      // for move
+#include <vector>       // for vector

 #include "common.h"
+#include "xgboost/string_view.h"  // for StringView

-namespace xgboost {
-namespace common {
+namespace xgboost::common {
 using MemoryFixSizeBuffer = rabit::utils::MemoryFixSizeBuffer;
 using MemoryBufferStream = rabit::utils::MemoryBufferStream;

@@ -58,8 +65,8 @@ class FixedSizeStream : public PeekableInStream {

  size_t Read(void* dptr, size_t size) override;
  size_t PeekRead(void* dptr, size_t size) override;
-  size_t Size() const { return buffer_.size(); }
-  size_t Tell() const { return pointer_; }
+  [[nodiscard]] std::size_t Size() const { return buffer_.size(); }
+  [[nodiscard]] std::size_t Tell() const { return pointer_; }
  void Seek(size_t pos);

  void Write(const void*, size_t) override {
@@ -129,18 +136,245 @@ inline std::string ReadAll(std::string const &path) {
  return content;
 }

+struct MMAPFile;
+
+/**
+ * @brief Handler for one-shot resource. Unlike `std::pmr::*`, the resource handler is
+ *        fixed once it's constructed. Users cannot use mutable operations like resize
+ *        without acquiring the specific resource first.
+ */
+class ResourceHandler {
+ public:
+  // RTTI
+  enum Kind : std::uint8_t {
+    kMalloc = 0,
+    kMmap = 1,
+  };
+
+ private:
+  Kind kind_{kMalloc};
+
+ public:
+  virtual void* Data() = 0;
+  template <typename T>
+  [[nodiscard]] T* DataAs() {
+    return reinterpret_cast<T*>(this->Data());
+  }
+
+  [[nodiscard]] virtual std::size_t Size() const = 0;
+  [[nodiscard]] auto Type() const { return kind_; }
+
+  // Allow exceptions for cleaning up resource.
+  virtual ~ResourceHandler() noexcept(false);
+
+  explicit ResourceHandler(Kind kind) : kind_{kind} {}
+  // Use shared_ptr to manage a pool like resource handler. All copy and assignment
+  // operators are disabled.
+  ResourceHandler(ResourceHandler const& that) = delete;
+  ResourceHandler& operator=(ResourceHandler const& that) = delete;
+  ResourceHandler(ResourceHandler&& that) = delete;
+  ResourceHandler& operator=(ResourceHandler&& that) = delete;
+  /**
+   * @brief Wether two resources have the same type. (both malloc or both mmap).
+   */
+  [[nodiscard]] bool IsSameType(ResourceHandler const& that) const {
+    return this->Type() == that.Type();
+  }
+};
+
+class MallocResource : public ResourceHandler {
+  void* ptr_{nullptr};
+  std::size_t n_{0};
+
+  void Clear() noexcept(true) {
+    std::free(ptr_);
+    ptr_ = nullptr;
+    n_ = 0;
+  }
+
+ public:
+  explicit MallocResource(std::size_t n_bytes) : ResourceHandler{kMalloc} { this->Resize(n_bytes); }
+  ~MallocResource() noexcept(true) override { this->Clear(); }
+
+  void* Data() override { return ptr_; }
+  [[nodiscard]] std::size_t Size() const override { return n_; }
+  /**
+   * @brief Resize the resource to n_bytes. Unlike std::vector::resize, it prefers realloc
+   *        over malloc.
+   *
+   * @tparam force_malloc Force the use of malloc over realloc. Used for testing.
+   *
+   * @param n_bytes The new size.
+   */
+  template <bool force_malloc = false>
+  void Resize(std::size_t n_bytes) {
+    // realloc(ptr, 0) works, but is deprecated.
+    if (n_bytes == 0) {
+      this->Clear();
+      return;
+    }
+
+    // If realloc fails, we need to copy the data ourselves.
+    bool need_copy{false};
+    void* new_ptr{nullptr};
+    // use realloc first, it can handle nullptr.
+    if constexpr (!force_malloc) {
+      new_ptr = std::realloc(ptr_, n_bytes);
+    }
+    // retry with malloc if realloc fails
+    if (!new_ptr) {
+      // ptr_ is preserved if realloc fails
+      new_ptr = std::malloc(n_bytes);
+      need_copy = true;
+    }
+    if (!new_ptr) {
+      // malloc fails
+      LOG(FATAL) << "bad_malloc: Failed to allocate " << n_bytes << " bytes.";
+    }
+
+    if (need_copy) {
+      std::copy_n(reinterpret_cast<std::byte*>(ptr_), n_, reinterpret_cast<std::byte*>(new_ptr));
+    }
+    // default initialize
+    std::memset(reinterpret_cast<std::byte*>(new_ptr) + n_, '\0', n_bytes - n_);
+    // free the old ptr if malloc is used.
+    if (need_copy) {
+      this->Clear();
+    }
+
+    ptr_ = new_ptr;
+    n_ = n_bytes;
+  }
+};
+
+/**
+ * @brief A class for wrapping mmap as a resource for RAII.
+ */
+class MmapResource : public ResourceHandler {
+  std::unique_ptr<MMAPFile> handle_;
+  std::size_t n_;
+
+ public:
+  MmapResource(std::string path, std::size_t offset, std::size_t length);
+  ~MmapResource() noexcept(false) override;
+
+  [[nodiscard]] void* Data() override;
+  [[nodiscard]] std::size_t Size() const override;
+};
+
+/**
+ * @param Alignment for resource read stream and aligned write stream.
+ */
+constexpr std::size_t IOAlignment() {
+  // For most of the pod types in XGBoost, 8 byte is sufficient.
+  return 8;
+}
+
+/**
+ * @brief Wrap resource into a dmlc stream.
+ *
+ *  This class is to facilitate the use of mmap. Caller can optionally use the `Read()`
+ *  method or the `Consume()` method. The former copies data into output, while the latter
+ *  makes copy only if it's a primitive type.
+ *
+ *  Input is required to be aligned to IOAlignment().
+ */
+class AlignedResourceReadStream {
+  std::shared_ptr<ResourceHandler> resource_;
+  std::size_t curr_ptr_{0};
+
+  // Similar to SEEK_END in libc
+  static std::size_t constexpr kSeekEnd = std::numeric_limits<std::size_t>::max();
+
+ public:
+  explicit AlignedResourceReadStream(std::shared_ptr<ResourceHandler> resource)
+      : resource_{std::move(resource)} {}
+
+  [[nodiscard]] std::shared_ptr<ResourceHandler> Share() noexcept(true) { return resource_; }
+  /**
+   * @brief Consume n_bytes of data, no copying is performed.
+   *
+   * @return A pair with the beginning pointer and the number of available bytes, which
+   *         may be smaller than requested.
+   */
+  [[nodiscard]] auto Consume(std::size_t n_bytes) noexcept(true) {
+    auto res_size = resource_->Size();
+    auto data = reinterpret_cast<std::byte*>(resource_->Data());
+    auto ptr = data + curr_ptr_;
+
+    // Move the cursor
+    auto aligned_n_bytes = DivRoundUp(n_bytes, IOAlignment()) * IOAlignment();
+    auto aligned_forward = std::min(res_size - curr_ptr_, aligned_n_bytes);
+    std::size_t forward = std::min(res_size - curr_ptr_, n_bytes);
+
+    curr_ptr_ += aligned_forward;
+
+    return std::pair{ptr, forward};
+  }
+
+  template <typename T>
+  [[nodiscard]] auto Consume(T* out) noexcept(false) -> std::enable_if_t<std::is_pod_v<T>, bool> {
+    auto [ptr, size] = this->Consume(sizeof(T));
+    if (size != sizeof(T)) {
+      return false;
+    }
+    CHECK_EQ(reinterpret_cast<std::uintptr_t>(ptr) % std::alignment_of_v<T>, 0);
+    *out = *reinterpret_cast<T*>(ptr);
+    return true;
+  }
+
+  [[nodiscard]] virtual std::size_t Tell() noexcept(true) { return curr_ptr_; }
+  /**
+   * @brief Read n_bytes of data, output is copied into ptr.
+   */
+  [[nodiscard]] std::size_t Read(void* ptr, std::size_t n_bytes) noexcept(true) {
+    auto [res_ptr, forward] = this->Consume(n_bytes);
+    if (forward != 0) {
+      std::memcpy(ptr, res_ptr, forward);
+    }
+    return forward;
+  }
+  /**
+   * @brief Read a primitive type.
+   *
+   * @return Whether the read is successful.
+   */
+  template <typename T>
+  [[nodiscard]] auto Read(T* out) noexcept(false) -> std::enable_if_t<std::is_pod_v<T>, bool> {
+    return this->Consume(out);
+  }
+  /**
+   * @brief Read a vector.
+   *
+   * @return Whether the read is successful.
+   */
+  template <typename T>
+  [[nodiscard]] bool Read(std::vector<T>* out) noexcept(true) {
+    std::uint64_t n{0};
+    if (!this->Consume(&n)) {
+      return false;
+    }
+    out->resize(n);
+
+    auto n_bytes = sizeof(T) * n;
+    if (this->Read(out->data(), n_bytes) != n_bytes) {
+      return false;
+    }
+    return true;
+  }
+
+  virtual ~AlignedResourceReadStream() noexcept(false);
+};
+
 /**
 * @brief Private mmap file as a read-only stream.
 *
 *  It can calculate alignment automatically based on system page size (or allocation
 *  granularity on Windows).
+ *
+ *  The file is required to be aligned by IOAlignment().
 */
-class PrivateMmapConstStream : public MemoryFixSizeBuffer {
-  struct MMAPFile;
-  std::unique_ptr<MMAPFile> handle_;
-
-  char* Open(std::string path, std::size_t offset, std::size_t length);
-
+class PrivateMmapConstStream : public AlignedResourceReadStream {
 public:
  /**
   * @brief Construct a private mmap stream.
@@ -149,11 +383,71 @@ class PrivateMmapConstStream : public MemoryFixSizeBuffer {
   * @param offset    See the `offset` parameter of `mmap` for details.
   * @param length    See the `length` parameter of `mmap` for details.
   */
-  explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length);
-  void Write(void const*, std::size_t) override { LOG(FATAL) << "Read-only stream."; }
-
-  ~PrivateMmapConstStream() override;
+  explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length)
+      : AlignedResourceReadStream{std::make_shared<MmapResource>(path, offset, length)} {}
+  ~PrivateMmapConstStream() noexcept(false) override;
 };
-}  // namespace common
-}  // namespace xgboost
+
+/**
+ * @brief Base class for write stream with alignment defined by IOAlignment().
+ */
+class AlignedWriteStream {
+ protected:
+  [[nodiscard]] virtual std::size_t DoWrite(const void* ptr,
+                                            std::size_t n_bytes) noexcept(true) = 0;
+
+ public:
+  virtual ~AlignedWriteStream() = default;
+
+  [[nodiscard]] std::size_t Write(const void* ptr, std::size_t n_bytes) noexcept(false) {
+    auto aligned_n_bytes = DivRoundUp(n_bytes, IOAlignment()) * IOAlignment();
+    auto w_n_bytes = this->DoWrite(ptr, n_bytes);
+    CHECK_EQ(w_n_bytes, n_bytes);
+    auto remaining = aligned_n_bytes - n_bytes;
+    if (remaining > 0) {
+      std::array<std::uint8_t, IOAlignment()> padding;
+      std::memset(padding.data(), '\0', padding.size());
+      w_n_bytes = this->DoWrite(padding.data(), remaining);
+      CHECK_EQ(w_n_bytes, remaining);
+    }
+    return aligned_n_bytes;
+  }
+
+  template <typename T>
+  [[nodiscard]] std::enable_if_t<std::is_pod_v<T>, std::size_t> Write(T const& v) {
+    return this->Write(&v, sizeof(T));
+  }
+};
+
+/**
+ * @brief Output stream backed by a file. Aligned to IOAlignment() bytes.
+ */
+class AlignedFileWriteStream : public AlignedWriteStream {
+  std::unique_ptr<dmlc::Stream> pimpl_;
+
+ protected:
+  [[nodiscard]] std::size_t DoWrite(const void* ptr, std::size_t n_bytes) noexcept(true) override;
+
+ public:
+  AlignedFileWriteStream() = default;
+  AlignedFileWriteStream(StringView path, StringView flags);
+  ~AlignedFileWriteStream() override = default;
+};
+
+/**
+ * @brief Output stream backed by memory buffer. Aligned to IOAlignment() bytes.
+ */
+class AlignedMemWriteStream : public AlignedFileWriteStream {
+  std::unique_ptr<MemoryBufferStream> pimpl_;
+
+ protected:
+  [[nodiscard]] std::size_t DoWrite(const void* ptr, std::size_t n_bytes) noexcept(true) override;
+
+ public:
+  explicit AlignedMemWriteStream(std::string* p_buf);
+  ~AlignedMemWriteStream() override;
+
+  [[nodiscard]] std::size_t Tell() const noexcept(true);
+};
+}  // namespace xgboost::common
 #endif  // XGBOOST_COMMON_IO_H_
--- a/src/common/ref_resource_view.h
+++ b/src/common/ref_resource_view.h
@@ -0,0 +1,158 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#ifndef XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
+#define XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
+
+#include <algorithm>    // for fill_n
+#include <cstdint>      // for uint64_t
+#include <cstring>      // for memcpy
+#include <memory>       // for shared_ptr, make_shared
+#include <type_traits>  // for is_reference_v, remove_reference_t, is_same_v
+#include <utility>      // for swap, move
+
+#include "io.h"  // for ResourceHandler, AlignedResourceReadStream, MallocResource
+#include "xgboost/logging.h"
+#include "xgboost/span.h"  // for Span
+
+namespace xgboost::common {
+/**
+ * @brief A vector-like type that holds a reference counted resource.
+ *
+ *    The vector size is immutable after construction. This way we can swap the underlying
+ *    resource when needed.
+ */
+template <typename T>
+class RefResourceView {
+  static_assert(!std::is_reference_v<T>);
+
+ public:
+  using value_type = T;             // NOLINT
+  using size_type = std::uint64_t;  // NOLINT
+
+ private:
+  value_type* ptr_{nullptr};
+  size_type size_{0};
+  std::shared_ptr<common::ResourceHandler> mem_{nullptr};
+
+ public:
+  RefResourceView(value_type* ptr, size_type n, std::shared_ptr<common::ResourceHandler> mem)
+      : ptr_{ptr}, size_{n}, mem_{std::move(mem)} {
+    CHECK_GE(mem_->Size(), n);
+  }
+  /**
+   * @brief Construct a view on ptr with length n. The ptr is held by the mem resource.
+   *
+   * @param ptr  The pointer to view.
+   * @param n    The length of the view.
+   * @param mem  The owner of the pointer.
+   * @param init Initialize the view with this value.
+   */
+  RefResourceView(value_type* ptr, size_type n, std::shared_ptr<common::ResourceHandler> mem,
+                  T const& init)
+      : RefResourceView{ptr, n, mem} {
+    if (n != 0) {
+      std::fill_n(ptr_, n, init);
+    }
+  }
+
+  ~RefResourceView() = default;
+
+  RefResourceView() = default;
+  RefResourceView(RefResourceView const& that) = delete;
+  RefResourceView(RefResourceView&& that) = delete;
+  RefResourceView& operator=(RefResourceView const& that) = delete;
+  /**
+   * @brief We allow move assignment for lazy initialization.
+   */
+  RefResourceView& operator=(RefResourceView&& that) = default;
+
+  [[nodiscard]] size_type size() const { return size_; }  // NOLINT
+  [[nodiscard]] size_type size_bytes() const {            // NOLINT
+    return Span{data(), size()}.size_bytes();
+  }
+  [[nodiscard]] value_type* data() { return ptr_; };              // NOLINT
+  [[nodiscard]] value_type const* data() const { return ptr_; };  // NOLINT
+  [[nodiscard]] bool empty() const { return size() == 0; }        // NOLINT
+
+  [[nodiscard]] auto cbegin() const { return data(); }         // NOLINT
+  [[nodiscard]] auto begin() { return data(); }                // NOLINT
+  [[nodiscard]] auto begin() const { return cbegin(); }        // NOLINT
+  [[nodiscard]] auto cend() const { return data() + size(); }  // NOLINT
+  [[nodiscard]] auto end() { return data() + size(); }         // NOLINT
+  [[nodiscard]] auto end() const { return cend(); }            // NOLINT
+
+  [[nodiscard]] auto const& front() const { return data()[0]; }          // NOLINT
+  [[nodiscard]] auto& front() { return data()[0]; }                      // NOLINT
+  [[nodiscard]] auto const& back() const { return data()[size() - 1]; }  // NOLINT
+  [[nodiscard]] auto& back() { return data()[size() - 1]; }              // NOLINT
+
+  [[nodiscard]] value_type& operator[](size_type i) { return ptr_[i]; }
+  [[nodiscard]] value_type const& operator[](size_type i) const { return ptr_[i]; }
+
+  /**
+   * @brief Get the underlying resource.
+   */
+  auto Resource() const { return mem_; }
+};
+
+/**
+ * @brief Read a vector from stream. Accepts both `std::vector` and `RefResourceView`.
+ *
+ *  If the output vector is a referenced counted view, no copying occur.
+ */
+template <typename Vec>
+[[nodiscard]] bool ReadVec(common::AlignedResourceReadStream* fi, Vec* vec) {
+  std::uint64_t n{0};
+  if (!fi->Read(&n)) {
+    return false;
+  }
+  if (n == 0) {
+    return true;
+  }
+
+  using T = typename Vec::value_type;
+  auto expected_bytes = sizeof(T) * n;
+
+  auto [ptr, n_bytes] = fi->Consume(expected_bytes);
+  if (n_bytes != expected_bytes) {
+    return false;
+  }
+
+  if constexpr (std::is_same_v<Vec, RefResourceView<T>>) {
+    *vec = RefResourceView<T>{reinterpret_cast<T*>(ptr), n, fi->Share()};
+  } else {
+    vec->resize(n);
+    std::memcpy(vec->data(), ptr, n_bytes);
+  }
+  return true;
+}
+
+/**
+ * @brief Write a vector to stream. Accepts both `std::vector` and `RefResourceView`.
+ */
+template <typename Vec>
+[[nodiscard]] std::size_t WriteVec(AlignedFileWriteStream* fo, Vec const& vec) {
+  std::size_t bytes{0};
+  auto n = static_cast<std::uint64_t>(vec.size());
+  bytes += fo->Write(n);
+  if (n == 0) {
+    return sizeof(n);
+  }
+
+  using T = typename std::remove_reference_t<decltype(vec)>::value_type;
+  bytes += fo->Write(vec.data(), vec.size() * sizeof(T));
+
+  return bytes;
+}
+
+/**
+ * @brief Make a fixed size `RefResourceView` with malloc resource.
+ */
+template <typename T>
+[[nodiscard]] RefResourceView<T> MakeFixedVecWithMalloc(std::size_t n_elements, T const& init) {
+  auto resource = std::make_shared<common::MallocResource>(n_elements * sizeof(T));
+  return RefResourceView{resource->DataAs<T>(), n_elements, resource, init};
+}
+}  // namespace xgboost::common
+#endif  // XGBOOST_COMMON_REF_RESOURCE_VIEW_H_
--- a/src/data/adapter.h
+++ b/src/data/adapter.h
@@ -7,7 +7,7 @@
 #include <dmlc/data.h>

 #include <algorithm>
-#include <cstddef>  // std::size_t
+#include <cstddef>  // for size_t
 #include <functional>
 #include <limits>
 #include <map>
@@ -17,6 +17,7 @@
 #include <vector>

 #include "../c_api/c_api_error.h"
+#include "../common/error_msg.h"  // for MaxFeatureSize
 #include "../common/math.h"
 #include "array_interface.h"
 #include "arrow-cdi.h"
@@ -300,9 +301,9 @@ class ArrayAdapter : public detail::SingleBatchDataIter<ArrayAdapterBatch> {
    array_interface_ = ArrayInterface<2>(get<Object const>(j));
    batch_ = ArrayAdapterBatch{array_interface_};
  }
-  ArrayAdapterBatch const& Value() const override { return batch_; }
-  size_t NumRows() const { return array_interface_.Shape(0); }
-  size_t NumColumns() const { return array_interface_.Shape(1); }
+  [[nodiscard]] ArrayAdapterBatch const& Value() const override { return batch_; }
+  [[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); }
+  [[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape(1); }

 private:
  ArrayAdapterBatch batch_;
--- a/src/data/ellpack_page_raw_format.cu
+++ b/src/data/ellpack_page_raw_format.cu
@@ -1,60 +1,59 @@
-/*!
- * Copyright 2019-2021 XGBoost contributors
+/**
+ * Copyright 2019-2023, XGBoost contributors
 */
-#include <xgboost/data.h>
 #include <dmlc/registry.h>

+#include <cstddef>  // for size_t
+
+#include "../common/io.h"                 // for AlignedResourceReadStream, AlignedFileWriteStream
+#include "../common/ref_resource_view.h"  // for ReadVec, WriteVec
 #include "ellpack_page.cuh"
-#include "sparse_page_writer.h"
-#include "histogram_cut_format.h"
-
-namespace xgboost {
-namespace data {
+#include "histogram_cut_format.h"  // for ReadHistogramCuts, WriteHistogramCuts
+#include "sparse_page_writer.h"    // for SparsePageFormat

+namespace xgboost::data {
 DMLC_REGISTRY_FILE_TAG(ellpack_page_raw_format);

-
 class EllpackPageRawFormat : public SparsePageFormat<EllpackPage> {
 public:
-  bool Read(EllpackPage* page, dmlc::SeekStream* fi) override {
+  bool Read(EllpackPage* page, common::AlignedResourceReadStream* fi) override {
    auto* impl = page->Impl();
    if (!ReadHistogramCuts(&impl->Cuts(), fi)) {
      return false;
    }
-    fi->Read(&impl->n_rows);
-    fi->Read(&impl->is_dense);
-    fi->Read(&impl->row_stride);
-    fi->Read(&impl->gidx_buffer.HostVector());
+    if (!fi->Read(&impl->n_rows)) {
+      return false;
+    }
+    if (!fi->Read(&impl->is_dense)) {
+      return false;
+    }
+    if (!fi->Read(&impl->row_stride)) {
+      return false;
+    }
+    if (!common::ReadVec(fi, &impl->gidx_buffer.HostVector())) {
+      return false;
+    }
    if (!fi->Read(&impl->base_rowid)) {
      return false;
    }
    return true;
  }

-  size_t Write(const EllpackPage& page, dmlc::Stream* fo) override {
-    size_t bytes = 0;
+  size_t Write(const EllpackPage& page, common::AlignedFileWriteStream* fo) override {
+    std::size_t bytes{0};
    auto* impl = page.Impl();
    bytes += WriteHistogramCuts(impl->Cuts(), fo);
-    fo->Write(impl->n_rows);
-    bytes += sizeof(impl->n_rows);
-    fo->Write(impl->is_dense);
-    bytes += sizeof(impl->is_dense);
-    fo->Write(impl->row_stride);
-    bytes += sizeof(impl->row_stride);
+    bytes += fo->Write(impl->n_rows);
+    bytes += fo->Write(impl->is_dense);
+    bytes += fo->Write(impl->row_stride);
    CHECK(!impl->gidx_buffer.ConstHostVector().empty());
-    fo->Write(impl->gidx_buffer.HostVector());
-    bytes += impl->gidx_buffer.ConstHostSpan().size_bytes() + sizeof(uint64_t);
-    fo->Write(impl->base_rowid);
-    bytes += sizeof(impl->base_rowid);
+    bytes += common::WriteVec(fo, impl->gidx_buffer.HostVector());
+    bytes += fo->Write(impl->base_rowid);
    return bytes;
  }
 };

 XGBOOST_REGISTER_ELLPACK_PAGE_FORMAT(raw)
    .describe("Raw ELLPACK binary data format.")
-    .set_body([]() {
-      return new EllpackPageRawFormat();
-    });
-
-}  // namespace data
-}  // namespace xgboost
+    .set_body([]() { return new EllpackPageRawFormat(); });
+}  // namespace xgboost::data
--- a/src/data/gradient_index.cc
+++ b/src/data/gradient_index.cc
@@ -29,7 +29,7 @@ GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_
  cut = common::SketchOnDMatrix(ctx, p_fmat, max_bins_per_feat, sorted_sketch, hess);

  const uint32_t nbins = cut.Ptrs().back();
-  hit_count.resize(nbins, 0);
+  hit_count = common::MakeFixedVecWithMalloc(nbins, std::size_t{0});
  hit_count_tloc_.resize(ctx->Threads() * nbins, 0);

  size_t new_size = 1;
@@ -37,8 +37,7 @@ GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_
    new_size += batch.Size();
  }

-  row_ptr.resize(new_size);
-  row_ptr[0] = 0;
+  row_ptr = common::MakeFixedVecWithMalloc(new_size, std::size_t{0});

  const bool isDense = p_fmat->IsDense();
  this->isDense_ = isDense;
@@ -61,8 +60,8 @@ GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_

 GHistIndexMatrix::GHistIndexMatrix(MetaInfo const &info, common::HistogramCuts &&cuts,
                                   bst_bin_t max_bin_per_feat)
-    : row_ptr(info.num_row_ + 1, 0),
-      hit_count(cuts.TotalBins(), 0),
+    : row_ptr{common::MakeFixedVecWithMalloc(info.num_row_ + 1, std::size_t{0})},
+      hit_count{common::MakeFixedVecWithMalloc(cuts.TotalBins(), std::size_t{0})},
      cut{std::forward<common::HistogramCuts>(cuts)},
      max_numeric_bins_per_feat(max_bin_per_feat),
      isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}
@@ -95,12 +94,10 @@ GHistIndexMatrix::GHistIndexMatrix(SparsePage const &batch, common::Span<Feature
      isDense_{isDense} {
  CHECK_GE(n_threads, 1);
  CHECK_EQ(row_ptr.size(), 0);
-  // The number of threads is pegged to the batch size. If the OMP
-  // block is parallelized on anything other than the batch/block size,
-  // it should be reassigned
-  row_ptr.resize(batch.Size() + 1, 0);
+  row_ptr = common::MakeFixedVecWithMalloc(batch.Size() + 1, std::size_t{0});
+
  const uint32_t nbins = cut.Ptrs().back();
-  hit_count.resize(nbins, 0);
+  hit_count = common::MakeFixedVecWithMalloc(nbins, std::size_t{0});
  hit_count_tloc_.resize(n_threads * nbins, 0);

  this->PushBatch(batch, ft, n_threads);
@@ -128,20 +125,45 @@ INSTANTIATION_PUSH(data::SparsePageAdapterBatch)
 #undef INSTANTIATION_PUSH

 void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
+  auto make_index = [this, n_index](auto t, common::BinTypeSize t_size) {
+    // Must resize instead of allocating a new one. This function is called everytime a
+    // new batch is pushed, and we grow the size accordingly without loosing the data the
+    // previous batches.
+    using T = decltype(t);
+    std::size_t n_bytes = sizeof(T) * n_index;
+    CHECK_GE(n_bytes, this->data.size());
+
+    auto resource = this->data.Resource();
+    decltype(this->data) new_vec;
+    if (!resource) {
+      CHECK(this->data.empty());
+      new_vec = common::MakeFixedVecWithMalloc(n_bytes, std::uint8_t{0});
+    } else {
+      CHECK(resource->Type() == common::ResourceHandler::kMalloc);
+      auto malloc_resource = std::dynamic_pointer_cast<common::MallocResource>(resource);
+      CHECK(malloc_resource);
+      malloc_resource->Resize(n_bytes);
+
+      // gcc-11.3 doesn't work if DataAs is used.
+      std::uint8_t *new_ptr = reinterpret_cast<std::uint8_t *>(malloc_resource->Data());
+      new_vec = {new_ptr, n_bytes / sizeof(std::uint8_t), malloc_resource};
+    }
+    this->data = std::move(new_vec);
+    this->index = common::Index{common::Span{data.data(), data.size()}, t_size};
+  };
+
  if ((MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) &&
      isDense) {
    // compress dense index to uint8
-    index.SetBinTypeSize(common::kUint8BinsTypeSize);
-    index.Resize((sizeof(uint8_t)) * n_index);
+    make_index(std::uint8_t{}, common::kUint8BinsTypeSize);
  } else if ((MaxNumBinPerFeat() - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
              MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
             isDense) {
    // compress dense index to uint16
-    index.SetBinTypeSize(common::kUint16BinsTypeSize);
-    index.Resize((sizeof(uint16_t)) * n_index);
+    make_index(std::uint16_t{}, common::kUint16BinsTypeSize);
  } else {
-    index.SetBinTypeSize(common::kUint32BinsTypeSize);
-    index.Resize((sizeof(uint32_t)) * n_index);
+    // no compression
+    make_index(std::uint32_t{}, common::kUint32BinsTypeSize);
  }
 }

@@ -214,11 +236,11 @@ float GHistIndexMatrix::GetFvalue(std::vector<std::uint32_t> const &ptrs,
  return std::numeric_limits<float>::quiet_NaN();
 }

-bool GHistIndexMatrix::ReadColumnPage(dmlc::SeekStream *fi) {
+bool GHistIndexMatrix::ReadColumnPage(common::AlignedResourceReadStream *fi) {
  return this->columns_->Read(fi, this->cut.Ptrs().data());
 }

-size_t GHistIndexMatrix::WriteColumnPage(dmlc::Stream *fo) const {
+std::size_t GHistIndexMatrix::WriteColumnPage(common::AlignedFileWriteStream *fo) const {
  return this->columns_->Write(fo);
 }
 }  // namespace xgboost
--- a/src/data/gradient_index.cu
+++ b/src/data/gradient_index.cu
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2022 by XGBoost Contributors
+/**
+ * Copyright 2022-2023, XGBoost Contributors
 */
 #include <memory>  // std::unique_ptr

@@ -41,9 +41,9 @@ void SetIndexData(Context const* ctx, EllpackPageImpl const* page,
 }

 void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page,
-                          std::vector<size_t>* p_out) {
+                          common::RefResourceView<std::size_t>* p_out) {
  auto& row_ptr = *p_out;
-  row_ptr.resize(page->Size() + 1, 0);
+  row_ptr = common::MakeFixedVecWithMalloc(page->Size() + 1, std::size_t{0});
  if (page->is_dense) {
    std::fill(row_ptr.begin() + 1, row_ptr.end(), page->row_stride);
  } else {
@@ -95,7 +95,7 @@ GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info,
        ctx, page, &hit_count_tloc_, [&](auto bin_idx, auto) { return bin_idx; }, this);
  }

-  this->hit_count.resize(n_bins_total, 0);
+  this->hit_count = common::MakeFixedVecWithMalloc(n_bins_total, std::size_t{0});
  this->GatherHitCount(ctx->Threads(), n_bins_total);

  // sanity checks
--- a/src/data/gradient_index.h
+++ b/src/data/gradient_index.h
@@ -9,13 +9,14 @@
 #include <atomic>     // for atomic
 #include <cinttypes>  // for uint32_t
 #include <cstddef>    // for size_t
-#include <memory>
+#include <memory>     // for make_unique
 #include <vector>

 #include "../common/categorical.h"
 #include "../common/error_msg.h"  // for InfInData
 #include "../common/hist_util.h"
 #include "../common/numeric.h"
+#include "../common/ref_resource_view.h"  // for RefResourceView
 #include "../common/threading_utils.h"
 #include "../common/transform_iterator.h"  // for MakeIndexTransformIter
 #include "adapter.h"
@@ -25,9 +26,11 @@
 namespace xgboost {
 namespace common {
 class ColumnMatrix;
+class AlignedFileWriteStream;
 }  // namespace common
-/*!
- * \brief preprocessed global index matrix, in CSR format
+
+/**
+ * @brief preprocessed global index matrix, in CSR format.
 *
 *  Transform floating values to integer index in histogram This is a global histogram
 *  index for CPU histogram.  On GPU ellpack page is used.
@@ -133,20 +136,22 @@ class GHistIndexMatrix {
  }

 public:
-  /*! \brief row pointer to rows by element position */
-  std::vector<size_t> row_ptr;
-  /*! \brief The index data */
+  /** @brief row pointer to rows by element position */
+  common::RefResourceView<std::size_t> row_ptr;
+  /** @brief data storage for index. */
+  common::RefResourceView<std::uint8_t> data;
+  /** @brief The histogram index. */
  common::Index index;
-  /*! \brief hit count of each index, used for constructing the ColumnMatrix */
-  std::vector<size_t> hit_count;
-  /*! \brief The corresponding cuts */
+  /** @brief hit count of each index, used for constructing the ColumnMatrix */
+  common::RefResourceView<std::size_t> hit_count;
+  /** @brief The corresponding cuts */
  common::HistogramCuts cut;
-  /** \brief max_bin for each feature. */
+  /** @brief max_bin for each feature. */
  bst_bin_t max_numeric_bins_per_feat;
-  /*! \brief base row index for current page (used by external memory) */
-  size_t base_rowid{0};
+  /** @brief base row index for current page (used by external memory) */
+  bst_row_t base_rowid{0};

-  bst_bin_t MaxNumBinPerFeat() const {
+  [[nodiscard]] bst_bin_t MaxNumBinPerFeat() const {
    return std::max(static_cast<bst_bin_t>(cut.MaxCategory() + 1), max_numeric_bins_per_feat);
  }

@@ -218,29 +223,27 @@ class GHistIndexMatrix {
    }
  }

-  bool IsDense() const {
-    return isDense_;
-  }
+  [[nodiscard]] bool IsDense() const { return isDense_; }
  void SetDense(bool is_dense) { isDense_ = is_dense; }
  /**
-   * \brief Get the local row index.
+   * @brief Get the local row index.
   */
-  size_t RowIdx(size_t ridx) const { return row_ptr[ridx - base_rowid]; }
+  [[nodiscard]] std::size_t RowIdx(size_t ridx) const { return row_ptr[ridx - base_rowid]; }

-  bst_row_t Size() const { return row_ptr.empty() ? 0 : row_ptr.size() - 1; }
-  bst_feature_t Features() const { return cut.Ptrs().size() - 1; }
+  [[nodiscard]] bst_row_t Size() const { return row_ptr.empty() ? 0 : row_ptr.size() - 1; }
+  [[nodiscard]] bst_feature_t Features() const { return cut.Ptrs().size() - 1; }

-  bool ReadColumnPage(dmlc::SeekStream* fi);
-  size_t WriteColumnPage(dmlc::Stream* fo) const;
+  [[nodiscard]] bool ReadColumnPage(common::AlignedResourceReadStream* fi);
+  [[nodiscard]] std::size_t WriteColumnPage(common::AlignedFileWriteStream* fo) const;

-  common::ColumnMatrix const& Transpose() const;
+  [[nodiscard]] common::ColumnMatrix const& Transpose() const;

-  bst_bin_t GetGindex(size_t ridx, size_t fidx) const;
+  [[nodiscard]] bst_bin_t GetGindex(size_t ridx, size_t fidx) const;

-  float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const;
-  float GetFvalue(std::vector<std::uint32_t> const& ptrs, std::vector<float> const& values,
-                  std::vector<float> const& mins, bst_row_t ridx, bst_feature_t fidx,
-                  bool is_cat) const;
+  [[nodiscard]] float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const;
+  [[nodiscard]] float GetFvalue(std::vector<std::uint32_t> const& ptrs,
+                                std::vector<float> const& values, std::vector<float> const& mins,
+                                bst_row_t ridx, bst_feature_t fidx, bool is_cat) const;

 private:
  std::unique_ptr<common::ColumnMatrix> columns_;
@@ -294,5 +297,5 @@ void AssignColumnBinIndex(GHistIndexMatrix const& page, Fn&& assign) {
    }
  });
 }
-}      // namespace xgboost
+}  // namespace xgboost
 #endif  // XGBOOST_DATA_GRADIENT_INDEX_H_
--- a/src/data/gradient_index_format.cc
+++ b/src/data/gradient_index_format.cc
@@ -1,38 +1,49 @@
-/*!
- * Copyright 2021-2022 XGBoost contributors
+/**
+ * Copyright 2021-2023 XGBoost contributors
 */
-#include "sparse_page_writer.h"
-#include "gradient_index.h"
-#include "histogram_cut_format.h"
+#include <cstddef>      // for size_t
+#include <cstdint>      // for uint8_t
+#include <type_traits>  // for underlying_type_t
+#include <vector>       // for vector

-namespace xgboost {
-namespace data {
+#include "../common/io.h"                 // for AlignedResourceReadStream
+#include "../common/ref_resource_view.h"  // for ReadVec, WriteVec
+#include "gradient_index.h"               // for GHistIndexMatrix
+#include "histogram_cut_format.h"         // for ReadHistogramCuts
+#include "sparse_page_writer.h"           // for SparsePageFormat
+
+namespace xgboost::data {
 class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
 public:
-  bool Read(GHistIndexMatrix* page, dmlc::SeekStream* fi) override {
+  bool Read(GHistIndexMatrix* page, common::AlignedResourceReadStream* fi) override {
+    CHECK(fi);
+
    if (!ReadHistogramCuts(&page->cut, fi)) {
      return false;
    }
+
    // indptr
-    fi->Read(&page->row_ptr);
-    // data
-    std::vector<uint8_t> data;
-    if (!fi->Read(&data)) {
+    if (!common::ReadVec(fi, &page->row_ptr)) {
      return false;
    }
-    page->index.Resize(data.size());
-    std::copy(data.cbegin(), data.cend(), page->index.begin());
-    // bin type
+
+    // data
+    // - bin type
    // Old gcc doesn't support reading from enum.
    std::underlying_type_t<common::BinTypeSize> uint_bin_type{0};
    if (!fi->Read(&uint_bin_type)) {
      return false;
    }
-    common::BinTypeSize size_type =
-        static_cast<common::BinTypeSize>(uint_bin_type);
-    page->index.SetBinTypeSize(size_type);
+    common::BinTypeSize size_type = static_cast<common::BinTypeSize>(uint_bin_type);
+    // - index buffer
+    if (!common::ReadVec(fi, &page->data)) {
+      return false;
+    }
+    // - index
+    page->index = common::Index{common::Span{page->data.data(), page->data.size()}, size_type};
+
    // hit count
-    if (!fi->Read(&page->hit_count)) {
+    if (!common::ReadVec(fi, &page->hit_count)) {
      return false;
    }
    if (!fi->Read(&page->max_numeric_bins_per_feat)) {
@@ -50,38 +61,34 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
      page->index.SetBinOffset(page->cut.Ptrs());
    }

-    page->ReadColumnPage(fi);
+    if (!page->ReadColumnPage(fi)) {
+      return false;
+    }
    return true;
  }

-  size_t Write(GHistIndexMatrix const &page, dmlc::Stream *fo) override {
-    size_t bytes = 0;
+  std::size_t Write(GHistIndexMatrix const& page, common::AlignedFileWriteStream* fo) override {
+    CHECK_NE(page.index.Size(), 0) << "Empty page is not supported.";
+    std::size_t bytes = 0;
    bytes += WriteHistogramCuts(page.cut, fo);
    // indptr
-    fo->Write(page.row_ptr);
-    bytes += page.row_ptr.size() * sizeof(decltype(page.row_ptr)::value_type) +
-             sizeof(uint64_t);
+    bytes += common::WriteVec(fo, page.row_ptr);
+
    // data
-    std::vector<uint8_t> data(page.index.begin(), page.index.end());
-    fo->Write(data);
-    bytes += data.size() * sizeof(decltype(data)::value_type) + sizeof(uint64_t);
-    // bin type
-    std::underlying_type_t<common::BinTypeSize> uint_bin_type =
-        page.index.GetBinTypeSize();
-    fo->Write(uint_bin_type);
-    bytes += sizeof(page.index.GetBinTypeSize());
+    // - bin type
+    std::underlying_type_t<common::BinTypeSize> uint_bin_type = page.index.GetBinTypeSize();
+    bytes += fo->Write(uint_bin_type);
+    // - index buffer
+    std::vector<std::uint8_t> data(page.index.begin(), page.index.end());
+    bytes += fo->Write(static_cast<std::uint64_t>(data.size()));
+    bytes += fo->Write(data.data(), data.size());
+
    // hit count
-    fo->Write(page.hit_count);
-    bytes +=
-        page.hit_count.size() * sizeof(decltype(page.hit_count)::value_type) +
-        sizeof(uint64_t);
+    bytes += common::WriteVec(fo, page.hit_count);
    // max_bins, base row, is_dense
-    fo->Write(page.max_numeric_bins_per_feat);
-    bytes += sizeof(page.max_numeric_bins_per_feat);
-    fo->Write(page.base_rowid);
-    bytes += sizeof(page.base_rowid);
-    fo->Write(page.IsDense());
-    bytes += sizeof(page.IsDense());
+    bytes += fo->Write(page.max_numeric_bins_per_feat);
+    bytes += fo->Write(page.base_rowid);
+    bytes += fo->Write(page.IsDense());

    bytes += page.WriteColumnPage(fo);
    return bytes;
@@ -93,6 +100,4 @@ DMLC_REGISTRY_FILE_TAG(gradient_index_format);
 XGBOOST_REGISTER_GHIST_INDEX_PAGE_FORMAT(raw)
    .describe("Raw GHistIndex binary data format.")
    .set_body([]() { return new GHistIndexRawFormat(); });
-
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
--- a/src/data/gradient_index_page_source.cc
+++ b/src/data/gradient_index_page_source.cc
@@ -1,10 +1,9 @@
-/*!
- * Copyright 2021-2022 by XGBoost Contributors
+/**
+ * Copyright 2021-2023, XGBoost Contributors
 */
 #include "gradient_index_page_source.h"

-namespace xgboost {
-namespace data {
+namespace xgboost::data {
 void GradientIndexPageSource::Fetch() {
  if (!this->ReadCache()) {
    if (count_ != 0 && !sync_) {
@@ -21,5 +20,4 @@ void GradientIndexPageSource::Fetch() {
    this->WriteCache();
  }
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
--- a/src/data/histogram_cut_format.h
+++ b/src/data/histogram_cut_format.h
@@ -1,36 +1,38 @@
-/*!
- * Copyright 2021 XGBoost contributors
+/**
+ * Copyright 2021-2023, XGBoost contributors
 */
 #ifndef XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
 #define XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_

-#include "../common/hist_util.h"
+#include <dmlc/io.h>  // for Stream

-namespace xgboost {
-namespace data {
-inline bool ReadHistogramCuts(common::HistogramCuts *cuts, dmlc::SeekStream *fi) {
-  if (!fi->Read(&cuts->cut_values_.HostVector())) {
+#include <cstddef>  // for size_t
+
+#include "../common/hist_util.h"          // for HistogramCuts
+#include "../common/io.h"                 // for AlignedResourceReadStream, AlignedFileWriteStream
+#include "../common/ref_resource_view.h"  // for WriteVec, ReadVec
+
+namespace xgboost::data {
+inline bool ReadHistogramCuts(common::HistogramCuts *cuts, common::AlignedResourceReadStream *fi) {
+  if (!common::ReadVec(fi, &cuts->cut_values_.HostVector())) {
    return false;
  }
-  if (!fi->Read(&cuts->cut_ptrs_.HostVector())) {
+  if (!common::ReadVec(fi, &cuts->cut_ptrs_.HostVector())) {
    return false;
  }
-  if (!fi->Read(&cuts->min_vals_.HostVector())) {
+  if (!common::ReadVec(fi, &cuts->min_vals_.HostVector())) {
    return false;
  }
  return true;
 }

-inline size_t WriteHistogramCuts(common::HistogramCuts const &cuts, dmlc::Stream *fo) {
-  size_t bytes = 0;
-  fo->Write(cuts.cut_values_.ConstHostVector());
-  bytes += cuts.cut_values_.ConstHostSpan().size_bytes() + sizeof(uint64_t);
-  fo->Write(cuts.cut_ptrs_.ConstHostVector());
-  bytes += cuts.cut_ptrs_.ConstHostSpan().size_bytes() + sizeof(uint64_t);
-  fo->Write(cuts.min_vals_.ConstHostVector());
-  bytes += cuts.min_vals_.ConstHostSpan().size_bytes() + sizeof(uint64_t);
+inline std::size_t WriteHistogramCuts(common::HistogramCuts const &cuts,
+                                      common::AlignedFileWriteStream *fo) {
+  std::size_t bytes = 0;
+  bytes += common::WriteVec(fo, cuts.Values());
+  bytes += common::WriteVec(fo, cuts.Ptrs());
+  bytes += common::WriteVec(fo, cuts.MinValues());
  return bytes;
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
 #endif  // XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_
--- a/src/data/iterative_dmatrix.cc
+++ b/src/data/iterative_dmatrix.cc
@@ -240,9 +240,9 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p,
   * Generate gradient index.
   */
  this->ghist_ = std::make_unique<GHistIndexMatrix>(Info(), std::move(cuts), p.max_bin);
-  size_t rbegin = 0;
-  size_t prev_sum = 0;
-  size_t i = 0;
+  std::size_t rbegin = 0;
+  std::size_t prev_sum = 0;
+  std::size_t i = 0;
  while (iter.Next()) {
    HostAdapterDispatch(proxy, [&](auto const& batch) {
      proxy->Info().num_nonzero_ = batch_nnz[i];
--- a/src/data/iterative_dmatrix.cu
+++ b/src/data/iterative_dmatrix.cu
@@ -31,10 +31,10 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
  dh::XGBCachingDeviceAllocator<char> alloc;

  auto num_rows = [&]() {
-    return Dispatch(proxy, [](auto const& value) { return value.NumRows(); });
+    return cuda_impl::Dispatch(proxy, [](auto const& value) { return value.NumRows(); });
  };
  auto num_cols = [&]() {
-    return Dispatch(proxy, [](auto const& value) { return value.NumCols(); });
+    return cuda_impl::Dispatch(proxy, [](auto const& value) { return value.NumCols(); });
  };

  size_t row_stride = 0;
@@ -74,7 +74,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
                                     get_device());
      auto* p_sketch = &sketch_containers.back();
      proxy->Info().weights_.SetDevice(get_device());
-      Dispatch(proxy, [&](auto const& value) {
+      cuda_impl::Dispatch(proxy, [&](auto const& value) {
        common::AdapterDeviceSketch(value, p.max_bin, proxy->Info(), missing, p_sketch);
      });
    }
@@ -82,7 +82,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
    accumulated_rows += batch_rows;
    dh::device_vector<size_t> row_counts(batch_rows + 1, 0);
    common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
-    row_stride = std::max(row_stride, Dispatch(proxy, [=](auto const& value) {
+    row_stride = std::max(row_stride, cuda_impl::Dispatch(proxy, [=](auto const& value) {
                            return GetRowCounts(value, row_counts_span, get_device(), missing);
                          }));
    nnz += thrust::reduce(thrust::cuda::par(alloc), row_counts.begin(), row_counts.end());
@@ -136,14 +136,14 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
    auto rows = num_rows();
    dh::device_vector<size_t> row_counts(rows + 1, 0);
    common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
-    Dispatch(proxy, [=](auto const& value) {
+    cuda_impl::Dispatch(proxy, [=](auto const& value) {
      return GetRowCounts(value, row_counts_span, get_device(), missing);
    });
    auto is_dense = this->IsDense();

    proxy->Info().feature_types.SetDevice(get_device());
    auto d_feature_types = proxy->Info().feature_types.ConstDeviceSpan();
-    auto new_impl = Dispatch(proxy, [&](auto const& value) {
+    auto new_impl = cuda_impl::Dispatch(proxy, [&](auto const& value) {
      return EllpackPageImpl(value, missing, get_device(), is_dense, row_counts_span,
                             d_feature_types, row_stride, rows, cuts);
    });
--- a/src/data/proxy_dmatrix.cc
+++ b/src/data/proxy_dmatrix.cc
@@ -1,14 +1,13 @@
-/*!
- * Copyright 2021 by Contributors
+/**
+ * Copyright 2021-2023, XGBoost Contributors
 * \file proxy_dmatrix.cc
 */

 #include "proxy_dmatrix.h"

-namespace xgboost {
-namespace data {
-void DMatrixProxy::SetArrayData(char const *c_interface) {
-  std::shared_ptr<ArrayAdapter> adapter{new ArrayAdapter(StringView{c_interface})};
+namespace xgboost::data {
+void DMatrixProxy::SetArrayData(StringView interface_str) {
+  std::shared_ptr<ArrayAdapter> adapter{new ArrayAdapter{interface_str}};
  this->batch_ = adapter;
  this->Info().num_col_ = adapter->NumColumns();
  this->Info().num_row_ = adapter->NumRows();
@@ -25,5 +24,36 @@ void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices,
  this->Info().num_row_ = adapter->NumRows();
  this->ctx_.gpu_id = Context::kCpuId;
 }
-}  // namespace data
-}  // namespace xgboost
+
+namespace cuda_impl {
+std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
+                                                std::shared_ptr<DMatrixProxy> proxy, float missing);
+#if !defined(XGBOOST_USE_CUDA)
+std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *, std::shared_ptr<DMatrixProxy>,
+                                                float) {
+  return nullptr;
+}
+#endif  // XGBOOST_USE_CUDA
+}  // namespace cuda_impl
+
+std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
+                                                std::shared_ptr<DMatrixProxy> proxy,
+                                                float missing) {
+  bool type_error{false};
+  std::shared_ptr<DMatrix> p_fmat{nullptr};
+  if (proxy->Ctx()->IsCPU()) {
+    p_fmat = data::HostAdapterDispatch<false>(
+        proxy.get(),
+        [&](auto const &adapter) {
+          auto p_fmat =
+              std::shared_ptr<DMatrix>(DMatrix::Create(adapter.get(), missing, ctx->Threads()));
+          return p_fmat;
+        },
+        &type_error);
+  } else {
+    p_fmat = cuda_impl::CreateDMatrixFromProxy(ctx, proxy, missing);
+  }
+
+  return p_fmat;
+}
+}  // namespace xgboost::data
--- a/src/data/proxy_dmatrix.cu
+++ b/src/data/proxy_dmatrix.cu
@@ -1,12 +1,11 @@
-/*!
- * Copyright 2020-2022, XGBoost contributors
+/**
+ * Copyright 2020-2023, XGBoost contributors
 */
-#include "proxy_dmatrix.h"
 #include "device_adapter.cuh"
+#include "proxy_dmatrix.cuh"
+#include "proxy_dmatrix.h"

-namespace xgboost {
-namespace data {
-
+namespace xgboost::data {
 void DMatrixProxy::FromCudaColumnar(StringView interface_str) {
  std::shared_ptr<data::CudfAdapter> adapter{new CudfAdapter{interface_str}};
  auto const& value = adapter->Value();
@@ -31,5 +30,15 @@ void DMatrixProxy::FromCudaArray(StringView interface_str) {
    ctx_.gpu_id = dh::CurrentDevice();
  }
 }
-}  // namespace data
-}  // namespace xgboost
+
+namespace cuda_impl {
+std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const* ctx,
+                                                std::shared_ptr<DMatrixProxy> proxy,
+                                                float missing) {
+  return Dispatch<false>(proxy.get(), [&](auto const& adapter) {
+    auto p_fmat = std::shared_ptr<DMatrix>{DMatrix::Create(adapter.get(), missing, ctx->Threads())};
+    return p_fmat;
+  });
+}
+}  // namespace cuda_impl
+}  // namespace xgboost::data
--- a/src/data/proxy_dmatrix.cuh
+++ b/src/data/proxy_dmatrix.cuh
@@ -6,19 +6,34 @@
 #include "device_adapter.cuh"
 #include "proxy_dmatrix.h"

-namespace xgboost::data {
-template <typename Fn>
+namespace xgboost::data::cuda_impl {
+template <bool get_value = true, typename Fn>
 decltype(auto) Dispatch(DMatrixProxy const* proxy, Fn fn) {
  if (proxy->Adapter().type() == typeid(std::shared_ptr<CupyAdapter>)) {
-    auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter())->Value();
-    return fn(value);
+    if constexpr (get_value) {
+      auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter())->Value();
+      return fn(value);
+    } else {
+      auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter());
+      return fn(value);
+    }
  } else if (proxy->Adapter().type() == typeid(std::shared_ptr<CudfAdapter>)) {
-    auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
-    return fn(value);
+    if constexpr (get_value) {
+      auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
+      return fn(value);
+    } else {
+      auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter());
+      return fn(value);
+    }
  } else {
    LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
-    auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
-    return fn(value);
+    if constexpr (get_value) {
+      auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
+      return fn(value);
+    } else {
+      auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter());
+      return fn(value);
+    }
  }
 }
-}  // namespace xgboost::data
+}  // namespace xgboost::data::cuda_impl
--- a/src/data/proxy_dmatrix.h
+++ b/src/data/proxy_dmatrix.h
@@ -62,7 +62,7 @@ class DMatrixProxy : public DMatrix {
 #endif  // defined(XGBOOST_USE_CUDA)
  }

-  void SetArrayData(char const* c_interface);
+  void SetArrayData(StringView interface_str);
  void SetCSRData(char const* c_indptr, char const* c_indices, char const* c_values,
                  bst_feature_t n_features, bool on_host);

@@ -114,28 +114,62 @@ inline DMatrixProxy* MakeProxy(DMatrixHandle proxy) {
  return typed;
 }

-template <typename Fn>
+/**
+ * @brief Dispatch function call based on input type.
+ *
+ * @tparam get_value Whether the funciton Fn accept an adapter batch or the adapter itself.
+ * @tparam Fn        The type of the function to be dispatched.
+ *
+ * @param proxy The proxy object holding the reference to the input.
+ * @param fn    The function to be dispatched.
+ * @param type_error[out] Set to ture if it's not null and the input data is not recognized by
+ *                        the host.
+ *
+ * @return The return value of the function being dispatched.
+ */
+template <bool get_value = true, typename Fn>
 decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_error = nullptr) {
  if (proxy->Adapter().type() == typeid(std::shared_ptr<CSRArrayAdapter>)) {
-    auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter())->Value();
+    if constexpr (get_value) {
+      auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter())->Value();
+      return fn(value);
+    } else {
+      auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter());
+      return fn(value);
+    }
    if (type_error) {
      *type_error = false;
    }
-    return fn(value);
  } else if (proxy->Adapter().type() == typeid(std::shared_ptr<ArrayAdapter>)) {
-    auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter())->Value();
+    if constexpr (get_value) {
+      auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter())->Value();
+      return fn(value);
+    } else {
+      auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter());
+      return fn(value);
+    }
    if (type_error) {
      *type_error = false;
    }
-    return fn(value);
  } else {
    if (type_error) {
      *type_error = true;
    } else {
      LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
    }
-    return std::result_of_t<Fn(decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
+    if constexpr (get_value) {
+      return std::result_of_t<Fn(
+          decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
+    } else {
+      return std::result_of_t<Fn(decltype(std::declval<std::shared_ptr<ArrayAdapter>>()))>();
+    }
  }
 }
+
+/**
+ * @brief Create a `SimpleDMatrix` instance from a `DMatrixProxy`.
+ */
+std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const* ctx,
+                                                std::shared_ptr<DMatrixProxy> proxy, float missing);
 }  // namespace xgboost::data
 #endif  // XGBOOST_DATA_PROXY_DMATRIX_H_
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -21,8 +21,7 @@
 #include "xgboost/c_api.h"
 #include "xgboost/data.h"

-namespace xgboost {
-namespace data {
+namespace xgboost::data {
 MetaInfo& SimpleDMatrix::Info() { return info_; }

 const MetaInfo& SimpleDMatrix::Info() const { return info_; }
@@ -97,6 +96,10 @@ BatchSet<SparsePage> SimpleDMatrix::GetRowBatches() {
 BatchSet<CSCPage> SimpleDMatrix::GetColumnBatches(Context const* ctx) {
  // column page doesn't exist, generate it
  if (!column_page_) {
+    auto n = std::numeric_limits<decltype(Entry::index)>::max();
+    if (this->sparse_page_->Size() > n) {
+      error::MaxSampleSize(n);
+    }
    column_page_.reset(new CSCPage(sparse_page_->GetTranspose(info_.num_col_, ctx->Threads())));
  }
  auto begin_iter = BatchIterator<CSCPage>(new SimpleBatchIteratorImpl<CSCPage>(column_page_));
@@ -106,6 +109,10 @@ BatchSet<CSCPage> SimpleDMatrix::GetColumnBatches(Context const* ctx) {
 BatchSet<SortedCSCPage> SimpleDMatrix::GetSortedColumnBatches(Context const* ctx) {
  // Sorted column page doesn't exist, generate it
  if (!sorted_column_page_) {
+    auto n = std::numeric_limits<decltype(Entry::index)>::max();
+    if (this->sparse_page_->Size() > n) {
+      error::MaxSampleSize(n);
+    }
    sorted_column_page_.reset(
        new SortedCSCPage(sparse_page_->GetTranspose(info_.num_col_, ctx->Threads())));
    sorted_column_page_->SortRows(ctx->Threads());
@@ -427,5 +434,4 @@ SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, i

  fmat_ctx_ = ctx;
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
--- a/src/data/sparse_page_raw_format.cc
+++ b/src/data/sparse_page_raw_format.cc
@@ -1,59 +1,57 @@
-/*!
- * Copyright (c) 2015-2021 by Contributors
+/**
+ * Copyright 2015-2023, XGBoost Contributors
 * \file sparse_page_raw_format.cc
 *  Raw binary format of sparse page.
 */
-#include <xgboost/data.h>
 #include <dmlc/registry.h>

-#include "xgboost/logging.h"
+#include "../common/io.h"                 // for AlignedResourceReadStream, AlignedFileWriteStream
+#include "../common/ref_resource_view.h"  // for WriteVec
 #include "./sparse_page_writer.h"
+#include "xgboost/data.h"
+#include "xgboost/logging.h"

-namespace xgboost {
-namespace data {
-
+namespace xgboost::data {
 DMLC_REGISTRY_FILE_TAG(sparse_page_raw_format);

-template<typename T>
+template <typename T>
 class SparsePageRawFormat : public SparsePageFormat<T> {
 public:
-  bool Read(T* page, dmlc::SeekStream* fi) override {
+  bool Read(T* page, common::AlignedResourceReadStream* fi) override {
    auto& offset_vec = page->offset.HostVector();
-    if (!fi->Read(&offset_vec)) {
+    if (!common::ReadVec(fi, &offset_vec)) {
      return false;
    }
    auto& data_vec = page->data.HostVector();
    CHECK_NE(page->offset.Size(), 0U) << "Invalid SparsePage file";
    data_vec.resize(offset_vec.back());
    if (page->data.Size() != 0) {
-      size_t n_bytes = fi->Read(dmlc::BeginPtr(data_vec),
-                                (page->data).Size() * sizeof(Entry));
-      CHECK_EQ(n_bytes, (page->data).Size() * sizeof(Entry))
-          << "Invalid SparsePage file";
+      if (!common::ReadVec(fi, &data_vec)) {
+        return false;
+      }
+    }
+    if (!fi->Read(&page->base_rowid, sizeof(page->base_rowid))) {
+      return false;
    }
-    fi->Read(&page->base_rowid, sizeof(page->base_rowid));
    return true;
  }

-  size_t Write(const T& page, dmlc::Stream* fo) override {
+  std::size_t Write(const T& page, common::AlignedFileWriteStream* fo) override {
    const auto& offset_vec = page.offset.HostVector();
    const auto& data_vec = page.data.HostVector();
    CHECK(page.offset.Size() != 0 && offset_vec[0] == 0);
    CHECK_EQ(offset_vec.back(), page.data.Size());
-    fo->Write(offset_vec);
-    auto bytes = page.MemCostBytes();
-    bytes += sizeof(uint64_t);
+
+    std::size_t bytes{0};
+    bytes += common::WriteVec(fo, offset_vec);
    if (page.data.Size() != 0) {
-      fo->Write(dmlc::BeginPtr(data_vec), page.data.Size() * sizeof(Entry));
+      bytes += common::WriteVec(fo, data_vec);
    }
-    fo->Write(&page.base_rowid, sizeof(page.base_rowid));
-    bytes += sizeof(page.base_rowid);
+    bytes += fo->Write(&page.base_rowid, sizeof(page.base_rowid));
    return bytes;
  }

 private:
-  /*! \brief external memory column offset */
-  std::vector<size_t> disk_offset_;
 };

 XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(raw)
@@ -74,5 +72,4 @@ XGBOOST_REGISTER_SORTED_CSC_PAGE_FORMAT(raw)
    return new SparsePageRawFormat<SortedCSCPage>();
  });

-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
--- a/src/data/sparse_page_source.cu
+++ b/src/data/sparse_page_source.cu
@@ -1,33 +1,31 @@
-/*!
- * Copyright 2021 XGBoost contributors
+/**
+ * Copyright 2021-2023, XGBoost contributors
 */
+#include "../common/device_helpers.cuh"  // for CurrentDevice
+#include "proxy_dmatrix.cuh"             // for Dispatch, DMatrixProxy
+#include "simple_dmatrix.cuh"            // for CopyToSparsePage
 #include "sparse_page_source.h"
-#include "proxy_dmatrix.cuh"
-#include "simple_dmatrix.cuh"
-
-namespace xgboost {
-namespace data {
+#include "xgboost/data.h"  // for SparsePage

+namespace xgboost::data {
 namespace detail {
 std::size_t NSamplesDevice(DMatrixProxy *proxy) {
-  return Dispatch(proxy, [](auto const &value) { return value.NumRows(); });
+  return cuda_impl::Dispatch(proxy, [](auto const &value) { return value.NumRows(); });
 }

 std::size_t NFeaturesDevice(DMatrixProxy *proxy) {
-  return Dispatch(proxy, [](auto const &value) { return value.NumCols(); });
+  return cuda_impl::Dispatch(proxy, [](auto const &value) { return value.NumCols(); });
 }
 }  // namespace detail

-void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page) {
+void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) {
  auto device = proxy->DeviceIdx();
  if (device < 0) {
    device = dh::CurrentDevice();
  }
  CHECK_GE(device, 0);

-  Dispatch(proxy, [&](auto const &value) {
-    CopyToSparsePage(value, device, missing, page);
-  });
+  cuda_impl::Dispatch(proxy,
+                      [&](auto const &value) { CopyToSparsePage(value, device, missing, page); });
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
--- a/src/data/sparse_page_source.h
+++ b/src/data/sparse_page_source.h
@@ -6,9 +6,11 @@
 #define XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_

 #include <algorithm>  // for min
+#include <atomic>     // for atomic
 #include <future>     // for async
 #include <map>
 #include <memory>
+#include <mutex>  // for mutex
 #include <string>
 #include <thread>
 #include <utility>  // for pair, move
@@ -18,7 +20,6 @@
 #include "../common/io.h"     // for PrivateMmapConstStream
 #include "../common/timer.h"  // for Monitor, Timer
 #include "adapter.h"
-#include "dmlc/common.h"         // for OMPException
 #include "proxy_dmatrix.h"       // for DMatrixProxy
 #include "sparse_page_writer.h"  // for SparsePageFormat
 #include "xgboost/base.h"
@@ -93,6 +94,47 @@ class TryLockGuard {
  }
 };

+// Similar to `dmlc::OMPException`, but doesn't need the threads to be joined before rethrow
+class ExceHandler {
+  std::mutex mutex_;
+  std::atomic<bool> flag_{false};
+  std::exception_ptr curr_exce_{nullptr};
+
+ public:
+  template <typename Fn>
+  decltype(auto) Run(Fn&& fn) noexcept(true) {
+    try {
+      return fn();
+    } catch (dmlc::Error const& e) {
+      std::lock_guard<std::mutex> guard{mutex_};
+      if (!curr_exce_) {
+        curr_exce_ = std::current_exception();
+      }
+      flag_ = true;
+    } catch (std::exception const& e) {
+      std::lock_guard<std::mutex> guard{mutex_};
+      if (!curr_exce_) {
+        curr_exce_ = std::current_exception();
+      }
+      flag_ = true;
+    } catch (...) {
+      std::lock_guard<std::mutex> guard{mutex_};
+      if (!curr_exce_) {
+        curr_exce_ = std::current_exception();
+      }
+      flag_ = true;
+    }
+    return std::invoke_result_t<Fn>();
+  }
+
+  void Rethrow() noexcept(false) {
+    if (flag_) {
+      CHECK(curr_exce_);
+      std::rethrow_exception(curr_exce_);
+    }
+  }
+};
+
 /**
 * @brief Base class for all page sources. Handles fetching, writing, and iteration.
 */
@@ -122,7 +164,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
  // Catching exception in pre-fetch threads to prevent segfault. Not always work though,
  // OOM error can be delayed due to lazy commit. On the bright side, if mmap is used then
  // OOM error should be rare.
-  dmlc::OMPException exec_;
+  ExceHandler exce_;
  common::Monitor monitor_;

  bool ReadCache() {
@@ -141,7 +183,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
    CHECK_GT(n_prefetch_batches, 0) << "total batches:" << n_batches_;
    std::size_t fetch_it = count_;

-    exec_.Rethrow();
+    exce_.Rethrow();

    for (std::size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
      fetch_it %= n_batches_;  // ring
@@ -152,7 +194,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
      CHECK_LT(fetch_it, cache_info_->offset.size());
      ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self, this]() {
        auto page = std::make_shared<S>();
-        this->exec_.Run([&] {
+        this->exce_.Run([&] {
          std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
          auto name = self->cache_info_->ShardName();
          auto [offset, length] = self->cache_info_->View(fetch_it);
@@ -172,7 +214,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
    CHECK(!(*ring_)[count_].valid());
    monitor_.Stop("Wait");

-    exec_.Rethrow();
+    exce_.Rethrow();

    return true;
  }
@@ -184,11 +226,11 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
    std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};

    auto name = cache_info_->ShardName();
-    std::unique_ptr<dmlc::Stream> fo;
+    std::unique_ptr<common::AlignedFileWriteStream> fo;
    if (this->Iter() == 0) {
-      fo.reset(dmlc::Stream::Create(name.c_str(), "wb"));
+      fo = std::make_unique<common::AlignedFileWriteStream>(StringView{name}, "wb");
    } else {
-      fo.reset(dmlc::Stream::Create(name.c_str(), "ab"));
+      fo = std::make_unique<common::AlignedFileWriteStream>(StringView{name}, "ab");
    }

    auto bytes = fmt->Write(*page_, fo.get());
--- a/src/data/sparse_page_writer.h
+++ b/src/data/sparse_page_writer.h
@@ -1,52 +1,44 @@
-/*!
- * Copyright (c) 2014-2019 by Contributors
+/**
+ * Copyright 2014-2023, XGBoost Contributors
 * \file sparse_page_writer.h
 * \author Tianqi Chen
 */
 #ifndef XGBOOST_DATA_SPARSE_PAGE_WRITER_H_
 #define XGBOOST_DATA_SPARSE_PAGE_WRITER_H_

-#include <xgboost/data.h>
-#include <dmlc/io.h>
-#include <vector>
-#include <algorithm>
-#include <cstring>
-#include <string>
-#include <utility>
-#include <memory>
-#include <functional>
+#include <functional>  // for function
+#include <string>      // for string

-#if DMLC_ENABLE_STD_THREAD
-#include <dmlc/concurrency.h>
-#include <thread>
-#endif  // DMLC_ENABLE_STD_THREAD
-
-namespace xgboost {
-namespace data {
+#include "../common/io.h"   // for AlignedResourceReadStream, AlignedFileWriteStream
+#include "dmlc/io.h"        // for Stream
+#include "dmlc/registry.h"  // for Registry, FunctionRegEntryBase
+#include "xgboost/data.h"   // for SparsePage,CSCPage,SortedCSCPage,EllpackPage ...

+namespace xgboost::data {
 template<typename T>
 struct SparsePageFormatReg;

-/*!
- * \brief Format specification of SparsePage.
+/**
+ * @brief Format specification of various data formats like SparsePage.
 */
-template<typename T>
+template <typename T>
 class SparsePageFormat {
 public:
-  /*! \brief virtual destructor */
  virtual ~SparsePageFormat() = default;
-  /*!
-   * \brief Load all the segments into page, advance fi to end of the block.
-   * \param page The data to read page into.
-   * \param fi the input stream of the file
-   * \return true of the loading as successful, false if end of file was reached
+  /**
+   * @brief Load all the segments into page, advance fi to end of the block.
+   *
+   * @param page The data to read page into.
+   * @param fi the input stream of the file
+   * @return true of the loading as successful, false if end of file was reached
   */
-  virtual bool Read(T* page, dmlc::SeekStream* fi) = 0;
-  /*!
-   * \brief save the data to fo, when a page was written.
-   * \param fo output stream
+  virtual bool Read(T* page, common::AlignedResourceReadStream* fi) = 0;
+  /**
+   * @brief save the data to fo, when a page was written.
+   *
+   * @param fo output stream
   */
-  virtual size_t Write(const T& page, dmlc::Stream* fo) = 0;
+  virtual size_t Write(const T& page, common::AlignedFileWriteStream* fo) = 0;
 };

 /*!
@@ -105,6 +97,5 @@ struct SparsePageFormatReg
  DMLC_REGISTRY_REGISTER(SparsePageFormatReg<GHistIndexMatrix>,                \
                         GHistIndexPageFmt, Name)

-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
 #endif  // XGBOOST_DATA_SPARSE_PAGE_WRITER_H_
--- a/src/gbm/gblinear.cc
+++ b/src/gbm/gblinear.cc
@@ -172,8 +172,7 @@ class GBLinear : public GradientBooster {
  }

  void PredictContribution(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_contribs,
-                           uint32_t layer_begin, uint32_t /*layer_end*/, bool, int,
-                           unsigned) override {
+                           bst_layer_t layer_begin, bst_layer_t /*layer_end*/, bool) override {
    model_.LazyInitModel();
    LinearCheckLayer(layer_begin);
    auto base_margin = p_fmat->Info().base_margin_.View(Context::kCpuId);
@@ -210,8 +209,8 @@ class GBLinear : public GradientBooster {
    }
  }

-  void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_contribs,
-                                       unsigned layer_begin, unsigned /*layer_end*/,
+  void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
+                                       bst_layer_t layer_begin, bst_layer_t /*layer_end*/,
                                       bool) override {
    LinearCheckLayer(layer_begin);
    std::vector<bst_float>& contribs = out_contribs->HostVector();
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -18,9 +18,11 @@
 #include <vector>

 #include "../common/common.h"
+#include "../common/error_msg.h"  // for UnknownDevice, WarnOldSerialization, InplacePredictProxy
 #include "../common/random.h"
 #include "../common/threading_utils.h"
 #include "../common/timer.h"
+#include "../data/proxy_dmatrix.h"  // for DMatrixProxy, HostAdapterDispatch
 #include "gbtree_model.h"
 #include "xgboost/base.h"
 #include "xgboost/data.h"
@@ -39,7 +41,6 @@ namespace xgboost::gbm {
 DMLC_REGISTRY_FILE_TAG(gbtree);

 void GBTree::Configure(Args const& cfg) {
-  this->cfg_ = cfg;
  std::string updater_seq = tparam_.updater_seq;
  tparam_.UpdateAllowUnknown(cfg);
  tree_param_.UpdateAllowUnknown(cfg);
@@ -59,9 +60,8 @@ void GBTree::Configure(Args const& cfg) {
  cpu_predictor_->Configure(cfg);
 #if defined(XGBOOST_USE_CUDA)
  auto n_gpus = common::AllVisibleGPUs();
-  if (!gpu_predictor_ && n_gpus != 0) {
-    gpu_predictor_ = std::unique_ptr<Predictor>(
-        Predictor::Create("gpu_predictor", this->ctx_));
+  if (!gpu_predictor_) {
+    gpu_predictor_ = std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", this->ctx_));
  }
  if (n_gpus != 0) {
    gpu_predictor_->Configure(cfg);
@@ -78,10 +78,9 @@ void GBTree::Configure(Args const& cfg) {

  monitor_.Init("GBTree");

-  specified_updater_ = std::any_of(cfg.cbegin(), cfg.cend(),
-                   [](std::pair<std::string, std::string> const& arg) {
-                     return arg.first == "updater";
-                   });
+  specified_updater_ = std::any_of(
+      cfg.cbegin(), cfg.cend(),
+      [](std::pair<std::string, std::string> const& arg) { return arg.first == "updater"; });

  if (specified_updater_ && !showed_updater_warning_) {
    LOG(WARNING) << "DANGER AHEAD: You have manually specified `updater` "
@@ -93,12 +92,19 @@ void GBTree::Configure(Args const& cfg) {
    showed_updater_warning_ = true;
  }

+  if (model_.learner_model_param->IsVectorLeaf()) {
+    CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
+        << "Only the hist tree method is supported for building multi-target trees with vector "
+           "leaf.";
+  }
+  LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
  this->ConfigureUpdaters();
+
  if (updater_seq != tparam_.updater_seq) {
    updaters_.clear();
    this->InitUpdater(cfg);
  } else {
-    for (auto &up : updaters_) {
+    for (auto& up : updaters_) {
      up->Configure(cfg);
    }
  }
@@ -106,66 +112,6 @@ void GBTree::Configure(Args const& cfg) {
  configured_ = true;
 }

-// FIXME(trivialfis): This handles updaters.  Because the choice of updaters depends on
-// whether external memory is used and how large is dataset.  We can remove the dependency
-// on DMatrix once `hist` tree method can handle external memory so that we can make it
-// default.
-void GBTree::ConfigureWithKnownData(Args const& cfg, DMatrix* fmat) {
-  CHECK(this->configured_);
-  std::string updater_seq = tparam_.updater_seq;
-  CHECK(tparam_.GetInitialised());
-
-  tparam_.UpdateAllowUnknown(cfg);
-
-  this->PerformTreeMethodHeuristic(fmat);
-  this->ConfigureUpdaters();
-
-  // initialize the updaters only when needed.
-  if (updater_seq != tparam_.updater_seq) {
-    LOG(DEBUG) << "Using updaters: " << tparam_.updater_seq;
-    this->updaters_.clear();
-    this->InitUpdater(cfg);
-  }
-}
-
-void GBTree::PerformTreeMethodHeuristic(DMatrix* fmat) {
-  if (specified_updater_) {
-    // This method is disabled when `updater` parameter is explicitly
-    // set, since only experts are expected to do so.
-    return;
-  }
-  if (model_.learner_model_param->IsVectorLeaf()) {
-    CHECK(tparam_.tree_method == TreeMethod::kHist)
-        << "Only the hist tree method is supported for building multi-target trees with vector "
-           "leaf.";
-  }
-
-  // tparam_ is set before calling this function.
-  if (tparam_.tree_method != TreeMethod::kAuto) {
-    return;
-  }
-
-  if (collective::IsDistributed()) {
-    LOG(INFO) << "Tree method is automatically selected to be 'approx' "
-                 "for distributed training.";
-    tparam_.tree_method = TreeMethod::kApprox;
-  } else if (!fmat->SingleColBlock()) {
-    LOG(INFO) << "Tree method is automatically set to 'approx' "
-                 "since external-memory data matrix is used.";
-    tparam_.tree_method = TreeMethod::kApprox;
-  } else if (fmat->Info().num_row_ >= (4UL << 20UL)) {
-    /* Choose tree_method='approx' automatically for large data matrix */
-    LOG(INFO) << "Tree method is automatically selected to be "
-                 "'approx' for faster speed. To use old behavior "
-                 "(exact greedy algorithm on single machine), "
-                 "set tree_method to 'exact'.";
-    tparam_.tree_method = TreeMethod::kApprox;
-  } else {
-    tparam_.tree_method = TreeMethod::kExact;
-  }
-  LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
-}
-
 void GBTree::ConfigureUpdaters() {
  if (specified_updater_) {
    return;
@@ -173,31 +119,25 @@ void GBTree::ConfigureUpdaters() {
  // `updater` parameter was manually specified
  /* Choose updaters according to tree_method parameters */
  switch (tparam_.tree_method) {
-    case TreeMethod::kAuto:
-      // Use heuristic to choose between 'exact' and 'approx' This
-      // choice is carried out in PerformTreeMethodHeuristic() before
-      // calling this function.
+    case TreeMethod::kAuto:  // Use hist as default in 2.0
+    case TreeMethod::kHist: {
+      tparam_.updater_seq = "grow_quantile_histmaker";
      break;
+    }
    case TreeMethod::kApprox:
      tparam_.updater_seq = "grow_histmaker";
      break;
    case TreeMethod::kExact:
      tparam_.updater_seq = "grow_colmaker,prune";
      break;
-    case TreeMethod::kHist: {
-      LOG(INFO) << "Tree method is selected to be 'hist', which uses a single updater "
-                   "grow_quantile_histmaker.";
-      tparam_.updater_seq = "grow_quantile_histmaker";
-      break;
-    }
    case TreeMethod::kGPUHist: {
      common::AssertGPUSupport();
      tparam_.updater_seq = "grow_gpu_hist";
      break;
    }
    default:
-      LOG(FATAL) << "Unknown tree_method ("
-                 << static_cast<int>(tparam_.tree_method) << ") detected";
+      LOG(FATAL) << "Unknown tree_method (" << static_cast<int>(tparam_.tree_method)
+                 << ") detected";
  }
 }

@@ -253,7 +193,6 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
                     PredictionCacheEntry* predt, ObjFunction const* obj) {
  TreesOneIter new_trees;
  bst_target_t const n_groups = model_.learner_model_param->OutputLength();
-  ConfigureWithKnownData(this->cfg_, p_fmat);
  monitor_.Start("BoostNewTrees");

  // Weird case that tree method is cpu-based but gpu_id is set.  Ideally we should let
@@ -436,12 +375,7 @@ void GBTree::LoadConfig(Json const& in) {
  // This would cause all trees to be pushed to trees_to_update
  // e.g. updating a model, then saving and loading it would result in an empty model
  tparam_.process_type = TreeProcessType::kDefault;
-  int32_t const n_gpus = xgboost::common::AllVisibleGPUs();
-  if (n_gpus == 0 && tparam_.predictor == PredictorType::kGPUPredictor) {
-    LOG(WARNING) << "Loading from a raw memory buffer on CPU only machine.  "
-                    "Changing predictor to auto.";
-    tparam_.UpdateAllowUnknown(Args{{"predictor", "auto"}});
-  }
+  std::int32_t const n_gpus = xgboost::common::AllVisibleGPUs();

  auto msg = StringView{
      R"(
@@ -457,19 +391,32 @@ void GBTree::LoadConfig(Json const& in) {
    LOG(WARNING) << msg << "  Changing `tree_method` to `hist`.";
  }

-  auto const& j_updaters = get<Object const>(in["updater"]);
+  std::vector<Json> updater_seq;
+  if (IsA<Object>(in["updater"])) {
+    // before 2.0
+    error::WarnOldSerialization();
+    for (auto const& kv : get<Object const>(in["updater"])) {
+      auto name = kv.first;
+      auto config = kv.second;
+      config["name"] = name;
+      updater_seq.push_back(config);
+    }
+  } else {
+    // after 2.0
+    auto const& j_updaters = get<Array const>(in["updater"]);
+    updater_seq = j_updaters;
+  }
+
  updaters_.clear();

-  for (auto const& kv : j_updaters) {
-    auto name = kv.first;
+  for (auto const& config : updater_seq) {
+    auto name = get<String>(config["name"]);
    if (n_gpus == 0 && name == "grow_gpu_hist") {
      name = "grow_quantile_histmaker";
      LOG(WARNING) << "Changing updater from `grow_gpu_hist` to `grow_quantile_histmaker`.";
    }
-    std::unique_ptr<TreeUpdater> up{
-        TreeUpdater::Create(name, ctx_, &model_.learner_model_param->task)};
-    up->LoadConfig(kv.second);
-    updaters_.push_back(std::move(up));
+    updaters_.emplace_back(TreeUpdater::Create(name, ctx_, &model_.learner_model_param->task));
+    updaters_.back()->LoadConfig(config);
  }

  specified_updater_ = get<Boolean>(in["specified_updater"]);
@@ -491,13 +438,14 @@ void GBTree::SaveConfig(Json* p_out) const {
  // language binding doesn't need to know about the forest size.
  out["gbtree_model_param"] = ToJson(model_.param);

-  out["updater"] = Object();
+  out["updater"] = Array{};
+  auto& j_updaters = get<Array>(out["updater"]);

-  auto& j_updaters = out["updater"];
-  for (auto const& up : updaters_) {
-    j_updaters[up->Name()] = Object();
-    auto& j_up = j_updaters[up->Name()];
-    up->SaveConfig(&j_up);
+  for (auto const& up : this->updaters_) {
+    Json up_config{Object{}};
+    up_config["name"] = String{up->Name()};
+    up->SaveConfig(&up_config);
+    j_updaters.emplace_back(up_config);
  }
  out["specified_updater"] = Boolean{specified_updater_};
 }
@@ -567,8 +515,8 @@ void GBTree::Slice(bst_layer_t begin, bst_layer_t end, bst_layer_t step, Gradien
  out_model.param.num_parallel_tree = model_.param.num_parallel_tree;
 }

-void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool,
-                          bst_layer_t layer_begin, bst_layer_t layer_end) {
+void GBTree::PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
+                              bst_layer_t layer_begin, bst_layer_t layer_end) const {
  CHECK(configured_);
  if (layer_end == 0) {
    layer_end = this->BoostedRounds();
@@ -588,7 +536,7 @@ void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool
    CHECK_EQ(out_preds->version, 0);
  }

-  auto const& predictor = GetPredictor(&out_preds->predictions, p_fmat);
+  auto const& predictor = GetPredictor(is_training, &out_preds->predictions, p_fmat);
  if (out_preds->version == 0) {
    // out_preds->Size() can be non-zero as it's initialized here before any
    // tree is built at the 0^th iterator.
@@ -608,52 +556,83 @@ void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool
  }
 }

-std::unique_ptr<Predictor> const &
-GBTree::GetPredictor(HostDeviceVector<float> const *out_pred,
-                     DMatrix *f_dmat) const {
+namespace {
+inline void MismatchedDevices(Context const* booster, Context const* data) {
+  LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. XGBoost "
+               << "is running on: " << booster->DeviceName()
+               << ", while the input data is on: " << data->DeviceName() << ".\n"
+               << R"(Potential solutions:
+- Use a data structure that matches the device ordinal in the booster.
+- Set the device for booster before call to inplace_predict.
+)";
+}
+};  // namespace
+
+void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
+                          bst_layer_t layer_begin, bst_layer_t layer_end) {
+  // dispatch to const function.
+  this->PredictBatchImpl(p_fmat, out_preds, is_training, layer_begin, layer_end);
+}
+
+void GBTree::InplacePredict(std::shared_ptr<DMatrix> p_m, float missing,
+                            PredictionCacheEntry* out_preds, bst_layer_t layer_begin,
+                            bst_layer_t layer_end) const {
  CHECK(configured_);
-  if (tparam_.predictor != PredictorType::kAuto) {
-    if (tparam_.predictor == PredictorType::kGPUPredictor) {
-#if defined(XGBOOST_USE_CUDA)
-      CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost.";
+  auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
+  CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
+  if (p_m->Ctx()->Device() != this->ctx_->Device()) {
+    MismatchedDevices(this->ctx_, p_m->Ctx());
+    CHECK_EQ(out_preds->version, 0);
+    auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
+    CHECK(proxy) << error::InplacePredictProxy();
+    auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing);
+    this->PredictBatchImpl(p_fmat.get(), out_preds, false, layer_begin, layer_end);
+    return;
+  }
+
+  bool known_type = this->ctx_->DispatchDevice(
+      [&, begin = tree_begin, end = tree_end] {
+        return this->cpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, begin, end);
+      },
+      [&, begin = tree_begin, end = tree_end] {
+        return this->gpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, begin, end);
+      });
+  if (!known_type) {
+    auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
+    CHECK(proxy) << error::InplacePredictProxy();
+    LOG(FATAL) << "Unknown data type for inplace prediction:" << proxy->Adapter().type().name();
+  }
+}
+
+[[nodiscard]] std::unique_ptr<Predictor> const& GBTree::GetPredictor(
+    bool is_training, HostDeviceVector<float> const* out_pred, DMatrix* f_dmat) const {
+  CHECK(configured_);
+
+  // Data comes from SparsePageDMatrix. Since we are loading data in pages, no need to
+  // prevent data copy.
+  if (f_dmat && !f_dmat->SingleColBlock()) {
+    if (ctx_->IsCPU()) {
+      return cpu_predictor_;
+    } else {
+      common::AssertGPUSupport();
      CHECK(gpu_predictor_);
      return gpu_predictor_;
-#else
-      common::AssertGPUSupport();
-#endif  // defined(XGBOOST_USE_CUDA)
    }
-    if (tparam_.predictor == PredictorType::kOneAPIPredictor) {
-#if defined(XGBOOST_USE_ONEAPI)
-      CHECK(oneapi_predictor_);
-      return oneapi_predictor_;
-#else
-      common::AssertOneAPISupport();
-#endif  // defined(XGBOOST_USE_ONEAPI)
-    }
-    CHECK(cpu_predictor_);
-    return cpu_predictor_;
  }

  // Data comes from Device DMatrix.
-  auto is_ellpack = f_dmat && f_dmat->PageExists<EllpackPage>() &&
-                    !f_dmat->PageExists<SparsePage>();
+  auto is_ellpack =
+      f_dmat && f_dmat->PageExists<EllpackPage>() && !f_dmat->PageExists<SparsePage>();
  // Data comes from device memory, like CuDF or CuPy.
-  auto is_from_device =
-      f_dmat && f_dmat->PageExists<SparsePage>() &&
-      (*(f_dmat->GetBatches<SparsePage>().begin())).data.DeviceCanRead();
+  auto is_from_device = f_dmat && f_dmat->PageExists<SparsePage>() &&
+                        (*(f_dmat->GetBatches<SparsePage>().begin())).data.DeviceCanRead();
  auto on_device = is_ellpack || is_from_device;

  // Use GPU Predictor if data is already on device and gpu_id is set.
-  if (on_device && ctx_->gpu_id >= 0) {
-#if defined(XGBOOST_USE_CUDA)
-    CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost.";
+  if (on_device && ctx_->IsCUDA()) {
+    common::AssertGPUSupport();
    CHECK(gpu_predictor_);
    return gpu_predictor_;
-#else
-    LOG(FATAL) << "Data is on CUDA device, but XGBoost is not compiled with "
-                  "CUDA support.";
-    return cpu_predictor_;
-#endif  // defined(XGBOOST_USE_CUDA)
  }

  // GPU_Hist by default has prediction cache calculated from quantile values,
@@ -665,23 +644,19 @@ GBTree::GetPredictor(HostDeviceVector<float> const *out_pred,
  if ((out_pred && out_pred->Size() == 0) && (model_.param.num_trees != 0) &&
      // FIXME(trivialfis): Implement a better method for testing whether data
      // is on device after DMatrix refactoring is done.
-      !on_device) {
+      !on_device && is_training) {
    CHECK(cpu_predictor_);
    return cpu_predictor_;
  }

-  if (tparam_.tree_method == TreeMethod::kGPUHist) {
-#if defined(XGBOOST_USE_CUDA)
-    CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost.";
+  if (ctx_->IsCPU()) {
+    return cpu_predictor_;
+  } else {
+    common::AssertGPUSupport();
    CHECK(gpu_predictor_);
    return gpu_predictor_;
-#else
-    common::AssertGPUSupport();
-    return cpu_predictor_;
-#endif  // defined(XGBOOST_USE_CUDA)
  }

-  CHECK(cpu_predictor_);
  return cpu_predictor_;
 }

@@ -796,7 +771,7 @@ class Dart : public GBTree {
                        bool training, unsigned layer_begin,
                        unsigned layer_end) const {
    CHECK(!this->model_.learner_model_param->IsVectorLeaf()) << "dart" << MTNotImplemented();
-    auto &predictor = this->GetPredictor(&p_out_preds->predictions, p_fmat);
+    auto& predictor = this->GetPredictor(training, &p_out_preds->predictions, p_fmat);
    CHECK(predictor);
    predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
                                  model_);
@@ -860,15 +835,16 @@ class Dart : public GBTree {
    auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
    auto n_groups = model_.learner_model_param->num_output_group;

-    std::vector<Predictor const*> predictors {
-      cpu_predictor_.get(),
-#if defined(XGBOOST_USE_CUDA)
-      gpu_predictor_.get()
-#endif  // defined(XGBOOST_USE_CUDA)
-    };
-    Predictor const* predictor{nullptr};
-    StringView msg{"Unsupported data type for inplace predict."};
+    if (ctx_->Device() != p_fmat->Ctx()->Device()) {
+      MismatchedDevices(ctx_, p_fmat->Ctx());
+      auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_fmat);
+      CHECK(proxy) << error::InplacePredictProxy();
+      auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing);
+      this->PredictBatchImpl(p_fmat.get(), p_out_preds, false, layer_begin, layer_end);
+      return;
+    }

+    StringView msg{"Unsupported data type for inplace predict."};
    PredictionCacheEntry predts;
    if (ctx_->gpu_id != Context::kCpuId) {
      predts.predictions.SetDevice(ctx_->gpu_id);
@@ -877,32 +853,29 @@ class Dart : public GBTree {

    auto predict_impl = [&](size_t i) {
      predts.predictions.Fill(0);
-      if (tparam_.predictor == PredictorType::kAuto) {
-        // Try both predictor implementations
-        bool success = false;
-        for (auto const& p : predictors) {
-          if (p && p->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1)) {
-            success = true;
-            predictor = p;
-            break;
-          }
-        }
-        CHECK(success) << msg;
-      } else {
-        predictor = this->GetPredictor().get();
-        bool success = predictor->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
-        CHECK(success) << msg << std::endl
-                       << "Current Predictor: "
-                       << (tparam_.predictor == PredictorType::kCPUPredictor ? "cpu_predictor"
-                                                                             : "gpu_predictor");
-      }
+      bool success = this->ctx_->DispatchDevice(
+          [&] {
+            return cpu_predictor_->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
+          },
+          [&] {
+            return gpu_predictor_->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
+          });
+      CHECK(success) << msg;
    };

    // Inplace predict is not used for training, so no need to drop tree.
    for (bst_tree_t i = tree_begin; i < tree_end; ++i) {
      predict_impl(i);
      if (i == tree_begin) {
-        predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions, model_);
+        this->ctx_->DispatchDevice(
+            [&] {
+              this->cpu_predictor_->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
+                                                       model_);
+            },
+            [&] {
+              this->gpu_predictor_->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
+                                                       model_);
+            });
      }
      // Multiple the tree weight
      auto w = this->weight_drop_.at(i);
@@ -932,25 +905,24 @@ class Dart : public GBTree {
                       std::vector<bst_float> *out_preds,
                       unsigned layer_begin, unsigned layer_end) override {
    DropTrees(false);
-    auto &predictor = this->GetPredictor();
+    auto &predictor = this->GetPredictor(false);
    uint32_t _, tree_end;
    std::tie(_, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end);
    predictor->PredictInstance(inst, out_preds, model_, tree_end);
  }

-  void PredictContribution(DMatrix* p_fmat,
-                           HostDeviceVector<bst_float>* out_contribs,
-                           unsigned layer_begin, unsigned layer_end, bool approximate, int,
-                           unsigned) override {
+  void PredictContribution(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_contribs,
+                           bst_layer_t layer_begin, bst_layer_t layer_end,
+                           bool approximate) override {
    CHECK(configured_);
    auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
    cpu_predictor_->PredictContribution(p_fmat, out_contribs, model_, tree_end, &weight_drop_,
                                        approximate);
  }

-  void PredictInteractionContributions(
-      DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
-      unsigned layer_begin, unsigned layer_end, bool approximate) override {
+  void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
+                                       bst_layer_t layer_begin, bst_layer_t layer_end,
+                                       bool approximate) override {
    CHECK(configured_);
    auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
    cpu_predictor_->PredictInteractionContributions(p_fmat, out_contribs, model_, tree_end,
--- a/src/gbm/gbtree.cu
+++ b/src/gbm/gbtree.cu
@@ -1,14 +1,11 @@
-/*!
- * Copyright 2021 by Contributors
+/**
+ * Copyright 2021-2023, XGBoost Contributors
 */
 #include "../common/device_helpers.cuh"
-#include "xgboost/context.h"
 #include "xgboost/linalg.h"
 #include "xgboost/span.h"

-namespace xgboost {
-namespace gbm {
-
+namespace xgboost::gbm {
 void GPUCopyGradient(HostDeviceVector<GradientPair> const *in_gpair,
                     bst_group_t n_groups, bst_group_t group_id,
                     HostDeviceVector<GradientPair> *out_gpair) {
@@ -41,5 +38,4 @@ void GPUDartInplacePredictInc(common::Span<float> out_predts, common::Span<float
    out_predts[offset] += (predts[offset] - base_score(0)) * tree_w;
  });
 }
-}  // namespace gbm
-}  // namespace xgboost
+}  // namespace xgboost::gbm
--- a/src/gbm/gbtree.h
+++ b/src/gbm/gbtree.h
@@ -43,30 +43,18 @@ enum class TreeProcessType : int {
  kDefault = 0,
  kUpdate = 1
 };
-
-enum class PredictorType : int {
-  kAuto = 0,
-  kCPUPredictor,
-  kGPUPredictor,
-  kOneAPIPredictor
-};
 }  // namespace xgboost

 DECLARE_FIELD_ENUM_CLASS(xgboost::TreeMethod);
 DECLARE_FIELD_ENUM_CLASS(xgboost::TreeProcessType);
-DECLARE_FIELD_ENUM_CLASS(xgboost::PredictorType);
-
-namespace xgboost {
-namespace gbm {

+namespace xgboost::gbm {
 /*! \brief training parameters */
 struct GBTreeTrainParam : public XGBoostParameter<GBTreeTrainParam> {
  /*! \brief tree updater sequence */
  std::string updater_seq;
  /*! \brief type of boosting process to run */
  TreeProcessType process_type;
-  // predictor type
-  PredictorType predictor;
  // tree construction method
  TreeMethod tree_method;
  // declare parameters
@@ -81,13 +69,6 @@ struct GBTreeTrainParam : public XGBoostParameter<GBTreeTrainParam> {
        .describe("Whether to run the normal boosting process that creates new trees,"\
                  " or to update the trees in an existing model.");
    DMLC_DECLARE_ALIAS(updater_seq, updater);
-    DMLC_DECLARE_FIELD(predictor)
-        .set_default(PredictorType::kAuto)
-        .add_enum("auto", PredictorType::kAuto)
-        .add_enum("cpu_predictor", PredictorType::kCPUPredictor)
-        .add_enum("gpu_predictor", PredictorType::kGPUPredictor)
-        .add_enum("oneapi_predictor", PredictorType::kOneAPIPredictor)
-        .describe("Predictor algorithm type");
    DMLC_DECLARE_FIELD(tree_method)
        .set_default(TreeMethod::kAuto)
        .add_enum("auto",      TreeMethod::kAuto)
@@ -192,12 +173,8 @@ class GBTree : public GradientBooster {
      : GradientBooster{ctx}, model_(booster_config, ctx_) {}

  void Configure(const Args& cfg) override;
-  // Revise `tree_method` and `updater` parameters after seeing the training
-  // data matrix, only useful when tree_method is auto.
-  void PerformTreeMethodHeuristic(DMatrix* fmat);
  /*! \brief Map `tree_method` parameter to `updater` parameter */
  void ConfigureUpdaters();
-  void ConfigureWithKnownData(Args const& cfg, DMatrix* fmat);

  /**
   * \brief Optionally update the leaf value.
@@ -212,21 +189,11 @@ class GBTree : public GradientBooster {
  void DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
               PredictionCacheEntry* predt, ObjFunction const* obj) override;

-  bool UseGPU() const override {
-    return
-        tparam_.predictor == PredictorType::kGPUPredictor ||
-        tparam_.tree_method == TreeMethod::kGPUHist;
-  }
+  [[nodiscard]] bool UseGPU() const override { return tparam_.tree_method == TreeMethod::kGPUHist; }

-  GBTreeTrainParam const& GetTrainParam() const {
-    return tparam_;
-  }
-
-  void Load(dmlc::Stream* fi) override {
-    model_.Load(fi);
-    this->cfg_.clear();
-  }
+  [[nodiscard]] GBTreeTrainParam const& GetTrainParam() const { return tparam_; }

+  void Load(dmlc::Stream* fi) override { model_.Load(fi); }
  void Save(dmlc::Stream* fo) const override {
    model_.Save(fo);
  }
@@ -246,39 +213,14 @@ class GBTree : public GradientBooster {
    return !model_.trees.empty() || !model_.trees_to_update.empty();
  }

+  void PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
+                        bst_layer_t layer_begin, bst_layer_t layer_end) const;
+
  void PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool training,
                    bst_layer_t layer_begin, bst_layer_t layer_end) override;

  void InplacePredict(std::shared_ptr<DMatrix> p_m, float missing, PredictionCacheEntry* out_preds,
-                      bst_layer_t layer_begin, bst_layer_t layer_end) const override {
-    CHECK(configured_);
-    auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
-    CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
-    std::vector<Predictor const *> predictors{
-      cpu_predictor_.get(),
-#if defined(XGBOOST_USE_CUDA)
-      gpu_predictor_.get()
-#endif  // defined(XGBOOST_USE_CUDA)
-    };
-    StringView msg{"Unsupported data type for inplace predict."};
-    if (tparam_.predictor == PredictorType::kAuto) {
-      // Try both predictor implementations
-      for (auto const &p : predictors) {
-        if (p && p->InplacePredict(p_m, model_, missing, out_preds, tree_begin, tree_end)) {
-          return;
-        }
-      }
-      LOG(FATAL) << msg;
-    } else {
-      bool success = this->GetPredictor()->InplacePredict(p_m, model_, missing, out_preds,
-                                                          tree_begin, tree_end);
-      CHECK(success) << msg << std::endl
-                     << "Current Predictor: "
-                     << (tparam_.predictor == PredictorType::kCPUPredictor
-                             ? "cpu_predictor"
-                             : "gpu_predictor");
-    }
-  }
+                      bst_layer_t layer_begin, bst_layer_t layer_end) const override;

  void FeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
                    std::vector<bst_feature_t>* features,
@@ -359,32 +301,29 @@ class GBTree : public GradientBooster {
    auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
    CHECK_EQ(tree_begin, 0) << "Predict leaf supports only iteration end: (0, "
                               "n_iteration), use model slicing instead.";
-    this->GetPredictor()->PredictLeaf(p_fmat, out_preds, model_, tree_end);
+    this->GetPredictor(false)->PredictLeaf(p_fmat, out_preds, model_, tree_end);
  }

-  void PredictContribution(DMatrix* p_fmat,
-                           HostDeviceVector<bst_float>* out_contribs,
-                           uint32_t layer_begin, uint32_t layer_end, bool approximate,
-                           int, unsigned) override {
+  void PredictContribution(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
+                           bst_layer_t layer_begin, bst_layer_t layer_end,
+                           bool approximate) override {
    CHECK(configured_);
    auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
-    CHECK_EQ(tree_begin, 0)
-        << "Predict contribution supports only iteration end: (0, "
-           "n_iteration), using model slicing instead.";
-    this->GetPredictor()->PredictContribution(
-        p_fmat, out_contribs, model_, tree_end, nullptr, approximate);
+    CHECK_EQ(tree_begin, 0) << "Predict contribution supports only iteration end: (0, "
+                               "n_iteration), using model slicing instead.";
+    this->GetPredictor(false)->PredictContribution(p_fmat, out_contribs, model_, tree_end, nullptr,
+                                                   approximate);
  }

-  void PredictInteractionContributions(
-      DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
-      uint32_t layer_begin, uint32_t layer_end, bool approximate) override {
+  void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
+                                       bst_layer_t layer_begin, bst_layer_t layer_end,
+                                       bool approximate) override {
    CHECK(configured_);
    auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
-    CHECK_EQ(tree_begin, 0)
-        << "Predict interaction contribution supports only iteration end: (0, "
-           "n_iteration), using model slicing instead.";
-    this->GetPredictor()->PredictInteractionContributions(
-        p_fmat, out_contribs, model_, tree_end, nullptr, approximate);
+    CHECK_EQ(tree_begin, 0) << "Predict interaction contribution supports only iteration end: (0, "
+                               "n_iteration), using model slicing instead.";
+    this->GetPredictor(false)->PredictInteractionContributions(p_fmat, out_contribs, model_,
+                                                               tree_end, nullptr, approximate);
  }

  [[nodiscard]] std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
@@ -400,8 +339,9 @@ class GBTree : public GradientBooster {
                     std::vector<HostDeviceVector<bst_node_t>>* out_position,
                     std::vector<std::unique_ptr<RegTree>>* ret);

-  std::unique_ptr<Predictor> const& GetPredictor(HostDeviceVector<float> const* out_pred = nullptr,
-                                                 DMatrix* f_dmat = nullptr) const;
+  [[nodiscard]] std::unique_ptr<Predictor> const& GetPredictor(
+      bool is_training, HostDeviceVector<float> const* out_pred = nullptr,
+      DMatrix* f_dmat = nullptr) const;

  // commit new trees all at once
  virtual void CommitModel(TreesOneIter&& new_trees);
@@ -416,22 +356,17 @@ class GBTree : public GradientBooster {
  bool showed_updater_warning_ {false};
  bool specified_updater_   {false};
  bool configured_ {false};
-  // configurations for tree
-  Args cfg_;
  // the updaters that can be applied to each of tree
  std::vector<std::unique_ptr<TreeUpdater>> updaters_;
  // Predictors
  std::unique_ptr<Predictor> cpu_predictor_;
-#if defined(XGBOOST_USE_CUDA)
-  std::unique_ptr<Predictor> gpu_predictor_;
-#endif  // defined(XGBOOST_USE_CUDA)
+  std::unique_ptr<Predictor> gpu_predictor_{nullptr};
 #if defined(XGBOOST_USE_ONEAPI)
  std::unique_ptr<Predictor> oneapi_predictor_;
 #endif  // defined(XGBOOST_USE_ONEAPI)
  common::Monitor monitor_;
 };

-}  // namespace gbm
-}  // namespace xgboost
+}  // namespace xgboost::gbm

 #endif  // XGBOOST_GBM_GBTREE_H_
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -40,6 +40,7 @@
 #include "common/api_entry.h"             // for XGBAPIThreadLocalEntry
 #include "common/charconv.h"              // for to_chars, to_chars_result, NumericLimits, from_...
 #include "common/common.h"                // for ToString, Split
+#include "common/error_msg.h"             // for MaxFeatureSize, WarnOldSerialization
 #include "common/io.h"                    // for PeekableInStream, ReadAll, FixedSizeStream, Mem...
 #include "common/observer.h"              // for TrainingObserver
 #include "common/random.h"                // for GlobalRandom
@@ -356,21 +357,6 @@ DMLC_REGISTER_PARAMETER(LearnerTrainParam);
 using LearnerAPIThreadLocalStore =
    dmlc::ThreadLocalStore<std::map<Learner const *, XGBAPIThreadLocalEntry>>;

-namespace {
-StringView ModelMsg() {
-  return StringView{
-      R"doc(
-  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
-  older XGBoost, please export the model by calling `Booster.save_model` from that version
-  first, then load it back in current version. See:
-
-    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
-
-  for more details about differences between saving model and serializing.
-)doc"};
-}
-}  // anonymous namespace
-
 class LearnerConfiguration : public Learner {
 private:
  std::mutex config_lock_;
@@ -530,7 +516,7 @@ class LearnerConfiguration : public Learner {
    }

    if (!Version::Same(origin_version)) {
-      LOG(WARNING) << ModelMsg();
+      error::WarnOldSerialization();
      return;  // skip configuration if version is not matched
    }

@@ -561,7 +547,7 @@ class LearnerConfiguration : public Learner {
    for (size_t i = 0; i < n_metrics; ++i) {
      auto old_serialization = IsA<String>(j_metrics[i]);
      if (old_serialization) {
-        LOG(WARNING) << ModelMsg();
+        error::WarnOldSerialization();
        metric_names_[i] = get<String>(j_metrics[i]);
      } else {
        metric_names_[i] = get<String>(j_metrics[i]["name"]);
@@ -763,9 +749,7 @@ class LearnerConfiguration : public Learner {
        CHECK(matrix.first.ptr);
        CHECK(!matrix.second.ref.expired());
        const uint64_t num_col = matrix.first.ptr->Info().num_col_;
-        CHECK_LE(num_col, static_cast<uint64_t>(std::numeric_limits<unsigned>::max()))
-            << "Unfortunately, XGBoost does not support data matrices with "
-            << std::numeric_limits<unsigned>::max() << " features or greater";
+        error::MaxFeatureSize(num_col);
        num_feature = std::max(num_feature, static_cast<uint32_t>(num_col));
      }

@@ -1174,7 +1158,7 @@ class LearnerIO : public LearnerConfiguration {
      Json memory_snapshot;
      if (header[1] == '"') {
        memory_snapshot = Json::Load(StringView{buffer});
-        LOG(WARNING) << ModelMsg();
+        error::WarnOldSerialization();
      } else if (std::isalpha(header[1])) {
        memory_snapshot = Json::Load(StringView{buffer}, std::ios::binary);
      } else {
@@ -1193,7 +1177,7 @@ class LearnerIO : public LearnerConfiguration {
      header.resize(serialisation_header_.size());
      CHECK_EQ(fp.Read(&header[0], header.size()), serialisation_header_.size());
      // Avoid printing the content in loaded header, which might be random binary code.
-      CHECK(header == serialisation_header_) << ModelMsg();
+      CHECK(header == serialisation_header_) << error::OldSerialization();
      int64_t sz {-1};
      CHECK_EQ(fp.Read(&sz, sizeof(sz)), sizeof(sz));
      if (!DMLC_IO_NO_ENDIAN_SWAP) {
@@ -1413,6 +1397,8 @@ class LearnerImpl : public LearnerIO {
    this->CheckModelInitialized();

    auto& out_predictions = this->GetThreadLocal().prediction_entry;
+    out_predictions.version = 0;
+
    this->gbm_->InplacePredict(p_m, missing, &out_predictions, iteration_begin, iteration_end);
    if (type == PredictionType::kValue) {
      obj_->PredTransform(&out_predictions.predictions);
--- a/src/objective/lambdarank_obj.cu
+++ b/src/objective/lambdarank_obj.cu
@@ -577,8 +577,8 @@ void LambdaRankUpdatePositionBias(Context const* ctx, linalg::VectorView<double
                       if (lj(0) >= Eps64()) {
                         tj_minus(i) = std::pow(lj(i) / lj(0), regularizer);
                       }
-                       assert(!std::isinf(ti_plus(i)));
-                       assert(!std::isinf(tj_minus(i)));
+                       assert(!isinf(ti_plus(i)));
+                       assert(!isinf(tj_minus(i)));
                     });
 }
 }  // namespace cuda_impl
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -16,6 +16,7 @@
 #include "../common/bitfield.h"               // for RBitField8
 #include "../common/categorical.h"            // for IsCat, Decision
 #include "../common/common.h"                 // for DivRoundUp
+#include "../common/error_msg.h"              // for InplacePredictProxy
 #include "../common/math.h"                   // for CheckNAN
 #include "../common/threading_utils.h"        // for ParallelFor
 #include "../data/adapter.h"                  // for ArrayAdapter, CSRAdapter, CSRArrayAdapter
@@ -467,7 +468,6 @@ class ColumnSplitHelper {
  void MaskOneTree(RegTree::FVec const &feat, std::size_t tree_id, std::size_t row_id) {
    auto const &tree = *model_.trees[tree_id];
    auto const &cats = tree.GetCategoriesMatrix();
-    auto const has_categorical = tree.HasCategoricalSplit();
    bst_node_t n_nodes = tree.GetNodes().size();

    for (bst_node_t nid = 0; nid < n_nodes; nid++) {
@@ -484,16 +484,10 @@ class ColumnSplitHelper {
      }

      auto const fvalue = feat.GetFvalue(split_index);
-      if (has_categorical && common::IsCat(cats.split_type, nid)) {
-        auto const node_categories =
-            cats.categories.subspan(cats.node_ptr[nid].beg, cats.node_ptr[nid].size);
-        if (!common::Decision(node_categories, fvalue)) {
-          decision_bits_.Set(bit_index);
-        }
-        continue;
-      }
-
-      if (fvalue >= node.SplitCond()) {
+      auto const decision = tree.HasCategoricalSplit()
+                                ? GetDecision<true>(node, nid, fvalue, cats)
+                                : GetDecision<false>(node, nid, fvalue, cats);
+      if (decision) {
        decision_bits_.Set(bit_index);
      }
    }
@@ -511,7 +505,7 @@ class ColumnSplitHelper {
    if (missing_bits_.Check(bit_index)) {
      return node.DefaultChild();
    } else {
-      return node.LeftChild() + decision_bits_.Check(bit_index);
+      return node.LeftChild() + !decision_bits_.Check(bit_index);
    }
  }

@@ -748,7 +742,7 @@ class CPUPredictor : public Predictor {
                      PredictionCacheEntry *out_preds, uint32_t tree_begin,
                      unsigned tree_end) const override {
    auto proxy = dynamic_cast<data::DMatrixProxy *>(p_m.get());
-    CHECK(proxy)<< "Inplace predict accepts only DMatrixProxy as input.";
+    CHECK(proxy)<< error::InplacePredictProxy();
    CHECK(!p_m->Info().IsColumnSplit())
        << "Inplace predict support for column-wise data split is not yet implemented.";
    auto x = proxy->Adapter();
@@ -890,9 +884,8 @@ class CPUPredictor : public Predictor {
    for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
      auto page = batch.GetView();
      // parallel over local batch
-      const auto nsize = static_cast<bst_omp_uint>(batch.Size());
-      common::ParallelFor(nsize, n_threads, [&](bst_omp_uint i) {
-        auto row_idx = static_cast<size_t>(batch.base_rowid + i);
+      common::ParallelFor(batch.Size(), n_threads, [&](auto i) {
+        auto row_idx = batch.base_rowid + i;
        RegTree::FVec &feats = feat_vecs[omp_get_thread_num()];
        if (feats.Size() == 0) {
          feats.Init(num_feature);
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -11,10 +11,13 @@
 #include <any>  // for any, any_cast
 #include <memory>

+#include "../collective/communicator-inl.cuh"
 #include "../common/bitfield.h"
 #include "../common/categorical.h"
 #include "../common/common.h"
+#include "../common/cuda_context.cuh"  // for CUDAContext
 #include "../common/device_helpers.cuh"
+#include "../common/error_msg.h"  // for InplacePredictProxy
 #include "../data/device_adapter.cuh"
 #include "../data/ellpack_page.cuh"
 #include "../data/proxy_dmatrix.h"
@@ -110,13 +113,11 @@ struct SparsePageLoader {
  bool use_shared;
  SparsePageView data;
  float* smem;
-  size_t entry_start;

  __device__ SparsePageLoader(SparsePageView data, bool use_shared, bst_feature_t num_features,
                              bst_row_t num_rows, size_t entry_start, float)
      : use_shared(use_shared),
-        data(data),
-        entry_start(entry_start) {
+        data(data) {
    extern __shared__ float _smem[];
    smem = _smem;
    // Copy instances
@@ -622,6 +623,199 @@ size_t SharedMemoryBytes(size_t cols, size_t max_shared_memory_bytes) {
  }
  return shared_memory_bytes;
 }
+
+using BitVector = LBitField64;
+
+__global__ void MaskBitVectorKernel(
+    SparsePageView data, common::Span<RegTree::Node const> d_nodes,
+    common::Span<std::size_t const> d_tree_segments, common::Span<int const> d_tree_group,
+    common::Span<FeatureType const> d_tree_split_types,
+    common::Span<std::uint32_t const> d_cat_tree_segments,
+    common::Span<RegTree::CategoricalSplitMatrix::Segment const> d_cat_node_segments,
+    common::Span<std::uint32_t const> d_categories, BitVector decision_bits, BitVector missing_bits,
+    std::size_t tree_begin, std::size_t tree_end, std::size_t num_features, std::size_t num_rows,
+    std::size_t entry_start, std::size_t num_nodes, bool use_shared, float missing) {
+  auto const row_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (row_idx >= num_rows) {
+    return;
+  }
+  SparsePageLoader loader(data, use_shared, num_features, num_rows, entry_start, missing);
+
+  std::size_t tree_offset = 0;
+  for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
+    TreeView d_tree{tree_begin,          tree_idx,           d_nodes,
+                    d_tree_segments,     d_tree_split_types, d_cat_tree_segments,
+                    d_cat_node_segments, d_categories};
+    auto const tree_nodes = d_tree.d_tree.size();
+    for (auto nid = 0; nid < tree_nodes; nid++) {
+      auto const& node = d_tree.d_tree[nid];
+      if (node.IsDeleted() || node.IsLeaf()) {
+          continue;
+      }
+      auto const fvalue = loader.GetElement(row_idx, node.SplitIndex());
+      auto const is_missing = common::CheckNAN(fvalue);
+      auto const bit_index = row_idx * num_nodes + tree_offset + nid;
+      if (is_missing) {
+          missing_bits.Set(bit_index);
+      } else {
+          auto const decision = d_tree.HasCategoricalSplit()
+                                    ? GetDecision<true>(node, nid, fvalue, d_tree.cats)
+                                    : GetDecision<false>(node, nid, fvalue, d_tree.cats);
+          if (decision) {
+            decision_bits.Set(bit_index);
+          }
+      }
+    }
+    tree_offset += tree_nodes;
+  }
+}
+
+__device__ float GetLeafWeightByBitVector(bst_row_t ridx, TreeView const& tree,
+                                          BitVector const& decision_bits,
+                                          BitVector const& missing_bits, std::size_t num_nodes,
+                                          std::size_t tree_offset) {
+  bst_node_t nidx = 0;
+  RegTree::Node n = tree.d_tree[nidx];
+  while (!n.IsLeaf()) {
+    auto const bit_index = ridx * num_nodes + tree_offset + nidx;
+    if (missing_bits.Check(bit_index)) {
+      nidx = n.DefaultChild();
+    } else {
+      nidx = n.LeftChild() + !decision_bits.Check(bit_index);
+    }
+    n = tree.d_tree[nidx];
+  }
+  return tree.d_tree[nidx].LeafValue();
+}
+
+__global__ void PredictByBitVectorKernel(
+    common::Span<RegTree::Node const> d_nodes, common::Span<float> d_out_predictions,
+    common::Span<std::size_t const> d_tree_segments, common::Span<int const> d_tree_group,
+    common::Span<FeatureType const> d_tree_split_types,
+    common::Span<std::uint32_t const> d_cat_tree_segments,
+    common::Span<RegTree::CategoricalSplitMatrix::Segment const> d_cat_node_segments,
+    common::Span<std::uint32_t const> d_categories, BitVector decision_bits, BitVector missing_bits,
+    std::size_t tree_begin, std::size_t tree_end, std::size_t num_rows, std::size_t num_nodes,
+    std::uint32_t num_group) {
+  auto const row_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (row_idx >= num_rows) {
+    return;
+  }
+
+  std::size_t tree_offset = 0;
+  if (num_group == 1) {
+    float sum = 0;
+    for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
+      TreeView d_tree{tree_begin,          tree_idx,           d_nodes,
+                      d_tree_segments,     d_tree_split_types, d_cat_tree_segments,
+                      d_cat_node_segments, d_categories};
+      sum += GetLeafWeightByBitVector(row_idx, d_tree, decision_bits, missing_bits, num_nodes,
+                                      tree_offset);
+      tree_offset += d_tree.d_tree.size();
+    }
+    d_out_predictions[row_idx] += sum;
+  } else {
+    for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
+      auto const tree_group = d_tree_group[tree_idx];
+      TreeView d_tree{tree_begin,          tree_idx,           d_nodes,
+                      d_tree_segments,     d_tree_split_types, d_cat_tree_segments,
+                      d_cat_node_segments, d_categories};
+      bst_uint out_prediction_idx = row_idx * num_group + tree_group;
+      d_out_predictions[out_prediction_idx] += GetLeafWeightByBitVector(
+          row_idx, d_tree, decision_bits, missing_bits, num_nodes, tree_offset);
+      tree_offset += d_tree.d_tree.size();
+    }
+  }
+}
+
+class ColumnSplitHelper {
+ public:
+  explicit ColumnSplitHelper(Context const* ctx) : ctx_{ctx} {}
+
+  void PredictBatch(DMatrix* dmat, HostDeviceVector<float>* out_preds,
+                    gbm::GBTreeModel const& model, DeviceModel const& d_model) const {
+    CHECK(dmat->PageExists<SparsePage>()) << "Column split for external memory is not support.";
+    PredictDMatrix(dmat, out_preds, d_model, model.learner_model_param->num_feature,
+                   model.learner_model_param->num_output_group);
+  }
+
+ private:
+  using BitType = BitVector::value_type;
+
+  void PredictDMatrix(DMatrix* dmat, HostDeviceVector<float>* out_preds, DeviceModel const& model,
+                      bst_feature_t num_features, std::uint32_t num_group) const {
+    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
+    dh::caching_device_vector<BitType> decision_storage{};
+    dh::caching_device_vector<BitType> missing_storage{};
+
+    auto constexpr kBlockThreads = 128;
+    auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->gpu_id);
+    auto const shared_memory_bytes =
+        SharedMemoryBytes<kBlockThreads>(num_features, max_shared_memory_bytes);
+    auto const use_shared = shared_memory_bytes != 0;
+
+    auto const num_nodes = model.nodes.Size();
+    std::size_t batch_offset = 0;
+    for (auto const& batch : dmat->GetBatches<SparsePage>()) {
+      auto const num_rows = batch.Size();
+      ResizeBitVectors(&decision_storage, &missing_storage, num_rows * num_nodes);
+      BitVector decision_bits{dh::ToSpan(decision_storage)};
+      BitVector missing_bits{dh::ToSpan(missing_storage)};
+
+      batch.offset.SetDevice(ctx_->gpu_id);
+      batch.data.SetDevice(ctx_->gpu_id);
+      std::size_t entry_start = 0;
+      SparsePageView data(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), num_features);
+
+      auto const grid = static_cast<uint32_t>(common::DivRoundUp(num_rows, kBlockThreads));
+      dh::LaunchKernel {grid, kBlockThreads, shared_memory_bytes, ctx_->CUDACtx()->Stream()} (
+          MaskBitVectorKernel, data, model.nodes.ConstDeviceSpan(),
+          model.tree_segments.ConstDeviceSpan(), model.tree_group.ConstDeviceSpan(),
+          model.split_types.ConstDeviceSpan(), model.categories_tree_segments.ConstDeviceSpan(),
+          model.categories_node_segments.ConstDeviceSpan(), model.categories.ConstDeviceSpan(),
+          decision_bits, missing_bits, model.tree_beg_, model.tree_end_, num_features, num_rows,
+          entry_start, num_nodes, use_shared, nan(""));
+
+      AllReduceBitVectors(&decision_storage, &missing_storage);
+
+      dh::LaunchKernel {grid, kBlockThreads, 0, ctx_->CUDACtx()->Stream()} (
+          PredictByBitVectorKernel, model.nodes.ConstDeviceSpan(),
+          out_preds->DeviceSpan().subspan(batch_offset), model.tree_segments.ConstDeviceSpan(),
+          model.tree_group.ConstDeviceSpan(), model.split_types.ConstDeviceSpan(),
+          model.categories_tree_segments.ConstDeviceSpan(),
+          model.categories_node_segments.ConstDeviceSpan(), model.categories.ConstDeviceSpan(),
+          decision_bits, missing_bits, model.tree_beg_, model.tree_end_, num_rows, num_nodes,
+          num_group);
+
+      batch_offset += batch.Size() * num_group;
+    }
+  }
+
+  void AllReduceBitVectors(dh::caching_device_vector<BitType>* decision_storage,
+                           dh::caching_device_vector<BitType>* missing_storage) const {
+    collective::AllReduce<collective::Operation::kBitwiseOR>(
+        ctx_->gpu_id, decision_storage->data().get(), decision_storage->size());
+    collective::AllReduce<collective::Operation::kBitwiseAND>(
+        ctx_->gpu_id, missing_storage->data().get(), missing_storage->size());
+    collective::Synchronize(ctx_->gpu_id);
+  }
+
+  void ResizeBitVectors(dh::caching_device_vector<BitType>* decision_storage,
+                               dh::caching_device_vector<BitType>* missing_storage,
+                               std::size_t total_bits) const {
+    auto const size = BitVector::ComputeStorageSize(total_bits);
+    if (decision_storage->size() < size) {
+      decision_storage->resize(size);
+    }
+    thrust::fill(ctx_->CUDACtx()->CTP(), decision_storage->begin(), decision_storage->end(), 0);
+    if (missing_storage->size() < size) {
+      missing_storage->resize(size);
+    }
+    thrust::fill(ctx_->CUDACtx()->CTP(), missing_storage->begin(), missing_storage->end(), 0);
+  }
+
+  Context const* ctx_;
+};
 }  // anonymous namespace

 class GPUPredictor : public xgboost::Predictor {
@@ -697,6 +891,11 @@ class GPUPredictor : public xgboost::Predictor {
    DeviceModel d_model;
    d_model.Init(model, tree_begin, tree_end, ctx_->gpu_id);

+    if (dmat->Info().IsColumnSplit()) {
+      column_split_helper_.PredictBatch(dmat, out_preds, model, d_model);
+      return;
+    }
+
    if (dmat->PageExists<SparsePage>()) {
      size_t batch_offset = 0;
      for (auto &batch : dmat->GetBatches<SparsePage>()) {
@@ -720,7 +919,8 @@ class GPUPredictor : public xgboost::Predictor {
  }

 public:
-  explicit GPUPredictor(Context const* ctx) : Predictor::Predictor{ctx} {}
+  explicit GPUPredictor(Context const* ctx)
+      : Predictor::Predictor{ctx}, column_split_helper_{ctx} {}

  ~GPUPredictor() override {
    if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) {
@@ -790,7 +990,7 @@ class GPUPredictor : public xgboost::Predictor {
                      PredictionCacheEntry* out_preds, uint32_t tree_begin,
                      unsigned tree_end) const override {
    auto proxy = dynamic_cast<data::DMatrixProxy*>(p_m.get());
-    CHECK(proxy)<< "Inplace predict accepts only DMatrixProxy as input.";
+    CHECK(proxy) << error::InplacePredictProxy();
    auto x = proxy->Adapter();
    if (x.type() == typeid(std::shared_ptr<data::CupyAdapter>)) {
      this->DispatchedInplacePredict<data::CupyAdapter,
@@ -1019,6 +1219,8 @@ class GPUPredictor : public xgboost::Predictor {
    }
    return 0;
  }
+
+  ColumnSplitHelper column_split_helper_;
 };

 XGBOOST_REGISTER_PREDICTOR(GPUPredictor, "gpu_predictor")
--- a/src/predictor/predict_fn.h
+++ b/src/predictor/predict_fn.h
@@ -7,6 +7,18 @@
 #include "xgboost/tree_model.h"

 namespace xgboost::predictor {
+/** @brief Whether it should traverse to the left branch of a tree. */
+template <bool has_categorical>
+XGBOOST_DEVICE bool GetDecision(RegTree::Node const &node, bst_node_t nid, float fvalue,
+                                RegTree::CategoricalSplitMatrix const &cats) {
+  if (has_categorical && common::IsCat(cats.split_type, nid)) {
+    auto node_categories = cats.categories.subspan(cats.node_ptr[nid].beg, cats.node_ptr[nid].size);
+    return common::Decision(node_categories, fvalue);
+  } else {
+    return fvalue < node.SplitCond();
+  }
+}
+
 template <bool has_missing, bool has_categorical>
 inline XGBOOST_DEVICE bst_node_t GetNextNode(const RegTree::Node &node, const bst_node_t nid,
                                             float fvalue, bool is_missing,
@@ -14,13 +26,7 @@ inline XGBOOST_DEVICE bst_node_t GetNextNode(const RegTree::Node &node, const bs
  if (has_missing && is_missing) {
    return node.DefaultChild();
  } else {
-    if (has_categorical && common::IsCat(cats.split_type, nid)) {
-      auto node_categories =
-          cats.categories.subspan(cats.node_ptr[nid].beg, cats.node_ptr[nid].size);
-      return common::Decision(node_categories, fvalue) ? node.LeftChild() : node.RightChild();
-    } else {
-      return node.LeftChild() + !(fvalue < node.SplitCond());
-    }
+    return node.LeftChild() + !GetDecision<has_categorical>(node, nid, fvalue, cats);
  }
 }

--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -226,9 +226,7 @@ struct GPUHistMakerDevice {
    monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id));
  }

-  ~GPUHistMakerDevice() {  // NOLINT
-    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
-  }
+  ~GPUHistMakerDevice() = default;

  void InitFeatureGroupsOnce() {
    if (!feature_groups) {
--- a/tests/buildkite/conftest.sh
+++ b/tests/buildkite/conftest.sh
@@ -24,7 +24,7 @@ set -x

 CUDA_VERSION=11.8.0
 NCCL_VERSION=2.16.5-1
-RAPIDS_VERSION=23.06
+RAPIDS_VERSION=23.08
 SPARK_VERSION=3.4.0
 JDK_VERSION=8

--- a/tests/ci_build/lint_python.py
+++ b/tests/ci_build/lint_python.py
@@ -23,7 +23,11 @@ class LintersPaths:
        "tests/python/test_predict.py",
        "tests/python/test_quantile_dmatrix.py",
        "tests/python/test_tree_regularization.py",
+        "tests/python/test_shap.py",
        "tests/python-gpu/test_gpu_data_iterator.py",
+        "tests/python-gpu/test_gpu_prediction.py",
+        "tests/python-gpu/load_pickle.py",
+        "tests/python-gpu/test_gpu_pickling.py",
        "tests/test_distributed/test_with_spark/",
        "tests/test_distributed/test_gpu_with_spark/",
        # demo
@@ -37,6 +41,7 @@ class LintersPaths:
        "demo/guide-python/quantile_regression.py",
        "demo/guide-python/multioutput_regression.py",
        "demo/guide-python/learning_to_rank.py",
+        "demo/aft_survival/aft_survival_viz_demo.py",
        # CI
        "tests/ci_build/lint_python.py",
        "tests/ci_build/test_r_package.py",
@@ -66,6 +71,7 @@ class LintersPaths:
        "tests/python/test_dt.py",
        "tests/python/test_data_iterator.py",
        "tests/python-gpu/test_gpu_data_iterator.py",
+        "tests/python-gpu/load_pickle.py",
        "tests/test_distributed/test_with_spark/test_data.py",
        "tests/test_distributed/test_gpu_with_spark/test_data.py",
        "tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py",
@@ -78,6 +84,7 @@ class LintersPaths:
        "demo/guide-python/quantile_regression.py",
        "demo/guide-python/multioutput_regression.py",
        "demo/guide-python/learning_to_rank.py",
+        "demo/aft_survival/aft_survival_viz_demo.py",
        # CI
        "tests/ci_build/lint_python.py",
        "tests/ci_build/test_r_package.py",
@@ -114,7 +121,13 @@ def run_black(rel_path: str, fix: bool) -> bool:
@cd(PY_PACKAGE)
 def run_isort(rel_path: str, fix: bool) -> bool:
    # Isort gets confused when trying to find the config file, so specified explicitly.
-    cmd = ["isort", "--settings-path", PY_PACKAGE, os.path.join(ROOT, rel_path)]
+    cmd = [
+        "isort",
+        "--settings-path",
+        PY_PACKAGE,
+        f"--src={PY_PACKAGE}",
+        os.path.join(ROOT, rel_path),
+    ]
    if not fix:
        cmd += ["--check"]

--- a/tests/cpp/collective/test_nccl_device_communicator.cu
+++ b/tests/cpp/collective/test_nccl_device_communicator.cu
@@ -16,12 +16,7 @@ namespace xgboost {
 namespace collective {

 TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidDeviceOrdinal) {
-  auto construct = []() { NcclDeviceCommunicator comm{-1, nullptr}; };
-  EXPECT_THROW(construct(), dmlc::Error);
-}
-
-TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidCommunicator) {
-  auto construct = []() { NcclDeviceCommunicator comm{0, nullptr}; };
+  auto construct = []() { NcclDeviceCommunicator comm{-1, false}; };
  EXPECT_THROW(construct(), dmlc::Error);
 }

--- a/tests/cpp/common/test_hist_util.cu
+++ b/tests/cpp/common/test_hist_util.cu
@@ -143,11 +143,14 @@ TEST(HistUtil, DeviceSketchCategoricalFeatures) {

 void TestMixedSketch() {
  size_t n_samples = 1000, n_features = 2, n_categories = 3;
+  bst_bin_t n_bins = 64;
+
  std::vector<float> data(n_samples * n_features);
  SimpleLCG gen;
  SimpleRealUniformDistribution<float> cat_d{0.0f, static_cast<float>(n_categories)};
  SimpleRealUniformDistribution<float> num_d{0.0f, 3.0f};
  for (size_t i = 0; i < n_samples * n_features; ++i) {
+    // two features, row major. The first column is numeric and the second is categorical.
    if (i % 2 == 0) {
      data[i] = std::floor(cat_d(&gen));
    } else {
@@ -159,12 +162,75 @@ void TestMixedSketch() {
  m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);
  m->Info().feature_types.HostVector().push_back(FeatureType::kNumerical);

-  auto cuts = DeviceSketch(0, m.get(), 64);
-  ASSERT_EQ(cuts.Values().size(), 64 + n_categories);
+  auto cuts = DeviceSketch(0, m.get(), n_bins);
+  ASSERT_EQ(cuts.Values().size(), n_bins + n_categories);
 }

-TEST(HistUtil, DeviceSketchMixedFeatures) {
-  TestMixedSketch();
+TEST(HistUtil, DeviceSketchMixedFeatures) { TestMixedSketch(); }
+
+TEST(HistUtil, RemoveDuplicatedCategories) {
+  bst_row_t n_samples = 512;
+  bst_feature_t n_features = 3;
+  bst_cat_t n_categories = 5;
+
+  auto ctx = MakeCUDACtx(0);
+  SimpleLCG rng;
+  SimpleRealUniformDistribution<float> cat_d{0.0f, static_cast<float>(n_categories)};
+
+  dh::device_vector<Entry> sorted_entries(n_samples * n_features);
+  for (std::size_t i = 0; i < n_samples; ++i) {
+    for (bst_feature_t j = 0; j < n_features; ++j) {
+      float fvalue{0.0f};
+      // The second column is categorical
+      if (j == 1) {
+        fvalue = std::floor(cat_d(&rng));
+      } else {
+        fvalue = i;
+      }
+      sorted_entries[i * n_features + j] = Entry{j, fvalue};
+    }
+  }
+
+  MetaInfo info;
+  info.num_col_ = n_features;
+  info.num_row_ = n_samples;
+  info.feature_types.HostVector() = std::vector<FeatureType>{
+      FeatureType::kNumerical, FeatureType::kCategorical, FeatureType::kNumerical};
+  ASSERT_EQ(info.feature_types.Size(), n_features);
+
+  HostDeviceVector<bst_row_t> cuts_ptr{0, n_samples, n_samples * 2, n_samples * 3};
+  cuts_ptr.SetDevice(0);
+
+  dh::device_vector<float> weight(n_samples * n_features, 0);
+  dh::Iota(dh::ToSpan(weight));
+
+  dh::caching_device_vector<bst_row_t> columns_ptr(4);
+  for (std::size_t i = 0; i < columns_ptr.size(); ++i) {
+    columns_ptr[i] = i * n_samples;
+  }
+  // sort into column major
+  thrust::sort_by_key(sorted_entries.begin(), sorted_entries.end(), weight.begin(),
+                      detail::EntryCompareOp());
+
+  detail::RemoveDuplicatedCategories(ctx.gpu_id, info, cuts_ptr.DeviceSpan(), &sorted_entries,
+                                     &weight, &columns_ptr);
+
+  auto const& h_cptr = cuts_ptr.ConstHostVector();
+  ASSERT_EQ(h_cptr.back(), n_samples * 2 + n_categories);
+  // check numerical
+  for (std::size_t i = 0; i < n_samples; ++i) {
+    ASSERT_EQ(weight[i], i * 3);
+  }
+  auto beg = n_samples + n_categories;
+  for (std::size_t i = 0; i < n_samples; ++i) {
+    ASSERT_EQ(weight[i + beg], i * 3 + 2);
+  }
+  // check categorical
+  beg = n_samples;
+  for (std::size_t i = 0; i < n_categories; ++i) {
+    // all from the second column
+    ASSERT_EQ(static_cast<bst_feature_t>(weight[i + beg]) % n_features, 1);
+  }
 }

 TEST(HistUtil, DeviceSketchMultipleColumns) {
--- a/tests/cpp/common/test_io.cc
+++ b/tests/cpp/common/test_io.cc
@@ -3,11 +3,12 @@
 */
 #include <gtest/gtest.h>

-#include <fstream>
+#include <cstddef>  // for size_t
+#include <fstream>  // for ofstream

 #include "../../../src/common/io.h"
-#include "../helpers.h"
 #include "../filesystem.h"  // dmlc::TemporaryDirectory
+#include "../helpers.h"

 namespace xgboost::common {
 TEST(MemoryFixSizeBuffer, Seek) {
@@ -89,6 +90,57 @@ TEST(IO, LoadSequentialFile) {
  ASSERT_THROW(LoadSequentialFile("non-exist", true), dmlc::Error);
 }

+TEST(IO, Resource) {
+  {
+    // test malloc basic
+    std::size_t n = 128;
+    std::shared_ptr<ResourceHandler> resource = std::make_shared<MallocResource>(n);
+    ASSERT_EQ(resource->Size(), n);
+    ASSERT_EQ(resource->Type(), ResourceHandler::kMalloc);
+  }
+
+  // test malloc resize
+  auto test_malloc_resize = [](bool force_malloc) {
+    std::size_t n = 64;
+    std::shared_ptr<ResourceHandler> resource = std::make_shared<MallocResource>(n);
+    auto ptr = reinterpret_cast<std::uint8_t *>(resource->Data());
+    std::iota(ptr, ptr + n, 0);
+
+    auto malloc_resource = std::dynamic_pointer_cast<MallocResource>(resource);
+    ASSERT_TRUE(malloc_resource);
+    if (force_malloc) {
+      malloc_resource->Resize<true>(n * 2);
+    } else {
+      malloc_resource->Resize<false>(n * 2);
+    }
+    for (std::size_t i = 0; i < n; ++i) {
+      ASSERT_EQ(malloc_resource->DataAs<std::uint8_t>()[i], i) << force_malloc;
+    }
+    for (std::size_t i = n; i < 2 * n; ++i) {
+      ASSERT_EQ(malloc_resource->DataAs<std::uint8_t>()[i], 0);
+    }
+  };
+  test_malloc_resize(true);
+  test_malloc_resize(false);
+
+  {
+    // test mmap
+    dmlc::TemporaryDirectory tmpdir;
+    auto path = tmpdir.path + "/testfile";
+
+    std::ofstream fout(path, std::ios::binary);
+    double val{1.0};
+    fout.write(reinterpret_cast<char const *>(&val), sizeof(val));
+    fout << 1.0 << std::endl;
+    fout.close();
+
+    auto resource = std::make_shared<MmapResource>(path, 0, sizeof(double));
+    ASSERT_EQ(resource->Size(), sizeof(double));
+    ASSERT_EQ(resource->Type(), ResourceHandler::kMmap);
+    ASSERT_EQ(resource->DataAs<double>()[0], val);
+  }
+}
+
 TEST(IO, PrivateMmapStream) {
  dmlc::TemporaryDirectory tempdir;
  auto path = tempdir.path + "/testfile";
@@ -124,17 +176,35 @@ TEST(IO, PrivateMmapStream) {
  // Turn size info offset
  std::partial_sum(offset.begin(), offset.end(), offset.begin());

+  // Test read
  for (std::size_t i = 0; i < n_batches; ++i) {
    std::size_t off = offset[i];
    std::size_t n = offset.at(i + 1) - offset[i];
-    std::unique_ptr<dmlc::Stream> fi{std::make_unique<PrivateMmapConstStream>(path, off, n)};
+    auto fi{std::make_unique<PrivateMmapConstStream>(path, off, n)};
    std::vector<T> data;

    std::uint64_t size{0};
-    fi->Read(&size);
+    ASSERT_TRUE(fi->Read(&size));
+    ASSERT_EQ(fi->Tell(), sizeof(size));
    data.resize(size);

-    fi->Read(data.data(), size * sizeof(T));
+    ASSERT_EQ(fi->Read(data.data(), size * sizeof(T)), size * sizeof(T));
+    ASSERT_EQ(data, batches[i]);
+  }
+
+  // Test consume
+  for (std::size_t i = 0; i < n_batches; ++i) {
+    std::size_t off = offset[i];
+    std::size_t n = offset.at(i + 1) - offset[i];
+    std::unique_ptr<AlignedResourceReadStream> fi{std::make_unique<PrivateMmapConstStream>(path, off, n)};
+    std::vector<T> data;
+
+    std::uint64_t size{0};
+    ASSERT_TRUE(fi->Consume(&size));
+    ASSERT_EQ(fi->Tell(), sizeof(size));
+    data.resize(size);
+
+    ASSERT_EQ(fi->Read(data.data(), size * sizeof(T)), sizeof(T) * size);
    ASSERT_EQ(data, batches[i]);
  }
 }
--- a/tests/cpp/common/test_json.cc
+++ b/tests/cpp/common/test_json.cc
@@ -41,7 +41,6 @@ std::string GetModelStr() {
    "num_class": "0",
    "num_feature": "10",
    "objective": "reg:linear",
-    "predictor": "gpu_predictor",
    "tree_method": "gpu_hist",
    "updater": "grow_gpu_hist"
  },
--- a/tests/cpp/common/test_ref_resource_view.cc
+++ b/tests/cpp/common/test_ref_resource_view.cc
@@ -0,0 +1,108 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+
+#include <cstddef>  // for size_t
+#include <memory>   // for make_shared, make_unique
+#include <numeric>  // for iota
+#include <vector>   // for vector
+
+#include "../../../src/common/ref_resource_view.h"
+#include "dmlc/filesystem.h"  // for TemporaryDirectory
+
+namespace xgboost::common {
+TEST(RefResourceView, Basic) {
+  std::size_t n_bytes = 1024;
+  auto mem = std::make_shared<MallocResource>(n_bytes);
+  {
+    RefResourceView view{reinterpret_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem};
+
+    RefResourceView kview{reinterpret_cast<float const*>(mem->Data()), mem->Size() / sizeof(float),
+                          mem};
+    ASSERT_EQ(mem.use_count(), 3);
+    ASSERT_EQ(view.size(), n_bytes / sizeof(1024));
+    ASSERT_EQ(kview.size(), n_bytes / sizeof(1024));
+  }
+  {
+    RefResourceView view{reinterpret_cast<float*>(mem->Data()), mem->Size() / sizeof(float), mem,
+                         1.5f};
+    for (auto v : view) {
+      ASSERT_EQ(v, 1.5f);
+    }
+    std::iota(view.begin(), view.end(), 0.0f);
+    ASSERT_EQ(view.front(), 0.0f);
+    ASSERT_EQ(view.back(), static_cast<float>(view.size() - 1));
+
+    view.front() = 1.0f;
+    view.back() = 2.0f;
+    ASSERT_EQ(view.front(), 1.0f);
+    ASSERT_EQ(view.back(), 2.0f);
+  }
+  ASSERT_EQ(mem.use_count(), 1);
+}
+
+TEST(RefResourceView, IO) {
+  dmlc::TemporaryDirectory tmpdir;
+  auto path = tmpdir.path + "/testfile";
+  auto data = MakeFixedVecWithMalloc(123, std::size_t{1});
+
+  {
+    auto fo = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
+    ASSERT_EQ(fo->Write(data.data(), data.size_bytes()), data.size_bytes());
+  }
+  {
+    auto fo = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
+    ASSERT_EQ(WriteVec(fo.get(), data),
+              data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type));
+  }
+  {
+    auto fi = std::make_unique<PrivateMmapConstStream>(
+        path, 0, data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type));
+    auto read = MakeFixedVecWithMalloc(123, std::size_t{1});
+    ASSERT_TRUE(ReadVec(fi.get(), &read));
+    for (auto v : read) {
+      ASSERT_EQ(v, 1ul);
+    }
+  }
+}
+
+TEST(RefResourceView, IOAligned) {
+  dmlc::TemporaryDirectory tmpdir;
+  auto path = tmpdir.path + "/testfile";
+  auto data = MakeFixedVecWithMalloc(123, 1.0f);
+
+  {
+    auto fo = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
+    // + sizeof(float) for alignment
+    ASSERT_EQ(WriteVec(fo.get(), data),
+              data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type) + sizeof(float));
+  }
+  {
+    auto fi = std::make_unique<PrivateMmapConstStream>(
+        path, 0, data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type));
+    // wrong type, float vs. double
+    auto read = MakeFixedVecWithMalloc(123, 2.0);
+    ASSERT_FALSE(ReadVec(fi.get(), &read));
+  }
+  {
+    auto fi = std::make_unique<PrivateMmapConstStream>(
+        path, 0, data.size_bytes() + sizeof(RefResourceView<std::size_t>::size_type));
+    auto read = MakeFixedVecWithMalloc(123, 2.0f);
+    ASSERT_TRUE(ReadVec(fi.get(), &read));
+    for (auto v : read) {
+      ASSERT_EQ(v, 1ul);
+    }
+  }
+  {
+    // Test std::vector
+    std::vector<float> data(123);
+    std::iota(data.begin(), data.end(), 0.0f);
+    auto fo = std::make_unique<AlignedFileWriteStream>(StringView{path}, "wb");
+    // + sizeof(float) for alignment
+    ASSERT_EQ(WriteVec(fo.get(), data), data.size() * sizeof(float) +
+                                            sizeof(RefResourceView<std::size_t>::size_type) +
+                                            sizeof(float));
+  }
+}
+}  // namespace xgboost::common
--- a/tests/cpp/data/test_ellpack_page_raw_format.cu
+++ b/tests/cpp/data/test_ellpack_page_raw_format.cu
@@ -4,14 +4,14 @@
 #include <gtest/gtest.h>
 #include <xgboost/data.h>

+#include "../../../src/common/io.h"  // for PrivateMmapConstStream, AlignedResourceReadStream...
 #include "../../../src/data/ellpack_page.cuh"
 #include "../../../src/data/sparse_page_source.h"
 #include "../../../src/tree/param.h"  // TrainParam
 #include "../filesystem.h"            // dmlc::TemporaryDirectory
 #include "../helpers.h"

-namespace xgboost {
-namespace data {
+namespace xgboost::data {
 TEST(EllpackPageRawFormat, IO) {
  Context ctx{MakeCUDACtx(0)};
  auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
@@ -22,15 +22,17 @@ TEST(EllpackPageRawFormat, IO) {
  dmlc::TemporaryDirectory tmpdir;
  std::string path = tmpdir.path + "/ellpack.page";

+  std::size_t n_bytes{0};
  {
-    std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
+    auto fo = std::make_unique<common::AlignedFileWriteStream>(StringView{path}, "wb");
    for (auto const &ellpack : m->GetBatches<EllpackPage>(&ctx, param)) {
-      format->Write(ellpack, fo.get());
+      n_bytes += format->Write(ellpack, fo.get());
    }
  }

  EllpackPage page;
-  std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(path.c_str())};
+  std::unique_ptr<common::AlignedResourceReadStream> fi{
+      std::make_unique<common::PrivateMmapConstStream>(path.c_str(), 0, n_bytes)};
  format->Read(&page, fi.get());

  for (auto const &ellpack : m->GetBatches<EllpackPage>(&ctx, param)) {
@@ -44,5 +46,4 @@ TEST(EllpackPageRawFormat, IO) {
    ASSERT_EQ(loaded->gidx_buffer.HostVector(), orig->gidx_buffer.HostVector());
  }
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
--- a/tests/cpp/data/test_gradient_index.cc
+++ b/tests/cpp/data/test_gradient_index.cc
@@ -26,28 +26,32 @@
 #include "xgboost/context.h"                    // for Context
 #include "xgboost/host_device_vector.h"         // for HostDeviceVector

-namespace xgboost {
-namespace data {
-TEST(GradientIndex, ExternalMemory) {
+namespace xgboost::data {
+TEST(GradientIndex, ExternalMemoryBaseRowID) {
  Context ctx;
-  std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(10000);
+  auto p_fmat = RandomDataGenerator{4096, 256, 0.5}
+                    .Device(ctx.gpu_id)
+                    .Batches(8)
+                    .GenerateSparsePageDMatrix("cache", true);
+
  std::vector<size_t> base_rowids;
-  std::vector<float> hessian(dmat->Info().num_row_, 1);
-  for (auto const &page : dmat->GetBatches<GHistIndexMatrix>(&ctx, {64, hessian, true})) {
+  std::vector<float> hessian(p_fmat->Info().num_row_, 1);
+  for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(&ctx, {64, hessian, true})) {
    base_rowids.push_back(page.base_rowid);
  }
-  size_t i = 0;
-  for (auto const &page : dmat->GetBatches<SparsePage>()) {
+
+  std::size_t i = 0;
+  for (auto const &page : p_fmat->GetBatches<SparsePage>()) {
    ASSERT_EQ(base_rowids[i], page.base_rowid);
    ++i;
  }

  base_rowids.clear();
-  for (auto const &page : dmat->GetBatches<GHistIndexMatrix>(&ctx, {64, hessian, false})) {
+  for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(&ctx, {64, hessian, false})) {
    base_rowids.push_back(page.base_rowid);
  }
  i = 0;
-  for (auto const &page : dmat->GetBatches<SparsePage>()) {
+  for (auto const &page : p_fmat->GetBatches<SparsePage>()) {
    ASSERT_EQ(base_rowids[i], page.base_rowid);
    ++i;
  }
@@ -171,7 +175,7 @@ class GHistIndexMatrixTest : public testing::TestWithParam<std::tuple<float, flo
    gpu_ctx.gpu_id = 0;
    for (auto const &page : Xy->GetBatches<EllpackPage>(
             &gpu_ctx, BatchParam{kBins, tree::TrainParam::DftSparseThreshold()})) {
-      from_ellpack.reset(new GHistIndexMatrix{&ctx, Xy->Info(), page, p});
+      from_ellpack = std::make_unique<GHistIndexMatrix>(&ctx, Xy->Info(), page, p);
    }

    for (auto const &from_sparse_page : Xy->GetBatches<GHistIndexMatrix>(&ctx, p)) {
@@ -199,13 +203,15 @@ class GHistIndexMatrixTest : public testing::TestWithParam<std::tuple<float, flo

      std::string from_sparse_buf;
      {
-        common::MemoryBufferStream fo{&from_sparse_buf};
-        columns_from_sparse.Write(&fo);
+        common::AlignedMemWriteStream fo{&from_sparse_buf};
+        auto n_bytes = columns_from_sparse.Write(&fo);
+        ASSERT_EQ(fo.Tell(), n_bytes);
      }
      std::string from_ellpack_buf;
      {
-        common::MemoryBufferStream fo{&from_ellpack_buf};
-        columns_from_sparse.Write(&fo);
+        common::AlignedMemWriteStream fo{&from_ellpack_buf};
+        auto n_bytes = columns_from_sparse.Write(&fo);
+        ASSERT_EQ(fo.Tell(), n_bytes);
      }
      ASSERT_EQ(from_sparse_buf, from_ellpack_buf);
    }
@@ -229,5 +235,4 @@ INSTANTIATE_TEST_SUITE_P(GHistIndexMatrix, GHistIndexMatrixTest,
                                         std::make_tuple(.6f, .4)));  // dense columns

 #endif  // defined(XGBOOST_USE_CUDA)
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
--- a/tests/cpp/data/test_gradient_index_page_raw_format.cc
+++ b/tests/cpp/data/test_gradient_index_page_raw_format.cc
@@ -2,14 +2,18 @@
 * Copyright 2021-2023, XGBoost contributors
 */
 #include <gtest/gtest.h>
+#include <xgboost/context.h>  // for Context
+
+#include <cstddef>  // for size_t
+#include <memory>   // for unique_ptr

 #include "../../../src/common/column_matrix.h"
-#include "../../../src/data/gradient_index.h"
+#include "../../../src/common/io.h"            // for MmapResource, AlignedResourceReadStream...
+#include "../../../src/data/gradient_index.h"  // for GHistIndexMatrix
 #include "../../../src/data/sparse_page_source.h"
-#include "../helpers.h"
+#include "../helpers.h"  // for RandomDataGenerator

-namespace xgboost {
-namespace data {
+namespace xgboost::data {
 TEST(GHistIndexPageRawFormat, IO) {
  Context ctx;

@@ -20,15 +24,18 @@ TEST(GHistIndexPageRawFormat, IO) {
  std::string path = tmpdir.path + "/ghistindex.page";
  auto batch = BatchParam{256, 0.5};

+  std::size_t bytes{0};
  {
-    std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
+    auto fo = std::make_unique<common::AlignedFileWriteStream>(StringView{path}, "wb");
    for (auto const &index : m->GetBatches<GHistIndexMatrix>(&ctx, batch)) {
-      format->Write(index, fo.get());
+      bytes += format->Write(index, fo.get());
    }
  }

  GHistIndexMatrix page;
-  std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(path.c_str())};
+
+  std::unique_ptr<common::AlignedResourceReadStream> fi{
+      std::make_unique<common::PrivateMmapConstStream>(path, 0, bytes)};
  format->Read(&page, fi.get());

  for (auto const &gidx : m->GetBatches<GHistIndexMatrix>(&ctx, batch)) {
@@ -37,6 +44,8 @@ TEST(GHistIndexPageRawFormat, IO) {
    ASSERT_EQ(loaded.cut.MinValues(), page.cut.MinValues());
    ASSERT_EQ(loaded.cut.Values(), page.cut.Values());
    ASSERT_EQ(loaded.base_rowid, page.base_rowid);
+    ASSERT_EQ(loaded.row_ptr.size(), page.row_ptr.size());
+    ASSERT_TRUE(std::equal(loaded.row_ptr.cbegin(), loaded.row_ptr.cend(), page.row_ptr.cbegin()));
    ASSERT_EQ(loaded.IsDense(), page.IsDense());
    ASSERT_TRUE(std::equal(loaded.index.begin(), loaded.index.end(), page.index.begin()));
    ASSERT_TRUE(std::equal(loaded.index.Offset(), loaded.index.Offset() + loaded.index.OffsetSize(),
@@ -45,5 +54,4 @@ TEST(GHistIndexPageRawFormat, IO) {
    ASSERT_EQ(loaded.Transpose().GetTypeSize(), loaded.Transpose().GetTypeSize());
  }
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
--- a/tests/cpp/data/test_sparse_page_dmatrix.cc
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cc
@@ -76,9 +76,11 @@ TEST(SparsePageDMatrix, LoadFile) {
 // allow caller to retain pages so they can process multiple pages at the same time.
 template <typename Page>
 void TestRetainPage() {
-  auto m = CreateSparsePageDMatrix(10000);
+  std::size_t n_batches = 4;
+  auto p_fmat = RandomDataGenerator{1024, 128, 0.5f}.Batches(n_batches).GenerateSparsePageDMatrix(
+      "cache", true);
  Context ctx;
-  auto batches = m->GetBatches<Page>(&ctx);
+  auto batches = p_fmat->GetBatches<Page>(&ctx);
  auto begin = batches.begin();
  auto end = batches.end();

@@ -94,7 +96,7 @@ void TestRetainPage() {
    }
    ASSERT_EQ(pages.back().Size(), (*it).Size());
  }
-  ASSERT_GE(iterators.size(), 2);
+  ASSERT_GE(iterators.size(), n_batches);

  for (size_t i = 0; i < iterators.size(); ++i) {
    ASSERT_EQ((*iterators[i]).Size(), pages.at(i).Size());
@@ -102,7 +104,7 @@ void TestRetainPage() {
  }

  // make sure it's const and the caller can not modify the content of page.
-  for (auto &page : m->GetBatches<Page>({&ctx})) {
+  for (auto &page : p_fmat->GetBatches<Page>({&ctx})) {
    static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value);
  }
 }
--- a/tests/cpp/data/test_sparse_page_raw_format.cc
+++ b/tests/cpp/data/test_sparse_page_raw_format.cc
@@ -2,20 +2,20 @@
 * Copyright 2021-2023, XGBoost contributors
 */
 #include <gtest/gtest.h>
-#include <xgboost/data.h>                          // for CSCPage, SortedCSCPage, SparsePage
+#include <xgboost/data.h>  // for CSCPage, SortedCSCPage, SparsePage

-#include <memory>                                  // for allocator, unique_ptr, __shared_ptr_ac...
-#include <string>                                  // for char_traits, operator+, basic_string
+#include <memory>  // for allocator, unique_ptr, __shared_ptr_ac...
+#include <string>  // for char_traits, operator+, basic_string

+#include "../../../src/common/io.h"  // for PrivateMmapConstStream, AlignedResourceReadStream...
 #include "../../../src/data/sparse_page_writer.h"  // for CreatePageFormat
 #include "../helpers.h"                            // for RandomDataGenerator
 #include "dmlc/filesystem.h"                       // for TemporaryDirectory
-#include "dmlc/io.h"                               // for SeekStream, Stream
+#include "dmlc/io.h"                               // for Stream
 #include "gtest/gtest_pred_impl.h"                 // for Test, AssertionResult, ASSERT_EQ, TEST
 #include "xgboost/context.h"                       // for Context

-namespace xgboost {
-namespace data {
+namespace xgboost::data {
 template <typename S> void TestSparsePageRawFormat() {
  std::unique_ptr<SparsePageFormat<S>> format{CreatePageFormat<S>("raw")};
  Context ctx;
@@ -25,17 +25,19 @@ template <typename S> void TestSparsePageRawFormat() {
  dmlc::TemporaryDirectory tmpdir;
  std::string path = tmpdir.path + "/sparse.page";
  S orig;
+  std::size_t n_bytes{0};
  {
    // block code to flush the stream
-    std::unique_ptr<dmlc::Stream> fo{dmlc::Stream::Create(path.c_str(), "w")};
+    auto fo = std::make_unique<common::AlignedFileWriteStream>(StringView{path}, "wb");
    for (auto const &page : m->GetBatches<S>(&ctx)) {
      orig.Push(page);
-      format->Write(page, fo.get());
+      n_bytes = format->Write(page, fo.get());
    }
  }

  S page;
-  std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(path.c_str())};
+  std::unique_ptr<common::AlignedResourceReadStream> fi{
+      std::make_unique<common::PrivateMmapConstStream>(path.c_str(), 0, n_bytes)};
  format->Read(&page, fi.get());
  for (size_t i = 0; i < orig.data.Size(); ++i) {
    ASSERT_EQ(page.data.HostVector()[i].fvalue,
@@ -59,5 +61,4 @@ TEST(SparsePageRawFormat, CSCPage) {
 TEST(SparsePageRawFormat, SortedCSCPage) {
  TestSparsePageRawFormat<SortedCSCPage>();
 }
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
--- a/tests/cpp/gbm/test_gbtree.cc
+++ b/tests/cpp/gbm/test_gbtree.cc
@@ -1,17 +1,20 @@
-/*!
- * Copyright 2019-2022 XGBoost contributors
+/**
+ * Copyright 2019-2023, XGBoost contributors
 */
 #include <gtest/gtest.h>
 #include <xgboost/context.h>
+#include <xgboost/host_device_vector.h>  // for HostDeviceVector
+#include <xgboost/learner.h>             // for Learner

-#include "../../../src/data/adapter.h"
-#include "../../../src/data/proxy_dmatrix.h"
+#include <limits>  // for numeric_limits
+#include <memory>  // for shared_ptr
+#include <string>  // for string
+
+#include "../../../src/data/proxy_dmatrix.h"  // for DMatrixProxy
 #include "../../../src/gbm/gbtree.h"
 #include "../filesystem.h"  // dmlc::TemporaryDirectory
 #include "../helpers.h"
 #include "xgboost/base.h"
-#include "xgboost/host_device_vector.h"
-#include "xgboost/learner.h"
 #include "xgboost/predictor.h"

 namespace xgboost {
@@ -113,12 +116,11 @@ TEST(GBTree, WrongUpdater) {
 #ifdef XGBOOST_USE_CUDA
 TEST(GBTree, ChoosePredictor) {
  // The test ensures data don't get pulled into device.
-  size_t constexpr kRows = 17;
-  size_t constexpr kCols = 15;
+  std::size_t constexpr kRows = 17, kCols = 15;

  auto p_dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();

-  auto& data = (*(p_dmat->GetBatches<SparsePage>().begin())).data;
+  auto const& data = (*(p_dmat->GetBatches<SparsePage>().begin())).data;
  p_dmat->Info().labels.Reshape(kRows);

  auto learner = std::unique_ptr<Learner>(Learner::Create({p_dmat}));
@@ -127,14 +129,13 @@ TEST(GBTree, ChoosePredictor) {
    learner->UpdateOneIter(i, p_dmat);
  }
  ASSERT_TRUE(data.HostCanWrite());
+
  dmlc::TemporaryDirectory tempdir;
  const std::string fname = tempdir.path + "/model_param.bst";
-
  {
    std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(fname.c_str(), "w"));
    learner->Save(fo.get());
  }
-
  // a new learner
  learner = std::unique_ptr<Learner>(Learner::Create({p_dmat}));
  {
@@ -146,6 +147,8 @@ TEST(GBTree, ChoosePredictor) {
    learner->UpdateOneIter(i, p_dmat);
  }
  ASSERT_TRUE(data.HostCanWrite());
+  ASSERT_FALSE(data.DeviceCanWrite());
+  ASSERT_FALSE(data.DeviceCanRead());

  // pull data into device.
  data.HostVector();
@@ -171,32 +174,52 @@ TEST(GBTree, JsonIO) {
  Context ctx;
  LearnerModelParam mparam{MakeMP(kCols, .5, 1)};

-  std::unique_ptr<GradientBooster> gbm {
-    CreateTrainedGBM("gbtree", Args{}, kRows, kCols, &mparam, &ctx) };
+  std::unique_ptr<GradientBooster> gbm{
+      CreateTrainedGBM("gbtree", Args{{"tree_method", "exact"}, {"default_direction", "left"}},
+                       kRows, kCols, &mparam, &ctx)};

-  Json model {Object()};
+  Json model{Object()};
  model["model"] = Object();
-  auto& j_model = model["model"];
+  auto j_model = model["model"];

  model["config"] = Object();
-  auto& j_param = model["config"];
+  auto j_config = model["config"];

  gbm->SaveModel(&j_model);
-  gbm->SaveConfig(&j_param);
+  gbm->SaveConfig(&j_config);

  std::string model_str;
  Json::Dump(model, &model_str);

  model = Json::Load({model_str.c_str(), model_str.size()});
-  ASSERT_EQ(get<String>(model["model"]["name"]), "gbtree");
+  j_model = model["model"];
+  j_config = model["config"];
+  ASSERT_EQ(get<String>(j_model["name"]), "gbtree");

-  auto const& gbtree_model = model["model"]["model"];
+  auto gbtree_model = j_model["model"];
  ASSERT_EQ(get<Array>(gbtree_model["trees"]).size(), 1ul);
  ASSERT_EQ(get<Integer>(get<Object>(get<Array>(gbtree_model["trees"]).front()).at("id")), 0);
  ASSERT_EQ(get<Array>(gbtree_model["tree_info"]).size(), 1ul);
-
-  auto j_train_param = model["config"]["gbtree_model_param"];
+  auto j_train_param = j_config["gbtree_model_param"];
  ASSERT_EQ(get<String>(j_train_param["num_parallel_tree"]), "1");
+
+  auto check_config = [](Json j_up_config) {
+    auto colmaker = get<Array const>(j_up_config).front();
+    auto pruner = get<Array const>(j_up_config).back();
+    ASSERT_EQ(get<String const>(colmaker["name"]), "grow_colmaker");
+    ASSERT_EQ(get<String const>(pruner["name"]), "prune");
+    ASSERT_EQ(get<String const>(colmaker["colmaker_train_param"]["default_direction"]), "left");
+  };
+  check_config(j_config["updater"]);
+
+  std::unique_ptr<GradientBooster> loaded(gbm::GBTree::Create("gbtree", &ctx, &mparam));
+  loaded->LoadModel(j_model);
+  loaded->LoadConfig(j_config);
+
+  // roundtrip test
+  Json j_config_rt{Object{}};
+  loaded->SaveConfig(&j_config_rt);
+  check_config(j_config_rt["updater"]);
 }

 TEST(Dart, JsonIO) {
@@ -232,14 +255,15 @@ TEST(Dart, JsonIO) {
 namespace {
 class Dart : public testing::TestWithParam<char const*> {
 public:
-  void Run(std::string predictor) {
+  void Run(std::string device) {
    size_t constexpr kRows = 16, kCols = 10;

    HostDeviceVector<float> data;
-    auto rng = RandomDataGenerator(kRows, kCols, 0);
-    if (predictor == "gpu_predictor") {
-      rng.Device(0);
+    Context ctx;
+    if (device == "GPU") {
+      ctx = MakeCUDACtx(0);
    }
+    auto rng = RandomDataGenerator(kRows, kCols, 0).Device(ctx.gpu_id);
    auto array_str = rng.GenerateArrayInterface(&data);
    auto p_mat = GetDMatrixFromData(data.HostVector(), kRows, kCols);

@@ -258,14 +282,14 @@ class Dart : public testing::TestWithParam<char const*> {
      learner->UpdateOneIter(i, p_mat);
    }

-    learner->SetParam("predictor", predictor);
+    ConfigLearnerByCtx(&ctx, learner.get());

    HostDeviceVector<float> predts_training;
    learner->Predict(p_mat, false, &predts_training, 0, 0, true);

    HostDeviceVector<float>* inplace_predts;
    std::shared_ptr<data::DMatrixProxy> x{new data::DMatrixProxy{}};
-    if (predictor == "gpu_predictor") {
+    if (ctx.IsCUDA()) {
      x->SetCUDAArray(array_str.c_str());
    } else {
      x->SetArrayData(array_str.c_str());
@@ -295,10 +319,9 @@ class Dart : public testing::TestWithParam<char const*> {
 TEST_P(Dart, Prediction) { this->Run(GetParam()); }

 #if defined(XGBOOST_USE_CUDA)
-INSTANTIATE_TEST_SUITE_P(PredictorTypes, Dart,
-                         testing::Values("auto", "cpu_predictor", "gpu_predictor"));
+INSTANTIATE_TEST_SUITE_P(PredictorTypes, Dart, testing::Values("CPU", "GPU"));
 #else
-INSTANTIATE_TEST_SUITE_P(PredictorTypes, Dart, testing::Values("auto", "cpu_predictor"));
+INSTANTIATE_TEST_SUITE_P(PredictorTypes, Dart, testing::Values("CPU"));
 #endif  // defined(XGBOOST_USE_CUDA)


@@ -511,4 +534,86 @@ TEST(GBTree, PredictRange) {
                 dmlc::Error);
  }
 }
+
+TEST(GBTree, InplacePredictionError) {
+  std::size_t n_samples{2048}, n_features{32};
+
+  auto test_ext_err = [&](std::string booster, Context const* ctx) {
+    std::shared_ptr<DMatrix> p_fmat =
+        RandomDataGenerator{n_samples, n_features, 0.5f}.Batches(2).GenerateSparsePageDMatrix(
+            "cache", true);
+    std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
+    learner->SetParam("booster", booster);
+    ConfigLearnerByCtx(ctx, learner.get());
+    learner->Configure();
+    for (std::int32_t i = 0; i < 3; ++i) {
+      learner->UpdateOneIter(i, p_fmat);
+    }
+    HostDeviceVector<float>* out_predt;
+    ASSERT_THROW(
+        {
+          learner->InplacePredict(p_fmat, PredictionType::kValue,
+                                  std::numeric_limits<float>::quiet_NaN(), &out_predt, 0, 0);
+        },
+        dmlc::Error);
+  };
+
+  {
+    Context ctx;
+    test_ext_err("gbtree", &ctx);
+    test_ext_err("dart", &ctx);
+  }
+
+#if defined(XGBOOST_USE_CUDA)
+  {
+    auto ctx = MakeCUDACtx(0);
+    test_ext_err("gbtree", &ctx);
+    test_ext_err("dart", &ctx);
+  }
+#endif  // defined(XGBOOST_USE_CUDA)
+
+  auto test_qdm_err = [&](std::string booster, Context const* ctx) {
+    std::shared_ptr<DMatrix> p_fmat;
+    bst_bin_t max_bins = 16;
+    auto rng = RandomDataGenerator{n_samples, n_features, 0.5f}.Device(ctx->gpu_id).Bins(max_bins);
+    if (ctx->IsCPU()) {
+      p_fmat = rng.GenerateQuantileDMatrix(true);
+    } else {
+#if defined(XGBOOST_USE_CUDA)
+      p_fmat = rng.GenerateDeviceDMatrix(true);
+#else
+      CHECK(p_fmat);
+#endif  // defined(XGBOOST_USE_CUDA)
+    };
+    std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
+    learner->SetParam("booster", booster);
+    learner->SetParam("max_bin", std::to_string(max_bins));
+    ConfigLearnerByCtx(ctx, learner.get());
+    learner->Configure();
+    for (std::int32_t i = 0; i < 3; ++i) {
+      learner->UpdateOneIter(i, p_fmat);
+    }
+    HostDeviceVector<float>* out_predt;
+    ASSERT_THROW(
+        {
+          learner->InplacePredict(p_fmat, PredictionType::kValue,
+                                  std::numeric_limits<float>::quiet_NaN(), &out_predt, 0, 0);
+        },
+        dmlc::Error);
+  };
+
+  {
+    Context ctx;
+    test_qdm_err("gbtree", &ctx);
+    test_qdm_err("dart", &ctx);
+  }
+
+#if defined(XGBOOST_USE_CUDA)
+  {
+    auto ctx = MakeCUDACtx(0);
+    test_qdm_err("gbtree", &ctx);
+    test_qdm_err("dart", &ctx);
+  }
+#endif  // defined(XGBOOST_USE_CUDA)
+}
 }  // namespace xgboost
--- a/tests/cpp/gbm/test_gbtree.cu
+++ b/tests/cpp/gbm/test_gbtree.cu
@@ -0,0 +1,87 @@
+/**
+ * Copyright 2023, XGBoost contributors
+ */
+#include <xgboost/context.h>      // for Context
+#include <xgboost/learner.h>      // for Learner
+#include <xgboost/string_view.h>  // for StringView
+
+#include <limits>  // for numeric_limits
+#include <memory>  // for shared_ptr
+#include <string>  // for string
+
+#include "../../../src/data/adapter.h"           // for ArrayAdapter
+#include "../../../src/data/device_adapter.cuh"  // for CupyAdapter
+#include "../../../src/data/proxy_dmatrix.h"     // for DMatrixProxy
+#include "../helpers.h"                          // for RandomDataGenerator
+
+namespace xgboost {
+void TestInplaceFallback(Context const* ctx) {
+  // prepare data
+  bst_row_t n_samples{1024};
+  bst_feature_t n_features{32};
+  HostDeviceVector<float> X_storage;
+  // use a different device than the learner
+  std::int32_t data_ordinal = ctx->IsCPU() ? 0 : -1;
+  auto X = RandomDataGenerator{n_samples, n_features, 0.0}
+               .Device(data_ordinal)
+               .GenerateArrayInterface(&X_storage);
+  HostDeviceVector<float> y_storage;
+  auto y = RandomDataGenerator{n_samples, 1u, 0.0}.GenerateArrayInterface(&y_storage);
+
+  std::shared_ptr<DMatrix> Xy;
+  if (data_ordinal == Context::kCpuId) {
+    auto X_adapter = data::ArrayAdapter{StringView{X}};
+    Xy.reset(DMatrix::Create(&X_adapter, std::numeric_limits<float>::quiet_NaN(), ctx->Threads()));
+  } else {
+    auto X_adapter = data::CupyAdapter{StringView{X}};
+    Xy.reset(DMatrix::Create(&X_adapter, std::numeric_limits<float>::quiet_NaN(), ctx->Threads()));
+  }
+
+  Xy->SetInfo("label", y);
+
+  // learner is configured to the device specified by ctx
+  std::unique_ptr<Learner> learner{Learner::Create({Xy})};
+  ConfigLearnerByCtx(ctx, learner.get());
+  for (std::int32_t i = 0; i < 3; ++i) {
+    learner->UpdateOneIter(i, Xy);
+  }
+
+  std::shared_ptr<DMatrix> p_m{new data::DMatrixProxy};
+  auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
+  if (data_ordinal == Context::kCpuId) {
+    proxy->SetArrayData(StringView{X});
+  } else {
+    proxy->SetCUDAArray(X.c_str());
+  }
+
+  HostDeviceVector<float>* out_predt{nullptr};
+  ConsoleLogger::Configure(Args{{"verbosity", "1"}});
+  // test whether the warning is raised
+  ::testing::internal::CaptureStderr();
+  learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
+                          &out_predt, 0, 0);
+  auto output = testing::internal::GetCapturedStderr();
+  ASSERT_NE(output.find("Falling back"), std::string::npos);
+
+  // test when the contexts match
+  Context new_ctx = *proxy->Ctx();
+  ASSERT_NE(new_ctx.gpu_id, ctx->gpu_id);
+
+  ConfigLearnerByCtx(&new_ctx, learner.get());
+  HostDeviceVector<float>* out_predt_1{nullptr};
+  // no warning is raised
+  ::testing::internal::CaptureStderr();
+  learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits<float>::quiet_NaN(),
+                          &out_predt_1, 0, 0);
+  output = testing::internal::GetCapturedStderr();
+
+  ASSERT_TRUE(output.empty());
+
+  ASSERT_EQ(out_predt->ConstHostVector(), out_predt_1->ConstHostVector());
+}
+
+TEST(GBTree, InplacePredictFallback) {
+  auto ctx = MakeCUDACtx(0);
+  TestInplaceFallback(&ctx);
+}
+}  // namespace xgboost
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -210,6 +210,16 @@ SimpleLCG::StateType SimpleLCG::Max() const { return max(); }
 // Make sure it's compile time constant.
 static_assert(SimpleLCG::max() - SimpleLCG::min());

+void RandomDataGenerator::GenerateLabels(std::shared_ptr<DMatrix> p_fmat) const {
+  RandomDataGenerator{p_fmat->Info().num_row_, this->n_targets_, 0.0f}.GenerateDense(
+      p_fmat->Info().labels.Data());
+  CHECK_EQ(p_fmat->Info().labels.Size(), this->rows_ * this->n_targets_);
+  p_fmat->Info().labels.Reshape(this->rows_, this->n_targets_);
+  if (device_ != Context::kCpuId) {
+    p_fmat->Info().labels.SetDevice(device_);
+  }
+}
+
 void RandomDataGenerator::GenerateDense(HostDeviceVector<float> *out) const {
  xgboost::SimpleRealUniformDistribution<bst_float> dist(lower_, upper_);
  CHECK(out);
@@ -363,8 +373,9 @@ void RandomDataGenerator::GenerateCSR(
  CHECK_EQ(columns->Size(), value->Size());
 }

-std::shared_ptr<DMatrix> RandomDataGenerator::GenerateDMatrix(bool with_label, bool float_label,
-                                                              size_t classes) const {
+[[nodiscard]] std::shared_ptr<DMatrix> RandomDataGenerator::GenerateDMatrix(bool with_label,
+                                                                            bool float_label,
+                                                                            size_t classes) const {
  HostDeviceVector<float> data;
  HostDeviceVector<bst_row_t> rptrs;
  HostDeviceVector<bst_feature_t> columns;
@@ -395,6 +406,9 @@ std::shared_ptr<DMatrix> RandomDataGenerator::GenerateDMatrix(bool with_label, b
    for (auto const& page : out->GetBatches<SparsePage>()) {
      page.data.SetDevice(device_);
      page.offset.SetDevice(device_);
+      // pull to device
+      page.data.ConstDeviceSpan();
+      page.offset.ConstDeviceSpan();
    }
  }
  if (!ft_.empty()) {
@@ -403,10 +417,58 @@ std::shared_ptr<DMatrix> RandomDataGenerator::GenerateDMatrix(bool with_label, b
  return out;
 }

-std::shared_ptr<DMatrix> RandomDataGenerator::GenerateQuantileDMatrix() {
+[[nodiscard]] std::shared_ptr<DMatrix> RandomDataGenerator::GenerateSparsePageDMatrix(
+    std::string prefix, bool with_label) const {
+  CHECK_GE(this->rows_, this->n_batches_);
+  CHECK_GE(this->n_batches_, 1)
+      << "Must set the n_batches before generating an external memory DMatrix.";
+  std::unique_ptr<ArrayIterForTest> iter;
+  if (device_ == Context::kCpuId) {
+    iter = std::make_unique<NumpyArrayIterForTest>(this->sparsity_, rows_, cols_, n_batches_);
+  } else {
+#if defined(XGBOOST_USE_CUDA)
+    iter = std::make_unique<CudaArrayIterForTest>(this->sparsity_, rows_, cols_, n_batches_);
+#else
+    CHECK(iter);
+#endif  // defined(XGBOOST_USE_CUDA)
+  }
+
+  std::unique_ptr<DMatrix> dmat{
+      DMatrix::Create(static_cast<DataIterHandle>(iter.get()), iter->Proxy(), Reset, Next,
+                      std::numeric_limits<float>::quiet_NaN(), Context{}.Threads(), prefix)};
+
+  auto row_page_path =
+      data::MakeId(prefix, dynamic_cast<data::SparsePageDMatrix*>(dmat.get())) + ".row.page";
+  EXPECT_TRUE(FileExists(row_page_path)) << row_page_path;
+
+  // Loop over the batches and count the number of pages
+  std::size_t batch_count = 0;
+  bst_row_t row_count = 0;
+  for (const auto& batch : dmat->GetBatches<xgboost::SparsePage>()) {
+    batch_count++;
+    row_count += batch.Size();
+    CHECK_NE(batch.data.Size(), 0);
+  }
+
+  EXPECT_EQ(batch_count, n_batches_);
+  EXPECT_EQ(row_count, dmat->Info().num_row_);
+
+  if (with_label) {
+    RandomDataGenerator{dmat->Info().num_row_, this->n_targets_, 0.0f}.GenerateDense(
+        dmat->Info().labels.Data());
+    CHECK_EQ(dmat->Info().labels.Size(), this->rows_ * this->n_targets_);
+    dmat->Info().labels.Reshape(this->rows_, this->n_targets_);
+  }
+  return dmat;
+}
+
+std::shared_ptr<DMatrix> RandomDataGenerator::GenerateQuantileDMatrix(bool with_label) {
  NumpyArrayIterForTest iter{this->sparsity_, this->rows_, this->cols_, 1};
  auto m = std::make_shared<data::IterativeDMatrix>(
      &iter, iter.Proxy(), nullptr, Reset, Next, std::numeric_limits<float>::quiet_NaN(), 0, bins_);
+  if (with_label) {
+    this->GenerateLabels(m);
+  }
  return m;
 }

--- a/tests/cpp/helpers.cu
+++ b/tests/cpp/helpers.cu
@@ -24,10 +24,13 @@ int CudaArrayIterForTest::Next() {
  return 1;
 }

-std::shared_ptr<DMatrix> RandomDataGenerator::GenerateDeviceDMatrix() {
+std::shared_ptr<DMatrix> RandomDataGenerator::GenerateDeviceDMatrix(bool with_label) {
  CudaArrayIterForTest iter{this->sparsity_, this->rows_, this->cols_, 1};
  auto m = std::make_shared<data::IterativeDMatrix>(
      &iter, iter.Proxy(), nullptr, Reset, Next, std::numeric_limits<float>::quiet_NaN(), 0, bins_);
+  if (with_label) {
+    this->GenerateLabels(m);
+  }
  return m;
 }
 }  // namespace xgboost
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -183,7 +183,7 @@ class SimpleRealUniformDistribution {

    for (size_t k = m; k != 0; --k) {
      sum_value += static_cast<ResultT>((*rng)() - rng->Min()) * r_k;
-      r_k *= r;
+      r_k *= static_cast<ResultT>(r);
    }

    ResultT res = sum_value / r_k;
@@ -238,15 +238,18 @@ class RandomDataGenerator {
  bst_target_t n_targets_{1};

  std::int32_t device_{Context::kCpuId};
+  std::size_t n_batches_{0};
  std::uint64_t seed_{0};
  SimpleLCG lcg_;

-  std::size_t bins_{0};
+  bst_bin_t bins_{0};
  std::vector<FeatureType> ft_;
  bst_cat_t max_cat_;

  Json ArrayInterfaceImpl(HostDeviceVector<float>* storage, size_t rows, size_t cols) const;

+  void GenerateLabels(std::shared_ptr<DMatrix> p_fmat) const;
+
 public:
  RandomDataGenerator(bst_row_t rows, size_t cols, float sparsity)
      : rows_{rows}, cols_{cols}, sparsity_{sparsity}, lcg_{seed_} {}
@@ -263,12 +266,16 @@ class RandomDataGenerator {
    device_ = d;
    return *this;
  }
+  RandomDataGenerator& Batches(std::size_t n_batches) {
+    n_batches_ = n_batches;
+    return *this;
+  }
  RandomDataGenerator& Seed(uint64_t s) {
    seed_ = s;
    lcg_.Seed(seed_);
    return *this;
  }
-  RandomDataGenerator& Bins(size_t b) {
+  RandomDataGenerator& Bins(bst_bin_t b) {
    bins_ = b;
    return *this;
  }
@@ -309,12 +316,17 @@ class RandomDataGenerator {
  void GenerateCSR(HostDeviceVector<float>* value, HostDeviceVector<bst_row_t>* row_ptr,
                   HostDeviceVector<bst_feature_t>* columns) const;

-  std::shared_ptr<DMatrix> GenerateDMatrix(bool with_label = false, bool float_label = true,
-                                           size_t classes = 1) const;
+  [[nodiscard]] std::shared_ptr<DMatrix> GenerateDMatrix(bool with_label = false,
+                                                         bool float_label = true,
+                                                         size_t classes = 1) const;
+
+  [[nodiscard]] std::shared_ptr<DMatrix> GenerateSparsePageDMatrix(std::string prefix,
+                                                                   bool with_label) const;
+
 #if defined(XGBOOST_USE_CUDA)
-  std::shared_ptr<DMatrix> GenerateDeviceDMatrix();
+  std::shared_ptr<DMatrix> GenerateDeviceDMatrix(bool with_label);
 #endif
-  std::shared_ptr<DMatrix> GenerateQuantileDMatrix();
+  std::shared_ptr<DMatrix> GenerateQuantileDMatrix(bool with_label);
 };

 // Generate an empty DMatrix, mostly for its meta info.
@@ -322,15 +334,14 @@ inline std::shared_ptr<DMatrix> EmptyDMatrix() {
  return RandomDataGenerator{0, 0, 0.0}.GenerateDMatrix();
 }

-inline std::vector<float>
-GenerateRandomCategoricalSingleColumn(int n, size_t num_categories) {
+inline std::vector<float> GenerateRandomCategoricalSingleColumn(int n, size_t num_categories) {
  std::vector<float> x(n);
  std::mt19937 rng(0);
  std::uniform_int_distribution<size_t> dist(0, num_categories - 1);
  std::generate(x.begin(), x.end(), [&]() { return dist(rng); });
  // Make sure each category is present
-  for(size_t i = 0; i < num_categories; i++) {
-    x[i] = i;
+  for (size_t i = 0; i < num_categories; i++) {
+    x[i] = static_cast<decltype(x)::value_type>(i);
  }
  return x;
 }
@@ -444,11 +455,11 @@ class ArrayIterForTest {
  size_t static constexpr Cols() { return 13; }

 public:
-  std::string AsArray() const { return interface_; }
+  [[nodiscard]] std::string AsArray() const { return interface_; }

  virtual int Next() = 0;
  virtual void Reset() { iter_ = 0; }
-  size_t Iter() const { return iter_; }
+  [[nodiscard]] std::size_t Iter() const { return iter_; }
  auto Proxy() -> decltype(proxy_) { return proxy_; }

  explicit ArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches);
@@ -549,4 +560,15 @@ class DeclareUnifiedDistributedTest(MetricTest) : public ::testing::Test {
  }
 };

+// A temporary solution before we move away from gpu_id.
+inline void ConfigLearnerByCtx(Context const* ctx, Learner* learner) {
+  if (ctx->IsCPU()) {
+    learner->SetParam("tree_method", "hist");
+  } else {
+    learner->SetParam("tree_method", "gpu_hist");
+  }
+  learner->SetParam("gpu_id", std::to_string(ctx->gpu_id));
+  learner->Configure();
+  ASSERT_EQ(learner->Ctx()->gpu_id, ctx->gpu_id);
+}
 }  // namespace xgboost
--- a/tests/cpp/plugin/helpers.h
+++ b/tests/cpp/plugin/helpers.h
@@ -37,7 +37,14 @@ class ServerForTest {
  }

  ~ServerForTest() {
+    using namespace std::chrono_literals;
+    while (!server_) {
+      std::this_thread::sleep_for(100ms);
+    }
    server_->Shutdown();
+    while (!server_thread_) {
+      std::this_thread::sleep_for(100ms);
+    }
    server_thread_->join();
  }

@@ -56,7 +63,7 @@ class BaseFederatedTest : public ::testing::Test {

  void TearDown() override { server_.reset(nullptr); }

-  static int constexpr kWorldSize{3};
+  static int constexpr kWorldSize{2};
  std::unique_ptr<ServerForTest> server_;
 };

--- a/tests/cpp/plugin/test_federated_adapter.cu
+++ b/tests/cpp/plugin/test_federated_adapter.cu
@@ -9,6 +9,7 @@
 #include <thread>

 #include "../../../plugin/federated/federated_communicator.h"
+#include "../../../src/collective/communicator-inl.cuh"
 #include "../../../src/collective/device_communicator_adapter.cuh"
 #include "./helpers.h"

@@ -17,67 +18,63 @@ namespace xgboost::collective {
 class FederatedAdapterTest : public BaseFederatedTest {};

 TEST(FederatedAdapterSimpleTest, ThrowOnInvalidDeviceOrdinal) {
-  auto construct = []() { DeviceCommunicatorAdapter adapter{-1, nullptr}; };
+  auto construct = []() { DeviceCommunicatorAdapter adapter{-1}; };
  EXPECT_THROW(construct(), dmlc::Error);
 }

-TEST(FederatedAdapterSimpleTest, ThrowOnInvalidCommunicator) {
-  auto construct = []() { DeviceCommunicatorAdapter adapter{0, nullptr}; };
-  EXPECT_THROW(construct(), dmlc::Error);
-}
-
-TEST_F(FederatedAdapterTest, DeviceAllReduceSum) {
-  std::vector<std::thread> threads;
-  for (auto rank = 0; rank < kWorldSize; rank++) {
-    threads.emplace_back([rank, server_address = server_->Address()] {
-      FederatedCommunicator comm{kWorldSize, rank, server_address};
-      // Assign device 0 to all workers, since we run gtest in a single-GPU machine
-      DeviceCommunicatorAdapter adapter{0, &comm};
-      int count = 3;
-      thrust::device_vector<double> buffer(count, 0);
-      thrust::sequence(buffer.begin(), buffer.end());
-      adapter.AllReduce(buffer.data().get(), count, DataType::kDouble, Operation::kSum);
-      thrust::host_vector<double> host_buffer = buffer;
-      EXPECT_EQ(host_buffer.size(), count);
-      for (auto i = 0; i < count; i++) {
-        EXPECT_EQ(host_buffer[i], i * kWorldSize);
-      }
-    });
-  }
-  for (auto& thread : threads) {
-    thread.join();
+namespace {
+void VerifyAllReduceSum() {
+  auto const world_size = collective::GetWorldSize();
+  auto const rank = collective::GetRank();
+  int count = 3;
+  thrust::device_vector<double> buffer(count, 0);
+  thrust::sequence(buffer.begin(), buffer.end());
+  collective::AllReduce<collective::Operation::kSum>(rank, buffer.data().get(), count);
+  thrust::host_vector<double> host_buffer = buffer;
+  EXPECT_EQ(host_buffer.size(), count);
+  for (auto i = 0; i < count; i++) {
+    EXPECT_EQ(host_buffer[i], i * world_size);
  }
 }
+}  // anonymous namespace

-TEST_F(FederatedAdapterTest, DeviceAllGatherV) {
-  std::vector<std::thread> threads;
-  for (auto rank = 0; rank < kWorldSize; rank++) {
-    threads.emplace_back([rank, server_address = server_->Address()] {
-      FederatedCommunicator comm{kWorldSize, rank, server_address};
-      // Assign device 0 to all workers, since we run gtest in a single-GPU machine
-      DeviceCommunicatorAdapter adapter{0, &comm};
-
-      int const count = rank + 2;
-      thrust::device_vector<char> buffer(count, 0);
-      thrust::sequence(buffer.begin(), buffer.end());
-      std::vector<std::size_t> segments(kWorldSize);
-      dh::caching_device_vector<char> receive_buffer{};
-
-      adapter.AllGatherV(buffer.data().get(), count, &segments, &receive_buffer);
-
-      EXPECT_EQ(segments[0], 2);
-      EXPECT_EQ(segments[1], 3);
-      thrust::host_vector<char> host_buffer = receive_buffer;
-      EXPECT_EQ(host_buffer.size(), 9);
-      int expected[] = {0, 1, 0, 1, 2, 0, 1, 2, 3};
-      for (auto i = 0; i < 9; i++) {
-        EXPECT_EQ(host_buffer[i], expected[i]);
-      }
-    });
+TEST_F(FederatedAdapterTest, MGPUAllReduceSum) {
+  auto const n_gpus = common::AllVisibleGPUs();
+  if (n_gpus <= 1) {
+    GTEST_SKIP() << "Skipping MGPUAllReduceSum test with # GPUs = " << n_gpus;
  }
-  for (auto& thread : threads) {
-    thread.join();
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyAllReduceSum);
+}
+
+namespace {
+void VerifyAllGatherV() {
+  auto const world_size = collective::GetWorldSize();
+  auto const rank = collective::GetRank();
+  int const count = rank + 2;
+  thrust::device_vector<char> buffer(count, 0);
+  thrust::sequence(buffer.begin(), buffer.end());
+  std::vector<std::size_t> segments(world_size);
+  dh::caching_device_vector<char> receive_buffer{};
+
+  collective::AllGatherV(rank, buffer.data().get(), count, &segments, &receive_buffer);
+
+  EXPECT_EQ(segments[0], 2);
+  EXPECT_EQ(segments[1], 3);
+  thrust::host_vector<char> host_buffer = receive_buffer;
+  EXPECT_EQ(host_buffer.size(), 5);
+  int expected[] = {0, 1, 0, 1, 2};
+  for (auto i = 0; i < 5; i++) {
+    EXPECT_EQ(host_buffer[i], expected[i]);
  }
 }
+}  // anonymous namespace
+
+TEST_F(FederatedAdapterTest, MGPUAllGatherV) {
+  auto const n_gpus = common::AllVisibleGPUs();
+  if (n_gpus <= 1) {
+    GTEST_SKIP() << "Skipping MGPUAllGatherV test with # GPUs = " << n_gpus;
+  }
+  RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyAllGatherV);
+}

 }  // namespace xgboost::collective
--- a/tests/cpp/plugin/test_federated_communicator.cc
+++ b/tests/cpp/plugin/test_federated_communicator.cc
@@ -31,7 +31,7 @@ class FederatedCommunicatorTest : public BaseFederatedTest {

 protected:
  static void CheckAllgather(FederatedCommunicator &comm, int rank) {
-    int buffer[kWorldSize] = {0, 0, 0};
+    int buffer[kWorldSize] = {0, 0};
    buffer[rank] = rank;
    comm.AllGather(buffer, sizeof(buffer));
    for (auto i = 0; i < kWorldSize; i++) {
@@ -42,7 +42,7 @@ class FederatedCommunicatorTest : public BaseFederatedTest {
  static void CheckAllreduce(FederatedCommunicator &comm) {
    int buffer[] = {1, 2, 3, 4, 5};
    comm.AllReduce(buffer, sizeof(buffer) / sizeof(buffer[0]), DataType::kInt32, Operation::kSum);
-    int expected[] = {3, 6, 9, 12, 15};
+    int expected[] = {2, 4, 6, 8, 10};
    for (auto i = 0; i < 5; i++) {
      EXPECT_EQ(buffer[i], expected[i]);
    }
--- a/tests/cpp/plugin/test_federated_data.cc
+++ b/tests/cpp/plugin/test_federated_data.cc
@@ -30,7 +30,7 @@ void VerifyLoadUri() {
  std::string uri = path + "?format=csv";
  dmat.reset(DMatrix::Load(uri, false, DataSplitMode::kCol));

-  ASSERT_EQ(dmat->Info().num_col_, 8 * collective::GetWorldSize() + 3);
+  ASSERT_EQ(dmat->Info().num_col_, 8 * collective::GetWorldSize() + 1);
  ASSERT_EQ(dmat->Info().num_row_, kRows);

  for (auto const& page : dmat->GetBatches<SparsePage>()) {
--- a/tests/cpp/plugin/test_federated_server.cc
+++ b/tests/cpp/plugin/test_federated_server.cc
@@ -39,7 +39,7 @@ class FederatedServerTest : public BaseFederatedTest {

 protected:
  static void CheckAllgather(federated::FederatedClient& client, int rank) {
-    int data[kWorldSize] = {0, 0, 0};
+    int data[kWorldSize] = {0, 0};
    data[rank] = rank;
    std::string send_buffer(reinterpret_cast<char const*>(data), sizeof(data));
    auto reply = client.Allgather(send_buffer);
@@ -54,7 +54,7 @@ class FederatedServerTest : public BaseFederatedTest {
    std::string send_buffer(reinterpret_cast<char const*>(data), sizeof(data));
    auto reply = client.Allreduce(send_buffer, federated::INT32, federated::SUM);
    auto const* result = reinterpret_cast<int const*>(reply.data());
-    int expected[] = {3, 6, 9, 12, 15};
+    int expected[] = {2, 4, 6, 8, 10};
    for (auto i = 0; i < 5; i++) {
      EXPECT_EQ(result[i], expected[i]);
    }
--- a/tests/cpp/predictor/test_cpu_predictor.cc
+++ b/tests/cpp/predictor/test_cpu_predictor.cc
@@ -122,11 +122,13 @@ TEST(CpuPredictor, BasicColumnSplit) {
 }

 TEST(CpuPredictor, IterationRange) {
-  TestIterationRange("cpu_predictor");
+  Context ctx;
+  TestIterationRange(&ctx);
 }

 TEST(CpuPredictor, IterationRangeColmnSplit) {
-  TestIterationRangeColumnSplit("cpu_predictor");
+  Context ctx;
+  TestIterationRangeColumnSplit(&ctx);
 }

 TEST(CpuPredictor, ExternalMemory) {
@@ -139,7 +141,8 @@ TEST(CpuPredictor, ExternalMemory) {
 TEST(CpuPredictor, InplacePredict) {
  bst_row_t constexpr kRows{128};
  bst_feature_t constexpr kCols{64};
-  auto gen = RandomDataGenerator{kRows, kCols, 0.5}.Device(-1);
+  Context ctx;
+  auto gen = RandomDataGenerator{kRows, kCols, 0.5}.Device(ctx.gpu_id);
  {
    HostDeviceVector<float> data;
    gen.GenerateDense(&data);
@@ -149,7 +152,7 @@ TEST(CpuPredictor, InplacePredict) {
    std::string arr_str;
    Json::Dump(array_interface, &arr_str);
    x->SetArrayData(arr_str.data());
-    TestInplacePrediction(x, "cpu_predictor", kRows, kCols, Context::kCpuId);
+    TestInplacePrediction(&ctx, x, kRows, kCols);
  }

  {
@@ -166,54 +169,54 @@ TEST(CpuPredictor, InplacePredict) {
    Json::Dump(col_interface, &col_str);
    std::shared_ptr<data::DMatrixProxy> x{new data::DMatrixProxy};
    x->SetCSRData(rptr_str.data(), col_str.data(), data_str.data(), kCols, true);
-    TestInplacePrediction(x, "cpu_predictor", kRows, kCols, Context::kCpuId);
+    TestInplacePrediction(&ctx, x, kRows, kCols);
  }
 }

+namespace {
 void TestUpdatePredictionCache(bool use_subsampling) {
-  size_t constexpr kRows = 64, kCols = 16, kClasses = 4;
+  std::size_t constexpr kRows = 64, kCols = 16, kClasses = 4;
  LearnerModelParam mparam{MakeMP(kCols, .0, kClasses)};
  Context ctx;

  std::unique_ptr<gbm::GBTree> gbm;
  gbm.reset(static_cast<gbm::GBTree*>(GradientBooster::Create("gbtree", &ctx, &mparam)));
-  std::map<std::string, std::string> cfg;
-  cfg["tree_method"] = "hist";
-  cfg["predictor"]   = "cpu_predictor";
+  Args args{{"tree_method", "hist"}};
  if (use_subsampling) {
-    cfg["subsample"] = "0.5";
+    args.emplace_back("subsample", "0.5");
  }
-  Args args = {cfg.cbegin(), cfg.cend()};
  gbm->Configure(args);

  auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(true, true, kClasses);

  HostDeviceVector<GradientPair> gpair;
  auto& h_gpair = gpair.HostVector();
-  h_gpair.resize(kRows*kClasses);
-  for (size_t i = 0; i < kRows*kClasses; ++i) {
+  h_gpair.resize(kRows * kClasses);
+  for (size_t i = 0; i < kRows * kClasses; ++i) {
    h_gpair[i] = {static_cast<float>(i), 1};
  }

  PredictionCacheEntry predtion_cache;
-  predtion_cache.predictions.Resize(kRows*kClasses, 0);
-  // after one training iteration predtion_cache is filled with cached in QuantileHistMaker::Builder prediction values
+  predtion_cache.predictions.Resize(kRows * kClasses, 0);
+  // after one training iteration predtion_cache is filled with cached in QuantileHistMaker
+  // prediction values
  gbm->DoBoost(dmat.get(), &gpair, &predtion_cache, nullptr);

  PredictionCacheEntry out_predictions;
-  // perform fair prediction on the same input data, should be equal to cached result
+  // perform prediction from scratch on the same input data, should be equal to cached result
  gbm->PredictBatch(dmat.get(), &out_predictions, false, 0, 0);

-  std::vector<float> &out_predictions_h = out_predictions.predictions.HostVector();
-  std::vector<float> &predtion_cache_from_train = predtion_cache.predictions.HostVector();
+  std::vector<float>& out_predictions_h = out_predictions.predictions.HostVector();
+  std::vector<float>& predtion_cache_from_train = predtion_cache.predictions.HostVector();
  for (size_t i = 0; i < out_predictions_h.size(); ++i) {
    ASSERT_NEAR(out_predictions_h[i], predtion_cache_from_train[i], kRtEps);
  }
 }
+}  // namespace

 TEST(CPUPredictor, GHistIndex) {
  size_t constexpr kRows{128}, kCols{16}, kBins{64};
-  auto p_hist = RandomDataGenerator{kRows, kCols, 0.0}.Bins(kBins).GenerateQuantileDMatrix();
+  auto p_hist = RandomDataGenerator{kRows, kCols, 0.0}.Bins(kBins).GenerateQuantileDMatrix(false);
  HostDeviceVector<float> storage(kRows * kCols);
  auto columnar = RandomDataGenerator{kRows, kCols, 0.0}.GenerateArrayInterface(&storage);
  auto adapter = data::ArrayAdapter(columnar.c_str());
@@ -223,19 +226,23 @@ TEST(CPUPredictor, GHistIndex) {
 }

 TEST(CPUPredictor, CategoricalPrediction) {
-  TestCategoricalPrediction("cpu_predictor");
+  Context ctx;
+  TestCategoricalPrediction(&ctx, false);
 }

 TEST(CPUPredictor, CategoricalPredictionColumnSplit) {
-  TestCategoricalPredictionColumnSplit("cpu_predictor");
+  Context ctx;
+  TestCategoricalPredictionColumnSplit(&ctx);
 }

 TEST(CPUPredictor, CategoricalPredictLeaf) {
-  TestCategoricalPredictLeaf(StringView{"cpu_predictor"});
+  Context ctx;
+  TestCategoricalPredictLeaf(&ctx, false);
 }

 TEST(CPUPredictor, CategoricalPredictLeafColumnSplit) {
-  TestCategoricalPredictLeafColumnSplit(StringView{"cpu_predictor"});
+  Context ctx;
+  TestCategoricalPredictLeafColumnSplit(&ctx);
 }

 TEST(CpuPredictor, UpdatePredictionCache) {
@@ -244,21 +251,25 @@ TEST(CpuPredictor, UpdatePredictionCache) {
 }

 TEST(CpuPredictor, LesserFeatures) {
-  TestPredictionWithLesserFeatures("cpu_predictor");
+  Context ctx;
+  TestPredictionWithLesserFeatures(&ctx);
 }

 TEST(CpuPredictor, LesserFeaturesColumnSplit) {
-  TestPredictionWithLesserFeaturesColumnSplit("cpu_predictor");
+  Context ctx;
+  TestPredictionWithLesserFeaturesColumnSplit(&ctx);
 }

 TEST(CpuPredictor, Sparse) {
-  TestSparsePrediction(0.2, "cpu_predictor");
-  TestSparsePrediction(0.8, "cpu_predictor");
+  Context ctx;
+  TestSparsePrediction(&ctx, 0.2);
+  TestSparsePrediction(&ctx, 0.8);
 }

 TEST(CpuPredictor, SparseColumnSplit) {
-  TestSparsePredictionColumnSplit(0.2, "cpu_predictor");
-  TestSparsePredictionColumnSplit(0.8, "cpu_predictor");
+  Context ctx;
+  TestSparsePredictionColumnSplit(&ctx, 0.2);
+  TestSparsePredictionColumnSplit(&ctx, 0.8);
 }

 TEST(CpuPredictor, Multi) {
@@ -266,4 +277,6 @@ TEST(CpuPredictor, Multi) {
  ctx.nthread = 1;
  TestVectorLeafPrediction(&ctx);
 }
+
+TEST(CpuPredictor, Access) { TestPredictionDeviceAccess(); }
 }  // namespace xgboost
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -15,8 +15,7 @@
 #include "../helpers.h"
 #include "test_predictor.h"

-namespace xgboost {
-namespace predictor {
+namespace xgboost::predictor {

 TEST(GPUPredictor, Basic) {
  auto cpu_lparam = MakeCUDACtx(-1);
@@ -57,21 +56,85 @@ TEST(GPUPredictor, Basic) {
  }
 }

+namespace {
+void VerifyBasicColumnSplit(std::array<std::vector<float>, 32> const& expected_result) {
+  auto const world_size = collective::GetWorldSize();
+  auto const rank = collective::GetRank();
+
+  auto ctx = MakeCUDACtx(rank);
+  std::unique_ptr<Predictor> predictor =
+      std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &ctx));
+  predictor->Configure({});
+
+  for (size_t i = 1; i < 33; i *= 2) {
+    size_t n_row = i, n_col = i;
+    auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix();
+    std::unique_ptr<DMatrix> sliced{dmat->SliceCol(world_size, rank)};
+
+    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.gpu_id)};
+    gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);
+
+    // Test predict batch
+    PredictionCacheEntry out_predictions;
+
+    predictor->InitOutPredictions(sliced->Info(), &out_predictions.predictions, model);
+    predictor->PredictBatch(sliced.get(), &out_predictions, model, 0);
+
+    std::vector<float>& out_predictions_h = out_predictions.predictions.HostVector();
+    EXPECT_EQ(out_predictions_h, expected_result[i - 1]);
+  }
+}
+}  // anonymous namespace
+
+TEST(GPUPredictor, MGPUBasicColumnSplit) {
+  auto const n_gpus = common::AllVisibleGPUs();
+  if (n_gpus <= 1) {
+    GTEST_SKIP() << "Skipping MGPUIBasicColumnSplit test with # GPUs = " << n_gpus;
+  }
+
+  auto ctx = MakeCUDACtx(0);
+  std::unique_ptr<Predictor> predictor =
+      std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &ctx));
+  predictor->Configure({});
+
+  std::array<std::vector<float>, 32> result{};
+  for (size_t i = 1; i < 33; i *= 2) {
+    size_t n_row = i, n_col = i;
+    auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix();
+
+    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.gpu_id)};
+    gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);
+
+    // Test predict batch
+    PredictionCacheEntry out_predictions;
+
+    predictor->InitOutPredictions(dmat->Info(), &out_predictions.predictions, model);
+    predictor->PredictBatch(dmat.get(), &out_predictions, model, 0);
+
+    std::vector<float>& out_predictions_h = out_predictions.predictions.HostVector();
+    result[i - 1] = out_predictions_h;
+  }
+
+  RunWithInMemoryCommunicator(n_gpus, VerifyBasicColumnSplit, result);
+}
+
 TEST(GPUPredictor, EllpackBasic) {
-  size_t constexpr kCols {8};
+  size_t constexpr kCols{8};
+  auto ctx = MakeCUDACtx(0);
  for (size_t bins = 2; bins < 258; bins += 16) {
    size_t rows = bins * 16;
-    auto p_m = RandomDataGenerator{rows, kCols, 0.0}.Bins(bins).Device(0).GenerateDeviceDMatrix();
+    auto p_m =
+        RandomDataGenerator{rows, kCols, 0.0}.Bins(bins).Device(0).GenerateDeviceDMatrix(false);
    ASSERT_FALSE(p_m->PageExists<SparsePage>());
-    TestPredictionFromGradientIndex<EllpackPage>("gpu_predictor", rows, kCols, p_m);
-    TestPredictionFromGradientIndex<EllpackPage>("gpu_predictor", bins, kCols, p_m);
+    TestPredictionFromGradientIndex<EllpackPage>(&ctx, rows, kCols, p_m);
+    TestPredictionFromGradientIndex<EllpackPage>(&ctx, bins, kCols, p_m);
  }
 }

 TEST(GPUPredictor, EllpackTraining) {
  size_t constexpr kRows { 128 }, kCols { 16 }, kBins { 64 };
  auto p_ellpack =
-      RandomDataGenerator{kRows, kCols, 0.0}.Bins(kBins).Device(0).GenerateDeviceDMatrix();
+      RandomDataGenerator{kRows, kCols, 0.0}.Bins(kBins).Device(0).GenerateDeviceDMatrix(false);
  HostDeviceVector<float> storage(kRows * kCols);
  auto columnar = RandomDataGenerator{kRows, kCols, 0.0}
       .Device(0)
@@ -119,29 +182,32 @@ TEST(GPUPredictor, ExternalMemoryTest) {
 }

 TEST(GPUPredictor, InplacePredictCupy) {
+  auto ctx = MakeCUDACtx(0);
  size_t constexpr kRows{128}, kCols{64};
  RandomDataGenerator gen(kRows, kCols, 0.5);
-  gen.Device(0);
+  gen.Device(ctx.gpu_id);
  HostDeviceVector<float> data;
  std::string interface_str = gen.GenerateArrayInterface(&data);
  std::shared_ptr<DMatrix> p_fmat{new data::DMatrixProxy};
  dynamic_cast<data::DMatrixProxy*>(p_fmat.get())->SetCUDAArray(interface_str.c_str());
-  TestInplacePrediction(p_fmat, "gpu_predictor", kRows, kCols, 0);
+  TestInplacePrediction(&ctx, p_fmat, kRows, kCols);
 }

 TEST(GPUPredictor, InplacePredictCuDF) {
+  auto ctx = MakeCUDACtx(0);
  size_t constexpr kRows{128}, kCols{64};
  RandomDataGenerator gen(kRows, kCols, 0.5);
-  gen.Device(0);
+  gen.Device(ctx.gpu_id);
  std::vector<HostDeviceVector<float>> storage(kCols);
  auto interface_str = gen.GenerateColumnarArrayInterface(&storage);
  std::shared_ptr<DMatrix> p_fmat{new data::DMatrixProxy};
  dynamic_cast<data::DMatrixProxy*>(p_fmat.get())->SetCUDAArray(interface_str.c_str());
-  TestInplacePrediction(p_fmat, "gpu_predictor", kRows, kCols, 0);
+  TestInplacePrediction(&ctx, p_fmat, kRows, kCols);
 }

 TEST(GpuPredictor, LesserFeatures) {
-  TestPredictionWithLesserFeatures("gpu_predictor");
+  auto ctx = MakeCUDACtx(0);
+  TestPredictionWithLesserFeatures(&ctx);
 }

 // Very basic test of empty model
@@ -154,7 +220,7 @@ TEST(GPUPredictor, ShapStump) {
  gbm::GBTreeModel model(&mparam, &ctx);

  std::vector<std::unique_ptr<RegTree>> trees;
-  trees.push_back(std::unique_ptr<RegTree>(new RegTree));
+  trees.push_back(std::make_unique<RegTree>());
  model.CommitModelGroup(std::move(trees), 0);

  auto gpu_lparam = MakeCUDACtx(0);
@@ -181,7 +247,7 @@ TEST(GPUPredictor, Shap) {
  gbm::GBTreeModel model(&mparam, &ctx);

  std::vector<std::unique_ptr<RegTree>> trees;
-  trees.push_back(std::unique_ptr<RegTree>(new RegTree));
+  trees.push_back(std::make_unique<RegTree>());
  trees[0]->ExpandNode(0, 0, 0.5, true, 1.0, -1.0, 1.0, 0.0, 5.0, 2.0, 3.0);
  model.CommitModelGroup(std::move(trees), 0);

@@ -206,15 +272,18 @@ TEST(GPUPredictor, Shap) {
 }

 TEST(GPUPredictor, IterationRange) {
-  TestIterationRange("gpu_predictor");
+  auto ctx = MakeCUDACtx(0);
+  TestIterationRange(&ctx);
 }

 TEST(GPUPredictor, CategoricalPrediction) {
-  TestCategoricalPrediction("gpu_predictor");
+  auto ctx = MakeCUDACtx(0);
+  TestCategoricalPrediction(&ctx, false);
 }

 TEST(GPUPredictor, CategoricalPredictLeaf) {
-  TestCategoricalPredictLeaf(StringView{"gpu_predictor"});
+  auto ctx = MakeCUDACtx(0);
+  TestCategoricalPredictLeaf(&ctx, false);
 }

 TEST(GPUPredictor, PredictLeafBasic) {
@@ -238,8 +307,8 @@ TEST(GPUPredictor, PredictLeafBasic) {
 }

 TEST(GPUPredictor, Sparse) {
-  TestSparsePrediction(0.2, "gpu_predictor");
-  TestSparsePrediction(0.8, "gpu_predictor");
+  auto ctx = MakeCUDACtx(0);
+  TestSparsePrediction(&ctx, 0.2);
+  TestSparsePrediction(&ctx, 0.8);
 }
-}  // namespace predictor
-}  // namespace xgboost
+}  // namespace xgboost::predictor
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
amdsc21	5929890174	[CI] Update RAPIDS to latest stable	2023-08-10 20:02:16 +00:00
Rong Ou	15ca12a77e	Fix NCCL test hang (#9367 )	2023-07-07 11:21:35 +08:00
Jiaming Yuan	41c6813496	Preserve order of saved updaters config. (#9355 ) - Save the updater sequence as an array instead of object. - Warn only once. The compatibility is kept, but we should be able to break it as the config is not loaded in pickle model and it's declared to be not stable.	2023-07-05 20:20:07 +08:00
Jiaming Yuan	b572a39919	[doc] Fix removed reference. (#9358 )	2023-07-05 16:49:25 +08:00
Jiaming Yuan	645037e376	Improve test coverage with predictor configuration. (#9354 ) * Improve test coverage with predictor configuration. - Test with ext memory. - Test with QDM. - Test with dart.	2023-07-05 15:17:22 +08:00
Oliver Holworthy	6c9c8a9001	Enable Installation of Python Package with System lib in a Virtual Environment (#9349 )	2023-07-05 05:46:17 +08:00
Boris	bb2de1fd5d	xgboost4j-gpu_2.12-2.0.0: added libxgboost4j.so back. (#9351 )	2023-07-04 03:31:33 +08:00
Jiaming Yuan	d0916849a6	Remove unused weight from buffer for cat features. (#9341 )	2023-07-04 01:07:09 +08:00
Jiaming Yuan	6155394a06	Update news for 1.7.6 [skip ci] (#9350 )	2023-07-04 01:04:34 +08:00
Jiaming Yuan	e964654b8f	[skl] Enable cat feature without specifying tree method. (#9353 )	2023-07-03 22:06:17 +08:00
Jiaming Yuan	39390cc2ee	[breaking] Remove the `predictor` param, allow fallback to prediction using `DMatrix`. (#9129 ) - A `DeviceOrd` struct is implemented to indicate the device. It will eventually replace the `gpu_id` parameter. - The `predictor` parameter is removed. - Fallback to `DMatrix` when `inplace_predict` is not available. - The heuristic for choosing a predictor is only used during training.	2023-07-03 19:23:54 +08:00
Rong Ou	3a0f787703	Support column split in GPU predictor (#9343 )	2023-07-03 04:05:34 +08:00
Rong Ou	f90771eec6	Fix device communicator dependency (#9346 )	2023-06-29 10:34:30 +08:00
Jiaming Yuan	f4798718c7	Use hist as the default tree method. (#9320 )	2023-06-27 23:04:24 +08:00
Jiaming Yuan	bc267dd729	Use ptr from `mmap` for `GHistIndexMatrix` and `ColumnMatrix`. (#9315 ) * Use ptr from mmap for `GHistIndexMatrix` and `ColumnMatrix`. - Define a resource for holding various types of memory pointers. - Define ref vector for holding resources. - Swap the underlying resources for GHist and ColumnM. - Add documentation for current status. - s390x support is removed. It should work if you can compile XGBoost, all the old workaround code does is to get GCC to compile.	2023-06-27 19:05:46 +08:00
jasjung	96c3071a8a	[doc] Update learning_to_rank.rst (#9336 )	2023-06-27 13:56:18 +08:00
Jiaming Yuan	cfa9c42eb4	Fix callback in AFT viz demo. (#9333 ) * Fix callback in AFT viz demo. - Update the callback function. - Add lint check.	2023-06-26 22:35:02 +08:00
Jiaming Yuan	6efe7c129f	[doc] Update reference in R vignettes. (#9323 )	2023-06-26 18:32:11 +08:00