rocm enable for v2.0.1

2023-10-27 18:50:28 -07:00
parent 2e7e9d3b2d a408254c2f
commit 782b73f2bb
447 changed files with 13518 additions and 8719 deletions
--- a/python-package/packager/nativelib.py
+++ b/python-package/packager/nativelib.py
@@ -132,16 +132,28 @@ def locate_or_build_libxgboost(

    if build_config.use_system_libxgboost:
        # Find libxgboost from system prefix
-        sys_prefix = pathlib.Path(sys.prefix).absolute().resolve()
-        libxgboost_sys = sys_prefix / "lib" / _lib_name()
-        if not libxgboost_sys.exists():
-            raise RuntimeError(
-                f"use_system_libxgboost was specified but {_lib_name()} is "
-                f"not found in {libxgboost_sys.parent}"
-            )
-
-        logger.info("Using system XGBoost: %s", str(libxgboost_sys))
-        return libxgboost_sys
+        sys_prefix = pathlib.Path(sys.base_prefix)
+        sys_prefix_candidates = [
+            sys_prefix / "lib",
+            # Paths possibly used on Windows
+            sys_prefix / "bin",
+            sys_prefix / "Library",
+            sys_prefix / "Library" / "bin",
+            sys_prefix / "Library" / "lib",
+        ]
+        sys_prefix_candidates = [
+            p.expanduser().resolve() for p in sys_prefix_candidates
+        ]
+        for candidate_dir in sys_prefix_candidates:
+            libtreelite_sys = candidate_dir / _lib_name()
+            if libtreelite_sys.exists():
+                logger.info("Using system XGBoost: %s", str(libtreelite_sys))
+                return libtreelite_sys
+        raise RuntimeError(
+            f"use_system_libxgboost was specified but {_lib_name()} is "
+            f"not found. Paths searched (in order): \n"
+            + "\n".join([f"* {str(p)}" for p in sys_prefix_candidates])
+        )

    libxgboost = locate_local_libxgboost(toplevel_dir, logger=logger)
    if libxgboost is not None:
--- a/python-package/pyproject.toml
+++ b/python-package/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "packager.pep517"

 [project]
 name = "xgboost"
-version = "2.0.0-dev"
+version = "2.0.1"
 authors = [
    { name = "Hyunsu Cho", email = "chohyu01@cs.washington.edu" },
    { name = "Jiaming Yuan", email = "jm.yuan@outlook.com" }
--- a/python-package/xgboost/VERSION
+++ b/python-package/xgboost/VERSION
@@ -1 +1 @@
-2.0.0-dev
+2.0.1
--- a/python-package/xgboost/_typing.py
+++ b/python-package/xgboost/_typing.py
@@ -8,7 +8,9 @@ from typing import (
    Callable,
    Dict,
    List,
+    Optional,
    Sequence,
+    Tuple,
    Type,
    TypeVar,
    Union,
@@ -20,8 +22,6 @@ import numpy as np

 DataType = Any

-# xgboost accepts some other possible types in practice due to historical reason, which is
-# lesser tested.  For now we encourage users to pass a simple list of string.
 FeatureInfo = Sequence[str]
 FeatureNames = FeatureInfo
 FeatureTypes = FeatureInfo
@@ -97,6 +97,13 @@ else:
        ctypes._Pointer,
    ]

+# The second arg is actually Optional[List[cudf.Series]], skipped for easier type check.
+# The cudf Series is the obtained cat codes, preserved in the `DataIter` to prevent it
+# being freed.
+TransformedData = Tuple[
+    Any, Optional[List], Optional[FeatureNames], Optional[FeatureTypes]
+]
+
 # template parameter
 _T = TypeVar("_T")
 _F = TypeVar("_F", bound=Callable[..., Any])
--- a/python-package/xgboost/callback.py
+++ b/python-package/xgboost/callback.py
@@ -134,13 +134,17 @@ class CallbackContainer:
        is_cv: bool = False,
    ) -> None:
        self.callbacks = set(callbacks)
-        if metric is not None:
-            msg = (
-                "metric must be callable object for monitoring.  For "
-                + "builtin metrics, passing them in training parameter"
-                + " will invoke monitor automatically."
-            )
-            assert callable(metric), msg
+        for cb in callbacks:
+            if not isinstance(cb, TrainingCallback):
+                raise TypeError("callback must be an instance of `TrainingCallback`.")
+
+        msg = (
+            "metric must be callable object for monitoring.  For builtin metrics"
+            ", passing them in training parameter invokes monitor automatically."
+        )
+        if metric is not None and not callable(metric):
+            raise TypeError(msg)
+
        self.metric = metric
        self.history: TrainingCallback.EvalsLog = collections.OrderedDict()
        self._output_margin = output_margin
@@ -170,16 +174,6 @@ class CallbackContainer:
            else:
                assert isinstance(model, Booster), msg

-        if not self.is_cv:
-            if model.attr("best_score") is not None:
-                model.best_score = float(cast(str, model.attr("best_score")))
-                model.best_iteration = int(cast(str, model.attr("best_iteration")))
-            else:
-                # Due to compatibility with version older than 1.4, these attributes are
-                # added to Python object even if early stopping is not used.
-                model.best_iteration = model.num_boosted_rounds() - 1
-                model.set_attr(best_iteration=str(model.best_iteration))
-
        return model

    def before_iteration(
@@ -267,9 +261,14 @@ class LearningRateScheduler(TrainingCallback):
    def __init__(
        self, learning_rates: Union[Callable[[int], float], Sequence[float]]
    ) -> None:
-        assert callable(learning_rates) or isinstance(
+        if not callable(learning_rates) and not isinstance(
            learning_rates, collections.abc.Sequence
-        )
+        ):
+            raise TypeError(
+                "Invalid learning rates, expecting callable or sequence, got: "
+                f"{type(learning_rates)}"
+            )
+
        if callable(learning_rates):
            self.learning_rates = learning_rates
        else:
@@ -302,24 +301,28 @@ class EarlyStopping(TrainingCallback):
    save_best :
        Whether training should return the best model or the last model.
    min_delta :
-        Minimum absolute change in score to be qualified as an improvement.

        .. versionadded:: 1.5.0

-        .. code-block:: python
+        Minimum absolute change in score to be qualified as an improvement.

-            es = xgboost.callback.EarlyStopping(
-                rounds=2,
-                min_delta=1e-3,
-                save_best=True,
-                maximize=False,
-                data_name="validation_0",
-                metric_name="mlogloss",
-            )
-            clf = xgboost.XGBClassifier(tree_method="gpu_hist", callbacks=[es])
+    Examples
+    --------

-            X, y = load_digits(return_X_y=True)
-            clf.fit(X, y, eval_set=[(X, y)])
+    .. code-block:: python
+
+        es = xgboost.callback.EarlyStopping(
+            rounds=2,
+            min_delta=1e-3,
+            save_best=True,
+            maximize=False,
+            data_name="validation_0",
+            metric_name="mlogloss",
+        )
+        clf = xgboost.XGBClassifier(tree_method="hist", device="cuda", callbacks=[es])
+
+        X, y = load_digits(return_X_y=True)
+        clf.fit(X, y, eval_set=[(X, y)])
    """

    # pylint: disable=too-many-arguments
@@ -363,7 +366,7 @@ class EarlyStopping(TrainingCallback):
            return numpy.greater(get_s(new) - self._min_delta, get_s(best))

        def minimize(new: _Score, best: _Score) -> bool:
-            """New score should be smaller than the old one."""
+            """New score should be less than the old one."""
            return numpy.greater(get_s(best) - self._min_delta, get_s(new))

        if self.maximize is None:
@@ -419,38 +422,53 @@ class EarlyStopping(TrainingCallback):
    ) -> bool:
        epoch += self.starting_round  # training continuation
        msg = "Must have at least 1 validation dataset for early stopping."
-        assert len(evals_log.keys()) >= 1, msg
-        data_name = ""
+        if len(evals_log.keys()) < 1:
+            raise ValueError(msg)
+
+        # Get data name
        if self.data:
-            for d, _ in evals_log.items():
-                if d == self.data:
-                    data_name = d
-            if not data_name:
-                raise ValueError("No dataset named:", self.data)
+            data_name = self.data
        else:
            # Use the last one as default.
            data_name = list(evals_log.keys())[-1]
-        assert isinstance(data_name, str) and data_name
+        if data_name not in evals_log:
+            raise ValueError(f"No dataset named: {data_name}")
+
+        if not isinstance(data_name, str):
+            raise TypeError(
+                f"The name of the dataset should be a string. Got: {type(data_name)}"
+            )
        data_log = evals_log[data_name]

-        # Filter out scores that can not be used for early stopping.
+        # Get metric name
        if self.metric_name:
            metric_name = self.metric_name
        else:
            # Use last metric by default.
-            assert isinstance(data_log, collections.OrderedDict)
            metric_name = list(data_log.keys())[-1]
+        if metric_name not in data_log:
+            raise ValueError(f"No metric named: {metric_name}")
+
+        # The latest score
        score = data_log[metric_name][-1]
        return self._update_rounds(score, data_name, metric_name, model, epoch)

    def after_training(self, model: _Model) -> _Model:
+        if not self.save_best:
+            return model
+
        try:
-            if self.save_best:
-                model = model[: int(model.attr("best_iteration")) + 1]
+            best_iteration = model.best_iteration
+            best_score = model.best_score
+            assert best_iteration is not None and best_score is not None
+            model = model[: best_iteration + 1]
+            model.best_iteration = best_iteration
+            model.best_score = best_score
        except XGBoostError as e:
            raise XGBoostError(
-                "`save_best` is not applicable to current booster"
+                "`save_best` is not applicable to the current booster"
            ) from e
+
        return model


@@ -462,8 +480,6 @@ class EvaluationMonitor(TrainingCallback):
    Parameters
    ----------

-    metric :
-        Extra user defined metric.
    rank :
        Which worker should be used for printing the result.
    period :
--- a/python-package/xgboost/compat.py
+++ b/python-package/xgboost/compat.py
@@ -88,6 +88,18 @@ def is_cudf_available() -> bool:
        return False


+def is_cupy_available() -> bool:
+    """Return True when the `cupy` package can be located and imported."""
+    cupy_spec = importlib.util.find_spec("cupy")
+    if cupy_spec is None:
+        return False
+    try:
+        import cupy
+    except ImportError:
+        return False
+    return True
+
+
 try:
    import scipy.sparse as scipy_sparse
    from scipy.sparse import csr_matrix as scipy_csr
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -3,11 +3,13 @@
 """Core XGBoost Library."""
 import copy
 import ctypes
+import importlib.util
 import json
 import os
 import re
 import sys
 import warnings
+import weakref
 from abc import ABC, abstractmethod
 from collections.abc import Mapping
 from enum import IntEnum, unique
@@ -50,6 +52,7 @@ from ._typing import (
    FeatureTypes,
    ModelIn,
    NumpyOrCupy,
+    TransformedData,
    c_bst_ulong,
 )
 from .compat import PANDAS_INSTALLED, DataFrame, py_str
@@ -152,7 +155,11 @@ def _expect(expectations: Sequence[Type], got: Type) -> str:

 def _log_callback(msg: bytes) -> None:
    """Redirect logs from native library into Python console"""
-    print(py_str(msg))
+    smsg = py_str(msg)
+    if smsg.find("WARNING:") != -1:
+        warnings.warn(smsg, UserWarning)
+        return
+    print(smsg)


 def _get_log_callback_func() -> Callable:
@@ -228,8 +235,11 @@ Error message(s): {os_error_list}

    def parse(ver: str) -> Tuple[int, int, int]:
        """Avoid dependency on packaging (PEP 440)."""
-        # 2.0.0-dev or 2.0.0
+        # 2.0.0-dev, 2.0.0, or 2.0.0rc1
        major, minor, patch = ver.split("-")[0].split(".")
+        rc = patch.find("rc")
+        if rc != -1:
+            patch = patch[:rc]
        return int(major), int(minor), int(patch)

    libver = _lib_version(lib)
@@ -271,6 +281,44 @@ def _check_call(ret: int) -> None:
        raise XGBoostError(py_str(_LIB.XGBGetLastError()))


+def _check_distributed_params(kwargs: Dict[str, Any]) -> None:
+    """Reject parameters that distributed training cannot honor."""
+    device = kwargs.get("device")
+    if device and not isinstance(device, str):
+        raise TypeError(
+            "Invalid type for the `device` parameter" + _expect((str,), type(device))
+        )
+    if device and ":" in device:
+        raise ValueError(
+            "Distributed training doesn't support selecting device ordinal as GPUs are"
+            " managed by the distributed framework. use `device=cuda` or `device=gpu`"
+            " instead."
+        )
+
+    booster = kwargs.get("booster")
+    if booster == "gblinear":
+        raise NotImplementedError(
+            f"booster `{booster}` is not supported for distributed training."
+        )
+
+
+def _validate_feature_info(
+    feature_info: Sequence[str], n_features: int, name: str
+) -> List[str]:
+    """Check a feature names/types input and return it as a list.
+
+    Parameters
+    ----------
+    feature_info :
+        A non-string sequence, one entry per column.
+    n_features :
+        Expected number of entries. The length check is skipped when it is 0.
+    name :
+        Human readable name of the field, used in error messages.
+
+    Raises
+    ------
+    TypeError
+        If `feature_info` is a single string or not a sequence.
+    ValueError
+        If the length does not match a non-zero `n_features`.
+
+    """
+    if isinstance(feature_info, str) or not isinstance(feature_info, Sequence):
+        raise TypeError(
+            f"Expecting a sequence of strings for {name}, got: {type(feature_info)}"
+        )
+    feature_info = list(feature_info)
+    if len(feature_info) != n_features and n_features != 0:
+        # Adjacent literals are concatenated into one string; separating them with
+        # commas would build a tuple and garble the error message.
+        raise ValueError(
+            f"{name} must have the same length as the number of data columns, "
+            f"expected {n_features}, got {len(feature_info)}"
+        )
+    return feature_info
+
+
 def build_info() -> dict:
    """Build information of XGBoost.  The returned value format is not stable. Also,
    please note that build time dependency is not the same as runtime dependency. For
@@ -381,6 +429,54 @@ def c_array(
    return (ctype * len(values))(*values)


+def from_array_interface(interface: dict) -> NumpyOrCupy:
+    """Convert an array interface dictionary to a numpy or cupy array.
+
+    Parameters
+    ----------
+    interface :
+        A dictionary in the ``__array_interface__`` format or, when it contains a
+        ``stream`` key, the ``__cuda_array_interface__`` format.
+
+    Returns
+    -------
+    A copy of the described buffer: a cupy array when the ``stream`` key is present,
+    otherwise a numpy array.
+
+    Raises
+    ------
+    ImportError
+        If the interface contains a ``stream`` key but `cupy` cannot be found.
+
+    """
+
+    class Array:  # pylint: disable=too-few-public-methods
+        """Wrapper type for communicating with numpy and cupy."""
+
+        _interface: Optional[dict] = None
+
+        @property
+        def __array_interface__(self) -> Optional[dict]:
+            return self._interface
+
+        @__array_interface__.setter
+        def __array_interface__(self, interface: dict) -> None:
+            # Shallow copy so that the top-level keys of the caller's dictionary are
+            # not replaced by the conversions below.
+            self._interface = copy.copy(interface)
+            # converts some fields to tuple as required by numpy
+            self._interface["shape"] = tuple(self._interface["shape"])
+            self._interface["data"] = tuple(self._interface["data"])
+            if self._interface.get("strides", None) is not None:
+                self._interface["strides"] = tuple(self._interface["strides"])
+
+        @property
+        def __cuda_array_interface__(self) -> Optional[dict]:
+            # Both protocols are served from the same stored dictionary.
+            return self.__array_interface__
+
+        @__cuda_array_interface__.setter
+        def __cuda_array_interface__(self, interface: dict) -> None:
+            self.__array_interface__ = interface
+
+    arr = Array()
+
+    if "stream" in interface:
+        # A CUDA stream is present, so this is a __cuda_array_interface__.
+        spec = importlib.util.find_spec("cupy")
+        if spec is None:
+            raise ImportError("`cupy` is required for handling CUDA buffer.")
+
+        import cupy as cp  # pylint: disable=import-error
+
+        arr.__cuda_array_interface__ = interface
+        # copy=True: the result does not alias the buffer described by `interface`.
+        out = cp.array(arr, copy=True)
+    else:
+        arr.__array_interface__ = interface
+        # copy=True: the result does not alias the buffer described by `interface`.
+        out = np.array(arr, copy=True)
+
+    return out
+
+
 def _prediction_output(
    shape: CNumericPtr, dims: c_bst_ulong, predts: CFloatPtr, is_cuda: bool
 ) -> NumpyOrCupy:
@@ -395,7 +491,16 @@ def _prediction_output(


 class DataIter(ABC):  # pylint: disable=too-many-instance-attributes
-    """The interface for user defined data iterator.
+    """The interface for user defined data iterator. The iterator facilitates
+    distributed training, :py:class:`QuantileDMatrix`, and external memory support using
+    :py:class:`DMatrix`. Most of the time, users don't need to interact with this class
+    directly.
+
+    .. note::
+
+        The class caches some intermediate results using the `data` input (predictor
+        `X`) as key. Don't repeat the `X` for multiple batches with different meta data
+        (like `label`), make a copy if necessary.

    Parameters
    ----------
@@ -419,13 +524,13 @@ class DataIter(ABC):  # pylint: disable=too-many-instance-attributes
        self._allow_host = True
        self._release = release_data
        # Stage data in Python until reset or next is called to avoid data being free.
-        self._temporary_data: Optional[Tuple[Any, Any, Any, Any]] = None
-        self._input_id: int = 0
+        self._temporary_data: Optional[TransformedData] = None
+        self._data_ref: Optional[weakref.ReferenceType] = None

    def get_callbacks(
        self, allow_host: bool, enable_categorical: bool
    ) -> Tuple[Callable, Callable]:
-        """Get callback functions for iterating in C."""
+        """Get callback functions for iterating in C. This is an internal function."""
        assert hasattr(self, "cache_prefix"), "__init__ is not called."
        self._reset_callback = ctypes.CFUNCTYPE(None, ctypes.c_void_p)(
            self._reset_wrapper
@@ -491,8 +596,8 @@ class DataIter(ABC):  # pylint: disable=too-many-instance-attributes

        @require_keyword_args(True)
        def input_data(
-            data: Any,
            *,
+            data: Any,
            feature_names: Optional[FeatureNames] = None,
            feature_types: Optional[FeatureTypes] = None,
            **kwargs: Any,
@@ -500,7 +605,19 @@ class DataIter(ABC):  # pylint: disable=too-many-instance-attributes
            from .data import _proxy_transform, dispatch_proxy_set_data

            # Reduce the amount of transformation that's needed for QuantileDMatrix.
-            if self._temporary_data is not None and id(data) == self._input_id:
+            #
+            # To construct the QDM, one needs 4 iterations on CPU, or 2 iterations on
+            # GPU. If the QDM has only one batch of input (most of the cases), we can
+            # avoid transforming the data repeatedly.
+            try:
+                ref = weakref.ref(data)
+            except TypeError:
+                ref = None
+            if (
+                self._temporary_data is not None
+                and ref is not None
+                and ref is self._data_ref
+            ):
                new, cat_codes, feature_names, feature_types = self._temporary_data
            else:
                new, cat_codes, feature_names, feature_types = _proxy_transform(
@@ -517,7 +634,7 @@ class DataIter(ABC):  # pylint: disable=too-many-instance-attributes
                feature_types=feature_types,
                **kwargs,
            )
-            self._input_id = id(data)
+            self._data_ref = ref

        # pylint: disable=not-callable
        return self._handle_exception(lambda: self.next(input_data), 0)
@@ -593,6 +710,9 @@ def require_keyword_args(
        @wraps(func)
        def inner_f(*args: Any, **kwargs: Any) -> _T:
            extra_args = len(args) - len(all_args)
+            if not all_args and extra_args > 0:  # keyword argument only
+                raise TypeError("Keyword argument is required.")
+
            if extra_args > 0:
                # ignore first 'self' argument for instance methods
                args_msg = [
@@ -1040,7 +1160,7 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes,too-many-public-m
        testing purposes. If this is a quantized DMatrix then quantized values are
        returned instead of input values.

-            .. versionadded:: 1.7.0
+        .. versionadded:: 1.7.0

        """
        indptr = np.empty(self.num_row() + 1, dtype=np.uint64)
@@ -1060,6 +1180,36 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes,too-many-public-m
        )
        return ret

+    def get_quantile_cut(self) -> Tuple[np.ndarray, np.ndarray]:
+        """Get quantile cuts for quantization.
+
+        .. versionadded:: 2.0.0
+
+        Returns
+        -------
+        indptr, data :
+            ``indptr`` is a ``uint64`` array with ``num_col() + 1`` entries and ``data``
+            is a ``float32`` array holding ``indptr[-1]`` values. Presumably ``indptr``
+            delimits the cut values of each feature inside ``data`` -- TODO confirm
+            against the C API documentation.
+
+        """
+        n_features = self.num_col()
+
+        c_sindptr = ctypes.c_char_p()
+        c_sdata = ctypes.c_char_p()
+        config = make_jcargs()
+        _check_call(
+            _LIB.XGDMatrixGetQuantileCut(
+                self.handle, config, ctypes.byref(c_sindptr), ctypes.byref(c_sdata)
+            )
+        )
+        assert c_sindptr.value is not None
+        assert c_sdata.value is not None
+
+        # Both outputs come back as JSON-encoded array interfaces, which are decoded
+        # and then copied into arrays by `from_array_interface`.
+        i_indptr = json.loads(c_sindptr.value)
+        indptr = from_array_interface(i_indptr)
+        assert indptr.size == n_features + 1
+        assert indptr.dtype == np.uint64
+
+        i_data = json.loads(c_sdata.value)
+        data = from_array_interface(i_data)
+        # The last entry of `indptr` must equal the total number of cut values.
+        assert data.size == indptr[-1]
+        assert data.dtype == np.float32
+        return indptr, data
+
    def num_row(self) -> int:
        """Get the number of rows in the DMatrix."""
        ret = c_bst_ulong()
@@ -1117,11 +1267,10 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes,too-many-public-m

    @property
    def feature_names(self) -> Optional[FeatureNames]:
-        """Get feature names (column labels).
+        """Labels for features (column labels).
+
+        Setting it to ``None`` resets existing feature names.

-        Returns
-        -------
-        feature_names : list or None
        """
        length = c_bst_ulong()
        sarr = ctypes.POINTER(ctypes.c_char_p)()
@@ -1140,67 +1289,61 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes,too-many-public-m

    @feature_names.setter
    def feature_names(self, feature_names: Optional[FeatureNames]) -> None:
-        """Set feature names (column labels).
-
-        Parameters
-        ----------
-        feature_names : list or None
-            Labels for features. None will reset existing feature names
-        """
-        if feature_names is not None:
-            # validate feature name
-            try:
-                if not isinstance(feature_names, str):
-                    feature_names = list(feature_names)
-                else:
-                    feature_names = [feature_names]
-            except TypeError:
-                feature_names = [cast(str, feature_names)]
-
-            if len(feature_names) != len(set(feature_names)):
-                raise ValueError("feature_names must be unique")
-            if len(feature_names) != self.num_col() and self.num_col() != 0:
-                msg = (
-                    "feature_names must have the same length as data, ",
-                    f"expected {self.num_col()}, got {len(feature_names)}",
-                )
-                raise ValueError(msg)
-            # prohibit to use symbols may affect to parse. e.g. []<
-            if not all(
-                isinstance(f, str) and not any(x in f for x in ["[", "]", "<"])
-                for f in feature_names
-            ):
-                raise ValueError(
-                    "feature_names must be string, and may not contain [, ] or <"
-                )
-            feature_names_bytes = [bytes(f, encoding="utf-8") for f in feature_names]
-            c_feature_names = (ctypes.c_char_p * len(feature_names_bytes))(
-                *feature_names_bytes
-            )
-            _check_call(
-                _LIB.XGDMatrixSetStrFeatureInfo(
-                    self.handle,
-                    c_str("feature_name"),
-                    c_feature_names,
-                    c_bst_ulong(len(feature_names)),
-                )
-            )
-        else:
-            # reset feature_types also
+        if feature_names is None:
            _check_call(
                _LIB.XGDMatrixSetStrFeatureInfo(
                    self.handle, c_str("feature_name"), None, c_bst_ulong(0)
                )
            )
-            self.feature_types = None
+            return
+
+        # validate feature name
+        feature_names = _validate_feature_info(
+            feature_names, self.num_col(), "feature names"
+        )
+        if len(feature_names) != len(set(feature_names)):
+            values, counts = np.unique(
+                feature_names,
+                return_index=False,
+                return_inverse=False,
+                return_counts=True,
+            )
+            duplicates = [name for name, cnt in zip(values, counts) if cnt > 1]
+            raise ValueError(
+                f"feature_names must be unique. Duplicates found: {duplicates}"
+            )
+
+        # prohibit the use of symbols that may affect parsing, e.g. []<
+        if not all(
+            isinstance(f, str) and not any(x in f for x in ["[", "]", "<"])
+            for f in feature_names
+        ):
+            raise ValueError(
+                "feature_names must be string, and may not contain [, ] or <"
+            )
+
+        feature_names_bytes = [bytes(f, encoding="utf-8") for f in feature_names]
+        c_feature_names = (ctypes.c_char_p * len(feature_names_bytes))(
+            *feature_names_bytes
+        )
+        _check_call(
+            _LIB.XGDMatrixSetStrFeatureInfo(
+                self.handle,
+                c_str("feature_name"),
+                c_feature_names,
+                c_bst_ulong(len(feature_names)),
+            )
+        )

    @property
    def feature_types(self) -> Optional[FeatureTypes]:
-        """Get feature types (column types).
+        """Type of features (column types).
+
+        This is for displaying the results and categorical data support. See
+        :py:class:`DMatrix` for details.
+
+        Setting it to ``None`` resets existing feature types.

-        Returns
-        -------
-        feature_types : list or None
        """
        length = c_bst_ulong()
        sarr = ctypes.POINTER(ctypes.c_char_p)()
@@ -1218,57 +1361,32 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes,too-many-public-m
        return res

    @feature_types.setter
-    def feature_types(self, feature_types: Optional[Union[List[str], str]]) -> None:
-        """Set feature types (column types).
-
-        This is for displaying the results and categorical data support. See
-        :py:class:`DMatrix` for details.
-
-        Parameters
-        ----------
-        feature_types :
-            Labels for features. None will reset existing feature names
-
-        """
-        # For compatibility reason this function wraps single str input into a list.  But
-        # we should not promote such usage since other than visualization, the field is
-        # also used for specifying categorical data type.
-        if feature_types is not None:
-            if not isinstance(feature_types, (list, str)):
-                raise TypeError("feature_types must be string or list of strings")
-            if isinstance(feature_types, str):
-                # single string will be applied to all columns
-                feature_types = [feature_types] * self.num_col()
-            try:
-                if not isinstance(feature_types, str):
-                    feature_types = list(feature_types)
-                else:
-                    feature_types = [feature_types]
-            except TypeError:
-                feature_types = [cast(str, feature_types)]
-            feature_types_bytes = [bytes(f, encoding="utf-8") for f in feature_types]
-            c_feature_types = (ctypes.c_char_p * len(feature_types_bytes))(
-                *feature_types_bytes
-            )
-            _check_call(
-                _LIB.XGDMatrixSetStrFeatureInfo(
-                    self.handle,
-                    c_str("feature_type"),
-                    c_feature_types,
-                    c_bst_ulong(len(feature_types)),
-                )
-            )
-
-            if len(feature_types) != self.num_col() and self.num_col() != 0:
-                msg = "feature_types must have the same length as data"
-                raise ValueError(msg)
-        else:
-            # Reset.
+    def feature_types(self, feature_types: Optional[FeatureTypes]) -> None:
+        if feature_types is None:
+            # Reset
            _check_call(
                _LIB.XGDMatrixSetStrFeatureInfo(
                    self.handle, c_str("feature_type"), None, c_bst_ulong(0)
                )
            )
+            return
+
+        feature_types = _validate_feature_info(
+            feature_types, self.num_col(), "feature types"
+        )
+
+        feature_types_bytes = [bytes(f, encoding="utf-8") for f in feature_types]
+        c_feature_types = (ctypes.c_char_p * len(feature_types_bytes))(
+            *feature_types_bytes
+        )
+        _check_call(
+            _LIB.XGDMatrixSetStrFeatureInfo(
+                self.handle,
+                c_str("feature_type"),
+                c_feature_types,
+                c_bst_ulong(len(feature_types)),
+            )
+        )


 class _ProxyDMatrix(DMatrix):
@@ -1318,13 +1436,13 @@ class _ProxyDMatrix(DMatrix):


 class QuantileDMatrix(DMatrix):
-    """A DMatrix variant that generates quantilized data directly from input for
-    ``hist`` and ``gpu_hist`` tree methods. This DMatrix is primarily designed to save
-    memory in training by avoiding intermediate storage. Set ``max_bin`` to control the
-    number of bins during quantisation, which should be consistent with the training
-    parameter ``max_bin``. When ``QuantileDMatrix`` is used for validation/test dataset,
-    ``ref`` should be another ``QuantileDMatrix``(or ``DMatrix``, but not recommended as
-    it defeats the purpose of saving memory) constructed from training dataset.  See
+    """A DMatrix variant that generates quantilized data directly from input for the
+    ``hist`` tree method. This DMatrix is primarily designed to save memory in training
+    by avoiding intermediate storage. Set ``max_bin`` to control the number of bins
+    during quantisation, which should be consistent with the training parameter
+    ``max_bin``. When ``QuantileDMatrix`` is used for validation/test dataset, ``ref``
+    should be another ``QuantileDMatrix``(or ``DMatrix``, but not recommended as it
+    defeats the purpose of saving memory) constructed from training dataset.  See
    :py:obj:`xgboost.DMatrix` for documents on meta info.

    .. note::
@@ -1372,7 +1490,7 @@ class QuantileDMatrix(DMatrix):
        enable_categorical: bool = False,
        data_split_mode: DataSplitMode = DataSplitMode.ROW,
    ) -> None:
-        self.max_bin: int = max_bin if max_bin is not None else 256
+        self.max_bin = max_bin
        self.missing = missing if missing is not None else np.nan
        self.nthread = nthread if nthread is not None else -1
        self._silent = silent  # unused, kept for compatibility
@@ -1544,7 +1662,7 @@ class Booster:
        )
        for d in cache:
            # Validate feature only after the feature names are saved into booster.
-            self._validate_dmatrix_features(d)
+            self._assign_dmatrix_features(d)

        if isinstance(model_file, Booster):
            assert self.handle is not None
@@ -1667,6 +1785,11 @@ class Booster:
        self.__dict__.update(state)

    def __getitem__(self, val: Union[int, tuple, slice]) -> "Booster":
+        """Get a slice of the tree-based model.
+
+        .. versionadded:: 1.3.0
+
+        """
        if isinstance(val, int):
            val = slice(val, val + 1)
        if isinstance(val, tuple):
@@ -1705,6 +1828,11 @@ class Booster:
        return sliced

    def __iter__(self) -> Generator["Booster", None, None]:
+        """Iterate over boosted rounds, yielding the slice ``self[i]`` for each one.
+
+        .. versionadded:: 2.0.0
+
+        """
        for i in range(0, self.num_boosted_rounds()):
            yield self[i]

@@ -1795,7 +1923,7 @@ class Booster:
        attr_names = from_cstr_to_pystr(sarr, length)
        return {n: self.attr(n) for n in attr_names}

-    def set_attr(self, **kwargs: Optional[str]) -> None:
+    def set_attr(self, **kwargs: Optional[Any]) -> None:
        """Set the attribute of the Booster.

        Parameters
@@ -1915,7 +2043,7 @@ class Booster:
        """
        if not isinstance(dtrain, DMatrix):
            raise TypeError(f"invalid training matrix: {type(dtrain).__name__}")
-        self._validate_dmatrix_features(dtrain)
+        self._assign_dmatrix_features(dtrain)

        if fobj is None:
            _check_call(
@@ -1947,7 +2075,7 @@ class Booster:
            raise ValueError(f"grad / hess length mismatch: {len(grad)} / {len(hess)}")
        if not isinstance(dtrain, DMatrix):
            raise TypeError(f"invalid training matrix: {type(dtrain).__name__}")
-        self._validate_dmatrix_features(dtrain)
+        self._assign_dmatrix_features(dtrain)

        _check_call(
            _LIB.XGBoosterBoostOneIter(
@@ -1988,7 +2116,7 @@ class Booster:
                raise TypeError(f"expected DMatrix, got {type(d[0]).__name__}")
            if not isinstance(d[1], str):
                raise TypeError(f"expected string, got {type(d[1]).__name__}")
-            self._validate_dmatrix_features(d[0])
+            self._assign_dmatrix_features(d[0])

        dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals])
        evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals])
@@ -2040,7 +2168,7 @@ class Booster:
        result: str
            Evaluation result string.
        """
-        self._validate_dmatrix_features(data)
+        self._assign_dmatrix_features(data)
        return self.eval_set([(data, name)], iteration)

    # pylint: disable=too-many-function-args
@@ -2139,7 +2267,8 @@ class Booster:
        if not isinstance(data, DMatrix):
            raise TypeError("Expecting data to be a DMatrix object, got: ", type(data))
        if validate_features:
-            self._validate_dmatrix_features(data)
+            fn = data.feature_names
+            self._validate_features(fn)
        args = {
            "type": 0,
            "training": training,
@@ -2187,20 +2316,25 @@ class Booster:
        base_margin: Any = None,
        strict_shape: bool = False,
    ) -> NumpyOrCupy:
-        """Run prediction in-place, Unlike :py:meth:`predict` method, inplace prediction
-        does not cache the prediction result.
+        """Run prediction in-place when possible. Unlike :py:meth:`predict` method,
+        inplace prediction does not cache the prediction result.

        Calling only ``inplace_predict`` in multiple threads is safe and lock
        free.  But the safety does not hold when used in conjunction with other
        methods. E.g. you can't train the booster in one thread and perform
        prediction in the other.

+        .. note::
+
+            If the device ordinal of the input data doesn't match the one configured for
+            the booster, data will be copied to the booster device.
+
        .. code-block:: python

-            booster.set_param({"predictor": "gpu_predictor"})
+            booster.set_param({"device": "cuda:0"})
            booster.inplace_predict(cupy_array)

-            booster.set_param({"predictor": "cpu_predictor"})
+            booster.set_param({"device": "cpu"})
            booster.inplace_predict(numpy_array)

        .. versionadded:: 1.1.0
@@ -2208,9 +2342,7 @@ class Booster:
        Parameters
        ----------
        data :
-            The input data, must not be a view for numpy array.  Set
-            ``predictor`` to ``gpu_predictor`` for running prediction on CuPy
-            array or CuDF DataFrame.
+            The input data.
        iteration_range :
            See :py:meth:`predict` for details.
        predict_type :
@@ -2233,8 +2365,8 @@ class Booster:
        Returns
        -------
        prediction : numpy.ndarray/cupy.ndarray
-            The prediction result.  When input data is on GPU, prediction
-            result is stored in a cupy array.
+            The prediction result.  When input data is on GPU, prediction result is
+            stored in a cupy array.

        """
        preds = ctypes.POINTER(ctypes.c_float)()
@@ -2267,6 +2399,7 @@ class Booster:
            _is_cudf_df,
            _is_cupy_array,
            _is_list,
+            _is_np_array_like,
            _is_pandas_df,
            _is_pandas_series,
            _is_tuple,
@@ -2296,7 +2429,7 @@ class Booster:
                    f"got {data.shape[1]}"
                )

-        if isinstance(data, np.ndarray):
+        if _is_np_array_like(data):
            from .data import _ensure_np_dtype

            data, _ = _ensure_np_dtype(data, data.dtype)
@@ -2460,10 +2593,35 @@ class Booster:
        else:
            raise TypeError("Unknown file type: ", fname)

-        if self.attr("best_iteration") is not None:
-            self.best_iteration = int(cast(int, self.attr("best_iteration")))
-        if self.attr("best_score") is not None:
-            self.best_score = float(cast(float, self.attr("best_score")))
+    @property
+    def best_iteration(self) -> int:
+        """Best iteration recorded in the booster's ``best_iteration`` attribute."""
+        stored = self.attr("best_iteration")
+        if stored is None:
+            raise AttributeError(
+                "`best_iteration` is only defined when early stopping is used."
+            )
+        return int(stored)
+
+    @best_iteration.setter
+    def best_iteration(self, iteration: int) -> None:
+        """Store ``iteration`` as the booster's ``best_iteration`` attribute."""
+        self.set_attr(best_iteration=iteration)
+
+    @property
+    def best_score(self) -> float:
+        """The best evaluation score during training."""
+        best = self.attr("best_score")
+        if best is not None:
+            return float(best)
+        raise AttributeError(
+            "`best_score` is only defined when early stopping is used."
+        )
+
+    @best_score.setter
+    def best_score(self, score: float) -> None:
+        """Store ``score`` as the booster's ``best_score`` attribute."""
+        self.set_attr(best_score=score)

    def num_boosted_rounds(self) -> int:
        """Get number of boosted rounds.  For gblinear this is reset to 0 after
@@ -2761,14 +2919,13 @@ class Booster:
        # pylint: disable=no-member
        return df.sort(["Tree", "Node"]).reset_index(drop=True)

-    def _validate_dmatrix_features(self, data: DMatrix) -> None:
+    def _assign_dmatrix_features(self, data: DMatrix) -> None:
        if data.num_row() == 0:
            return

        fn = data.feature_names
        ft = data.feature_types
-        # Be consistent with versions before 1.7, "validate" actually modifies the
-        # booster.
+
        if self.feature_names is None:
            self.feature_names = fn
        if self.feature_types is None:
--- a/python-package/xgboost/dask.py
+++ b/python-package/xgboost/dask.py
@@ -70,6 +70,7 @@ from .core import (
    Metric,
    Objective,
    QuantileDMatrix,
+    _check_distributed_params,
    _deprecate_positional_args,
    _expect,
 )
@@ -82,6 +83,7 @@ from .sklearn import (
    XGBRanker,
    XGBRankerMixIn,
    XGBRegressorBase,
+    _can_use_qdm,
    _check_rf_callback,
    _cls_predict_proba,
    _objective_decorator,
@@ -617,14 +619,7 @@ class DaskPartitionIter(DataIter):  # pylint: disable=R0902
        if self._iter == len(self._data):
            # Return 0 when there's no more batch.
            return 0
-        feature_names: Optional[FeatureNames] = None
-        if self._feature_names:
-            feature_names = self._feature_names
-        else:
-            if hasattr(self.data(), "columns"):
-                feature_names = self.data().columns.format()
-            else:
-                feature_names = None
+
        input_data(
            data=self.data(),
            label=self._get("_label"),
@@ -634,7 +629,7 @@ class DaskPartitionIter(DataIter):  # pylint: disable=R0902
            base_margin=self._get("_base_margin"),
            label_lower_bound=self._get("_label_lower_bound"),
            label_upper_bound=self._get("_label_upper_bound"),
-            feature_names=feature_names,
+            feature_names=self._feature_names,
            feature_types=self._feature_types,
            feature_weights=self._feature_weights,
        )
@@ -855,8 +850,6 @@ async def _get_rabit_args(
    except Exception:  # pylint: disable=broad-except
        sched_addr = None

-    # make sure all workers are online so that we can obtain reliable scheduler_info
-    await client.wait_for_workers(n_workers)  # type: ignore
    env = await client.run_on_scheduler(
        _start_tracker, n_workers, sched_addr, user_addr
    )
@@ -912,6 +905,16 @@ def _filter_empty(
    raise ValueError("None of the workers can provide a valid result.")


+async def _check_workers_are_alive(
+    workers: List[str], client: "distributed.Client"
+) -> None:
+    """Raise if any of ``workers`` is not listed in the scheduler's identity info."""
+    known = (await client.scheduler.identity())["workers"].keys()
+    absent = set(workers) - known
+    if absent:
+        raise RuntimeError(f"Missing required workers: {absent}")
+
+
 async def _train_async(
    client: "distributed.Client",
    global_config: Dict[str, Any],
@@ -929,12 +932,9 @@ async def _train_async(
    custom_metric: Optional[Metric],
 ) -> Optional[TrainReturnT]:
    workers = _get_workers_from_data(dtrain, evals)
+    await _check_workers_are_alive(workers, client)
    _rabit_args = await _get_rabit_args(len(workers), dconfig, client)
-
-    if params.get("booster", None) == "gblinear":
-        raise NotImplementedError(
-            f"booster `{params['booster']}` is not yet supported for dask."
-        )
+    _check_distributed_params(params)

    def dispatched_train(
        parameters: Dict,
@@ -1574,7 +1574,7 @@ async def _async_wrap_evaluation_matrices(
    """A switch function for async environment."""

    def _dispatch(ref: Optional[DaskDMatrix], **kwargs: Any) -> DaskDMatrix:
-        if tree_method in ("hist", "gpu_hist"):
+        if _can_use_qdm(tree_method):
            return DaskQuantileDMatrix(
                client=client, ref=ref, max_bin=max_bin, **kwargs
            )
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -5,7 +5,7 @@ import ctypes
 import json
 import os
 import warnings
-from typing import Any, Callable, Iterator, List, Optional, Sequence, Tuple, Union, cast
+from typing import Any, Callable, Iterator, List, Optional, Sequence, Tuple, cast

 import numpy as np

@@ -17,6 +17,7 @@ from ._typing import (
    FloatCompatible,
    NumpyDType,
    PandasDType,
+    TransformedData,
    c_bst_ulong,
 )
 from .compat import DataFrame, lazy_isinstance
@@ -163,8 +164,8 @@ def _is_scipy_coo(data: DataType) -> bool:
    return isinstance(data, scipy.sparse.coo_matrix)


-def _is_numpy_array(data: DataType) -> bool:
-    return isinstance(data, (np.ndarray, np.matrix))
+def _is_np_array_like(data: DataType) -> bool:
+    return hasattr(data, "__array_interface__")  # duck-typed, not only np.ndarray


 def _ensure_np_dtype(
@@ -197,6 +198,7 @@ def _from_numpy_array(
    nthread: int,
    feature_names: Optional[FeatureNames],
    feature_types: Optional[FeatureTypes],
+    data_split_mode: DataSplitMode = DataSplitMode.ROW,
 ) -> DispatchedDataBackendReturnType:
    """Initialize data from a 2-D numpy matrix."""
    _check_data_shape(data)
@@ -205,7 +207,11 @@ def _from_numpy_array(
    _check_call(
        _LIB.XGDMatrixCreateFromDense(
            _array_interface(data),
-            make_jcargs(missing=float(missing), nthread=int(nthread)),
+            make_jcargs(
+                missing=float(missing),
+                nthread=int(nthread),
+                data_split_mode=int(data_split_mode),
+            ),
            ctypes.byref(handle),
        )
    )
@@ -311,7 +317,6 @@ def pandas_feature_info(
 ) -> Tuple[Optional[FeatureNames], Optional[FeatureTypes]]:
    """Handle feature info for pandas dataframe."""
    import pandas as pd
-    from pandas.api.types import is_categorical_dtype, is_sparse

    # handle feature names
    if feature_names is None and meta is None:
@@ -326,10 +331,10 @@ def pandas_feature_info(
    if feature_types is None and meta is None:
        feature_types = []
        for dtype in data.dtypes:
-            if is_sparse(dtype):
+            if is_pd_sparse_dtype(dtype):
                feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
            elif (
-                is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
+                is_pd_cat_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
            ) and enable_categorical:
                feature_types.append(CAT_T)
            else:
@@ -339,18 +344,13 @@ def pandas_feature_info(

 def is_nullable_dtype(dtype: PandasDType) -> bool:
    """Whether dtype is a pandas nullable type."""
-    from pandas.api.types import (
-        is_bool_dtype,
-        is_categorical_dtype,
-        is_float_dtype,
-        is_integer_dtype,
-    )
+    from pandas.api.types import is_bool_dtype, is_float_dtype, is_integer_dtype

    is_int = is_integer_dtype(dtype) and dtype.name in pandas_nullable_mapper
    # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
    is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
    is_float = is_float_dtype(dtype) and dtype.name in pandas_nullable_mapper
-    return is_int or is_bool or is_float or is_categorical_dtype(dtype)
+    return is_int or is_bool or is_float or is_pd_cat_dtype(dtype)


 def is_pa_ext_dtype(dtype: Any) -> bool:
@@ -365,17 +365,48 @@ def is_pa_ext_categorical_dtype(dtype: Any) -> bool:
    )


+def is_pd_cat_dtype(dtype: PandasDType) -> bool:
+    """Return whether ``dtype`` is a pandas categorical dtype."""
+    import pandas as pd
+
+    # pandas >= 2.1.0 is tested with ``isinstance``; any other pandas goes through
+    # ``pandas.api.types.is_categorical_dtype``.
+    parse = getattr(getattr(pd.util, "version", None), "Version", None)
+    if parse is not None and parse(pd.__version__) >= parse("2.1.0"):
+        from pandas import CategoricalDtype
+
+        return isinstance(dtype, CategoricalDtype)
+    from pandas.api.types import is_categorical_dtype
+
+    return is_categorical_dtype(dtype)
+
+
+def is_pd_sparse_dtype(dtype: PandasDType) -> bool:
+    """Return whether ``dtype`` is a pandas sparse dtype."""
+    import pandas as pd
+
+    # pandas >= 2.1.0 is tested with ``isinstance``; any other pandas goes through
+    # ``pandas.api.types.is_sparse``.
+    parse = getattr(getattr(pd.util, "version", None), "Version", None)
+    if parse is not None and parse(pd.__version__) >= parse("2.1.0"):
+        from pandas import SparseDtype
+
+        return isinstance(dtype, SparseDtype)
+    from pandas.api.types import is_sparse
+
+    return is_sparse(dtype)
+
+
 def pandas_cat_null(data: DataFrame) -> DataFrame:
    """Handle categorical dtype and nullable extension types from pandas."""
    import pandas as pd
-    from pandas.api.types import is_categorical_dtype

    # handle category codes and nullable.
    cat_columns = []
    nul_columns = []
    # avoid an unnecessary conversion if possible
    for col, dtype in zip(data.columns, data.dtypes):
-        if is_categorical_dtype(dtype):
+        if is_pd_cat_dtype(dtype):
            cat_columns.append(col)
        elif is_pa_ext_categorical_dtype(dtype):
            raise ValueError(
@@ -392,7 +423,7 @@ def pandas_cat_null(data: DataFrame) -> DataFrame:
        transformed = data

    def cat_codes(ser: pd.Series) -> pd.Series:
-        if is_categorical_dtype(ser.dtype):
+        if is_pd_cat_dtype(ser.dtype):
            return ser.cat.codes
        assert is_pa_ext_categorical_dtype(ser.dtype)
        # Not yet supported, the index is not ordered for some reason. Alternately:
@@ -448,14 +479,12 @@ def _transform_pandas_df(
    meta: Optional[str] = None,
    meta_type: Optional[NumpyDType] = None,
 ) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]:
-    from pandas.api.types import is_categorical_dtype, is_sparse
-
    pyarrow_extension = False
    for dtype in data.dtypes:
        if not (
            (dtype.name in _pandas_dtype_mapper)
-            or is_sparse(dtype)
-            or (is_categorical_dtype(dtype) and enable_categorical)
+            or is_pd_sparse_dtype(dtype)
+            or (is_pd_cat_dtype(dtype) and enable_categorical)
            or is_pa_ext_dtype(dtype)
        ):
            _invalid_dataframe_dtype(data)
@@ -509,9 +538,8 @@ def _meta_from_pandas_series(
 ) -> None:
    """Help transform pandas series for meta data like labels"""
    data = data.values.astype("float")
-    from pandas.api.types import is_sparse

-    if is_sparse(data):
+    if is_pd_sparse_dtype(getattr(data, "dtype", data)):
        data = data.to_dense()  # type: ignore
    assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1
    _meta_from_numpy(data, name, dtype, handle)
@@ -533,13 +561,11 @@ def _from_pandas_series(
    feature_names: Optional[FeatureNames],
    feature_types: Optional[FeatureTypes],
 ) -> DispatchedDataBackendReturnType:
-    from pandas.api.types import is_categorical_dtype
-
    if (data.dtype.name not in _pandas_dtype_mapper) and not (
-        is_categorical_dtype(data.dtype) and enable_categorical
+        is_pd_cat_dtype(data.dtype) and enable_categorical
    ):
        _invalid_dataframe_dtype(data)
-    if enable_categorical and is_categorical_dtype(data.dtype):
+    if enable_categorical and is_pd_cat_dtype(data.dtype):
        data = data.cat.codes
    return _from_numpy_array(
        data.values.reshape(data.shape[0], 1).astype("float"),
@@ -1045,8 +1071,10 @@ def dispatch_data_backend(
        return _from_scipy_csr(
            data.tocsr(), missing, threads, feature_names, feature_types
        )
-    if _is_numpy_array(data):
-        return _from_numpy_array(data, missing, threads, feature_names, feature_types)
+    if _is_np_array_like(data):
+        return _from_numpy_array(
+            data, missing, threads, feature_names, feature_types, data_split_mode
+        )
    if _is_uri(data):
        return _from_uri(data, missing, feature_names, feature_types, data_split_mode)
    if _is_list(data):
@@ -1186,7 +1214,7 @@ def dispatch_meta_backend(
    if _is_tuple(data):
        _meta_from_tuple(data, name, dtype, handle)
        return
-    if _is_numpy_array(data):
+    if _is_np_array_like(data):
        _meta_from_numpy(data, name, dtype, handle)
        return
    if _is_pandas_df(data):
@@ -1261,12 +1289,7 @@ def _proxy_transform(
    feature_names: Optional[FeatureNames],
    feature_types: Optional[FeatureTypes],
    enable_categorical: bool,
-) -> Tuple[
-    Union[bool, ctypes.c_void_p, np.ndarray],
-    Optional[list],
-    Optional[FeatureNames],
-    Optional[FeatureTypes],
-]:
+) -> TransformedData:
    if _is_cudf_df(data) or _is_cudf_ser(data):
        return _transform_cudf_df(
            data, feature_names, feature_types, enable_categorical
@@ -1278,7 +1301,7 @@ def _proxy_transform(
        return _transform_dlpack(data), None, feature_names, feature_types
    if _is_list(data) or _is_tuple(data):
        data = np.array(data)
-    if _is_numpy_array(data):
+    if _is_np_array_like(data):
        data, _ = _ensure_np_dtype(data, data.dtype)
        return data, None, feature_names, feature_types
    if _is_scipy_csr(data):
@@ -1328,7 +1351,7 @@ def dispatch_proxy_set_data(
    if not allow_host:
        raise err

-    if _is_numpy_array(data):
+    if _is_np_array_like(data):
        _check_data_shape(data)
        proxy._set_data_from_array(data)  # pylint: disable=W0212
        return
--- a/python-package/xgboost/libpath.py
+++ b/python-package/xgboost/libpath.py
@@ -27,20 +27,19 @@ def find_lib_path() -> List[str]:
        os.path.join(curr_path, os.path.pardir, os.path.pardir, "lib"),
        # use libxgboost from a system prefix, if available.  This should be the last
        # option.
-        os.path.join(sys.prefix, "lib"),
+        os.path.join(sys.base_prefix, "lib"),
    ]

    if sys.platform == "win32":
-        if platform.architecture()[0] == "64bit":
-            dll_path.append(os.path.join(curr_path, "../../windows/x64/Release/"))
-            # hack for pip installation when copy all parent source
-            # directory here
-            dll_path.append(os.path.join(curr_path, "./windows/x64/Release/"))
-        else:
-            dll_path.append(os.path.join(curr_path, "../../windows/Release/"))
-            # hack for pip installation when copy all parent source
-            # directory here
-            dll_path.append(os.path.join(curr_path, "./windows/Release/"))
+        # On Windows, Conda may install libs in different paths
+        dll_path.extend(
+            [
+                os.path.join(sys.base_prefix, "bin"),
+                os.path.join(sys.base_prefix, "Library"),
+                os.path.join(sys.base_prefix, "Library", "bin"),
+                os.path.join(sys.base_prefix, "Library", "lib"),
+            ]
+        )
        dll_path = [os.path.join(p, "xgboost.dll") for p in dll_path]
    elif sys.platform.startswith(("linux", "freebsd", "emscripten")):
        dll_path = [os.path.join(p, "libxgboost.so") for p in dll_path]
@@ -62,8 +61,8 @@ def find_lib_path() -> List[str]:
            + ("\n- ".join(dll_path))
            + "\nXGBoost Python package path: "
            + curr_path
-            + "\nsys.prefix: "
-            + sys.prefix
+            + "\nsys.base_prefix: "
+            + sys.base_prefix
            + "\nSee: "
            + link
            + " for installing XGBoost."
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -76,6 +76,10 @@ def _check_rf_callback(
        )


+def _can_use_qdm(tree_method: Optional[str]) -> bool:
+    return tree_method is None or tree_method in ("auto", "hist", "gpu_hist")
+
+
 SklObjective = Optional[
    Union[str, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]]
 ]
@@ -226,10 +230,10 @@ __model_doc = f"""
    subsample : Optional[float]
        Subsample ratio of the training instance.
    sampling_method :
-        Sampling method. Used only by `gpu_hist` tree method.
-          - `uniform`: select random training instances uniformly.
-          - `gradient_based` select random training instances with higher probability when
-            the gradient and hessian are larger. (cf. CatBoost)
+        Sampling method. Used only by the GPU version of ``hist`` tree method.
+          - ``uniform``: select random training instances uniformly.
+          - ``gradient_based``: select random training instances with higher probability
+            when the gradient and hessian are larger. (cf. CatBoost)
    colsample_bytree : Optional[float]
        Subsample ratio of columns when constructing each tree.
    colsample_bylevel : Optional[float]
@@ -273,13 +277,16 @@ __model_doc = f"""
        * For linear model, only "weight" is defined and it's the normalized coefficients
          without bias.

-    gpu_id : Optional[int]
-        Device ordinal.
+    device : Optional[str]
+
+        .. versionadded:: 2.0.0
+
+        Device ordinal, available options are `cpu`, `cuda`, and `gpu`.
+
    validate_parameters : Optional[bool]
+
        Give warnings for unknown parameter.
-    predictor : Optional[str]
-        Force XGBoost to use specific predictor, available choices are [cpu_predictor,
-        gpu_predictor].
+
    enable_categorical : bool

        .. versionadded:: 1.5.0
@@ -381,17 +388,21 @@ __model_doc = f"""
          every **early_stopping_rounds** round(s) to continue training.  Requires at
          least one item in **eval_set** in :py:meth:`fit`.

-        - The method returns the model from the last iteration, not the best one, use a
-          callback :py:class:`xgboost.callback.EarlyStopping` if returning the best
-          model is preferred.
+        - If early stopping occurs, the model will have two additional attributes:
+          :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the
+          :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal
+          number of trees during inference. If users want to access the full model
+          (including trees built after early stopping), they can specify the
+          `iteration_range` in these inference methods. In addition, other utilities
+          like model plotting can also use the entire model.
+
+        - If you prefer to discard the trees after `best_iteration`, consider using the
+          callback function :py:class:`xgboost.callback.EarlyStopping`.

        - If there's more than one item in **eval_set**, the last entry will be used for
          early stopping.  If there's more than one metric in **eval_metric**, the last
          metric will be used for early stopping.

-        - If early stopping occurs, the model will have three additional fields:
-          :py:attr:`best_score`, :py:attr:`best_iteration`.
-
        .. note::

            This parameter replaces `early_stopping_rounds` in :py:meth:`fit` method.
@@ -646,9 +657,8 @@ class XGBModel(XGBModelBase):
        monotone_constraints: Optional[Union[Dict[str, int], str]] = None,
        interaction_constraints: Optional[Union[str, Sequence[Sequence[str]]]] = None,
        importance_type: Optional[str] = None,
-        gpu_id: Optional[int] = None,
+        device: Optional[str] = None,
        validate_parameters: Optional[bool] = None,
-        predictor: Optional[str] = None,
        enable_categorical: bool = False,
        feature_types: Optional[FeatureTypes] = None,
        max_cat_to_onehot: Optional[int] = None,
@@ -693,9 +703,8 @@ class XGBModel(XGBModelBase):
        self.monotone_constraints = monotone_constraints
        self.interaction_constraints = interaction_constraints
        self.importance_type = importance_type
-        self.gpu_id = gpu_id
+        self.device = device
        self.validate_parameters = validate_parameters
-        self.predictor = predictor
        self.enable_categorical = enable_categorical
        self.feature_types = feature_types
        self.max_cat_to_onehot = max_cat_to_onehot
@@ -931,8 +940,7 @@ class XGBModel(XGBModelBase):
        callbacks = self.callbacks if self.callbacks is not None else callbacks

        tree_method = params.get("tree_method", None)
-        cat_support = {"gpu_hist", "approx", "hist"}
-        if self.enable_categorical and tree_method not in cat_support:
+        if self.enable_categorical and tree_method == "exact":
            raise ValueError(
                "Experimental support for categorical data is not implemented for"
                " current tree method yet."
@@ -941,7 +949,7 @@ class XGBModel(XGBModelBase):

    def _create_dmatrix(self, ref: Optional[DMatrix], **kwargs: Any) -> DMatrix:
        # Use `QuantileDMatrix` to save memory.
-        if self.tree_method in ("hist", "gpu_hist"):
+        if _can_use_qdm(self.tree_method) and self.booster != "gblinear":
            try:
                return QuantileDMatrix(
                    **kwargs, ref=ref, nthread=self.n_jobs, max_bin=self.max_bin
@@ -984,12 +992,12 @@ class XGBModel(XGBModelBase):
        X :
            Feature matrix. See :ref:`py-data` for a list of supported types.

-            When the ``tree_method`` is set to ``hist`` or ``gpu_hist``, internally, the
+            When the ``tree_method`` is set to ``hist``, internally, the
            :py:class:`QuantileDMatrix` will be used instead of the :py:class:`DMatrix`
            for conserving memory. However, this has performance implications when the
            device of input data is not matched with algorithm. For instance, if the
-            input is a numpy array on CPU but ``gpu_hist`` is used for training, then
-            the data is first processed on CPU then transferred to GPU.
+            input is a numpy array on CPU but ``cuda`` is used for training, then the
+            data is first processed on CPU then transferred to GPU.
        y :
            Labels
        sample_weight :
@@ -1002,13 +1010,17 @@ class XGBModel(XGBModelBase):
            Validation metrics will help us track the performance of the model.

        eval_metric : str, list of str, or callable, optional
+
            .. deprecated:: 1.6.0
-                Use `eval_metric` in :py:meth:`__init__` or :py:meth:`set_params` instead.
+
+            Use `eval_metric` in :py:meth:`__init__` or :py:meth:`set_params` instead.

        early_stopping_rounds : int
+
            .. deprecated:: 1.6.0
-                Use `early_stopping_rounds` in :py:meth:`__init__` or
-                :py:meth:`set_params` instead.
+
+            Use `early_stopping_rounds` in :py:meth:`__init__` or :py:meth:`set_params`
+            instead.
        verbose :
            If `verbose` is True and an evaluation set is used, the evaluation metric
            measured on the validation set is printed to stdout at each boosting stage.
@@ -1089,12 +1101,7 @@ class XGBModel(XGBModelBase):
            return self

    def _can_use_inplace_predict(self) -> bool:
-        # When predictor is explicitly set, using `inplace_predict` might result into
-        # error with incompatible data type.
-        # Inplace predict doesn't handle as many data types as DMatrix, but it's
-        # sufficient for dask interface where input is simpiler.
-        predictor = self.get_xgb_params().get("predictor", None)
-        if predictor in ("auto", None) and self.booster != "gblinear":
+        if self.booster != "gblinear":
            return True
        return False

@@ -1120,9 +1127,9 @@ class XGBModel(XGBModelBase):
        iteration_range: Optional[Tuple[int, int]] = None,
    ) -> ArrayLike:
        """Predict with `X`.  If the model is trained with early stopping, then
-        :py:attr:`best_iteration` is used automatically.  For tree models, when data is
-        on GPU, like cupy array or cuDF dataframe and `predictor` is not specified, the
-        prediction is run on GPU automatically, otherwise it will run on CPU.
+        :py:attr:`best_iteration` is used automatically. The estimator uses
+        `inplace_predict` by default and falls back to using :py:class:`DMatrix` if
+        devices between the data and the estimator don't match.

        .. note:: This function is only thread safe for `gbtree` and `dart`.

@@ -1272,19 +1279,10 @@ class XGBModel(XGBModelBase):
            )
        return np.array(feature_names)

-    def _early_stopping_attr(self, attr: str) -> Union[float, int]:
-        booster = self.get_booster()
-        try:
-            return getattr(booster, attr)
-        except AttributeError as e:
-            raise AttributeError(
-                f"`{attr}` in only defined when early stopping is used."
-            ) from e
-
    @property
    def best_score(self) -> float:
        """The best score obtained by early stopping."""
-        return float(self._early_stopping_attr("best_score"))
+        return self.get_booster().best_score

    @property
    def best_iteration(self) -> int:
@@ -1292,7 +1290,7 @@ class XGBModel(XGBModelBase):
        for instance if the best iteration is the first round, then best_iteration is 0.

        """
-        return int(self._early_stopping_attr("best_iteration"))
+        return self.get_booster().best_iteration

    @property
    def feature_importances_(self) -> np.ndarray:
@@ -1584,7 +1582,9 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
    ) -> np.ndarray:
        """Predict the probability of each `X` example being of a given class. If the
        model is trained with early stopping, then :py:attr:`best_iteration` is used
-        automatically.
+        automatically. The estimator uses `inplace_predict` by default and falls back to
+        using :py:class:`DMatrix` if devices between the data and the estimator don't
+        match.

        .. note:: This function is only thread safe for `gbtree` and `dart`.

@@ -1917,12 +1917,12 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
            | 1   | :math:`x_{20}` | :math:`x_{21}` |
            +-----+----------------+----------------+

-            When the ``tree_method`` is set to ``hist`` or ``gpu_hist``, internally, the
+            When the ``tree_method`` is set to ``hist``, internally, the
            :py:class:`QuantileDMatrix` will be used instead of the :py:class:`DMatrix`
            for conserving memory. However, this has performance implications when the
            device of input data is not matched with algorithm. For instance, if the
-            input is a numpy array on CPU but ``gpu_hist`` is used for training, then
-            the data is first processed on CPU then transferred to GPU.
+            input is a numpy array on CPU but ``cuda`` is used for training, then the
+            data is first processed on CPU then transferred to GPU.
        y :
            Labels
        group :
--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@@ -1,4 +1,4 @@
-"""Xgboost pyspark integration submodule for core code."""
+"""XGBoost pyspark integration submodule for core code."""
 import base64

 # pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name
@@ -22,7 +22,7 @@ from typing import (

 import numpy as np
 import pandas as pd
-from pyspark import SparkContext, cloudpickle
+from pyspark import RDD, SparkContext, cloudpickle
 from pyspark.ml import Estimator, Model
 from pyspark.ml.functions import array_to_vector, vector_to_array
 from pyspark.ml.linalg import VectorUDT
@@ -44,6 +44,7 @@ from pyspark.ml.util import (
    MLWritable,
    MLWriter,
 )
+from pyspark.resource import ResourceProfileBuilder, TaskResourceRequests
 from pyspark.sql import Column, DataFrame
 from pyspark.sql.functions import col, countDistinct, pandas_udf, rand, struct
 from pyspark.sql.types import (
@@ -59,11 +60,12 @@ from scipy.special import expit, softmax  # pylint: disable=no-name-in-module

 import xgboost
 from xgboost import XGBClassifier
-from xgboost.compat import is_cudf_available
-from xgboost.core import Booster
-from xgboost.sklearn import DEFAULT_N_ESTIMATORS, XGBModel
+from xgboost.compat import is_cudf_available, is_cupy_available
+from xgboost.core import Booster, _check_distributed_params
+from xgboost.sklearn import DEFAULT_N_ESTIMATORS, XGBModel, _can_use_qdm
 from xgboost.training import train as worker_train

+from .._typing import ArrayLike
 from .data import (
    _read_csr_matrix_from_unwrapped_spark_vec,
    alias,
@@ -87,11 +89,13 @@ from .utils import (
    _get_rabit_args,
    _get_spark_session,
    _is_local,
+    _is_standalone_or_localcluster,
    deserialize_booster,
    deserialize_xgb_model,
    get_class_name,
    get_logger,
    serialize_booster,
+    use_cuda,
 )

 # Put pyspark specific params here, they won't be passed to XGBoost.
@@ -108,13 +112,13 @@ _pyspark_specific_params = [
    "arbitrary_params_dict",
    "force_repartition",
    "num_workers",
-    "use_gpu",
    "feature_names",
    "features_cols",
    "enable_sparse_data_optim",
    "qid_col",
    "repartition_random_shuffle",
    "pred_contrib_col",
+    "use_gpu",
 ]

 _non_booster_params = ["missing", "n_estimators", "feature_types", "feature_weights"]
@@ -132,7 +136,7 @@ _pyspark_param_alias_map = {
 _inverse_pyspark_param_alias_map = {v: k for k, v in _pyspark_param_alias_map.items()}

 _unsupported_xgb_params = [
-    "gpu_id",  # we have "use_gpu" pyspark param instead.
+    "gpu_id",  # we have "device" pyspark param instead.
    "enable_categorical",  # Use feature_types param to specify categorical feature instead
    "use_label_encoder",
    "n_jobs",  # Do not allow user to set it, will use `spark.task.cpus` value instead.
@@ -197,11 +201,24 @@ class _SparkXGBParams(
        "The number of XGBoost workers. Each XGBoost worker corresponds to one spark task.",
        TypeConverters.toInt,
    )
+    device = Param(
+        Params._dummy(),
+        "device",
+        (
+            "The device type for XGBoost executors. Available options are `cpu`,`cuda`"
+            " and `gpu`. Set `device` to `cuda` or `gpu` if the executors are running "
+            "on GPU instances. Currently, only one GPU per task is supported."
+        ),
+        TypeConverters.toString,
+    )
    use_gpu = Param(
        Params._dummy(),
        "use_gpu",
-        "A boolean variable. Set use_gpu=true if the executors "
-        + "are running on GPU instances. Currently, only one GPU per task is supported.",
+        (
+            "Deprecated, use `device` instead. A boolean variable. Set use_gpu=true "
+            "if the executors are running on GPU instances. Currently, only one GPU per"
+            " task is supported."
+        ),
        TypeConverters.toBoolean,
    )
    force_repartition = Param(
@@ -227,6 +244,13 @@ class _SparkXGBParams(
        TypeConverters.toList,
    )

+    def set_device(self, value: str) -> "_SparkXGBParams":
+        """Set the ``device`` param. Accepted values: cpu, cuda, gpu."""
+        _check_distributed_params({"device": value})
+        assert value in ("cpu", "cuda", "gpu")
+        self.set(self.device, value)
+        return self
+
    @classmethod
    def _xgb_cls(cls) -> Type[XGBModel]:
        """
@@ -320,6 +344,54 @@ class _SparkXGBParams(
                predict_params[param.name] = self.getOrDefault(param)
        return predict_params

+    def _validate_gpu_params(self) -> None:
+        """Validate the GPU parameters against the Spark GPU configurations.
+
+        No-op for CPU training; raises ValueError on a missing/fractional GPU conf.
+        """
+        if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu):
+            ss = _get_spark_session()
+            sc = ss.sparkContext
+            if _is_local(sc):
+                # GPU training in Spark local mode is only supported for debugging,
+                # so warn instead of checking the real GPU count and raising.
+                get_logger(self.__class__.__name__).warning(
+                    "You have enabled GPU in spark local mode. Please make sure your"
+                    " local node has at least %d GPUs",
+                    self.getOrDefault(self.num_workers),
+                )
+            else:
+                executor_gpus = sc.getConf().get("spark.executor.resource.gpu.amount")
+                if executor_gpus is None:
+                    raise ValueError(
+                        "The `spark.executor.resource.gpu.amount` is required for training"
+                        " on GPU."
+                    )
+
+                # Compare (major, minor) as ints; as strings "3.10" sorts before "3.4".
+                spark_ver = tuple(int(p) for p in ss.version.split(".")[:2])
+                if not (spark_ver >= (3, 4) and _is_standalone_or_localcluster(sc)):
+                    # Without stage-level scheduling (spark 3.4.0+ on standalone or
+                    # local-cluster), spark.task.resource.gpu.amount must be set.
+                    gpu_per_task = sc.getConf().get("spark.task.resource.gpu.amount")
+                    if gpu_per_task is None:
+                        raise ValueError(
+                            "The `spark.task.resource.gpu.amount` is required for training"
+                            " on GPU."
+                        )
+                    if float(gpu_per_task) < 1.0:
+                        raise ValueError(
+                            "XGBoost doesn't support GPU fractional configurations. "
+                            "Please set `spark.task.resource.gpu.amount=spark.executor"
+                            ".resource.gpu.amount`"
+                        )
+                    if float(gpu_per_task) > 1.0:
+                        get_logger(self.__class__.__name__).warning(
+                            "%s GPUs for each Spark task is configured, but each "
+                            "XGBoost training task uses only 1 GPU.",
+                            gpu_per_task,
+                        )
+
    def _validate_params(self) -> None:
        # pylint: disable=too-many-branches
        init_model = self.getOrDefault("xgb_model")
@@ -335,10 +407,18 @@ class _SparkXGBParams(
                f"It cannot be less than 1 [Default is 1]"
            )

+        tree_method = self.getOrDefault(self.getParam("tree_method"))
+        if tree_method == "exact":
+            raise ValueError(
+                "The `exact` tree method is not supported for distributed systems."
+            )
+
        if self.getOrDefault(self.features_cols):
-            if not self.getOrDefault(self.use_gpu):
+            if not use_cuda(self.getOrDefault(self.device)) and not self.getOrDefault(
+                self.use_gpu
+            ):
                raise ValueError(
-                    "features_col param with list value requires enabling use_gpu."
+                    "features_col param with list value requires `device=cuda`."
                )

        if self.getOrDefault("objective") is not None:
@@ -391,57 +471,7 @@ class _SparkXGBParams(
                    "`pyspark.ml.linalg.Vector` type."
                )

-        if self.getOrDefault(self.use_gpu):
-            tree_method = self.getParam("tree_method")
-            if (
-                self.getOrDefault(tree_method) is not None
-                and self.getOrDefault(tree_method) != "gpu_hist"
-            ):
-                raise ValueError(
-                    f"tree_method should be 'gpu_hist' or None when use_gpu is True,"
-                    f"found {self.getOrDefault(tree_method)}."
-                )
-
-            gpu_per_task = (
-                _get_spark_session()
-                .sparkContext.getConf()
-                .get("spark.task.resource.gpu.amount")
-            )
-
-            is_local = _is_local(_get_spark_session().sparkContext)
-
-            if is_local:
-                # checking spark local mode.
-                if gpu_per_task:
-                    raise RuntimeError(
-                        "The spark cluster does not support gpu configuration for local mode. "
-                        "Please delete spark.executor.resource.gpu.amount and "
-                        "spark.task.resource.gpu.amount"
-                    )
-
-                # Support GPU training in Spark local mode is just for debugging purposes,
-                # so it's okay for printing the below warning instead of checking the real
-                # gpu numbers and raising the exception.
-                get_logger(self.__class__.__name__).warning(
-                    "You enabled use_gpu in spark local mode. Please make sure your local node "
-                    "has at least %d GPUs",
-                    self.getOrDefault(self.num_workers),
-                )
-            else:
-                # checking spark non-local mode.
-                if not gpu_per_task or int(gpu_per_task) < 1:
-                    raise RuntimeError(
-                        "The spark cluster does not have the necessary GPU"
-                        + "configuration for the spark task. Therefore, we cannot"
-                        + "run xgboost training using GPU."
-                    )
-
-                if int(gpu_per_task) > 1:
-                    get_logger(self.__class__.__name__).warning(
-                        "You configured %s GPU cores for each spark task, but in "
-                        "XGBoost training, every Spark task will only use one GPU core.",
-                        gpu_per_task,
-                    )
+        self._validate_gpu_params()


 def _validate_and_convert_feature_col_as_float_col_list(
@@ -557,6 +587,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
        #  they are added in `setParams`.
        self._setDefault(
            num_workers=1,
+            device="cpu",
            use_gpu=False,
            force_repartition=False,
            repartition_random_shuffle=False,
@@ -565,9 +596,9 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
            arbitrary_params_dict={},
        )

-    def setParams(
-        self, **kwargs: Dict[str, Any]
-    ) -> None:  # pylint: disable=invalid-name
+        self.logger = get_logger(self.__class__.__name__)
+
+    def setParams(self, **kwargs: Any) -> None:  # pylint: disable=invalid-name
        """
        Set params for the estimator.
        """
@@ -612,6 +643,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
                    )
                    raise ValueError(err_msg)
                _extra_params[k] = v
+
+        _check_distributed_params(kwargs)
        _existing_extra_params = self.getOrDefault(self.arbitrary_params_dict)
        self._set(arbitrary_params_dict={**_existing_extra_params, **_extra_params})

@@ -708,9 +741,6 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
        # TODO: support "num_parallel_tree" for random forest
        params["num_boost_round"] = self.getOrDefault("n_estimators")

-        if self.getOrDefault(self.use_gpu):
-            params["tree_method"] = "gpu_hist"
-
        return params

    @classmethod
@@ -870,6 +900,116 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):

        return booster_params, train_call_kwargs_params, dmatrix_kwargs

+    def _skip_stage_level_scheduling(self) -> bool:
+        # pylint: disable=too-many-return-statements
+        """Check whether stage-level scheduling should be skipped.
+        Return True to skip it, False when it can be enabled."""
+
+        if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu):
+            ss = _get_spark_session()
+            sc = ss.sparkContext
+
+            # Compare (major, minor) as ints; as strings "3.10" sorts before "3.4".
+            if tuple(int(p) for p in ss.version.split(".")[:2]) < (3, 4):
+                self.logger.info(
+                    "Stage-level scheduling in xgboost requires spark version 3.4.0+"
+                )
+                return True
+
+            if not _is_standalone_or_localcluster(sc):
+                self.logger.info(
+                    "Stage-level scheduling in xgboost requires spark standalone or "
+                    "local-cluster mode"
+                )
+                return True
+
+            executor_cores = sc.getConf().get("spark.executor.cores")
+            executor_gpus = sc.getConf().get("spark.executor.resource.gpu.amount")
+            if executor_cores is None or executor_gpus is None:
+                self.logger.info(
+                    "Stage-level scheduling in xgboost requires spark.executor.cores, "
+                    "spark.executor.resource.gpu.amount to be set."
+                )
+                return True
+
+            if int(executor_cores) == 1:
+                # there will be only 1 task running at any time.
+                self.logger.info(
+                    "Stage-level scheduling in xgboost requires spark.executor.cores > 1 "
+                )
+                return True
+
+            if int(executor_gpus) > 1:
+                # For spark.executor.resource.gpu.amount > 1, we assume the user knows
+                # how to configure Spark so that xgboost runs successfully.
+                self.logger.info(
+                    "Stage-level scheduling in xgboost will not work "
+                    "when spark.executor.resource.gpu.amount>1"
+                )
+                return True
+
+            task_gpu_amount = sc.getConf().get("spark.task.resource.gpu.amount")
+
+            if task_gpu_amount is None:
+                # The ETL tasks will not grab a gpu when spark.task.resource.gpu.amount
+                # is not set, but stage-level scheduling lets training tasks grab one.
+                return False
+
+            if float(task_gpu_amount) == float(executor_gpus):
+                # spark.executor.resource.gpu.amount=spark.task.resource.gpu.amount
+                # results in only 1 task running at a time, which may cause perf issue.
+                return True
+
+            # We can enable stage-level scheduling
+            return False
+
+        # CPU training doesn't require stage-level scheduling
+        return True
+
+    def _try_stage_level_scheduling(self, rdd: RDD) -> RDD:
+        """Attach a stage-level resource profile to ``rdd`` when applicable.
+
+        Returns ``rdd`` untouched if :py:meth:`_skip_stage_level_scheduling` says so.
+        """
+        if self._skip_stage_level_scheduling():
+            return rdd
+
+        session = _get_spark_session()
+
+        # The skip check above already ensured spark.executor.cores is set.
+        cores_conf = session.sparkContext.getConf().get("spark.executor.cores")
+        assert cores_conf is not None
+
+        # Spark-rapids is a project to leverage GPUs to accelerate spark SQL.
+        # If spark-rapids is enabled, to avoid GPU OOM, we don't allow other
+        # ETL gpu tasks running alongside training tasks.
+        plugins = session.conf.get("spark.plugins", " ")
+        assert plugins is not None
+        rapids_sql = session.conf.get("spark.rapids.sql.enabled", "true")
+        assert rapids_sql is not None
+        rapids_on = "com.nvidia.spark.SQLPlugin" in plugins and (
+            rapids_sql.lower() == "true"
+        )
+
+        # A task asking for more than half of the executor cores can't share an
+        # executor with another training task, so tasks land on different executors.
+        # With spark-rapids on, a task takes every core of the executor.
+        #
+        # Please note that we can't use GPU to limit the concurrent tasks because of
+        # https://issues.apache.org/jira/browse/SPARK-45527.
+        task_cores = int(cores_conf) if rapids_on else (int(cores_conf) // 2) + 1
+
+        task_gpus = 1.0
+        treqs = TaskResourceRequests().cpus(task_cores).resource("gpu", task_gpus)
+        rp = ResourceProfileBuilder().require(treqs).build
+
+        self.logger.info(
+            "XGBoost training tasks require the resource(cores=%s, gpu=%s).",
+            task_cores,
+            task_gpus,
+        )
+        return rdd.withResources(rp)
+
    def _fit(self, dataset: DataFrame) -> "_SparkXGBModel":
        # pylint: disable=too-many-statements, too-many-locals
        self._validate_params()
@@ -882,8 +1022,9 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
            dmatrix_kwargs,
        ) = self._get_xgb_parameters(dataset)

-        use_gpu = self.getOrDefault(self.use_gpu)
-
+        run_on_gpu = use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(
+            self.use_gpu
+        )
        is_local = _is_local(_get_spark_session().sparkContext)

        num_workers = self.getOrDefault(self.num_workers)
@@ -899,34 +1040,30 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):

            context = BarrierTaskContext.get()

-            gpu_id = None
-            use_hist = booster_params.get("tree_method", None) in ("hist", "gpu_hist")
+            dev_ordinal = None
+            use_qdm = _can_use_qdm(booster_params.get("tree_method", None))

-            if use_gpu:
-                gpu_id = context.partitionId() if is_local else _get_gpu_id(context)
-                booster_params["gpu_id"] = gpu_id
+            if run_on_gpu:
+                dev_ordinal = (
+                    context.partitionId() if is_local else _get_gpu_id(context)
+                )
+                booster_params["device"] = "cuda:" + str(dev_ordinal)
                # If cuDF is not installed, then using DMatrix instead of QDM,
                # because without cuDF, DMatrix performs better than QDM.
                # Note: Checking `is_cudf_available` in spark worker side because
                # spark worker might has different python environment with driver side.
-                use_qdm = use_hist and is_cudf_available()
-            else:
-                use_qdm = use_hist
+                use_qdm = use_qdm and is_cudf_available()
+                get_logger("XGBoost-PySpark").info(
+                    "Leveraging %s to train with QDM: %s",
+                    booster_params["device"],
+                    "on" if use_qdm else "off",
+                )

            if use_qdm and (booster_params.get("max_bin", None) is not None):
                dmatrix_kwargs["max_bin"] = booster_params["max_bin"]

            _rabit_args = {}
            if context.partitionId() == 0:
-                get_logger("XGBoostPySpark").debug(
-                    "booster params: %s\n"
-                    "train_call_kwargs_params: %s\n"
-                    "dmatrix_kwargs: %s",
-                    booster_params,
-                    train_call_kwargs_params,
-                    dmatrix_kwargs,
-                )
-
                _rabit_args = _get_rabit_args(context, num_workers)

            worker_message = {
@@ -945,7 +1082,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
                dtrain, dvalid = create_dmatrix_from_partitions(
                    pandas_df_iter,
                    feature_prop.features_cols_names,
-                    gpu_id,
+                    dev_ordinal,
                    use_qdm,
                    dmatrix_kwargs,
                    enable_sparse_data_optim=feature_prop.enable_sparse_data_optim,
@@ -973,17 +1110,31 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
                )

        def _run_job() -> Tuple[str, str]:
-            ret = (
+            rdd = (
                dataset.mapInPandas(
-                    _train_booster, schema="config string, booster string"  # type: ignore
+                    _train_booster,  # type: ignore
+                    schema="config string, booster string",
                )
                .rdd.barrier()
                .mapPartitions(lambda x: x)
-                .collect()[0]
            )
+            rdd_with_resource = self._try_stage_level_scheduling(rdd)
+            ret = rdd_with_resource.collect()[0]
            return ret[0], ret[1]

+        get_logger("XGBoost-PySpark").info(
+            "Running xgboost-%s on %s workers with"
+            "\n\tbooster params: %s"
+            "\n\ttrain_call_kwargs_params: %s"
+            "\n\tdmatrix_kwargs: %s",
+            xgboost._py_version(),
+            num_workers,
+            booster_params,
+            train_call_kwargs_params,
+            dmatrix_kwargs,
+        )
        (config, booster) = _run_job()
+        get_logger("XGBoost-PySpark").info("Finished xgboost training!")

        result_xgb_model = self._convert_to_sklearn_model(
            bytearray(booster, "utf-8"), config
@@ -1092,12 +1243,111 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
            )
        return features_col, feature_col_names

+    def _get_pred_contrib_col_name(self) -> Optional[str]:
+        """Return the configured ``pred_contrib_col`` name.
+
+        Returns None when the param is undefined or set to an empty string.
+        """
+        if not self.isDefined(self.pred_contrib_col):
+            return None
+        # An empty string counts as "not configured".
+        name = self.getOrDefault(self.pred_contrib_col)
+        return name if name != "" else None
+
+    def _out_schema(self) -> Tuple[bool, str]:
+        """Return ``(single_pred, schema)`` for the prediction UDF: whether it yields
+        a single prediction column, and its return type as a DDL-formatted string."""
+        with_contribs = self._get_pred_contrib_col_name() is not None
+        if not with_contribs:
+            return True, "double"
+
+        # Contributions requested: the schema carries both fields.
+        return False, f"{pred.prediction} double, {pred.pred_contrib} array<double>"
+
+    def _get_predict_func(self) -> Callable:
+        """Build the prediction closure that is executed on the executor side."""
+        predict_kwargs = self._gen_predict_params_dict()
+        contrib_col = self._get_pred_contrib_col_name()
+
+        def _predict(
+            model: XGBModel, X: ArrayLike, base_margin: Optional[ArrayLike]
+        ) -> Union[pd.DataFrame, pd.Series]:
+            prediction = pd.Series(
+                model.predict(
+                    X,
+                    base_margin=base_margin,
+                    validate_features=False,
+                    **predict_kwargs,
+                )
+            )
+            if contrib_col is None:
+                return prediction
+
+            # Contributions requested: return both as a two-column frame.
+            contribs = pd.Series(list(pred_contribs(model, X, base_margin)))
+            return pd.DataFrame(
+                data={pred.prediction: prediction, pred.pred_contrib: contribs}
+            )
+
+        return _predict
+
+    def _post_transform(self, dataset: DataFrame, pred_col: Column) -> DataFrame:
+        """Attach the output columns derived from ``pred_col`` to ``dataset``."""
+        out_col = self.getOrDefault(self.predictionCol)
+        is_single, _ = self._out_schema()
+
+        if is_single:
+            # ``pred_col`` already is the prediction; add it only if a name is set.
+            return dataset.withColumn(out_col, pred_col) if out_col else dataset
+
+        # Otherwise ``pred_col`` is a struct: stage it in a temporary column,
+        # pull the requested fields out of it, then drop it again.
+        tmp_col = "_prediction_struct"
+        dataset = dataset.withColumn(tmp_col, pred_col)
+
+        if out_col:
+            dataset = dataset.withColumn(
+                out_col, getattr(col(tmp_col), pred.prediction)
+            )
+
+        # The contributions are exposed as a vector column.
+        contrib_col = self._get_pred_contrib_col_name()
+        if contrib_col is not None:
+            contrib_vec = array_to_vector(getattr(col(tmp_col), pred.pred_contrib))
+            dataset = dataset.withColumn(contrib_col, contrib_vec)
+
+        return dataset.drop(tmp_col)
+
+    def _gpu_transform(self) -> bool:
+        """Return True if the prediction should run on GPU, False for CPU."""
+        # NOTE(review): only `device` is consulted here, not the deprecated `use_gpu`
+        # flag that the training path also honours -- confirm this is intended.
+        sc = _get_spark_session().sparkContext
+        if _is_local(sc):
+            # In local mode, just follow the "device" param.
+            return use_cuda(self.getOrDefault(self.device))
+
+        # Outside local mode the task-level GPU conf is checked as well.
+        task_gpu_conf = sc.getConf().get("spark.task.resource.gpu.amount")
+        wants_gpu = use_cuda(self.getOrDefault(self.device))
+
+        if task_gpu_conf is not None:
+            # GPU configurations are set: follow the "device" param.
+            return wants_gpu
+
+        # No GPU configurations set: predict on CPU, warning if GPU was requested.
+        if wants_gpu:
+            get_logger("XGBoost-PySpark").warning(
+                "Do the prediction on the CPUs since "
+                "no gpu configurations are set"
+            )
+        return False
+
    def _transform(self, dataset: DataFrame) -> DataFrame:
        # pylint: disable=too-many-statements, too-many-locals
        # Save xgb_sklearn_model and predict_params to be local variable
        # to avoid the `self` object to be pickled to remote.
        xgb_sklearn_model = self._xgb_sklearn_model
-        predict_params = self._gen_predict_params_dict()

        has_base_margin = False
        if (
@@ -1112,79 +1362,92 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
        features_col, feature_col_names = self._get_feature_col(dataset)
        enable_sparse_data_optim = self.getOrDefault(self.enable_sparse_data_optim)

-        pred_contrib_col_name = None
-        if (
-            self.isDefined(self.pred_contrib_col)
-            and self.getOrDefault(self.pred_contrib_col) != ""
-        ):
-            pred_contrib_col_name = self.getOrDefault(self.pred_contrib_col)
+        predict_func = self._get_predict_func()

-        single_pred = True
-        schema = "double"
-        if pred_contrib_col_name:
-            single_pred = False
-            schema = f"{pred.prediction} double, {pred.pred_contrib} array<double>"
+        _, schema = self._out_schema()
+
+        is_local = _is_local(_get_spark_session().sparkContext)
+        run_on_gpu = self._gpu_transform()

        @pandas_udf(schema)  # type: ignore
        def predict_udf(iterator: Iterator[pd.DataFrame]) -> Iterator[pd.Series]:
            assert xgb_sklearn_model is not None
            model = xgb_sklearn_model
+
+            from pyspark import TaskContext
+
+            context = TaskContext.get()
+            assert context is not None
+
+            dev_ordinal = -1
+
+            if is_cudf_available():
+                if is_local:
+                    if run_on_gpu and is_cupy_available():
+                        import cupy as cp  # pylint: disable=import-error
+
+                        total_gpus = cp.cuda.runtime.getDeviceCount()
+                        if total_gpus > 0:
+                            partition_id = context.partitionId()
+                            # For transform local mode, default the dev_ordinal to
+                            # (partition id) % gpus.
+                            dev_ordinal = partition_id % total_gpus
+                elif run_on_gpu:
+                    dev_ordinal = _get_gpu_id(context)
+
+                if dev_ordinal >= 0:
+                    device = "cuda:" + str(dev_ordinal)
+                    get_logger("XGBoost-PySpark").info(
+                        "Do the inference with device: %s", device
+                    )
+                    model.set_params(device=device)
+                else:
+                    get_logger("XGBoost-PySpark").info("Do the inference on the CPUs")
+            else:
+                msg = (
+                    "CUDF is unavailable, fallback the inference on the CPUs"
+                    if run_on_gpu
+                    else "Do the inference on the CPUs"
+                )
+                get_logger("XGBoost-PySpark").info(msg)
+
+            def to_gpu_if_possible(data: ArrayLike) -> ArrayLike:
+                """Move the data to gpu if possible"""
+                if dev_ordinal >= 0:
+                    import cudf  # pylint: disable=import-error
+                    import cupy as cp  # pylint: disable=import-error
+
+                    # We must set the device after import cudf, which will change the device id to 0
+                    # See https://github.com/rapidsai/cudf/issues/11386
+                    cp.cuda.runtime.setDevice(dev_ordinal)  # pylint: disable=I1101
+                    df = cudf.DataFrame(data)
+                    del data
+                    return df
+                return data
+
            for data in iterator:
                if enable_sparse_data_optim:
                    X = _read_csr_matrix_from_unwrapped_spark_vec(data)
                else:
                    if feature_col_names is not None:
-                        X = data[feature_col_names]
+                        tmp = data[feature_col_names]
                    else:
-                        X = stack_series(data[alias.data])
+                        tmp = stack_series(data[alias.data])
+                    X = to_gpu_if_possible(tmp)

                if has_base_margin:
-                    base_margin = data[alias.margin].to_numpy()
+                    base_margin = to_gpu_if_possible(data[alias.margin])
                else:
                    base_margin = None

-                data = {}
-                preds = model.predict(
-                    X,
-                    base_margin=base_margin,
-                    validate_features=False,
-                    **predict_params,
-                )
-                data[pred.prediction] = pd.Series(preds)
-
-                if pred_contrib_col_name:
-                    contribs = pred_contribs(model, X, base_margin)
-                    data[pred.pred_contrib] = pd.Series(list(contribs))
-                    yield pd.DataFrame(data=data)
-                else:
-                    yield data[pred.prediction]
+                yield predict_func(model, X, base_margin)

        if has_base_margin:
            pred_col = predict_udf(struct(*features_col, base_margin_col))
        else:
            pred_col = predict_udf(struct(*features_col))

-        prediction_col_name = self.getOrDefault(self.predictionCol)
-
-        if single_pred:
-            dataset = dataset.withColumn(prediction_col_name, pred_col)
-        else:
-            pred_struct_col = "_prediction_struct"
-            dataset = dataset.withColumn(pred_struct_col, pred_col)
-
-            dataset = dataset.withColumn(
-                prediction_col_name, getattr(col(pred_struct_col), pred.prediction)
-            )
-
-            if pred_contrib_col_name:
-                dataset = dataset.withColumn(
-                    pred_contrib_col_name,
-                    array_to_vector(getattr(col(pred_struct_col), pred.pred_contrib)),
-                )
-
-            dataset = dataset.drop(pred_struct_col)
-
-        return dataset
+        return self._post_transform(dataset, pred_col)


 class _ClassificationModel(  # pylint: disable=abstract-method
@@ -1196,22 +1459,21 @@ class _ClassificationModel(  # pylint: disable=abstract-method
    .. Note:: This API is experimental.
    """

-    def _transform(self, dataset: DataFrame) -> DataFrame:
-        # pylint: disable=too-many-statements, too-many-locals
-        # Save xgb_sklearn_model and predict_params to be local variable
-        # to avoid the `self` object to be pickled to remote.
-        xgb_sklearn_model = self._xgb_sklearn_model
-        predict_params = self._gen_predict_params_dict()
+    def _out_schema(self) -> Tuple[bool, str]:
+        schema = (
+            f"{pred.raw_prediction} array<double>, {pred.prediction} double,"
+            f" {pred.probability} array<double>"
+        )
+        if self._get_pred_contrib_col_name() is not None:
+            # strict_shape is forced to True when predicting contribs, so that result
+            # is 3-D and each row of it is an array<array<double>>.
+            schema = f"{schema}, {pred.pred_contrib} array<array<double>>"

-        has_base_margin = False
-        if (
-            self.isDefined(self.base_margin_col)
-            and self.getOrDefault(self.base_margin_col) != ""
-        ):
-            has_base_margin = True
-            base_margin_col = col(self.getOrDefault(self.base_margin_col)).alias(
-                alias.margin
-            )
+        return False, schema
+
+    def _get_predict_func(self) -> Callable:
+        predict_params = self._gen_predict_params_dict()
+        pred_contrib_col_name = self._get_pred_contrib_col_name()

        def transform_margin(margins: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
            if margins.ndim == 1:
@@ -1226,76 +1488,38 @@ class _ClassificationModel(  # pylint: disable=abstract-method
                class_probs = softmax(raw_preds, axis=1)
            return raw_preds, class_probs

-        features_col, feature_col_names = self._get_feature_col(dataset)
-        enable_sparse_data_optim = self.getOrDefault(self.enable_sparse_data_optim)
+        def _predict(
+            model: XGBModel, X: ArrayLike, base_margin: Optional[np.ndarray]
+        ) -> Union[pd.DataFrame, pd.Series]:
+            margins = model.predict(
+                X,
+                base_margin=base_margin,
+                output_margin=True,
+                validate_features=False,
+                **predict_params,
+            )
+            raw_preds, class_probs = transform_margin(margins)

-        pred_contrib_col_name = None
-        if (
-            self.isDefined(self.pred_contrib_col)
-            and self.getOrDefault(self.pred_contrib_col) != ""
-        ):
-            pred_contrib_col_name = self.getOrDefault(self.pred_contrib_col)
+            # The Scala implementation appears to take the argmax of the class
+            # probabilities, not of the margins, as the prediction; do the same here.
+            preds = np.argmax(class_probs, axis=1)
+            result: Dict[str, pd.Series] = {
+                pred.raw_prediction: pd.Series(list(raw_preds)),
+                pred.prediction: pd.Series(preds),
+                pred.probability: pd.Series(list(class_probs)),
+            }

-        schema = (
-            f"{pred.raw_prediction} array<double>, {pred.prediction} double,"
-            f" {pred.probability} array<double>"
-        )
-        if pred_contrib_col_name:
-            # We will force setting strict_shape to True when predicting contribs,
-            # So, it will also output 3-D shape result.
-            schema = f"{schema}, {pred.pred_contrib} array<array<double>>"
+            if pred_contrib_col_name is not None:
+                contribs = pred_contribs(model, X, base_margin, strict_shape=True)
+                result[pred.pred_contrib] = pd.Series(list(contribs.tolist()))

-        @pandas_udf(schema)  # type: ignore
-        def predict_udf(
-            iterator: Iterator[Tuple[pd.Series, ...]]
-        ) -> Iterator[pd.DataFrame]:
-            assert xgb_sklearn_model is not None
-            model = xgb_sklearn_model
-            for data in iterator:
-                if enable_sparse_data_optim:
-                    X = _read_csr_matrix_from_unwrapped_spark_vec(data)
-                else:
-                    if feature_col_names is not None:
-                        X = data[feature_col_names]  # type: ignore
-                    else:
-                        X = stack_series(data[alias.data])
+            return pd.DataFrame(data=result)

-                if has_base_margin:
-                    base_margin = stack_series(data[alias.margin])
-                else:
-                    base_margin = None
-
-                margins = model.predict(
-                    X,
-                    base_margin=base_margin,
-                    output_margin=True,
-                    validate_features=False,
-                    **predict_params,
-                )
-                raw_preds, class_probs = transform_margin(margins)
-
-                # It seems that they use argmax of class probs,
-                # not of margin to get the prediction (Note: scala implementation)
-                preds = np.argmax(class_probs, axis=1)
-                result: Dict[str, pd.Series] = {
-                    pred.raw_prediction: pd.Series(list(raw_preds)),
-                    pred.prediction: pd.Series(preds),
-                    pred.probability: pd.Series(list(class_probs)),
-                }
-
-                if pred_contrib_col_name:
-                    contribs = pred_contribs(model, X, base_margin, strict_shape=True)
-                    result[pred.pred_contrib] = pd.Series(list(contribs.tolist()))
-
-                yield pd.DataFrame(data=result)
-
-        if has_base_margin:
-            pred_struct = predict_udf(struct(*features_col, base_margin_col))
-        else:
-            pred_struct = predict_udf(struct(*features_col))
+        return _predict

+    def _post_transform(self, dataset: DataFrame, pred_col: Column) -> DataFrame:
        pred_struct_col = "_prediction_struct"
-        dataset = dataset.withColumn(pred_struct_col, pred_struct)
+        dataset = dataset.withColumn(pred_struct_col, pred_col)

        raw_prediction_col_name = self.getOrDefault(self.rawPredictionCol)
        if raw_prediction_col_name:
@@ -1317,7 +1541,8 @@ class _ClassificationModel(  # pylint: disable=abstract-method
                array_to_vector(getattr(col(pred_struct_col), pred.probability)),
            )

-        if pred_contrib_col_name:
+        pred_contrib_col_name = self._get_pred_contrib_col_name()
+        if pred_contrib_col_name is not None:
            dataset = dataset.withColumn(
                pred_contrib_col_name,
                getattr(col(pred_struct_col), pred.pred_contrib),
--- a/python-package/xgboost/spark/data.py
+++ b/python-package/xgboost/spark/data.py
@@ -157,7 +157,7 @@ def _read_csr_matrix_from_unwrapped_spark_vec(part: pd.DataFrame) -> csr_matrix:

 def make_qdm(
    data: Dict[str, List[np.ndarray]],
-    gpu_id: Optional[int],
+    dev_ordinal: Optional[int],
    meta: Dict[str, Any],
    ref: Optional[DMatrix],
    params: Dict[str, Any],
@@ -165,7 +165,7 @@ def make_qdm(
    """Handle empty partition for QuantileDMatrix."""
    if not data:
        return QuantileDMatrix(np.empty((0, 0)), ref=ref)
-    it = PartIter(data, gpu_id, **meta)
+    it = PartIter(data, dev_ordinal, **meta)
    m = QuantileDMatrix(it, **params, ref=ref)
    return m

@@ -173,7 +173,7 @@ def make_qdm(
 def create_dmatrix_from_partitions(  # pylint: disable=too-many-arguments
    iterator: Iterator[pd.DataFrame],
    feature_cols: Optional[Sequence[str]],
-    gpu_id: Optional[int],
+    dev_ordinal: Optional[int],
    use_qdm: bool,
    kwargs: Dict[str, Any],  # use dict to make sure this parameter is passed.
    enable_sparse_data_optim: bool,
@@ -187,7 +187,7 @@ def create_dmatrix_from_partitions(  # pylint: disable=too-many-arguments
        Pyspark partition iterator.
    feature_cols:
        A sequence of feature names, used only when rapids plugin is enabled.
-    gpu_id:
+    dev_ordinal:
        Device ordinal, used when GPU is enabled.
    use_qdm :
        Whether QuantileDMatrix should be used instead of DMatrix.
@@ -304,13 +304,13 @@ def create_dmatrix_from_partitions(  # pylint: disable=too-many-arguments

    if feature_cols is not None and use_qdm:
        cache_partitions(iterator, append_fn)
-        dtrain: DMatrix = make_qdm(train_data, gpu_id, meta, None, params)
+        dtrain: DMatrix = make_qdm(train_data, dev_ordinal, meta, None, params)
    elif feature_cols is not None and not use_qdm:
        cache_partitions(iterator, append_fn)
        dtrain = make(train_data, kwargs)
    elif feature_cols is None and use_qdm:
        cache_partitions(iterator, append_fn)
-        dtrain = make_qdm(train_data, gpu_id, meta, None, params)
+        dtrain = make_qdm(train_data, dev_ordinal, meta, None, params)
    else:
        cache_partitions(iterator, append_fn)
        dtrain = make(train_data, kwargs)
@@ -324,7 +324,7 @@ def create_dmatrix_from_partitions(  # pylint: disable=too-many-arguments
    if has_validation_col:
        if use_qdm:
            dvalid: Optional[DMatrix] = make_qdm(
-                valid_data, gpu_id, meta, dtrain, params
+                valid_data, dev_ordinal, meta, dtrain, params
            )
        else:
            dvalid = make(valid_data, kwargs) if has_validation_col else None
--- a/python-package/xgboost/spark/estimator.py
+++ b/python-package/xgboost/spark/estimator.py
@@ -3,8 +3,8 @@
 # pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name
 # pylint: disable=unused-argument, too-many-locals

-
-from typing import Any, Dict, List, Optional, Type, Union
+import warnings
+from typing import Any, List, Optional, Type, Union

 import numpy as np
 from pyspark import keyword_only
@@ -77,28 +77,35 @@ def _set_pyspark_xgb_cls_param_attrs(
        set_param_attrs(name, param_obj)


+def _deprecated_use_gpu() -> None:
+    warnings.warn(
+        "`use_gpu` is deprecated since 2.0.0, use `device` instead", FutureWarning
+    )
+
+
 class SparkXGBRegressor(_SparkXGBEstimator):
-    """
-    SparkXGBRegressor is a PySpark ML estimator. It implements the XGBoost regression
+    """SparkXGBRegressor is a PySpark ML estimator. It implements the XGBoost regression
    algorithm based on XGBoost python library, and it can be used in PySpark Pipeline
-    and PySpark ML meta algorithms like :py:class:`~pyspark.ml.tuning.CrossValidator`/
-    :py:class:`~pyspark.ml.tuning.TrainValidationSplit`/
-    :py:class:`~pyspark.ml.classification.OneVsRest`
+    and PySpark ML meta algorithms like
+    - :py:class:`~pyspark.ml.tuning.CrossValidator`/
+    - :py:class:`~pyspark.ml.tuning.TrainValidationSplit`/
+    - :py:class:`~pyspark.ml.classification.OneVsRest`

    SparkXGBRegressor automatically supports most of the parameters in
    :py:class:`xgboost.XGBRegressor` constructor and most of the parameters used in
-    :py:meth:`xgboost.XGBRegressor.fit` and :py:meth:`xgboost.XGBRegressor.predict` method.
+    :py:meth:`xgboost.XGBRegressor.fit` and :py:meth:`xgboost.XGBRegressor.predict`
+    method.

-    SparkXGBRegressor doesn't support setting `gpu_id` but support another param `use_gpu`,
-    see doc below for more details.
+    To enable GPU support, set `device` to `cuda` or `gpu`.

-    SparkXGBRegressor doesn't support setting `base_margin` explicitly as well, but support
-    another param called `base_margin_col`. see doc below for more details.
+    SparkXGBRegressor doesn't support setting `base_margin` explicitly, but it
+    supports another param called `base_margin_col`. See doc below for more details.

    SparkXGBRegressor doesn't support `validate_features` and `output_margin` param.

-    SparkXGBRegressor doesn't support setting `nthread` xgboost param, instead, the `nthread`
-    param for each xgboost worker will be set equal to `spark.task.cpus` config value.
+    SparkXGBRegressor doesn't support setting `nthread` xgboost param, instead, the
+    `nthread` param for each xgboost worker will be set equal to `spark.task.cpus`
+    config value.


    Parameters
@@ -134,8 +141,16 @@ class SparkXGBRegressor(_SparkXGBEstimator):
        How many XGBoost workers to be used to train.
        Each XGBoost worker corresponds to one spark task.
    use_gpu:
-        Boolean value to specify whether the executors are running on GPU
-        instances.
+        .. deprecated:: 2.0.0
+
+        Use `device` instead.
+
+    device:
+
+        .. versionadded:: 2.0.0
+
+        Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.
+
    force_repartition:
        Boolean value to specify if forcing the input dataset to be repartitioned
        before XGBoost training.
@@ -194,14 +209,17 @@ class SparkXGBRegressor(_SparkXGBEstimator):
        weight_col: Optional[str] = None,
        base_margin_col: Optional[str] = None,
        num_workers: int = 1,
-        use_gpu: bool = False,
+        use_gpu: Optional[bool] = None,
+        device: Optional[str] = None,
        force_repartition: bool = False,
        repartition_random_shuffle: bool = False,
        enable_sparse_data_optim: bool = False,
-        **kwargs: Dict[str, Any],
+        **kwargs: Any,
    ) -> None:
        super().__init__()
        input_kwargs = self._input_kwargs
+        if use_gpu:
+            _deprecated_use_gpu()
        self.setParams(**input_kwargs)

    @classmethod
@@ -239,27 +257,29 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
    """SparkXGBClassifier is a PySpark ML estimator. It implements the XGBoost
    classification algorithm based on XGBoost python library, and it can be used in
    PySpark Pipeline and PySpark ML meta algorithms like
-    :py:class:`~pyspark.ml.tuning.CrossValidator`/
-    :py:class:`~pyspark.ml.tuning.TrainValidationSplit`/
-    :py:class:`~pyspark.ml.classification.OneVsRest`
+    - :py:class:`~pyspark.ml.tuning.CrossValidator`/
+    - :py:class:`~pyspark.ml.tuning.TrainValidationSplit`/
+    - :py:class:`~pyspark.ml.classification.OneVsRest`

    SparkXGBClassifier automatically supports most of the parameters in
    :py:class:`xgboost.XGBClassifier` constructor and most of the parameters used in
-    :py:meth:`xgboost.XGBClassifier.fit` and :py:meth:`xgboost.XGBClassifier.predict` method.
+    :py:meth:`xgboost.XGBClassifier.fit` and :py:meth:`xgboost.XGBClassifier.predict`
+    method.

-    SparkXGBClassifier doesn't support setting `gpu_id` but support another param `use_gpu`,
-    see doc below for more details.
+    To enable GPU support, set `device` to `cuda` or `gpu`.

-    SparkXGBClassifier doesn't support setting `base_margin` explicitly as well, but support
-    another param called `base_margin_col`. see doc below for more details.
+    SparkXGBClassifier doesn't support setting `base_margin` explicitly, but it
+    supports another param called `base_margin_col`. See doc below for more details.

-    SparkXGBClassifier doesn't support setting `output_margin`, but we can get output margin
-    from the raw prediction column. See `raw_prediction_col` param doc below for more details.
+    SparkXGBClassifier doesn't support setting `output_margin`, but we can get output
+    margin from the raw prediction column. See `raw_prediction_col` param doc below for
+    more details.

    SparkXGBClassifier doesn't support `validate_features` and `output_margin` param.

-    SparkXGBClassifier doesn't support setting `nthread` xgboost param, instead, the `nthread`
-    param for each xgboost worker will be set equal to `spark.task.cpus` config value.
+    SparkXGBClassifier doesn't support setting `nthread` xgboost param, instead, the
+    `nthread` param for each xgboost worker will be set equal to `spark.task.cpus`
+    config value.


    Parameters
@@ -301,8 +321,16 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
        How many XGBoost workers to be used to train.
        Each XGBoost worker corresponds to one spark task.
    use_gpu:
-        Boolean value to specify whether the executors are running on GPU
-        instances.
+        .. deprecated:: 2.0.0
+
+        Use `device` instead.
+
+    device:
+
+        .. versionadded:: 2.0.0
+
+        Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.
+
    force_repartition:
        Boolean value to specify if forcing the input dataset to be repartitioned
        before XGBoost training.
@@ -361,11 +389,12 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
        weight_col: Optional[str] = None,
        base_margin_col: Optional[str] = None,
        num_workers: int = 1,
-        use_gpu: bool = False,
+        use_gpu: Optional[bool] = None,
+        device: Optional[str] = None,
        force_repartition: bool = False,
        repartition_random_shuffle: bool = False,
        enable_sparse_data_optim: bool = False,
-        **kwargs: Dict[str, Any],
+        **kwargs: Any,
    ) -> None:
        super().__init__()
        # The default 'objective' param value comes from sklearn `XGBClassifier` ctor,
@@ -373,6 +402,8 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
        # binary or multinomial input dataset, and we need to remove the fixed default
        # param value as well to avoid causing ambiguity.
        input_kwargs = self._input_kwargs
+        if use_gpu:
+            _deprecated_use_gpu()
        self.setParams(**input_kwargs)
        self._setDefault(objective=None)

@@ -423,19 +454,20 @@ class SparkXGBRanker(_SparkXGBEstimator):
    :py:class:`xgboost.XGBRanker` constructor and most of the parameters used in
    :py:meth:`xgboost.XGBRanker.fit` and :py:meth:`xgboost.XGBRanker.predict` method.

-    SparkXGBRanker doesn't support setting `gpu_id` but support another param `use_gpu`,
-    see doc below for more details.
+    To enable GPU support, set `device` to `cuda` or `gpu`.

    SparkXGBRanker doesn't support setting `base_margin` explicitly as well, but support
    another param called `base_margin_col`. see doc below for more details.

    SparkXGBRanker doesn't support setting `output_margin`, but we can get output margin
-    from the raw prediction column. See `raw_prediction_col` param doc below for more details.
+    from the raw prediction column. See `raw_prediction_col` param doc below for more
+    details.

    SparkXGBRanker doesn't support `validate_features` and `output_margin` param.

-    SparkXGBRanker doesn't support setting `nthread` xgboost param, instead, the `nthread`
-    param for each xgboost worker will be set equal to `spark.task.cpus` config value.
+    SparkXGBRanker doesn't support setting `nthread` xgboost param, instead, the
+    `nthread` param for each xgboost worker will be set equal to `spark.task.cpus`
+    config value.


    Parameters
@@ -468,13 +500,20 @@ class SparkXGBRanker(_SparkXGBEstimator):
        :py:class:`xgboost.XGBRanker` fit method.
    qid_col:
        Query id column name.
-
    num_workers:
        How many XGBoost workers to be used to train.
        Each XGBoost worker corresponds to one spark task.
    use_gpu:
-        Boolean value to specify whether the executors are running on GPU
-        instances.
+        .. deprecated:: 2.0.0
+
+        Use `device` instead.
+
+    device:
+
+        .. versionadded:: 2.0.0
+
+        Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.
+
    force_repartition:
        Boolean value to specify if forcing the input dataset to be repartitioned
        before XGBoost training.
@@ -539,14 +578,17 @@ class SparkXGBRanker(_SparkXGBEstimator):
        base_margin_col: Optional[str] = None,
        qid_col: Optional[str] = None,
        num_workers: int = 1,
-        use_gpu: bool = False,
+        use_gpu: Optional[bool] = None,
+        device: Optional[str] = None,
        force_repartition: bool = False,
        repartition_random_shuffle: bool = False,
        enable_sparse_data_optim: bool = False,
-        **kwargs: Dict[str, Any],
+        **kwargs: Any,
    ) -> None:
        super().__init__()
        input_kwargs = self._input_kwargs
+        if use_gpu:
+            _deprecated_use_gpu()
        self.setParams(**input_kwargs)

    @classmethod
--- a/python-package/xgboost/spark/utils.py
+++ b/python-package/xgboost/spark/utils.py
@@ -7,10 +7,10 @@ import os
 import sys
 import uuid
 from threading import Thread
-from typing import Any, Callable, Dict, Set, Type
+from typing import Any, Callable, Dict, Optional, Set, Type

 import pyspark
-from pyspark import BarrierTaskContext, SparkContext, SparkFiles
+from pyspark import BarrierTaskContext, SparkContext, SparkFiles, TaskContext
 from pyspark.sql.session import SparkSession

 from xgboost import Booster, XGBModel, collective
@@ -104,6 +104,10 @@ def get_logger(name: str, level: str = "INFO") -> logging.Logger:
    # If the logger is configured, skip the configure
    if not logger.handlers and not logging.getLogger().handlers:
        handler = logging.StreamHandler(sys.stderr)
+        formatter = logging.Formatter(
+            "%(asctime)s %(levelname)s %(name)s: %(funcName)s %(message)s"
+        )
+        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger

@@ -125,7 +129,14 @@ def _is_local(spark_context: SparkContext) -> bool:
    return spark_context._jsc.sc().isLocal()


-def _get_gpu_id(task_context: BarrierTaskContext) -> int:
+def _is_standalone_or_localcluster(spark_context: SparkContext) -> bool:
+    master = spark_context.getConf().get("spark.master")
+    return master is not None and (
+        master.startswith("spark://") or master.startswith("local-cluster")
+    )
+
+
+def _get_gpu_id(task_context: TaskContext) -> int:
    """Get the gpu id from the task resources"""
    if task_context is None:
        # This is a safety check.
@@ -186,3 +197,8 @@ def deserialize_booster(model: str) -> Booster:
        f.write(model)
    booster.load_model(tmp_file_name)
    return booster
+
+
+def use_cuda(device: Optional[str]) -> bool:
+    """Whether xgboost is using CUDA workers."""
+    return device in ("cuda", "gpu")
--- a/python-package/xgboost/testing/init.py
+++ b/python-package/xgboost/testing/init.py
@@ -25,6 +25,7 @@ from typing import (
    Set,
    Tuple,
    TypedDict,
+    TypeVar,
    Union,
 )

@@ -198,20 +199,20 @@ class IteratorForTest(xgb.core.DataIter):
        X: Sequence,
        y: Sequence,
        w: Optional[Sequence],
-        cache: Optional[str] = "./",
+        cache: Optional[str],
    ) -> None:
        assert len(X) == len(y)
        self.X = X
        self.y = y
        self.w = w
        self.it = 0
-        super().__init__(cache)
+        super().__init__(cache_prefix=cache)

    def next(self, input_data: Callable) -> int:
        if self.it == len(self.X):
            return 0

-        with pytest.raises(TypeError, match="keyword args"):
+        with pytest.raises(TypeError, match="Keyword argument"):
            input_data(self.X[self.it], self.y[self.it], None)

        # Use copy to make sure the iterator doesn't hold a reference to the data.
@@ -229,7 +230,7 @@ class IteratorForTest(xgb.core.DataIter):

    def as_arrays(
        self,
-    ) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, ArrayLike]:
+    ) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, Optional[ArrayLike]]:
        if isinstance(self.X[0], sparse.csr_matrix):
            X = sparse.vstack(self.X, format="csr")
        else:
@@ -243,7 +244,12 @@ class IteratorForTest(xgb.core.DataIter):


 def make_batches(
-    n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False
+    n_samples_per_batch: int,
+    n_features: int,
+    n_batches: int,
+    use_cupy: bool = False,
+    *,
+    vary_size: bool = False,
 ) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
    X = []
    y = []
@@ -254,16 +260,25 @@ def make_batches(
        rng = cupy.random.RandomState(1994)
    else:
        rng = np.random.RandomState(1994)
-    for _ in range(n_batches):
-        _X = rng.randn(n_samples_per_batch, n_features)
-        _y = rng.randn(n_samples_per_batch)
-        _w = rng.uniform(low=0, high=1, size=n_samples_per_batch)
+    for i in range(n_batches):
+        n_samples = n_samples_per_batch + i * 10 if vary_size else n_samples_per_batch
+        _X = rng.randn(n_samples, n_features)
+        _y = rng.randn(n_samples)
+        _w = rng.uniform(low=0, high=1, size=n_samples)
        X.append(_X)
        y.append(_y)
        w.append(_w)
    return X, y, w


+def make_regression(
+    n_samples: int, n_features: int, use_cupy: bool
+) -> Tuple[ArrayLike, ArrayLike, ArrayLike]:
+    """Make a simple regression dataset."""
+    X, y, w = make_batches(n_samples, n_features, 1, use_cupy)
+    return X[0], y[0], w[0]
+
+
 def make_batches_sparse(
    n_samples_per_batch: int, n_features: int, n_batches: int, sparsity: float
 ) -> Tuple[List[sparse.csr_matrix], List[np.ndarray], List[np.ndarray]]:
@@ -347,7 +362,9 @@ class TestDataset:
            if w is not None:
                weight.append(w)

-        it = IteratorForTest(predictor, response, weight if weight else None)
+        it = IteratorForTest(
+            predictor, response, weight if weight else None, cache="cache"
+        )
        return xgb.DMatrix(it)

    def __repr__(self) -> str:
@@ -709,6 +726,9 @@ def predictor_equal(lhs: xgb.DMatrix, rhs: xgb.DMatrix) -> bool:
    )


+M = TypeVar("M", xgb.Booster, xgb.XGBModel)
+
+
 def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, np.float64]:
    """Evaluation metric for xgb.train"""
    label = dtrain.get_label()
--- a/python-package/xgboost/testing/data_iter.py
+++ b/python-package/xgboost/testing/data_iter.py
@@ -0,0 +1,34 @@
+"""Tests related to the `DataIter` interface."""
+import numpy as np
+
+import xgboost
+from xgboost import testing as tm
+
+
+def run_mixed_sparsity(device: str) -> None:
+    """Check QuantileDMatrix built from batches of differing sparsity."""
+    X_0, y_0, _ = tm.make_regression(128, 16, False)
+    if device.startswith("cuda"):
+        X_1, y_1 = tm.make_sparse_regression(256, 16, 0.1, True)
+    else:
+        X_1, y_1 = tm.make_sparse_regression(256, 16, 0.1, False)
+    X_2, y_2 = tm.make_sparse_regression(512, 16, 0.9, True)
+    X = [X_0, X_1, X_2]
+    y = [y_0, y_1, y_2]
+
+    if device.startswith("cuda"):
+        import cupy as cp  # pylint: disable=import-error
+
+        X = [cp.array(batch) for batch in X]
+
+    it = tm.IteratorForTest(X, y, None, None)
+    Xy_0 = xgboost.QuantileDMatrix(it)
+
+    X_1, y_1 = tm.make_sparse_regression(256, 16, 0.1, True)
+    X = [X_0, X_1, X_2]
+    y = [y_0, y_1, y_2]
+    X_arr = np.concatenate(X, axis=0)
+    y_arr = np.concatenate(y, axis=0)
+    Xy_1 = xgboost.QuantileDMatrix(X_arr, y_arr)
+
+    assert tm.predictor_equal(Xy_0, Xy_1)
--- a/python-package/xgboost/testing/params.py
+++ b/python-package/xgboost/testing/params.py
@@ -41,6 +41,10 @@ hist_parameter_strategy = strategies.fixed_dictionaries(
    and (cast(int, x["max_depth"]) > 0 or x["grow_policy"] == "lossguide")
 )

+hist_cache_strategy = strategies.fixed_dictionaries(
+    {"max_cached_hist_node": strategies.sampled_from([1, 4, 1024, 2**31])}
+)
+
 hist_multi_parameter_strategy = strategies.fixed_dictionaries(
    {
        "max_depth": strategies.integers(1, 11),
--- a/python-package/xgboost/testing/updater.py
+++ b/python-package/xgboost/testing/updater.py
@@ -1,7 +1,7 @@
 """Tests for updaters."""
 import json
 from functools import partial, update_wrapper
-from typing import Dict
+from typing import Any, Dict, List

 import numpy as np

@@ -159,3 +159,238 @@ def check_quantile_loss(tree_method: str, weighted: bool) -> None:

    for i in range(alpha.shape[0]):
        np.testing.assert_allclose(predts[:, i], predt_multi[:, i])
+
+
+def check_cut(
+    n_entries: int, indptr: np.ndarray, data: np.ndarray, dtypes: Any
+) -> None:
+    """Validate the cut values of every feature.
+
+    Parameters
+    ----------
+    n_entries :
+        Expected total number of cut values.
+    indptr :
+        ``uint64`` offsets; feature ``k`` owns ``data[indptr[k]:indptr[k + 1]]``.
+    data :
+        Cut values of all features stored back to back.
+    dtypes :
+        Per-feature dtypes, indexed by feature position.
+
+    """
+    from pandas.api.types import is_categorical_dtype
+
+    n_values = data.shape[0]
+    assert n_values == indptr[-1]
+    assert n_values == n_entries
+
+    assert indptr.dtype == np.uint64
+    for fidx in range(indptr.size - 1):
+        beg, end = int(indptr[fidx]), int(indptr[fidx + 1])
+        for j in range(beg + 1, end):
+            prev, curr = data[j - 1], data[j]
+            # Cuts must be strictly increasing within a feature.
+            assert curr > prev
+            # Cuts of a categorical feature must additionally be consecutive.
+            if is_categorical_dtype(dtypes[fidx]):
+                assert curr == prev + 1
+
+
+def check_get_quantile_cut_device(tree_method: str, use_cupy: bool) -> None:
+    """Check `get_quantile_cut` on numerical, categorical and mixed data.
+
+    For each kind of data the cuts are fetched from a `QuantileDMatrix` and from a
+    `DMatrix` after training; numerical data is additionally checked with an
+    iterator-backed (cached) `DMatrix`.
+
+    Parameters
+    ----------
+    tree_method :
+        Tree method passed to `xgb.train` to generate the cuts for `DMatrix`.
+    use_cupy :
+        Forwarded to the data generators; when set, the categorical inputs are
+        converted to cudf/cupy as well.
+
+    """
+    from pandas.api.types import is_categorical_dtype
+
+    n_samples = 1024
+    n_features = 14
+    max_bin = 16
+    dtypes = [np.float32] * n_features
+
+    # numerical
+    X, y, w = tm.make_regression(n_samples, n_features, use_cupy=use_cupy)
+    # - qdm
+    Xyw: xgb.DMatrix = xgb.QuantileDMatrix(X, y, weight=w, max_bin=max_bin)
+    indptr, data = Xyw.get_quantile_cut()
+    check_cut((max_bin + 1) * n_features, indptr, data, dtypes)
+    # - dm
+    Xyw = xgb.DMatrix(X, y, weight=w)
+    xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xyw)
+    indptr, data = Xyw.get_quantile_cut()
+    check_cut((max_bin + 1) * n_features, indptr, data, dtypes)
+    # - ext mem
+    n_batches = 3
+    n_samples_per_batch = 256
+    it = tm.IteratorForTest(
+        *tm.make_batches(n_samples_per_batch, n_features, n_batches, use_cupy),
+        cache="cache",
+    )
+    Xy: xgb.DMatrix = xgb.DMatrix(it)
+    # Train on and query the iterator-backed matrix itself; using `Xyw` here would
+    # merely repeat the in-memory check above.
+    xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xy)
+    indptr, data = Xy.get_quantile_cut()
+    check_cut((max_bin + 1) * n_features, indptr, data, dtypes)
+
+    # categorical
+    n_categories = 32
+    X, y = tm.make_categorical(n_samples, n_features, n_categories, False, sparsity=0.8)
+    if use_cupy:
+        import cudf  # pylint: disable=import-error
+        import cupy as cp  # pylint: disable=import-error
+
+        X = cudf.from_pandas(X)
+        y = cp.array(y)
+    # - qdm
+    Xy = xgb.QuantileDMatrix(X, y, max_bin=max_bin, enable_categorical=True)
+    indptr, data = Xy.get_quantile_cut()
+    check_cut(n_categories * n_features, indptr, data, X.dtypes)
+    # - dm
+    Xy = xgb.DMatrix(X, y, enable_categorical=True)
+    xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xy)
+    indptr, data = Xy.get_quantile_cut()
+    check_cut(n_categories * n_features, indptr, data, X.dtypes)
+
+    # mixed
+    X, y = tm.make_categorical(
+        n_samples, n_features, n_categories, False, sparsity=0.8, cat_ratio=0.5
+    )
+    n_cat_features = sum(1 for dtype in X.dtypes if is_categorical_dtype(dtype))
+    n_num_features = n_features - n_cat_features
+    # One cut per category for categorical features, `max_bin + 1` otherwise.
+    n_entries = n_categories * n_cat_features + (max_bin + 1) * n_num_features
+    # - qdm
+    Xy = xgb.QuantileDMatrix(X, y, max_bin=max_bin, enable_categorical=True)
+    indptr, data = Xy.get_quantile_cut()
+    check_cut(n_entries, indptr, data, X.dtypes)
+    # - dm
+    Xy = xgb.DMatrix(X, y, enable_categorical=True)
+    xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xy)
+    indptr, data = Xy.get_quantile_cut()
+    check_cut(n_entries, indptr, data, X.dtypes)
+
+
+def check_get_quantile_cut(tree_method: str) -> None:
+    """Run the quantile cut getter checks for the given tree method.
+
+    Host inputs are always checked; cupy inputs are checked in addition when
+    `tree_method` is ``"gpu_hist"``.
+
+    """
+    check_get_quantile_cut_device(tree_method, False)
+    if tree_method == "gpu_hist":
+        check_get_quantile_cut_device(tree_method, True)
+
+
+# Values for the `max_cat_to_onehot` parameter used by the categorical checks below.
+# The largest int32 makes the tests use one-hot splits exclusively.
+USE_ONEHOT = np.iinfo(np.int32).max
+# A threshold of 1 switches the tests to partition-based splits.
+USE_PART = 1
+
+
+def check_categorical_ohe(  # pylint: disable=too-many-arguments
+    rows: int, cols: int, rounds: int, cats: int, device: str, tree_method: str
+) -> None:
+    """Test for one-hot encoding with categorical data.
+
+    Trains on a pre-encoded (one-hot) copy of the data and on the categorical data
+    with built-in one-hot splits, and requires both RMSE histories to agree. Then
+    switches to partition-based splits and compares them against the one-hot
+    result.
+
+    Parameters
+    ----------
+    rows, cols, cats :
+        Arguments forwarded to `tm.make_categorical` when generating the data.
+    rounds :
+        Number of boosting rounds for the compared training runs.
+    device :
+        Value of the `device` training parameter.
+    tree_method :
+        Value of the `tree_method` training parameter.
+
+    """
+
+    onehot, label = tm.make_categorical(rows, cols, cats, True)
+    cat, _ = tm.make_categorical(rows, cols, cats, False)
+
+    by_etl_results: Dict[str, Dict[str, List[float]]] = {}
+    by_builtin_results: Dict[str, Dict[str, List[float]]] = {}
+
+    parameters: Dict[str, Any] = {
+        "tree_method": tree_method,
+        # Use one-hot exclusively
+        "max_cat_to_onehot": USE_ONEHOT,
+        "device": device,
+    }
+
+    # Baseline: data that was one-hot encoded before being given to XGBoost.
+    m = xgb.DMatrix(onehot, label, enable_categorical=False)
+    xgb.train(
+        parameters,
+        m,
+        num_boost_round=rounds,
+        evals=[(m, "Train")],
+        evals_result=by_etl_results,
+    )
+
+    # Same parameters, but with the categorical data handled by XGBoost.
+    m = xgb.DMatrix(cat, label, enable_categorical=True)
+    xgb.train(
+        parameters,
+        m,
+        num_boost_round=rounds,
+        evals=[(m, "Train")],
+        evals_result=by_builtin_results,
+    )
+
+    # There are guidelines on how to specify tolerance based on considering output
+    # as random variables. But here the tree construction is extremely sensitive
+    # to floating point errors. A 1e-5 error in a histogram bin can lead to an
+    # entirely different tree. So even though the test is quite lenient, hypothesis
+    # can still pick up falsifying examples from time to time.
+    np.testing.assert_allclose(
+        np.array(by_etl_results["Train"]["rmse"]),
+        np.array(by_builtin_results["Train"]["rmse"]),
+        rtol=1e-3,
+    )
+    assert tm.non_increasing(by_builtin_results["Train"]["rmse"])
+
+    by_grouping: Dict[str, Dict[str, List[float]]] = {}
+    # switch to partition-based splits
+    parameters["max_cat_to_onehot"] = USE_PART
+    parameters["reg_lambda"] = 0
+    m = xgb.DMatrix(cat, label, enable_categorical=True)
+    xgb.train(
+        parameters,
+        m,
+        num_boost_round=rounds,
+        evals=[(m, "Train")],
+        evals_result=by_grouping,
+    )
+    rmse_oh = by_builtin_results["Train"]["rmse"]
+    rmse_group = by_grouping["Train"]["rmse"]
+    # always better or equal to onehot when there's no regularization.
+    for a, b in zip(rmse_oh, rmse_group):
+        assert a >= b
+
+    # With regularization restored only the monotonicity of the RMSE is checked.
+    parameters["reg_lambda"] = 1.0
+    by_grouping = {}
+    # NOTE(review): this run uses a fixed 32 rounds instead of `rounds` -- confirm
+    # this is intentional.
+    xgb.train(
+        parameters,
+        m,
+        num_boost_round=32,
+        evals=[(m, "Train")],
+        evals_result=by_grouping,
+    )
+    assert tm.non_increasing(by_grouping["Train"]["rmse"]), by_grouping
+
+
+def check_categorical_missing(
+    rows: int, cols: int, cats: int, device: str, tree_method: str
+) -> None:
+    """Train on categorical data with missing values and validate the RMSE.
+
+    The check is repeated for one-hot splits and for partition-based splits.
+
+    """
+    parameters: Dict[str, Any] = {"tree_method": tree_method, "device": device}
+    cat, label = tm.make_categorical(
+        rows, n_features=cols, n_categories=cats, onehot=False, sparsity=0.5
+    )
+    Xy = xgb.DMatrix(cat, label, enable_categorical=True)
+
+    def run(max_cat_to_onehot: int) -> None:
+        # The threshold decides which kind of categorical split is used.
+        parameters["max_cat_to_onehot"] = max_cat_to_onehot
+
+        history: Dict[str, Dict] = {}
+        booster = xgb.train(
+            parameters,
+            Xy,
+            num_boost_round=16,
+            evals=[(Xy, "Train")],
+            evals_result=history,
+        )
+        train_rmse = history["Train"]["rmse"]
+        assert tm.non_increasing(train_rmse)
+
+        # The reported metric must match the one recomputed from predictions.
+        rmse = tm.root_mean_square(label, booster.predict(Xy))
+        np.testing.assert_allclose(rmse, train_rmse[-1], rtol=2e-5)
+
+    # OHE split first, then partition-based split.
+    for threshold in (USE_ONEHOT, USE_PART):
+        run(threshold)
+
+
+def train_result(
+    param: Dict[str, Any], dmat: xgb.DMatrix, num_rounds: int
+) -> Dict[str, Any]:
+    """Train a booster and return the recorded evaluation history.
+
+    The training data is also used as the only evaluation set, named ``"train"``.
+    Basic properties of the resulting booster are verified against `dmat` before
+    returning.
+
+    """
+    history: Dict[str, Any] = {}
+    bst = xgb.train(
+        param,
+        dmat,
+        num_boost_round=num_rounds,
+        evals=[(dmat, "train")],
+        verbose_eval=False,
+        evals_result=history,
+    )
+
+    # The booster must agree with the data it was trained on.
+    assert bst.num_features() == dmat.num_col()
+    assert bst.num_boosted_rounds() == num_rounds
+    assert bst.feature_names == dmat.feature_names
+    assert bst.feature_types == dmat.feature_types
+
+    return history
--- a/python-package/xgboost/training.py
+++ b/python-package/xgboost/training.py
@@ -28,17 +28,6 @@ from .core import (
 _CVFolds = Sequence["CVPack"]


-def _assert_new_callback(callbacks: Optional[Sequence[TrainingCallback]]) -> None:
-    is_new_callback: bool = not callbacks or all(
-        isinstance(c, TrainingCallback) for c in callbacks
-    )
-    if not is_new_callback:
-        link = "https://xgboost.readthedocs.io/en/latest/python/callbacks.html"
-        raise ValueError(
-            f"Old style callback was removed in version 1.6.  See: {link}."
-        )
-
-
 def _configure_custom_metric(
    feval: Optional[Metric], custom_metric: Optional[Metric]
 ) -> Optional[Metric]:
@@ -170,7 +159,6 @@ def train(
    bst = Booster(params, [dtrain] + [d[0] for d in evals], model_file=xgb_model)
    start_iteration = 0

-    _assert_new_callback(callbacks)
    if verbose_eval:
        verbose_eval = 1 if verbose_eval is True else verbose_eval
        callbacks.append(EvaluationMonitor(period=verbose_eval))
@@ -247,7 +235,7 @@ class _PackedBooster:
        result = [f.eval(iteration, feval, output_margin) for f in self.cvfolds]
        return result

-    def set_attr(self, **kwargs: Optional[str]) -> Any:
+    def set_attr(self, **kwargs: Optional[Any]) -> Any:
        """Iterate through folds for setting attributes"""
        for f in self.cvfolds:
            f.bst.set_attr(**kwargs)
@@ -274,11 +262,20 @@ class _PackedBooster:
        """Get best_iteration"""
        return int(cast(int, self.cvfolds[0].bst.attr("best_iteration")))

+    @best_iteration.setter
+    def best_iteration(self, iteration: int) -> None:
+        """Set best_iteration."""
+        self.set_attr(best_iteration=iteration)
+
    @property
    def best_score(self) -> float:
        """Get best_score."""
        return float(cast(float, self.cvfolds[0].bst.attr("best_score")))

+    @best_score.setter
+    def best_score(self, score: float) -> None:
+        """Set best_score."""
+        self.set_attr(best_score=score)
+

 def groups_to_rows(groups: List[np.ndarray], boundaries: np.ndarray) -> np.ndarray:
    """
@@ -551,7 +548,6 @@ def cv(

    # setup callbacks
    callbacks = [] if callbacks is None else copy.copy(list(callbacks))
-    _assert_new_callback(callbacks)

    if verbose_eval:
        verbose_eval = 1 if verbose_eval is True else verbose_eval