temp merge, disable 1 line, SetValid

2023-10-12 16:16:44 -07:00
parent 2e7e9d3b2d 85d3017ca5
commit ea19555474
492 changed files with 15533 additions and 9376 deletions
--- a/python-package/packager/nativelib.py
+++ b/python-package/packager/nativelib.py
@@ -132,8 +132,8 @@ def locate_or_build_libxgboost(

    if build_config.use_system_libxgboost:
        # Find libxgboost from system prefix
-        sys_prefix = pathlib.Path(sys.prefix).absolute().resolve()
-        libxgboost_sys = sys_prefix / "lib" / _lib_name()
+        sys_base_prefix = pathlib.Path(sys.base_prefix).absolute().resolve()
+        libxgboost_sys = sys_base_prefix / "lib" / _lib_name()
        if not libxgboost_sys.exists():
            raise RuntimeError(
                f"use_system_libxgboost was specified but {_lib_name()} is "
--- a/python-package/pyproject.toml
+++ b/python-package/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "packager.pep517"

 [project]
 name = "xgboost"
-version = "2.0.0-dev"
+version = "2.1.0-dev"
 authors = [
    { name = "Hyunsu Cho", email = "chohyu01@cs.washington.edu" },
    { name = "Jiaming Yuan", email = "jm.yuan@outlook.com" }
--- a/python-package/xgboost/VERSION
+++ b/python-package/xgboost/VERSION
@@ -1 +1 @@
-2.0.0-dev
+2.1.0-dev
--- a/python-package/xgboost/init.py
+++ b/python-package/xgboost/init.py
@@ -4,7 +4,7 @@ Contributors: https://github.com/dmlc/xgboost/blob/master/CONTRIBUTORS.md
 """

 from . import tracker  # noqa
-from . import collective, dask, rabit
+from . import collective, dask
 from .core import (
    Booster,
    DataIter,
--- a/python-package/xgboost/_typing.py
+++ b/python-package/xgboost/_typing.py
@@ -8,7 +8,9 @@ from typing import (
    Callable,
    Dict,
    List,
+    Optional,
    Sequence,
+    Tuple,
    Type,
    TypeVar,
    Union,
@@ -20,8 +22,6 @@ import numpy as np

 DataType = Any

-# xgboost accepts some other possible types in practice due to historical reason, which is
-# lesser tested.  For now we encourage users to pass a simple list of string.
 FeatureInfo = Sequence[str]
 FeatureNames = FeatureInfo
 FeatureTypes = FeatureInfo
@@ -97,6 +97,13 @@ else:
        ctypes._Pointer,
    ]

+# The second arg is actually Optional[List[cudf.Series]], skipped for easier type check.
+# The cudf Series is the obtained cat codes, preserved in the `DataIter` to prevent it
+# being freed.
+TransformedData = Tuple[
+    Any, Optional[List], Optional[FeatureNames], Optional[FeatureTypes]
+]
+
 # template parameter
 _T = TypeVar("_T")
 _F = TypeVar("_F", bound=Callable[..., Any])
--- a/python-package/xgboost/callback.py
+++ b/python-package/xgboost/callback.py
@@ -134,13 +134,17 @@ class CallbackContainer:
        is_cv: bool = False,
    ) -> None:
        self.callbacks = set(callbacks)
-        if metric is not None:
-            msg = (
-                "metric must be callable object for monitoring.  For "
-                + "builtin metrics, passing them in training parameter"
-                + " will invoke monitor automatically."
-            )
-            assert callable(metric), msg
+        for cb in callbacks:
+            if not isinstance(cb, TrainingCallback):
+                raise TypeError("callback must be an instance of `TrainingCallback`.")
+
+        msg = (
+            "metric must be callable object for monitoring.  For builtin metrics"
+            ", passing them in training parameter invokes monitor automatically."
+        )
+        if metric is not None and not callable(metric):
+            raise TypeError(msg)
+
        self.metric = metric
        self.history: TrainingCallback.EvalsLog = collections.OrderedDict()
        self._output_margin = output_margin
@@ -170,16 +174,6 @@ class CallbackContainer:
            else:
                assert isinstance(model, Booster), msg

-        if not self.is_cv:
-            if model.attr("best_score") is not None:
-                model.best_score = float(cast(str, model.attr("best_score")))
-                model.best_iteration = int(cast(str, model.attr("best_iteration")))
-            else:
-                # Due to compatibility with version older than 1.4, these attributes are
-                # added to Python object even if early stopping is not used.
-                model.best_iteration = model.num_boosted_rounds() - 1
-                model.set_attr(best_iteration=str(model.best_iteration))
-
        return model

    def before_iteration(
@@ -267,9 +261,14 @@ class LearningRateScheduler(TrainingCallback):
    def __init__(
        self, learning_rates: Union[Callable[[int], float], Sequence[float]]
    ) -> None:
-        assert callable(learning_rates) or isinstance(
+        if not callable(learning_rates) and not isinstance(
            learning_rates, collections.abc.Sequence
-        )
+        ):
+            raise TypeError(
+                "Invalid learning rates, expecting callable or sequence, got: "
+                f"{type(learning_rates)}"
+            )
+
        if callable(learning_rates):
            self.learning_rates = learning_rates
        else:
@@ -302,24 +301,28 @@ class EarlyStopping(TrainingCallback):
    save_best :
        Whether training should return the best model or the last model.
    min_delta :
-        Minimum absolute change in score to be qualified as an improvement.

        .. versionadded:: 1.5.0

-        .. code-block:: python
+        Minimum absolute change in score to be qualified as an improvement.

-            es = xgboost.callback.EarlyStopping(
-                rounds=2,
-                min_delta=1e-3,
-                save_best=True,
-                maximize=False,
-                data_name="validation_0",
-                metric_name="mlogloss",
-            )
-            clf = xgboost.XGBClassifier(tree_method="gpu_hist", callbacks=[es])
+    Examples
+    --------

-            X, y = load_digits(return_X_y=True)
-            clf.fit(X, y, eval_set=[(X, y)])
+    .. code-block:: python
+
+        es = xgboost.callback.EarlyStopping(
+            rounds=2,
+            min_delta=1e-3,
+            save_best=True,
+            maximize=False,
+            data_name="validation_0",
+            metric_name="mlogloss",
+        )
+        clf = xgboost.XGBClassifier(tree_method="hist", device="cuda", callbacks=[es])
+
+        X, y = load_digits(return_X_y=True)
+        clf.fit(X, y, eval_set=[(X, y)])
    """

    # pylint: disable=too-many-arguments
@@ -363,7 +366,7 @@ class EarlyStopping(TrainingCallback):
            return numpy.greater(get_s(new) - self._min_delta, get_s(best))

        def minimize(new: _Score, best: _Score) -> bool:
-            """New score should be smaller than the old one."""
+            """New score should be less than the old one."""
            return numpy.greater(get_s(best) - self._min_delta, get_s(new))

        if self.maximize is None:
@@ -419,38 +422,53 @@ class EarlyStopping(TrainingCallback):
    ) -> bool:
        epoch += self.starting_round  # training continuation
        msg = "Must have at least 1 validation dataset for early stopping."
-        assert len(evals_log.keys()) >= 1, msg
-        data_name = ""
+        if len(evals_log.keys()) < 1:
+            raise ValueError(msg)
+
+        # Get data name
        if self.data:
-            for d, _ in evals_log.items():
-                if d == self.data:
-                    data_name = d
-            if not data_name:
-                raise ValueError("No dataset named:", self.data)
+            data_name = self.data
        else:
            # Use the last one as default.
            data_name = list(evals_log.keys())[-1]
-        assert isinstance(data_name, str) and data_name
+        if data_name not in evals_log:
+            raise ValueError(f"No dataset named: {data_name}")
+
+        if not isinstance(data_name, str):
+            raise TypeError(
+                f"The name of the dataset should be a string. Got: {type(data_name)}"
+            )
        data_log = evals_log[data_name]

-        # Filter out scores that can not be used for early stopping.
+        # Get metric name
        if self.metric_name:
            metric_name = self.metric_name
        else:
            # Use last metric by default.
-            assert isinstance(data_log, collections.OrderedDict)
            metric_name = list(data_log.keys())[-1]
+        if metric_name not in data_log:
+            raise ValueError(f"No metric named: {metric_name}")
+
+        # The latest score
        score = data_log[metric_name][-1]
        return self._update_rounds(score, data_name, metric_name, model, epoch)

    def after_training(self, model: _Model) -> _Model:
+        if not self.save_best:
+            return model
+
        try:
-            if self.save_best:
-                model = model[: int(model.attr("best_iteration")) + 1]
+            best_iteration = model.best_iteration
+            best_score = model.best_score
+            assert best_iteration is not None and best_score is not None
+            model = model[: best_iteration + 1]
+            model.best_iteration = best_iteration
+            model.best_score = best_score
        except XGBoostError as e:
            raise XGBoostError(
-                "`save_best` is not applicable to current booster"
+                "`save_best` is not applicable to the current booster"
            ) from e
+
        return model


@@ -462,8 +480,6 @@ class EvaluationMonitor(TrainingCallback):
    Parameters
    ----------

-    metric :
-        Extra user defined metric.
    rank :
        Which worker should be used for printing the result.
    period :
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -3,11 +3,13 @@
 """Core XGBoost Library."""
 import copy
 import ctypes
+import importlib.util
 import json
 import os
 import re
 import sys
 import warnings
+import weakref
 from abc import ABC, abstractmethod
 from collections.abc import Mapping
 from enum import IntEnum, unique
@@ -50,6 +52,7 @@ from ._typing import (
    FeatureTypes,
    ModelIn,
    NumpyOrCupy,
+    TransformedData,
    c_bst_ulong,
 )
 from .compat import PANDAS_INSTALLED, DataFrame, py_str
@@ -152,7 +155,11 @@ def _expect(expectations: Sequence[Type], got: Type) -> str:

 def _log_callback(msg: bytes) -> None:
    """Redirect logs from native library into Python console"""
-    print(py_str(msg))
+    smsg = py_str(msg)
+    if smsg.find("WARNING:") != -1:
+        warnings.warn(smsg, UserWarning)
+        return
+    print(smsg)


 def _get_log_callback_func() -> Callable:
@@ -228,8 +235,11 @@ Error message(s): {os_error_list}

    def parse(ver: str) -> Tuple[int, int, int]:
        """Avoid dependency on packaging (PEP 440)."""
-        # 2.0.0-dev or 2.0.0
+        # 2.0.0-dev, 2.0.0, or 2.0.0rc1
        major, minor, patch = ver.split("-")[0].split(".")
+        rc = patch.find("rc")
+        if rc != -1:
+            patch = patch[:rc]
        return int(major), int(minor), int(patch)

    libver = _lib_version(lib)
@@ -271,6 +281,44 @@ def _check_call(ret: int) -> None:
        raise XGBoostError(py_str(_LIB.XGBGetLastError()))


+def _check_distributed_params(kwargs: Dict[str, Any]) -> None:
+    """Validate the `device` and `booster` parameters for distributed training."""
+    device = kwargs.get("device", None)
+    if device and not isinstance(device, str):
+        msg = "Invalid type for the `device` parameter"
+        msg += _expect((str,), type(device))
+        raise TypeError(msg)
+
+    if device and device.find(":") != -1:  # a ":" means an ordinal was given
+        raise ValueError(
+            "Distributed training doesn't support selecting device ordinal as GPUs are"
+            " managed by the distributed framework. use `device=cuda` or `device=gpu`"
+            " instead."
+        )
+
+    if kwargs.get("booster", None) == "gblinear":
+        raise NotImplementedError(
+            f"booster `{kwargs['booster']}` is not supported for distributed training."
+        )
+
+
+def _validate_feature_info(
+    feature_info: Sequence[str], n_features: int, name: str
+) -> List[str]:
+    """Check the type and length of `feature_info`, then return it as a list."""
+    if isinstance(feature_info, str) or not isinstance(feature_info, Sequence):
+        raise TypeError(
+            f"Expecting a sequence of strings for {name}, got: {type(feature_info)}"
+        )
+    feature_info = list(feature_info)
+    if len(feature_info) != n_features and n_features != 0:  # skipped for 0 columns
+        raise ValueError(
+            f"{name} must have the same length as the number of data columns, "
+            f"expected {n_features}, got {len(feature_info)}"
+        )
+    return feature_info
+
+
 def build_info() -> dict:
    """Build information of XGBoost.  The returned value format is not stable. Also,
    please note that build time dependency is not the same as runtime dependency. For
@@ -381,6 +429,54 @@ def c_array(
    return (ctype * len(values))(*values)


+def from_array_interface(interface: dict) -> NumpyOrCupy:
+    """Convert an array interface dict to a numpy or cupy array (always a copy)."""
+
+    class Array:  # pylint: disable=too-few-public-methods
+        """Wrapper type for communicating with numpy and cupy."""
+
+        _interface: Optional[dict] = None
+
+        @property
+        def __array_interface__(self) -> Optional[dict]:
+            return self._interface
+
+        @__array_interface__.setter
+        def __array_interface__(self, interface: dict) -> None:
+            self._interface = copy.copy(interface)  # shallow copy; input dict is kept
+            # Convert some fields to tuples, as required by numpy.
+            self._interface["shape"] = tuple(self._interface["shape"])
+            self._interface["data"] = tuple(self._interface["data"])
+            if self._interface.get("strides", None) is not None:
+                self._interface["strides"] = tuple(self._interface["strides"])
+
+        @property
+        def __cuda_array_interface__(self) -> Optional[dict]:
+            return self.__array_interface__
+
+        @__cuda_array_interface__.setter
+        def __cuda_array_interface__(self, interface: dict) -> None:
+            self.__array_interface__ = interface
+
+    arr = Array()
+
+    if "stream" in interface:
+        # A CUDA stream is present, so this is a __cuda_array_interface__.
+        spec = importlib.util.find_spec("cupy")
+        if spec is None:
+            raise ImportError("`cupy` is required for handling CUDA buffer.")
+
+        import cupy as cp  # pylint: disable=import-error
+
+        arr.__cuda_array_interface__ = interface
+        out = cp.array(arr, copy=True)
+    else:
+        arr.__array_interface__ = interface
+        out = np.array(arr, copy=True)
+
+    return out
+
+
 def _prediction_output(
    shape: CNumericPtr, dims: c_bst_ulong, predts: CFloatPtr, is_cuda: bool
 ) -> NumpyOrCupy:
@@ -395,13 +491,21 @@ def _prediction_output(


 class DataIter(ABC):  # pylint: disable=too-many-instance-attributes
-    """The interface for user defined data iterator.
+    """The interface for user defined data iterator. The iterator facilitates
+    distributed training, :py:class:`QuantileDMatrix`, and external memory support using
+    :py:class:`DMatrix`. Most of the time, users don't need to interact with this class
+    directly.
+
+    .. note::
+
+        The class caches some intermediate results using the `data` input (predictor
+        `X`) as key. Don't repeat the `X` for multiple batches with different meta data
+        (like `label`), make a copy if necessary.

    Parameters
    ----------
    cache_prefix :
-        Prefix to the cache files, only used in external memory.  It can be either an
-        URI or a file path.
+        Prefix to the cache files, only used in external memory.
    release_data :
        Whether the iterator should release the data during reset. Set it to True if the
        data transformation (converting data to np.float32 type) is expensive.
@@ -419,13 +523,13 @@ class DataIter(ABC):  # pylint: disable=too-many-instance-attributes
        self._allow_host = True
        self._release = release_data
        # Stage data in Python until reset or next is called to avoid data being free.
-        self._temporary_data: Optional[Tuple[Any, Any, Any, Any]] = None
-        self._input_id: int = 0
+        self._temporary_data: Optional[TransformedData] = None
+        self._data_ref: Optional[weakref.ReferenceType] = None

    def get_callbacks(
        self, allow_host: bool, enable_categorical: bool
    ) -> Tuple[Callable, Callable]:
-        """Get callback functions for iterating in C."""
+        """Get callback functions for iterating in C. This is an internal function."""
        assert hasattr(self, "cache_prefix"), "__init__ is not called."
        self._reset_callback = ctypes.CFUNCTYPE(None, ctypes.c_void_p)(
            self._reset_wrapper
@@ -491,8 +595,8 @@ class DataIter(ABC):  # pylint: disable=too-many-instance-attributes

        @require_keyword_args(True)
        def input_data(
-            data: Any,
            *,
+            data: Any,
            feature_names: Optional[FeatureNames] = None,
            feature_types: Optional[FeatureTypes] = None,
            **kwargs: Any,
@@ -500,7 +604,19 @@ class DataIter(ABC):  # pylint: disable=too-many-instance-attributes
            from .data import _proxy_transform, dispatch_proxy_set_data

            # Reduce the amount of transformation that's needed for QuantileDMatrix.
-            if self._temporary_data is not None and id(data) == self._input_id:
+            #
+            # To construct the QDM, one needs 4 iterations on CPU, or 2 iterations on
+            # GPU. If the QDM has only one batch of input (most of the cases), we can
+            # avoid transforming the data repeatedly.
+            try:
+                ref = weakref.ref(data)
+            except TypeError:
+                ref = None
+            if (
+                self._temporary_data is not None
+                and ref is not None
+                and ref is self._data_ref
+            ):
                new, cat_codes, feature_names, feature_types = self._temporary_data
            else:
                new, cat_codes, feature_names, feature_types = _proxy_transform(
@@ -517,7 +633,7 @@ class DataIter(ABC):  # pylint: disable=too-many-instance-attributes
                feature_types=feature_types,
                **kwargs,
            )
-            self._input_id = id(data)
+            self._data_ref = ref

        # pylint: disable=not-callable
        return self._handle_exception(lambda: self.next(input_data), 0)
@@ -593,6 +709,9 @@ def require_keyword_args(
        @wraps(func)
        def inner_f(*args: Any, **kwargs: Any) -> _T:
            extra_args = len(args) - len(all_args)
+            if not all_args and extra_args > 0:  # keyword argument only
+                raise TypeError("Keyword argument is required.")
+
            if extra_args > 0:
                # ignore first 'self' argument for instance methods
                args_msg = [
@@ -1040,7 +1159,7 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes,too-many-public-m
        testing purposes. If this is a quantized DMatrix then quantized values are
        returned instead of input values.

-            .. versionadded:: 1.7.0
+        .. versionadded:: 1.7.0

        """
        indptr = np.empty(self.num_row() + 1, dtype=np.uint64)
@@ -1060,6 +1179,36 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes,too-many-public-m
        )
        return ret

+    def get_quantile_cut(self) -> Tuple[np.ndarray, np.ndarray]:
+        """Get quantile cuts for quantization as an ``(indptr, data)`` pair.
+
+        .. versionadded:: 2.0.0
+
+        """
+        n_features = self.num_col()
+
+        c_sindptr = ctypes.c_char_p()
+        c_sdata = ctypes.c_char_p()
+        config = make_jcargs()
+        _check_call(
+            _LIB.XGDMatrixGetQuantileCut(
+                self.handle, config, ctypes.byref(c_sindptr), ctypes.byref(c_sdata)
+            )
+        )
+        assert c_sindptr.value is not None
+        assert c_sdata.value is not None
+
+        i_indptr = json.loads(c_sindptr.value)  # JSON-encoded array interface
+        indptr = from_array_interface(i_indptr)
+        assert indptr.size == n_features + 1
+        assert indptr.dtype == np.uint64
+
+        i_data = json.loads(c_sdata.value)  # JSON-encoded array interface
+        data = from_array_interface(i_data)
+        assert data.size == indptr[-1]  # the last indptr entry is the data length
+        assert data.dtype == np.float32
+        return indptr, data
+
    def num_row(self) -> int:
        """Get the number of rows in the DMatrix."""
        ret = c_bst_ulong()
@@ -1117,11 +1266,10 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes,too-many-public-m

    @property
    def feature_names(self) -> Optional[FeatureNames]:
-        """Get feature names (column labels).
+        """Labels for features (column labels).
+
+        Setting it to ``None`` resets existing feature names.

-        Returns
-        -------
-        feature_names : list or None
        """
        length = c_bst_ulong()
        sarr = ctypes.POINTER(ctypes.c_char_p)()
@@ -1140,67 +1288,61 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes,too-many-public-m

    @feature_names.setter
    def feature_names(self, feature_names: Optional[FeatureNames]) -> None:
-        """Set feature names (column labels).
-
-        Parameters
-        ----------
-        feature_names : list or None
-            Labels for features. None will reset existing feature names
-        """
-        if feature_names is not None:
-            # validate feature name
-            try:
-                if not isinstance(feature_names, str):
-                    feature_names = list(feature_names)
-                else:
-                    feature_names = [feature_names]
-            except TypeError:
-                feature_names = [cast(str, feature_names)]
-
-            if len(feature_names) != len(set(feature_names)):
-                raise ValueError("feature_names must be unique")
-            if len(feature_names) != self.num_col() and self.num_col() != 0:
-                msg = (
-                    "feature_names must have the same length as data, ",
-                    f"expected {self.num_col()}, got {len(feature_names)}",
-                )
-                raise ValueError(msg)
-            # prohibit to use symbols may affect to parse. e.g. []<
-            if not all(
-                isinstance(f, str) and not any(x in f for x in ["[", "]", "<"])
-                for f in feature_names
-            ):
-                raise ValueError(
-                    "feature_names must be string, and may not contain [, ] or <"
-                )
-            feature_names_bytes = [bytes(f, encoding="utf-8") for f in feature_names]
-            c_feature_names = (ctypes.c_char_p * len(feature_names_bytes))(
-                *feature_names_bytes
-            )
-            _check_call(
-                _LIB.XGDMatrixSetStrFeatureInfo(
-                    self.handle,
-                    c_str("feature_name"),
-                    c_feature_names,
-                    c_bst_ulong(len(feature_names)),
-                )
-            )
-        else:
-            # reset feature_types also
+        if feature_names is None:
            _check_call(
                _LIB.XGDMatrixSetStrFeatureInfo(
                    self.handle, c_str("feature_name"), None, c_bst_ulong(0)
                )
            )
-            self.feature_types = None
+            return
+
+        # validate feature name
+        feature_names = _validate_feature_info(
+            feature_names, self.num_col(), "feature names"
+        )
+        if len(feature_names) != len(set(feature_names)):
+            values, counts = np.unique(
+                feature_names,
+                return_index=False,
+                return_inverse=False,
+                return_counts=True,
+            )
+            duplicates = [name for name, cnt in zip(values, counts) if cnt > 1]
+            raise ValueError(
+                f"feature_names must be unique. Duplicates found: {duplicates}"
+            )
+
+        # prohibit the use of symbols that may affect parsing. e.g. []<
+        if not all(
+            isinstance(f, str) and not any(x in f for x in ["[", "]", "<"])
+            for f in feature_names
+        ):
+            raise ValueError(
+                "feature_names must be string, and may not contain [, ] or <"
+            )
+
+        feature_names_bytes = [bytes(f, encoding="utf-8") for f in feature_names]
+        c_feature_names = (ctypes.c_char_p * len(feature_names_bytes))(
+            *feature_names_bytes
+        )
+        _check_call(
+            _LIB.XGDMatrixSetStrFeatureInfo(
+                self.handle,
+                c_str("feature_name"),
+                c_feature_names,
+                c_bst_ulong(len(feature_names)),
+            )
+        )

    @property
    def feature_types(self) -> Optional[FeatureTypes]:
-        """Get feature types (column types).
+        """Type of features (column types).
+
+        This is for displaying the results and categorical data support. See
+        :py:class:`DMatrix` for details.
+
+        Setting it to ``None`` resets existing feature types.

-        Returns
-        -------
-        feature_types : list or None
        """
        length = c_bst_ulong()
        sarr = ctypes.POINTER(ctypes.c_char_p)()
@@ -1218,57 +1360,32 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes,too-many-public-m
        return res

    @feature_types.setter
-    def feature_types(self, feature_types: Optional[Union[List[str], str]]) -> None:
-        """Set feature types (column types).
-
-        This is for displaying the results and categorical data support. See
-        :py:class:`DMatrix` for details.
-
-        Parameters
-        ----------
-        feature_types :
-            Labels for features. None will reset existing feature names
-
-        """
-        # For compatibility reason this function wraps single str input into a list.  But
-        # we should not promote such usage since other than visualization, the field is
-        # also used for specifying categorical data type.
-        if feature_types is not None:
-            if not isinstance(feature_types, (list, str)):
-                raise TypeError("feature_types must be string or list of strings")
-            if isinstance(feature_types, str):
-                # single string will be applied to all columns
-                feature_types = [feature_types] * self.num_col()
-            try:
-                if not isinstance(feature_types, str):
-                    feature_types = list(feature_types)
-                else:
-                    feature_types = [feature_types]
-            except TypeError:
-                feature_types = [cast(str, feature_types)]
-            feature_types_bytes = [bytes(f, encoding="utf-8") for f in feature_types]
-            c_feature_types = (ctypes.c_char_p * len(feature_types_bytes))(
-                *feature_types_bytes
-            )
-            _check_call(
-                _LIB.XGDMatrixSetStrFeatureInfo(
-                    self.handle,
-                    c_str("feature_type"),
-                    c_feature_types,
-                    c_bst_ulong(len(feature_types)),
-                )
-            )
-
-            if len(feature_types) != self.num_col() and self.num_col() != 0:
-                msg = "feature_types must have the same length as data"
-                raise ValueError(msg)
-        else:
-            # Reset.
+    def feature_types(self, feature_types: Optional[FeatureTypes]) -> None:
+        if feature_types is None:
+            # Reset
            _check_call(
                _LIB.XGDMatrixSetStrFeatureInfo(
                    self.handle, c_str("feature_type"), None, c_bst_ulong(0)
                )
            )
+            return
+
+        feature_types = _validate_feature_info(
+            feature_types, self.num_col(), "feature types"
+        )
+
+        feature_types_bytes = [bytes(f, encoding="utf-8") for f in feature_types]
+        c_feature_types = (ctypes.c_char_p * len(feature_types_bytes))(
+            *feature_types_bytes
+        )
+        _check_call(
+            _LIB.XGDMatrixSetStrFeatureInfo(
+                self.handle,
+                c_str("feature_type"),
+                c_feature_types,
+                c_bst_ulong(len(feature_types)),
+            )
+        )


 class _ProxyDMatrix(DMatrix):
@@ -1318,13 +1435,13 @@ class _ProxyDMatrix(DMatrix):


 class QuantileDMatrix(DMatrix):
-    """A DMatrix variant that generates quantilized data directly from input for
-    ``hist`` and ``gpu_hist`` tree methods. This DMatrix is primarily designed to save
-    memory in training by avoiding intermediate storage. Set ``max_bin`` to control the
-    number of bins during quantisation, which should be consistent with the training
-    parameter ``max_bin``. When ``QuantileDMatrix`` is used for validation/test dataset,
-    ``ref`` should be another ``QuantileDMatrix``(or ``DMatrix``, but not recommended as
-    it defeats the purpose of saving memory) constructed from training dataset.  See
+    """A DMatrix variant that generates quantilized data directly from input for the
+    ``hist`` tree method. This DMatrix is primarily designed to save memory in training
+    by avoiding intermediate storage. Set ``max_bin`` to control the number of bins
+    during quantisation, which should be consistent with the training parameter
+    ``max_bin``. When ``QuantileDMatrix`` is used for validation/test dataset, ``ref``
+    should be another ``QuantileDMatrix``(or ``DMatrix``, but not recommended as it
+    defeats the purpose of saving memory) constructed from training dataset.  See
    :py:obj:`xgboost.DMatrix` for documents on meta info.

    .. note::
@@ -1372,7 +1489,7 @@ class QuantileDMatrix(DMatrix):
        enable_categorical: bool = False,
        data_split_mode: DataSplitMode = DataSplitMode.ROW,
    ) -> None:
-        self.max_bin: int = max_bin if max_bin is not None else 256
+        self.max_bin = max_bin
        self.missing = missing if missing is not None else np.nan
        self.nthread = nthread if nthread is not None else -1
        self._silent = silent  # unused, kept for compatibility
@@ -1544,7 +1661,7 @@ class Booster:
        )
        for d in cache:
            # Validate feature only after the feature names are saved into booster.
-            self._validate_dmatrix_features(d)
+            self._assign_dmatrix_features(d)

        if isinstance(model_file, Booster):
            assert self.handle is not None
@@ -1667,6 +1784,11 @@ class Booster:
        self.__dict__.update(state)

    def __getitem__(self, val: Union[int, tuple, slice]) -> "Booster":
+        """Get a slice of the tree-based model.
+
+        .. versionadded:: 1.3.0
+
+        """
        if isinstance(val, int):
            val = slice(val, val + 1)
        if isinstance(val, tuple):
@@ -1705,6 +1827,11 @@ class Booster:
        return sliced

    def __iter__(self) -> Generator["Booster", None, None]:
+        """Iterator method for getting individual trees.
+
+        .. versionadded:: 2.0.0
+
+        """
        for i in range(0, self.num_boosted_rounds()):
            yield self[i]

@@ -1795,7 +1922,7 @@ class Booster:
        attr_names = from_cstr_to_pystr(sarr, length)
        return {n: self.attr(n) for n in attr_names}

-    def set_attr(self, **kwargs: Optional[str]) -> None:
+    def set_attr(self, **kwargs: Optional[Any]) -> None:
        """Set the attribute of the Booster.

        Parameters
@@ -1915,7 +2042,7 @@ class Booster:
        """
        if not isinstance(dtrain, DMatrix):
            raise TypeError(f"invalid training matrix: {type(dtrain).__name__}")
-        self._validate_dmatrix_features(dtrain)
+        self._assign_dmatrix_features(dtrain)

        if fobj is None:
            _check_call(
@@ -1926,12 +2053,14 @@ class Booster:
        else:
            pred = self.predict(dtrain, output_margin=True, training=True)
            grad, hess = fobj(pred, dtrain)
-            self.boost(dtrain, grad, hess)
+            self.boost(dtrain, iteration=iteration, grad=grad, hess=hess)

-    def boost(self, dtrain: DMatrix, grad: np.ndarray, hess: np.ndarray) -> None:
-        """Boost the booster for one iteration, with customized gradient
-        statistics.  Like :py:func:`xgboost.Booster.update`, this
-        function should not be called directly by users.
+    def boost(
+        self, dtrain: DMatrix, iteration: int, grad: NumpyOrCupy, hess: NumpyOrCupy
+    ) -> None:
+        """Boost the booster for one iteration with customized gradient statistics.
+        Like :py:func:`xgboost.Booster.update`, this function should not be called
+        directly by users.

        Parameters
        ----------
@@ -1943,19 +2072,53 @@ class Booster:
            The second order of gradient.

        """
-        if len(grad) != len(hess):
-            raise ValueError(f"grad / hess length mismatch: {len(grad)} / {len(hess)}")
-        if not isinstance(dtrain, DMatrix):
-            raise TypeError(f"invalid training matrix: {type(dtrain).__name__}")
-        self._validate_dmatrix_features(dtrain)
+        from .data import (
+            _array_interface,
+            _cuda_array_interface,
+            _ensure_np_dtype,
+            _is_cupy_array,
+        )
+
+        self._assign_dmatrix_features(dtrain)
+
+        def is_flatten(array: NumpyOrCupy) -> bool:
+            return len(array.shape) == 1 or array.shape[1] == 1
+
+        def array_interface(array: NumpyOrCupy) -> bytes:
+            # Can we check for __array_interface__ instead of a specific type?
+            msg = (
+                "Expecting `np.ndarray` or `cupy.ndarray` for gradient and hessian."
+                f" Got: {type(array)}"
+            )
+            if not isinstance(array, np.ndarray) and not _is_cupy_array(array):
+                raise TypeError(msg)
+
+            n_samples = dtrain.num_row()
+            if array.shape[0] != n_samples and is_flatten(array):
+                warnings.warn(
+                    "Since 2.1.0, the shape of the gradient and hessian is required to"
+                    " be (n_samples, n_targets) or (n_samples, n_classes).",
+                    FutureWarning,
+                )
+                array = array.reshape(n_samples, array.size // n_samples)
+
+            if isinstance(array, np.ndarray):
+                array, _ = _ensure_np_dtype(array, array.dtype)
+                interface = _array_interface(array)
+            elif _is_cupy_array(array):
+                interface = _cuda_array_interface(array)
+            else:
+                raise TypeError(msg)
+
+            return interface

        _check_call(
-            _LIB.XGBoosterBoostOneIter(
+            _LIB.XGBoosterTrainOneIter(
                self.handle,
                dtrain.handle,
-                c_array(ctypes.c_float, grad),
-                c_array(ctypes.c_float, hess),
-                c_bst_ulong(len(grad)),
+                iteration,
+                array_interface(grad),
+                array_interface(hess),
            )
        )

@@ -1988,7 +2151,7 @@ class Booster:
                raise TypeError(f"expected DMatrix, got {type(d[0]).__name__}")
            if not isinstance(d[1], str):
                raise TypeError(f"expected string, got {type(d[1]).__name__}")
-            self._validate_dmatrix_features(d[0])
+            self._assign_dmatrix_features(d[0])

        dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals])
        evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals])
@@ -2040,7 +2203,7 @@ class Booster:
        result: str
            Evaluation result string.
        """
-        self._validate_dmatrix_features(data)
+        self._assign_dmatrix_features(data)
        return self.eval_set([(data, name)], iteration)

    # pylint: disable=too-many-function-args
@@ -2139,7 +2302,8 @@ class Booster:
        if not isinstance(data, DMatrix):
            raise TypeError("Expecting data to be a DMatrix object, got: ", type(data))
        if validate_features:
-            self._validate_dmatrix_features(data)
+            fn = data.feature_names
+            self._validate_features(fn)
        args = {
            "type": 0,
            "training": training,
@@ -2187,20 +2351,25 @@ class Booster:
        base_margin: Any = None,
        strict_shape: bool = False,
    ) -> NumpyOrCupy:
-        """Run prediction in-place, Unlike :py:meth:`predict` method, inplace prediction
-        does not cache the prediction result.
+        """Run prediction in-place when possible. Unlike the :py:meth:`predict` method,
+        inplace prediction does not cache the prediction result.

        Calling only ``inplace_predict`` in multiple threads is safe and lock
        free.  But the safety does not hold when used in conjunction with other
        methods. E.g. you can't train the booster in one thread and perform
        prediction in the other.

+        .. note::
+
+            If the device ordinal of the input data doesn't match the one configured for
+            the booster, data will be copied to the booster device.
+
        .. code-block:: python

-            booster.set_param({"predictor": "gpu_predictor"})
+            booster.set_param({"device": "cuda:0"})
            booster.inplace_predict(cupy_array)

-            booster.set_param({"predictor": "cpu_predictor"})
+            booster.set_param({"device": "cpu"})
            booster.inplace_predict(numpy_array)

        .. versionadded:: 1.1.0
@@ -2208,9 +2377,7 @@ class Booster:
        Parameters
        ----------
        data :
-            The input data, must not be a view for numpy array.  Set
-            ``predictor`` to ``gpu_predictor`` for running prediction on CuPy
-            array or CuDF DataFrame.
+            The input data.
        iteration_range :
            See :py:meth:`predict` for details.
        predict_type :
@@ -2233,8 +2400,8 @@ class Booster:
        Returns
        -------
        prediction : numpy.ndarray/cupy.ndarray
-            The prediction result.  When input data is on GPU, prediction
-            result is stored in a cupy array.
+            The prediction result.  When input data is on GPU, prediction result is
+            stored in a cupy array.

        """
        preds = ctypes.POINTER(ctypes.c_float)()
@@ -2426,8 +2593,7 @@ class Booster:
        return ctypes2buffer(cptr, length.value)

    def load_model(self, fname: ModelIn) -> None:
-        """Load the model from a file or bytearray. Path to file can be local
-        or as an URI.
+        """Load the model from a file or a bytearray.

        The model is loaded from XGBoost format which is universal among the various
        XGBoost interfaces. Auxiliary attributes of the Python Booster object (such as
@@ -2460,10 +2626,35 @@ class Booster:
        else:
            raise TypeError("Unknown file type: ", fname)

-        if self.attr("best_iteration") is not None:
-            self.best_iteration = int(cast(int, self.attr("best_iteration")))
-        if self.attr("best_score") is not None:
-            self.best_score = float(cast(float, self.attr("best_score")))
+    @property
+    def best_iteration(self) -> int:
+        """The best iteration during training."""
+        best = self.attr("best_iteration")
+        if best is not None:
+            return int(best)
+
+        raise AttributeError(
+            "`best_iteration` is only defined when early stopping is used."
+        )
+
+    @best_iteration.setter
+    def best_iteration(self, iteration: int) -> None:
+        self.set_attr(best_iteration=iteration)
+
+    @property
+    def best_score(self) -> float:
+        """The best evaluation score during training."""
+        best = self.attr("best_score")
+        if best is not None:
+            return float(best)
+
+        raise AttributeError(
+            "`best_score` is only defined when early stopping is used."
+        )
+
+    @best_score.setter
+    def best_score(self, score: int) -> None:
+        self.set_attr(best_score=score)

    def num_boosted_rounds(self) -> int:
        """Get number of boosted rounds.  For gblinear this is reset to 0 after
@@ -2761,14 +2952,13 @@ class Booster:
        # pylint: disable=no-member
        return df.sort(["Tree", "Node"]).reset_index(drop=True)

-    def _validate_dmatrix_features(self, data: DMatrix) -> None:
+    def _assign_dmatrix_features(self, data: DMatrix) -> None:
        if data.num_row() == 0:
            return

        fn = data.feature_names
        ft = data.feature_types
-        # Be consistent with versions before 1.7, "validate" actually modifies the
-        # booster.
+
        if self.feature_names is None:
            self.feature_names = fn
        if self.feature_types is None:
--- a/python-package/xgboost/dask.py
+++ b/python-package/xgboost/dask.py
@@ -47,6 +47,7 @@ from typing import (
    Callable,
    Dict,
    Generator,
+    Iterable,
    List,
    Optional,
    Sequence,
@@ -70,6 +71,7 @@ from .core import (
    Metric,
    Objective,
    QuantileDMatrix,
+    _check_distributed_params,
    _deprecate_positional_args,
    _expect,
 )
@@ -82,6 +84,7 @@ from .sklearn import (
    XGBRanker,
    XGBRankerMixIn,
    XGBRegressorBase,
+    _can_use_qdm,
    _check_rf_callback,
    _cls_predict_proba,
    _objective_decorator,
@@ -95,10 +98,12 @@ if TYPE_CHECKING:
    import dask
    import distributed
    from dask import array as da
+    from dask import bag as db
    from dask import dataframe as dd
 else:
    dd = LazyLoader("dd", globals(), "dask.dataframe")
    da = LazyLoader("da", globals(), "dask.array")
+    db = LazyLoader("db", globals(), "dask.bag")
    dask = LazyLoader("dask", globals(), "dask")
    distributed = LazyLoader("distributed", globals(), "dask.distributed")

@@ -507,12 +512,10 @@ async def map_worker_partitions(
    func: Callable[..., _MapRetT],
    *refs: Any,
    workers: Sequence[str],
-) -> List[_MapRetT]:
+) -> _MapRetT:
    """Map a function onto partitions of each worker."""
    # Note for function purity:
-    # XGBoost is deterministic in most of the cases, which means train function is
-    # supposed to be idempotent.  One known exception is gblinear with shotgun updater.
-    # We haven't been able to do a full verification so here we keep pure to be False.
+    # XGBoost is sensitive to data partitioning and uses a random number generator.
    client = _xgb_get_client(client)
    futures = []
    for addr in workers:
@@ -524,11 +527,26 @@ async def map_worker_partitions(
            else:
                args.append(ref)
        fut = client.submit(
-            func, *args, pure=False, workers=[addr], allow_other_workers=False
+            # turn result into a list for bag construction
+            lambda *args, **kwargs: [func(*args, **kwargs)],
+            *args,
+            pure=False,
+            workers=[addr],
+            allow_other_workers=False,
        )
        futures.append(fut)
-    results = await client.gather(futures)
-    return results
+
+    def first_valid(results: Iterable[Optional[_MapRetT]]) -> Optional[_MapRetT]:
+        for v in results:
+            if v is not None:
+                return v
+        return None
+
+    bag = db.from_delayed(futures)
+    fut = await bag.reduction(first_valid, first_valid)
+    result = await client.compute(fut).result()
+
+    return result


 _DataParts = List[Dict[str, Any]]
@@ -617,14 +635,7 @@ class DaskPartitionIter(DataIter):  # pylint: disable=R0902
        if self._iter == len(self._data):
            # Return 0 when there's no more batch.
            return 0
-        feature_names: Optional[FeatureNames] = None
-        if self._feature_names:
-            feature_names = self._feature_names
-        else:
-            if hasattr(self.data(), "columns"):
-                feature_names = self.data().columns.format()
-            else:
-                feature_names = None
+
        input_data(
            data=self.data(),
            label=self._get("_label"),
@@ -634,7 +645,7 @@ class DaskPartitionIter(DataIter):  # pylint: disable=R0902
            base_margin=self._get("_base_margin"),
            label_lower_bound=self._get("_label_lower_bound"),
            label_upper_bound=self._get("_label_upper_bound"),
-            feature_names=feature_names,
+            feature_names=self._feature_names,
            feature_types=self._feature_types,
            feature_weights=self._feature_weights,
        )
@@ -855,8 +866,6 @@ async def _get_rabit_args(
    except Exception:  # pylint: disable=broad-except
        sched_addr = None

-    # make sure all workers are online so that we can obtain reliable scheduler_info
-    await client.wait_for_workers(n_workers)  # type: ignore
    env = await client.run_on_scheduler(
        _start_tracker, n_workers, sched_addr, user_addr
    )
@@ -889,27 +898,14 @@ def _get_workers_from_data(
    return list(X_worker_map)


-def _filter_empty(
-    booster: Booster, local_history: TrainingCallback.EvalsLog, is_valid: bool
-) -> Optional[TrainReturnT]:
-    n_workers = collective.get_world_size()
-    non_empty = numpy.zeros(shape=(n_workers,), dtype=numpy.int32)
-    rank = collective.get_rank()
-    non_empty[rank] = int(is_valid)
-    non_empty = collective.allreduce(non_empty, collective.Op.SUM)
-    non_empty = non_empty.astype(bool)
-    ret: Optional[TrainReturnT] = {
-        "booster": booster,
-        "history": local_history,
-    }
-    for i in range(non_empty.size):
-        # This is the first valid worker
-        if non_empty[i] and i == rank:
-            return ret
-        if non_empty[i]:
-            return None
-
-    raise ValueError("None of the workers can provide a valid result.")
+async def _check_workers_are_alive(
+    workers: List[str], client: "distributed.Client"
+) -> None:
+    info = await client.scheduler.identity()
+    current_workers = info["workers"].keys()
+    missing_workers = set(workers) - current_workers
+    if missing_workers:
+        raise RuntimeError(f"Missing required workers: {missing_workers}")


 async def _train_async(
@@ -929,12 +925,9 @@ async def _train_async(
    custom_metric: Optional[Metric],
 ) -> Optional[TrainReturnT]:
    workers = _get_workers_from_data(dtrain, evals)
+    await _check_workers_are_alive(workers, client)
    _rabit_args = await _get_rabit_args(len(workers), dconfig, client)
-
-    if params.get("booster", None) == "gblinear":
-        raise NotImplementedError(
-            f"booster `{params['booster']}` is not yet supported for dask."
-        )
+    _check_distributed_params(params)

    def dispatched_train(
        parameters: Dict,
@@ -997,10 +990,17 @@ async def _train_async(
                xgb_model=xgb_model,
                callbacks=callbacks,
            )
-            # Don't return the boosters from empty workers. It's quite difficult to
-            # guarantee everything is in sync in the present of empty workers,
-            # especially with complex objectives like quantile.
-            return _filter_empty(booster, local_history, Xy.num_row() != 0)
+        # Don't return the boosters from empty workers. It's quite difficult to
+        # guarantee everything is in sync in the presence of empty workers, especially
+        # with complex objectives like quantile.
+        if Xy.num_row() != 0:
+            ret: Optional[TrainReturnT] = {
+                "booster": booster,
+                "history": local_history,
+            }
+        else:
+            ret = None
+        return ret

    async with distributed.MultiLock(workers, client):
        if evals is not None:
@@ -1012,7 +1012,7 @@ async def _train_async(
            evals_name = []
            evals_id = []

-        results = await map_worker_partitions(
+        result = await map_worker_partitions(
            client,
            dispatched_train,
            # extra function parameters
@@ -1025,7 +1025,7 @@ async def _train_async(
            # workers to be used for training
            workers=workers,
        )
-        return list(filter(lambda ret: ret is not None, results))[0]
+        return result


@_deprecate_positional_args
@@ -1574,7 +1574,7 @@ async def _async_wrap_evaluation_matrices(
    """A switch function for async environment."""

    def _dispatch(ref: Optional[DaskDMatrix], **kwargs: Any) -> DaskDMatrix:
-        if tree_method in ("hist", "gpu_hist"):
+        if _can_use_qdm(tree_method):
            return DaskQuantileDMatrix(
                client=client, ref=ref, max_bin=max_bin, **kwargs
            )
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -5,7 +5,7 @@ import ctypes
 import json
 import os
 import warnings
-from typing import Any, Callable, Iterator, List, Optional, Sequence, Tuple, Union, cast
+from typing import Any, Callable, Iterator, List, Optional, Sequence, Tuple, cast

 import numpy as np

@@ -17,6 +17,7 @@ from ._typing import (
    FloatCompatible,
    NumpyDType,
    PandasDType,
+    TransformedData,
    c_bst_ulong,
 )
 from .compat import DataFrame, lazy_isinstance
@@ -197,6 +198,7 @@ def _from_numpy_array(
    nthread: int,
    feature_names: Optional[FeatureNames],
    feature_types: Optional[FeatureTypes],
+    data_split_mode: DataSplitMode = DataSplitMode.ROW,
 ) -> DispatchedDataBackendReturnType:
    """Initialize data from a 2-D numpy matrix."""
    _check_data_shape(data)
@@ -205,7 +207,11 @@ def _from_numpy_array(
    _check_call(
        _LIB.XGDMatrixCreateFromDense(
            _array_interface(data),
-            make_jcargs(missing=float(missing), nthread=int(nthread)),
+            make_jcargs(
+                missing=float(missing),
+                nthread=int(nthread),
+                data_split_mode=int(data_split_mode),
+            ),
            ctypes.byref(handle),
        )
    )
@@ -1046,7 +1052,9 @@ def dispatch_data_backend(
            data.tocsr(), missing, threads, feature_names, feature_types
        )
    if _is_numpy_array(data):
-        return _from_numpy_array(data, missing, threads, feature_names, feature_types)
+        return _from_numpy_array(
+            data, missing, threads, feature_names, feature_types, data_split_mode
+        )
    if _is_uri(data):
        return _from_uri(data, missing, feature_names, feature_types, data_split_mode)
    if _is_list(data):
@@ -1261,12 +1269,7 @@ def _proxy_transform(
    feature_names: Optional[FeatureNames],
    feature_types: Optional[FeatureTypes],
    enable_categorical: bool,
-) -> Tuple[
-    Union[bool, ctypes.c_void_p, np.ndarray],
-    Optional[list],
-    Optional[FeatureNames],
-    Optional[FeatureTypes],
-]:
+) -> TransformedData:
    if _is_cudf_df(data) or _is_cudf_ser(data):
        return _transform_cudf_df(
            data, feature_names, feature_types, enable_categorical
--- a/python-package/xgboost/libpath.py
+++ b/python-package/xgboost/libpath.py
@@ -27,7 +27,7 @@ def find_lib_path() -> List[str]:
        os.path.join(curr_path, os.path.pardir, os.path.pardir, "lib"),
        # use libxgboost from a system prefix, if available.  This should be the last
        # option.
-        os.path.join(sys.prefix, "lib"),
+        os.path.join(sys.base_prefix, "lib"),
    ]

    if sys.platform == "win32":
@@ -62,8 +62,8 @@ def find_lib_path() -> List[str]:
            + ("\n- ".join(dll_path))
            + "\nXGBoost Python package path: "
            + curr_path
-            + "\nsys.prefix: "
-            + sys.prefix
+            + "\nsys.base_prefix: "
+            + sys.base_prefix
            + "\nSee: "
            + link
            + " for installing XGBoost."
--- a/python-package/xgboost/rabit.py
+++ b/python-package/xgboost/rabit.py
@@ -1,169 +0,0 @@
-"""Compatibility shim for xgboost.rabit; to be removed in 2.0"""
-import logging
-import warnings
-from enum import IntEnum, unique
-from typing import Any, Callable, List, Optional, TypeVar
-
-import numpy as np
-
-from . import collective
-
-LOGGER = logging.getLogger("[xgboost.rabit]")
-
-
-def _deprecation_warning() -> str:
-    return (
-        "The xgboost.rabit submodule is marked as deprecated in 1.7 and will be removed "
-        "in 2.0. Please use xgboost.collective instead."
-    )
-
-
-def init(args: Optional[List[bytes]] = None) -> None:
-    """Initialize the rabit library with arguments"""
-    warnings.warn(_deprecation_warning(), FutureWarning)
-    parsed = {}
-    if args:
-        for arg in args:
-            kv = arg.decode().split("=")
-            if len(kv) == 2:
-                parsed[kv[0]] = kv[1]
-    collective.init(**parsed)
-
-
-def finalize() -> None:
-    """Finalize the process, notify tracker everything is done."""
-    collective.finalize()
-
-
-def get_rank() -> int:
-    """Get rank of current process.
-    Returns
-    -------
-    rank : int
-        Rank of current process.
-    """
-    return collective.get_rank()
-
-
-def get_world_size() -> int:
-    """Get total number workers.
-    Returns
-    -------
-    n : int
-        Total number of process.
-    """
-    return collective.get_world_size()
-
-
-def is_distributed() -> int:
-    """If rabit is distributed."""
-    return collective.is_distributed()
-
-
-def tracker_print(msg: Any) -> None:
-    """Print message to the tracker.
-    This function can be used to communicate the information of
-    the progress to the tracker
-    Parameters
-    ----------
-    msg : str
-        The message to be printed to tracker.
-    """
-    collective.communicator_print(msg)
-
-
-def get_processor_name() -> bytes:
-    """Get the processor name.
-    Returns
-    -------
-    name : str
-        the name of processor(host)
-    """
-    return collective.get_processor_name().encode()
-
-
-T = TypeVar("T")  # pylint:disable=invalid-name
-
-
-def broadcast(data: T, root: int) -> T:
-    """Broadcast object from one node to all other nodes.
-    Parameters
-    ----------
-    data : any type that can be pickled
-        Input data, if current rank does not equal root, this can be None
-    root : int
-        Rank of the node to broadcast data from.
-    Returns
-    -------
-    object : int
-        the result of broadcast.
-    """
-    return collective.broadcast(data, root)
-
-
-@unique
-class Op(IntEnum):
-    """Supported operations for rabit."""
-
-    MAX = 0
-    MIN = 1
-    SUM = 2
-    OR = 3
-
-
-def allreduce(  # pylint:disable=invalid-name
-    data: np.ndarray, op: Op, prepare_fun: Optional[Callable[[np.ndarray], None]] = None
-) -> np.ndarray:
-    """Perform allreduce, return the result.
-    Parameters
-    ----------
-    data :
-        Input data.
-    op :
-        Reduction operators, can be MIN, MAX, SUM, BITOR
-    prepare_fun :
-        Lazy preprocessing function, if it is not None, prepare_fun(data)
-        will be called by the function before performing allreduce, to initialize the data
-        If the result of Allreduce can be recovered directly,
-        then prepare_fun will NOT be called
-    Returns
-    -------
-    result :
-        The result of allreduce, have same shape as data
-    Notes
-    -----
-    This function is not thread-safe.
-    """
-    if prepare_fun is None:
-        return collective.allreduce(data, collective.Op(op))
-    raise ValueError("preprocessing function is no longer supported")
-
-
-def version_number() -> int:
-    """Returns version number of current stored model.
-    This means how many calls to CheckPoint we made so far.
-    Returns
-    -------
-    version : int
-        Version number of currently stored model
-    """
-    return 0
-
-
-class RabitContext:
-    """A context controlling rabit initialization and finalization."""
-
-    def __init__(self, args: Optional[List[bytes]] = None) -> None:
-        if args is None:
-            args = []
-        self.args = args
-
-    def __enter__(self) -> None:
-        init(self.args)
-        assert is_distributed()
-        LOGGER.warning(_deprecation_warning())
-        LOGGER.debug("-------------- rabit say hello ------------------")
-
-    def __exit__(self, *args: List) -> None:
-        finalize()
-        LOGGER.debug("--------------- rabit say bye ------------------")
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -76,6 +76,10 @@ def _check_rf_callback(
        )


+def _can_use_qdm(tree_method: Optional[str]) -> bool:
+    return tree_method in ("hist", "gpu_hist", None, "auto")
+
+
 SklObjective = Optional[
    Union[str, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]]
 ]
@@ -226,10 +230,10 @@ __model_doc = f"""
    subsample : Optional[float]
        Subsample ratio of the training instance.
    sampling_method :
-        Sampling method. Used only by `gpu_hist` tree method.
-          - `uniform`: select random training instances uniformly.
-          - `gradient_based` select random training instances with higher probability when
-            the gradient and hessian are larger. (cf. CatBoost)
+        Sampling method. Used only by the GPU version of ``hist`` tree method.
+          - ``uniform``: select random training instances uniformly.
+          - ``gradient_based``: select random training instances with higher probability
+            when the gradient and hessian are larger. (cf. CatBoost)
    colsample_bytree : Optional[float]
        Subsample ratio of columns when constructing each tree.
    colsample_bylevel : Optional[float]
@@ -273,13 +277,16 @@ __model_doc = f"""
        * For linear model, only "weight" is defined and it's the normalized coefficients
          without bias.

-    gpu_id : Optional[int]
-        Device ordinal.
+    device : Optional[str]
+
+        .. versionadded:: 2.0.0
+
+        Device ordinal, available options are `cpu`, `cuda`, and `gpu`.
+
    validate_parameters : Optional[bool]
+
        Give warnings for unknown parameter.
-    predictor : Optional[str]
-        Force XGBoost to use specific predictor, available choices are [cpu_predictor,
-        gpu_predictor].
+
    enable_categorical : bool

        .. versionadded:: 1.5.0
@@ -381,17 +388,21 @@ __model_doc = f"""
          every **early_stopping_rounds** round(s) to continue training.  Requires at
          least one item in **eval_set** in :py:meth:`fit`.

-        - The method returns the model from the last iteration, not the best one, use a
-          callback :py:class:`xgboost.callback.EarlyStopping` if returning the best
-          model is preferred.
+        - If early stopping occurs, the model will have two additional attributes:
+          :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the
+          :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal
+          number of trees during inference. If users want to access the full model
+          (including trees built after early stopping), they can specify the
+          `iteration_range` in these inference methods. In addition, other utilities
+          like model plotting can also use the entire model.
+
+        - If you prefer to discard the trees after `best_iteration`, consider using the
+          callback function :py:class:`xgboost.callback.EarlyStopping`.

        - If there's more than one item in **eval_set**, the last entry will be used for
          early stopping.  If there's more than one metric in **eval_metric**, the last
          metric will be used for early stopping.

-        - If early stopping occurs, the model will have three additional fields:
-          :py:attr:`best_score`, :py:attr:`best_iteration`.
-
        .. note::

            This parameter replaces `early_stopping_rounds` in :py:meth:`fit` method.
@@ -646,9 +657,8 @@ class XGBModel(XGBModelBase):
        monotone_constraints: Optional[Union[Dict[str, int], str]] = None,
        interaction_constraints: Optional[Union[str, Sequence[Sequence[str]]]] = None,
        importance_type: Optional[str] = None,
-        gpu_id: Optional[int] = None,
+        device: Optional[str] = None,
        validate_parameters: Optional[bool] = None,
-        predictor: Optional[str] = None,
        enable_categorical: bool = False,
        feature_types: Optional[FeatureTypes] = None,
        max_cat_to_onehot: Optional[int] = None,
@@ -693,9 +703,8 @@ class XGBModel(XGBModelBase):
        self.monotone_constraints = monotone_constraints
        self.interaction_constraints = interaction_constraints
        self.importance_type = importance_type
-        self.gpu_id = gpu_id
+        self.device = device
        self.validate_parameters = validate_parameters
-        self.predictor = predictor
        self.enable_categorical = enable_categorical
        self.feature_types = feature_types
        self.max_cat_to_onehot = max_cat_to_onehot
@@ -931,8 +940,7 @@ class XGBModel(XGBModelBase):
        callbacks = self.callbacks if self.callbacks is not None else callbacks

        tree_method = params.get("tree_method", None)
-        cat_support = {"gpu_hist", "approx", "hist"}
-        if self.enable_categorical and tree_method not in cat_support:
+        if self.enable_categorical and tree_method == "exact":
            raise ValueError(
                "Experimental support for categorical data is not implemented for"
                " current tree method yet."
@@ -941,7 +949,7 @@ class XGBModel(XGBModelBase):

    def _create_dmatrix(self, ref: Optional[DMatrix], **kwargs: Any) -> DMatrix:
        # Use `QuantileDMatrix` to save memory.
-        if self.tree_method in ("hist", "gpu_hist"):
+        if _can_use_qdm(self.tree_method) and self.booster != "gblinear":
            try:
                return QuantileDMatrix(
                    **kwargs, ref=ref, nthread=self.n_jobs, max_bin=self.max_bin
@@ -984,12 +992,12 @@ class XGBModel(XGBModelBase):
        X :
            Feature matrix. See :ref:`py-data` for a list of supported types.

-            When the ``tree_method`` is set to ``hist`` or ``gpu_hist``, internally, the
+            When the ``tree_method`` is set to ``hist``, internally, the
            :py:class:`QuantileDMatrix` will be used instead of the :py:class:`DMatrix`
            for conserving memory. However, this has performance implications when the
            device of input data is not matched with algorithm. For instance, if the
-            input is a numpy array on CPU but ``gpu_hist`` is used for training, then
-            the data is first processed on CPU then transferred to GPU.
+            input is a numpy array on CPU but ``cuda`` is used for training, then the
+            data is first processed on CPU then transferred to GPU.
        y :
            Labels
        sample_weight :
@@ -1002,13 +1010,17 @@ class XGBModel(XGBModelBase):
            Validation metrics will help us track the performance of the model.

        eval_metric : str, list of str, or callable, optional
+
            .. deprecated:: 1.6.0
-                Use `eval_metric` in :py:meth:`__init__` or :py:meth:`set_params` instead.
+
+            Use `eval_metric` in :py:meth:`__init__` or :py:meth:`set_params` instead.

        early_stopping_rounds : int
+
            .. deprecated:: 1.6.0
-                Use `early_stopping_rounds` in :py:meth:`__init__` or
-                :py:meth:`set_params` instead.
+
+            Use `early_stopping_rounds` in :py:meth:`__init__` or :py:meth:`set_params`
+            instead.
        verbose :
            If `verbose` is True and an evaluation set is used, the evaluation metric
            measured on the validation set is printed to stdout at each boosting stage.
@@ -1089,12 +1101,7 @@ class XGBModel(XGBModelBase):
            return self

    def _can_use_inplace_predict(self) -> bool:
-        # When predictor is explicitly set, using `inplace_predict` might result into
-        # error with incompatible data type.
-        # Inplace predict doesn't handle as many data types as DMatrix, but it's
-        # sufficient for dask interface where input is simpiler.
-        predictor = self.get_xgb_params().get("predictor", None)
-        if predictor in ("auto", None) and self.booster != "gblinear":
+        if self.booster != "gblinear":
            return True
        return False

@@ -1120,9 +1127,9 @@ class XGBModel(XGBModelBase):
        iteration_range: Optional[Tuple[int, int]] = None,
    ) -> ArrayLike:
        """Predict with `X`.  If the model is trained with early stopping, then
-        :py:attr:`best_iteration` is used automatically.  For tree models, when data is
-        on GPU, like cupy array or cuDF dataframe and `predictor` is not specified, the
-        prediction is run on GPU automatically, otherwise it will run on CPU.
+        :py:attr:`best_iteration` is used automatically. The estimator uses
+        `inplace_predict` by default and falls back to using :py:class:`DMatrix` if
+        devices between the data and the estimator don't match.

        .. note:: This function is only thread safe for `gbtree` and `dart`.

@@ -1272,19 +1279,10 @@ class XGBModel(XGBModelBase):
            )
        return np.array(feature_names)

-    def _early_stopping_attr(self, attr: str) -> Union[float, int]:
-        booster = self.get_booster()
-        try:
-            return getattr(booster, attr)
-        except AttributeError as e:
-            raise AttributeError(
-                f"`{attr}` in only defined when early stopping is used."
-            ) from e
-
    @property
    def best_score(self) -> float:
        """The best score obtained by early stopping."""
-        return float(self._early_stopping_attr("best_score"))
+        return self.get_booster().best_score

    @property
    def best_iteration(self) -> int:
@@ -1292,7 +1290,7 @@ class XGBModel(XGBModelBase):
        for instance if the best iteration is the first round, then best_iteration is 0.

        """
-        return int(self._early_stopping_attr("best_iteration"))
+        return self.get_booster().best_iteration

    @property
    def feature_importances_(self) -> np.ndarray:
@@ -1361,25 +1359,25 @@ class XGBModel(XGBModelBase):

    @property
    def intercept_(self) -> np.ndarray:
-        """
-        Intercept (bias) property
+        """Intercept (bias) property

-        .. note:: Intercept is defined only for linear learners
-
-            Intercept (bias) is only defined when the linear model is chosen as base
-            learner (`booster=gblinear`). It is not defined for other base learner types,
-            such as tree learners (`booster=gbtree`).
+        For tree-based models, the returned value is the `base_score`.

        Returns
        -------
        intercept_ : array of shape ``(1,)`` or ``[n_classes]``
+
        """
-        if self.get_xgb_params()["booster"] != "gblinear":
-            raise AttributeError(
-                f"Intercept (bias) is not defined for Booster type {self.booster}"
-            )
+        booster_config = self.get_xgb_params()["booster"]
        b = self.get_booster()
-        return np.array(json.loads(b.get_dump(dump_format="json")[0])["bias"])
+        if booster_config != "gblinear":  # gbtree, dart
+            config = json.loads(b.save_config())
+            intercept = config["learner"]["learner_model_param"]["base_score"]
+            return np.array([float(intercept)], dtype=np.float32)
+
+        return np.array(
+            json.loads(b.get_dump(dump_format="json")[0])["bias"], dtype=np.float32
+        )


 PredtT = TypeVar("PredtT", bound=np.ndarray)
@@ -1584,7 +1582,9 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
    ) -> np.ndarray:
        """Predict the probability of each `X` example being of a given class. If the
        model is trained with early stopping, then :py:attr:`best_iteration` is used
-        automatically.
+        automatically. The estimator uses `inplace_predict` by default and falls back to
+        using :py:class:`DMatrix` if devices between the data and the estimator don't
+        match.

        .. note:: This function is only thread safe for `gbtree` and `dart`.

@@ -1917,12 +1917,12 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
            | 1   | :math:`x_{20}` | :math:`x_{21}` |
            +-----+----------------+----------------+

-            When the ``tree_method`` is set to ``hist`` or ``gpu_hist``, internally, the
+            When the ``tree_method`` is set to ``hist``, internally, the
            :py:class:`QuantileDMatrix` will be used instead of the :py:class:`DMatrix`
            for conserving memory. However, this has performance implications when the
            device of input data is not matched with algorithm. For instance, if the
-            input is a numpy array on CPU but ``gpu_hist`` is used for training, then
-            the data is first processed on CPU then transferred to GPU.
+            input is a numpy array on CPU but ``cuda`` is used for training, then the
+            data is first processed on CPU then transferred to GPU.
        y :
            Labels
        group :
--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@@ -1,4 +1,4 @@
-"""Xgboost pyspark integration submodule for core code."""
+"""XGBoost pyspark integration submodule for core code."""
 import base64

 # pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name
@@ -60,10 +60,11 @@ from scipy.special import expit, softmax  # pylint: disable=no-name-in-module
 import xgboost
 from xgboost import XGBClassifier
 from xgboost.compat import is_cudf_available
-from xgboost.core import Booster
-from xgboost.sklearn import DEFAULT_N_ESTIMATORS, XGBModel
+from xgboost.core import Booster, _check_distributed_params
+from xgboost.sklearn import DEFAULT_N_ESTIMATORS, XGBModel, _can_use_qdm
 from xgboost.training import train as worker_train

+from .._typing import ArrayLike
 from .data import (
    _read_csr_matrix_from_unwrapped_spark_vec,
    alias,
@@ -92,6 +93,7 @@ from .utils import (
    get_class_name,
    get_logger,
    serialize_booster,
+    use_cuda,
 )

 # Put pyspark specific params here, they won't be passed to XGBoost.
@@ -108,13 +110,13 @@ _pyspark_specific_params = [
    "arbitrary_params_dict",
    "force_repartition",
    "num_workers",
-    "use_gpu",
    "feature_names",
    "features_cols",
    "enable_sparse_data_optim",
    "qid_col",
    "repartition_random_shuffle",
    "pred_contrib_col",
+    "use_gpu",
 ]

 _non_booster_params = ["missing", "n_estimators", "feature_types", "feature_weights"]
@@ -132,7 +134,7 @@ _pyspark_param_alias_map = {
 _inverse_pyspark_param_alias_map = {v: k for k, v in _pyspark_param_alias_map.items()}

 _unsupported_xgb_params = [
-    "gpu_id",  # we have "use_gpu" pyspark param instead.
+    "gpu_id",  # we have "device" pyspark param instead.
    "enable_categorical",  # Use feature_types param to specify categorical feature instead
    "use_label_encoder",
    "n_jobs",  # Do not allow user to set it, will use `spark.task.cpus` value instead.
@@ -197,11 +199,24 @@ class _SparkXGBParams(
        "The number of XGBoost workers. Each XGBoost worker corresponds to one spark task.",
        TypeConverters.toInt,
    )
+    device = Param(
+        Params._dummy(),
+        "device",
+        (
+            "The device type for XGBoost executors. Available options are `cpu`, "
+            "`cuda` and `gpu`. Set `device` to `cuda` or `gpu` if the executors are "
+            "running on GPU instances. Currently, only one GPU per task is supported."
+        ),
+        TypeConverters.toString,
+    )
    use_gpu = Param(
        Params._dummy(),
        "use_gpu",
-        "A boolean variable. Set use_gpu=true if the executors "
-        + "are running on GPU instances. Currently, only one GPU per task is supported.",
+        (
+            "Deprecated, use `device` instead. A boolean variable. Set use_gpu=true "
+            "if the executors are running on GPU instances. Currently, only one GPU per"
+            " task is supported."
+        ),
        TypeConverters.toBoolean,
    )
    force_repartition = Param(
@@ -335,10 +350,18 @@ class _SparkXGBParams(
                f"It cannot be less than 1 [Default is 1]"
            )

+        tree_method = self.getOrDefault(self.getParam("tree_method"))
+        if tree_method == "exact":
+            raise ValueError(
+                "The `exact` tree method is not supported for distributed systems."
+            )
+
        if self.getOrDefault(self.features_cols):
-            if not self.getOrDefault(self.use_gpu):
+            if not use_cuda(self.getOrDefault(self.device)) and not self.getOrDefault(
+                self.use_gpu
+            ):
                raise ValueError(
-                    "features_col param with list value requires enabling use_gpu."
+                    "features_col param with list value requires `device=cuda`."
                )

        if self.getOrDefault("objective") is not None:
@@ -391,17 +414,7 @@ class _SparkXGBParams(
                    "`pyspark.ml.linalg.Vector` type."
                )

-        if self.getOrDefault(self.use_gpu):
-            tree_method = self.getParam("tree_method")
-            if (
-                self.getOrDefault(tree_method) is not None
-                and self.getOrDefault(tree_method) != "gpu_hist"
-            ):
-                raise ValueError(
-                    f"tree_method should be 'gpu_hist' or None when use_gpu is True,"
-                    f"found {self.getOrDefault(tree_method)}."
-                )
-
+        if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu):
            gpu_per_task = (
                _get_spark_session()
                .sparkContext.getConf()
@@ -412,35 +425,41 @@ class _SparkXGBParams(

            if is_local:
                # checking spark local mode.
-                if gpu_per_task:
+                if gpu_per_task is not None:
                    raise RuntimeError(
-                        "The spark cluster does not support gpu configuration for local mode. "
-                        "Please delete spark.executor.resource.gpu.amount and "
+                        "The spark local mode does not support gpu configuration. "
+                        "Please remove spark.executor.resource.gpu.amount and "
                        "spark.task.resource.gpu.amount"
                    )

-                # Support GPU training in Spark local mode is just for debugging purposes,
-                # so it's okay for printing the below warning instead of checking the real
-                # gpu numbers and raising the exception.
+                # GPU training in Spark local mode is supported only for debugging
+                # purposes, so it is fine to print the warning below instead of
+                # checking the actual number of GPUs and raising an exception.
                get_logger(self.__class__.__name__).warning(
-                    "You enabled use_gpu in spark local mode. Please make sure your local node "
-                    "has at least %d GPUs",
+                    "You have enabled GPU in spark local mode. Please make sure your"
+                    " local node has at least %d GPUs",
                    self.getOrDefault(self.num_workers),
                )
            else:
                # checking spark non-local mode.
-                if not gpu_per_task or int(gpu_per_task) < 1:
-                    raise RuntimeError(
-                        "The spark cluster does not have the necessary GPU"
-                        + "configuration for the spark task. Therefore, we cannot"
-                        + "run xgboost training using GPU."
-                    )
+                if gpu_per_task is not None:
+                    if float(gpu_per_task) < 1.0:
+                        raise ValueError(
+                            "XGBoost doesn't support GPU fractional configurations. "
+                            "Please set `spark.task.resource.gpu.amount=spark.executor"
+                            ".resource.gpu.amount`"
+                        )

-                if int(gpu_per_task) > 1:
-                    get_logger(self.__class__.__name__).warning(
-                        "You configured %s GPU cores for each spark task, but in "
-                        "XGBoost training, every Spark task will only use one GPU core.",
-                        gpu_per_task,
+                    if float(gpu_per_task) > 1.0:
+                        get_logger(self.__class__.__name__).warning(
+                            "%s GPUs for each Spark task is configured, but each "
+                            "XGBoost training task uses only 1 GPU.",
+                            gpu_per_task,
+                        )
+                else:
+                    raise ValueError(
+                        "The `spark.task.resource.gpu.amount` is required for training"
+                        " on GPU."
                    )


@@ -557,6 +576,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
        #  they are added in `setParams`.
        self._setDefault(
            num_workers=1,
+            device="cpu",
            use_gpu=False,
            force_repartition=False,
            repartition_random_shuffle=False,
@@ -565,9 +585,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
            arbitrary_params_dict={},
        )

-    def setParams(
-        self, **kwargs: Dict[str, Any]
-    ) -> None:  # pylint: disable=invalid-name
+    def setParams(self, **kwargs: Any) -> None:  # pylint: disable=invalid-name
        """
        Set params for the estimator.
        """
@@ -612,6 +630,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
                    )
                    raise ValueError(err_msg)
                _extra_params[k] = v
+
+        _check_distributed_params(kwargs)
        _existing_extra_params = self.getOrDefault(self.arbitrary_params_dict)
        self._set(arbitrary_params_dict={**_existing_extra_params, **_extra_params})

@@ -708,9 +728,6 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
        # TODO: support "num_parallel_tree" for random forest
        params["num_boost_round"] = self.getOrDefault("n_estimators")

-        if self.getOrDefault(self.use_gpu):
-            params["tree_method"] = "gpu_hist"
-
        return params

    @classmethod
@@ -882,8 +899,9 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
            dmatrix_kwargs,
        ) = self._get_xgb_parameters(dataset)

-        use_gpu = self.getOrDefault(self.use_gpu)
-
+        run_on_gpu = use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(
+            self.use_gpu
+        )
        is_local = _is_local(_get_spark_session().sparkContext)

        num_workers = self.getOrDefault(self.num_workers)
@@ -899,34 +917,30 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):

            context = BarrierTaskContext.get()

-            gpu_id = None
-            use_hist = booster_params.get("tree_method", None) in ("hist", "gpu_hist")
+            dev_ordinal = None
+            use_qdm = _can_use_qdm(booster_params.get("tree_method", None))

-            if use_gpu:
-                gpu_id = context.partitionId() if is_local else _get_gpu_id(context)
-                booster_params["gpu_id"] = gpu_id
+            if run_on_gpu:
+                dev_ordinal = (
+                    context.partitionId() if is_local else _get_gpu_id(context)
+                )
+                booster_params["device"] = "cuda:" + str(dev_ordinal)
                # If cuDF is not installed, then using DMatrix instead of QDM,
                # because without cuDF, DMatrix performs better than QDM.
                # Note: Checking `is_cudf_available` in spark worker side because
                # spark worker might has different python environment with driver side.
-                use_qdm = use_hist and is_cudf_available()
-            else:
-                use_qdm = use_hist
+                use_qdm = use_qdm and is_cudf_available()
+                get_logger("XGBoost-PySpark").info(
+                    "Leveraging %s to train with QDM: %s",
+                    booster_params["device"],
+                    "on" if use_qdm else "off",
+                )

            if use_qdm and (booster_params.get("max_bin", None) is not None):
                dmatrix_kwargs["max_bin"] = booster_params["max_bin"]

            _rabit_args = {}
            if context.partitionId() == 0:
-                get_logger("XGBoostPySpark").debug(
-                    "booster params: %s\n"
-                    "train_call_kwargs_params: %s\n"
-                    "dmatrix_kwargs: %s",
-                    booster_params,
-                    train_call_kwargs_params,
-                    dmatrix_kwargs,
-                )
-
                _rabit_args = _get_rabit_args(context, num_workers)

            worker_message = {
@@ -945,7 +959,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
                dtrain, dvalid = create_dmatrix_from_partitions(
                    pandas_df_iter,
                    feature_prop.features_cols_names,
-                    gpu_id,
+                    dev_ordinal,
                    use_qdm,
                    dmatrix_kwargs,
                    enable_sparse_data_optim=feature_prop.enable_sparse_data_optim,
@@ -983,7 +997,19 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
            )
            return ret[0], ret[1]

+        get_logger("XGBoost-PySpark").info(
+            "Running xgboost-%s on %s workers with"
+            "\n\tbooster params: %s"
+            "\n\ttrain_call_kwargs_params: %s"
+            "\n\tdmatrix_kwargs: %s",
+            xgboost._py_version(),
+            num_workers,
+            booster_params,
+            train_call_kwargs_params,
+            dmatrix_kwargs,
+        )
        (config, booster) = _run_job()
+        get_logger("XGBoost-PySpark").info("Finished xgboost training!")

        result_xgb_model = self._convert_to_sklearn_model(
            bytearray(booster, "utf-8"), config
@@ -1092,12 +1118,86 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
            )
        return features_col, feature_col_names

+    def _get_pred_contrib_col_name(self) -> Optional[str]:
+        """Return the `pred_contrib_col` column name, or None if unset or empty."""
+        pred_contrib_col_name = None
+        if (
+            self.isDefined(self.pred_contrib_col)
+            and self.getOrDefault(self.pred_contrib_col) != ""
+        ):
+            pred_contrib_col_name = self.getOrDefault(self.pred_contrib_col)
+
+        return pred_contrib_col_name
+
+    def _out_schema(self) -> Tuple[bool, str]:
+        """Return a ``(single_pred, schema)`` pair: whether the UDF yields a single
+        prediction column (True) or a struct, and the UDF's return type. The schema
+        must be a DDL-formatted type string."""
+
+        if self._get_pred_contrib_col_name() is not None:
+            return False, f"{pred.prediction} double, {pred.pred_contrib} array<double>"
+
+        return True, "double"
+
+    def _get_predict_func(self) -> Callable:
+        """Return the prediction function that will run on the executor side."""
+
+        predict_params = self._gen_predict_params_dict()
+        pred_contrib_col_name = self._get_pred_contrib_col_name()
+
+        def _predict(
+            model: XGBModel, X: ArrayLike, base_margin: Optional[ArrayLike]
+        ) -> Union[pd.DataFrame, pd.Series]:
+            data = {}
+            preds = model.predict(
+                X,
+                base_margin=base_margin,
+                validate_features=False,
+                **predict_params,
+            )
+            data[pred.prediction] = pd.Series(preds)
+
+            if pred_contrib_col_name is not None:
+                contribs = pred_contribs(model, X, base_margin)
+                data[pred.pred_contrib] = pd.Series(list(contribs))
+                return pd.DataFrame(data=data)
+
+            return data[pred.prediction]
+
+        return _predict
+
+    def _post_transform(self, dataset: DataFrame, pred_col: Column) -> DataFrame:
+        """Post process of transform"""
+        prediction_col_name = self.getOrDefault(self.predictionCol)
+        single_pred, _ = self._out_schema()
+
+        if single_pred:
+            if prediction_col_name:
+                dataset = dataset.withColumn(prediction_col_name, pred_col)
+        else:
+            pred_struct_col = "_prediction_struct"
+            dataset = dataset.withColumn(pred_struct_col, pred_col)
+
+            if prediction_col_name:
+                dataset = dataset.withColumn(
+                    prediction_col_name, getattr(col(pred_struct_col), pred.prediction)
+                )
+
+            pred_contrib_col_name = self._get_pred_contrib_col_name()
+            if pred_contrib_col_name is not None:
+                dataset = dataset.withColumn(
+                    pred_contrib_col_name,
+                    array_to_vector(getattr(col(pred_struct_col), pred.pred_contrib)),
+                )
+
+            dataset = dataset.drop(pred_struct_col)
+        return dataset
+
    def _transform(self, dataset: DataFrame) -> DataFrame:
        # pylint: disable=too-many-statements, too-many-locals
        # Save xgb_sklearn_model and predict_params to be local variable
        # to avoid the `self` object to be pickled to remote.
        xgb_sklearn_model = self._xgb_sklearn_model
-        predict_params = self._gen_predict_params_dict()

        has_base_margin = False
        if (
@@ -1112,18 +1212,9 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
        features_col, feature_col_names = self._get_feature_col(dataset)
        enable_sparse_data_optim = self.getOrDefault(self.enable_sparse_data_optim)

-        pred_contrib_col_name = None
-        if (
-            self.isDefined(self.pred_contrib_col)
-            and self.getOrDefault(self.pred_contrib_col) != ""
-        ):
-            pred_contrib_col_name = self.getOrDefault(self.pred_contrib_col)
+        predict_func = self._get_predict_func()

-        single_pred = True
-        schema = "double"
-        if pred_contrib_col_name:
-            single_pred = False
-            schema = f"{pred.prediction} double, {pred.pred_contrib} array<double>"
+        _, schema = self._out_schema()

        @pandas_udf(schema)  # type: ignore
        def predict_udf(iterator: Iterator[pd.DataFrame]) -> Iterator[pd.Series]:
@@ -1143,48 +1234,14 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable):
                else:
                    base_margin = None

-                data = {}
-                preds = model.predict(
-                    X,
-                    base_margin=base_margin,
-                    validate_features=False,
-                    **predict_params,
-                )
-                data[pred.prediction] = pd.Series(preds)
-
-                if pred_contrib_col_name:
-                    contribs = pred_contribs(model, X, base_margin)
-                    data[pred.pred_contrib] = pd.Series(list(contribs))
-                    yield pd.DataFrame(data=data)
-                else:
-                    yield data[pred.prediction]
+                yield predict_func(model, X, base_margin)

        if has_base_margin:
            pred_col = predict_udf(struct(*features_col, base_margin_col))
        else:
            pred_col = predict_udf(struct(*features_col))

-        prediction_col_name = self.getOrDefault(self.predictionCol)
-
-        if single_pred:
-            dataset = dataset.withColumn(prediction_col_name, pred_col)
-        else:
-            pred_struct_col = "_prediction_struct"
-            dataset = dataset.withColumn(pred_struct_col, pred_col)
-
-            dataset = dataset.withColumn(
-                prediction_col_name, getattr(col(pred_struct_col), pred.prediction)
-            )
-
-            if pred_contrib_col_name:
-                dataset = dataset.withColumn(
-                    pred_contrib_col_name,
-                    array_to_vector(getattr(col(pred_struct_col), pred.pred_contrib)),
-                )
-
-            dataset = dataset.drop(pred_struct_col)
-
-        return dataset
+        return self._post_transform(dataset, pred_col)


 class _ClassificationModel(  # pylint: disable=abstract-method
@@ -1196,22 +1253,21 @@ class _ClassificationModel(  # pylint: disable=abstract-method
    .. Note:: This API is experimental.
    """

-    def _transform(self, dataset: DataFrame) -> DataFrame:
-        # pylint: disable=too-many-statements, too-many-locals
-        # Save xgb_sklearn_model and predict_params to be local variable
-        # to avoid the `self` object to be pickled to remote.
-        xgb_sklearn_model = self._xgb_sklearn_model
-        predict_params = self._gen_predict_params_dict()
+    def _out_schema(self) -> Tuple[bool, str]:
+        schema = (
+            f"{pred.raw_prediction} array<double>, {pred.prediction} double,"
+            f" {pred.probability} array<double>"
+        )
+        if self._get_pred_contrib_col_name() is not None:
+            # We will force setting strict_shape to True when predicting contribs,
+            # So, it will also output 3-D shape result.
+            schema = f"{schema}, {pred.pred_contrib} array<array<double>>"

-        has_base_margin = False
-        if (
-            self.isDefined(self.base_margin_col)
-            and self.getOrDefault(self.base_margin_col) != ""
-        ):
-            has_base_margin = True
-            base_margin_col = col(self.getOrDefault(self.base_margin_col)).alias(
-                alias.margin
-            )
+        return False, schema
+
+    def _get_predict_func(self) -> Callable:
+        predict_params = self._gen_predict_params_dict()
+        pred_contrib_col_name = self._get_pred_contrib_col_name()

        def transform_margin(margins: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
            if margins.ndim == 1:
@@ -1226,76 +1282,38 @@ class _ClassificationModel(  # pylint: disable=abstract-method
                class_probs = softmax(raw_preds, axis=1)
            return raw_preds, class_probs

-        features_col, feature_col_names = self._get_feature_col(dataset)
-        enable_sparse_data_optim = self.getOrDefault(self.enable_sparse_data_optim)
+        def _predict(
+            model: XGBModel, X: ArrayLike, base_margin: Optional[np.ndarray]
+        ) -> Union[pd.DataFrame, pd.Series]:
+            margins = model.predict(
+                X,
+                base_margin=base_margin,
+                output_margin=True,
+                validate_features=False,
+                **predict_params,
+            )
+            raw_preds, class_probs = transform_margin(margins)

-        pred_contrib_col_name = None
-        if (
-            self.isDefined(self.pred_contrib_col)
-            and self.getOrDefault(self.pred_contrib_col) != ""
-        ):
-            pred_contrib_col_name = self.getOrDefault(self.pred_contrib_col)
+            # It seems that they use argmax of class probs,
+            # not of margin to get the prediction (Note: scala implementation)
+            preds = np.argmax(class_probs, axis=1)
+            result: Dict[str, pd.Series] = {
+                pred.raw_prediction: pd.Series(list(raw_preds)),
+                pred.prediction: pd.Series(preds),
+                pred.probability: pd.Series(list(class_probs)),
+            }

-        schema = (
-            f"{pred.raw_prediction} array<double>, {pred.prediction} double,"
-            f" {pred.probability} array<double>"
-        )
-        if pred_contrib_col_name:
-            # We will force setting strict_shape to True when predicting contribs,
-            # So, it will also output 3-D shape result.
-            schema = f"{schema}, {pred.pred_contrib} array<array<double>>"
+            if pred_contrib_col_name is not None:
+                contribs = pred_contribs(model, X, base_margin, strict_shape=True)
+                result[pred.pred_contrib] = pd.Series(list(contribs.tolist()))

-        @pandas_udf(schema)  # type: ignore
-        def predict_udf(
-            iterator: Iterator[Tuple[pd.Series, ...]]
-        ) -> Iterator[pd.DataFrame]:
-            assert xgb_sklearn_model is not None
-            model = xgb_sklearn_model
-            for data in iterator:
-                if enable_sparse_data_optim:
-                    X = _read_csr_matrix_from_unwrapped_spark_vec(data)
-                else:
-                    if feature_col_names is not None:
-                        X = data[feature_col_names]  # type: ignore
-                    else:
-                        X = stack_series(data[alias.data])
+            return pd.DataFrame(data=result)

-                if has_base_margin:
-                    base_margin = stack_series(data[alias.margin])
-                else:
-                    base_margin = None
-
-                margins = model.predict(
-                    X,
-                    base_margin=base_margin,
-                    output_margin=True,
-                    validate_features=False,
-                    **predict_params,
-                )
-                raw_preds, class_probs = transform_margin(margins)
-
-                # It seems that they use argmax of class probs,
-                # not of margin to get the prediction (Note: scala implementation)
-                preds = np.argmax(class_probs, axis=1)
-                result: Dict[str, pd.Series] = {
-                    pred.raw_prediction: pd.Series(list(raw_preds)),
-                    pred.prediction: pd.Series(preds),
-                    pred.probability: pd.Series(list(class_probs)),
-                }
-
-                if pred_contrib_col_name:
-                    contribs = pred_contribs(model, X, base_margin, strict_shape=True)
-                    result[pred.pred_contrib] = pd.Series(list(contribs.tolist()))
-
-                yield pd.DataFrame(data=result)
-
-        if has_base_margin:
-            pred_struct = predict_udf(struct(*features_col, base_margin_col))
-        else:
-            pred_struct = predict_udf(struct(*features_col))
+        return _predict

+    def _post_transform(self, dataset: DataFrame, pred_col: Column) -> DataFrame:
        pred_struct_col = "_prediction_struct"
-        dataset = dataset.withColumn(pred_struct_col, pred_struct)
+        dataset = dataset.withColumn(pred_struct_col, pred_col)

        raw_prediction_col_name = self.getOrDefault(self.rawPredictionCol)
        if raw_prediction_col_name:
@@ -1317,7 +1335,8 @@ class _ClassificationModel(  # pylint: disable=abstract-method
                array_to_vector(getattr(col(pred_struct_col), pred.probability)),
            )

-        if pred_contrib_col_name:
+        pred_contrib_col_name = self._get_pred_contrib_col_name()
+        if pred_contrib_col_name is not None:
            dataset = dataset.withColumn(
                pred_contrib_col_name,
                getattr(col(pred_struct_col), pred.pred_contrib),
--- a/python-package/xgboost/spark/data.py
+++ b/python-package/xgboost/spark/data.py
@@ -157,7 +157,7 @@ def _read_csr_matrix_from_unwrapped_spark_vec(part: pd.DataFrame) -> csr_matrix:

 def make_qdm(
    data: Dict[str, List[np.ndarray]],
-    gpu_id: Optional[int],
+    dev_ordinal: Optional[int],
    meta: Dict[str, Any],
    ref: Optional[DMatrix],
    params: Dict[str, Any],
@@ -165,7 +165,7 @@ def make_qdm(
    """Handle empty partition for QuantileDMatrix."""
    if not data:
        return QuantileDMatrix(np.empty((0, 0)), ref=ref)
-    it = PartIter(data, gpu_id, **meta)
+    it = PartIter(data, dev_ordinal, **meta)
    m = QuantileDMatrix(it, **params, ref=ref)
    return m

@@ -173,7 +173,7 @@ def make_qdm(
 def create_dmatrix_from_partitions(  # pylint: disable=too-many-arguments
    iterator: Iterator[pd.DataFrame],
    feature_cols: Optional[Sequence[str]],
-    gpu_id: Optional[int],
+    dev_ordinal: Optional[int],
    use_qdm: bool,
    kwargs: Dict[str, Any],  # use dict to make sure this parameter is passed.
    enable_sparse_data_optim: bool,
@@ -187,7 +187,7 @@ def create_dmatrix_from_partitions(  # pylint: disable=too-many-arguments
        Pyspark partition iterator.
    feature_cols:
        A sequence of feature names, used only when rapids plugin is enabled.
-    gpu_id:
+    dev_ordinal:
        Device ordinal, used when GPU is enabled.
    use_qdm :
        Whether QuantileDMatrix should be used instead of DMatrix.
@@ -304,13 +304,13 @@ def create_dmatrix_from_partitions(  # pylint: disable=too-many-arguments

    if feature_cols is not None and use_qdm:
        cache_partitions(iterator, append_fn)
-        dtrain: DMatrix = make_qdm(train_data, gpu_id, meta, None, params)
+        dtrain: DMatrix = make_qdm(train_data, dev_ordinal, meta, None, params)
    elif feature_cols is not None and not use_qdm:
        cache_partitions(iterator, append_fn)
        dtrain = make(train_data, kwargs)
    elif feature_cols is None and use_qdm:
        cache_partitions(iterator, append_fn)
-        dtrain = make_qdm(train_data, gpu_id, meta, None, params)
+        dtrain = make_qdm(train_data, dev_ordinal, meta, None, params)
    else:
        cache_partitions(iterator, append_fn)
        dtrain = make(train_data, kwargs)
@@ -324,7 +324,7 @@ def create_dmatrix_from_partitions(  # pylint: disable=too-many-arguments
    if has_validation_col:
        if use_qdm:
            dvalid: Optional[DMatrix] = make_qdm(
-                valid_data, gpu_id, meta, dtrain, params
+                valid_data, dev_ordinal, meta, dtrain, params
            )
        else:
            dvalid = make(valid_data, kwargs) if has_validation_col else None
--- a/python-package/xgboost/spark/estimator.py
+++ b/python-package/xgboost/spark/estimator.py
@@ -3,8 +3,8 @@
 # pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name
 # pylint: disable=unused-argument, too-many-locals

-
-from typing import Any, Dict, List, Optional, Type, Union
+import warnings
+from typing import Any, List, Optional, Type, Union

 import numpy as np
 from pyspark import keyword_only
@@ -77,28 +77,35 @@ def _set_pyspark_xgb_cls_param_attrs(
        set_param_attrs(name, param_obj)


+def _deprecated_use_gpu() -> None:
+    warnings.warn(
+        "`use_gpu` is deprecated since 2.0.0, use `device` instead", FutureWarning
+    )
+
+
 class SparkXGBRegressor(_SparkXGBEstimator):
-    """
-    SparkXGBRegressor is a PySpark ML estimator. It implements the XGBoost regression
+    """SparkXGBRegressor is a PySpark ML estimator. It implements the XGBoost regression
    algorithm based on XGBoost python library, and it can be used in PySpark Pipeline
-    and PySpark ML meta algorithms like :py:class:`~pyspark.ml.tuning.CrossValidator`/
-    :py:class:`~pyspark.ml.tuning.TrainValidationSplit`/
-    :py:class:`~pyspark.ml.classification.OneVsRest`
+    and PySpark ML meta algorithms like
+    :py:class:`~pyspark.ml.tuning.CrossValidator`,
+    :py:class:`~pyspark.ml.tuning.TrainValidationSplit`, and
+    :py:class:`~pyspark.ml.classification.OneVsRest`.

    SparkXGBRegressor automatically supports most of the parameters in
    :py:class:`xgboost.XGBRegressor` constructor and most of the parameters used in
-    :py:meth:`xgboost.XGBRegressor.fit` and :py:meth:`xgboost.XGBRegressor.predict` method.
+    :py:meth:`xgboost.XGBRegressor.fit` and :py:meth:`xgboost.XGBRegressor.predict`
+    method.

-    SparkXGBRegressor doesn't support setting `gpu_id` but support another param `use_gpu`,
-    see doc below for more details.
+    To enable GPU support, set `device` to `cuda` or `gpu`.

-    SparkXGBRegressor doesn't support setting `base_margin` explicitly as well, but support
-    another param called `base_margin_col`. see doc below for more details.
+    SparkXGBRegressor doesn't support setting `base_margin` explicitly, but
+    supports another param called `base_margin_col`. See doc below for more details.

    SparkXGBRegressor doesn't support `validate_features` and `output_margin` param.

-    SparkXGBRegressor doesn't support setting `nthread` xgboost param, instead, the `nthread`
-    param for each xgboost worker will be set equal to `spark.task.cpus` config value.
+    SparkXGBRegressor doesn't support setting `nthread` xgboost param, instead, the
+    `nthread` param for each xgboost worker will be set equal to `spark.task.cpus`
+    config value.


    Parameters
@@ -134,8 +141,16 @@ class SparkXGBRegressor(_SparkXGBEstimator):
        How many XGBoost workers to be used to train.
        Each XGBoost worker corresponds to one spark task.
    use_gpu:
-        Boolean value to specify whether the executors are running on GPU
-        instances.
+        .. deprecated:: 2.0.0
+
+        Use `device` instead.
+
+    device:
+
+        .. versionadded:: 2.0.0
+
+        Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.
+
    force_repartition:
        Boolean value to specify if forcing the input dataset to be repartitioned
        before XGBoost training.
@@ -194,14 +209,17 @@ class SparkXGBRegressor(_SparkXGBEstimator):
        weight_col: Optional[str] = None,
        base_margin_col: Optional[str] = None,
        num_workers: int = 1,
-        use_gpu: bool = False,
+        use_gpu: Optional[bool] = None,
+        device: Optional[str] = None,
        force_repartition: bool = False,
        repartition_random_shuffle: bool = False,
        enable_sparse_data_optim: bool = False,
-        **kwargs: Dict[str, Any],
+        **kwargs: Any,
    ) -> None:
        super().__init__()
        input_kwargs = self._input_kwargs
+        if use_gpu:
+            _deprecated_use_gpu()
        self.setParams(**input_kwargs)

    @classmethod
@@ -239,27 +257,29 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
    """SparkXGBClassifier is a PySpark ML estimator. It implements the XGBoost
    classification algorithm based on XGBoost python library, and it can be used in
    PySpark Pipeline and PySpark ML meta algorithms like
-    :py:class:`~pyspark.ml.tuning.CrossValidator`/
-    :py:class:`~pyspark.ml.tuning.TrainValidationSplit`/
-    :py:class:`~pyspark.ml.classification.OneVsRest`
+    :py:class:`~pyspark.ml.tuning.CrossValidator`,
+    :py:class:`~pyspark.ml.tuning.TrainValidationSplit`, and
+    :py:class:`~pyspark.ml.classification.OneVsRest`.

    SparkXGBClassifier automatically supports most of the parameters in
    :py:class:`xgboost.XGBClassifier` constructor and most of the parameters used in
-    :py:meth:`xgboost.XGBClassifier.fit` and :py:meth:`xgboost.XGBClassifier.predict` method.
+    :py:meth:`xgboost.XGBClassifier.fit` and :py:meth:`xgboost.XGBClassifier.predict`
+    method.

-    SparkXGBClassifier doesn't support setting `gpu_id` but support another param `use_gpu`,
-    see doc below for more details.
+    To enable GPU support, set `device` to `cuda` or `gpu`.

-    SparkXGBClassifier doesn't support setting `base_margin` explicitly as well, but support
-    another param called `base_margin_col`. see doc below for more details.
+    SparkXGBClassifier doesn't support setting `base_margin` explicitly, but
+    supports another param called `base_margin_col`. See doc below for more details.

-    SparkXGBClassifier doesn't support setting `output_margin`, but we can get output margin
-    from the raw prediction column. See `raw_prediction_col` param doc below for more details.
+    SparkXGBClassifier doesn't support setting `output_margin`, but we can get output
+    margin from the raw prediction column. See `raw_prediction_col` param doc below for
+    more details.

    SparkXGBClassifier doesn't support `validate_features` and `output_margin` param.

-    SparkXGBClassifier doesn't support setting `nthread` xgboost param, instead, the `nthread`
-    param for each xgboost worker will be set equal to `spark.task.cpus` config value.
+    SparkXGBClassifier doesn't support setting `nthread` xgboost param, instead, the
+    `nthread` param for each xgboost worker will be set equal to `spark.task.cpus`
+    config value.


    Parameters
@@ -301,8 +321,16 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
        How many XGBoost workers to be used to train.
        Each XGBoost worker corresponds to one spark task.
    use_gpu:
-        Boolean value to specify whether the executors are running on GPU
-        instances.
+        .. deprecated:: 2.0.0
+
+        Use `device` instead.
+
+    device:
+
+        .. versionadded:: 2.0.0
+
+        Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.
+
    force_repartition:
        Boolean value to specify if forcing the input dataset to be repartitioned
        before XGBoost training.
@@ -361,11 +389,12 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
        weight_col: Optional[str] = None,
        base_margin_col: Optional[str] = None,
        num_workers: int = 1,
-        use_gpu: bool = False,
+        use_gpu: Optional[bool] = None,
+        device: Optional[str] = None,
        force_repartition: bool = False,
        repartition_random_shuffle: bool = False,
        enable_sparse_data_optim: bool = False,
-        **kwargs: Dict[str, Any],
+        **kwargs: Any,
    ) -> None:
        super().__init__()
        # The default 'objective' param value comes from sklearn `XGBClassifier` ctor,
@@ -373,6 +402,8 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
        # binary or multinomial input dataset, and we need to remove the fixed default
        # param value as well to avoid causing ambiguity.
        input_kwargs = self._input_kwargs
+        if use_gpu:
+            _deprecated_use_gpu()
        self.setParams(**input_kwargs)
        self._setDefault(objective=None)

@@ -423,19 +454,20 @@ class SparkXGBRanker(_SparkXGBEstimator):
    :py:class:`xgboost.XGBRanker` constructor and most of the parameters used in
    :py:meth:`xgboost.XGBRanker.fit` and :py:meth:`xgboost.XGBRanker.predict` method.

-    SparkXGBRanker doesn't support setting `gpu_id` but support another param `use_gpu`,
-    see doc below for more details.
+    To enable GPU support, set `device` to `cuda` or `gpu`.

    SparkXGBRanker doesn't support setting `base_margin` explicitly as well, but support
    another param called `base_margin_col`. see doc below for more details.

    SparkXGBRanker doesn't support setting `output_margin`, but we can get output margin
-    from the raw prediction column. See `raw_prediction_col` param doc below for more details.
+    from the raw prediction column. See `raw_prediction_col` param doc below for more
+    details.

    SparkXGBRanker doesn't support `validate_features` and `output_margin` param.

-    SparkXGBRanker doesn't support setting `nthread` xgboost param, instead, the `nthread`
-    param for each xgboost worker will be set equal to `spark.task.cpus` config value.
+    SparkXGBRanker doesn't support setting `nthread` xgboost param, instead, the
+    `nthread` param for each xgboost worker will be set equal to `spark.task.cpus`
+    config value.


    Parameters
@@ -468,13 +500,20 @@ class SparkXGBRanker(_SparkXGBEstimator):
        :py:class:`xgboost.XGBRanker` fit method.
    qid_col:
        Query id column name.
-
    num_workers:
        How many XGBoost workers to be used to train.
        Each XGBoost worker corresponds to one spark task.
    use_gpu:
-        Boolean value to specify whether the executors are running on GPU
-        instances.
+        .. deprecated:: 2.0.0
+
+        Use `device` instead.
+
+    device:
+
+        .. versionadded:: 2.0.0
+
+        Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.
+
    force_repartition:
        Boolean value to specify if forcing the input dataset to be repartitioned
        before XGBoost training.
@@ -539,14 +578,17 @@ class SparkXGBRanker(_SparkXGBEstimator):
        base_margin_col: Optional[str] = None,
        qid_col: Optional[str] = None,
        num_workers: int = 1,
-        use_gpu: bool = False,
+        use_gpu: Optional[bool] = None,
+        device: Optional[str] = None,
        force_repartition: bool = False,
        repartition_random_shuffle: bool = False,
        enable_sparse_data_optim: bool = False,
-        **kwargs: Dict[str, Any],
+        **kwargs: Any,
    ) -> None:
        super().__init__()
        input_kwargs = self._input_kwargs
+        if use_gpu:
+            _deprecated_use_gpu()
        self.setParams(**input_kwargs)

    @classmethod
--- a/python-package/xgboost/spark/utils.py
+++ b/python-package/xgboost/spark/utils.py
@@ -7,7 +7,7 @@ import os
 import sys
 import uuid
 from threading import Thread
-from typing import Any, Callable, Dict, Set, Type
+from typing import Any, Callable, Dict, Optional, Set, Type

 import pyspark
 from pyspark import BarrierTaskContext, SparkContext, SparkFiles
@@ -104,6 +104,10 @@ def get_logger(name: str, level: str = "INFO") -> logging.Logger:
    # If the logger is configured, skip the configure
    if not logger.handlers and not logging.getLogger().handlers:
        handler = logging.StreamHandler(sys.stderr)
+        formatter = logging.Formatter(
+            "%(asctime)s %(levelname)s %(name)s: %(funcName)s %(message)s"
+        )
+        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger

@@ -186,3 +190,8 @@ def deserialize_booster(model: str) -> Booster:
        f.write(model)
    booster.load_model(tmp_file_name)
    return booster
+
+
+def use_cuda(device: Optional[str]) -> bool:
+    """Whether xgboost is using CUDA workers."""
+    return device in ("cuda", "gpu")
--- a/python-package/xgboost/testing/init.py
+++ b/python-package/xgboost/testing/init.py
@@ -25,6 +25,7 @@ from typing import (
    Set,
    Tuple,
    TypedDict,
+    TypeVar,
    Union,
 )

@@ -93,6 +94,10 @@ def no_ipv6() -> PytestSkip:
    return {"condition": not has_ipv6(), "reason": "IPv6 is required to be enabled."}


+def not_linux() -> PytestSkip:
+    return {"condition": system() != "Linux", "reason": "Linux is required."}
+
+
 def no_ubjson() -> PytestSkip:
    return no_mod("ubjson")

@@ -198,20 +203,20 @@ class IteratorForTest(xgb.core.DataIter):
        X: Sequence,
        y: Sequence,
        w: Optional[Sequence],
-        cache: Optional[str] = "./",
+        cache: Optional[str],
    ) -> None:
        assert len(X) == len(y)
        self.X = X
        self.y = y
        self.w = w
        self.it = 0
-        super().__init__(cache)
+        super().__init__(cache_prefix=cache)

    def next(self, input_data: Callable) -> int:
        if self.it == len(self.X):
            return 0

-        with pytest.raises(TypeError, match="keyword args"):
+        with pytest.raises(TypeError, match="Keyword argument"):
            input_data(self.X[self.it], self.y[self.it], None)

        # Use copy to make sure the iterator doesn't hold a reference to the data.
@@ -229,7 +234,7 @@ class IteratorForTest(xgb.core.DataIter):

    def as_arrays(
        self,
-    ) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, ArrayLike]:
+    ) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, Optional[ArrayLike]]:
        if isinstance(self.X[0], sparse.csr_matrix):
            X = sparse.vstack(self.X, format="csr")
        else:
@@ -243,7 +248,12 @@ class IteratorForTest(xgb.core.DataIter):


 def make_batches(
-    n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False
+    n_samples_per_batch: int,
+    n_features: int,
+    n_batches: int,
+    use_cupy: bool = False,
+    *,
+    vary_size: bool = False,
 ) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
    X = []
    y = []
@@ -254,16 +264,25 @@ def make_batches(
        rng = cupy.random.RandomState(1994)
    else:
        rng = np.random.RandomState(1994)
-    for _ in range(n_batches):
-        _X = rng.randn(n_samples_per_batch, n_features)
-        _y = rng.randn(n_samples_per_batch)
-        _w = rng.uniform(low=0, high=1, size=n_samples_per_batch)
+    for i in range(n_batches):
+        n_samples = n_samples_per_batch + i * 10 if vary_size else n_samples_per_batch
+        _X = rng.randn(n_samples, n_features)
+        _y = rng.randn(n_samples)
+        _w = rng.uniform(low=0, high=1, size=n_samples)
        X.append(_X)
        y.append(_y)
        w.append(_w)
    return X, y, w


+def make_regression(
+    n_samples: int, n_features: int, use_cupy: bool
+) -> Tuple[ArrayLike, ArrayLike, ArrayLike]:
+    """Make a simple regression dataset."""
+    X, y, w = make_batches(n_samples, n_features, 1, use_cupy)
+    return X[0], y[0], w[0]
+
+
 def make_batches_sparse(
    n_samples_per_batch: int, n_features: int, n_batches: int, sparsity: float
 ) -> Tuple[List[sparse.csr_matrix], List[np.ndarray], List[np.ndarray]]:
@@ -347,7 +366,9 @@ class TestDataset:
            if w is not None:
                weight.append(w)

-        it = IteratorForTest(predictor, response, weight if weight else None)
+        it = IteratorForTest(
+            predictor, response, weight if weight else None, cache="cache"
+        )
        return xgb.DMatrix(it)

    def __repr__(self) -> str:
@@ -709,6 +730,9 @@ def predictor_equal(lhs: xgb.DMatrix, rhs: xgb.DMatrix) -> bool:
    )


+M = TypeVar("M", xgb.Booster, xgb.XGBModel)
+
+
 def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, np.float64]:
    """Evaluation metric for xgb.train"""
    label = dtrain.get_label()
@@ -743,13 +767,31 @@ def softmax(x: np.ndarray) -> np.ndarray:
    return e / np.sum(e)


-def softprob_obj(classes: int) -> SklObjective:
+def softprob_obj(
+    classes: int, use_cupy: bool = False, order: str = "C", gdtype: str = "float32"
+) -> SklObjective:
+    """Custom softprob objective for testing.
+
+    Parameters
+    ----------
+    use_cupy :
+        Whether the objective should return cupy arrays.
+    order :
+        The order of gradient matrices. "C" or "F".
+    gdtype :
+        DType for gradient. Hessian is not set. This is for testing asymmetric types.
+    """
+    if use_cupy:
+        import cupy as backend
+    else:
+        backend = np
+
    def objective(
-        labels: np.ndarray, predt: np.ndarray
-    ) -> Tuple[np.ndarray, np.ndarray]:
+        labels: backend.ndarray, predt: backend.ndarray
+    ) -> Tuple[backend.ndarray, backend.ndarray]:
        rows = labels.shape[0]
-        grad = np.zeros((rows, classes), dtype=float)
-        hess = np.zeros((rows, classes), dtype=float)
+        grad = backend.zeros((rows, classes), dtype=np.float32)
+        hess = backend.zeros((rows, classes), dtype=np.float32)
        eps = 1e-6
        for r in range(predt.shape[0]):
            target = labels[r]
@@ -761,8 +803,10 @@ def softprob_obj(classes: int) -> SklObjective:
                grad[r, c] = g
                hess[r, c] = h

-        grad = grad.reshape((rows * classes, 1))
-        hess = hess.reshape((rows * classes, 1))
+        grad = grad.reshape((rows, classes))
+        hess = hess.reshape((rows, classes))
+        grad = backend.require(grad, requirements=order, dtype=gdtype)
+        hess = backend.require(hess, requirements=order)
        return grad, hess

    return objective
--- a/python-package/xgboost/testing/data_iter.py
+++ b/python-package/xgboost/testing/data_iter.py
@@ -0,0 +1,34 @@
+"""Tests related to the `DataIter` interface."""
+import numpy as np
+
+import xgboost
+from xgboost import testing as tm
+
+
+def run_mixed_sparsity(device: str) -> None:
+    """Check QDM with mixed batches."""
+    X_0, y_0, _ = tm.make_regression(128, 16, False)
+    if device.startswith("cuda"):
+        X_1, y_1 = tm.make_sparse_regression(256, 16, 0.1, True)
+    else:
+        X_1, y_1 = tm.make_sparse_regression(256, 16, 0.1, False)
+    X_2, y_2 = tm.make_sparse_regression(512, 16, 0.9, True)
+    X = [X_0, X_1, X_2]
+    y = [y_0, y_1, y_2]
+
+    if device.startswith("cuda"):
+        import cupy as cp  # pylint: disable=import-error
+
+        X = [cp.array(batch) for batch in X]
+
+    it = tm.IteratorForTest(X, y, None, None)
+    Xy_0 = xgboost.QuantileDMatrix(it)
+
+    X_1, y_1 = tm.make_sparse_regression(256, 16, 0.1, True)
+    X = [X_0, X_1, X_2]
+    y = [y_0, y_1, y_2]
+    X_arr = np.concatenate(X, axis=0)
+    y_arr = np.concatenate(y, axis=0)
+    Xy_1 = xgboost.QuantileDMatrix(X_arr, y_arr)
+
+    assert tm.predictor_equal(Xy_0, Xy_1)
--- a/python-package/xgboost/testing/params.py
+++ b/python-package/xgboost/testing/params.py
@@ -41,6 +41,10 @@ hist_parameter_strategy = strategies.fixed_dictionaries(
    and (cast(int, x["max_depth"]) > 0 or x["grow_policy"] == "lossguide")
 )

+hist_cache_strategy = strategies.fixed_dictionaries(
+    {"max_cached_hist_node": strategies.sampled_from([1, 4, 1024, 2**31])}
+)
+
 hist_multi_parameter_strategy = strategies.fixed_dictionaries(
    {
        "max_depth": strategies.integers(1, 11),
--- a/python-package/xgboost/testing/updater.py
+++ b/python-package/xgboost/testing/updater.py
@@ -1,7 +1,7 @@
 """Tests for updaters."""
 import json
 from functools import partial, update_wrapper
-from typing import Dict
+from typing import Any, Dict, List

 import numpy as np

@@ -159,3 +159,238 @@ def check_quantile_loss(tree_method: str, weighted: bool) -> None:

    for i in range(alpha.shape[0]):
        np.testing.assert_allclose(predts[:, i], predt_multi[:, i])
+
+
+def check_cut(
+    n_entries: int, indptr: np.ndarray, data: np.ndarray, dtypes: Any
+) -> None:
+    """Check the cut values."""
+    from pandas.api.types import is_categorical_dtype
+
+    assert data.shape[0] == indptr[-1]
+    assert data.shape[0] == n_entries
+
+    assert indptr.dtype == np.uint64
+    for i in range(1, indptr.size):
+        beg = int(indptr[i - 1])
+        end = int(indptr[i])
+        for j in range(beg + 1, end):
+            assert data[j] > data[j - 1]
+            if is_categorical_dtype(dtypes[i - 1]):
+                assert data[j] == data[j - 1] + 1
+
+
+def check_get_quantile_cut_device(tree_method: str, use_cupy: bool) -> None:
+    """Check `get_quantile_cut` with numerical, categorical and mixed data."""
+    from pandas.api.types import is_categorical_dtype
+
+    n_samples = 1024
+    n_features = 14
+    max_bin = 16
+    dtypes = [np.float32] * n_features
+
+    # numerical
+    X, y, w = tm.make_regression(n_samples, n_features, use_cupy=use_cupy)
+    # - qdm
+    Xyw: xgb.DMatrix = xgb.QuantileDMatrix(X, y, weight=w, max_bin=max_bin)
+    indptr, data = Xyw.get_quantile_cut()
+    check_cut((max_bin + 1) * n_features, indptr, data, dtypes)
+    # - dm
+    Xyw = xgb.DMatrix(X, y, weight=w)
+    xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xyw)
+    indptr, data = Xyw.get_quantile_cut()
+    check_cut((max_bin + 1) * n_features, indptr, data, dtypes)
+    # - ext mem (the iterator-backed `Xy` must be the matrix under test)
+    n_batches = 3
+    n_samples_per_batch = 256
+    it = tm.IteratorForTest(
+        *tm.make_batches(n_samples_per_batch, n_features, n_batches, use_cupy),
+        cache="cache",
+    )
+    Xy: xgb.DMatrix = xgb.DMatrix(it)
+    xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xy)
+    indptr, data = Xy.get_quantile_cut()
+    check_cut((max_bin + 1) * n_features, indptr, data, dtypes)
+
+    # categorical
+    n_categories = 32
+    X, y = tm.make_categorical(n_samples, n_features, n_categories, False, sparsity=0.8)
+    if use_cupy:
+        import cudf  # pylint: disable=import-error
+        import cupy as cp  # pylint: disable=import-error
+
+        X = cudf.from_pandas(X)
+        y = cp.array(y)
+    # - qdm
+    Xy = xgb.QuantileDMatrix(X, y, max_bin=max_bin, enable_categorical=True)
+    indptr, data = Xy.get_quantile_cut()
+    check_cut(n_categories * n_features, indptr, data, X.dtypes)
+    # - dm
+    Xy = xgb.DMatrix(X, y, enable_categorical=True)
+    xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xy)
+    indptr, data = Xy.get_quantile_cut()
+    check_cut(n_categories * n_features, indptr, data, X.dtypes)
+
+    # mixed
+    X, y = tm.make_categorical(
+        n_samples, n_features, n_categories, False, sparsity=0.8, cat_ratio=0.5
+    )
+    n_cat_features = len([0 for dtype in X.dtypes if is_categorical_dtype(dtype)])
+    n_num_features = n_features - n_cat_features
+    n_entries = n_categories * n_cat_features + (max_bin + 1) * n_num_features
+    # - qdm
+    Xy = xgb.QuantileDMatrix(X, y, max_bin=max_bin, enable_categorical=True)
+    indptr, data = Xy.get_quantile_cut()
+    check_cut(n_entries, indptr, data, X.dtypes)
+    # - dm
+    Xy = xgb.DMatrix(X, y, enable_categorical=True)
+    xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xy)
+    indptr, data = Xy.get_quantile_cut()
+    check_cut(n_entries, indptr, data, X.dtypes)
+
+
+def check_get_quantile_cut(tree_method: str) -> None:
+    """Check the quantile cut getter."""
+
+    use_cupy = tree_method == "gpu_hist"
+    check_get_quantile_cut_device(tree_method, False)
+    if use_cupy:
+        check_get_quantile_cut_device(tree_method, True)
+
+
+USE_ONEHOT = np.iinfo(np.int32).max
+USE_PART = 1
+
+
+def check_categorical_ohe(  # pylint: disable=too-many-arguments
+    rows: int, cols: int, rounds: int, cats: int, device: str, tree_method: str
+) -> None:
+    "Test for one-hot encoding with categorical data."
+
+    onehot, label = tm.make_categorical(rows, cols, cats, True)
+    cat, _ = tm.make_categorical(rows, cols, cats, False)
+
+    by_etl_results: Dict[str, Dict[str, List[float]]] = {}
+    by_builtin_results: Dict[str, Dict[str, List[float]]] = {}
+
+    parameters: Dict[str, Any] = {
+        "tree_method": tree_method,
+        # Use one-hot exclusively
+        "max_cat_to_onehot": USE_ONEHOT,
+        "device": device,
+    }
+
+    m = xgb.DMatrix(onehot, label, enable_categorical=False)
+    xgb.train(
+        parameters,
+        m,
+        num_boost_round=rounds,
+        evals=[(m, "Train")],
+        evals_result=by_etl_results,
+    )
+
+    m = xgb.DMatrix(cat, label, enable_categorical=True)
+    xgb.train(
+        parameters,
+        m,
+        num_boost_round=rounds,
+        evals=[(m, "Train")],
+        evals_result=by_builtin_results,
+    )
+
+    # There are guidelines on how to specify tolerance based on considering output
+    # as random variables. But in here the tree construction is extremely sensitive
+    # to floating point errors. A 1e-5 error in a histogram bin can lead to an
+    # entirely different tree. So even though the test is quite lenient, hypothesis
+    # can still pick up falsifying examples from time to time.
+    np.testing.assert_allclose(
+        np.array(by_etl_results["Train"]["rmse"]),
+        np.array(by_builtin_results["Train"]["rmse"]),
+        rtol=1e-3,
+    )
+    assert tm.non_increasing(by_builtin_results["Train"]["rmse"])
+
+    by_grouping: Dict[str, Dict[str, List[float]]] = {}
+    # switch to partition-based splits
+    parameters["max_cat_to_onehot"] = USE_PART
+    parameters["reg_lambda"] = 0
+    m = xgb.DMatrix(cat, label, enable_categorical=True)
+    xgb.train(
+        parameters,
+        m,
+        num_boost_round=rounds,
+        evals=[(m, "Train")],
+        evals_result=by_grouping,
+    )
+    rmse_oh = by_builtin_results["Train"]["rmse"]
+    rmse_group = by_grouping["Train"]["rmse"]
+    # always better or equal to onehot when there's no regularization.
+    for a, b in zip(rmse_oh, rmse_group):
+        assert a >= b
+
+    parameters["reg_lambda"] = 1.0
+    by_grouping = {}
+    xgb.train(
+        parameters,
+        m,
+        num_boost_round=32,
+        evals=[(m, "Train")],
+        evals_result=by_grouping,
+    )
+    assert tm.non_increasing(by_grouping["Train"]["rmse"]), by_grouping
+
+
+def check_categorical_missing(
+    rows: int, cols: int, cats: int, device: str, tree_method: str
+) -> None:
+    """Check categorical data with missing values."""
+    parameters: Dict[str, Any] = {"tree_method": tree_method, "device": device}
+    cat, label = tm.make_categorical(
+        rows, n_features=cols, n_categories=cats, onehot=False, sparsity=0.5
+    )
+    Xy = xgb.DMatrix(cat, label, enable_categorical=True)
+
+    def run(max_cat_to_onehot: int) -> None:
+        # Select one-hot or partition-based splits via `max_cat_to_onehot`
+        parameters["max_cat_to_onehot"] = max_cat_to_onehot
+
+        evals_result: Dict[str, Dict] = {}
+        booster = xgb.train(
+            parameters,
+            Xy,
+            num_boost_round=16,
+            evals=[(Xy, "Train")],
+            evals_result=evals_result,
+        )
+        assert tm.non_increasing(evals_result["Train"]["rmse"])
+        y_predt = booster.predict(Xy)
+
+        rmse = tm.root_mean_square(label, y_predt)
+        np.testing.assert_allclose(rmse, evals_result["Train"]["rmse"][-1], rtol=2e-5)
+
+    # Test with OHE split
+    run(USE_ONEHOT)
+
+    # Test with partition-based split
+    run(USE_PART)
+
+
+def train_result(
+    param: Dict[str, Any], dmat: xgb.DMatrix, num_rounds: int
+) -> Dict[str, Any]:
+    """Get training result from parameters and data."""
+    result: Dict[str, Any] = {}
+    booster = xgb.train(
+        param,
+        dmat,
+        num_rounds,
+        evals=[(dmat, "train")],
+        verbose_eval=False,
+        evals_result=result,
+    )
+    assert booster.num_features() == dmat.num_col()
+    assert booster.num_boosted_rounds() == num_rounds
+    assert booster.feature_names == dmat.feature_names
+    assert booster.feature_types == dmat.feature_types
+
+    return result
--- a/python-package/xgboost/tracker.py
+++ b/python-package/xgboost/tracker.py
@@ -137,15 +137,9 @@ class WorkerEntry:
        return self._get_remote(wait_conn, nnset)

    def _get_remote(
-        self, wait_conn: Dict[int, "WorkerEntry"], nnset: Set[int]
+        self, wait_conn: Dict[int, "WorkerEntry"], badset: Set[int]
    ) -> List[int]:
        while True:
-            ngood = self.sock.recvint()
-            goodset = set()
-            for _ in range(ngood):
-                goodset.add(self.sock.recvint())
-            assert goodset.issubset(nnset)
-            badset = nnset - goodset
            conset = []
            for r in badset:
                if r in wait_conn:
@@ -343,7 +337,7 @@ class RabitTracker:
                shutdown[s.rank] = s
                logging.debug("Received %s signal from %d", s.cmd, s.rank)
                continue
-            assert s.cmd in ("start", "recover")
+            assert s.cmd == "start"
            # lazily initialize the workers
            if tree_map is None:
                assert s.cmd == "start"
--- a/python-package/xgboost/training.py
+++ b/python-package/xgboost/training.py
@@ -28,17 +28,6 @@ from .core import (
 _CVFolds = Sequence["CVPack"]


-def _assert_new_callback(callbacks: Optional[Sequence[TrainingCallback]]) -> None:
-    is_new_callback: bool = not callbacks or all(
-        isinstance(c, TrainingCallback) for c in callbacks
-    )
-    if not is_new_callback:
-        link = "https://xgboost.readthedocs.io/en/latest/python/callbacks.html"
-        raise ValueError(
-            f"Old style callback was removed in version 1.6.  See: {link}."
-        )
-
-
 def _configure_custom_metric(
    feval: Optional[Metric], custom_metric: Optional[Metric]
 ) -> Optional[Metric]:
@@ -170,7 +159,6 @@ def train(
    bst = Booster(params, [dtrain] + [d[0] for d in evals], model_file=xgb_model)
    start_iteration = 0

-    _assert_new_callback(callbacks)
    if verbose_eval:
        verbose_eval = 1 if verbose_eval is True else verbose_eval
        callbacks.append(EvaluationMonitor(period=verbose_eval))
@@ -190,7 +178,7 @@ def train(
    for i in range(start_iteration, num_boost_round):
        if cb_container.before_iteration(bst, i, dtrain, evals):
            break
-        bst.update(dtrain, i, obj)
+        bst.update(dtrain, iteration=i, fobj=obj)
        if cb_container.after_iteration(bst, i, dtrain, evals):
            break

@@ -247,7 +235,7 @@ class _PackedBooster:
        result = [f.eval(iteration, feval, output_margin) for f in self.cvfolds]
        return result

-    def set_attr(self, **kwargs: Optional[str]) -> Any:
+    def set_attr(self, **kwargs: Optional[Any]) -> Any:
        """Iterate through folds for setting attributes"""
        for f in self.cvfolds:
            f.bst.set_attr(**kwargs)
@@ -274,11 +262,20 @@ class _PackedBooster:
        """Get best_iteration"""
        return int(cast(int, self.cvfolds[0].bst.attr("best_iteration")))

+    @best_iteration.setter
+    def best_iteration(self, iteration: int) -> None:
+        """Set best_iteration"""
+        self.set_attr(best_iteration=iteration)
+
    @property
    def best_score(self) -> float:
        """Get best_score."""
        return float(cast(float, self.cvfolds[0].bst.attr("best_score")))

+    @best_score.setter
+    def best_score(self, score: float) -> None:
+        self.set_attr(best_score=score)  # set_attr applies it to every fold
+

 def groups_to_rows(groups: List[np.ndarray], boundaries: np.ndarray) -> np.ndarray:
    """
@@ -551,7 +548,6 @@ def cv(

    # setup callbacks
    callbacks = [] if callbacks is None else copy.copy(list(callbacks))
-    _assert_new_callback(callbacks)

    if verbose_eval:
        verbose_eval = 1 if verbose_eval is True else verbose_eval