merge latest, Jan 12 2024

2024-01-12 09:57:11 -08:00
parent c42c7d99f1 73b3955dd4
commit 1e1e8be3a5
251 changed files with 9023 additions and 5012 deletions
--- a/python-package/xgboost/callback.py
+++ b/python-package/xgboost/callback.py
@@ -62,11 +62,31 @@ class TrainingCallback(ABC):
        return model

    def before_iteration(self, model: _Model, epoch: int, evals_log: EvalsLog) -> bool:
-        """Run before each iteration.  Return True when training should stop."""
+        """Run before each iteration.  Returns True when training should stop. See
+        :py:meth:`after_iteration` for details.
+
+        """
        return False

    def after_iteration(self, model: _Model, epoch: int, evals_log: EvalsLog) -> bool:
-        """Run after each iteration.  Return True when training should stop."""
+        """Run after each iteration.  Returns `True` when training should stop.
+
+        Parameters
+        ----------
+
+        model :
+            Either a :py:class:`~xgboost.Booster` object or a CVPack if the cv function
+            in xgboost is being used.
+        epoch :
+            The current training iteration.
+        evals_log :
+            A dictionary containing the evaluation history:
+
+            .. code-block:: python
+
+                {"data_name": {"metric_name": [0.5, ...]}}
+
+        """
        return False


@@ -547,14 +567,16 @@ class TrainingCheckPoint(TrainingCallback):

    .. versionadded:: 1.3.0

+    Since XGBoost 2.1.0, the default format is changed to UBJSON.
+
    Parameters
    ----------

    directory :
        Output model directory.
    name :
-        pattern of output model file.  Models will be saved as name_0.json, name_1.json,
-        name_2.json ....
+        pattern of output model file.  Models will be saved as name_0.ubj, name_1.ubj,
+        name_2.ubj ....
    as_pickle :
        When set to True, all training parameters will be saved in pickle format,
        instead of saving only the model.
@@ -564,6 +586,8 @@ class TrainingCheckPoint(TrainingCallback):

    """

+    default_format = "ubj"
+
    def __init__(
        self,
        directory: Union[str, os.PathLike],
@@ -592,7 +616,7 @@ class TrainingCheckPoint(TrainingCallback):
                self._name
                + "_"
                + (str(epoch + self._start))
-                + (".pkl" if self._as_pickle else ".json"),
+                + (".pkl" if self._as_pickle else f".{self.default_format}"),
            )
            self._epoch = 0  # reset counter
            if collective.get_rank() == 0:
--- a/python-package/xgboost/compat.py
+++ b/python-package/xgboost/compat.py
@@ -100,6 +100,16 @@ def is_cupy_available() -> bool:
        return False


+def import_cupy() -> types.ModuleType:
+    """Import cupy."""
+    if not is_cupy_available():
+        raise ImportError("`cupy` is required for handling CUDA buffer.")
+
+    import cupy  # pylint: disable=import-error
+
+    return cupy
+
+
 try:
    import scipy.sparse as scipy_sparse
    from scipy.sparse import csr_matrix as scipy_csr
@@ -128,9 +138,9 @@ def concat(value: Sequence[_T]) -> _T:  # pylint: disable=too-many-return-statem
        from cudf import concat as CUDF_concat  # pylint: disable=import-error

        return CUDF_concat(value, axis=0)
-    from .data import _is_cupy_array
+    from .data import _is_cupy_alike

-    if _is_cupy_array(value[0]):
+    if _is_cupy_alike(value[0]):
        import cupy  # pylint: disable=import-error

        # pylint: disable=c-extension-no-member,no-member
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -3,7 +3,6 @@
 """Core XGBoost Library."""
 import copy
 import ctypes
-import importlib.util
 import json
 import os
 import re
@@ -45,7 +44,6 @@ from ._typing import (
    CStrPptr,
    CStrPtr,
    CTypeT,
-    CupyT,
    DataType,
    FeatureInfo,
    FeatureNames,
@@ -55,7 +53,7 @@ from ._typing import (
    TransformedData,
    c_bst_ulong,
 )
-from .compat import PANDAS_INSTALLED, DataFrame, py_str
+from .compat import PANDAS_INSTALLED, DataFrame, import_cupy, py_str
 from .libpath import find_lib_path


@@ -213,6 +211,7 @@ def _load_lib() -> ctypes.CDLL:
            lib = ctypes.cdll.LoadLibrary(lib_path)
            setattr(lib, "path", os.path.normpath(lib_path))
            lib_success = True
+            break
        except OSError as e:
            os_error_list.append(str(e))
            continue
@@ -358,10 +357,13 @@ def _numpy2ctypes_type(dtype: Type[np.number]) -> Type[CNumeric]:
    return _NUMPY_TO_CTYPES_MAPPING[dtype]


+def _array_hasobject(data: DataType) -> bool:
+    return hasattr(data.dtype, "hasobject") and data.dtype.hasobject
+
+
 def _cuda_array_interface(data: DataType) -> bytes:
-    assert (
-        data.dtype.hasobject is False
-    ), "Input data contains `object` dtype.  Expecting numeric data."
+    if _array_hasobject(data):
+        raise ValueError("Input data contains `object` dtype.  Expecting numeric data.")
    interface = data.__cuda_array_interface__
    if "mask" in interface:
        interface["mask"] = interface["mask"].__cuda_array_interface__
@@ -380,34 +382,6 @@ def ctypes2numpy(cptr: CNumericPtr, length: int, dtype: Type[np.number]) -> np.n
    return res


-def ctypes2cupy(cptr: CNumericPtr, length: int, dtype: Type[np.number]) -> CupyT:
-    """Convert a ctypes pointer array to a cupy array."""
-    # pylint: disable=import-error
-    import cupy
-    from cupy.cuda.memory import MemoryPointer, UnownedMemory
-
-    CUPY_TO_CTYPES_MAPPING: Dict[Type[np.number], Type[CNumeric]] = {
-        cupy.float32: ctypes.c_float,
-        cupy.uint32: ctypes.c_uint,
-    }
-    if dtype not in CUPY_TO_CTYPES_MAPPING:
-        raise RuntimeError(f"Supported types: {CUPY_TO_CTYPES_MAPPING.keys()}")
-    addr = ctypes.cast(cptr, ctypes.c_void_p).value
-    # pylint: disable=c-extension-no-member,no-member
-    device = cupy.cuda.runtime.pointerGetAttributes(addr).device
-    # The owner field is just used to keep the memory alive with ref count.  As
-    # unowned's life time is scoped within this function we don't need that.
-    unownd = UnownedMemory(
-        addr, length * ctypes.sizeof(CUPY_TO_CTYPES_MAPPING[dtype]), owner=None
-    )
-    memptr = MemoryPointer(unownd, 0)
-    # pylint: disable=unexpected-keyword-arg
-    mem = cupy.ndarray((length,), dtype=dtype, memptr=memptr)
-    assert mem.device.id == device
-    arr = cupy.array(mem, copy=True)
-    return arr
-
-
 def ctypes2buffer(cptr: CStrPtr, length: int) -> bytearray:
    """Convert ctypes pointer to buffer type."""
    if not isinstance(cptr, ctypes.POINTER(ctypes.c_char)):
@@ -466,14 +440,8 @@ def from_array_interface(interface: dict) -> NumpyOrCupy:

    if "stream" in interface:
        # CUDA stream is presented, this is a __cuda_array_interface__.
-        spec = importlib.util.find_spec("cupy")
-        if spec is None:
-            raise ImportError("`cupy` is required for handling CUDA buffer.")
-
-        import cupy as cp  # pylint: disable=import-error
-
        arr.__cuda_array_interface__ = interface
-        out = cp.array(arr, copy=True)
+        out = import_cupy().array(arr, copy=True)
    else:
        arr.__array_interface__ = interface
        out = np.array(arr, copy=True)
@@ -481,17 +449,42 @@ def from_array_interface(interface: dict) -> NumpyOrCupy:
    return out


+def make_array_interface(
+    ptr: CNumericPtr, shape: Tuple[int, ...], dtype: Type[np.number], is_cuda: bool
+) -> Dict[str, Union[int, tuple, None]]:
+    """Make an __(cuda)_array_interface__ from a pointer."""
+    # Use an empty array to handle typestr and descr
+    if is_cuda:
+        empty = import_cupy().empty(shape=(0,), dtype=dtype)
+        array = empty.__cuda_array_interface__  # pylint: disable=no-member
+    else:
+        empty = np.empty(shape=(0,), dtype=dtype)
+        array = empty.__array_interface__  # pylint: disable=no-member
+
+    addr = ctypes.cast(ptr, ctypes.c_void_p).value
+    length = int(np.prod(shape))
+    # Handle empty dataset.
+    assert addr is not None or length == 0
+
+    if addr is None:
+        return array
+
+    array["data"] = (addr, True)
+    if is_cuda:
+        array["stream"] = 2
+    array["shape"] = shape
+    array["strides"] = None
+    return array
+
+
 def _prediction_output(
    shape: CNumericPtr, dims: c_bst_ulong, predts: CFloatPtr, is_cuda: bool
 ) -> NumpyOrCupy:
-    arr_shape = ctypes2numpy(shape, dims.value, np.uint64)
-    length = int(np.prod(arr_shape))
-    if is_cuda:
-        arr_predict = ctypes2cupy(predts, length, np.float32)
-    else:
-        arr_predict = ctypes2numpy(predts, length, np.float32)
-    arr_predict = arr_predict.reshape(arr_shape)
-    return arr_predict
+    arr_shape = tuple(ctypes2numpy(shape, dims.value, np.uint64).flatten())
+    array = from_array_interface(
+        make_array_interface(predts, arr_shape, np.float32, is_cuda)
+    )
+    return array


 class DataIter(ABC):  # pylint: disable=too-many-instance-attributes
@@ -795,7 +788,7 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes,too-many-public-m
                 so it doesn't make sense to assign weights to individual data points.

        base_margin :
-            Base margin used for boosting from existing model.
+            Global bias for each instance. See :doc:`/tutorials/intercept` for details.
        missing :
            Value in the input data which needs to be present as a missing value. If
            None, defaults to np.nan.
@@ -832,9 +825,19 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes,too-many-public-m

            .. note:: This parameter is experimental

-            Experimental support of specializing for categorical features.  Do not set
-            to True unless you are interested in development. Also, JSON/UBJSON
-            serialization format is required.
+            Experimental support of specializing for categorical features.
+
+            If passing 'True' and 'data' is a data frame (from supported libraries such
+            as Pandas, Modin or cuDF), columns of categorical types will automatically
+            be set to be of categorical type (feature_type='c') in the resulting
+            DMatrix.
+
+            If passing 'False' and 'data' is a data frame with categorical columns,
+            it will result in an error being thrown.
+
+            If 'data' is not a data frame, this argument is ignored.
+
+            JSON/UBJSON serialization format is required for this.

        """
        if group is not None and qid is not None:
@@ -1441,6 +1444,12 @@ class _ProxyDMatrix(DMatrix):
            _LIB.XGProxyDMatrixSetDataDense(self.handle, _array_interface(data))
        )

+    def _set_data_from_pandas(self, data: DataType) -> None:
+        """Set data from a pandas DataFrame. The input is a PandasTransformed instance."""
+        _check_call(
+            _LIB.XGProxyDMatrixSetDataColumnar(self.handle, data.array_interface())
+        )
+
    def _set_data_from_csr(self, csr: scipy.sparse.csr_matrix) -> None:
        """Set data from scipy csr"""
        from .data import _array_interface
@@ -2096,7 +2105,7 @@ class Booster:
            _array_interface,
            _cuda_array_interface,
            _ensure_np_dtype,
-            _is_cupy_array,
+            _is_cupy_alike,
        )

        self._assign_dmatrix_features(dtrain)
@@ -2110,7 +2119,7 @@ class Booster:
                "Expecting `np.ndarray` or `cupy.ndarray` for gradient and hessian."
                f" Got: {type(array)}"
            )
-            if not isinstance(array, np.ndarray) and not _is_cupy_array(array):
+            if not isinstance(array, np.ndarray) and not _is_cupy_alike(array):
                raise TypeError(msg)

            n_samples = dtrain.num_row()
@@ -2125,7 +2134,7 @@ class Booster:
            if isinstance(array, np.ndarray):
                array, _ = _ensure_np_dtype(array, array.dtype)
                interface = _array_interface(array)
-            elif _is_cupy_array(array):
+            elif _is_cupy_alike(array):
                interface = _cuda_array_interface(array)
            else:
                raise TypeError(msg)
@@ -2450,11 +2459,12 @@ class Booster:
        assert proxy is None or isinstance(proxy, _ProxyDMatrix)

        from .data import (
+            PandasTransformed,
            _array_interface,
            _arrow_transform,
            _is_arrow,
            _is_cudf_df,
-            _is_cupy_array,
+            _is_cupy_alike,
            _is_list,
            _is_np_array_like,
            _is_pandas_df,
@@ -2504,6 +2514,19 @@ class Booster:
                )
            )
            return _prediction_output(shape, dims, preds, False)
+        if isinstance(data, PandasTransformed):
+            _check_call(
+                _LIB.XGBoosterPredictFromColumnar(
+                    self.handle,
+                    data.array_interface(),
+                    args,
+                    p_handle,
+                    ctypes.byref(shape),
+                    ctypes.byref(dims),
+                    ctypes.byref(preds),
+                )
+            )
+            return _prediction_output(shape, dims, preds, False)
        if isinstance(data, scipy.sparse.csr_matrix):
            from .data import transform_scipy_sparse

@@ -2523,7 +2546,7 @@ class Booster:
                )
            )
            return _prediction_output(shape, dims, preds, False)
-        if _is_cupy_array(data):
+        if _is_cupy_alike(data):
            from .data import _transform_cupy_array

            data = _transform_cupy_array(data)
@@ -2571,9 +2594,8 @@ class Booster:

        The model is saved in an XGBoost internal format which is universal among the
        various XGBoost interfaces. Auxiliary attributes of the Python Booster object
-        (such as feature_names) will not be saved when using binary format.  To save
-        those attributes, use JSON/UBJ instead. See :doc:`Model IO
-        </tutorials/saving_model>` for more info.
+        (such as feature_names) are only saved when using JSON or UBJSON (default)
+        format. See :doc:`Model IO </tutorials/saving_model>` for more info.

        .. code-block:: python

@@ -2593,15 +2615,18 @@ class Booster:
        else:
            raise TypeError("fname must be a string or os PathLike")

-    def save_raw(self, raw_format: str = "deprecated") -> bytearray:
+    def save_raw(self, raw_format: str = "ubj") -> bytearray:
        """Save the model to a in memory buffer representation instead of file.

+        The model is saved in an XGBoost internal format which is universal among the
+        various XGBoost interfaces. Auxiliary attributes of the Python Booster object
+        (such as feature_names) are only saved when using JSON or UBJSON (default)
+        format. See :doc:`Model IO </tutorials/saving_model>` for more info.
+
        Parameters
        ----------
        raw_format :
-            Format of output buffer. Can be `json`, `ubj` or `deprecated`.  Right now
-            the default is `deprecated` but it will be changed to `ubj` (univeral binary
-            json) in the future.
+            Format of output buffer. Can be `json`, `ubj` or `deprecated`.

        Returns
        -------
@@ -2620,11 +2645,10 @@ class Booster:
    def load_model(self, fname: ModelIn) -> None:
        """Load the model from a file or a bytearray.

-        The model is loaded from XGBoost format which is universal among the various
-        XGBoost interfaces. Auxiliary attributes of the Python Booster object (such as
-        feature_names) will not be loaded when using binary format.  To save those
-        attributes, use JSON/UBJ instead.  See :doc:`Model IO </tutorials/saving_model>`
-        for more info.
+        The model is loaded from an XGBoost internal format which is universal among
+        the various XGBoost interfaces. Auxiliary attributes of the Python Booster
+        object (such as feature_names) are only saved when using JSON or UBJSON
+        (default) format. See :doc:`Model IO </tutorials/saving_model>` for more info.

        .. code-block:: python

@@ -2749,9 +2773,9 @@ class Booster:
        with_stats: bool = False,
        dump_format: str = "text",
    ) -> List[str]:
-        """Returns the model dump as a list of strings.  Unlike :py:meth:`save_model`, the output
-        format is primarily used for visualization or interpretation, hence it's more
-        human readable but cannot be loaded back to XGBoost.
+        """Returns the model dump as a list of strings.  Unlike :py:meth:`save_model`,
+        the output format is primarily used for visualization or interpretation, hence
+        it's more human readable but cannot be loaded back to XGBoost.

        Parameters
        ----------
--- a/python-package/xgboost/dask/__init__.py
+++ b/python-package/xgboost/dask/__init__.py
@@ -75,7 +75,7 @@ from xgboost.core import (
    _deprecate_positional_args,
    _expect,
 )
-from xgboost.data import _is_cudf_ser, _is_cupy_array
+from xgboost.data import _is_cudf_ser, _is_cupy_alike
 from xgboost.sklearn import (
    XGBClassifier,
    XGBClassifierBase,
@@ -1909,7 +1909,7 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
            self.classes_ = await self.client.compute(y.drop_duplicates())
        if _is_cudf_ser(self.classes_):
            self.classes_ = self.classes_.to_cupy()
-        if _is_cupy_array(self.classes_):
+        if _is_cupy_alike(self.classes_):
            self.classes_ = self.classes_.get()
        self.classes_ = numpy.array(self.classes_)
        self.n_classes_ = len(self.classes_)
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -26,6 +26,7 @@ from .core import (
    DataIter,
    DataSplitMode,
    DMatrix,
+    _array_hasobject,
    _check_call,
    _cuda_array_interface,
    _ProxyDMatrix,
@@ -57,21 +58,36 @@ def _check_data_shape(data: DataType) -> None:
        raise ValueError("Please reshape the input data into 2-dimensional matrix.")


-def _is_scipy_csr(data: DataType) -> bool:
+def is_scipy_csr(data: DataType) -> bool:
+    """Predicate for scipy CSR input."""
+    is_array = False
+    is_matrix = False
    try:
-        import scipy.sparse
+        from scipy.sparse import csr_array
+
+        is_array = isinstance(data, csr_array)
    except ImportError:
-        return False
-    return isinstance(data, scipy.sparse.csr_matrix)
+        pass
+    try:
+        from scipy.sparse import csr_matrix
+
+        is_matrix = isinstance(data, csr_matrix)
+    except ImportError:
+        pass
+    return is_array or is_matrix


-def _array_interface(data: np.ndarray) -> bytes:
-    assert (
-        data.dtype.hasobject is False
-    ), "Input data contains `object` dtype.  Expecting numeric data."
+def _array_interface_dict(data: np.ndarray) -> dict:
+    if _array_hasobject(data):
+        raise ValueError("Input data contains `object` dtype.  Expecting numeric data.")
    interface = data.__array_interface__
    if "mask" in interface:
        interface["mask"] = interface["mask"].__array_interface__
+    return interface
+
+
+def _array_interface(data: np.ndarray) -> bytes:
+    interface = _array_interface_dict(data)
    interface_str = bytes(json.dumps(interface), "utf-8")
    return interface_str

@@ -130,12 +146,23 @@ def _from_scipy_csr(
    return handle, feature_names, feature_types


-def _is_scipy_csc(data: DataType) -> bool:
+def is_scipy_csc(data: DataType) -> bool:
+    """Predicate for scipy CSC input."""
+    is_array = False
+    is_matrix = False
    try:
-        import scipy.sparse
+        from scipy.sparse import csc_array
+
+        is_array = isinstance(data, csc_array)
    except ImportError:
-        return False
-    return isinstance(data, scipy.sparse.csc_matrix)
+        pass
+    try:
+        from scipy.sparse import csc_matrix
+
+        is_matrix = isinstance(data, csc_matrix)
+    except ImportError:
+        pass
+    return is_array or is_matrix


 def _from_scipy_csc(
@@ -166,12 +193,23 @@ def _from_scipy_csc(
    return handle, feature_names, feature_types


-def _is_scipy_coo(data: DataType) -> bool:
+def is_scipy_coo(data: DataType) -> bool:
+    """Predicate for scipy COO input."""
+    is_array = False
+    is_matrix = False
    try:
-        import scipy.sparse
+        from scipy.sparse import coo_array
+
+        is_array = isinstance(data, coo_array)
    except ImportError:
-        return False
-    return isinstance(data, scipy.sparse.coo_matrix)
+        pass
+    try:
+        from scipy.sparse import coo_matrix
+
+        is_matrix = isinstance(data, coo_matrix)
+    except ImportError:
+        pass
+    return is_array or is_matrix


 def _is_np_array_like(data: DataType) -> bool:
@@ -181,7 +219,7 @@ def _is_np_array_like(data: DataType) -> bool:
 def _ensure_np_dtype(
    data: DataType, dtype: Optional[NumpyDType]
 ) -> Tuple[np.ndarray, Optional[NumpyDType]]:
-    if data.dtype.hasobject or data.dtype in [np.float16, np.bool_]:
+    if _array_hasobject(data) or data.dtype in [np.float16, np.bool_]:
        dtype = np.float32
        data = data.astype(dtype, copy=False)
    if not data.flags.aligned:
@@ -265,24 +303,24 @@ pandas_nullable_mapper = {
    "Int16": "int",
    "Int32": "int",
    "Int64": "int",
-    "UInt8": "i",
-    "UInt16": "i",
-    "UInt32": "i",
-    "UInt64": "i",
+    "UInt8": "int",
+    "UInt16": "int",
+    "UInt32": "int",
+    "UInt64": "int",
    "Float32": "float",
    "Float64": "float",
    "boolean": "i",
 }

 pandas_pyarrow_mapper = {
-    "int8[pyarrow]": "i",
-    "int16[pyarrow]": "i",
-    "int32[pyarrow]": "i",
-    "int64[pyarrow]": "i",
-    "uint8[pyarrow]": "i",
-    "uint16[pyarrow]": "i",
-    "uint32[pyarrow]": "i",
-    "uint64[pyarrow]": "i",
+    "int8[pyarrow]": "int",
+    "int16[pyarrow]": "int",
+    "int32[pyarrow]": "int",
+    "int64[pyarrow]": "int",
+    "uint8[pyarrow]": "int",
+    "uint16[pyarrow]": "int",
+    "uint32[pyarrow]": "int",
+    "uint64[pyarrow]": "int",
    "float[pyarrow]": "float",
    "float32[pyarrow]": "float",
    "double[pyarrow]": "float",
@@ -295,7 +333,7 @@ _pandas_dtype_mapper.update(pandas_pyarrow_mapper)


 _ENABLE_CAT_ERR = (
-    "When categorical type is supplied, The experimental DMatrix parameter"
+    "When categorical type is supplied, the experimental DMatrix parameter"
    "`enable_categorical` must be set to `True`."
 )

@@ -407,89 +445,122 @@ def is_pd_sparse_dtype(dtype: PandasDType) -> bool:
    return is_sparse(dtype)


-def pandas_cat_null(data: DataFrame) -> DataFrame:
-    """Handle categorical dtype and nullable extension types from pandas."""
-    import pandas as pd
-
-    # handle category codes and nullable.
-    cat_columns = []
-    nul_columns = []
-    # avoid an unnecessary conversion if possible
-    for col, dtype in zip(data.columns, data.dtypes):
-        if is_pd_cat_dtype(dtype):
-            cat_columns.append(col)
-        elif is_pa_ext_categorical_dtype(dtype):
-            raise ValueError(
-                "pyarrow dictionary type is not supported. Use pandas category instead."
-            )
-        elif is_nullable_dtype(dtype):
-            nul_columns.append(col)
-
-    if cat_columns or nul_columns:
-        # Avoid transformation due to: PerformanceWarning: DataFrame is highly
-        # fragmented
-        transformed = data.copy(deep=False)
-    else:
-        transformed = data
-
-    def cat_codes(ser: pd.Series) -> pd.Series:
-        if is_pd_cat_dtype(ser.dtype):
-            return ser.cat.codes
-        assert is_pa_ext_categorical_dtype(ser.dtype)
-        # Not yet supported, the index is not ordered for some reason. Alternately:
-        # `combine_chunks().to_pandas().cat.codes`. The result is the same.
-        return ser.array.__arrow_array__().combine_chunks().dictionary_encode().indices
-
-    if cat_columns:
-        # DF doesn't have the cat attribute, as a result, we use apply here
-        transformed[cat_columns] = (
-            transformed[cat_columns]
-            .apply(cat_codes)
-            .astype(np.float32)
-            .replace(-1.0, np.NaN)
-        )
-    if nul_columns:
-        transformed[nul_columns] = transformed[nul_columns].astype(np.float32)
-
-    # TODO(jiamingy): Investigate the possibility of using dataframe protocol or arrow
-    # IPC format for pandas so that we can apply the data transformation inside XGBoost
-    # for better memory efficiency.
-
-    return transformed
-
-
-def pandas_ext_num_types(data: DataFrame) -> DataFrame:
-    """Experimental suppport for handling pyarrow extension numeric types."""
+def pandas_pa_type(ser: Any) -> np.ndarray:
+    """Handle pandas pyarrow extension."""
    import pandas as pd
    import pyarrow as pa

+    # No copy, callstack:
+    # pandas.core.internals.managers.SingleBlockManager.array_values()
+    # pandas.core.internals.blocks.EABackedBlock.values
+    d_array: pd.arrays.ArrowExtensionArray = ser.array
+    # no copy in __arrow_array__
+    # ArrowExtensionArray._data is a chunked array
+    aa: pa.ChunkedArray = d_array.__arrow_array__()
+    # combine_chunks takes the most significant amount of time
+    chunk: pa.Array = aa.combine_chunks()
+    # When there's null value, we have to use copy
+    zero_copy = chunk.null_count == 0
+    # Alternately, we can use chunk.buffers(), which returns a list of buffers and
+    # we need to concatenate them ourselves.
+    # FIXME(jiamingy): Is there a better way to access the arrow buffer along with
+    # its mask?
+    # Buffers from chunk.buffers() have the address attribute, but don't expose the
+    # mask.
+    arr: np.ndarray = chunk.to_numpy(zero_copy_only=zero_copy, writable=False)
+    arr, _ = _ensure_np_dtype(arr, arr.dtype)
+    return arr
+
+
+def pandas_transform_data(data: DataFrame) -> List[np.ndarray]:
+    """Handle categorical dtype and extension types from pandas."""
+    import pandas as pd
+    from pandas import Float32Dtype, Float64Dtype
+
+    result: List[np.ndarray] = []
+
+    def cat_codes(ser: pd.Series) -> np.ndarray:
+        if is_pd_cat_dtype(ser.dtype):
+            return _ensure_np_dtype(
+                ser.cat.codes.astype(np.float32)
+                .replace(-1.0, np.NaN)
+                .to_numpy(na_value=np.nan),
+                np.float32,
+            )[0]
+        # Not yet supported, the index is not ordered for some reason. Alternately:
+        # `combine_chunks().to_pandas().cat.codes`. The result is the same.
+        assert is_pa_ext_categorical_dtype(ser.dtype)
+        return (
+            ser.array.__arrow_array__()
+            .combine_chunks()
+            .dictionary_encode()
+            .indices.astype(np.float32)
+            .replace(-1.0, np.NaN)
+        )
+
+    def nu_type(ser: pd.Series) -> np.ndarray:
+        # Avoid conversion when possible
+        if isinstance(dtype, Float32Dtype):
+            res_dtype: NumpyDType = np.float32
+        elif isinstance(dtype, Float64Dtype):
+            res_dtype = np.float64
+        else:
+            res_dtype = np.float32
+        return _ensure_np_dtype(
+            ser.to_numpy(dtype=res_dtype, na_value=np.nan), res_dtype
+        )[0]
+
+    def oth_type(ser: pd.Series) -> np.ndarray:
+        # The dtypes module is added in 1.25.
+        npdtypes = np.lib.NumpyVersion(np.__version__) > np.lib.NumpyVersion("1.25.0")
+        npdtypes = npdtypes and isinstance(
+            ser.dtype,
+            (
+                # pylint: disable=no-member
+                np.dtypes.Float32DType,  # type: ignore
+                # pylint: disable=no-member
+                np.dtypes.Float64DType,  # type: ignore
+            ),
+        )
+
+        if npdtypes or dtype in {np.float32, np.float64}:
+            array = ser.to_numpy()
+        else:
+            # Specifying the dtype can significantly slow down the conversion (about
+            # 15% slow down for dense inplace-predict)
+            array = ser.to_numpy(dtype=np.float32, na_value=np.nan)
+        return _ensure_np_dtype(array, array.dtype)[0]
+
    for col, dtype in zip(data.columns, data.dtypes):
-        if not is_pa_ext_dtype(dtype):
-            continue
-        # No copy, callstack:
-        # pandas.core.internals.managers.SingleBlockManager.array_values()
-        # pandas.core.internals.blocks.EABackedBlock.values
-        d_array: pd.arrays.ArrowExtensionArray = data[col].array
-        # no copy in __arrow_array__
-        # ArrowExtensionArray._data is a chunked array
-        aa: pa.ChunkedArray = d_array.__arrow_array__()
-        chunk: pa.Array = aa.combine_chunks()
-        # Alternately, we can use chunk.buffers(), which returns a list of buffers and
-        # we need to concatenate them ourselves.
-        arr = chunk.__array__()
-        data[col] = arr
-    return data
+        if is_pa_ext_categorical_dtype(dtype):
+            raise ValueError(
+                "pyarrow dictionary type is not supported. Use pandas category instead."
+            )
+        if is_pd_cat_dtype(dtype):
+            result.append(cat_codes(data[col]))
+        elif is_pa_ext_dtype(dtype):
+            result.append(pandas_pa_type(data[col]))
+        elif is_nullable_dtype(dtype):
+            result.append(nu_type(data[col]))
+        elif is_pd_sparse_dtype(dtype):
+            arr = cast(pd.arrays.SparseArray, data[col].values)
+            arr = arr.to_dense()
+            if _is_np_array_like(arr):
+                arr, _ = _ensure_np_dtype(arr, arr.dtype)
+            result.append(arr)
+        else:
+            result.append(oth_type(data[col]))
+
+    # FIXME(jiamingy): Investigate the possibility of using dataframe protocol or arrow
+    # IPC format for pandas so that we can apply the data transformation inside XGBoost
+    # for better memory efficiency.
+    return result


-def _transform_pandas_df(
-    data: DataFrame,
-    enable_categorical: bool,
-    feature_names: Optional[FeatureNames] = None,
-    feature_types: Optional[FeatureTypes] = None,
-    meta: Optional[str] = None,
-    meta_type: Optional[NumpyDType] = None,
-) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]:
-    pyarrow_extension = False
+def pandas_check_dtypes(data: DataFrame, enable_categorical: bool) -> None:
+    """Validate the input dtypes and warn when pandas sparse columns are present."""
+    sparse_extension = False
+
    for dtype in data.dtypes:
        if not (
            (dtype.name in _pandas_dtype_mapper)
@@ -498,27 +569,65 @@ def _transform_pandas_df(
            or is_pa_ext_dtype(dtype)
        ):
            _invalid_dataframe_dtype(data)
-        if is_pa_ext_dtype(dtype):
-            pyarrow_extension = True
+
+        if is_pd_sparse_dtype(dtype):
+            sparse_extension = True
+
+    if sparse_extension:
+        warnings.warn("Sparse arrays from pandas are converted into dense.")
+
+
+class PandasTransformed:
+    """A storage class for transformed pandas DataFrame."""
+
+    def __init__(self, columns: List[np.ndarray]) -> None:
+        self.columns = columns
+
+    def array_interface(self) -> bytes:
+        """Return a byte string for JSON encoded array interface."""
+        aitfs = list(map(_array_interface_dict, self.columns))
+        sarrays = bytes(json.dumps(aitfs), "utf-8")
+        return sarrays
+
+    @property
+    def shape(self) -> Tuple[int, int]:
+        """Return shape of the transformed DataFrame."""
+        return self.columns[0].shape[0], len(self.columns)
+
+
+def _transform_pandas_df(
+    data: DataFrame,
+    enable_categorical: bool,
+    feature_names: Optional[FeatureNames] = None,
+    feature_types: Optional[FeatureTypes] = None,
+    meta: Optional[str] = None,
+) -> Tuple[PandasTransformed, Optional[FeatureNames], Optional[FeatureTypes]]:
+    pandas_check_dtypes(data, enable_categorical)
+    if meta and len(data.columns) > 1 and meta not in _matrix_meta:
+        raise ValueError(f"DataFrame for {meta} cannot have multiple columns")

    feature_names, feature_types = pandas_feature_info(
        data, meta, feature_names, feature_types, enable_categorical
    )

-    transformed = pandas_cat_null(data)
-    if pyarrow_extension:
-        if transformed is data:
-            transformed = data.copy(deep=False)
-        transformed = pandas_ext_num_types(transformed)
+    arrays = pandas_transform_data(data)
+    return PandasTransformed(arrays), feature_names, feature_types

-    if meta and len(data.columns) > 1 and meta not in _matrix_meta:
-        raise ValueError(f"DataFrame for {meta} cannot have multiple columns")

-    dtype = meta_type if meta_type else np.float32
-    arr: np.ndarray = transformed.values
-    if meta_type:
-        arr = arr.astype(dtype)
-    return arr, feature_names, feature_types
+def _meta_from_pandas_df(
+    data: DataType,
+    name: str,
+    dtype: Optional[NumpyDType],
+    handle: ctypes.c_void_p,
+) -> None:
+    data, _, _ = _transform_pandas_df(data, False, meta=name)
+    if len(data.columns) == 1:
+        array = data.columns[0]
+    else:
+        array = np.stack(data.columns).T
+
+    array, dtype = _ensure_np_dtype(array, dtype)
+    _meta_from_numpy(array, name, dtype, handle)


 def _from_pandas_df(
@@ -530,12 +639,21 @@ def _from_pandas_df(
    feature_types: Optional[FeatureTypes],
    data_split_mode: DataSplitMode = DataSplitMode.ROW,
 ) -> DispatchedDataBackendReturnType:
-    data, feature_names, feature_types = _transform_pandas_df(
+    df, feature_names, feature_types = _transform_pandas_df(
        data, enable_categorical, feature_names, feature_types
    )
-    return _from_numpy_array(
-        data, missing, nthread, feature_names, feature_types, data_split_mode
+
+    handle = ctypes.c_void_p()
+    _check_call(
+        _LIB.XGDMatrixCreateFromColumnar(
+            df.array_interface(),
+            make_jcargs(
+                nthread=nthread, missing=missing, data_split_mode=data_split_mode
+            ),
+            ctypes.byref(handle),
+        )
    )
+    return handle, feature_names, feature_types


 def _is_pandas_series(data: DataType) -> bool:
@@ -550,7 +668,12 @@ def _meta_from_pandas_series(
    data: DataType, name: str, dtype: Optional[NumpyDType], handle: ctypes.c_void_p
 ) -> None:
    """Help transform pandas series for meta data like labels"""
-    data = data.values.astype("float")
+    if is_pd_sparse_dtype(data.dtype):
+        data = data.values.to_dense().astype(np.float32)
+    elif is_pa_ext_dtype(data.dtype):
+        data = pandas_pa_type(data)
+    else:
+        data = data.to_numpy(np.float32, na_value=np.nan)

    if is_pd_sparse_dtype(getattr(data, "dtype", data)):
        data = data.to_dense()  # type: ignore
@@ -732,6 +855,8 @@ def _arrow_transform(data: DataType) -> Any:
            return pd.ArrowDtype(pa.bool_())
        return None

+    # For common cases, this is zero-copy, can check with:
+    # pa.total_allocated_bytes()
    df = data.to_pandas(types_mapper=type_mapper)
    return df

@@ -859,11 +984,10 @@ def _from_cudf_df(
    )
    interfaces_str = _cudf_array_interfaces(data, cat_codes)
    handle = ctypes.c_void_p()
-    config = bytes(json.dumps({"missing": missing, "nthread": nthread}), "utf-8")
    _check_call(
        _LIB.XGDMatrixCreateFromCudaColumnar(
            interfaces_str,
-            config,
+            make_jcargs(nthread=nthread, missing=missing),
            ctypes.byref(handle),
        )
    )
@@ -874,11 +998,8 @@ def _is_cudf_ser(data: DataType) -> bool:
    return lazy_isinstance(data, "cudf.core.series", "Series")


-def _is_cupy_array(data: DataType) -> bool:
-    return any(
-        lazy_isinstance(data, n, "ndarray")
-        for n in ("cupy.core.core", "cupy", "cupy._core.core")
-    )
+def _is_cupy_alike(data: DataType) -> bool:
+    return hasattr(data, "__cuda_array_interface__")


 def _transform_cupy_array(data: DataType) -> CupyT:
@@ -886,7 +1007,7 @@ def _transform_cupy_array(data: DataType) -> CupyT:

    if not hasattr(data, "__cuda_array_interface__") and hasattr(data, "__array__"):
        data = cupy.array(data, copy=False)
-    if data.dtype.hasobject or data.dtype in [cupy.bool_]:
+    if _array_hasobject(data) or data.dtype in [cupy.bool_]:
        data = data.astype(cupy.float32, copy=False)
    return data

@@ -1047,15 +1168,15 @@ def dispatch_data_backend(
    """Dispatch data for DMatrix."""
    if not _is_cudf_ser(data) and not _is_pandas_series(data):
        _check_data_shape(data)
-    if _is_scipy_csr(data):
+    if is_scipy_csr(data):
        return _from_scipy_csr(
            data, missing, threads, feature_names, feature_types, data_split_mode
        )
-    if _is_scipy_csc(data):
+    if is_scipy_csc(data):
        return _from_scipy_csc(
            data, missing, threads, feature_names, feature_types, data_split_mode
        )
-    if _is_scipy_coo(data):
+    if is_scipy_coo(data):
        return _from_scipy_csr(
            data.tocsr(),
            missing,
@@ -1098,7 +1219,7 @@ def dispatch_data_backend(
        return _from_cudf_df(
            data, missing, threads, feature_names, feature_types, enable_categorical
        )
-    if _is_cupy_array(data):
+    if _is_cupy_alike(data):
        return _from_cupy_array(data, missing, threads, feature_names, feature_types)
    if _is_cupy_csr(data):
        raise TypeError("cupyx CSR is not supported yet.")
@@ -1221,8 +1342,7 @@ def dispatch_meta_backend(
    if _is_arrow(data):
        data = _arrow_transform(data)
    if _is_pandas_df(data):
-        data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype)
-        _meta_from_numpy(data, name, dtype, handle)
+        _meta_from_pandas_df(data, name, dtype=dtype, handle=handle)
        return
    if _is_pandas_series(data):
        _meta_from_pandas_series(data, name, dtype, handle)
@@ -1231,7 +1351,7 @@ def dispatch_meta_backend(
        data = _transform_dlpack(data)
        _meta_from_cupy_array(data, name, handle)
        return
-    if _is_cupy_array(data):
+    if _is_cupy_alike(data):
        _meta_from_cupy_array(data, name, handle)
        return
    if _is_cudf_ser(data):
@@ -1244,8 +1364,7 @@ def dispatch_meta_backend(
        _meta_from_dt(data, name, dtype, handle)
        return
    if _is_modin_df(data):
-        data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype)
-        _meta_from_numpy(data, name, dtype, handle)
+        _meta_from_pandas_df(data, name, dtype=dtype, handle=handle)
        return
    if _is_modin_series(data):
        data = data.values.astype("float")
@@ -1297,7 +1416,7 @@ def _proxy_transform(
        return _transform_cudf_df(
            data, feature_names, feature_types, enable_categorical
        )
-    if _is_cupy_array(data):
+    if _is_cupy_alike(data):
        data = _transform_cupy_array(data)
        return data, None, feature_names, feature_types
    if _is_dlpack(data):
@@ -1307,9 +1426,15 @@ def _proxy_transform(
    if _is_np_array_like(data):
        data, _ = _ensure_np_dtype(data, data.dtype)
        return data, None, feature_names, feature_types
-    if _is_scipy_csr(data):
+    if is_scipy_csr(data):
        data = transform_scipy_sparse(data, True)
        return data, None, feature_names, feature_types
+    if is_scipy_csc(data):
+        data = transform_scipy_sparse(data.tocsr(), True)
+        return data, None, feature_names, feature_types
+    if is_scipy_coo(data):
+        data = transform_scipy_sparse(data.tocsr(), True)
+        return data, None, feature_names, feature_types
    if _is_pandas_series(data):
        import pandas as pd

@@ -1317,11 +1442,10 @@ def _proxy_transform(
    if _is_arrow(data):
        data = _arrow_transform(data)
    if _is_pandas_df(data):
-        arr, feature_names, feature_types = _transform_pandas_df(
+        df, feature_names, feature_types = _transform_pandas_df(
            data, enable_categorical, feature_names, feature_types
        )
-        arr, _ = _ensure_np_dtype(arr, arr.dtype)
-        return arr, None, feature_names, feature_types
+        return df, None, feature_names, feature_types
    raise TypeError("Value type is not supported for data iterator:" + str(type(data)))


@@ -1343,7 +1467,7 @@ def dispatch_proxy_set_data(
        # pylint: disable=W0212
        proxy._set_data_from_cuda_columnar(data, cast(List, cat_codes))
        return
-    if _is_cupy_array(data):
+    if _is_cupy_alike(data):
        proxy._set_data_from_cuda_interface(data)  # pylint: disable=W0212
        return
    if _is_dlpack(data):
@@ -1356,11 +1480,14 @@ def dispatch_proxy_set_data(
    if not allow_host:
        raise err

+    if isinstance(data, PandasTransformed):
+        proxy._set_data_from_pandas(data)  # pylint: disable=W0212
+        return
    if _is_np_array_like(data):
        _check_data_shape(data)
        proxy._set_data_from_array(data)  # pylint: disable=W0212
        return
-    if _is_scipy_csr(data):
+    if is_scipy_csr(data):
        proxy._set_data_from_csr(data)  # pylint: disable=W0212
        return
    raise err
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -39,7 +39,7 @@ from .core import (
    _deprecate_positional_args,
    _parse_eval_str,
 )
-from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array, _is_pandas_df
+from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_alike, _is_pandas_df
 from .training import train


@@ -192,11 +192,16 @@ __model_doc = f"""
        Boosting learning rate (xgb's "eta")
    verbosity : Optional[int]
        The degree of verbosity. Valid values are 0 (silent) - 3 (debug).
+
    objective : {SklObjective}
-        Specify the learning task and the corresponding learning objective or
-        a custom objective function to be used (see note below).
+
+        Specify the learning task and the corresponding learning objective or a custom
+        objective function to be used. For custom objective, see
+        :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more
+        information.
+
    booster: Optional[str]
-        Specify which booster to use: gbtree, gblinear or dart.
+        Specify which booster to use: `gbtree`, `gblinear` or `dart`.
    tree_method: Optional[str]
        Specify which tree method to use.  Default to auto.  If this parameter is set to
        default, XGBoost will choose the most conservative option available.  It's
@@ -276,13 +281,7 @@ __model_doc = f"""

    enable_categorical : bool

-        .. versionadded:: 1.5.0
-
-        .. note:: This parameter is experimental
-
-        Experimental support for categorical data.  When enabled, cudf/pandas.DataFrame
-        should be used to specify categorical data type.  Also, JSON/UBJSON
-        serialization format is required.
+        See the same parameter of :py:class:`DMatrix` for details.

    feature_types : Optional[FeatureTypes]

@@ -334,21 +333,21 @@ __model_doc = f"""

        Metric used for monitoring the training result and early stopping.  It can be a
        string or list of strings as names of predefined metric in XGBoost (See
-        doc/parameter.rst), one of the metrics in :py:mod:`sklearn.metrics`, or any other
-        user defined metric that looks like `sklearn.metrics`.
+        doc/parameter.rst), one of the metrics in :py:mod:`sklearn.metrics`, or any
+        other user defined metric that looks like `sklearn.metrics`.

        If custom objective is also provided, then custom metric should implement the
        corresponding reverse link function.

        Unlike the `scoring` parameter commonly used in scikit-learn, when a callable
-        object is provided, it's assumed to be a cost function and by default XGBoost will
-        minimize the result during early stopping.
+        object is provided, it's assumed to be a cost function and by default XGBoost
+        will minimize the result during early stopping.

-        For advanced usage on Early stopping like directly choosing to maximize instead of
-        minimize, see :py:obj:`xgboost.callback.EarlyStopping`.
+        For advanced usage on Early stopping like directly choosing to maximize instead
+        of minimize, see :py:obj:`xgboost.callback.EarlyStopping`.

-        See :doc:`Custom Objective and Evaluation Metric </tutorials/custom_metric_obj>`
-        for more.
+        See :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more
+        information.

        .. note::

@@ -1012,7 +1011,7 @@ class XGBModel(XGBModelBase):
        sample_weight :
            instance weights
        base_margin :
-            global bias for each instance.
+            Global bias for each instance. See :doc:`/tutorials/intercept` for details.
        eval_set :
            A list of (X, y) tuple pairs to use as validation sets, for which
            metrics will be computed.
@@ -1152,7 +1151,7 @@ class XGBModel(XGBModelBase):
            When this is True, validate that the Booster's and data's feature_names are
            identical.  Otherwise, it is assumed that the feature_names are the same.
        base_margin :
-            Margin added to prediction.
+            Global bias for each instance. See :doc:`/tutorials/intercept` for details.
        iteration_range :
            Specifies which layer of trees are used in prediction.  For example, if a
            random forest is trained with 100 rounds.  Specifying ``iteration_range=(10,
@@ -1178,7 +1177,7 @@ class XGBModel(XGBModelBase):
                        base_margin=base_margin,
                        validate_features=validate_features,
                    )
-                    if _is_cupy_array(predts):
+                    if _is_cupy_alike(predts):
                        import cupy  # pylint: disable=import-error

                        predts = cupy.asnumpy(predts)  # ensure numpy array is used.
@@ -1459,7 +1458,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
                classes = cp.unique(y.values)
                self.n_classes_ = len(classes)
                expected_classes = cp.array(self.classes_)
-            elif _is_cupy_array(y):
+            elif _is_cupy_alike(y):
                import cupy as cp  # pylint: disable=E0401

                classes = cp.unique(y)
@@ -1605,7 +1604,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
            When this is True, validate that the Booster's and data's feature_names are
            identical.  Otherwise, it is assumed that the feature_names are the same.
        base_margin :
-            Margin added to prediction.
+            Global bias for each instance. See :doc:`/tutorials/intercept` for details.
        iteration_range :
            Specifies which layer of trees are used in prediction.  For example, if a
            random forest is trained with 100 rounds.  Specifying `iteration_range=(10,
@@ -1948,7 +1947,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
                weights to individual data points.

        base_margin :
-            Global bias for each instance.
+            Global bias for each instance. See :doc:`/tutorials/intercept` for details.
        eval_set :
            A list of (X, y) tuple pairs to use as validation sets, for which
            metrics will be computed.
--- a/python-package/xgboost/testing/init.py
+++ b/python-package/xgboost/testing/init.py
@@ -630,7 +630,7 @@ sparse_datasets_strategy = strategies.sampled_from(

 def make_datasets_with_margin(
    unweighted_strategy: strategies.SearchStrategy,
-) -> Callable:
+) -> Callable[[], strategies.SearchStrategy[TestDataset]]:
    """Factory function for creating strategies that generates datasets with weight and
    base margin.

@@ -668,8 +668,7 @@ def make_datasets_with_margin(

 # A strategy for drawing from a set of example datasets. May add random weights to the
 # dataset
-@memory.cache
-def make_dataset_strategy() -> Callable:
+def make_dataset_strategy() -> strategies.SearchStrategy[TestDataset]:
    _unweighted_datasets_strategy = strategies.sampled_from(
        [
            TestDataset(
@@ -815,6 +814,13 @@ def softprob_obj(
    return objective


+def ls_obj(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+    """Least squares objective, returns the gradient and hessian."""
+    grad = y_pred - y_true
+    hess = np.ones(len(y_true))
+    return grad, hess
+
+
 class DirectoryExcursion:
    """Change directory.  Change back and optionally cleaning up the directory when
    exit.
--- a/python-package/xgboost/testing/data.py
+++ b/python-package/xgboost/testing/data.py
@@ -3,7 +3,18 @@
 import os
 import zipfile
 from dataclasses import dataclass
-from typing import Any, Generator, List, NamedTuple, Optional, Tuple, Union
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    Generator,
+    NamedTuple,
+    Optional,
+    Tuple,
+    Type,
+    Union,
+)
 from urllib import request

 import numpy as np
@@ -15,6 +26,11 @@ from scipy import sparse
 import xgboost
 from xgboost.data import pandas_pyarrow_mapper

+if TYPE_CHECKING:
+    from ..compat import DataFrame as DataFrameT
+else:
+    DataFrameT = Any
+
 joblib = pytest.importorskip("joblib")
 memory = joblib.Memory("./cachedir", verbose=0)

@@ -246,46 +262,186 @@ def get_sparse() -> Tuple[np.ndarray, np.ndarray]:
    return X, y


+# pylint: disable=too-many-statements
@memory.cache
-def get_ames_housing() -> Tuple[np.ndarray, np.ndarray]:
-    """
+def get_ames_housing() -> Tuple[DataFrameT, np.ndarray]:
+    """Get a synthetic version of the Ames housing dataset.
+
+    The real one can be obtained via:
+
+    .. code-block::
+
+        from sklearn import datasets
+
+        datasets.fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
+
    Number of samples: 1460
    Number of features: 20
    Number of categorical features: 10
    Number of numerical features: 10
    """
-    datasets = pytest.importorskip("sklearn.datasets")
-    X, y = datasets.fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
+    pytest.importorskip("pandas")
+    import pandas as pd

-    categorical_columns_subset: List[str] = [
-        "BldgType",  # 5 cats, no nan
-        "GarageFinish",  # 3 cats, nan
-        "LotConfig",  # 5 cats, no nan
-        "Functional",  # 7 cats, no nan
-        "MasVnrType",  # 4 cats, nan
-        "HouseStyle",  # 8 cats, no nan
-        "FireplaceQu",  # 5 cats, nan
-        "ExterCond",  # 5 cats, no nan
-        "ExterQual",  # 4 cats, no nan
-        "PoolQC",  # 3 cats, nan
-    ]
+    rng = np.random.default_rng(1994)
+    n_samples = 1460
+    df = pd.DataFrame()

-    numerical_columns_subset: List[str] = [
-        "3SsnPorch",
-        "Fireplaces",
-        "BsmtHalfBath",
-        "HalfBath",
-        "GarageCars",
-        "TotRmsAbvGrd",
-        "BsmtFinSF1",
-        "BsmtFinSF2",
-        "GrLivArea",
-        "ScreenPorch",
-    ]
+    def synth_cat(
+        name_proba: Dict[Union[str, float], float], density: float
+    ) -> pd.Series:
+        n_nulls = int(n_samples * (1 - density))
+        has_nan = np.abs(1.0 - density) > 1e-6 and n_nulls > 0
+        if has_nan:
+            sparsity = 1.0 - density
+            name_proba[np.nan] = sparsity

-    X = X[categorical_columns_subset + numerical_columns_subset]
-    X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
-    return X, y
+        keys = list(name_proba.keys())
+        p = list(name_proba.values())
+        p[-1] += 1.0 - np.sum(p)  # Fix floating point error
+        x = rng.choice(keys, size=n_samples, p=p)
+
+        series = pd.Series(
+            x,
+            dtype=pd.CategoricalDtype(
+                # not NA
+                filter(lambda x: isinstance(x, str), keys)
+            ),
+        )
+        return series
+
+    df["BldgType"] = synth_cat(
+        {
+            "1Fam": 0.835616,
+            "2fmCon": 0.078082,
+            "Duplex": 0.035616,
+            "Twnhs": 0.029452,
+            "TwnhsE": 0.021233,
+        },
+        1.0,
+    )
+    df["GarageFinish"] = synth_cat(
+        {"Unf": 0.414384, "RFn": 0.289041, "Fin": 0.241096}, 0.94452
+    )
+    df["LotConfig"] = synth_cat(
+        {
+            "Corner": 0.180137,
+            "CulDSac": 0.064384,
+            "FR2": 0.032192,
+            "FR3": 0.002740,
+        },
+        1.0,
+    )
+    df["Functional"] = synth_cat(
+        {
+            "Typ": 0.931506,
+            "Min2": 0.023287,
+            "Min1": 0.021232,
+            "Mod": 0.010273,
+            "Maj1": 0.009589,
+            "Maj2": 0.003424,
+            "Sev": 0.000684,
+        },
+        1.0,
+    )
+    df["MasVnrType"] = synth_cat(
+        {
+            "None": 0.591780,
+            "BrkFace": 0.304794,
+            "Stone": 0.087671,
+            "BrkCmn": 0.010273,
+        },
+        0.99452,
+    )
+    df["HouseStyle"] = synth_cat(
+        {
+            "1Story": 0.497260,
+            "2Story": 0.304794,
+            "1.5Fin": 0.105479,
+            "SLvl": 0.044520,
+            "SFoyer": 0.025342,
+            "1.5Unf": 0.009589,
+            "2.5Unf": 0.007534,
+            "2.5Fin": 0.005479,
+        },
+        1.0,
+    )
+    df["FireplaceQu"] = synth_cat(
+        {
+            "Gd": 0.260273,
+            "TA": 0.214383,
+            "Fa": 0.022602,
+            "Ex": 0.016438,
+            "Po": 0.013698,
+        },
+        0.527397,
+    )
+    df["ExterCond"] = synth_cat(
+        {
+            "TA": 0.878082,
+            "Gd": 0.1,
+            "Fa": 0.019178,
+            "Ex": 0.002054,
+            "Po": 0.000684,
+        },
+        1.0,
+    )
+    df["ExterQual"] = synth_cat(
+        {
+            "TA": 0.620547,
+            "Gd": 0.334246,
+            "Ex": 0.035616,
+            "Fa": 0.009589,
+        },
+        1.0,
+    )
+    df["PoolQC"] = synth_cat(
+        {
+            "Gd": 0.002054,
+            "Ex": 0.001369,
+            "Fa": 0.001369,
+        },
+        0.004794,
+    )
+
+    # We focus on the categorical values here. For numerical features, a simple normal
+    # distribution is used, which doesn't match the original data.
+    def synth_num(loc: float, std: float, density: float) -> pd.Series:
+        x = rng.normal(loc=loc, scale=std, size=n_samples)
+        n_nulls = int(n_samples * (1 - density))
+        if np.abs(1.0 - density) > 1e-6 and n_nulls > 0:
+            null_idx = rng.choice(n_samples, size=n_nulls, replace=False)
+            x[null_idx] = np.nan
+        return pd.Series(x, dtype=np.float64)
+
+    df["3SsnPorch"] = synth_num(3.4095890410958902, 29.31733055678188, 1.0)
+    df["Fireplaces"] = synth_num(0.613013698630137, 0.6446663863122295, 1.0)
+    df["BsmtHalfBath"] = synth_num(0.057534246575342465, 0.23875264627921178, 1.0)
+    df["HalfBath"] = synth_num(0.38287671232876713, 0.5028853810928914, 1.0)
+    df["GarageCars"] = synth_num(1.7671232876712328, 0.7473150101111095, 1.0)
+    df["TotRmsAbvGrd"] = synth_num(6.517808219178082, 1.6253932905840505, 1.0)
+    df["BsmtFinSF1"] = synth_num(443.6397260273973, 456.0980908409277, 1.0)
+    df["BsmtFinSF2"] = synth_num(46.54931506849315, 161.31927280654173, 1.0)
+    df["GrLivArea"] = synth_num(1515.463698630137, 525.4803834232025, 1.0)
+    df["ScreenPorch"] = synth_num(15.060958904109588, 55.757415281874174, 1.0)
+
+    columns = list(df.columns)
+    rng.shuffle(columns)
+    df = df[columns]
+
+    # linear interaction for testing purposes.
+    y = np.zeros(shape=(n_samples,))
+    for c in df.columns:
+        if isinstance(df[c].dtype, pd.CategoricalDtype):
+            y += df[c].cat.codes.astype(np.float64)
+        else:
+            y += df[c].values
+
+    # Shift and scale to match the original y.
+    y *= 79442.50288288662 / y.std()
+    y += 180921.19589041095 - y.mean()
+
+    return df, y


@memory.cache
@@ -603,3 +759,51 @@ def sort_ltr_samples(
    data = X, clicks, y, qid

    return data
+
+
+def run_base_margin_info(
+    DType: Callable, DMatrixT: Type[xgboost.DMatrix], device: str
+) -> None:
+    """Run tests for base margin."""
+    rng = np.random.default_rng()
+    X = DType(rng.normal(0, 1.0, size=100).astype(np.float32).reshape(50, 2))
+    if hasattr(X, "iloc"):
+        y = X.iloc[:, 0]
+    else:
+        y = X[:, 0]
+    base_margin = X
+    # no error at set
+    Xy = DMatrixT(X, y, base_margin=base_margin)
+    # Error at train, caused by check in predictor.
+    with pytest.raises(ValueError, match=r".*base_margin.*"):
+        xgboost.train({"tree_method": "hist", "device": device}, Xy)
+
+    if not hasattr(X, "iloc"):
+        # column major matrix
+        got = DType(Xy.get_base_margin().reshape(50, 2))
+        assert (got == base_margin).all()
+
+        assert base_margin.T.flags.c_contiguous is False
+        assert base_margin.T.flags.f_contiguous is True
+        Xy.set_info(base_margin=base_margin.T)
+        got = DType(Xy.get_base_margin().reshape(2, 50))
+        assert (got == base_margin.T).all()
+
+        # Row vs col vec.
+        base_margin = y
+        Xy.set_base_margin(base_margin)
+        bm_col = Xy.get_base_margin()
+        Xy.set_base_margin(base_margin.reshape(1, base_margin.size))
+        bm_row = Xy.get_base_margin()
+        assert (bm_row == bm_col).all()
+
+        # type
+        base_margin = base_margin.astype(np.float64)
+        Xy.set_base_margin(base_margin)
+        bm_f64 = Xy.get_base_margin()
+        assert (bm_f64 == bm_col).all()
+
+        # too many dimensions
+        base_margin = X.reshape(2, 5, 2, 5)
+        with pytest.raises(ValueError, match=r".*base_margin.*"):
+            Xy.set_base_margin(base_margin)
--- a/python-package/xgboost/testing/updater.py
+++ b/python-package/xgboost/testing/updater.py
@@ -394,3 +394,14 @@ def train_result(
    assert booster.feature_types == dmat.feature_types

    return result
+
+
+class ResetStrategy(xgb.callback.TrainingCallback):
+    """Callback for testing multi-output."""
+
+    def after_iteration(self, model: xgb.Booster, epoch: int, evals_log: dict) -> bool:
+        if epoch % 2 == 0:
+            model.set_param({"multi_strategy": "multi_output_tree"})
+        else:
+            model.set_param({"multi_strategy": "one_output_per_tree"})
+        return False