merge latest, Jan 12 2024
This commit is contained in:
@@ -62,11 +62,31 @@ class TrainingCallback(ABC):
|
||||
return model
|
||||
|
||||
def before_iteration(self, model: _Model, epoch: int, evals_log: EvalsLog) -> bool:
|
||||
"""Run before each iteration. Return True when training should stop."""
|
||||
"""Run before each iteration. Returns True when training should stop. See
|
||||
:py:meth:`after_iteration` for details.
|
||||
|
||||
"""
|
||||
return False
|
||||
|
||||
def after_iteration(self, model: _Model, epoch: int, evals_log: EvalsLog) -> bool:
|
||||
"""Run after each iteration. Return True when training should stop."""
|
||||
"""Run after each iteration. Returns `True` when training should stop.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
model :
|
||||
Either a :py:class:`~xgboost.Booster` object or a CVPack if the cv function
|
||||
in xgboost is being used.
|
||||
epoch :
|
||||
The current training iteration.
|
||||
evals_log :
|
||||
A dictionary containing the evaluation history:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
{"data_name": {"metric_name": [0.5, ...]}}
|
||||
|
||||
"""
|
||||
return False
|
||||
|
||||
|
||||
@@ -547,14 +567,16 @@ class TrainingCheckPoint(TrainingCallback):
|
||||
|
||||
.. versionadded:: 1.3.0
|
||||
|
||||
Since XGBoost 2.1.0, the default format is changed to UBJSON.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
directory :
|
||||
Output model directory.
|
||||
name :
|
||||
pattern of output model file. Models will be saved as name_0.json, name_1.json,
|
||||
name_2.json ....
|
||||
pattern of output model file. Models will be saved as name_0.ubj, name_1.ubj,
|
||||
name_2.ubj ....
|
||||
as_pickle :
|
||||
When set to True, all training parameters will be saved in pickle format,
|
||||
instead of saving only the model.
|
||||
@@ -564,6 +586,8 @@ class TrainingCheckPoint(TrainingCallback):
|
||||
|
||||
"""
|
||||
|
||||
default_format = "ubj"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
directory: Union[str, os.PathLike],
|
||||
@@ -592,7 +616,7 @@ class TrainingCheckPoint(TrainingCallback):
|
||||
self._name
|
||||
+ "_"
|
||||
+ (str(epoch + self._start))
|
||||
+ (".pkl" if self._as_pickle else ".json"),
|
||||
+ (".pkl" if self._as_pickle else f".{self.default_format}"),
|
||||
)
|
||||
self._epoch = 0 # reset counter
|
||||
if collective.get_rank() == 0:
|
||||
|
||||
@@ -100,6 +100,16 @@ def is_cupy_available() -> bool:
|
||||
return False
|
||||
|
||||
|
||||
def import_cupy() -> types.ModuleType:
    """Return the ``cupy`` module, importing it on demand.

    Raises
    ------
    ImportError
        If :py:func:`is_cupy_available` reports that cupy cannot be used.

    """
    if is_cupy_available():
        import cupy  # pylint: disable=import-error

        return cupy

    raise ImportError("`cupy` is required for handling CUDA buffer.")
|
||||
|
||||
|
||||
try:
|
||||
import scipy.sparse as scipy_sparse
|
||||
from scipy.sparse import csr_matrix as scipy_csr
|
||||
@@ -128,9 +138,9 @@ def concat(value: Sequence[_T]) -> _T: # pylint: disable=too-many-return-statem
|
||||
from cudf import concat as CUDF_concat # pylint: disable=import-error
|
||||
|
||||
return CUDF_concat(value, axis=0)
|
||||
from .data import _is_cupy_array
|
||||
from .data import _is_cupy_alike
|
||||
|
||||
if _is_cupy_array(value[0]):
|
||||
if _is_cupy_alike(value[0]):
|
||||
import cupy # pylint: disable=import-error
|
||||
|
||||
# pylint: disable=c-extension-no-member,no-member
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
"""Core XGBoost Library."""
|
||||
import copy
|
||||
import ctypes
|
||||
import importlib.util
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
@@ -45,7 +44,6 @@ from ._typing import (
|
||||
CStrPptr,
|
||||
CStrPtr,
|
||||
CTypeT,
|
||||
CupyT,
|
||||
DataType,
|
||||
FeatureInfo,
|
||||
FeatureNames,
|
||||
@@ -55,7 +53,7 @@ from ._typing import (
|
||||
TransformedData,
|
||||
c_bst_ulong,
|
||||
)
|
||||
from .compat import PANDAS_INSTALLED, DataFrame, py_str
|
||||
from .compat import PANDAS_INSTALLED, DataFrame, import_cupy, py_str
|
||||
from .libpath import find_lib_path
|
||||
|
||||
|
||||
@@ -213,6 +211,7 @@ def _load_lib() -> ctypes.CDLL:
|
||||
lib = ctypes.cdll.LoadLibrary(lib_path)
|
||||
setattr(lib, "path", os.path.normpath(lib_path))
|
||||
lib_success = True
|
||||
break
|
||||
except OSError as e:
|
||||
os_error_list.append(str(e))
|
||||
continue
|
||||
@@ -358,10 +357,13 @@ def _numpy2ctypes_type(dtype: Type[np.number]) -> Type[CNumeric]:
|
||||
return _NUMPY_TO_CTYPES_MAPPING[dtype]
|
||||
|
||||
|
||||
def _array_hasobject(data: DataType) -> bool:
|
||||
return hasattr(data.dtype, "hasobject") and data.dtype.hasobject
|
||||
|
||||
|
||||
def _cuda_array_interface(data: DataType) -> bytes:
|
||||
assert (
|
||||
data.dtype.hasobject is False
|
||||
), "Input data contains `object` dtype. Expecting numeric data."
|
||||
if _array_hasobject(data):
|
||||
raise ValueError("Input data contains `object` dtype. Expecting numeric data.")
|
||||
interface = data.__cuda_array_interface__
|
||||
if "mask" in interface:
|
||||
interface["mask"] = interface["mask"].__cuda_array_interface__
|
||||
@@ -380,34 +382,6 @@ def ctypes2numpy(cptr: CNumericPtr, length: int, dtype: Type[np.number]) -> np.n
|
||||
return res
|
||||
|
||||
|
||||
def ctypes2cupy(cptr: CNumericPtr, length: int, dtype: Type[np.number]) -> CupyT:
|
||||
"""Convert a ctypes pointer array to a cupy array."""
|
||||
# pylint: disable=import-error
|
||||
import cupy
|
||||
from cupy.cuda.memory import MemoryPointer, UnownedMemory
|
||||
|
||||
CUPY_TO_CTYPES_MAPPING: Dict[Type[np.number], Type[CNumeric]] = {
|
||||
cupy.float32: ctypes.c_float,
|
||||
cupy.uint32: ctypes.c_uint,
|
||||
}
|
||||
if dtype not in CUPY_TO_CTYPES_MAPPING:
|
||||
raise RuntimeError(f"Supported types: {CUPY_TO_CTYPES_MAPPING.keys()}")
|
||||
addr = ctypes.cast(cptr, ctypes.c_void_p).value
|
||||
# pylint: disable=c-extension-no-member,no-member
|
||||
device = cupy.cuda.runtime.pointerGetAttributes(addr).device
|
||||
# The owner field is just used to keep the memory alive with ref count. As
|
||||
# unowned's life time is scoped within this function we don't need that.
|
||||
unownd = UnownedMemory(
|
||||
addr, length * ctypes.sizeof(CUPY_TO_CTYPES_MAPPING[dtype]), owner=None
|
||||
)
|
||||
memptr = MemoryPointer(unownd, 0)
|
||||
# pylint: disable=unexpected-keyword-arg
|
||||
mem = cupy.ndarray((length,), dtype=dtype, memptr=memptr)
|
||||
assert mem.device.id == device
|
||||
arr = cupy.array(mem, copy=True)
|
||||
return arr
|
||||
|
||||
|
||||
def ctypes2buffer(cptr: CStrPtr, length: int) -> bytearray:
|
||||
"""Convert ctypes pointer to buffer type."""
|
||||
if not isinstance(cptr, ctypes.POINTER(ctypes.c_char)):
|
||||
@@ -466,14 +440,8 @@ def from_array_interface(interface: dict) -> NumpyOrCupy:
|
||||
|
||||
if "stream" in interface:
|
||||
# CUDA stream is presented, this is a __cuda_array_interface__.
|
||||
spec = importlib.util.find_spec("cupy")
|
||||
if spec is None:
|
||||
raise ImportError("`cupy` is required for handling CUDA buffer.")
|
||||
|
||||
import cupy as cp # pylint: disable=import-error
|
||||
|
||||
arr.__cuda_array_interface__ = interface
|
||||
out = cp.array(arr, copy=True)
|
||||
out = import_cupy().array(arr, copy=True)
|
||||
else:
|
||||
arr.__array_interface__ = interface
|
||||
out = np.array(arr, copy=True)
|
||||
@@ -481,17 +449,42 @@ def from_array_interface(interface: dict) -> NumpyOrCupy:
|
||||
return out
|
||||
|
||||
|
||||
def make_array_interface(
    ptr: CNumericPtr, shape: Tuple[int, ...], dtype: Type[np.number], is_cuda: bool
) -> Dict[str, Union[int, tuple, None]]:
    """Build an ``__array_interface__`` / ``__cuda_array_interface__`` dictionary
    describing the memory behind a ctypes pointer.

    Parameters
    ----------
    ptr :
        Pointer to the first element of the buffer.
    shape :
        Shape to report for the buffer.
    dtype :
        Element type of the buffer.
    is_cuda :
        When True the CUDA flavour of the interface is produced (requires cupy).

    Returns
    -------
    The interface dictionary. When `ptr` is null, the interface of a zero-length
    array of `dtype` is returned unchanged.

    """
    # A zero-length array of the requested dtype supplies `typestr` and `descr`.
    if is_cuda:
        template = import_cupy().empty(shape=(0,), dtype=dtype)
        interface = template.__cuda_array_interface__  # pylint: disable=no-member
    else:
        template = np.empty(shape=(0,), dtype=dtype)
        interface = template.__array_interface__  # pylint: disable=no-member

    address = ctypes.cast(ptr, ctypes.c_void_p).value
    n_elements = int(np.prod(shape))
    # A null pointer is only acceptable for an empty dataset.
    assert address is not None or n_elements == 0

    if address is not None:
        # The second tuple element is the read-only flag of the array interface.
        interface["data"] = (address, True)
        if is_cuda:
            # NOTE(review): 2 is presumably the per-thread default stream of the
            # CUDA array interface -- confirm against the specification.
            interface["stream"] = 2
        interface["shape"] = shape
        interface["strides"] = None

    return interface
|
||||
|
||||
|
||||
def _prediction_output(
|
||||
shape: CNumericPtr, dims: c_bst_ulong, predts: CFloatPtr, is_cuda: bool
|
||||
) -> NumpyOrCupy:
|
||||
arr_shape = ctypes2numpy(shape, dims.value, np.uint64)
|
||||
length = int(np.prod(arr_shape))
|
||||
if is_cuda:
|
||||
arr_predict = ctypes2cupy(predts, length, np.float32)
|
||||
else:
|
||||
arr_predict = ctypes2numpy(predts, length, np.float32)
|
||||
arr_predict = arr_predict.reshape(arr_shape)
|
||||
return arr_predict
|
||||
arr_shape = tuple(ctypes2numpy(shape, dims.value, np.uint64).flatten())
|
||||
array = from_array_interface(
|
||||
make_array_interface(predts, arr_shape, np.float32, is_cuda)
|
||||
)
|
||||
return array
|
||||
|
||||
|
||||
class DataIter(ABC): # pylint: disable=too-many-instance-attributes
|
||||
@@ -795,7 +788,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
|
||||
so it doesn't make sense to assign weights to individual data points.
|
||||
|
||||
base_margin :
|
||||
Base margin used for boosting from existing model.
|
||||
Global bias for each instance. See :doc:`/tutorials/intercept` for details.
|
||||
missing :
|
||||
Value in the input data which needs to be present as a missing value. If
|
||||
None, defaults to np.nan.
|
||||
@@ -832,9 +825,19 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
|
||||
|
||||
.. note:: This parameter is experimental
|
||||
|
||||
Experimental support of specializing for categorical features. Do not set
|
||||
to True unless you are interested in development. Also, JSON/UBJSON
|
||||
serialization format is required.
|
||||
Experimental support of specializing for categorical features.
|
||||
|
||||
If passing 'True' and 'data' is a data frame (from supported libraries such
|
||||
as Pandas, Modin or cuDF), columns of categorical types will automatically
|
||||
be set to be of categorical type (feature_type='c') in the resulting
|
||||
DMatrix.
|
||||
|
||||
If passing 'False' and 'data' is a data frame with categorical columns,
|
||||
it will result in an error being thrown.
|
||||
|
||||
If 'data' is not a data frame, this argument is ignored.
|
||||
|
||||
JSON/UBJSON serialization format is required for this.
|
||||
|
||||
"""
|
||||
if group is not None and qid is not None:
|
||||
@@ -1441,6 +1444,12 @@ class _ProxyDMatrix(DMatrix):
|
||||
_LIB.XGProxyDMatrixSetDataDense(self.handle, _array_interface(data))
|
||||
)
|
||||
|
||||
def _set_data_from_pandas(self, data: DataType) -> None:
    """Set data from a pandas DataFrame. The input is a PandasTransformed instance."""
    # JSON-encoded array interfaces, one per column, as produced by `data`.
    columns_interface = data.array_interface()
    _check_call(_LIB.XGProxyDMatrixSetDataColumnar(self.handle, columns_interface))
|
||||
|
||||
def _set_data_from_csr(self, csr: scipy.sparse.csr_matrix) -> None:
|
||||
"""Set data from scipy csr"""
|
||||
from .data import _array_interface
|
||||
@@ -2096,7 +2105,7 @@ class Booster:
|
||||
_array_interface,
|
||||
_cuda_array_interface,
|
||||
_ensure_np_dtype,
|
||||
_is_cupy_array,
|
||||
_is_cupy_alike,
|
||||
)
|
||||
|
||||
self._assign_dmatrix_features(dtrain)
|
||||
@@ -2110,7 +2119,7 @@ class Booster:
|
||||
"Expecting `np.ndarray` or `cupy.ndarray` for gradient and hessian."
|
||||
f" Got: {type(array)}"
|
||||
)
|
||||
if not isinstance(array, np.ndarray) and not _is_cupy_array(array):
|
||||
if not isinstance(array, np.ndarray) and not _is_cupy_alike(array):
|
||||
raise TypeError(msg)
|
||||
|
||||
n_samples = dtrain.num_row()
|
||||
@@ -2125,7 +2134,7 @@ class Booster:
|
||||
if isinstance(array, np.ndarray):
|
||||
array, _ = _ensure_np_dtype(array, array.dtype)
|
||||
interface = _array_interface(array)
|
||||
elif _is_cupy_array(array):
|
||||
elif _is_cupy_alike(array):
|
||||
interface = _cuda_array_interface(array)
|
||||
else:
|
||||
raise TypeError(msg)
|
||||
@@ -2450,11 +2459,12 @@ class Booster:
|
||||
assert proxy is None or isinstance(proxy, _ProxyDMatrix)
|
||||
|
||||
from .data import (
|
||||
PandasTransformed,
|
||||
_array_interface,
|
||||
_arrow_transform,
|
||||
_is_arrow,
|
||||
_is_cudf_df,
|
||||
_is_cupy_array,
|
||||
_is_cupy_alike,
|
||||
_is_list,
|
||||
_is_np_array_like,
|
||||
_is_pandas_df,
|
||||
@@ -2504,6 +2514,19 @@ class Booster:
|
||||
)
|
||||
)
|
||||
return _prediction_output(shape, dims, preds, False)
|
||||
if isinstance(data, PandasTransformed):
|
||||
_check_call(
|
||||
_LIB.XGBoosterPredictFromColumnar(
|
||||
self.handle,
|
||||
data.array_interface(),
|
||||
args,
|
||||
p_handle,
|
||||
ctypes.byref(shape),
|
||||
ctypes.byref(dims),
|
||||
ctypes.byref(preds),
|
||||
)
|
||||
)
|
||||
return _prediction_output(shape, dims, preds, False)
|
||||
if isinstance(data, scipy.sparse.csr_matrix):
|
||||
from .data import transform_scipy_sparse
|
||||
|
||||
@@ -2523,7 +2546,7 @@ class Booster:
|
||||
)
|
||||
)
|
||||
return _prediction_output(shape, dims, preds, False)
|
||||
if _is_cupy_array(data):
|
||||
if _is_cupy_alike(data):
|
||||
from .data import _transform_cupy_array
|
||||
|
||||
data = _transform_cupy_array(data)
|
||||
@@ -2571,9 +2594,8 @@ class Booster:
|
||||
|
||||
The model is saved in an XGBoost internal format which is universal among the
|
||||
various XGBoost interfaces. Auxiliary attributes of the Python Booster object
|
||||
(such as feature_names) will not be saved when using binary format. To save
|
||||
those attributes, use JSON/UBJ instead. See :doc:`Model IO
|
||||
</tutorials/saving_model>` for more info.
|
||||
(such as feature_names) are only saved when using JSON or UBJSON (default)
|
||||
format. See :doc:`Model IO </tutorials/saving_model>` for more info.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
@@ -2593,15 +2615,18 @@ class Booster:
|
||||
else:
|
||||
raise TypeError("fname must be a string or os PathLike")
|
||||
|
||||
def save_raw(self, raw_format: str = "deprecated") -> bytearray:
|
||||
def save_raw(self, raw_format: str = "ubj") -> bytearray:
|
||||
"""Save the model to a in memory buffer representation instead of file.
|
||||
|
||||
The model is saved in an XGBoost internal format which is universal among the
|
||||
various XGBoost interfaces. Auxiliary attributes of the Python Booster object
|
||||
(such as feature_names) are only saved when using JSON or UBJSON (default)
|
||||
format. See :doc:`Model IO </tutorials/saving_model>` for more info.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
raw_format :
|
||||
Format of output buffer. Can be `json`, `ubj` or `deprecated`. Right now
|
||||
the default is `deprecated` but it will be changed to `ubj` (univeral binary
|
||||
json) in the future.
|
||||
Format of output buffer. Can be `json`, `ubj` or `deprecated`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
@@ -2620,11 +2645,10 @@ class Booster:
|
||||
def load_model(self, fname: ModelIn) -> None:
|
||||
"""Load the model from a file or a bytearray.
|
||||
|
||||
The model is loaded from XGBoost format which is universal among the various
|
||||
XGBoost interfaces. Auxiliary attributes of the Python Booster object (such as
|
||||
feature_names) will not be loaded when using binary format. To save those
|
||||
attributes, use JSON/UBJ instead. See :doc:`Model IO </tutorials/saving_model>`
|
||||
for more info.
|
||||
The model is saved in an XGBoost internal format which is universal among the
|
||||
various XGBoost interfaces. Auxiliary attributes of the Python Booster object
|
||||
(such as feature_names) are only saved when using JSON or UBJSON (default)
|
||||
format. See :doc:`Model IO </tutorials/saving_model>` for more info.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
@@ -2749,9 +2773,9 @@ class Booster:
|
||||
with_stats: bool = False,
|
||||
dump_format: str = "text",
|
||||
) -> List[str]:
|
||||
"""Returns the model dump as a list of strings. Unlike :py:meth:`save_model`, the output
|
||||
format is primarily used for visualization or interpretation, hence it's more
|
||||
human readable but cannot be loaded back to XGBoost.
|
||||
"""Returns the model dump as a list of strings. Unlike :py:meth:`save_model`,
|
||||
the output format is primarily used for visualization or interpretation, hence
|
||||
it's more human readable but cannot be loaded back to XGBoost.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
@@ -75,7 +75,7 @@ from xgboost.core import (
|
||||
_deprecate_positional_args,
|
||||
_expect,
|
||||
)
|
||||
from xgboost.data import _is_cudf_ser, _is_cupy_array
|
||||
from xgboost.data import _is_cudf_ser, _is_cupy_alike
|
||||
from xgboost.sklearn import (
|
||||
XGBClassifier,
|
||||
XGBClassifierBase,
|
||||
@@ -1909,7 +1909,7 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
|
||||
self.classes_ = await self.client.compute(y.drop_duplicates())
|
||||
if _is_cudf_ser(self.classes_):
|
||||
self.classes_ = self.classes_.to_cupy()
|
||||
if _is_cupy_array(self.classes_):
|
||||
if _is_cupy_alike(self.classes_):
|
||||
self.classes_ = self.classes_.get()
|
||||
self.classes_ = numpy.array(self.classes_)
|
||||
self.n_classes_ = len(self.classes_)
|
||||
|
||||
@@ -26,6 +26,7 @@ from .core import (
|
||||
DataIter,
|
||||
DataSplitMode,
|
||||
DMatrix,
|
||||
_array_hasobject,
|
||||
_check_call,
|
||||
_cuda_array_interface,
|
||||
_ProxyDMatrix,
|
||||
@@ -57,21 +58,36 @@ def _check_data_shape(data: DataType) -> None:
|
||||
raise ValueError("Please reshape the input data into 2-dimensional matrix.")
|
||||
|
||||
|
||||
def _is_scipy_csr(data: DataType) -> bool:
|
||||
def is_scipy_csr(data: DataType) -> bool:
|
||||
"""Predicate for scipy CSR input."""
|
||||
is_array = False
|
||||
is_matrix = False
|
||||
try:
|
||||
import scipy.sparse
|
||||
from scipy.sparse import csr_array
|
||||
|
||||
is_array = isinstance(data, csr_array)
|
||||
except ImportError:
|
||||
return False
|
||||
return isinstance(data, scipy.sparse.csr_matrix)
|
||||
pass
|
||||
try:
|
||||
from scipy.sparse import csr_matrix
|
||||
|
||||
is_matrix = isinstance(data, csr_matrix)
|
||||
except ImportError:
|
||||
pass
|
||||
return is_array or is_matrix
|
||||
|
||||
|
||||
def _array_interface(data: np.ndarray) -> bytes:
|
||||
assert (
|
||||
data.dtype.hasobject is False
|
||||
), "Input data contains `object` dtype. Expecting numeric data."
|
||||
def _array_interface_dict(data: np.ndarray) -> dict:
|
||||
if _array_hasobject(data):
|
||||
raise ValueError("Input data contains `object` dtype. Expecting numeric data.")
|
||||
interface = data.__array_interface__
|
||||
if "mask" in interface:
|
||||
interface["mask"] = interface["mask"].__array_interface__
|
||||
return interface
|
||||
|
||||
|
||||
def _array_interface(data: np.ndarray) -> bytes:
|
||||
interface = _array_interface_dict(data)
|
||||
interface_str = bytes(json.dumps(interface), "utf-8")
|
||||
return interface_str
|
||||
|
||||
@@ -130,12 +146,23 @@ def _from_scipy_csr(
|
||||
return handle, feature_names, feature_types
|
||||
|
||||
|
||||
def _is_scipy_csc(data: DataType) -> bool:
|
||||
def is_scipy_csc(data: DataType) -> bool:
|
||||
"""Predicate for scipy CSC input."""
|
||||
is_array = False
|
||||
is_matrix = False
|
||||
try:
|
||||
import scipy.sparse
|
||||
from scipy.sparse import csc_array
|
||||
|
||||
is_array = isinstance(data, csc_array)
|
||||
except ImportError:
|
||||
return False
|
||||
return isinstance(data, scipy.sparse.csc_matrix)
|
||||
pass
|
||||
try:
|
||||
from scipy.sparse import csc_matrix
|
||||
|
||||
is_matrix = isinstance(data, csc_matrix)
|
||||
except ImportError:
|
||||
pass
|
||||
return is_array or is_matrix
|
||||
|
||||
|
||||
def _from_scipy_csc(
|
||||
@@ -166,12 +193,23 @@ def _from_scipy_csc(
|
||||
return handle, feature_names, feature_types
|
||||
|
||||
|
||||
def _is_scipy_coo(data: DataType) -> bool:
|
||||
def is_scipy_coo(data: DataType) -> bool:
|
||||
"""Predicate for scipy COO input."""
|
||||
is_array = False
|
||||
is_matrix = False
|
||||
try:
|
||||
import scipy.sparse
|
||||
from scipy.sparse import coo_array
|
||||
|
||||
is_array = isinstance(data, coo_array)
|
||||
except ImportError:
|
||||
return False
|
||||
return isinstance(data, scipy.sparse.coo_matrix)
|
||||
pass
|
||||
try:
|
||||
from scipy.sparse import coo_matrix
|
||||
|
||||
is_matrix = isinstance(data, coo_matrix)
|
||||
except ImportError:
|
||||
pass
|
||||
return is_array or is_matrix
|
||||
|
||||
|
||||
def _is_np_array_like(data: DataType) -> bool:
|
||||
@@ -181,7 +219,7 @@ def _is_np_array_like(data: DataType) -> bool:
|
||||
def _ensure_np_dtype(
|
||||
data: DataType, dtype: Optional[NumpyDType]
|
||||
) -> Tuple[np.ndarray, Optional[NumpyDType]]:
|
||||
if data.dtype.hasobject or data.dtype in [np.float16, np.bool_]:
|
||||
if _array_hasobject(data) or data.dtype in [np.float16, np.bool_]:
|
||||
dtype = np.float32
|
||||
data = data.astype(dtype, copy=False)
|
||||
if not data.flags.aligned:
|
||||
@@ -265,24 +303,24 @@ pandas_nullable_mapper = {
|
||||
"Int16": "int",
|
||||
"Int32": "int",
|
||||
"Int64": "int",
|
||||
"UInt8": "i",
|
||||
"UInt16": "i",
|
||||
"UInt32": "i",
|
||||
"UInt64": "i",
|
||||
"UInt8": "int",
|
||||
"UInt16": "int",
|
||||
"UInt32": "int",
|
||||
"UInt64": "int",
|
||||
"Float32": "float",
|
||||
"Float64": "float",
|
||||
"boolean": "i",
|
||||
}
|
||||
|
||||
pandas_pyarrow_mapper = {
|
||||
"int8[pyarrow]": "i",
|
||||
"int16[pyarrow]": "i",
|
||||
"int32[pyarrow]": "i",
|
||||
"int64[pyarrow]": "i",
|
||||
"uint8[pyarrow]": "i",
|
||||
"uint16[pyarrow]": "i",
|
||||
"uint32[pyarrow]": "i",
|
||||
"uint64[pyarrow]": "i",
|
||||
"int8[pyarrow]": "int",
|
||||
"int16[pyarrow]": "int",
|
||||
"int32[pyarrow]": "int",
|
||||
"int64[pyarrow]": "int",
|
||||
"uint8[pyarrow]": "int",
|
||||
"uint16[pyarrow]": "int",
|
||||
"uint32[pyarrow]": "int",
|
||||
"uint64[pyarrow]": "int",
|
||||
"float[pyarrow]": "float",
|
||||
"float32[pyarrow]": "float",
|
||||
"double[pyarrow]": "float",
|
||||
@@ -295,7 +333,7 @@ _pandas_dtype_mapper.update(pandas_pyarrow_mapper)
|
||||
|
||||
|
||||
_ENABLE_CAT_ERR = (
|
||||
"When categorical type is supplied, The experimental DMatrix parameter"
|
||||
"When categorical type is supplied, the experimental DMatrix parameter"
|
||||
"`enable_categorical` must be set to `True`."
|
||||
)
|
||||
|
||||
@@ -407,89 +445,122 @@ def is_pd_sparse_dtype(dtype: PandasDType) -> bool:
|
||||
return is_sparse(dtype)
|
||||
|
||||
|
||||
def pandas_cat_null(data: DataFrame) -> DataFrame:
|
||||
"""Handle categorical dtype and nullable extension types from pandas."""
|
||||
import pandas as pd
|
||||
|
||||
# handle category codes and nullable.
|
||||
cat_columns = []
|
||||
nul_columns = []
|
||||
# avoid an unnecessary conversion if possible
|
||||
for col, dtype in zip(data.columns, data.dtypes):
|
||||
if is_pd_cat_dtype(dtype):
|
||||
cat_columns.append(col)
|
||||
elif is_pa_ext_categorical_dtype(dtype):
|
||||
raise ValueError(
|
||||
"pyarrow dictionary type is not supported. Use pandas category instead."
|
||||
)
|
||||
elif is_nullable_dtype(dtype):
|
||||
nul_columns.append(col)
|
||||
|
||||
if cat_columns or nul_columns:
|
||||
# Avoid transformation due to: PerformanceWarning: DataFrame is highly
|
||||
# fragmented
|
||||
transformed = data.copy(deep=False)
|
||||
else:
|
||||
transformed = data
|
||||
|
||||
def cat_codes(ser: pd.Series) -> pd.Series:
|
||||
if is_pd_cat_dtype(ser.dtype):
|
||||
return ser.cat.codes
|
||||
assert is_pa_ext_categorical_dtype(ser.dtype)
|
||||
# Not yet supported, the index is not ordered for some reason. Alternately:
|
||||
# `combine_chunks().to_pandas().cat.codes`. The result is the same.
|
||||
return ser.array.__arrow_array__().combine_chunks().dictionary_encode().indices
|
||||
|
||||
if cat_columns:
|
||||
# DF doesn't have the cat attribute, as a result, we use apply here
|
||||
transformed[cat_columns] = (
|
||||
transformed[cat_columns]
|
||||
.apply(cat_codes)
|
||||
.astype(np.float32)
|
||||
.replace(-1.0, np.NaN)
|
||||
)
|
||||
if nul_columns:
|
||||
transformed[nul_columns] = transformed[nul_columns].astype(np.float32)
|
||||
|
||||
# TODO(jiamingy): Investigate the possibility of using dataframe protocol or arrow
|
||||
# IPC format for pandas so that we can apply the data transformation inside XGBoost
|
||||
# for better memory efficiency.
|
||||
|
||||
return transformed
|
||||
|
||||
|
||||
def pandas_ext_num_types(data: DataFrame) -> DataFrame:
|
||||
"""Experimental suppport for handling pyarrow extension numeric types."""
|
||||
def pandas_pa_type(ser: Any) -> np.ndarray:
    """Handle pandas pyarrow extension: convert an arrow-backed series to numpy."""
    import pandas as pd
    import pyarrow as pa

    # Accessing `.array` does not copy, callstack:
    # pandas.core.internals.managers.SingleBlockManager.array_values()
    # pandas.core.internals.blocks.EABackedBlock.values
    ext_array: pd.arrays.ArrowExtensionArray = ser.array
    # `__arrow_array__` does not copy either; ArrowExtensionArray._data is a
    # chunked array.
    chunked: pa.ChunkedArray = ext_array.__arrow_array__()
    # Merging the chunks is where most of the time is spent.
    combined: pa.Array = chunked.combine_chunks()
    # A copy is unavoidable once null values are present.
    has_null = combined.null_count != 0
    # Alternately, chunk.buffers() returns a list of buffers that we would need
    # to concatenate ourselves.
    # FIXME(jiamingy): Is there a better way to access the arrow buffer along with
    # its mask? Buffers from chunk.buffers() have the address attribute, but
    # don't expose the mask.
    values: np.ndarray = combined.to_numpy(
        zero_copy_only=not has_null, writable=False
    )
    values, _ = _ensure_np_dtype(values, values.dtype)
    return values
|
||||
|
||||
|
||||
def pandas_transform_data(data: DataFrame) -> List[np.ndarray]:
|
||||
"""Handle categorical dtype and extension types from pandas."""
|
||||
import pandas as pd
|
||||
from pandas import Float32Dtype, Float64Dtype
|
||||
|
||||
result: List[np.ndarray] = []
|
||||
|
||||
def cat_codes(ser: pd.Series) -> np.ndarray:
|
||||
if is_pd_cat_dtype(ser.dtype):
|
||||
return _ensure_np_dtype(
|
||||
ser.cat.codes.astype(np.float32)
|
||||
.replace(-1.0, np.NaN)
|
||||
.to_numpy(na_value=np.nan),
|
||||
np.float32,
|
||||
)[0]
|
||||
# Not yet supported, the index is not ordered for some reason. Alternately:
|
||||
# `combine_chunks().to_pandas().cat.codes`. The result is the same.
|
||||
assert is_pa_ext_categorical_dtype(ser.dtype)
|
||||
return (
|
||||
ser.array.__arrow_array__()
|
||||
.combine_chunks()
|
||||
.dictionary_encode()
|
||||
.indices.astype(np.float32)
|
||||
.replace(-1.0, np.NaN)
|
||||
)
|
||||
|
||||
def nu_type(ser: pd.Series) -> np.ndarray:
|
||||
# Avoid conversion when possible
|
||||
if isinstance(dtype, Float32Dtype):
|
||||
res_dtype: NumpyDType = np.float32
|
||||
elif isinstance(dtype, Float64Dtype):
|
||||
res_dtype = np.float64
|
||||
else:
|
||||
res_dtype = np.float32
|
||||
return _ensure_np_dtype(
|
||||
ser.to_numpy(dtype=res_dtype, na_value=np.nan), res_dtype
|
||||
)[0]
|
||||
|
||||
def oth_type(ser: pd.Series) -> np.ndarray:
|
||||
# The dtypes module is added in 1.25.
|
||||
npdtypes = np.lib.NumpyVersion(np.__version__) > np.lib.NumpyVersion("1.25.0")
|
||||
npdtypes = npdtypes and isinstance(
|
||||
ser.dtype,
|
||||
(
|
||||
# pylint: disable=no-member
|
||||
np.dtypes.Float32DType, # type: ignore
|
||||
# pylint: disable=no-member
|
||||
np.dtypes.Float64DType, # type: ignore
|
||||
),
|
||||
)
|
||||
|
||||
if npdtypes or dtype in {np.float32, np.float64}:
|
||||
array = ser.to_numpy()
|
||||
else:
|
||||
# Specifying the dtype can significantly slow down the conversion (about
|
||||
# 15% slow down for dense inplace-predict)
|
||||
array = ser.to_numpy(dtype=np.float32, na_value=np.nan)
|
||||
return _ensure_np_dtype(array, array.dtype)[0]
|
||||
|
||||
for col, dtype in zip(data.columns, data.dtypes):
|
||||
if not is_pa_ext_dtype(dtype):
|
||||
continue
|
||||
# No copy, callstack:
|
||||
# pandas.core.internals.managers.SingleBlockManager.array_values()
|
||||
# pandas.core.internals.blocks.EABackedBlock.values
|
||||
d_array: pd.arrays.ArrowExtensionArray = data[col].array
|
||||
# no copy in __arrow_array__
|
||||
# ArrowExtensionArray._data is a chunked array
|
||||
aa: pa.ChunkedArray = d_array.__arrow_array__()
|
||||
chunk: pa.Array = aa.combine_chunks()
|
||||
# Alternately, we can use chunk.buffers(), which returns a list of buffers and
|
||||
# we need to concatenate them ourselves.
|
||||
arr = chunk.__array__()
|
||||
data[col] = arr
|
||||
return data
|
||||
if is_pa_ext_categorical_dtype(dtype):
|
||||
raise ValueError(
|
||||
"pyarrow dictionary type is not supported. Use pandas category instead."
|
||||
)
|
||||
if is_pd_cat_dtype(dtype):
|
||||
result.append(cat_codes(data[col]))
|
||||
elif is_pa_ext_dtype(dtype):
|
||||
result.append(pandas_pa_type(data[col]))
|
||||
elif is_nullable_dtype(dtype):
|
||||
result.append(nu_type(data[col]))
|
||||
elif is_pd_sparse_dtype(dtype):
|
||||
arr = cast(pd.arrays.SparseArray, data[col].values)
|
||||
arr = arr.to_dense()
|
||||
if _is_np_array_like(arr):
|
||||
arr, _ = _ensure_np_dtype(arr, arr.dtype)
|
||||
result.append(arr)
|
||||
else:
|
||||
result.append(oth_type(data[col]))
|
||||
|
||||
# FIXME(jiamingy): Investigate the possibility of using dataframe protocol or arrow
|
||||
# IPC format for pandas so that we can apply the data transformation inside XGBoost
|
||||
# for better memory efficiency.
|
||||
return result
|
||||
|
||||
|
||||
def _transform_pandas_df(
|
||||
data: DataFrame,
|
||||
enable_categorical: bool,
|
||||
feature_names: Optional[FeatureNames] = None,
|
||||
feature_types: Optional[FeatureTypes] = None,
|
||||
meta: Optional[str] = None,
|
||||
meta_type: Optional[NumpyDType] = None,
|
||||
) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]:
|
||||
pyarrow_extension = False
|
||||
def pandas_check_dtypes(data: DataFrame, enable_categorical: bool) -> None:
|
||||
"""Validate the input dtypes and warn when sparse pandas arrays are converted to dense."""
|
||||
sparse_extension = False
|
||||
|
||||
for dtype in data.dtypes:
|
||||
if not (
|
||||
(dtype.name in _pandas_dtype_mapper)
|
||||
@@ -498,27 +569,65 @@ def _transform_pandas_df(
|
||||
or is_pa_ext_dtype(dtype)
|
||||
):
|
||||
_invalid_dataframe_dtype(data)
|
||||
if is_pa_ext_dtype(dtype):
|
||||
pyarrow_extension = True
|
||||
|
||||
if is_pd_sparse_dtype(dtype):
|
||||
sparse_extension = True
|
||||
|
||||
if sparse_extension:
|
||||
warnings.warn("Sparse arrays from pandas are converted into dense.")
|
||||
|
||||
|
||||
class PandasTransformed:
    """Container for the per-column arrays obtained from a pandas DataFrame."""

    def __init__(self, columns: List[np.ndarray]) -> None:
        # One numpy array per DataFrame column.
        self.columns = columns

    def array_interface(self) -> bytes:
        """Return a byte string for JSON encoded array interface."""
        interfaces = [_array_interface_dict(column) for column in self.columns]
        return bytes(json.dumps(interfaces), "utf-8")

    @property
    def shape(self) -> Tuple[int, int]:
        """Return shape of the transformed DataFrame."""
        # The row count is taken from the first column.
        n_samples = self.columns[0].shape[0]
        n_features = len(self.columns)
        return n_samples, n_features
|
||||
|
||||
|
||||
def _transform_pandas_df(
|
||||
data: DataFrame,
|
||||
enable_categorical: bool,
|
||||
feature_names: Optional[FeatureNames] = None,
|
||||
feature_types: Optional[FeatureTypes] = None,
|
||||
meta: Optional[str] = None,
|
||||
) -> Tuple[PandasTransformed, Optional[FeatureNames], Optional[FeatureTypes]]:
|
||||
pandas_check_dtypes(data, enable_categorical)
|
||||
if meta and len(data.columns) > 1 and meta not in _matrix_meta:
|
||||
raise ValueError(f"DataFrame for {meta} cannot have multiple columns")
|
||||
|
||||
feature_names, feature_types = pandas_feature_info(
|
||||
data, meta, feature_names, feature_types, enable_categorical
|
||||
)
|
||||
|
||||
transformed = pandas_cat_null(data)
|
||||
if pyarrow_extension:
|
||||
if transformed is data:
|
||||
transformed = data.copy(deep=False)
|
||||
transformed = pandas_ext_num_types(transformed)
|
||||
arrays = pandas_transform_data(data)
|
||||
return PandasTransformed(arrays), feature_names, feature_types
|
||||
|
||||
if meta and len(data.columns) > 1 and meta not in _matrix_meta:
|
||||
raise ValueError(f"DataFrame for {meta} cannot have multiple columns")
|
||||
|
||||
dtype = meta_type if meta_type else np.float32
|
||||
arr: np.ndarray = transformed.values
|
||||
if meta_type:
|
||||
arr = arr.astype(dtype)
|
||||
return arr, feature_names, feature_types
|
||||
def _meta_from_pandas_df(
    data: DataType,
    name: str,
    dtype: Optional[NumpyDType],
    handle: ctypes.c_void_p,
) -> None:
    """Set the meta info field `name` on `handle` from a pandas DataFrame.

    The DataFrame is first converted by :py:func:`_transform_pandas_df` (with
    categorical support disabled), then forwarded to :py:func:`_meta_from_numpy`.

    """
    transformed, _, _ = _transform_pandas_df(data, False, meta=name)
    columns = transformed.columns
    # A single column is forwarded as is; multiple columns are stacked and transposed
    # so that every input column becomes one column of the resulting matrix.
    array = columns[0] if len(columns) == 1 else np.stack(columns).T

    array, dtype = _ensure_np_dtype(array, dtype)
    _meta_from_numpy(array, name, dtype, handle)
|
||||
|
||||
|
||||
def _from_pandas_df(
|
||||
@@ -530,12 +639,21 @@ def _from_pandas_df(
|
||||
feature_types: Optional[FeatureTypes],
|
||||
data_split_mode: DataSplitMode = DataSplitMode.ROW,
|
||||
) -> DispatchedDataBackendReturnType:
|
||||
data, feature_names, feature_types = _transform_pandas_df(
|
||||
df, feature_names, feature_types = _transform_pandas_df(
|
||||
data, enable_categorical, feature_names, feature_types
|
||||
)
|
||||
return _from_numpy_array(
|
||||
data, missing, nthread, feature_names, feature_types, data_split_mode
|
||||
|
||||
handle = ctypes.c_void_p()
|
||||
_check_call(
|
||||
_LIB.XGDMatrixCreateFromColumnar(
|
||||
df.array_interface(),
|
||||
make_jcargs(
|
||||
nthread=nthread, missing=missing, data_split_mode=data_split_mode
|
||||
),
|
||||
ctypes.byref(handle),
|
||||
)
|
||||
)
|
||||
return handle, feature_names, feature_types
|
||||
|
||||
|
||||
def _is_pandas_series(data: DataType) -> bool:
|
||||
@@ -550,7 +668,12 @@ def _meta_from_pandas_series(
|
||||
data: DataType, name: str, dtype: Optional[NumpyDType], handle: ctypes.c_void_p
|
||||
) -> None:
|
||||
"""Help transform pandas series for meta data like labels"""
|
||||
data = data.values.astype("float")
|
||||
if is_pd_sparse_dtype(data.dtype):
|
||||
data = data.values.to_dense().astype(np.float32)
|
||||
elif is_pa_ext_dtype(data.dtype):
|
||||
data = pandas_pa_type(data)
|
||||
else:
|
||||
data = data.to_numpy(np.float32, na_value=np.nan)
|
||||
|
||||
if is_pd_sparse_dtype(getattr(data, "dtype", data)):
|
||||
data = data.to_dense() # type: ignore
|
||||
@@ -732,6 +855,8 @@ def _arrow_transform(data: DataType) -> Any:
|
||||
return pd.ArrowDtype(pa.bool_())
|
||||
return None
|
||||
|
||||
# For common cases, this is zero-copy, can check with:
|
||||
# pa.total_allocated_bytes()
|
||||
df = data.to_pandas(types_mapper=type_mapper)
|
||||
return df
|
||||
|
||||
@@ -859,11 +984,10 @@ def _from_cudf_df(
|
||||
)
|
||||
interfaces_str = _cudf_array_interfaces(data, cat_codes)
|
||||
handle = ctypes.c_void_p()
|
||||
config = bytes(json.dumps({"missing": missing, "nthread": nthread}), "utf-8")
|
||||
_check_call(
|
||||
_LIB.XGDMatrixCreateFromCudaColumnar(
|
||||
interfaces_str,
|
||||
config,
|
||||
make_jcargs(nthread=nthread, missing=missing),
|
||||
ctypes.byref(handle),
|
||||
)
|
||||
)
|
||||
@@ -874,11 +998,8 @@ def _is_cudf_ser(data: DataType) -> bool:
|
||||
return lazy_isinstance(data, "cudf.core.series", "Series")
|
||||
|
||||
|
||||
def _is_cupy_array(data: DataType) -> bool:
    """Check whether `data` is an ``ndarray`` from one of the known cupy modules."""
    cupy_modules = ("cupy.core.core", "cupy", "cupy._core.core")
    for module in cupy_modules:
        if lazy_isinstance(data, module, "ndarray"):
            return True
    return False
|
||||
def _is_cupy_alike(data: DataType) -> bool:
    """Check whether `data` exposes the ``__cuda_array_interface__`` attribute."""
    exposes_cuda_interface = hasattr(data, "__cuda_array_interface__")
    return exposes_cuda_interface
|
||||
|
||||
|
||||
def _transform_cupy_array(data: DataType) -> CupyT:
|
||||
@@ -886,7 +1007,7 @@ def _transform_cupy_array(data: DataType) -> CupyT:
|
||||
|
||||
if not hasattr(data, "__cuda_array_interface__") and hasattr(data, "__array__"):
|
||||
data = cupy.array(data, copy=False)
|
||||
if data.dtype.hasobject or data.dtype in [cupy.bool_]:
|
||||
if _array_hasobject(data) or data.dtype in [cupy.bool_]:
|
||||
data = data.astype(cupy.float32, copy=False)
|
||||
return data
|
||||
|
||||
@@ -1047,15 +1168,15 @@ def dispatch_data_backend(
|
||||
"""Dispatch data for DMatrix."""
|
||||
if not _is_cudf_ser(data) and not _is_pandas_series(data):
|
||||
_check_data_shape(data)
|
||||
if _is_scipy_csr(data):
|
||||
if is_scipy_csr(data):
|
||||
return _from_scipy_csr(
|
||||
data, missing, threads, feature_names, feature_types, data_split_mode
|
||||
)
|
||||
if _is_scipy_csc(data):
|
||||
if is_scipy_csc(data):
|
||||
return _from_scipy_csc(
|
||||
data, missing, threads, feature_names, feature_types, data_split_mode
|
||||
)
|
||||
if _is_scipy_coo(data):
|
||||
if is_scipy_coo(data):
|
||||
return _from_scipy_csr(
|
||||
data.tocsr(),
|
||||
missing,
|
||||
@@ -1098,7 +1219,7 @@ def dispatch_data_backend(
|
||||
return _from_cudf_df(
|
||||
data, missing, threads, feature_names, feature_types, enable_categorical
|
||||
)
|
||||
if _is_cupy_array(data):
|
||||
if _is_cupy_alike(data):
|
||||
return _from_cupy_array(data, missing, threads, feature_names, feature_types)
|
||||
if _is_cupy_csr(data):
|
||||
raise TypeError("cupyx CSR is not supported yet.")
|
||||
@@ -1221,8 +1342,7 @@ def dispatch_meta_backend(
|
||||
if _is_arrow(data):
|
||||
data = _arrow_transform(data)
|
||||
if _is_pandas_df(data):
|
||||
data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype)
|
||||
_meta_from_numpy(data, name, dtype, handle)
|
||||
_meta_from_pandas_df(data, name, dtype=dtype, handle=handle)
|
||||
return
|
||||
if _is_pandas_series(data):
|
||||
_meta_from_pandas_series(data, name, dtype, handle)
|
||||
@@ -1231,7 +1351,7 @@ def dispatch_meta_backend(
|
||||
data = _transform_dlpack(data)
|
||||
_meta_from_cupy_array(data, name, handle)
|
||||
return
|
||||
if _is_cupy_array(data):
|
||||
if _is_cupy_alike(data):
|
||||
_meta_from_cupy_array(data, name, handle)
|
||||
return
|
||||
if _is_cudf_ser(data):
|
||||
@@ -1244,8 +1364,7 @@ def dispatch_meta_backend(
|
||||
_meta_from_dt(data, name, dtype, handle)
|
||||
return
|
||||
if _is_modin_df(data):
|
||||
data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype)
|
||||
_meta_from_numpy(data, name, dtype, handle)
|
||||
_meta_from_pandas_df(data, name, dtype=dtype, handle=handle)
|
||||
return
|
||||
if _is_modin_series(data):
|
||||
data = data.values.astype("float")
|
||||
@@ -1297,7 +1416,7 @@ def _proxy_transform(
|
||||
return _transform_cudf_df(
|
||||
data, feature_names, feature_types, enable_categorical
|
||||
)
|
||||
if _is_cupy_array(data):
|
||||
if _is_cupy_alike(data):
|
||||
data = _transform_cupy_array(data)
|
||||
return data, None, feature_names, feature_types
|
||||
if _is_dlpack(data):
|
||||
@@ -1307,9 +1426,15 @@ def _proxy_transform(
|
||||
if _is_np_array_like(data):
|
||||
data, _ = _ensure_np_dtype(data, data.dtype)
|
||||
return data, None, feature_names, feature_types
|
||||
if _is_scipy_csr(data):
|
||||
if is_scipy_csr(data):
|
||||
data = transform_scipy_sparse(data, True)
|
||||
return data, None, feature_names, feature_types
|
||||
if is_scipy_csc(data):
|
||||
data = transform_scipy_sparse(data.tocsr(), True)
|
||||
return data, None, feature_names, feature_types
|
||||
if is_scipy_coo(data):
|
||||
data = transform_scipy_sparse(data.tocsr(), True)
|
||||
return data, None, feature_names, feature_types
|
||||
if _is_pandas_series(data):
|
||||
import pandas as pd
|
||||
|
||||
@@ -1317,11 +1442,10 @@ def _proxy_transform(
|
||||
if _is_arrow(data):
|
||||
data = _arrow_transform(data)
|
||||
if _is_pandas_df(data):
|
||||
arr, feature_names, feature_types = _transform_pandas_df(
|
||||
df, feature_names, feature_types = _transform_pandas_df(
|
||||
data, enable_categorical, feature_names, feature_types
|
||||
)
|
||||
arr, _ = _ensure_np_dtype(arr, arr.dtype)
|
||||
return arr, None, feature_names, feature_types
|
||||
return df, None, feature_names, feature_types
|
||||
raise TypeError("Value type is not supported for data iterator:" + str(type(data)))
|
||||
|
||||
|
||||
@@ -1343,7 +1467,7 @@ def dispatch_proxy_set_data(
|
||||
# pylint: disable=W0212
|
||||
proxy._set_data_from_cuda_columnar(data, cast(List, cat_codes))
|
||||
return
|
||||
if _is_cupy_array(data):
|
||||
if _is_cupy_alike(data):
|
||||
proxy._set_data_from_cuda_interface(data) # pylint: disable=W0212
|
||||
return
|
||||
if _is_dlpack(data):
|
||||
@@ -1356,11 +1480,14 @@ def dispatch_proxy_set_data(
|
||||
if not allow_host:
|
||||
raise err
|
||||
|
||||
if isinstance(data, PandasTransformed):
|
||||
proxy._set_data_from_pandas(data) # pylint: disable=W0212
|
||||
return
|
||||
if _is_np_array_like(data):
|
||||
_check_data_shape(data)
|
||||
proxy._set_data_from_array(data) # pylint: disable=W0212
|
||||
return
|
||||
if _is_scipy_csr(data):
|
||||
if is_scipy_csr(data):
|
||||
proxy._set_data_from_csr(data) # pylint: disable=W0212
|
||||
return
|
||||
raise err
|
||||
|
||||
@@ -39,7 +39,7 @@ from .core import (
|
||||
_deprecate_positional_args,
|
||||
_parse_eval_str,
|
||||
)
|
||||
from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array, _is_pandas_df
|
||||
from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_alike, _is_pandas_df
|
||||
from .training import train
|
||||
|
||||
|
||||
@@ -192,11 +192,16 @@ __model_doc = f"""
|
||||
Boosting learning rate (xgb's "eta")
|
||||
verbosity : Optional[int]
|
||||
The degree of verbosity. Valid values are 0 (silent) - 3 (debug).
|
||||
|
||||
objective : {SklObjective}
|
||||
Specify the learning task and the corresponding learning objective or
|
||||
a custom objective function to be used (see note below).
|
||||
|
||||
Specify the learning task and the corresponding learning objective or a custom
|
||||
objective function to be used. For custom objective, see
|
||||
:doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more
|
||||
information.
|
||||
|
||||
booster: Optional[str]
|
||||
Specify which booster to use: gbtree, gblinear or dart.
|
||||
Specify which booster to use: `gbtree`, `gblinear` or `dart`.
|
||||
tree_method: Optional[str]
|
||||
Specify which tree method to use. Default to auto. If this parameter is set to
|
||||
default, XGBoost will choose the most conservative option available. It's
|
||||
@@ -276,13 +281,7 @@ __model_doc = f"""
|
||||
|
||||
enable_categorical : bool
|
||||
|
||||
.. versionadded:: 1.5.0
|
||||
|
||||
.. note:: This parameter is experimental
|
||||
|
||||
Experimental support for categorical data. When enabled, cudf/pandas.DataFrame
|
||||
should be used to specify categorical data type. Also, JSON/UBJSON
|
||||
serialization format is required.
|
||||
See the same parameter of :py:class:`DMatrix` for details.
|
||||
|
||||
feature_types : Optional[FeatureTypes]
|
||||
|
||||
@@ -334,21 +333,21 @@ __model_doc = f"""
|
||||
|
||||
Metric used for monitoring the training result and early stopping. It can be a
|
||||
string or list of strings as names of predefined metric in XGBoost (See
|
||||
doc/parameter.rst), one of the metrics in :py:mod:`sklearn.metrics`, or any other
|
||||
user defined metric that looks like `sklearn.metrics`.
|
||||
doc/parameter.rst), one of the metrics in :py:mod:`sklearn.metrics`, or any
|
||||
other user defined metric that looks like `sklearn.metrics`.
|
||||
|
||||
If custom objective is also provided, then custom metric should implement the
|
||||
corresponding reverse link function.
|
||||
|
||||
Unlike the `scoring` parameter commonly used in scikit-learn, when a callable
|
||||
object is provided, it's assumed to be a cost function and by default XGBoost will
|
||||
minimize the result during early stopping.
|
||||
object is provided, it's assumed to be a cost function and by default XGBoost
|
||||
will minimize the result during early stopping.
|
||||
|
||||
For advanced usage on Early stopping like directly choosing to maximize instead of
|
||||
minimize, see :py:obj:`xgboost.callback.EarlyStopping`.
|
||||
For advanced usage on Early stopping like directly choosing to maximize instead
|
||||
of minimize, see :py:obj:`xgboost.callback.EarlyStopping`.
|
||||
|
||||
See :doc:`Custom Objective and Evaluation Metric </tutorials/custom_metric_obj>`
|
||||
for more.
|
||||
See :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more
|
||||
information.
|
||||
|
||||
.. note::
|
||||
|
||||
@@ -1012,7 +1011,7 @@ class XGBModel(XGBModelBase):
|
||||
sample_weight :
|
||||
instance weights
|
||||
base_margin :
|
||||
global bias for each instance.
|
||||
Global bias for each instance. See :doc:`/tutorials/intercept` for details.
|
||||
eval_set :
|
||||
A list of (X, y) tuple pairs to use as validation sets, for which
|
||||
metrics will be computed.
|
||||
@@ -1152,7 +1151,7 @@ class XGBModel(XGBModelBase):
|
||||
When this is True, validate that the Booster's and data's feature_names are
|
||||
identical. Otherwise, it is assumed that the feature_names are the same.
|
||||
base_margin :
|
||||
Margin added to prediction.
|
||||
Global bias for each instance. See :doc:`/tutorials/intercept` for details.
|
||||
iteration_range :
|
||||
Specifies which layer of trees are used in prediction. For example, if a
|
||||
random forest is trained with 100 rounds. Specifying ``iteration_range=(10,
|
||||
@@ -1178,7 +1177,7 @@ class XGBModel(XGBModelBase):
|
||||
base_margin=base_margin,
|
||||
validate_features=validate_features,
|
||||
)
|
||||
if _is_cupy_array(predts):
|
||||
if _is_cupy_alike(predts):
|
||||
import cupy # pylint: disable=import-error
|
||||
|
||||
predts = cupy.asnumpy(predts) # ensure numpy array is used.
|
||||
@@ -1459,7 +1458,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
|
||||
classes = cp.unique(y.values)
|
||||
self.n_classes_ = len(classes)
|
||||
expected_classes = cp.array(self.classes_)
|
||||
elif _is_cupy_array(y):
|
||||
elif _is_cupy_alike(y):
|
||||
import cupy as cp # pylint: disable=E0401
|
||||
|
||||
classes = cp.unique(y)
|
||||
@@ -1605,7 +1604,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
|
||||
When this is True, validate that the Booster's and data's feature_names are
|
||||
identical. Otherwise, it is assumed that the feature_names are the same.
|
||||
base_margin :
|
||||
Margin added to prediction.
|
||||
Global bias for each instance. See :doc:`/tutorials/intercept` for details.
|
||||
iteration_range :
|
||||
Specifies which layer of trees are used in prediction. For example, if a
|
||||
random forest is trained with 100 rounds. Specifying `iteration_range=(10,
|
||||
@@ -1948,7 +1947,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
|
||||
weights to individual data points.
|
||||
|
||||
base_margin :
|
||||
Global bias for each instance.
|
||||
Global bias for each instance. See :doc:`/tutorials/intercept` for details.
|
||||
eval_set :
|
||||
A list of (X, y) tuple pairs to use as validation sets, for which
|
||||
metrics will be computed.
|
||||
|
||||
@@ -630,7 +630,7 @@ sparse_datasets_strategy = strategies.sampled_from(
|
||||
|
||||
def make_datasets_with_margin(
|
||||
unweighted_strategy: strategies.SearchStrategy,
|
||||
) -> Callable:
|
||||
) -> Callable[[], strategies.SearchStrategy[TestDataset]]:
|
||||
"""Factory function for creating strategies that generates datasets with weight and
|
||||
base margin.
|
||||
|
||||
@@ -668,8 +668,7 @@ def make_datasets_with_margin(
|
||||
|
||||
# A strategy for drawing from a set of example datasets. May add random weights to the
|
||||
# dataset
|
||||
@memory.cache
|
||||
def make_dataset_strategy() -> Callable:
|
||||
def make_dataset_strategy() -> strategies.SearchStrategy[TestDataset]:
|
||||
_unweighted_datasets_strategy = strategies.sampled_from(
|
||||
[
|
||||
TestDataset(
|
||||
@@ -815,6 +814,13 @@ def softprob_obj(
|
||||
return objective
|
||||
|
||||
|
||||
def ls_obj(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Least squared error objective.

    Returns the gradient ``y_pred - y_true`` together with a hessian of ones that has
    one entry per element of `y_true`.

    """
    n_samples = len(y_true)
    gradient = y_pred - y_true
    hessian = np.ones(n_samples)
    return gradient, hessian
|
||||
|
||||
|
||||
class DirectoryExcursion:
|
||||
"""Change directory. Change back and optionally cleaning up the directory when
|
||||
exit.
|
||||
|
||||
@@ -3,7 +3,18 @@
|
||||
import os
|
||||
import zipfile
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Generator, List, NamedTuple, Optional, Tuple, Union
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
Generator,
|
||||
NamedTuple,
|
||||
Optional,
|
||||
Tuple,
|
||||
Type,
|
||||
Union,
|
||||
)
|
||||
from urllib import request
|
||||
|
||||
import numpy as np
|
||||
@@ -15,6 +26,11 @@ from scipy import sparse
|
||||
import xgboost
|
||||
from xgboost.data import pandas_pyarrow_mapper
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ..compat import DataFrame as DataFrameT
|
||||
else:
|
||||
DataFrameT = Any
|
||||
|
||||
joblib = pytest.importorskip("joblib")
|
||||
memory = joblib.Memory("./cachedir", verbose=0)
|
||||
|
||||
@@ -246,46 +262,186 @@ def get_sparse() -> Tuple[np.ndarray, np.ndarray]:
|
||||
return X, y
|
||||
|
||||
|
||||
# pylint: disable=too-many-statements
|
||||
@memory.cache
|
||||
def get_ames_housing() -> Tuple[np.ndarray, np.ndarray]:
|
||||
"""
|
||||
def get_ames_housing() -> Tuple[DataFrameT, np.ndarray]:
|
||||
"""Get a synthetic version of the Ames housing dataset.
|
||||
|
||||
The real one can be obtained via:
|
||||
|
||||
.. code-block::
|
||||
|
||||
from sklearn import datasets
|
||||
|
||||
datasets.fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
|
||||
|
||||
Number of samples: 1460
|
||||
Number of features: 20
|
||||
Number of categorical features: 10
|
||||
Number of numerical features: 10
|
||||
"""
|
||||
datasets = pytest.importorskip("sklearn.datasets")
|
||||
X, y = datasets.fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
|
||||
pytest.importorskip("pandas")
|
||||
import pandas as pd
|
||||
|
||||
categorical_columns_subset: List[str] = [
|
||||
"BldgType", # 5 cats, no nan
|
||||
"GarageFinish", # 3 cats, nan
|
||||
"LotConfig", # 5 cats, no nan
|
||||
"Functional", # 7 cats, no nan
|
||||
"MasVnrType", # 4 cats, nan
|
||||
"HouseStyle", # 8 cats, no nan
|
||||
"FireplaceQu", # 5 cats, nan
|
||||
"ExterCond", # 5 cats, no nan
|
||||
"ExterQual", # 4 cats, no nan
|
||||
"PoolQC", # 3 cats, nan
|
||||
]
|
||||
rng = np.random.default_rng(1994)
|
||||
n_samples = 1460
|
||||
df = pd.DataFrame()
|
||||
|
||||
numerical_columns_subset: List[str] = [
|
||||
"3SsnPorch",
|
||||
"Fireplaces",
|
||||
"BsmtHalfBath",
|
||||
"HalfBath",
|
||||
"GarageCars",
|
||||
"TotRmsAbvGrd",
|
||||
"BsmtFinSF1",
|
||||
"BsmtFinSF2",
|
||||
"GrLivArea",
|
||||
"ScreenPorch",
|
||||
]
|
||||
def synth_cat(
|
||||
name_proba: Dict[Union[str, float], float], density: float
|
||||
) -> pd.Series:
|
||||
n_nulls = int(n_samples * (1 - density))
|
||||
has_nan = np.abs(1.0 - density) > 1e-6 and n_nulls > 0
|
||||
if has_nan:
|
||||
sparsity = 1.0 - density
|
||||
name_proba[np.nan] = sparsity
|
||||
|
||||
X = X[categorical_columns_subset + numerical_columns_subset]
|
||||
X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
|
||||
return X, y
|
||||
keys = list(name_proba.keys())
|
||||
p = list(name_proba.values())
|
||||
p[-1] += 1.0 - np.sum(p) # Fix floating point error
|
||||
x = rng.choice(keys, size=n_samples, p=p)
|
||||
|
||||
series = pd.Series(
|
||||
x,
|
||||
dtype=pd.CategoricalDtype(
|
||||
# not NA
|
||||
filter(lambda x: isinstance(x, str), keys)
|
||||
),
|
||||
)
|
||||
return series
|
||||
|
||||
df["BldgType"] = synth_cat(
|
||||
{
|
||||
"1Fam": 0.835616,
|
||||
"2fmCon": 0.078082,
|
||||
"Duplex": 0.035616,
|
||||
"Twnhs": 0.029452,
|
||||
"TwnhsE": 0.021233,
|
||||
},
|
||||
1.0,
|
||||
)
|
||||
df["GarageFinish"] = synth_cat(
|
||||
{"Unf": 0.414384, "RFn": 0.289041, "Fin": 0.241096}, 0.94452
|
||||
)
|
||||
df["LotConfig"] = synth_cat(
|
||||
{
|
||||
"Corner": 0.180137,
|
||||
"CulDSac": 0.064384,
|
||||
"FR2": 0.032192,
|
||||
"FR3": 0.002740,
|
||||
},
|
||||
1.0,
|
||||
)
|
||||
df["Functional"] = synth_cat(
|
||||
{
|
||||
"Typ": 0.931506,
|
||||
"Min2": 0.023287,
|
||||
"Min1": 0.021232,
|
||||
"Mod": 0.010273,
|
||||
"Maj1": 0.009589,
|
||||
"Maj2": 0.003424,
|
||||
"Sev": 0.000684,
|
||||
},
|
||||
1.0,
|
||||
)
|
||||
df["MasVnrType"] = synth_cat(
|
||||
{
|
||||
"None": 0.591780,
|
||||
"BrkFace": 0.304794,
|
||||
"Stone": 0.087671,
|
||||
"BrkCmn": 0.010273,
|
||||
},
|
||||
0.99452,
|
||||
)
|
||||
df["HouseStyle"] = synth_cat(
|
||||
{
|
||||
"1Story": 0.497260,
|
||||
"2Story": 0.304794,
|
||||
"1.5Fin": 0.105479,
|
||||
"SLvl": 0.044520,
|
||||
"SFoyer": 0.025342,
|
||||
"1.5Unf": 0.009589,
|
||||
"2.5Unf": 0.007534,
|
||||
"2.5Fin": 0.005479,
|
||||
},
|
||||
1.0,
|
||||
)
|
||||
df["FireplaceQu"] = synth_cat(
|
||||
{
|
||||
"Gd": 0.260273,
|
||||
"TA": 0.214383,
|
||||
"Fa": 0.022602,
|
||||
"Ex": 0.016438,
|
||||
"Po": 0.013698,
|
||||
},
|
||||
0.527397,
|
||||
)
|
||||
df["ExterCond"] = synth_cat(
|
||||
{
|
||||
"TA": 0.878082,
|
||||
"Gd": 0.1,
|
||||
"Fa": 0.019178,
|
||||
"Ex": 0.002054,
|
||||
"Po": 0.000684,
|
||||
},
|
||||
1.0,
|
||||
)
|
||||
df["ExterQual"] = synth_cat(
|
||||
{
|
||||
"TA": 0.620547,
|
||||
"Gd": 0.334246,
|
||||
"Ex": 0.035616,
|
||||
"Fa": 0.009589,
|
||||
},
|
||||
1.0,
|
||||
)
|
||||
df["PoolQC"] = synth_cat(
|
||||
{
|
||||
"Gd": 0.002054,
|
||||
"Ex": 0.001369,
|
||||
"Fa": 0.001369,
|
||||
},
|
||||
0.004794,
|
||||
)
|
||||
|
||||
# We focus on the categorical values here, for numerical features, simple normal
|
||||
# distribution is used, which doesn't match the original data.
|
||||
def synth_num(loc: float, std: float, density: float) -> pd.Series:
|
||||
x = rng.normal(loc=loc, scale=std, size=n_samples)
|
||||
n_nulls = int(n_samples * (1 - density))
|
||||
if np.abs(1.0 - density) > 1e-6 and n_nulls > 0:
|
||||
null_idx = rng.choice(n_samples, size=n_nulls, replace=False)
|
||||
x[null_idx] = np.nan
|
||||
return pd.Series(x, dtype=np.float64)
|
||||
|
||||
df["3SsnPorch"] = synth_num(3.4095890410958902, 29.31733055678188, 1.0)
|
||||
df["Fireplaces"] = synth_num(0.613013698630137, 0.6446663863122295, 1.0)
|
||||
df["BsmtHalfBath"] = synth_num(0.057534246575342465, 0.23875264627921178, 1.0)
|
||||
df["HalfBath"] = synth_num(0.38287671232876713, 0.5028853810928914, 1.0)
|
||||
df["GarageCars"] = synth_num(1.7671232876712328, 0.7473150101111095, 1.0)
|
||||
df["TotRmsAbvGrd"] = synth_num(6.517808219178082, 1.6253932905840505, 1.0)
|
||||
df["BsmtFinSF1"] = synth_num(443.6397260273973, 456.0980908409277, 1.0)
|
||||
df["BsmtFinSF2"] = synth_num(46.54931506849315, 161.31927280654173, 1.0)
|
||||
df["GrLivArea"] = synth_num(1515.463698630137, 525.4803834232025, 1.0)
|
||||
df["ScreenPorch"] = synth_num(15.060958904109588, 55.757415281874174, 1.0)
|
||||
|
||||
columns = list(df.columns)
|
||||
rng.shuffle(columns)
|
||||
df = df[columns]
|
||||
|
||||
# linear interaction for testing purposes.
|
||||
y = np.zeros(shape=(n_samples,))
|
||||
for c in df.columns:
|
||||
if isinstance(df[c].dtype, pd.CategoricalDtype):
|
||||
y += df[c].cat.codes.astype(np.float64)
|
||||
else:
|
||||
y += df[c].values
|
||||
|
||||
# Shift and scale to match the original y.
|
||||
y *= 79442.50288288662 / y.std()
|
||||
y += 180921.19589041095 - y.mean()
|
||||
|
||||
return df, y
|
||||
|
||||
|
||||
@memory.cache
|
||||
@@ -603,3 +759,51 @@ def sort_ltr_samples(
|
||||
data = X, clicks, y, qid
|
||||
|
||||
return data
|
||||
|
||||
|
||||
def run_base_margin_info(
    DType: Callable, DMatrixT: Type[xgboost.DMatrix], device: str
) -> None:
    """Run tests for setting and reading back the base margin.

    Parameters
    ----------
    DType :
        Callable that wraps a numpy array into the container under test.
    DMatrixT :
        The DMatrix class used to construct the data.
    device :
        Value passed as the ``device`` training parameter.

    """
    generator = np.random.default_rng()
    X = DType(generator.normal(0, 1.0, size=100).astype(np.float32).reshape(50, 2))
    is_frame = hasattr(X, "iloc")
    y = X.iloc[:, 0] if is_frame else X[:, 0]
    base_margin = X

    # Setting a margin with a mismatched shape is accepted at construction time ...
    Xy = DMatrixT(X, y, base_margin=base_margin)
    # ... and is rejected at train time by the check in the predictor.
    with pytest.raises(ValueError, match=r".*base_margin.*"):
        xgboost.train({"tree_method": "hist", "device": device}, Xy)

    if is_frame:
        # The remaining checks rely on array methods (`reshape`, `T`, `flags`).
        return

    # column major matrix
    stored = DType(Xy.get_base_margin().reshape(50, 2))
    assert (stored == base_margin).all()

    transposed = base_margin.T
    assert transposed.flags.c_contiguous is False
    assert transposed.flags.f_contiguous is True
    Xy.set_info(base_margin=base_margin.T)
    stored = DType(Xy.get_base_margin().reshape(2, 50))
    assert (stored == base_margin.T).all()

    # Row vs col vec.
    base_margin = y
    Xy.set_base_margin(base_margin)
    bm_col = Xy.get_base_margin()
    Xy.set_base_margin(base_margin.reshape(1, base_margin.size))
    bm_row = Xy.get_base_margin()
    assert (bm_row == bm_col).all()

    # type
    base_margin = base_margin.astype(np.float64)
    Xy.set_base_margin(base_margin)
    bm_f64 = Xy.get_base_margin()
    assert (bm_f64 == bm_col).all()

    # too many dimensions
    base_margin = X.reshape(2, 5, 2, 5)
    with pytest.raises(ValueError, match=r".*base_margin.*"):
        Xy.set_base_margin(base_margin)
|
||||
|
||||
@@ -394,3 +394,14 @@ def train_result(
|
||||
assert booster.feature_types == dmat.feature_types
|
||||
|
||||
return result
|
||||
|
||||
|
||||
class ResetStrategy(xgb.callback.TrainingCallback):
    """Callback for testing multi-output.

    Alternates the ``multi_strategy`` parameter of the model after every iteration.

    """

    def after_iteration(self, model: xgb.Booster, epoch: int, evals_log: dict) -> bool:
        # Even epochs select the multi-output tree, odd epochs one output per tree.
        strategy = "multi_output_tree" if epoch % 2 == 0 else "one_output_per_tree"
        model.set_param({"multi_strategy": strategy})
        # Never request early termination.
        return False
|
||||
|
||||
Reference in New Issue
Block a user