merge latest, Jan 12 2024

This commit is contained in:
Hui Liu
2024-01-12 09:57:11 -08:00
251 changed files with 9023 additions and 5012 deletions

View File

@@ -62,11 +62,31 @@ class TrainingCallback(ABC):
return model
def before_iteration(self, model: _Model, epoch: int, evals_log: EvalsLog) -> bool:
"""Run before each iteration. Return True when training should stop."""
"""Run before each iteration. Returns True when training should stop. See
:py:meth:`after_iteration` for details.
"""
return False
def after_iteration(self, model: _Model, epoch: int, evals_log: EvalsLog) -> bool:
"""Run after each iteration. Return True when training should stop."""
"""Run after each iteration. Returns `True` when training should stop.
Parameters
----------
model :
Either a :py:class:`~xgboost.Booster` object or a CVPack if the cv function
in xgboost is being used.
epoch :
The current training iteration.
evals_log :
A dictionary containing the evaluation history:
.. code-block:: python
{"data_name": {"metric_name": [0.5, ...]}}
"""
return False
@@ -547,14 +567,16 @@ class TrainingCheckPoint(TrainingCallback):
.. versionadded:: 1.3.0
Since XGBoost 2.1.0, the default format is changed to UBJSON.
Parameters
----------
directory :
Output model directory.
name :
pattern of output model file. Models will be saved as name_0.json, name_1.json,
name_2.json ....
pattern of output model file. Models will be saved as name_0.ubj, name_1.ubj,
name_2.ubj ....
as_pickle :
When set to True, all training parameters will be saved in pickle format,
instead of saving only the model.
@@ -564,6 +586,8 @@ class TrainingCheckPoint(TrainingCallback):
"""
default_format = "ubj"
def __init__(
self,
directory: Union[str, os.PathLike],
@@ -592,7 +616,7 @@ class TrainingCheckPoint(TrainingCallback):
self._name
+ "_"
+ (str(epoch + self._start))
+ (".pkl" if self._as_pickle else ".json"),
+ (".pkl" if self._as_pickle else f".{self.default_format}"),
)
self._epoch = 0 # reset counter
if collective.get_rank() == 0:

View File

@@ -100,6 +100,16 @@ def is_cupy_available() -> bool:
return False
def import_cupy() -> types.ModuleType:
"""Import cupy."""
if not is_cupy_available():
raise ImportError("`cupy` is required for handling CUDA buffer.")
import cupy # pylint: disable=import-error
return cupy
try:
import scipy.sparse as scipy_sparse
from scipy.sparse import csr_matrix as scipy_csr
@@ -128,9 +138,9 @@ def concat(value: Sequence[_T]) -> _T: # pylint: disable=too-many-return-statem
from cudf import concat as CUDF_concat # pylint: disable=import-error
return CUDF_concat(value, axis=0)
from .data import _is_cupy_array
from .data import _is_cupy_alike
if _is_cupy_array(value[0]):
if _is_cupy_alike(value[0]):
import cupy # pylint: disable=import-error
# pylint: disable=c-extension-no-member,no-member

View File

@@ -3,7 +3,6 @@
"""Core XGBoost Library."""
import copy
import ctypes
import importlib.util
import json
import os
import re
@@ -45,7 +44,6 @@ from ._typing import (
CStrPptr,
CStrPtr,
CTypeT,
CupyT,
DataType,
FeatureInfo,
FeatureNames,
@@ -55,7 +53,7 @@ from ._typing import (
TransformedData,
c_bst_ulong,
)
from .compat import PANDAS_INSTALLED, DataFrame, py_str
from .compat import PANDAS_INSTALLED, DataFrame, import_cupy, py_str
from .libpath import find_lib_path
@@ -213,6 +211,7 @@ def _load_lib() -> ctypes.CDLL:
lib = ctypes.cdll.LoadLibrary(lib_path)
setattr(lib, "path", os.path.normpath(lib_path))
lib_success = True
break
except OSError as e:
os_error_list.append(str(e))
continue
@@ -358,10 +357,13 @@ def _numpy2ctypes_type(dtype: Type[np.number]) -> Type[CNumeric]:
return _NUMPY_TO_CTYPES_MAPPING[dtype]
def _array_hasobject(data: DataType) -> bool:
return hasattr(data.dtype, "hasobject") and data.dtype.hasobject
def _cuda_array_interface(data: DataType) -> bytes:
assert (
data.dtype.hasobject is False
), "Input data contains `object` dtype. Expecting numeric data."
if _array_hasobject(data):
raise ValueError("Input data contains `object` dtype. Expecting numeric data.")
interface = data.__cuda_array_interface__
if "mask" in interface:
interface["mask"] = interface["mask"].__cuda_array_interface__
@@ -380,34 +382,6 @@ def ctypes2numpy(cptr: CNumericPtr, length: int, dtype: Type[np.number]) -> np.n
return res
def ctypes2cupy(cptr: CNumericPtr, length: int, dtype: Type[np.number]) -> CupyT:
"""Convert a ctypes pointer array to a cupy array."""
# pylint: disable=import-error
import cupy
from cupy.cuda.memory import MemoryPointer, UnownedMemory
CUPY_TO_CTYPES_MAPPING: Dict[Type[np.number], Type[CNumeric]] = {
cupy.float32: ctypes.c_float,
cupy.uint32: ctypes.c_uint,
}
if dtype not in CUPY_TO_CTYPES_MAPPING:
raise RuntimeError(f"Supported types: {CUPY_TO_CTYPES_MAPPING.keys()}")
addr = ctypes.cast(cptr, ctypes.c_void_p).value
# pylint: disable=c-extension-no-member,no-member
device = cupy.cuda.runtime.pointerGetAttributes(addr).device
# The owner field is just used to keep the memory alive with ref count. As
# unowned's life time is scoped within this function we don't need that.
unownd = UnownedMemory(
addr, length * ctypes.sizeof(CUPY_TO_CTYPES_MAPPING[dtype]), owner=None
)
memptr = MemoryPointer(unownd, 0)
# pylint: disable=unexpected-keyword-arg
mem = cupy.ndarray((length,), dtype=dtype, memptr=memptr)
assert mem.device.id == device
arr = cupy.array(mem, copy=True)
return arr
def ctypes2buffer(cptr: CStrPtr, length: int) -> bytearray:
"""Convert ctypes pointer to buffer type."""
if not isinstance(cptr, ctypes.POINTER(ctypes.c_char)):
@@ -466,14 +440,8 @@ def from_array_interface(interface: dict) -> NumpyOrCupy:
if "stream" in interface:
# CUDA stream is presented, this is a __cuda_array_interface__.
spec = importlib.util.find_spec("cupy")
if spec is None:
raise ImportError("`cupy` is required for handling CUDA buffer.")
import cupy as cp # pylint: disable=import-error
arr.__cuda_array_interface__ = interface
out = cp.array(arr, copy=True)
out = import_cupy().array(arr, copy=True)
else:
arr.__array_interface__ = interface
out = np.array(arr, copy=True)
@@ -481,17 +449,42 @@ def from_array_interface(interface: dict) -> NumpyOrCupy:
return out
def make_array_interface(
ptr: CNumericPtr, shape: Tuple[int, ...], dtype: Type[np.number], is_cuda: bool
) -> Dict[str, Union[int, tuple, None]]:
"""Make an __(cuda)_array_interface__ from a pointer."""
# Use an empty array to handle typestr and descr
if is_cuda:
empty = import_cupy().empty(shape=(0,), dtype=dtype)
array = empty.__cuda_array_interface__ # pylint: disable=no-member
else:
empty = np.empty(shape=(0,), dtype=dtype)
array = empty.__array_interface__ # pylint: disable=no-member
addr = ctypes.cast(ptr, ctypes.c_void_p).value
length = int(np.prod(shape))
# Handle empty dataset.
assert addr is not None or length == 0
if addr is None:
return array
array["data"] = (addr, True)
if is_cuda:
array["stream"] = 2
array["shape"] = shape
array["strides"] = None
return array
def _prediction_output(
shape: CNumericPtr, dims: c_bst_ulong, predts: CFloatPtr, is_cuda: bool
) -> NumpyOrCupy:
arr_shape = ctypes2numpy(shape, dims.value, np.uint64)
length = int(np.prod(arr_shape))
if is_cuda:
arr_predict = ctypes2cupy(predts, length, np.float32)
else:
arr_predict = ctypes2numpy(predts, length, np.float32)
arr_predict = arr_predict.reshape(arr_shape)
return arr_predict
arr_shape = tuple(ctypes2numpy(shape, dims.value, np.uint64).flatten())
array = from_array_interface(
make_array_interface(predts, arr_shape, np.float32, is_cuda)
)
return array
class DataIter(ABC): # pylint: disable=too-many-instance-attributes
@@ -795,7 +788,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
so it doesn't make sense to assign weights to individual data points.
base_margin :
Base margin used for boosting from existing model.
Global bias for each instance. See :doc:`/tutorials/intercept` for details.
missing :
Value in the input data which needs to be present as a missing value. If
None, defaults to np.nan.
@@ -832,9 +825,19 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
.. note:: This parameter is experimental
Experimental support of specializing for categorical features. Do not set
to True unless you are interested in development. Also, JSON/UBJSON
serialization format is required.
Experimental support of specializing for categorical features.
If passing 'True' and 'data' is a data frame (from supported libraries such
as Pandas, Modin or cuDF), columns of categorical types will automatically
be set to be of categorical type (feature_type='c') in the resulting
DMatrix.
If passing 'False' and 'data' is a data frame with categorical columns,
it will result in an error being thrown.
If 'data' is not a data frame, this argument is ignored.
JSON/UBJSON serialization format is required for this.
"""
if group is not None and qid is not None:
@@ -1441,6 +1444,12 @@ class _ProxyDMatrix(DMatrix):
_LIB.XGProxyDMatrixSetDataDense(self.handle, _array_interface(data))
)
def _set_data_from_pandas(self, data: DataType) -> None:
"""Set data from a pandas DataFrame. The input is a PandasTransformed instance."""
_check_call(
_LIB.XGProxyDMatrixSetDataColumnar(self.handle, data.array_interface())
)
def _set_data_from_csr(self, csr: scipy.sparse.csr_matrix) -> None:
"""Set data from scipy csr"""
from .data import _array_interface
@@ -2096,7 +2105,7 @@ class Booster:
_array_interface,
_cuda_array_interface,
_ensure_np_dtype,
_is_cupy_array,
_is_cupy_alike,
)
self._assign_dmatrix_features(dtrain)
@@ -2110,7 +2119,7 @@ class Booster:
"Expecting `np.ndarray` or `cupy.ndarray` for gradient and hessian."
f" Got: {type(array)}"
)
if not isinstance(array, np.ndarray) and not _is_cupy_array(array):
if not isinstance(array, np.ndarray) and not _is_cupy_alike(array):
raise TypeError(msg)
n_samples = dtrain.num_row()
@@ -2125,7 +2134,7 @@ class Booster:
if isinstance(array, np.ndarray):
array, _ = _ensure_np_dtype(array, array.dtype)
interface = _array_interface(array)
elif _is_cupy_array(array):
elif _is_cupy_alike(array):
interface = _cuda_array_interface(array)
else:
raise TypeError(msg)
@@ -2450,11 +2459,12 @@ class Booster:
assert proxy is None or isinstance(proxy, _ProxyDMatrix)
from .data import (
PandasTransformed,
_array_interface,
_arrow_transform,
_is_arrow,
_is_cudf_df,
_is_cupy_array,
_is_cupy_alike,
_is_list,
_is_np_array_like,
_is_pandas_df,
@@ -2504,6 +2514,19 @@ class Booster:
)
)
return _prediction_output(shape, dims, preds, False)
if isinstance(data, PandasTransformed):
_check_call(
_LIB.XGBoosterPredictFromColumnar(
self.handle,
data.array_interface(),
args,
p_handle,
ctypes.byref(shape),
ctypes.byref(dims),
ctypes.byref(preds),
)
)
return _prediction_output(shape, dims, preds, False)
if isinstance(data, scipy.sparse.csr_matrix):
from .data import transform_scipy_sparse
@@ -2523,7 +2546,7 @@ class Booster:
)
)
return _prediction_output(shape, dims, preds, False)
if _is_cupy_array(data):
if _is_cupy_alike(data):
from .data import _transform_cupy_array
data = _transform_cupy_array(data)
@@ -2571,9 +2594,8 @@ class Booster:
The model is saved in an XGBoost internal format which is universal among the
various XGBoost interfaces. Auxiliary attributes of the Python Booster object
(such as feature_names) will not be saved when using binary format. To save
those attributes, use JSON/UBJ instead. See :doc:`Model IO
</tutorials/saving_model>` for more info.
(such as feature_names) are only saved when using JSON or UBJSON (default)
format. See :doc:`Model IO </tutorials/saving_model>` for more info.
.. code-block:: python
@@ -2593,15 +2615,18 @@ class Booster:
else:
raise TypeError("fname must be a string or os PathLike")
def save_raw(self, raw_format: str = "deprecated") -> bytearray:
def save_raw(self, raw_format: str = "ubj") -> bytearray:
"""Save the model to a in memory buffer representation instead of file.
The model is saved in an XGBoost internal format which is universal among the
various XGBoost interfaces. Auxiliary attributes of the Python Booster object
(such as feature_names) are only saved when using JSON or UBJSON (default)
format. See :doc:`Model IO </tutorials/saving_model>` for more info.
Parameters
----------
raw_format :
Format of output buffer. Can be `json`, `ubj` or `deprecated`. Right now
the default is `deprecated` but it will be changed to `ubj` (universal binary
json) in the future.
Format of output buffer. Can be `json`, `ubj` or `deprecated`.
Returns
-------
@@ -2620,11 +2645,10 @@ class Booster:
def load_model(self, fname: ModelIn) -> None:
"""Load the model from a file or a bytearray.
The model is loaded from XGBoost format which is universal among the various
XGBoost interfaces. Auxiliary attributes of the Python Booster object (such as
feature_names) will not be loaded when using binary format. To save those
attributes, use JSON/UBJ instead. See :doc:`Model IO </tutorials/saving_model>`
for more info.
The model is saved in an XGBoost internal format which is universal among the
various XGBoost interfaces. Auxiliary attributes of the Python Booster object
(such as feature_names) are only saved when using JSON or UBJSON (default)
format. See :doc:`Model IO </tutorials/saving_model>` for more info.
.. code-block:: python
@@ -2749,9 +2773,9 @@ class Booster:
with_stats: bool = False,
dump_format: str = "text",
) -> List[str]:
"""Returns the model dump as a list of strings. Unlike :py:meth:`save_model`, the output
format is primarily used for visualization or interpretation, hence it's more
human readable but cannot be loaded back to XGBoost.
"""Returns the model dump as a list of strings. Unlike :py:meth:`save_model`,
the output format is primarily used for visualization or interpretation, hence
it's more human readable but cannot be loaded back to XGBoost.
Parameters
----------

View File

@@ -75,7 +75,7 @@ from xgboost.core import (
_deprecate_positional_args,
_expect,
)
from xgboost.data import _is_cudf_ser, _is_cupy_array
from xgboost.data import _is_cudf_ser, _is_cupy_alike
from xgboost.sklearn import (
XGBClassifier,
XGBClassifierBase,
@@ -1909,7 +1909,7 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
self.classes_ = await self.client.compute(y.drop_duplicates())
if _is_cudf_ser(self.classes_):
self.classes_ = self.classes_.to_cupy()
if _is_cupy_array(self.classes_):
if _is_cupy_alike(self.classes_):
self.classes_ = self.classes_.get()
self.classes_ = numpy.array(self.classes_)
self.n_classes_ = len(self.classes_)

View File

@@ -26,6 +26,7 @@ from .core import (
DataIter,
DataSplitMode,
DMatrix,
_array_hasobject,
_check_call,
_cuda_array_interface,
_ProxyDMatrix,
@@ -57,21 +58,36 @@ def _check_data_shape(data: DataType) -> None:
raise ValueError("Please reshape the input data into 2-dimensional matrix.")
def _is_scipy_csr(data: DataType) -> bool:
def is_scipy_csr(data: DataType) -> bool:
"""Predicate for scipy CSR input."""
is_array = False
is_matrix = False
try:
import scipy.sparse
from scipy.sparse import csr_array
is_array = isinstance(data, csr_array)
except ImportError:
return False
return isinstance(data, scipy.sparse.csr_matrix)
pass
try:
from scipy.sparse import csr_matrix
is_matrix = isinstance(data, csr_matrix)
except ImportError:
pass
return is_array or is_matrix
def _array_interface(data: np.ndarray) -> bytes:
assert (
data.dtype.hasobject is False
), "Input data contains `object` dtype. Expecting numeric data."
def _array_interface_dict(data: np.ndarray) -> dict:
if _array_hasobject(data):
raise ValueError("Input data contains `object` dtype. Expecting numeric data.")
interface = data.__array_interface__
if "mask" in interface:
interface["mask"] = interface["mask"].__array_interface__
return interface
def _array_interface(data: np.ndarray) -> bytes:
interface = _array_interface_dict(data)
interface_str = bytes(json.dumps(interface), "utf-8")
return interface_str
@@ -130,12 +146,23 @@ def _from_scipy_csr(
return handle, feature_names, feature_types
def _is_scipy_csc(data: DataType) -> bool:
def is_scipy_csc(data: DataType) -> bool:
"""Predicate for scipy CSC input."""
is_array = False
is_matrix = False
try:
import scipy.sparse
from scipy.sparse import csc_array
is_array = isinstance(data, csc_array)
except ImportError:
return False
return isinstance(data, scipy.sparse.csc_matrix)
pass
try:
from scipy.sparse import csc_matrix
is_matrix = isinstance(data, csc_matrix)
except ImportError:
pass
return is_array or is_matrix
def _from_scipy_csc(
@@ -166,12 +193,23 @@ def _from_scipy_csc(
return handle, feature_names, feature_types
def _is_scipy_coo(data: DataType) -> bool:
def is_scipy_coo(data: DataType) -> bool:
"""Predicate for scipy COO input."""
is_array = False
is_matrix = False
try:
import scipy.sparse
from scipy.sparse import coo_array
is_array = isinstance(data, coo_array)
except ImportError:
return False
return isinstance(data, scipy.sparse.coo_matrix)
pass
try:
from scipy.sparse import coo_matrix
is_matrix = isinstance(data, coo_matrix)
except ImportError:
pass
return is_array or is_matrix
def _is_np_array_like(data: DataType) -> bool:
@@ -181,7 +219,7 @@ def _is_np_array_like(data: DataType) -> bool:
def _ensure_np_dtype(
data: DataType, dtype: Optional[NumpyDType]
) -> Tuple[np.ndarray, Optional[NumpyDType]]:
if data.dtype.hasobject or data.dtype in [np.float16, np.bool_]:
if _array_hasobject(data) or data.dtype in [np.float16, np.bool_]:
dtype = np.float32
data = data.astype(dtype, copy=False)
if not data.flags.aligned:
@@ -265,24 +303,24 @@ pandas_nullable_mapper = {
"Int16": "int",
"Int32": "int",
"Int64": "int",
"UInt8": "i",
"UInt16": "i",
"UInt32": "i",
"UInt64": "i",
"UInt8": "int",
"UInt16": "int",
"UInt32": "int",
"UInt64": "int",
"Float32": "float",
"Float64": "float",
"boolean": "i",
}
pandas_pyarrow_mapper = {
"int8[pyarrow]": "i",
"int16[pyarrow]": "i",
"int32[pyarrow]": "i",
"int64[pyarrow]": "i",
"uint8[pyarrow]": "i",
"uint16[pyarrow]": "i",
"uint32[pyarrow]": "i",
"uint64[pyarrow]": "i",
"int8[pyarrow]": "int",
"int16[pyarrow]": "int",
"int32[pyarrow]": "int",
"int64[pyarrow]": "int",
"uint8[pyarrow]": "int",
"uint16[pyarrow]": "int",
"uint32[pyarrow]": "int",
"uint64[pyarrow]": "int",
"float[pyarrow]": "float",
"float32[pyarrow]": "float",
"double[pyarrow]": "float",
@@ -295,7 +333,7 @@ _pandas_dtype_mapper.update(pandas_pyarrow_mapper)
_ENABLE_CAT_ERR = (
"When categorical type is supplied, The experimental DMatrix parameter"
"When categorical type is supplied, the experimental DMatrix parameter"
"`enable_categorical` must be set to `True`."
)
@@ -407,89 +445,122 @@ def is_pd_sparse_dtype(dtype: PandasDType) -> bool:
return is_sparse(dtype)
def pandas_cat_null(data: DataFrame) -> DataFrame:
"""Handle categorical dtype and nullable extension types from pandas."""
import pandas as pd
# handle category codes and nullable.
cat_columns = []
nul_columns = []
# avoid an unnecessary conversion if possible
for col, dtype in zip(data.columns, data.dtypes):
if is_pd_cat_dtype(dtype):
cat_columns.append(col)
elif is_pa_ext_categorical_dtype(dtype):
raise ValueError(
"pyarrow dictionary type is not supported. Use pandas category instead."
)
elif is_nullable_dtype(dtype):
nul_columns.append(col)
if cat_columns or nul_columns:
# Avoid transformation due to: PerformanceWarning: DataFrame is highly
# fragmented
transformed = data.copy(deep=False)
else:
transformed = data
def cat_codes(ser: pd.Series) -> pd.Series:
if is_pd_cat_dtype(ser.dtype):
return ser.cat.codes
assert is_pa_ext_categorical_dtype(ser.dtype)
# Not yet supported, the index is not ordered for some reason. Alternately:
# `combine_chunks().to_pandas().cat.codes`. The result is the same.
return ser.array.__arrow_array__().combine_chunks().dictionary_encode().indices
if cat_columns:
# DF doesn't have the cat attribute, as a result, we use apply here
transformed[cat_columns] = (
transformed[cat_columns]
.apply(cat_codes)
.astype(np.float32)
.replace(-1.0, np.NaN)
)
if nul_columns:
transformed[nul_columns] = transformed[nul_columns].astype(np.float32)
# TODO(jiamingy): Investigate the possibility of using dataframe protocol or arrow
# IPC format for pandas so that we can apply the data transformation inside XGBoost
# for better memory efficiency.
return transformed
def pandas_ext_num_types(data: DataFrame) -> DataFrame:
"""Experimental support for handling pyarrow extension numeric types."""
def pandas_pa_type(ser: Any) -> np.ndarray:
"""Handle pandas pyarrow extension."""
import pandas as pd
import pyarrow as pa
# No copy, callstack:
# pandas.core.internals.managers.SingleBlockManager.array_values()
# pandas.core.internals.blocks.EABackedBlock.values
d_array: pd.arrays.ArrowExtensionArray = ser.array
# no copy in __arrow_array__
# ArrowExtensionArray._data is a chunked array
aa: pa.ChunkedArray = d_array.__arrow_array__()
# combine_chunks takes the most significant amount of time
chunk: pa.Array = aa.combine_chunks()
# When there's null value, we have to use copy
zero_copy = chunk.null_count == 0
# Alternately, we can use chunk.buffers(), which returns a list of buffers and
# we need to concatenate them ourselves.
# FIXME(jiamingy): Is there a better way to access the arrow buffer along with
# its mask?
# Buffers from chunk.buffers() have the address attribute, but don't expose the
# mask.
arr: np.ndarray = chunk.to_numpy(zero_copy_only=zero_copy, writable=False)
arr, _ = _ensure_np_dtype(arr, arr.dtype)
return arr
def pandas_transform_data(data: DataFrame) -> List[np.ndarray]:
"""Handle categorical dtype and extension types from pandas."""
import pandas as pd
from pandas import Float32Dtype, Float64Dtype
result: List[np.ndarray] = []
def cat_codes(ser: pd.Series) -> np.ndarray:
if is_pd_cat_dtype(ser.dtype):
return _ensure_np_dtype(
ser.cat.codes.astype(np.float32)
.replace(-1.0, np.NaN)
.to_numpy(na_value=np.nan),
np.float32,
)[0]
# Not yet supported, the index is not ordered for some reason. Alternately:
# `combine_chunks().to_pandas().cat.codes`. The result is the same.
assert is_pa_ext_categorical_dtype(ser.dtype)
return (
ser.array.__arrow_array__()
.combine_chunks()
.dictionary_encode()
.indices.astype(np.float32)
.replace(-1.0, np.NaN)
)
def nu_type(ser: pd.Series) -> np.ndarray:
# Avoid conversion when possible
if isinstance(dtype, Float32Dtype):
res_dtype: NumpyDType = np.float32
elif isinstance(dtype, Float64Dtype):
res_dtype = np.float64
else:
res_dtype = np.float32
return _ensure_np_dtype(
ser.to_numpy(dtype=res_dtype, na_value=np.nan), res_dtype
)[0]
def oth_type(ser: pd.Series) -> np.ndarray:
# The dtypes module is added in 1.25.
npdtypes = np.lib.NumpyVersion(np.__version__) > np.lib.NumpyVersion("1.25.0")
npdtypes = npdtypes and isinstance(
ser.dtype,
(
# pylint: disable=no-member
np.dtypes.Float32DType, # type: ignore
# pylint: disable=no-member
np.dtypes.Float64DType, # type: ignore
),
)
if npdtypes or dtype in {np.float32, np.float64}:
array = ser.to_numpy()
else:
# Specifying the dtype can significantly slow down the conversion (about
# 15% slow down for dense inplace-predict)
array = ser.to_numpy(dtype=np.float32, na_value=np.nan)
return _ensure_np_dtype(array, array.dtype)[0]
for col, dtype in zip(data.columns, data.dtypes):
if not is_pa_ext_dtype(dtype):
continue
# No copy, callstack:
# pandas.core.internals.managers.SingleBlockManager.array_values()
# pandas.core.internals.blocks.EABackedBlock.values
d_array: pd.arrays.ArrowExtensionArray = data[col].array
# no copy in __arrow_array__
# ArrowExtensionArray._data is a chunked array
aa: pa.ChunkedArray = d_array.__arrow_array__()
chunk: pa.Array = aa.combine_chunks()
# Alternately, we can use chunk.buffers(), which returns a list of buffers and
# we need to concatenate them ourselves.
arr = chunk.__array__()
data[col] = arr
return data
if is_pa_ext_categorical_dtype(dtype):
raise ValueError(
"pyarrow dictionary type is not supported. Use pandas category instead."
)
if is_pd_cat_dtype(dtype):
result.append(cat_codes(data[col]))
elif is_pa_ext_dtype(dtype):
result.append(pandas_pa_type(data[col]))
elif is_nullable_dtype(dtype):
result.append(nu_type(data[col]))
elif is_pd_sparse_dtype(dtype):
arr = cast(pd.arrays.SparseArray, data[col].values)
arr = arr.to_dense()
if _is_np_array_like(arr):
arr, _ = _ensure_np_dtype(arr, arr.dtype)
result.append(arr)
else:
result.append(oth_type(data[col]))
# FIXME(jiamingy): Investigate the possibility of using dataframe protocol or arrow
# IPC format for pandas so that we can apply the data transformation inside XGBoost
# for better memory efficiency.
return result
def _transform_pandas_df(
data: DataFrame,
enable_categorical: bool,
feature_names: Optional[FeatureNames] = None,
feature_types: Optional[FeatureTypes] = None,
meta: Optional[str] = None,
meta_type: Optional[NumpyDType] = None,
) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]:
pyarrow_extension = False
def pandas_check_dtypes(data: DataFrame, enable_categorical: bool) -> None:
"""Validate the input types, returns True if the dataframe is backed by arrow."""
sparse_extension = False
for dtype in data.dtypes:
if not (
(dtype.name in _pandas_dtype_mapper)
@@ -498,27 +569,65 @@ def _transform_pandas_df(
or is_pa_ext_dtype(dtype)
):
_invalid_dataframe_dtype(data)
if is_pa_ext_dtype(dtype):
pyarrow_extension = True
if is_pd_sparse_dtype(dtype):
sparse_extension = True
if sparse_extension:
warnings.warn("Sparse arrays from pandas are converted into dense.")
class PandasTransformed:
"""A storage class for transformed pandas DataFrame."""
def __init__(self, columns: List[np.ndarray]) -> None:
self.columns = columns
def array_interface(self) -> bytes:
"""Return a byte string for JSON encoded array interface."""
aitfs = list(map(_array_interface_dict, self.columns))
sarrays = bytes(json.dumps(aitfs), "utf-8")
return sarrays
@property
def shape(self) -> Tuple[int, int]:
"""Return shape of the transformed DataFrame."""
return self.columns[0].shape[0], len(self.columns)
def _transform_pandas_df(
data: DataFrame,
enable_categorical: bool,
feature_names: Optional[FeatureNames] = None,
feature_types: Optional[FeatureTypes] = None,
meta: Optional[str] = None,
) -> Tuple[PandasTransformed, Optional[FeatureNames], Optional[FeatureTypes]]:
pandas_check_dtypes(data, enable_categorical)
if meta and len(data.columns) > 1 and meta not in _matrix_meta:
raise ValueError(f"DataFrame for {meta} cannot have multiple columns")
feature_names, feature_types = pandas_feature_info(
data, meta, feature_names, feature_types, enable_categorical
)
transformed = pandas_cat_null(data)
if pyarrow_extension:
if transformed is data:
transformed = data.copy(deep=False)
transformed = pandas_ext_num_types(transformed)
arrays = pandas_transform_data(data)
return PandasTransformed(arrays), feature_names, feature_types
if meta and len(data.columns) > 1 and meta not in _matrix_meta:
raise ValueError(f"DataFrame for {meta} cannot have multiple columns")
dtype = meta_type if meta_type else np.float32
arr: np.ndarray = transformed.values
if meta_type:
arr = arr.astype(dtype)
return arr, feature_names, feature_types
def _meta_from_pandas_df(
data: DataType,
name: str,
dtype: Optional[NumpyDType],
handle: ctypes.c_void_p,
) -> None:
data, _, _ = _transform_pandas_df(data, False, meta=name)
if len(data.columns) == 1:
array = data.columns[0]
else:
array = np.stack(data.columns).T
array, dtype = _ensure_np_dtype(array, dtype)
_meta_from_numpy(array, name, dtype, handle)
def _from_pandas_df(
@@ -530,12 +639,21 @@ def _from_pandas_df(
feature_types: Optional[FeatureTypes],
data_split_mode: DataSplitMode = DataSplitMode.ROW,
) -> DispatchedDataBackendReturnType:
data, feature_names, feature_types = _transform_pandas_df(
df, feature_names, feature_types = _transform_pandas_df(
data, enable_categorical, feature_names, feature_types
)
return _from_numpy_array(
data, missing, nthread, feature_names, feature_types, data_split_mode
handle = ctypes.c_void_p()
_check_call(
_LIB.XGDMatrixCreateFromColumnar(
df.array_interface(),
make_jcargs(
nthread=nthread, missing=missing, data_split_mode=data_split_mode
),
ctypes.byref(handle),
)
)
return handle, feature_names, feature_types
def _is_pandas_series(data: DataType) -> bool:
@@ -550,7 +668,12 @@ def _meta_from_pandas_series(
data: DataType, name: str, dtype: Optional[NumpyDType], handle: ctypes.c_void_p
) -> None:
"""Help transform pandas series for meta data like labels"""
data = data.values.astype("float")
if is_pd_sparse_dtype(data.dtype):
data = data.values.to_dense().astype(np.float32)
elif is_pa_ext_dtype(data.dtype):
data = pandas_pa_type(data)
else:
data = data.to_numpy(np.float32, na_value=np.nan)
if is_pd_sparse_dtype(getattr(data, "dtype", data)):
data = data.to_dense() # type: ignore
@@ -732,6 +855,8 @@ def _arrow_transform(data: DataType) -> Any:
return pd.ArrowDtype(pa.bool_())
return None
# For common cases, this is zero-copy, can check with:
# pa.total_allocated_bytes()
df = data.to_pandas(types_mapper=type_mapper)
return df
@@ -859,11 +984,10 @@ def _from_cudf_df(
)
interfaces_str = _cudf_array_interfaces(data, cat_codes)
handle = ctypes.c_void_p()
config = bytes(json.dumps({"missing": missing, "nthread": nthread}), "utf-8")
_check_call(
_LIB.XGDMatrixCreateFromCudaColumnar(
interfaces_str,
config,
make_jcargs(nthread=nthread, missing=missing),
ctypes.byref(handle),
)
)
@@ -874,11 +998,8 @@ def _is_cudf_ser(data: DataType) -> bool:
return lazy_isinstance(data, "cudf.core.series", "Series")
def _is_cupy_array(data: DataType) -> bool:
return any(
lazy_isinstance(data, n, "ndarray")
for n in ("cupy.core.core", "cupy", "cupy._core.core")
)
def _is_cupy_alike(data: DataType) -> bool:
return hasattr(data, "__cuda_array_interface__")
def _transform_cupy_array(data: DataType) -> CupyT:
@@ -886,7 +1007,7 @@ def _transform_cupy_array(data: DataType) -> CupyT:
if not hasattr(data, "__cuda_array_interface__") and hasattr(data, "__array__"):
data = cupy.array(data, copy=False)
if data.dtype.hasobject or data.dtype in [cupy.bool_]:
if _array_hasobject(data) or data.dtype in [cupy.bool_]:
data = data.astype(cupy.float32, copy=False)
return data
@@ -1047,15 +1168,15 @@ def dispatch_data_backend(
"""Dispatch data for DMatrix."""
if not _is_cudf_ser(data) and not _is_pandas_series(data):
_check_data_shape(data)
if _is_scipy_csr(data):
if is_scipy_csr(data):
return _from_scipy_csr(
data, missing, threads, feature_names, feature_types, data_split_mode
)
if _is_scipy_csc(data):
if is_scipy_csc(data):
return _from_scipy_csc(
data, missing, threads, feature_names, feature_types, data_split_mode
)
if _is_scipy_coo(data):
if is_scipy_coo(data):
return _from_scipy_csr(
data.tocsr(),
missing,
@@ -1098,7 +1219,7 @@ def dispatch_data_backend(
return _from_cudf_df(
data, missing, threads, feature_names, feature_types, enable_categorical
)
if _is_cupy_array(data):
if _is_cupy_alike(data):
return _from_cupy_array(data, missing, threads, feature_names, feature_types)
if _is_cupy_csr(data):
raise TypeError("cupyx CSR is not supported yet.")
@@ -1221,8 +1342,7 @@ def dispatch_meta_backend(
if _is_arrow(data):
data = _arrow_transform(data)
if _is_pandas_df(data):
data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype)
_meta_from_numpy(data, name, dtype, handle)
_meta_from_pandas_df(data, name, dtype=dtype, handle=handle)
return
if _is_pandas_series(data):
_meta_from_pandas_series(data, name, dtype, handle)
@@ -1231,7 +1351,7 @@ def dispatch_meta_backend(
data = _transform_dlpack(data)
_meta_from_cupy_array(data, name, handle)
return
if _is_cupy_array(data):
if _is_cupy_alike(data):
_meta_from_cupy_array(data, name, handle)
return
if _is_cudf_ser(data):
@@ -1244,8 +1364,7 @@ def dispatch_meta_backend(
_meta_from_dt(data, name, dtype, handle)
return
if _is_modin_df(data):
data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype)
_meta_from_numpy(data, name, dtype, handle)
_meta_from_pandas_df(data, name, dtype=dtype, handle=handle)
return
if _is_modin_series(data):
data = data.values.astype("float")
@@ -1297,7 +1416,7 @@ def _proxy_transform(
return _transform_cudf_df(
data, feature_names, feature_types, enable_categorical
)
if _is_cupy_array(data):
if _is_cupy_alike(data):
data = _transform_cupy_array(data)
return data, None, feature_names, feature_types
if _is_dlpack(data):
@@ -1307,9 +1426,15 @@ def _proxy_transform(
if _is_np_array_like(data):
data, _ = _ensure_np_dtype(data, data.dtype)
return data, None, feature_names, feature_types
if _is_scipy_csr(data):
if is_scipy_csr(data):
data = transform_scipy_sparse(data, True)
return data, None, feature_names, feature_types
if is_scipy_csc(data):
data = transform_scipy_sparse(data.tocsr(), True)
return data, None, feature_names, feature_types
if is_scipy_coo(data):
data = transform_scipy_sparse(data.tocsr(), True)
return data, None, feature_names, feature_types
if _is_pandas_series(data):
import pandas as pd
@@ -1317,11 +1442,10 @@ def _proxy_transform(
if _is_arrow(data):
data = _arrow_transform(data)
if _is_pandas_df(data):
arr, feature_names, feature_types = _transform_pandas_df(
df, feature_names, feature_types = _transform_pandas_df(
data, enable_categorical, feature_names, feature_types
)
arr, _ = _ensure_np_dtype(arr, arr.dtype)
return arr, None, feature_names, feature_types
return df, None, feature_names, feature_types
raise TypeError("Value type is not supported for data iterator:" + str(type(data)))
@@ -1343,7 +1467,7 @@ def dispatch_proxy_set_data(
# pylint: disable=W0212
proxy._set_data_from_cuda_columnar(data, cast(List, cat_codes))
return
if _is_cupy_array(data):
if _is_cupy_alike(data):
proxy._set_data_from_cuda_interface(data) # pylint: disable=W0212
return
if _is_dlpack(data):
@@ -1356,11 +1480,14 @@ def dispatch_proxy_set_data(
if not allow_host:
raise err
if isinstance(data, PandasTransformed):
proxy._set_data_from_pandas(data) # pylint: disable=W0212
return
if _is_np_array_like(data):
_check_data_shape(data)
proxy._set_data_from_array(data) # pylint: disable=W0212
return
if _is_scipy_csr(data):
if is_scipy_csr(data):
proxy._set_data_from_csr(data) # pylint: disable=W0212
return
raise err

View File

@@ -39,7 +39,7 @@ from .core import (
_deprecate_positional_args,
_parse_eval_str,
)
from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array, _is_pandas_df
from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_alike, _is_pandas_df
from .training import train
@@ -192,11 +192,16 @@ __model_doc = f"""
Boosting learning rate (xgb's "eta")
verbosity : Optional[int]
The degree of verbosity. Valid values are 0 (silent) - 3 (debug).
objective : {SklObjective}
Specify the learning task and the corresponding learning objective or
a custom objective function to be used (see note below).
Specify the learning task and the corresponding learning objective or a custom
objective function to be used. For custom objective, see
:doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more
information.
booster: Optional[str]
Specify which booster to use: gbtree, gblinear or dart.
Specify which booster to use: `gbtree`, `gblinear` or `dart`.
tree_method: Optional[str]
Specify which tree method to use. Default to auto. If this parameter is set to
default, XGBoost will choose the most conservative option available. It's
@@ -276,13 +281,7 @@ __model_doc = f"""
enable_categorical : bool
.. versionadded:: 1.5.0
.. note:: This parameter is experimental
Experimental support for categorical data. When enabled, cudf/pandas.DataFrame
should be used to specify categorical data type. Also, JSON/UBJSON
serialization format is required.
See the same parameter of :py:class:`DMatrix` for details.
feature_types : Optional[FeatureTypes]
@@ -334,21 +333,21 @@ __model_doc = f"""
Metric used for monitoring the training result and early stopping. It can be a
string or list of strings as names of predefined metric in XGBoost (See
doc/parameter.rst), one of the metrics in :py:mod:`sklearn.metrics`, or any other
user defined metric that looks like `sklearn.metrics`.
doc/parameter.rst), one of the metrics in :py:mod:`sklearn.metrics`, or any
other user defined metric that looks like `sklearn.metrics`.
If custom objective is also provided, then custom metric should implement the
corresponding reverse link function.
Unlike the `scoring` parameter commonly used in scikit-learn, when a callable
object is provided, it's assumed to be a cost function and by default XGBoost will
minimize the result during early stopping.
object is provided, it's assumed to be a cost function and by default XGBoost
will minimize the result during early stopping.
For advanced usage on Early stopping like directly choosing to maximize instead of
minimize, see :py:obj:`xgboost.callback.EarlyStopping`.
For advanced usage on Early stopping like directly choosing to maximize instead
of minimize, see :py:obj:`xgboost.callback.EarlyStopping`.
See :doc:`Custom Objective and Evaluation Metric </tutorials/custom_metric_obj>`
for more.
See :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more
information.
.. note::
@@ -1012,7 +1011,7 @@ class XGBModel(XGBModelBase):
sample_weight :
instance weights
base_margin :
global bias for each instance.
Global bias for each instance. See :doc:`/tutorials/intercept` for details.
eval_set :
A list of (X, y) tuple pairs to use as validation sets, for which
metrics will be computed.
@@ -1152,7 +1151,7 @@ class XGBModel(XGBModelBase):
When this is True, validate that the Booster's and data's feature_names are
identical. Otherwise, it is assumed that the feature_names are the same.
base_margin :
Margin added to prediction.
Global bias for each instance. See :doc:`/tutorials/intercept` for details.
iteration_range :
Specifies which layer of trees are used in prediction. For example, if a
random forest is trained with 100 rounds. Specifying ``iteration_range=(10,
@@ -1178,7 +1177,7 @@ class XGBModel(XGBModelBase):
base_margin=base_margin,
validate_features=validate_features,
)
if _is_cupy_array(predts):
if _is_cupy_alike(predts):
import cupy # pylint: disable=import-error
predts = cupy.asnumpy(predts) # ensure numpy array is used.
@@ -1459,7 +1458,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
classes = cp.unique(y.values)
self.n_classes_ = len(classes)
expected_classes = cp.array(self.classes_)
elif _is_cupy_array(y):
elif _is_cupy_alike(y):
import cupy as cp # pylint: disable=E0401
classes = cp.unique(y)
@@ -1605,7 +1604,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
When this is True, validate that the Booster's and data's feature_names are
identical. Otherwise, it is assumed that the feature_names are the same.
base_margin :
Margin added to prediction.
Global bias for each instance. See :doc:`/tutorials/intercept` for details.
iteration_range :
Specifies which layer of trees are used in prediction. For example, if a
random forest is trained with 100 rounds. Specifying `iteration_range=(10,
@@ -1948,7 +1947,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
weights to individual data points.
base_margin :
Global bias for each instance.
Global bias for each instance. See :doc:`/tutorials/intercept` for details.
eval_set :
A list of (X, y) tuple pairs to use as validation sets, for which
metrics will be computed.

View File

@@ -630,7 +630,7 @@ sparse_datasets_strategy = strategies.sampled_from(
def make_datasets_with_margin(
unweighted_strategy: strategies.SearchStrategy,
) -> Callable:
) -> Callable[[], strategies.SearchStrategy[TestDataset]]:
"""Factory function for creating strategies that generates datasets with weight and
base margin.
@@ -668,8 +668,7 @@ def make_datasets_with_margin(
# A strategy for drawing from a set of example datasets. May add random weights to the
# dataset
@memory.cache
def make_dataset_strategy() -> Callable:
def make_dataset_strategy() -> strategies.SearchStrategy[TestDataset]:
_unweighted_datasets_strategy = strategies.sampled_from(
[
TestDataset(
@@ -815,6 +814,13 @@ def softprob_obj(
return objective
def ls_obj(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Least squared error.

    Returns the gradient and the Hessian of the squared-error objective with
    respect to the prediction: ``d/dp 0.5*(p - t)^2 = p - t`` and a constant
    second derivative of 1 for every sample.
    """
    residual = np.subtract(y_pred, y_true)
    constant_hess = np.full(y_true.shape[0], 1.0)
    return residual, constant_hess
class DirectoryExcursion:
"""Change directory. Change back and optionally cleaning up the directory when
exit.

View File

@@ -3,7 +3,18 @@
import os
import zipfile
from dataclasses import dataclass
from typing import Any, Generator, List, NamedTuple, Optional, Tuple, Union
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Generator,
NamedTuple,
Optional,
Tuple,
Type,
Union,
)
from urllib import request
import numpy as np
@@ -15,6 +26,11 @@ from scipy import sparse
import xgboost
from xgboost.data import pandas_pyarrow_mapper
if TYPE_CHECKING:
from ..compat import DataFrame as DataFrameT
else:
DataFrameT = Any
joblib = pytest.importorskip("joblib")
memory = joblib.Memory("./cachedir", verbose=0)
@@ -246,46 +262,186 @@ def get_sparse() -> Tuple[np.ndarray, np.ndarray]:
return X, y
# pylint: disable=too-many-statements
@memory.cache
def get_ames_housing() -> Tuple[np.ndarray, np.ndarray]:
"""
def get_ames_housing() -> Tuple[DataFrameT, np.ndarray]:
"""Get a synthetic version of the Ames housing dataset.
The real one can be obtained via:
.. code-block::
from sklearn import datasets
datasets.fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
Number of samples: 1460
Number of features: 20
Number of categorical features: 10
Number of numerical features: 10
"""
datasets = pytest.importorskip("sklearn.datasets")
X, y = datasets.fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
pytest.importorskip("pandas")
import pandas as pd
categorical_columns_subset: List[str] = [
"BldgType", # 5 cats, no nan
"GarageFinish", # 3 cats, nan
"LotConfig", # 5 cats, no nan
"Functional", # 7 cats, no nan
"MasVnrType", # 4 cats, nan
"HouseStyle", # 8 cats, no nan
"FireplaceQu", # 5 cats, nan
"ExterCond", # 5 cats, no nan
"ExterQual", # 4 cats, no nan
"PoolQC", # 3 cats, nan
]
rng = np.random.default_rng(1994)
n_samples = 1460
df = pd.DataFrame()
numerical_columns_subset: List[str] = [
"3SsnPorch",
"Fireplaces",
"BsmtHalfBath",
"HalfBath",
"GarageCars",
"TotRmsAbvGrd",
"BsmtFinSF1",
"BsmtFinSF2",
"GrLivArea",
"ScreenPorch",
]
def synth_cat(
name_proba: Dict[Union[str, float], float], density: float
) -> pd.Series:
n_nulls = int(n_samples * (1 - density))
has_nan = np.abs(1.0 - density) > 1e-6 and n_nulls > 0
if has_nan:
sparsity = 1.0 - density
name_proba[np.nan] = sparsity
X = X[categorical_columns_subset + numerical_columns_subset]
X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
return X, y
keys = list(name_proba.keys())
p = list(name_proba.values())
p[-1] += 1.0 - np.sum(p) # Fix floating point error
x = rng.choice(keys, size=n_samples, p=p)
series = pd.Series(
x,
dtype=pd.CategoricalDtype(
# not NA
filter(lambda x: isinstance(x, str), keys)
),
)
return series
df["BldgType"] = synth_cat(
{
"1Fam": 0.835616,
"2fmCon": 0.078082,
"Duplex": 0.035616,
"Twnhs": 0.029452,
"TwnhsE": 0.021233,
},
1.0,
)
df["GarageFinish"] = synth_cat(
{"Unf": 0.414384, "RFn": 0.289041, "Fin": 0.241096}, 0.94452
)
df["LotConfig"] = synth_cat(
{
"Corner": 0.180137,
"CulDSac": 0.064384,
"FR2": 0.032192,
"FR3": 0.002740,
},
1.0,
)
df["Functional"] = synth_cat(
{
"Typ": 0.931506,
"Min2": 0.023287,
"Min1": 0.021232,
"Mod": 0.010273,
"Maj1": 0.009589,
"Maj2": 0.003424,
"Sev": 0.000684,
},
1.0,
)
df["MasVnrType"] = synth_cat(
{
"None": 0.591780,
"BrkFace": 0.304794,
"Stone": 0.087671,
"BrkCmn": 0.010273,
},
0.99452,
)
df["HouseStyle"] = synth_cat(
{
"1Story": 0.497260,
"2Story": 0.304794,
"1.5Fin": 0.105479,
"SLvl": 0.044520,
"SFoyer": 0.025342,
"1.5Unf": 0.009589,
"2.5Unf": 0.007534,
"2.5Fin": 0.005479,
},
1.0,
)
df["FireplaceQu"] = synth_cat(
{
"Gd": 0.260273,
"TA": 0.214383,
"Fa": 0.022602,
"Ex": 0.016438,
"Po": 0.013698,
},
0.527397,
)
df["ExterCond"] = synth_cat(
{
"TA": 0.878082,
"Gd": 0.1,
"Fa": 0.019178,
"Ex": 0.002054,
"Po": 0.000684,
},
1.0,
)
df["ExterQual"] = synth_cat(
{
"TA": 0.620547,
"Gd": 0.334246,
"Ex": 0.035616,
"Fa": 0.009589,
},
1.0,
)
df["PoolQC"] = synth_cat(
{
"Gd": 0.002054,
"Ex": 0.001369,
"Fa": 0.001369,
},
0.004794,
)
# We focus on the categorical values here; for numerical features, a simple normal
# distribution is used, which doesn't match the original data.
def synth_num(loc: float, std: float, density: float) -> pd.Series:
x = rng.normal(loc=loc, scale=std, size=n_samples)
n_nulls = int(n_samples * (1 - density))
if np.abs(1.0 - density) > 1e-6 and n_nulls > 0:
null_idx = rng.choice(n_samples, size=n_nulls, replace=False)
x[null_idx] = np.nan
return pd.Series(x, dtype=np.float64)
df["3SsnPorch"] = synth_num(3.4095890410958902, 29.31733055678188, 1.0)
df["Fireplaces"] = synth_num(0.613013698630137, 0.6446663863122295, 1.0)
df["BsmtHalfBath"] = synth_num(0.057534246575342465, 0.23875264627921178, 1.0)
df["HalfBath"] = synth_num(0.38287671232876713, 0.5028853810928914, 1.0)
df["GarageCars"] = synth_num(1.7671232876712328, 0.7473150101111095, 1.0)
df["TotRmsAbvGrd"] = synth_num(6.517808219178082, 1.6253932905840505, 1.0)
df["BsmtFinSF1"] = synth_num(443.6397260273973, 456.0980908409277, 1.0)
df["BsmtFinSF2"] = synth_num(46.54931506849315, 161.31927280654173, 1.0)
df["GrLivArea"] = synth_num(1515.463698630137, 525.4803834232025, 1.0)
df["ScreenPorch"] = synth_num(15.060958904109588, 55.757415281874174, 1.0)
columns = list(df.columns)
rng.shuffle(columns)
df = df[columns]
# linear interaction for testing purposes.
y = np.zeros(shape=(n_samples,))
for c in df.columns:
if isinstance(df[c].dtype, pd.CategoricalDtype):
y += df[c].cat.codes.astype(np.float64)
else:
y += df[c].values
# Shift and scale to match the original y.
y *= 79442.50288288662 / y.std()
y += 180921.19589041095 - y.mean()
return df, y
@memory.cache
@@ -603,3 +759,51 @@ def sort_ltr_samples(
data = X, clicks, y, qid
return data
def run_base_margin_info(
    DType: Callable, DMatrixT: Type[xgboost.DMatrix], device: str
) -> None:
    """Run tests for the ``base_margin`` meta info of a DMatrix.

    Parameters
    ----------
    DType :
        Constructor for the input container (called on a numpy array; may
        return a DataFrame-like with ``iloc`` or an array-like).
    DMatrixT :
        The :py:class:`xgboost.DMatrix` subclass under test.
    device :
        Device string forwarded to the ``device`` parameter of
        :py:func:`xgboost.train`.
    """
    rng = np.random.default_rng()
    # 50x2 float32 input; reused below as a deliberately invalid 2-dim margin.
    X = DType(rng.normal(0, 1.0, size=100).astype(np.float32).reshape(50, 2))
    if hasattr(X, "iloc"):
        # DataFrame-like container.
        y = X.iloc[:, 0]
    else:
        y = X[:, 0]
    base_margin = X
    # no error at set
    Xy = DMatrixT(X, y, base_margin=base_margin)
    # Error at train, caused by check in predictor.
    with pytest.raises(ValueError, match=r".*base_margin.*"):
        xgboost.train({"tree_method": "hist", "device": device}, Xy)
    if not hasattr(X, "iloc"):
        # column major matrix: round-trip the margin, then its transpose.
        got = DType(Xy.get_base_margin().reshape(50, 2))
        assert (got == base_margin).all()
        # Sanity-check that the transpose really is column-major (F-contiguous).
        assert base_margin.T.flags.c_contiguous is False
        assert base_margin.T.flags.f_contiguous is True
        Xy.set_info(base_margin=base_margin.T)
        got = DType(Xy.get_base_margin().reshape(2, 50))
        assert (got == base_margin.T).all()
    # Row vs col vec.  A (n,) vector and its (1, n) reshape should round-trip
    # to the same stored margin.
    base_margin = y
    Xy.set_base_margin(base_margin)
    bm_col = Xy.get_base_margin()
    Xy.set_base_margin(base_margin.reshape(1, base_margin.size))
    bm_row = Xy.get_base_margin()
    assert (bm_row == bm_col).all()
    # type: float64 input should produce the same stored values as float32.
    base_margin = base_margin.astype(np.float64)
    Xy.set_base_margin(base_margin)
    bm_f64 = Xy.get_base_margin()
    assert (bm_f64 == bm_col).all()
    # too many dimensions: a 4-dim margin must be rejected.
    base_margin = X.reshape(2, 5, 2, 5)
    with pytest.raises(ValueError, match=r".*base_margin.*"):
        Xy.set_base_margin(base_margin)

View File

@@ -394,3 +394,14 @@ def train_result(
assert booster.feature_types == dmat.feature_types
return result
class ResetStrategy(xgb.callback.TrainingCallback):
    """Callback for testing multi-output.

    Alternates the booster's ``multi_strategy`` parameter between iterations
    to exercise switching strategies mid-training.
    """

    def after_iteration(self, model: xgb.Booster, epoch: int, evals_log: dict) -> bool:
        strategy = "multi_output_tree" if epoch % 2 == 0 else "one_output_per_tree"
        model.set_param({"multi_strategy": strategy})
        # Never request early stopping.
        return False