Support multi-class with base margin. (#7381)

This was already partially supported but never properly tested, so the only way to use it was to call `numpy.ndarray.flatten` on `base_margin` before passing it into XGBoost. This PR adds proper support
for most of the data types, along with tests.
This commit is contained in:
Jiaming Yuan
2021-11-02 13:38:00 +08:00
committed by GitHub
parent 6295dc3b67
commit a13321148a
18 changed files with 274 additions and 92 deletions

View File

@@ -577,7 +577,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
# force into void_p, mac need to pass things in as void_p
if data is None:
self.handle = None
self.handle: Optional[ctypes.c_void_p] = None
return
from .data import dispatch_data_backend, _is_iter

View File

@@ -1432,9 +1432,7 @@ def inplace_predict( # pylint: disable=unused-argument
Value in the input data which needs to be present as a missing
value. If None, defaults to np.nan.
base_margin:
See :py:obj:`xgboost.DMatrix` for details. Right now classifier is not well
supported with base_margin as it requires the size of base margin to be `n_classes
* n_samples`.
See :py:obj:`xgboost.DMatrix` for details.
.. versionadded:: 1.4.0

View File

@@ -18,6 +18,11 @@ c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name
CAT_T = "c"
# meta info that can be a matrix instead of vector.
# For now it's base_margin for multi-class, but it can be extended to label once we have
# multi-output.
_matrix_meta = {"base_margin"}
def _warn_unused_missing(data, missing):
if (missing is not None) and (not np.isnan(missing)):
@@ -217,7 +222,7 @@ _pandas_dtype_mapper = {
}
def _invalid_dataframe_dtype(data) -> None:
def _invalid_dataframe_dtype(data: Any) -> None:
# pandas series has `dtypes` but it's just a single object
# cudf series doesn't have `dtypes`.
if hasattr(data, "dtypes") and hasattr(data.dtypes, "__iter__"):
@@ -291,7 +296,7 @@ def _transform_pandas_df(
else:
transformed = data
if meta and len(data.columns) > 1:
if meta and len(data.columns) > 1 and meta not in _matrix_meta:
raise ValueError(f"DataFrame for {meta} cannot have multiple columns")
dtype = meta_type if meta_type else np.float32
@@ -323,6 +328,18 @@ def _is_pandas_series(data):
return isinstance(data, pd.Series)
def _meta_from_pandas_series(
    data: Any, name: str, dtype: Optional[str], handle: ctypes.c_void_p
) -> None:
    """Set meta info (e.g. labels, weights, base_margin) on a DMatrix from a
    pandas Series.

    Parameters
    ----------
    data :
        A pandas Series holding the meta values.
    name :
        Name of the meta field being set (e.g. ``"label"``).
    dtype :
        Optional dtype string forwarded to :py:func:`_meta_from_numpy`.
    handle :
        Raw C handle of the target DMatrix.
    """
    # Extract the underlying values and coerce them to float.
    data = data.values.astype('float')
    from pandas.api.types import is_sparse
    # NOTE(review): is_sparse is checked *after* .values.astype above, so by
    # this point `data` may already be a dense ndarray — confirm whether the
    # sparse branch is still reachable for sparse-dtype Series.
    if is_sparse(data):
        data = data.to_dense()
    # Meta info must be a 1-dim vector (or a degenerate 2-dim column).
    assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1
    # Hand off to the numpy path, which pushes the buffer into the DMatrix.
    _meta_from_numpy(data, name, dtype, handle)
def _is_modin_series(data):
try:
import modin.pandas as pd
@@ -374,9 +391,9 @@ def _transform_dt_df(
):
"""Validate feature names and types if data table"""
if meta and data.shape[1] > 1:
raise ValueError(
'DataTable for label or weight cannot have multiple columns')
raise ValueError('DataTable for meta info cannot have multiple columns')
if meta:
meta_type = "float" if meta_type is None else meta_type
# below requires new dt version
# extract first column
data = data.to_numpy()[:, 0].astype(meta_type)
@@ -820,19 +837,27 @@ def _to_data_type(dtype: str, name: str):
return dtype_map[dtype]
def _validate_meta_shape(data, name: str) -> None:
def _validate_meta_shape(data: Any, name: str) -> None:
if hasattr(data, "shape"):
msg = f"Invalid shape: {data.shape} for {name}"
if name in _matrix_meta:
if len(data.shape) > 2:
raise ValueError(msg)
return
if len(data.shape) > 2 or (
len(data.shape) == 2 and (data.shape[1] != 0 and data.shape[1] != 1)
):
raise ValueError(f"Invalid shape: {data.shape} for {name}")
def _meta_from_numpy(data, field, dtype, handle):
def _meta_from_numpy(
data: np.ndarray, field: str, dtype, handle: ctypes.c_void_p
) -> None:
data = _maybe_np_slice(data, dtype)
interface = data.__array_interface__
assert interface.get('mask', None) is None, 'Masked array is not supported'
size = data.shape[0]
size = data.size
c_type = _to_data_type(str(data.dtype), field)
ptr = interface['data'][0]
@@ -855,17 +880,13 @@ def _meta_from_tuple(data, field, dtype, handle):
return _meta_from_list(data, field, dtype, handle)
def _meta_from_cudf_df(data, field, handle):
if len(data.columns) != 1:
raise ValueError(
'Expecting meta-info to contain a single column')
data = data[data.columns[0]]
interface = bytes(json.dumps([data.__cuda_array_interface__],
indent=2), 'utf-8')
_check_call(_LIB.XGDMatrixSetInfoFromInterface(handle,
c_str(field),
interface))
def _meta_from_cudf_df(data, field: str, handle: ctypes.c_void_p) -> None:
if field not in _matrix_meta:
_meta_from_cudf_series(data.iloc[:, 0], field, handle)
else:
data = data.values
interface = _cuda_array_interface(data)
_check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface))
def _meta_from_cudf_series(data, field, handle):
@@ -885,14 +906,15 @@ def _meta_from_cupy_array(data, field, handle):
interface))
def _meta_from_dt(data, field, dtype, handle):
data, _, _ = _transform_dt_df(data, None, None)
def _meta_from_dt(data, field: str, dtype, handle: ctypes.c_void_p):
data, _, _ = _transform_dt_df(data, None, None, field, dtype)
_meta_from_numpy(data, field, dtype, handle)
def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
'''Dispatch for meta info.'''
handle = matrix.handle
assert handle is not None
_validate_meta_shape(data, name)
if data is None:
return
@@ -911,9 +933,7 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
_meta_from_numpy(data, name, dtype, handle)
return
if _is_pandas_series(data):
data = data.values.astype('float')
assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1
_meta_from_numpy(data, name, dtype, handle)
_meta_from_pandas_series(data, name, dtype, handle)
return
if _is_dlpack(data):
data = _transform_dlpack(data)