Expose feature_types to sklearn interface. (#7821)

Jiaming Yuan 2022-04-21 20:23:35 +08:00 committed by GitHub
parent 401d451569
commit c70fa502a5
7 changed files with 131 additions and 48 deletions

View File

@@ -1,7 +1,7 @@
 """Shared typing definition."""
 import ctypes
 import os
-from typing import Optional, List, Any, TypeVar, Union
+from typing import Optional, Any, TypeVar, Union, Sequence

 # os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/dt.Frame/
 # cudf.DataFrame/cupy.array/dlpack
@@ -9,7 +9,8 @@ DataType = Any
 # xgboost accepts some other possible types in practice due to historical reason, which is
 # lesser tested. For now we encourage users to pass a simple list of string.
-FeatureNames = Optional[List[str]]
+FeatureNames = Optional[Sequence[str]]
+FeatureTypes = Optional[Sequence[str]]
 ArrayLike = Any
 PathLike = Union[str, os.PathLike]

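The widened aliases mean any sequence of strings is now accepted where a list used to be required. A minimal sketch (not part of this commit; synthetic data) of what this permits at the call site:

import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.random((16, 2), dtype=np.float32)
y = rng.random(16)
# A tuple now satisfies FeatureTypes just as well as a list;
# "q" marks a numerical feature, "c" a categorical one.
m = xgb.DMatrix(X, label=y, feature_types=("q", "q"))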
View File

@@ -31,6 +31,7 @@ from ._typing import (
     CFloatPtr,
     NumpyOrCupy,
     FeatureNames,
+    FeatureTypes,
     _T,
     CupyT,
 )
@@ -553,7 +554,7 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
         missing: Optional[float] = None,
         silent: bool = False,
         feature_names: FeatureNames = None,
-        feature_types: Optional[List[str]] = None,
+        feature_types: FeatureTypes = None,
         nthread: Optional[int] = None,
         group: Optional[ArrayLike] = None,
         qid: Optional[ArrayLike] = None,
@@ -594,10 +595,15 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
             Whether print messages during construction
         feature_names : list, optional
             Set names for features.
-        feature_types :
+        feature_types : FeatureTypes
             Set types for features. When `enable_categorical` is set to `True`, string
-            "c" represents categorical data type.
+            "c" represents categorical data type while "q" represents numerical feature
+            type. For categorical features, the input is assumed to be preprocessed and
+            encoded by the users. The encoding can be done via
+            :py:class:`sklearn.preprocessing.OrdinalEncoder` or pandas dataframe
+            `.cat.codes` method. This is useful when users want to specify categorical
+            features without having to construct a dataframe as input.
         nthread : integer, optional
             Number of threads to use for loading data when parallelization is
@@ -1062,12 +1068,7 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
     @property
     def feature_types(self) -> Optional[List[str]]:
-        """Get feature types (column types).
-
-        Returns
-        -------
-        feature_types : list or None
-        """
+        """Get feature types. See :py:class:`DMatrix` for details."""
         length = c_bst_ulong()
         sarr = ctypes.POINTER(ctypes.c_char_p)()
         _check_call(_LIB.XGDMatrixGetStrFeatureInfo(self.handle,
@@ -1083,8 +1084,8 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
     def feature_types(self, feature_types: Optional[Union[List[str], str]]) -> None:
         """Set feature types (column types).
-        This is for displaying the results and categorical data support. See doc string
-        of :py:obj:`xgboost.DMatrix` for details.
+        This is for displaying the results and categorical data support. See
+        :py:class:`DMatrix` for details.

         Parameters
         ----------
@@ -1647,7 +1648,7 @@ class Booster:
         feature_info = from_cstr_to_pystr(sarr, length)
         return feature_info if feature_info else None

-    def _set_feature_info(self, features: Optional[List[str]], field: str) -> None:
+    def _set_feature_info(self, features: Optional[Sequence[str]], field: str) -> None:
         if features is not None:
             assert isinstance(features, list)
             feature_info_bytes = [bytes(f, encoding="utf-8") for f in features]
@@ -1667,7 +1668,7 @@ class Booster:
     @property
     def feature_types(self) -> Optional[List[str]]:
         """Feature types for this booster. Can be directly set by input data or by
-        assignment.
+        assignment. See :py:class:`DMatrix` for details.
         """
         return self._get_feature_info("feature_type")

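The workflow described by the new `feature_types` docstring might look like the following sketch, assuming scikit-learn is available; the data and column layout are made up for illustration:

import numpy as np
from sklearn.preprocessing import OrdinalEncoder
import xgboost as xgb

rng = np.random.default_rng(0)
colors = rng.choice(["red", "green", "blue"], size=(32, 1))
sizes = rng.random((32, 1))
# Pre-encode the categorical column to integer codes, per the docstring.
codes = OrdinalEncoder().fit_transform(colors)
X = np.hstack([codes, sizes])
y = rng.random(32)
# "c" = categorical, "q" = numerical; no dataframe is needed.
m = xgb.DMatrix(X, label=y, feature_types=["c", "q"], enable_categorical=True)
booster = xgb.train({"tree_method": "hist"}, m, num_boost_round=4)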
View File

@@ -54,10 +54,11 @@ from .compat import scipy_sparse
 from .compat import PANDAS_INSTALLED, DataFrame, Series, pandas_concat
 from .compat import lazy_isinstance

+from ._typing import FeatureNames, FeatureTypes
 from .core import DMatrix, DeviceQuantileDMatrix, Booster, _expect, DataIter
 from .core import Objective, Metric
 from .core import _deprecate_positional_args, _has_categorical
-from .data import FeatureNames
 from .training import train as worker_train
 from .tracker import RabitTracker, get_host_ip
 from .sklearn import XGBModel, XGBClassifier, XGBRegressorBase, XGBClassifierBase
@@ -327,7 +328,7 @@ class DaskDMatrix:
         missing: float = None,
         silent: bool = False,  # pylint: disable=unused-argument
         feature_names: FeatureNames = None,
-        feature_types: Optional[List[str]] = None,
+        feature_types: FeatureTypes = None,
         group: Optional[_DaskCollection] = None,
         qid: Optional[_DaskCollection] = None,
         label_lower_bound: Optional[_DaskCollection] = None,
@@ -1601,7 +1602,11 @@ class DaskScikitLearnBase(XGBModel):
             predts = predts.to_dask_array()
         else:
             test_dmatrix = await DaskDMatrix(
-                self.client, data=data, base_margin=base_margin, missing=self.missing
+                self.client,
+                data=data,
+                base_margin=base_margin,
+                missing=self.missing,
+                feature_types=self.feature_types
             )
             predts = await predict(
                 self.client,
@@ -1640,7 +1645,9 @@ class DaskScikitLearnBase(XGBModel):
         iteration_range: Optional[Tuple[int, int]] = None,
     ) -> Any:
         iteration_range = self._get_iteration_range(iteration_range)
-        test_dmatrix = await DaskDMatrix(self.client, data=X, missing=self.missing)
+        test_dmatrix = await DaskDMatrix(
+            self.client, data=X, missing=self.missing, feature_types=self.feature_types,
+        )
         predts = await predict(
             self.client,
             model=self.get_booster(),
@@ -1755,6 +1762,7 @@ class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
             eval_qid=None,
             missing=self.missing,
             enable_categorical=self.enable_categorical,
+            feature_types=self.feature_types,
         )

         if callable(self.objective):
@@ -1849,6 +1857,7 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
             eval_qid=None,
             missing=self.missing,
             enable_categorical=self.enable_categorical,
+            feature_types=self.feature_types,
         )
         # pylint: disable=attribute-defined-outside-init
@@ -2054,6 +2063,7 @@ class DaskXGBRanker(DaskScikitLearnBase, XGBRankerMixIn):
             eval_qid=eval_qid,
             missing=self.missing,
             enable_categorical=self.enable_categorical,
+            feature_types=self.feature_types,
         )
         if eval_metric is not None:
             if callable(eval_metric):

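Taken together, the Dask changes thread the estimator's `feature_types` into every internally built `DaskDMatrix`, including the ones created for prediction. A rough usage sketch, assuming a local cluster and arbitrary shapes:

from distributed import Client, LocalCluster
import dask.array as da
import xgboost as xgb

with LocalCluster(n_workers=2) as cluster, Client(cluster) as client:
    X = da.random.random((256, 2), chunks=(64, 2))
    y = da.random.random(256, chunks=64)
    reg = xgb.dask.DaskXGBRegressor(tree_method="hist", feature_types=["q", "q"])
    reg.fit(X, y)
    # predict() builds its own DaskDMatrix, which now carries the same types.
    reg.predict(X).compute()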
View File

@@ -13,6 +13,7 @@ import numpy as np
 from .core import c_array, _LIB, _check_call, c_str
 from .core import _cuda_array_interface
 from .core import DataIter, _ProxyDMatrix, DMatrix, FeatureNames
+from ._typing import FeatureTypes
 from .compat import lazy_isinstance, DataFrame

 c_bst_ulong = ctypes.c_uint64  # pylint: disable=invalid-name
@@ -70,7 +71,7 @@ def _from_scipy_csr(
     missing,
     nthread,
     feature_names: FeatureNames,
-    feature_types: Optional[List[str]],
+    feature_types: FeatureTypes,
 ):
     """Initialize data from a CSR matrix."""
     if len(data.indices) != len(data.data):
@@ -109,7 +110,7 @@ def _from_scipy_csc(
     data,
     missing,
     feature_names: FeatureNames,
-    feature_types: Optional[List[str]],
+    feature_types: FeatureTypes,
 ):
     if len(data.indices) != len(data.data):
         raise ValueError(f"length mismatch: {len(data.indices)} vs {len(data.data)}")
@@ -165,7 +166,7 @@ def _from_numpy_array(
     missing,
     nthread,
     feature_names: FeatureNames,
-    feature_types: Optional[List[str]],
+    feature_types: FeatureTypes,
 ):
     """Initialize data from a 2-D numpy matrix.
@@ -228,6 +229,12 @@ _pandas_dtype_mapper = {
 }

+_ENABLE_CAT_ERR = (
+    "When categorical type is supplied, DMatrix parameter `enable_categorical` must "
+    "be set to `True`."
+)
+

 def _invalid_dataframe_dtype(data: Any) -> None:
     # pandas series has `dtypes` but it's just a single object
     # cudf series doesn't have `dtypes`.
@@ -241,9 +248,8 @@ def _invalid_dataframe_dtype(data: Any) -> None:
     else:
         err = ""

-    msg = """DataFrame.dtypes for data must be int, float, bool or category. When
-        categorical type is supplied, DMatrix parameter `enable_categorical` must
-        be set to `True`.""" + err
+    type_err = "DataFrame.dtypes for data must be int, float, bool or category."
+    msg = f"""{type_err} {_ENABLE_CAT_ERR} {err}"""
     raise ValueError(msg)
@@ -340,8 +346,8 @@ def _from_pandas_df(
     missing: float,
     nthread: int,
     feature_names: FeatureNames,
-    feature_types: Optional[List[str]],
-) -> Tuple[ctypes.c_void_p, FeatureNames, Optional[List[str]]]:
+    feature_types: FeatureTypes,
+) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]:
     data, feature_names, feature_types = _transform_pandas_df(
         data, enable_categorical, feature_names, feature_types
     )
@@ -382,7 +388,7 @@ def _from_pandas_series(
     nthread: int,
     enable_categorical: bool,
     feature_names: FeatureNames,
-    feature_types: Optional[List[str]],
+    feature_types: FeatureTypes,
 ):
     from pandas.api.types import is_categorical_dtype
@@ -413,7 +419,7 @@ _dt_type_mapper2 = {'bool': 'i', 'int': 'int', 'real': 'float'}
 def _transform_dt_df(
     data,
     feature_names: FeatureNames,
-    feature_types: Optional[List[str]],
+    feature_types: FeatureTypes,
     meta=None,
     meta_type=None,
 ):
@@ -454,9 +460,9 @@ def _from_dt_df(
     missing,
     nthread,
     feature_names: FeatureNames,
-    feature_types: Optional[List[str]],
+    feature_types: FeatureTypes,
     enable_categorical: bool,
-) -> Tuple[ctypes.c_void_p, FeatureNames, Optional[List[str]]]:
+) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]:
     if enable_categorical:
         raise ValueError("categorical data in datatable is not supported yet.")
     data, feature_names, feature_types = _transform_dt_df(
@@ -542,10 +548,10 @@ def _from_arrow(
     data,
     missing: float,
     nthread: int,
-    feature_names: Optional[List[str]],
-    feature_types: Optional[List[str]],
+    feature_names: FeatureNames,
+    feature_types: FeatureTypes,
     enable_categorical: bool,
-) -> Tuple[ctypes.c_void_p, Optional[List[str]], Optional[List[str]]]:
+) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]:
     import pyarrow as pa

     if not all(
@@ -621,7 +627,7 @@ def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
 def _transform_cudf_df(
     data,
     feature_names: FeatureNames,
-    feature_types: Optional[List[str]],
+    feature_types: FeatureTypes,
     enable_categorical: bool,
 ):
     try:
@@ -687,7 +693,7 @@ def _from_cudf_df(
     missing,
     nthread,
     feature_names: FeatureNames,
-    feature_types: Optional[List[str]],
+    feature_types: FeatureTypes,
     enable_categorical: bool,
 ) -> Tuple[ctypes.c_void_p, Any, Any]:
     data, cat_codes, feature_names, feature_types = _transform_cudf_df(
@@ -735,7 +741,7 @@ def _from_cupy_array(
     missing,
     nthread,
     feature_names: FeatureNames,
-    feature_types: Optional[List[str]],
+    feature_types: FeatureTypes,
 ):
     """Initialize DMatrix from cupy ndarray."""
     data = _transform_cupy_array(data)
@@ -782,7 +788,7 @@ def _from_dlpack(
     missing,
     nthread,
     feature_names: FeatureNames,
-    feature_types: Optional[List[str]],
+    feature_types: FeatureTypes,
 ):
     data = _transform_dlpack(data)
     return _from_cupy_array(data, missing, nthread, feature_names,
@@ -797,7 +803,7 @@ def _from_uri(
     data,
     missing,
     feature_names: FeatureNames,
-    feature_types: Optional[List[str]],
+    feature_types: FeatureTypes,
 ):
     _warn_unused_missing(data, missing)
     handle = ctypes.c_void_p()
@@ -817,7 +823,7 @@ def _from_list(
     missing,
     n_threads,
     feature_names: FeatureNames,
-    feature_types: Optional[List[str]],
+    feature_types: FeatureTypes,
 ):
     array = np.array(data)
     _check_data_shape(data)
@@ -833,7 +839,7 @@ def _from_tuple(
     missing,
     n_threads,
     feature_names: FeatureNames,
-    feature_types: Optional[List[str]],
+    feature_types: FeatureTypes,
 ):
     return _from_list(data, missing, n_threads, feature_names, feature_types)
@@ -869,7 +875,7 @@ def dispatch_data_backend(
     missing,
     threads,
     feature_names: FeatureNames,
-    feature_types: Optional[List[str]],
+    feature_types: FeatureTypes,
     enable_categorical: bool = False,
 ):
     '''Dispatch data for DMatrix.'''
@@ -884,8 +890,7 @@ def dispatch_data_backend(
             data.tocsr(), missing, threads, feature_names, feature_types
         )
     if _is_numpy_array(data):
-        return _from_numpy_array(data, missing, threads, feature_names,
-                                 feature_types)
+        return _from_numpy_array(data, missing, threads, feature_names, feature_types)
     if _is_uri(data):
         return _from_uri(data, missing, feature_names, feature_types)
     if _is_list(data):
@@ -1101,7 +1106,7 @@ class SingleBatchInternalIter(DataIter):  # pylint: disable=R0902
 def _proxy_transform(
     data,
     feature_names: FeatureNames,
-    feature_types: Optional[List[str]],
+    feature_types: FeatureTypes,
     enable_categorical: bool,
 ):
     if _is_cudf_df(data) or _is_cudf_ser(data):

View File

@@ -14,7 +14,7 @@ from .core import Metric
 from .training import train
 from .callback import TrainingCallback
 from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array
-from ._typing import ArrayLike
+from ._typing import ArrayLike, FeatureTypes

 # Do not use class names on scikit-learn directly. Re-define the classes on
 # .compat to guarantee the behavior without scikit-learn
@@ -211,6 +211,13 @@ __model_doc = f'''
         should be used to specify categorical data type. Also, JSON/UBJSON
         serialization format is required.

+    feature_types : FeatureTypes
+
+        .. versionadded:: 2.0.0
+
+        Used for specifying feature types without constructing a dataframe. See
+        :py:class:`DMatrix` for details.
+
     max_cat_to_onehot : Optional[int]

         .. versionadded:: 1.6.0
@@ -394,6 +401,7 @@ def _wrap_evaluation_matrices(
     eval_qid: Optional[Sequence[Any]],
     create_dmatrix: Callable,
     enable_categorical: bool,
+    feature_types: FeatureTypes,
 ) -> Tuple[Any, List[Tuple[Any, str]]]:
     """Convert array_like evaluation matrices into DMatrix. Perform validation on the way.
@@ -408,6 +416,7 @@ def _wrap_evaluation_matrices(
         feature_weights=feature_weights,
         missing=missing,
         enable_categorical=enable_categorical,
+        feature_types=feature_types,
     )
     n_validation = 0 if eval_set is None else len(eval_set)
@@ -455,6 +464,7 @@ def _wrap_evaluation_matrices(
                 base_margin=base_margin_eval_set[i],
                 missing=missing,
                 enable_categorical=enable_categorical,
+                feature_types=feature_types,
             )
             evals.append(m)
     nevals = len(evals)
@@ -518,6 +528,7 @@ class XGBModel(XGBModelBase):
         validate_parameters: Optional[bool] = None,
         predictor: Optional[str] = None,
         enable_categorical: bool = False,
+        feature_types: FeatureTypes = None,
         max_cat_to_onehot: Optional[int] = None,
         eval_metric: Optional[Union[str, List[str], Callable]] = None,
         early_stopping_rounds: Optional[int] = None,
@@ -562,6 +573,7 @@ class XGBModel(XGBModelBase):
         self.validate_parameters = validate_parameters
         self.predictor = predictor
         self.enable_categorical = enable_categorical
+        self.feature_types = feature_types
         self.max_cat_to_onehot = max_cat_to_onehot
         self.eval_metric = eval_metric
         self.early_stopping_rounds = early_stopping_rounds
@@ -684,6 +696,7 @@ class XGBModel(XGBModelBase):
             "enable_categorical",
             "early_stopping_rounds",
             "callbacks",
+            "feature_types",
         }
         filtered = {}
         for k, v in params.items():
@@ -715,6 +728,10 @@ class XGBModel(XGBModelBase):
                 # numpy array is not JSON serializable
                 meta['classes_'] = self.classes_.tolist()
                 continue
+            if k == "feature_types":
+                # Use the `feature_types` attribute from booster instead.
+                meta["feature_types"] = None
+                continue
             try:
                 json.dumps({k: v})
                 meta[k] = v
@@ -754,6 +771,9 @@ class XGBModel(XGBModelBase):
             if k == 'classes_':
                 self.classes_ = np.array(v)
                 continue
+            if k == "feature_types":
+                self.feature_types = self.get_booster().feature_types
+                continue
             if k == "_estimator_type":
                 if self._get_type() != v:
                     raise TypeError(
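The two hunks above determine how `feature_types` round-trips through `save_model`/`load_model`: the scikit-learn meta stores `None`, and loading restores the value from the booster itself, keeping the two in sync. A sketch of the observable behavior, assuming `reg` is any fitted `XGBRegressor`:

import os
import tempfile

with tempfile.TemporaryDirectory() as tmpdir:
    path = os.path.join(tmpdir, "model.json")
    reg.save_model(path)
    loaded = xgb.XGBRegressor()
    loaded.load_model(path)
    # Restored from the booster, not from the JSON meta.
    assert loaded.feature_types == reg.get_booster().feature_types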
@@ -944,6 +964,7 @@ class XGBModel(XGBModelBase):
             eval_qid=None,
             create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs),
             enable_categorical=self.enable_categorical,
+            feature_types=self.feature_types
         )
         params = self.get_xgb_params()
@@ -1063,9 +1084,11 @@ class XGBModel(XGBModelBase):
                 pass

         test = DMatrix(
-            X, base_margin=base_margin,
+            X,
+            base_margin=base_margin,
             missing=self.missing,
             nthread=self.n_jobs,
+            feature_types=self.feature_types,
             enable_categorical=self.enable_categorical
         )
         return self.get_booster().predict(
@@ -1106,7 +1129,9 @@ class XGBModel(XGBModelBase):
             self.get_booster(), ntree_limit, iteration_range
         )
         iteration_range = self._get_iteration_range(iteration_range)
-        test_dmatrix = DMatrix(X, missing=self.missing, nthread=self.n_jobs)
+        test_dmatrix = DMatrix(
+            X, missing=self.missing, feature_types=self.feature_types, nthread=self.n_jobs
+        )
         return self.get_booster().predict(
             test_dmatrix,
             pred_leaf=True,
@@ -1397,6 +1422,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
             eval_qid=None,
             create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs),
             enable_categorical=self.enable_categorical,
+            feature_types=self.feature_types,
         )

         self._Booster = train(
@@ -1828,6 +1854,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
             eval_qid=eval_qid,
             create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs),
             enable_categorical=self.enable_categorical,
+            feature_types=self.feature_types,
         )

         evals_result: TrainingCallback.EvalsLog = {}

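With the estimator changes in place, specifying categorical columns over a plain numpy array might look like this sketch (synthetic data, hypothetical column layout):

import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
# First column: integer category codes; second column: numerical values.
X = np.hstack([rng.integers(0, 4, (64, 1)), rng.random((64, 1))])
y = rng.random(64)
reg = xgb.XGBRegressor(
    tree_method="hist", feature_types=["c", "q"], enable_categorical=True
)
reg.fit(X, y)
assert reg.get_booster().feature_types == ["c", "q"]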
View File

@@ -306,6 +306,13 @@ def test_categorical(client: "Client") -> None:
     run_categorical(client, "approx", X, X_onehot, y)
     run_categorical(client, "hist", X, X_onehot, y)

+    ft = ["c"] * X.shape[1]
+    reg = xgb.dask.DaskXGBRegressor(
+        tree_method="hist", feature_types=ft, enable_categorical=True
+    )
+    reg.fit(X, y)
+    assert reg.get_booster().feature_types == ft
+

 def test_dask_predict_shape_infer(client: "Client") -> None:
     X, y = make_classification(n_samples=1000, n_informative=5, n_classes=3)

View File

@@ -1273,6 +1273,38 @@ def test_estimator_reg(estimator, check):
     check(estimator)


+def test_categorical():
+    X, y = tm.make_categorical(n_samples=32, n_features=2, n_categories=3, onehot=False)
+    ft = ["c"] * X.shape[1]
+    reg = xgb.XGBRegressor(
+        tree_method="hist",
+        feature_types=ft,
+        max_cat_to_onehot=1,
+        enable_categorical=True,
+    )
+    reg.fit(X.values, y, eval_set=[(X.values, y)])
+    from_cat = reg.evals_result()["validation_0"]["rmse"]
+    predt_cat = reg.predict(X.values)
+    assert reg.get_booster().feature_types == ft
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        path = os.path.join(tmpdir, "model.json")
+        reg.save_model(path)
+        reg = xgb.XGBRegressor()
+        reg.load_model(path)
+        assert reg.feature_types == ft
+
+    onehot, y = tm.make_categorical(
+        n_samples=32, n_features=2, n_categories=3, onehot=True
+    )
+    reg = xgb.XGBRegressor(tree_method="hist")
+    reg.fit(onehot, y, eval_set=[(onehot, y)])
+    from_enc = reg.evals_result()["validation_0"]["rmse"]
+    predt_enc = reg.predict(onehot)
+
+    np.testing.assert_allclose(from_cat, from_enc)
+    np.testing.assert_allclose(predt_cat, predt_enc)
+
+
 def test_prediction_config():
     reg = xgb.XGBRegressor()
     assert reg._can_use_inplace_predict() is True