Add Type Hints for Python Package (#7742)

Co-authored-by: Chengyang Gu <bridgream@gmail.com>
Co-authored-by: Jiamingy <jm.yuan@outlook.com>
Chengyang 2022-05-17 10:14:09 -04:00, committed by GitHub
parent 71d3b2e036
commit 806c92c80b
10 changed files with 486 additions and 342 deletions

View File

@@ -1,21 +1,32 @@
 """Shared typing definition."""
 import ctypes
 import os
-from typing import Optional, Any, TypeVar, Union, Sequence
+from typing import Any, TypeVar, Union, Type, Sequence, Callable, List, Dict
 # os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/dt.Frame/
 # cudf.DataFrame/cupy.array/dlpack
+import numpy as np
 DataType = Any
 # xgboost accepts some other possible types in practice due to historical reason, which is
 # lesser tested. For now we encourage users to pass a simple list of string.
-FeatureNames = Optional[Sequence[str]]
-FeatureTypes = Optional[Sequence[str]]
+FeatureInfo = Sequence[str]
+FeatureNames = FeatureInfo
+FeatureTypes = FeatureInfo
+BoosterParam = Union[List, Dict]  # better be sequence
 ArrayLike = Any
 PathLike = Union[str, os.PathLike]
 CupyT = ArrayLike  # maybe need a stub for cupy arrays
 NumpyOrCupy = Any
+NumpyDType = Union[str, Type[np.number]]
+PandasDType = Any  # real type is pandas.core.dtypes.base.ExtensionDtype
+FloatCompatible = Union[float, np.float32, np.float64]
+# callables
+FPreProcCallable = Callable
 # ctypes
 # c_bst_ulong corresponds to bst_ulong defined in xgboost/c_api.h
@@ -59,3 +70,4 @@ CNumericPtr = ctypes.pointer
 # template parameter
 _T = TypeVar("_T")
+_F = TypeVar("_F", bound=Callable[..., Any])
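The aliases above are ordinary typing aliases rather than new classes, so annotated code keeps accepting plain lists, dicts, and sequences of strings. A minimal sketch of how downstream code might annotate against them (the describe_matrix helper is hypothetical, and xgboost._typing is a private module, so the import is for illustration only):

from typing import Optional

from xgboost._typing import BoosterParam, FeatureNames, FeatureTypes


def describe_matrix(
    feature_names: Optional[FeatureNames],
    feature_types: Optional[FeatureTypes],
    params: BoosterParam,
) -> str:
    # FeatureNames/FeatureTypes are Sequence[str]; BoosterParam is Union[List, Dict].
    names = ", ".join(feature_names or [])
    types = ", ".join(feature_types or [])
    return f"names=[{names}] types=[{types}] params={params}"


print(describe_matrix(["f0", "f1"], ["float", "int"], {"max_depth": 3}))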

View File

@@ -10,8 +10,7 @@ from abc import ABC
 import collections
 import os
 import pickle
-from typing import Callable, List, Optional, Union, Dict, Tuple, TypeVar, cast
-from typing import Sequence
+from typing import Callable, List, Optional, Union, Dict, Tuple, TypeVar, cast, Sequence, Any
 import numpy
 from . import rabit
@@ -24,11 +23,14 @@ __all__ = [
     "EarlyStopping",
     "EvaluationMonitor",
     "TrainingCheckPoint",
+    "CallbackContainer"
 ]
 _Score = Union[float, Tuple[float, float]]
 _ScoreList = Union[List[float], List[Tuple[float, float]]]
+_Model = Any  # real type is Union[Booster, CVPack]; need more work
 # pylint: disable=unused-argument
 class TrainingCallback(ABC):
@@ -43,19 +45,19 @@ class TrainingCallback(ABC):
     def __init__(self) -> None:
         pass
-    def before_training(self, model):
+    def before_training(self, model: _Model) -> _Model:
         '''Run before training starts.'''
         return model
-    def after_training(self, model):
+    def after_training(self, model: _Model) -> _Model:
         '''Run after training is finished.'''
         return model
-    def before_iteration(self, model, epoch: int, evals_log: EvalsLog) -> bool:
+    def before_iteration(self, model: _Model, epoch: int, evals_log: EvalsLog) -> bool:
         '''Run before each iteration. Return True when training should stop.'''
         return False
-    def after_iteration(self, model, epoch: int, evals_log: EvalsLog) -> bool:
+    def after_iteration(self, model: _Model, epoch: int, evals_log: EvalsLog) -> bool:
         '''Run after each iteration. Return True when training should stop.'''
         return False
@@ -140,7 +142,7 @@ class CallbackContainer:
         if self.is_cv:
             self.aggregated_cv = None
-    def before_training(self, model):
+    def before_training(self, model: _Model) -> _Model:
         '''Function called before training.'''
         for c in self.callbacks:
             model = c.before_training(model=model)
@@ -151,7 +153,7 @@ class CallbackContainer:
             assert isinstance(model, Booster), msg
         return model
-    def after_training(self, model):
+    def after_training(self, model: _Model) -> _Model:
         '''Function called after training.'''
         for c in self.callbacks:
             model = c.after_training(model=model)
@@ -182,7 +184,7 @@ class CallbackContainer:
         return model
     def before_iteration(
-        self, model, epoch: int, dtrain: DMatrix, evals: List[Tuple[DMatrix, str]]
+        self, model: _Model, epoch: int, dtrain: DMatrix, evals: Optional[List[Tuple[DMatrix, str]]]
     ) -> bool:
         '''Function called before training iteration.'''
         return any(c.before_iteration(model, epoch, self.history)
@@ -220,7 +222,7 @@ class CallbackContainer:
     def after_iteration(
         self,
-        model,
+        model: _Model,
         epoch: int,
         dtrain: DMatrix,
         evals: Optional[List[Tuple[DMatrix, str]]],
@@ -276,7 +278,7 @@ class LearningRateScheduler(TrainingCallback):
         super().__init__()
     def after_iteration(
-        self, model, epoch: int, evals_log: TrainingCallback.EvalsLog
+        self, model: _Model, epoch: int, evals_log: TrainingCallback.EvalsLog
     ) -> bool:
         model.set_param("learning_rate", self.learning_rates(epoch))
         return False
@@ -344,12 +346,12 @@ class EarlyStopping(TrainingCallback):
         self.starting_round: int = 0
         super().__init__()
-    def before_training(self, model):
+    def before_training(self, model: _Model) -> _Model:
         self.starting_round = model.num_boosted_rounds()
         return model
     def _update_rounds(
-        self, score: _Score, name: str, metric: str, model, epoch: int
+        self, score: _Score, name: str, metric: str, model: _Model, epoch: int
     ) -> bool:
         def get_s(x: _Score) -> float:
             """get score if it's cross validation history."""
@@ -403,7 +405,7 @@ class EarlyStopping(TrainingCallback):
             return True
         return False
-    def after_iteration(self, model, epoch: int,
+    def after_iteration(self, model: _Model, epoch: int,
                         evals_log: TrainingCallback.EvalsLog) -> bool:
         epoch += self.starting_round  # training continuation
         msg = 'Must have at least 1 validation dataset for early stopping.'
@@ -431,7 +433,7 @@ class EarlyStopping(TrainingCallback):
             score = data_log[metric_name][-1]
         return self._update_rounds(score, data_name, metric_name, model, epoch)
-    def after_training(self, model):
+    def after_training(self, model: _Model) -> _Model:
         try:
             if self.save_best:
                 model = model[: int(model.attr("best_iteration")) + 1]
@@ -477,7 +479,7 @@ class EvaluationMonitor(TrainingCallback):
             msg = f"\t{data + '-' + metric}:{score:.5f}"
         return msg
-    def after_iteration(self, model, epoch: int,
+    def after_iteration(self, model: _Model, epoch: int,
                         evals_log: TrainingCallback.EvalsLog) -> bool:
         if not evals_log:
             return False
@@ -503,7 +505,7 @@ class EvaluationMonitor(TrainingCallback):
             self._latest = msg
         return False
-    def after_training(self, model):
+    def after_training(self, model: _Model) -> _Model:
         if rabit.get_rank() == self.printer_rank and self._latest is not None:
             rabit.tracker_print(self._latest)
         return model
@@ -544,7 +546,7 @@ class TrainingCheckPoint(TrainingCallback):
         self._epoch = 0
         super().__init__()
-    def after_iteration(self, model, epoch: int,
+    def after_iteration(self, model: _Model, epoch: int,
                         evals_log: TrainingCallback.EvalsLog) -> bool:
         if self._epoch == self._iterations:
             path = os.path.join(self._path, self._name + '_' + str(epoch) +
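For user code, the practical effect of these hints is that the hook signatures are documented in one place; _Model stays an alias for Any, so a custom callback still simply receives a Booster. A small sketch of a callback written against the hooks annotated above (the IterationLogger class is illustrative, not part of the package):

import xgboost as xgb


class IterationLogger(xgb.callback.TrainingCallback):
    """Print the latest score of every metric after each boosting round."""

    def after_iteration(self, model, epoch, evals_log):
        # evals_log maps data name -> metric name -> list of per-round scores
        for data, metrics in evals_log.items():
            for metric, scores in metrics.items():
                print(f"round {epoch}: {data}-{metric} = {scores[-1]:.5f}")
        return False  # returning True would stop training early

It would typically be passed to training as callbacks=[IterationLogger()] in xgb.train.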

View File

@@ -1,30 +1,32 @@
 # coding: utf-8
 # pylint: disable= invalid-name, unused-import
 """For compatibility and optional dependencies."""
-from typing import Any
+from typing import Any, Type, Dict, Optional, List
 import sys
 import types
 import importlib.util
 import logging
 import numpy as np
+from xgboost._typing import CStrPtr
 assert (sys.version_info[0] == 3), 'Python 2 is no longer supported.'
-def py_str(x):
+def py_str(x: CStrPtr) -> str:
     """convert c string back to python string"""
-    return x.decode('utf-8')
+    return x.decode('utf-8')  # type: ignore
-def lazy_isinstance(instance, module, name):
+def lazy_isinstance(instance: Type[object], module: str, name: str) -> bool:
     """Use string representation to identify a type."""
     # Notice, we use .__class__ as opposed to type() in order
     # to support object proxies such as weakref.proxy
     cls = instance.__class__
-    module = cls.__module__ == module
-    name = cls.__name__ == name
-    return module and name
+    is_same_module = cls.__module__ == module
+    has_same_name = cls.__name__ == name
+    return is_same_module and has_same_name
 # pandas
@@ -37,34 +39,49 @@ try:
 except ImportError:
     MultiIndex = object
-    DataFrame: Any = object
+    DataFrame = object
     Series = object
     pandas_concat = None
     PANDAS_INSTALLED = False
 # sklearn
 try:
-    from sklearn.base import BaseEstimator
-    from sklearn.base import RegressorMixin, ClassifierMixin
+    from sklearn.base import (
+        BaseEstimator as XGBModelBase,
+        RegressorMixin as XGBRegressorBase,
+        ClassifierMixin as XGBClassifierBase
+    )
     from sklearn.preprocessing import LabelEncoder
     try:
-        from sklearn.model_selection import KFold, StratifiedKFold
+        from sklearn.model_selection import (
+            KFold as XGBKFold,
+            StratifiedKFold as XGBStratifiedKFold
+        )
     except ImportError:
-        from sklearn.cross_validation import KFold, StratifiedKFold
+        from sklearn.cross_validation import (
+            KFold as XGBKFold,
+            StratifiedKFold as XGBStratifiedKFold
+        )
     SKLEARN_INSTALLED = True
-    XGBModelBase = BaseEstimator
-    XGBRegressorBase = RegressorMixin
-    XGBClassifierBase = ClassifierMixin
-    XGBKFold = KFold
-    XGBStratifiedKFold = StratifiedKFold
+except ImportError:
+    SKLEARN_INSTALLED = False
+    # used for compatibility without sklearn
+    XGBModelBase = object
+    XGBClassifierBase = object
+    XGBRegressorBase = object
+    LabelEncoder = object
+    XGBKFold = None
+    XGBStratifiedKFold = None
 class XGBoostLabelEncoder(LabelEncoder):
     '''Label encoder with JSON serialization methods.'''
-    def to_json(self):
+    def to_json(self) -> Dict:
         '''Returns a JSON compatible dictionary'''
         meta = {}
         for k, v in self.__dict__.items():
@@ -74,7 +91,7 @@ try:
                 meta[k] = v
         return meta
-    def from_json(self, doc):
+    def from_json(self, doc: Dict) -> None:
         # pylint: disable=attribute-defined-outside-init
         '''Load the encoder back from a JSON compatible dict.'''
         meta = {}
@@ -84,17 +101,6 @@ try:
                 continue
             meta[k] = v
         self.__dict__.update(meta)
-except ImportError:
-    SKLEARN_INSTALLED = False
-    # used for compatibility without sklearn
-    XGBModelBase = object
-    XGBClassifierBase = object
-    XGBRegressorBase = object
-    XGBKFold = None
-    XGBStratifiedKFold = None
-    XGBoostLabelEncoder = None
 # dask
@@ -113,7 +119,7 @@ try:
     SCIPY_INSTALLED = True
 except ImportError:
     scipy_sparse = False
-    scipy_csr: Any = object
+    scipy_csr = object
     SCIPY_INSTALLED = False
@@ -136,15 +142,21 @@ class LazyLoader(types.ModuleType):
     """Lazily import a module, mainly to avoid pulling in large dependencies.
     """
-    def __init__(self, local_name, parent_module_globals, name, warning=None):
+    def __init__(
+        self,
+        local_name: str,
+        parent_module_globals: Dict,
+        name: str,
+        warning: Optional[str] = None
+    ) -> None:
         self._local_name = local_name
         self._parent_module_globals = parent_module_globals
         self._warning = warning
-        self.module = None
+        self.module: Optional[types.ModuleType] = None
         super().__init__(name)
-    def _load(self):
+    def _load(self) -> types.ModuleType:
         """Load the module and insert it into the parent's globals."""
         # Import the target module and insert it into the parent's namespace
         module = importlib.import_module(self.__name__)
@@ -163,12 +175,12 @@ class LazyLoader(types.ModuleType):
         return module
-    def __getattr__(self, item):
+    def __getattr__(self, item: str) -> Any:
         if not self.module:
             self.module = self._load()
         return getattr(self.module, item)
-    def __dir__(self):
+    def __dir__(self) -> List[str]:
         if not self.module:
             self.module = self._load()
         return dir(self.module)
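The now-annotated LazyLoader defers the real import until the first attribute access, which keeps optional heavy dependencies off the `import xgboost` path. A rough usage sketch (the frame_from_csv helper and the choice of datatable are illustrative only):

from xgboost.compat import LazyLoader

# Nothing is imported yet; only a proxy module object is created.
dt = LazyLoader("dt", globals(), "datatable")


def frame_from_csv(path: str):
    # The first attribute access triggers the actual `import datatable`.
    return dt.fread(path)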

View File

@@ -4,12 +4,20 @@ import ctypes
 import json
 from contextlib import contextmanager
 from functools import wraps
+from typing import Optional, Callable, Any, Dict, cast, Iterator
 from .core import _LIB, _check_call, c_str, py_str
+from ._typing import _F
-def config_doc(*, header=None, extra_note=None, parameters=None, returns=None,
-               see_also=None):
+def config_doc(
+    *,
+    header: Optional[str] = None,
+    extra_note: Optional[str] = None,
+    parameters: Optional[str] = None,
+    returns: Optional[str] = None,
+    see_also: Optional[str] = None
+) -> Callable[[_F], _F]:
     """Decorator to format docstring for config functions.
     Parameters
@@ -64,19 +72,19 @@ def config_doc(*, header=None, extra_note=None, parameters=None, returns=None,
         assert xgb.get_config()['verbosity'] == 2  # old value restored
     """
-    def none_to_str(value):
+    def none_to_str(value: Optional[str]) -> str:
         return '' if value is None else value
-    def config_doc_decorator(func):
+    def config_doc_decorator(func: _F) -> _F:
         func.__doc__ = (doc_template.format(header=none_to_str(header),
                                             extra_note=none_to_str(extra_note))
                         + none_to_str(parameters) + none_to_str(returns)
                         + none_to_str(common_example) + none_to_str(see_also))
         @wraps(func)
-        def wrap(*args, **kwargs):
+        def wrap(*args: Any, **kwargs: Any) -> Any:
             return func(*args, **kwargs)
-        return wrap
+        return cast(_F, wrap)
     return config_doc_decorator
@@ -89,7 +97,7 @@ def config_doc(*, header=None, extra_note=None, parameters=None, returns=None,
     new_config: Dict[str, Any]
         Keyword arguments representing the parameters and their values
     """)
-def set_config(**new_config):
+def set_config(**new_config: Any) -> None:
     config = json.dumps(new_config)
     _check_call(_LIB.XGBSetGlobalConfig(c_str(config)))
@@ -103,7 +111,7 @@ def set_config(**new_config):
     args: Dict[str, Any]
         The list of global parameters and their values
     """)
-def get_config():
+def get_config() -> Dict[str, Any]:
     config_str = ctypes.c_char_p()
     _check_call(_LIB.XGBGetGlobalConfig(ctypes.byref(config_str)))
     config = json.loads(py_str(config_str.value))
@@ -132,7 +140,7 @@ def get_config():
     set_config: Set global XGBoost configuration
     get_config: Get current values of the global configuration
     """)
-def config_context(**new_config):
+def config_context(**new_config: Any) -> Iterator[None]:
     old_config = get_config().copy()
     set_config(**new_config)
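Behaviour of the annotated wrappers is unchanged: set_config/get_config round-trip a JSON dictionary of global parameters, and config_context restores the previous values on exit, as the docstring above already shows. A short usage sketch:

import xgboost as xgb

xgb.set_config(verbosity=2)
assert xgb.get_config()["verbosity"] == 2

with xgb.config_context(verbosity=0):
    # Overridden only inside the block.
    assert xgb.get_config()["verbosity"] == 0

assert xgb.get_config()["verbosity"] == 2  # old value restored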

View File

@@ -30,10 +30,12 @@ from ._typing import (
     ArrayLike,
     CFloatPtr,
     NumpyOrCupy,
-    FeatureNames,
+    FeatureInfo,
     FeatureTypes,
+    FeatureNames,
     _T,
     CupyT,
+    BoosterParam
 )
@@ -273,7 +275,7 @@ def ctypes2numpy(cptr: CNumericPtr, length: int, dtype: Type[np.number]) -> np.n
     if not isinstance(cptr, ctypes.POINTER(ctype)):
         raise RuntimeError(f"expected {ctype} pointer")
     res = np.zeros(length, dtype=dtype)
-    if not ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]):
+    if not ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]):  # type: ignore
         raise RuntimeError("memmove failed")
     return res
@@ -310,7 +312,7 @@ def ctypes2buffer(cptr: CStrPtr, length: int) -> bytearray:
         raise RuntimeError('expected char pointer')
     res = bytearray(length)
     rptr = (ctypes.c_char * length).from_buffer(res)
-    if not ctypes.memmove(rptr, cptr, length):
+    if not ctypes.memmove(rptr, cptr, length):  # type: ignore
         raise RuntimeError('memmove failed')
     return res
@@ -434,8 +436,8 @@ class DataIter(ABC):  # pylint: disable=too-many-instance-attributes
         def data_handle(
             data: Any,
             *,
-            feature_names: FeatureNames = None,
-            feature_types: Optional[List[str]] = None,
+            feature_names: Optional[FeatureNames] = None,
+            feature_types: Optional[FeatureTypes] = None,
             **kwargs: Any,
         ) -> None:
             from .data import dispatch_proxy_set_data
@@ -555,8 +557,8 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
         base_margin: Optional[ArrayLike] = None,
         missing: Optional[float] = None,
         silent: bool = False,
-        feature_names: FeatureNames = None,
-        feature_types: FeatureTypes = None,
+        feature_names: Optional[FeatureNames] = None,
+        feature_types: Optional[FeatureTypes] = None,
         nthread: Optional[int] = None,
         group: Optional[ArrayLike] = None,
         qid: Optional[ArrayLike] = None,
@@ -718,8 +720,8 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
         qid: Optional[ArrayLike] = None,
         label_lower_bound: Optional[ArrayLike] = None,
         label_upper_bound: Optional[ArrayLike] = None,
-        feature_names: FeatureNames = None,
-        feature_types: Optional[List[str]] = None,
+        feature_names: Optional[FeatureNames] = None,
+        feature_types: Optional[FeatureTypes] = None,
         feature_weights: Optional[ArrayLike] = None
     ) -> None:
         """Set meta info for DMatrix. See doc string for :py:obj:`xgboost.DMatrix`."""
@@ -1000,7 +1002,7 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
         return res
     @property
-    def feature_names(self) -> Optional[List[str]]:
+    def feature_names(self) -> Optional[FeatureNames]:
         """Get feature names (column labels).
         Returns
@@ -1023,7 +1025,7 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
         return feature_names
     @feature_names.setter
-    def feature_names(self, feature_names: FeatureNames) -> None:
+    def feature_names(self, feature_names: Optional[FeatureNames]) -> None:
         """Set feature names (column labels).
         Parameters
@@ -1039,7 +1041,7 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
                 else:
                     feature_names = [feature_names]
             except TypeError:
-                feature_names = [feature_names]
+                feature_names = [cast(str, feature_names)]
             if len(feature_names) != len(set(feature_names)):
                 raise ValueError('feature_names must be unique')
@@ -1069,8 +1071,13 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
             self.feature_types = None
     @property
-    def feature_types(self) -> Optional[List[str]]:
-        """Get feature types. See :py:class:`DMatrix` for details."""
+    def feature_types(self) -> Optional[FeatureTypes]:
+        """Get feature types (column types).
+        Returns
+        -------
+        feature_types : list or None
+        """
         length = c_bst_ulong()
         sarr = ctypes.POINTER(ctypes.c_char_p)()
         _check_call(_LIB.XGDMatrixGetStrFeatureInfo(self.handle,
@@ -1111,7 +1118,7 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
                 else:
                     feature_types = [feature_types]
             except TypeError:
-                feature_types = [feature_types]
+                feature_types = [cast(str, feature_types)]
             feature_types_bytes = [bytes(f, encoding='utf-8')
                                    for f in feature_types]
             c_feature_types = (ctypes.c_char_p *
@@ -1203,8 +1210,8 @@ class DeviceQuantileDMatrix(DMatrix):
         base_margin: Optional[ArrayLike] = None,
         missing: Optional[float] = None,
         silent: bool = False,
-        feature_names: FeatureNames = None,
-        feature_types: Optional[List[str]] = None,
+        feature_names: Optional[FeatureNames] = None,
+        feature_types: Optional[FeatureTypes] = None,
         nthread: Optional[int] = None,
         max_bin: int = 256,
         group: Optional[ArrayLike] = None,
@@ -1323,7 +1330,7 @@ def _get_booster_layer_trees(model: "Booster") -> Tuple[int, int]:
     return num_parallel_tree, num_groups
-def _configure_metrics(params: Union[Dict, List]) -> Union[Dict, List]:
+def _configure_metrics(params: BoosterParam) -> BoosterParam:
     if (
         isinstance(params, dict)
         and "eval_metric" in params
@@ -1349,7 +1356,7 @@ class Booster:
     def __init__(
         self,
-        params: Optional[Dict] = None,
+        params: Optional[BoosterParam] = None,
         cache: Optional[Sequence[DMatrix]] = None,
         model_file: Optional[Union["Booster", bytearray, os.PathLike, str]] = None
     ) -> None:
@@ -1444,7 +1451,7 @@ class Booster:
                 "Constrained features are not a subset of training data feature names"
             ) from e
-    def _configure_constraints(self, params: Union[List, Dict]) -> Union[List, Dict]:
+    def _configure_constraints(self, params: BoosterParam) -> BoosterParam:
         if isinstance(params, dict):
             value = params.get("monotone_constraints")
             if value is not None:
@@ -1607,7 +1614,7 @@ class Booster:
             return py_str(ret.value)
         return None
-    def attributes(self) -> Dict[str, str]:
+    def attributes(self) -> Dict[str, Optional[str]]:
         """Get attributes stored in the Booster as a dictionary.
         Returns
@@ -1639,7 +1646,7 @@ class Booster:
         _check_call(_LIB.XGBoosterSetAttr(
             self.handle, c_str(key), value))
-    def _get_feature_info(self, field: str) -> Optional[List[str]]:
+    def _get_feature_info(self, field: str) -> Optional[FeatureInfo]:
         length = c_bst_ulong()
         sarr = ctypes.POINTER(ctypes.c_char_p)()
         if not hasattr(self, "handle") or self.handle is None:
@@ -1652,7 +1659,7 @@ class Booster:
         feature_info = from_cstr_to_pystr(sarr, length)
         return feature_info if feature_info else None
-    def _set_feature_info(self, features: Optional[Sequence[str]], field: str) -> None:
+    def _set_feature_info(self, features: Optional[FeatureInfo], field: str) -> None:
         if features is not None:
             assert isinstance(features, list)
             feature_info_bytes = [bytes(f, encoding="utf-8") for f in features]
@@ -1670,7 +1677,7 @@ class Booster:
         )
     @property
-    def feature_types(self) -> Optional[List[str]]:
+    def feature_types(self) -> Optional[FeatureTypes]:
         """Feature types for this booster. Can be directly set by input data or by
         assignment. See :py:class:`DMatrix` for details.
@@ -1678,11 +1685,11 @@ class Booster:
         return self._get_feature_info("feature_type")
     @feature_types.setter
-    def feature_types(self, features: Optional[List[str]]) -> None:
+    def feature_types(self, features: Optional[FeatureTypes]) -> None:
         self._set_feature_info(features, "feature_type")
     @property
-    def feature_names(self) -> Optional[List[str]]:
+    def feature_names(self) -> Optional[FeatureNames]:
         """Feature names for this booster. Can be directly set by input data or by
         assignment.
@@ -1690,7 +1697,7 @@ class Booster:
         return self._get_feature_info("feature_name")
     @feature_names.setter
-    def feature_names(self, features: FeatureNames) -> None:
+    def feature_names(self, features: Optional[FeatureNames]) -> None:
         self._set_feature_info(features, "feature_name")
     def set_param(
@@ -1711,7 +1718,7 @@ class Booster:
             params = params.items()
         elif isinstance(params, str) and value is not None:
             params = [(params, value)]
-        for key, val in params:
+        for key, val in cast(Iterable[Tuple[str, str]], params):
             if val is not None:
                 _check_call(_LIB.XGBoosterSetParam(self.handle, c_str(key),
                                                    c_str(str(val))))
@ -2564,8 +2571,10 @@ class Booster:
) )
# Booster can't accept data with different feature names # Booster can't accept data with different feature names
if self.feature_names != data.feature_names: if self.feature_names != data.feature_names:
dat_missing = set(self.feature_names) - set(data.feature_names) dat_missing = set(cast(FeatureNames, self.feature_names)) - \
my_missing = set(data.feature_names) - set(self.feature_names) set(cast(FeatureNames, data.feature_names))
my_missing = set(cast(FeatureNames, data.feature_names)) - \
set(cast(FeatureNames, self.feature_names))
msg = 'feature_names mismatch: {0} {1}' msg = 'feature_names mismatch: {0} {1}'
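With FeatureNames/FeatureTypes now plain Sequence[str] aliases, DMatrix and Booster continue to take ordinary lists of strings; only the annotations changed. A quick sketch with synthetic data (parameter values are arbitrary):

import numpy as np
import xgboost as xgb

X = np.random.rand(16, 3)
y = np.random.rand(16)
dtrain = xgb.DMatrix(
    X,
    label=y,
    feature_names=["age", "height", "weight"],  # FeatureNames: Sequence[str]
    feature_types=["float", "float", "float"],  # FeatureTypes: Sequence[str]
)
booster = xgb.train({"max_depth": 2, "verbosity": 0}, dtrain, num_boost_round=2)
print(booster.feature_names)  # Optional[FeatureNames]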

View File

@@ -318,7 +318,7 @@ class DaskDMatrix:
         base_margin: Optional[_DaskCollection] = None,
         missing: float = None,
         silent: bool = False,  # pylint: disable=unused-argument
-        feature_names: FeatureNames = None,
+        feature_names: Optional[FeatureNames] = None,
         feature_types: FeatureTypes = None,
         group: Optional[_DaskCollection] = None,
         qid: Optional[_DaskCollection] = None,
@@ -594,7 +594,7 @@ class DaskPartitionIter(DataIter):  # pylint: disable=R0902
         qid: Optional[List[Any]] = None,
         label_lower_bound: Optional[List[Any]] = None,
         label_upper_bound: Optional[List[Any]] = None,
-        feature_names: FeatureNames = None,
+        feature_names: Optional[FeatureNames] = None,
        feature_types: Optional[Union[Any, List[Any]]] = None,
     ) -> None:
         self._data = data
@@ -637,7 +637,7 @@ class DaskPartitionIter(DataIter):  # pylint: disable=R0902
         if self._iter == len(self._data):
             # Return 0 when there's no more batch.
             return 0
-        feature_names: FeatureNames = None
+        feature_names: Optional[FeatureNames] = None
         if self._feature_names:
             feature_names = self._feature_names
         else:
@@ -688,7 +688,7 @@ class DaskDeviceQuantileDMatrix(DaskDMatrix):
         base_margin: Optional[_DaskCollection] = None,
         missing: float = None,
         silent: bool = False,  # disable=unused-argument
-        feature_names: FeatureNames = None,
+        feature_names: Optional[FeatureNames] = None,
         feature_types: Optional[Union[Any, List[Any]]] = None,
         max_bin: int = 256,
         group: Optional[_DaskCollection] = None,
@@ -725,7 +725,7 @@ class DaskDeviceQuantileDMatrix(DaskDMatrix):
 def _create_device_quantile_dmatrix(
-    feature_names: FeatureNames,
+    feature_names: Optional[FeatureNames],
     feature_types: Optional[Union[Any, List[Any]]],
     feature_weights: Optional[Any],
     missing: float,
@@ -766,7 +766,7 @@ def _create_device_quantile_dmatrix(
 def _create_dmatrix(
-    feature_names: FeatureNames,
+    feature_names: Optional[FeatureNames],
     feature_types: Optional[Union[Any, List[Any]]],
     feature_weights: Optional[Any],
     missing: float,

View File

@@ -5,17 +5,26 @@ import ctypes
 import json
 import warnings
 import os
-from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Type
+from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Sequence, cast
 import numpy as np
 from .core import c_array, _LIB, _check_call, c_str
 from .core import _cuda_array_interface
-from .core import DataIter, _ProxyDMatrix, DMatrix, FeatureNames
-from ._typing import FeatureTypes
+from .core import DataIter, _ProxyDMatrix, DMatrix
 from .compat import lazy_isinstance, DataFrame
+from ._typing import (
+    c_bst_ulong,
+    DataType,
+    FeatureTypes,
+    FeatureNames,
+    NumpyDType,
+    CupyT,
+    FloatCompatible, PandasDType
+)
-c_bst_ulong = ctypes.c_uint64  # pylint: disable=invalid-name
+DispatchedDataBackendReturnType = Tuple[
+    ctypes.c_void_p, Optional[FeatureNames], Optional[FeatureTypes]]
 CAT_T = "c"
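DispatchedDataBackendReturnType is the shared return shape of the _from_* constructors and of dispatch_data_backend: a C handle plus the possibly-None feature info. A sketch of reusing it for a wrapper's own annotation (these are private helpers, so the import and the build_handle function are purely illustrative):

import numpy as np

from xgboost.data import DispatchedDataBackendReturnType, dispatch_data_backend


def build_handle(X: np.ndarray) -> DispatchedDataBackendReturnType:
    # Returns (ctypes handle, feature_names, feature_types), per the alias above.
    return dispatch_data_backend(
        X, missing=np.nan, threads=1, feature_names=None, feature_types=None
    )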
@@ -23,14 +32,14 @@ CAT_T = "c"
 _matrix_meta = {"base_margin", "label"}
-def _warn_unused_missing(data, missing):
+def _warn_unused_missing(data: DataType, missing: Optional[FloatCompatible]) -> None:
     if (missing is not None) and (not np.isnan(missing)):
         warnings.warn(
             '`missing` is not used for current input data type:' +
             str(type(data)), UserWarning)
-def _check_complex(data):
+def _check_complex(data: DataType) -> None:
     '''Test whether data is complex using `dtype` attribute.'''
     complex_dtypes = (np.complex128, np.complex64,
                       np.cfloat, np.cdouble, np.clongdouble)
@@ -38,16 +47,15 @@
         raise ValueError('Complex data not supported')
-def _check_data_shape(data: Any) -> None:
+def _check_data_shape(data: DataType) -> None:
     if hasattr(data, "shape") and len(data.shape) != 2:
         raise ValueError("Please reshape the input data into 2-dimensional matrix.")
-def _is_scipy_csr(data):
+def _is_scipy_csr(data: DataType) -> bool:
     try:
-        import scipy
+        import scipy.sparse
     except ImportError:
-        scipy = None
         return False
     return isinstance(data, scipy.sparse.csr_matrix)
@@ -64,12 +72,12 @@ def _array_interface(data: np.ndarray) -> bytes:
 def _from_scipy_csr(
-    data,
-    missing,
-    nthread,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    data: DataType,
+    missing: FloatCompatible,
+    nthread: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     """Initialize data from a CSR matrix."""
     if len(data.indices) != len(data.data):
         raise ValueError(
@@ -94,21 +102,20 @@ def _from_scipy_csr(
     return handle, feature_names, feature_types
-def _is_scipy_csc(data):
+def _is_scipy_csc(data: DataType) -> bool:
     try:
-        import scipy
+        import scipy.sparse
     except ImportError:
-        scipy = None
         return False
     return isinstance(data, scipy.sparse.csc_matrix)
 def _from_scipy_csc(
-    data,
-    missing,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    data: DataType,
+    missing: Optional[FloatCompatible],
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     if len(data.indices) != len(data.data):
         raise ValueError(f"length mismatch: {len(data.indices)} vs {len(data.data)}")
     _warn_unused_missing(data, missing)
@@ -124,27 +131,29 @@ def _from_scipy_csc(
     return handle, feature_names, feature_types
-def _is_scipy_coo(data):
+def _is_scipy_coo(data: DataType) -> bool:
     try:
-        import scipy
+        import scipy.sparse
     except ImportError:
-        scipy = None
         return False
     return isinstance(data, scipy.sparse.coo_matrix)
-def _is_numpy_array(data):
+def _is_numpy_array(data: DataType) -> bool:
     return isinstance(data, (np.ndarray, np.matrix))
-def _ensure_np_dtype(data, dtype) -> Tuple[np.ndarray, np.dtype]:
+def _ensure_np_dtype(
+    data: DataType,
+    dtype: Optional[NumpyDType]
+) -> Tuple[np.ndarray, Optional[NumpyDType]]:
     if data.dtype.hasobject or data.dtype in [np.float16, np.bool_]:
         data = data.astype(np.float32, copy=False)
         dtype = np.float32
     return data, dtype
-def _maybe_np_slice(data: np.ndarray, dtype) -> np.ndarray:
+def _maybe_np_slice(data: DataType, dtype: Optional[NumpyDType]) -> np.ndarray:
     '''Handle numpy slice. This can be removed if we use __array_interface__.
     '''
     try:
@@ -159,12 +168,12 @@ def _maybe_np_slice(data: np.ndarray, dtype) -> np.ndarray:
 def _from_numpy_array(
-    data,
-    missing,
-    nthread,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    data: DataType,
+    missing: FloatCompatible,
+    nthread: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     """Initialize data from a 2-D numpy matrix.
     """
@@ -189,7 +198,7 @@ def _from_numpy_array(
     return handle, feature_names, feature_types
-def _is_pandas_df(data):
+def _is_pandas_df(data: DataType) -> bool:
     try:
         import pandas as pd
     except ImportError:
@@ -197,7 +206,7 @@ def _is_pandas_df(data):
     return isinstance(data, pd.DataFrame)
-def _is_modin_df(data):
+def _is_modin_df(data: DataType) -> bool:
     try:
         import modin.pandas as pd
     except ImportError:
@@ -232,7 +241,7 @@ _ENABLE_CAT_ERR = (
 )
-def _invalid_dataframe_dtype(data: Any) -> None:
+def _invalid_dataframe_dtype(data: DataType) -> None:
     # pandas series has `dtypes` but it's just a single object
     # cudf series doesn't have `dtypes`.
     if hasattr(data, "dtypes") and hasattr(data.dtypes, "__iter__"):
@@ -253,10 +262,10 @@ def _invalid_dataframe_dtype(data: Any) -> None:
 def _pandas_feature_info(
     data: DataFrame,
     meta: Optional[str],
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
     enable_categorical: bool,
-) -> Tuple[FeatureNames, FeatureTypes]:
+) -> Tuple[Optional[FeatureNames], Optional[FeatureTypes]]:
     import pandas as pd
     from pandas.api.types import (
         is_sparse,
@@ -285,13 +294,13 @@ def _pandas_feature_info(
     return feature_names, feature_types
-def is_nullable_dtype(dtype: Any) -> bool:
+def is_nullable_dtype(dtype: PandasDType) -> bool:
     """Wether dtype is a pandas nullable type."""
     from pandas.api.types import is_integer_dtype, is_bool_dtype
     # dtype: pd.core.arrays.numeric.NumericDtype
     nullable_alias = {"Int16", "Int32", "Int64"}
     is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias
     # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
     is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
     return is_int or is_bool
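is_nullable_dtype only treats the pandas extension dtypes (Int16/Int32/Int64 and boolean) as nullable; plain NumPy-backed dtypes fall through. A quick check, assuming pandas is installed and that the helper still lives in xgboost.data at this point in the tree:

import pandas as pd

from xgboost.data import is_nullable_dtype

print(is_nullable_dtype(pd.Series([1, None], dtype="Int64").dtype))       # True
print(is_nullable_dtype(pd.Series([True, None], dtype="boolean").dtype))  # True
print(is_nullable_dtype(pd.Series([1, 2], dtype="int64").dtype))          # False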
@@ -331,11 +340,11 @@ def _pandas_cat_null(data: DataFrame) -> DataFrame:
 def _transform_pandas_df(
     data: DataFrame,
     enable_categorical: bool,
-    feature_names: FeatureNames = None,
-    feature_types: FeatureTypes = None,
+    feature_names: Optional[FeatureNames] = None,
+    feature_types: Optional[FeatureTypes] = None,
     meta: Optional[str] = None,
-    meta_type: Optional[str] = None,
-) -> Tuple[np.ndarray, FeatureNames, FeatureTypes]:
+    meta_type: Optional[NumpyDType] = None,
+) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]:
     from pandas.api.types import (
         is_sparse,
         is_categorical_dtype,
@@ -359,7 +368,7 @@ def _transform_pandas_df(
     if meta and len(data.columns) > 1 and meta not in _matrix_meta:
         raise ValueError(f"DataFrame for {meta} cannot have multiple columns")
-    dtype: Union[Type[np.floating], str] = meta_type if meta_type else np.float32
+    dtype = meta_type if meta_type else np.float32
     arr: np.ndarray = transformed.values
     if meta_type:
         arr = arr.astype(dtype)
@@ -369,18 +378,18 @@ def _transform_pandas_df(
 def _from_pandas_df(
     data: DataFrame,
     enable_categorical: bool,
-    missing: float,
+    missing: FloatCompatible,
     nthread: int,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]:
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     data, feature_names, feature_types = _transform_pandas_df(
         data, enable_categorical, feature_names, feature_types
     )
     return _from_numpy_array(data, missing, nthread, feature_names, feature_types)
-def _is_pandas_series(data):
+def _is_pandas_series(data: DataType) -> bool:
     try:
         import pandas as pd
     except ImportError:
@@ -389,18 +398,21 @@ def _is_pandas_series(data):
 def _meta_from_pandas_series(
-    data, name: str, dtype: Optional[str], handle: ctypes.c_void_p
+    data: DataType,
+    name: str,
+    dtype: Optional[NumpyDType],
+    handle: ctypes.c_void_p
 ) -> None:
     """Help transform pandas series for meta data like labels"""
     data = data.values.astype('float')
     from pandas.api.types import is_sparse
     if is_sparse(data):
-        data = data.to_dense()
+        data = data.to_dense()  # type: ignore
     assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1
     _meta_from_numpy(data, name, dtype, handle)
-def _is_modin_series(data):
+def _is_modin_series(data: DataType) -> bool:
     try:
         import modin.pandas as pd
     except ImportError:
@@ -409,13 +421,13 @@ def _is_modin_series(data):
 def _from_pandas_series(
-    data,
-    missing: float,
+    data: DataType,
+    missing: FloatCompatible,
     nthread: int,
     enable_categorical: bool,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     from pandas.api.types import is_categorical_dtype
     if (data.dtype.name not in _pandas_dtype_mapper) and not (
@@ -433,7 +445,7 @@ def _from_pandas_series(
     )
-def _is_dt_df(data):
+def _is_dt_df(data: DataType) -> bool:
     return lazy_isinstance(data, 'datatable', 'Frame') or \
         lazy_isinstance(data, 'datatable', 'DataTable')
@@ -443,12 +455,12 @@ _dt_type_mapper2 = {'bool': 'i', 'int': 'int', 'real': 'float'}
 def _transform_dt_df(
-    data,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-    meta=None,
-    meta_type=None,
-):
+    data: DataType,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+    meta: Optional[str] = None,
+    meta_type: Optional[NumpyDType] = None,
+) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]:
     """Validate feature names and types if data table"""
     if meta and data.shape[1] > 1:
         raise ValueError('DataTable for meta info cannot have multiple columns')
@@ -482,13 +494,13 @@ def _transform_dt_df(
 def _from_dt_df(
-    data,
-    missing,
-    nthread,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
+    data: DataType,
+    missing: Optional[FloatCompatible],
+    nthread: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
     enable_categorical: bool,
-) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]:
+) -> DispatchedDataBackendReturnType:
     if enable_categorical:
         raise ValueError("categorical data in datatable is not supported yet.")
     data, feature_names, feature_types = _transform_dt_df(
@@ -525,7 +537,7 @@ def _from_dt_df(
     return handle, feature_names, feature_types
-def _is_arrow(data) -> bool:
+def _is_arrow(data: DataType) -> bool:
     try:
         import pyarrow as pa
         from pyarrow import dataset as arrow_dataset
@@ -571,13 +583,13 @@ def record_batch_data_iter(data_iter: Iterator) -> Callable:
 def _from_arrow(
-    data,
-    missing: float,
+    data: DataType,
+    missing: FloatCompatible,
     nthread: int,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
     enable_categorical: bool,
-) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]:
+) -> DispatchedDataBackendReturnType:
     import pyarrow as pa
     if not all(
@@ -605,11 +617,11 @@ def _from_arrow(
     return handle, feature_names, feature_types
-def _is_cudf_df(data) -> bool:
+def _is_cudf_df(data: DataType) -> bool:
     return lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
-def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
+def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes:
     """Extract CuDF __cuda_array_interface__. This is special as it returns a new list of
     data and a list of array interfaces. The data is list of categorical codes that
     caller can safely ignore, but have to keep their reference alive until usage of array
@@ -645,11 +657,11 @@ def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
 def _transform_cudf_df(
-    data,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
+    data: DataType,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
     enable_categorical: bool,
-):
+) -> Tuple[ctypes.c_void_p, list, Optional[FeatureNames], Optional[FeatureTypes]]:
     try:
         from cudf.api.types import is_categorical_dtype
     except ImportError:
@@ -709,13 +721,13 @@ def _transform_cudf_df(
 def _from_cudf_df(
-    data,
-    missing,
-    nthread,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
+    data: DataType,
+    missing: FloatCompatible,
+    nthread: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
     enable_categorical: bool,
-) -> Tuple[ctypes.c_void_p, Any, Any]:
+) -> DispatchedDataBackendReturnType:
     data, cat_codes, feature_names, feature_types = _transform_cudf_df(
         data, feature_names, feature_types, enable_categorical
     )
@@ -732,7 +744,7 @@ def _from_cudf_df(
     return handle, feature_names, feature_types
-def _is_cudf_ser(data):
+def _is_cudf_ser(data: DataType) -> bool:
     try:
         import cudf
     except ImportError:
@@ -740,13 +752,13 @@ def _is_cudf_ser(data):
     return isinstance(data, cudf.Series)
-def _is_cupy_array(data: Any) -> bool:
+def _is_cupy_array(data: DataType) -> bool:
     return lazy_isinstance(data, "cupy.core.core", "ndarray") or lazy_isinstance(
         data, "cupy._core.core", "ndarray"
     )
-def _transform_cupy_array(data):
+def _transform_cupy_array(data: DataType) -> CupyT:
     import cupy  # pylint: disable=import-error
     if not hasattr(data, '__cuda_array_interface__') and hasattr(
             data, '__array__'):
@@ -757,12 +769,12 @@ def _transform_cupy_array(data):
 def _from_cupy_array(
-    data,
-    missing,
-    nthread,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    data: DataType,
+    missing: FloatCompatible,
+    nthread: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     """Initialize DMatrix from cupy ndarray."""
     data = _transform_cupy_array(data)
     interface_str = _cuda_array_interface(data)
@@ -776,7 +788,7 @@ def _from_cupy_array(
     return handle, feature_names, feature_types
-def _is_cupy_csr(data):
+def _is_cupy_csr(data: DataType) -> bool:
     try:
         import cupyx
     except ImportError:
@@ -784,7 +796,7 @@ def _is_cupy_csr(data):
     return isinstance(data, cupyx.scipy.sparse.csr_matrix)
-def _is_cupy_csc(data):
+def _is_cupy_csc(data: DataType) -> bool:
     try:
         import cupyx
     except ImportError:
@@ -792,11 +804,11 @@ def _is_cupy_csc(data):
     return isinstance(data, cupyx.scipy.sparse.csc_matrix)
-def _is_dlpack(data):
+def _is_dlpack(data: DataType) -> bool:
     return 'PyCapsule' in str(type(data)) and "dltensor" in str(data)
-def _transform_dlpack(data):
+def _transform_dlpack(data: DataType) -> bool:
     from cupy import fromDlpack  # pylint: disable=E0401
     assert 'used_dltensor' not in str(data)
     data = fromDlpack(data)
@@ -804,27 +816,27 @@ def _transform_dlpack(data):
 def _from_dlpack(
-    data,
-    missing,
-    nthread,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    data: DataType,
+    missing: FloatCompatible,
+    nthread: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     data = _transform_dlpack(data)
     return _from_cupy_array(data, missing, nthread, feature_names,
                             feature_types)
-def _is_uri(data):
+def _is_uri(data: DataType) -> bool:
     return isinstance(data, (str, os.PathLike))
 def _from_uri(
-    data,
-    missing,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    data: DataType,
+    missing: Optional[FloatCompatible],
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     _warn_unused_missing(data, missing)
     handle = ctypes.c_void_p()
     data = os.fspath(os.path.expanduser(data))
return handle, feature_names, feature_types return handle, feature_names, feature_types
def _is_list(data): def _is_list(data: DataType) -> bool:
return isinstance(data, list) return isinstance(data, list)
def _from_list( def _from_list(
data, data: Sequence,
missing, missing: FloatCompatible,
n_threads, n_threads: int,
feature_names: FeatureNames, feature_names: Optional[FeatureNames],
feature_types: FeatureTypes, feature_types: Optional[FeatureTypes],
): ) -> DispatchedDataBackendReturnType:
array = np.array(data) array = np.array(data)
_check_data_shape(data) _check_data_shape(data)
return _from_numpy_array(array, missing, n_threads, feature_names, feature_types) return _from_numpy_array(array, missing, n_threads, feature_names, feature_types)
def _is_tuple(data): def _is_tuple(data: DataType) -> bool:
return isinstance(data, tuple) return isinstance(data, tuple)
def _from_tuple( def _from_tuple(
data, data: Sequence,
missing, missing: FloatCompatible,
n_threads, n_threads: int,
feature_names: FeatureNames, feature_names: Optional[FeatureNames],
feature_types: FeatureTypes, feature_types: Optional[FeatureTypes],
): ) -> DispatchedDataBackendReturnType:
return _from_list(data, missing, n_threads, feature_names, feature_types) return _from_list(data, missing, n_threads, feature_names, feature_types)
def _is_iter(data): def _is_iter(data: DataType) -> bool:
return isinstance(data, DataIter) return isinstance(data, DataIter)
def _has_array_protocol(data): def _has_array_protocol(data: DataType) -> bool:
return hasattr(data, '__array__') return hasattr(data, '__array__')
def _convert_unknown_data(data): def _convert_unknown_data(data: DataType) -> DataType:
warnings.warn( warnings.warn(
f'Unknown data type: {type(data)}, trying to convert it to csr_matrix', f'Unknown data type: {type(data)}, trying to convert it to csr_matrix',
UserWarning UserWarning
) )
try: try:
import scipy import scipy.sparse
except ImportError: except ImportError:
return None return None
@ -891,13 +903,13 @@ def _convert_unknown_data(data):
def dispatch_data_backend( def dispatch_data_backend(
data, data: DataType,
missing, missing: FloatCompatible, # Or Optional[Float]
threads, threads: int,
feature_names: FeatureNames, feature_names: Optional[FeatureNames],
feature_types: FeatureTypes, feature_types: Optional[FeatureTypes],
enable_categorical: bool = False, enable_categorical: bool = False,
): ) -> DispatchedDataBackendReturnType:
'''Dispatch data for DMatrix.''' '''Dispatch data for DMatrix.'''
if not _is_cudf_ser(data) and not _is_pandas_series(data): if not _is_cudf_ser(data) and not _is_pandas_series(data):
_check_data_shape(data) _check_data_shape(data)
@ -964,7 +976,7 @@ def dispatch_data_backend(
raise TypeError('Not supported type for data.' + str(type(data))) raise TypeError('Not supported type for data.' + str(type(data)))
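For context, every _from_* backend above hands back the same (handle, feature_names, feature_types) triple, which is what DispatchedDataBackendReturnType is assumed to alias. A hedged sketch of driving the dispatcher directly with a numpy array; this is an internal helper that is normally reached through the DMatrix constructor:

    import numpy as np
    from xgboost.data import dispatch_data_backend  # internal module, shown for illustration

    X = np.random.rand(4, 2).astype(np.float32)
    handle, fnames, ftypes = dispatch_data_backend(
        X, missing=np.nan, threads=1,
        feature_names=["f0", "f1"], feature_types=None,
    )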
def _to_data_type(dtype: str, name: str): def _to_data_type(dtype: str, name: str) -> int:
dtype_map = {'float32': 1, 'float64': 2, 'uint32': 3, 'uint64': 4} dtype_map = {'float32': 1, 'float64': 2, 'uint32': 3, 'uint64': 4}
if dtype not in dtype_map: if dtype not in dtype_map:
raise TypeError( raise TypeError(
@ -973,7 +985,7 @@ def _to_data_type(dtype: str, name: str):
return dtype_map[dtype] return dtype_map[dtype]
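The mapping mirrors the field-type enum expected by the C API; for illustration, the helper behaves like this (hypothetical calls derived from the dtype_map above):

    _to_data_type("float32", "label")   # returns 1
    _to_data_type("uint64", "group")    # returns 4
    _to_data_type("int64", "label")     # raises TypeError (int64 is not in the map)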
def _validate_meta_shape(data: Any, name: str) -> None: def _validate_meta_shape(data: DataType, name: str) -> None:
if hasattr(data, "shape"): if hasattr(data, "shape"):
msg = f"Invalid shape: {data.shape} for {name}" msg = f"Invalid shape: {data.shape} for {name}"
if name in _matrix_meta: if name in _matrix_meta:
@ -990,7 +1002,7 @@ def _validate_meta_shape(data: Any, name: str) -> None:
def _meta_from_numpy( def _meta_from_numpy(
data: np.ndarray, data: np.ndarray,
field: str, field: str,
dtype: Optional[Union[np.dtype, str]], dtype: Optional[NumpyDType],
handle: ctypes.c_void_p, handle: ctypes.c_void_p,
) -> None: ) -> None:
data, dtype = _ensure_np_dtype(data, dtype) data, dtype = _ensure_np_dtype(data, dtype)
@ -1001,16 +1013,26 @@ def _meta_from_numpy(
_check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface_str)) _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface_str))
def _meta_from_list(data, field, dtype, handle): def _meta_from_list(
data = np.array(data) data: Sequence,
_meta_from_numpy(data, field, dtype, handle) field: str,
dtype: Optional[NumpyDType],
handle: ctypes.c_void_p
) -> None:
data_np = np.array(data)
_meta_from_numpy(data_np, field, dtype, handle)
def _meta_from_tuple(data, field, dtype, handle): def _meta_from_tuple(
data: Sequence,
field: str,
dtype: Optional[NumpyDType],
handle: ctypes.c_void_p
) -> None:
return _meta_from_list(data, field, dtype, handle) return _meta_from_list(data, field, dtype, handle)
def _meta_from_cudf_df(data, field: str, handle: ctypes.c_void_p) -> None: def _meta_from_cudf_df(data: DataType, field: str, handle: ctypes.c_void_p) -> None:
if field not in _matrix_meta: if field not in _matrix_meta:
_meta_from_cudf_series(data.iloc[:, 0], field, handle) _meta_from_cudf_series(data.iloc[:, 0], field, handle)
else: else:
@ -1019,7 +1041,7 @@ def _meta_from_cudf_df(data, field: str, handle: ctypes.c_void_p) -> None:
_check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface)) _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface))
def _meta_from_cudf_series(data, field, handle): def _meta_from_cudf_series(data: DataType, field: str, handle: ctypes.c_void_p) -> None:
interface = bytes(json.dumps([data.__cuda_array_interface__], interface = bytes(json.dumps([data.__cuda_array_interface__],
indent=2), 'utf-8') indent=2), 'utf-8')
_check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle,
@ -1027,7 +1049,7 @@ def _meta_from_cudf_series(data, field, handle):
interface)) interface))
def _meta_from_cupy_array(data, field, handle): def _meta_from_cupy_array(data: DataType, field: str, handle: ctypes.c_void_p) -> None:
data = _transform_cupy_array(data) data = _transform_cupy_array(data)
interface = bytes(json.dumps([data.__cuda_array_interface__], interface = bytes(json.dumps([data.__cuda_array_interface__],
indent=2), 'utf-8') indent=2), 'utf-8')
@ -1036,14 +1058,22 @@ def _meta_from_cupy_array(data, field, handle):
interface)) interface))
def _meta_from_dt(data, field: str, dtype, handle: ctypes.c_void_p): def _meta_from_dt(
data: DataType,
field: str,
dtype: Optional[NumpyDType],
handle: ctypes.c_void_p
) -> None:
data, _, _ = _transform_dt_df(data, None, None, field, dtype) data, _, _ = _transform_dt_df(data, None, None, field, dtype)
_meta_from_numpy(data, field, dtype, handle) _meta_from_numpy(data, field, dtype, handle)
def dispatch_meta_backend( def dispatch_meta_backend(
matrix: DMatrix, data, name: str, dtype: Optional[Union[str, np.dtype]] = None matrix: DMatrix,
): data: DataType,
name: str,
dtype: Optional[NumpyDType] = None
) -> None:
'''Dispatch for meta info.''' '''Dispatch for meta info.'''
handle = matrix.handle handle = matrix.handle
assert handle is not None assert handle is not None
@ -1060,8 +1090,7 @@ def dispatch_meta_backend(
_meta_from_numpy(data, name, dtype, handle) _meta_from_numpy(data, name, dtype, handle)
return return
if _is_pandas_df(data): if _is_pandas_df(data):
data, _, _ = _transform_pandas_df(data, False, meta=name, data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype)
meta_type=dtype)
_meta_from_numpy(data, name, dtype, handle) _meta_from_numpy(data, name, dtype, handle)
return return
if _is_pandas_series(data): if _is_pandas_series(data):
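Stepping back from the individual branches: the _meta_from_* helpers above are reached through DMatrix.set_info, which forwards each keyword to dispatch_meta_backend with the matching field name. A hedged usage sketch with arbitrary data:

    import numpy as np
    import xgboost as xgb

    dtrain = xgb.DMatrix(np.random.rand(8, 3))
    # Each keyword below ends up in dispatch_meta_backend as a separate field.
    dtrain.set_info(label=np.random.randint(0, 2, size=8), weight=np.ones(8))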
@ -1107,7 +1136,7 @@ class SingleBatchInternalIter(DataIter): # pylint: disable=R0902
area for meta info. area for meta info.
''' '''
def __init__(self, **kwargs: Any): def __init__(self, **kwargs: Any) -> None:
self.kwargs = kwargs self.kwargs = kwargs
self.it = 0 # pylint: disable=invalid-name self.it = 0 # pylint: disable=invalid-name
super().__init__() super().__init__()
@ -1124,11 +1153,13 @@ class SingleBatchInternalIter(DataIter): # pylint: disable=R0902
def _proxy_transform( def _proxy_transform(
data, data: DataType,
feature_names: FeatureNames, feature_names: Optional[FeatureNames],
feature_types: FeatureTypes, feature_types: Optional[FeatureTypes],
enable_categorical: bool, enable_categorical: bool,
): ) -> Tuple[
Union[bool, ctypes.c_void_p, np.ndarray],
Optional[list], Optional[FeatureNames], Optional[FeatureTypes]]:
if _is_cudf_df(data) or _is_cudf_ser(data): if _is_cudf_df(data) or _is_cudf_ser(data):
return _transform_cudf_df( return _transform_cudf_df(
data, feature_names, feature_types, enable_categorical data, feature_names, feature_types, enable_categorical
@ -1152,7 +1183,7 @@ def _proxy_transform(
def dispatch_proxy_set_data( def dispatch_proxy_set_data(
proxy: _ProxyDMatrix, proxy: _ProxyDMatrix,
data: Any, data: DataType,
cat_codes: Optional[list], cat_codes: Optional[list],
allow_host: bool, allow_host: bool,
) -> None: ) -> None:
@ -1162,11 +1193,11 @@ def dispatch_proxy_set_data(
if _is_cudf_df(data): if _is_cudf_df(data):
# pylint: disable=W0212 # pylint: disable=W0212
proxy._set_data_from_cuda_columnar(data, cat_codes) proxy._set_data_from_cuda_columnar(data, cast(List, cat_codes))
return return
if _is_cudf_ser(data): if _is_cudf_ser(data):
# pylint: disable=W0212 # pylint: disable=W0212
proxy._set_data_from_cuda_columnar(data, cat_codes) proxy._set_data_from_cuda_columnar(data, cast(List, cat_codes))
return return
if _is_cupy_array(data): if _is_cupy_array(data):
proxy._set_data_from_cuda_interface(data) # pylint: disable=W0212 proxy._set_data_from_cuda_interface(data) # pylint: disable=W0212
View File
@ -4,16 +4,34 @@
"""Plotting Library.""" """Plotting Library."""
from io import BytesIO from io import BytesIO
import json import json
from typing import Optional, Any
import numpy as np import numpy as np
from ._typing import PathLike
from .core import Booster from .core import Booster
from .sklearn import XGBModel from .sklearn import XGBModel
Axes = Any # real type is matplotlib.axes.Axes
GraphvizSource = Any # real type is graphviz.Source
def plot_importance(booster, ax=None, height=0.2,
xlim=None, ylim=None, title='Feature importance', def plot_importance(
xlabel='F score', ylabel='Features', fmap='', booster: Booster,
importance_type='weight', max_num_features=None, ax: Optional[Axes] = None,
grid=True, show_values=True, **kwargs): height: float = 0.2,
xlim: Optional[tuple] = None,
ylim: Optional[tuple] = None,
title: str = "Feature importance",
xlabel: str = "F score",
ylabel: str = "Features",
fmap: PathLike = "",
importance_type: str = "weight",
max_num_features: Optional[int] = None,
grid: bool = True,
show_values: bool = True,
**kwargs: Any
) -> Axes:
"""Plot importance based on fitted trees. """Plot importance based on fitted trees.
Parameters Parameters
@ -78,9 +96,9 @@ def plot_importance(booster, ax=None, height=0.2,
tuples = [(k, importance[k]) for k in importance] tuples = [(k, importance[k]) for k in importance]
if max_num_features is not None: if max_num_features is not None:
# pylint: disable=invalid-unary-operand-type # pylint: disable=invalid-unary-operand-type
tuples = sorted(tuples, key=lambda x: x[1])[-max_num_features:] tuples = sorted(tuples, key=lambda _x: _x[1])[-max_num_features:]
else: else:
tuples = sorted(tuples, key=lambda x: x[1]) tuples = sorted(tuples, key=lambda _x: _x[1])
labels, values = zip(*tuples) labels, values = zip(*tuples)
if ax is None: if ax is None:
@ -120,9 +138,17 @@ def plot_importance(booster, ax=None, height=0.2,
return ax return ax
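A hedged usage sketch for the typed plot_importance signature above (assumes matplotlib is installed; data and parameters are arbitrary):

    import numpy as np
    import matplotlib.pyplot as plt
    import xgboost as xgb

    X, y = np.random.rand(32, 4), np.random.randint(0, 2, size=32)
    bst = xgb.train({"objective": "binary:logistic"}, xgb.DMatrix(X, label=y), num_boost_round=5)

    ax = xgb.plot_importance(bst, max_num_features=3, importance_type="gain", height=0.4)
    plt.show()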
def to_graphviz(booster, fmap='', num_trees=0, rankdir=None, def to_graphviz(
yes_color=None, no_color=None, booster: Booster,
condition_node_params=None, leaf_node_params=None, **kwargs): fmap: PathLike = "",
num_trees: int = 0,
rankdir: Optional[str] = None,
yes_color: Optional[str] = None,
no_color: Optional[str] = None,
condition_node_params: Optional[dict] = None,
leaf_node_params: Optional[dict] = None,
**kwargs: Any
) -> GraphvizSource:
"""Convert specified tree to graphviz instance. IPython can automatically plot """Convert specified tree to graphviz instance. IPython can automatically plot
the returned graphviz instance. Otherwise, you should call .render() method the returned graphviz instance. Otherwise, you should call .render() method
of the returned graphviz instance. of the returned graphviz instance.
@ -212,7 +238,14 @@ def to_graphviz(booster, fmap='', num_trees=0, rankdir=None,
return g return g
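A matching sketch for to_graphviz, assuming the graphviz Python package is installed and reusing the trained bst from the plot_importance example above:

    g = xgb.to_graphviz(bst, num_trees=0, rankdir="LR")  # returns a graphviz Source
    g.render("tree0")                                     # writes tree0 and tree0.pdf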
def plot_tree(booster, fmap='', num_trees=0, rankdir=None, ax=None, **kwargs): def plot_tree(
booster: Booster,
fmap: PathLike = "",
num_trees: int = 0,
rankdir: Optional[str] = None,
ax: Optional[Axes] = None,
**kwargs: Any
) -> Axes:
"""Plot specified tree. """Plot specified tree.
Parameters Parameters
View File
@ -4,8 +4,19 @@ import copy
import warnings import warnings
import json import json
import os import os
from typing import Union, Optional, List, Dict, Callable, Tuple, Any, TypeVar, Type, cast from typing import (
from typing import Sequence Union,
Optional,
List,
Dict,
Callable,
Sequence,
Tuple,
Any,
TypeVar,
Type,
cast,
)
import numpy as np import numpy as np
from .core import Booster, DMatrix, XGBoostError from .core import Booster, DMatrix, XGBoostError
@ -14,7 +25,7 @@ from .core import Metric
from .training import train from .training import train
from .callback import TrainingCallback from .callback import TrainingCallback
from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array
from ._typing import ArrayLike, FeatureTypes from ._typing import ArrayLike, FeatureNames, FeatureTypes
# Do not use class names on scikit-learn directly. Re-define the classes on # Do not use class names on scikit-learn directly. Re-define the classes on
# .compat to guarantee the behavior without scikit-learn # .compat to guarantee the behavior without scikit-learn
@ -401,7 +412,7 @@ def _wrap_evaluation_matrices(
eval_qid: Optional[Sequence[Any]], eval_qid: Optional[Sequence[Any]],
create_dmatrix: Callable, create_dmatrix: Callable,
enable_categorical: bool, enable_categorical: bool,
feature_types: FeatureTypes, feature_types: Optional[FeatureTypes],
) -> Tuple[Any, List[Tuple[Any, str]]]: ) -> Tuple[Any, List[Tuple[Any, str]]]:
"""Convert array_like evaluation matrices into DMatrix. Perform validation on the way. """Convert array_like evaluation matrices into DMatrix. Perform validation on the way.
@ -717,7 +728,7 @@ class XGBModel(XGBModelBase):
return self._estimator_type # pylint: disable=no-member return self._estimator_type # pylint: disable=no-member
def save_model(self, fname: Union[str, os.PathLike]) -> None: def save_model(self, fname: Union[str, os.PathLike]) -> None:
meta = {} meta: Dict[str, Any] = {}
for k, v in self.__dict__.items(): for k, v in self.__dict__.items():
if k == '_le': if k == '_le':
meta['_le'] = self._le.to_json() meta['_le'] = self._le.to_json()
@ -1231,7 +1242,7 @@ class XGBModel(XGBModelBase):
importance_type=self.importance_type if self.importance_type else dft() importance_type=self.importance_type if self.importance_type else dft()
) )
if b.feature_names is None: if b.feature_names is None:
feature_names = [f"f{i}" for i in range(self.n_features_in_)] feature_names: FeatureNames = [f"f{i}" for i in range(self.n_features_in_)]
else: else:
feature_names = b.feature_names feature_names = b.feature_names
# gblinear returns all features so the `get` in next line is only for gbtree. # gblinear returns all features so the `get` in next line is only for gbtree.
View File
@ -5,20 +5,24 @@
import copy import copy
import os import os
import warnings import warnings
from typing import Optional, Dict, Any, Union, Tuple, Sequence from typing import Optional, Dict, Any, Union, Tuple, Sequence, List, cast, Iterable
import numpy as np import numpy as np
from .callback import TrainingCallback, CallbackContainer, EvaluationMonitor, EarlyStopping
from .core import Booster, DMatrix, XGBoostError, _deprecate_positional_args from .core import Booster, DMatrix, XGBoostError, _deprecate_positional_args
from .core import Metric, Objective from .core import Metric, Objective
from .compat import (SKLEARN_INSTALLED, XGBStratifiedKFold) from .compat import SKLEARN_INSTALLED, XGBStratifiedKFold, DataFrame
from . import callback from ._typing import _F, FPreProcCallable, BoosterParam
_CVFolds = Sequence["CVPack"]
def _assert_new_callback( def _assert_new_callback(
callbacks: Optional[Sequence[callback.TrainingCallback]] callbacks: Optional[Sequence[TrainingCallback]]
) -> None: ) -> None:
is_new_callback: bool = not callbacks or all( is_new_callback: bool = not callbacks or all(
isinstance(c, callback.TrainingCallback) for c in callbacks isinstance(c, TrainingCallback) for c in callbacks
) )
if not is_new_callback: if not is_new_callback:
link = "https://xgboost.readthedocs.io/en/latest/python/callbacks.html" link = "https://xgboost.readthedocs.io/en/latest/python/callbacks.html"
@ -56,10 +60,10 @@ def train(
feval: Optional[Metric] = None, feval: Optional[Metric] = None,
maximize: Optional[bool] = None, maximize: Optional[bool] = None,
early_stopping_rounds: Optional[int] = None, early_stopping_rounds: Optional[int] = None,
evals_result: callback.TrainingCallback.EvalsLog = None, evals_result: TrainingCallback.EvalsLog = None,
verbose_eval: Optional[Union[bool, int]] = True, verbose_eval: Optional[Union[bool, int]] = True,
xgb_model: Optional[Union[str, os.PathLike, Booster, bytearray]] = None, xgb_model: Optional[Union[str, os.PathLike, Booster, bytearray]] = None,
callbacks: Optional[Sequence[callback.TrainingCallback]] = None, callbacks: Optional[Sequence[TrainingCallback]] = None,
custom_metric: Optional[Metric] = None, custom_metric: Optional[Metric] = None,
) -> Booster: ) -> Booster:
"""Train a booster with given parameters. """Train a booster with given parameters.
@ -159,12 +163,12 @@ def train(
_assert_new_callback(callbacks) _assert_new_callback(callbacks)
if verbose_eval: if verbose_eval:
verbose_eval = 1 if verbose_eval is True else verbose_eval verbose_eval = 1 if verbose_eval is True else verbose_eval
callbacks.append(callback.EvaluationMonitor(period=verbose_eval)) callbacks.append(EvaluationMonitor(period=verbose_eval))
if early_stopping_rounds: if early_stopping_rounds:
callbacks.append( callbacks.append(
callback.EarlyStopping(rounds=early_stopping_rounds, maximize=maximize) EarlyStopping(rounds=early_stopping_rounds, maximize=maximize)
) )
cb_container = callback.CallbackContainer( cb_container = CallbackContainer(
callbacks, callbacks,
metric=metric_fn, metric=metric_fn,
# For old `feval` parameter, the behavior is unchanged. For the new # For old `feval` parameter, the behavior is unchanged. For the new
@ -194,71 +198,73 @@ def train(
class CVPack: class CVPack:
""""Auxiliary datastruct to hold one fold of CV.""" """"Auxiliary datastruct to hold one fold of CV."""
def __init__(self, dtrain, dtest, param): def __init__(self, dtrain: DMatrix, dtest: DMatrix, param: Optional[Union[Dict, List]]) -> None:
""""Initialize the CVPack""" """"Initialize the CVPack"""
self.dtrain = dtrain self.dtrain = dtrain
self.dtest = dtest self.dtest = dtest
self.watchlist = [(dtrain, 'train'), (dtest, 'test')] self.watchlist = [(dtrain, 'train'), (dtest, 'test')]
self.bst = Booster(param, [dtrain, dtest]) self.bst = Booster(param, [dtrain, dtest])
def __getattr__(self, name): def __getattr__(self, name: str) -> _F:
def _inner(*args, **kwargs): def _inner(*args: Any, **kwargs: Any) -> Any:
return getattr(self.bst, name)(*args, **kwargs) return getattr(self.bst, name)(*args, **kwargs)
return _inner return cast(_F, _inner)
def update(self, iteration, fobj): def update(self, iteration: int, fobj: Optional[Objective]) -> None:
""""Update the boosters for one iteration""" """"Update the boosters for one iteration"""
self.bst.update(self.dtrain, iteration, fobj) self.bst.update(self.dtrain, iteration, fobj)
def eval(self, iteration, feval, output_margin): def eval(self, iteration: int, feval: Optional[Metric], output_margin: bool) -> str:
""""Evaluate the CVPack for one iteration.""" """"Evaluate the CVPack for one iteration."""
return self.bst.eval_set(self.watchlist, iteration, feval, output_margin) return self.bst.eval_set(self.watchlist, iteration, feval, output_margin)
class _PackedBooster: class _PackedBooster:
def __init__(self, cvfolds) -> None: def __init__(self, cvfolds: _CVFolds) -> None:
self.cvfolds = cvfolds self.cvfolds = cvfolds
def update(self, iteration, obj): def update(self, iteration: int, obj: Optional[Objective]) -> None:
'''Iterate through folds for update''' '''Iterate through folds for update'''
for fold in self.cvfolds: for fold in self.cvfolds:
fold.update(iteration, obj) fold.update(iteration, obj)
def eval(self, iteration, feval, output_margin): def eval(self, iteration: int, feval: Optional[Metric], output_margin: bool) -> List[str]:
'''Iterate through folds for eval''' '''Iterate through folds for eval'''
result = [f.eval(iteration, feval, output_margin) for f in self.cvfolds] result = [f.eval(iteration, feval, output_margin) for f in self.cvfolds]
return result return result
def set_attr(self, **kwargs): def set_attr(self, **kwargs: Optional[str]) -> Any:
'''Iterate through folds for setting attributes''' '''Iterate through folds for setting attributes'''
for f in self.cvfolds: for f in self.cvfolds:
f.bst.set_attr(**kwargs) f.bst.set_attr(**kwargs)
def attr(self, key): def attr(self, key: str) -> Optional[str]:
'''Redirect to booster attr.''' '''Redirect to booster attr.'''
return self.cvfolds[0].bst.attr(key) return self.cvfolds[0].bst.attr(key)
def set_param(self, params, value=None): def set_param(self,
params: Union[Dict, Iterable[Tuple[str, Any]], str],
value: Optional[str] = None) -> None:
"""Iterate through folds for set_param""" """Iterate through folds for set_param"""
for f in self.cvfolds: for f in self.cvfolds:
f.bst.set_param(params, value) f.bst.set_param(params, value)
def num_boosted_rounds(self): def num_boosted_rounds(self) -> int:
'''Number of boosted rounds.''' '''Number of boosted rounds.'''
return self.cvfolds[0].num_boosted_rounds() return self.cvfolds[0].num_boosted_rounds()
@property @property
def best_iteration(self): def best_iteration(self) -> int:
'''Get best_iteration''' '''Get best_iteration'''
return int(self.cvfolds[0].bst.attr("best_iteration")) return int(cast(int, self.cvfolds[0].bst.attr("best_iteration")))
@property @property
def best_score(self): def best_score(self) -> float:
"""Get best_score.""" """Get best_score."""
return float(self.cvfolds[0].bst.attr("best_score")) return float(cast(float, self.cvfolds[0].bst.attr("best_score")))
def groups_to_rows(groups, boundaries): def groups_to_rows(groups: List[np.ndarray], boundaries: np.ndarray) -> np.ndarray:
""" """
Given group row boundaries, convert group indexes to row indexes Given group row boundaries, convert group indexes to row indexes
:param groups: list of groups for testing :param groups: list of groups for testing
@ -268,7 +274,9 @@ def groups_to_rows(groups, boundaries):
return np.concatenate([np.arange(boundaries[g], boundaries[g+1]) for g in groups]) return np.concatenate([np.arange(boundaries[g], boundaries[g+1]) for g in groups])
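A small worked example of that conversion (three groups with boundaries 0/3/5/9; asking for groups 0 and 2 selects rows 0-2 and 5-8):

    import numpy as np

    boundaries = np.array([0, 3, 5, 9])   # group g spans rows boundaries[g]:boundaries[g+1]
    groups = np.array([0, 2])
    rows = np.concatenate([np.arange(boundaries[g], boundaries[g + 1]) for g in groups])
    # rows -> array([0, 1, 2, 5, 6, 7, 8])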
def mkgroupfold(dall, nfold, param, evals=(), fpreproc=None, shuffle=True): def mkgroupfold(dall: DMatrix, nfold: int, param: BoosterParam,
evals: Sequence[str] = (), fpreproc: FPreProcCallable = None,
shuffle: bool = True) -> List[CVPack]:
""" """
Make n folds for cross-validation maintaining groups Make n folds for cross-validation maintaining groups
:return: cross-validation folds :return: cross-validation folds
@ -308,8 +316,10 @@ def mkgroupfold(dall, nfold, param, evals=(), fpreproc=None, shuffle=True):
return ret return ret
def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False, def mknfold(dall: DMatrix, nfold: int, param: BoosterParam, seed: int,
folds=None, shuffle=True): evals: Sequence[str] = (), fpreproc: FPreProcCallable = None,
stratified: bool = False, folds: XGBStratifiedKFold = None, shuffle: bool = True
) -> List[CVPack]:
""" """
Make an n-fold list of CVPack from random indices. Make an n-fold list of CVPack from random indices.
""" """
@ -362,11 +372,27 @@ def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False,
return ret return ret
def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None, def cv(
metrics=(), obj: Optional[Objective] = None, params: BoosterParam,
feval=None, maximize=None, early_stopping_rounds=None, dtrain: DMatrix,
fpreproc=None, as_pandas=True, verbose_eval=None, show_stdv=True, num_boost_round: int = 10,
seed=0, callbacks=None, shuffle=True, custom_metric: Optional[Metric] = None): nfold: int = 3,
stratified: bool = False,
folds: XGBStratifiedKFold = None,
metrics: Sequence[str] = (),
obj: Optional[Objective] = None,
feval: Optional[Metric] = None,
maximize: bool = None,
early_stopping_rounds: int = None,
fpreproc: FPreProcCallable = None,
as_pandas: bool = True,
verbose_eval: Optional[Union[int, bool]] = None,
show_stdv: bool = True,
seed: int = 0,
callbacks: Sequence[TrainingCallback] = None,
shuffle: bool = True,
custom_metric: Optional[Metric] = None,
) -> Union[Dict[str, float], DataFrame]:
# pylint: disable = invalid-name # pylint: disable = invalid-name
"""Cross-validation with given parameters. """Cross-validation with given parameters.
@ -477,7 +503,7 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
params.pop("eval_metric", None) params.pop("eval_metric", None)
results = {} results: Dict[str, List[float]] = {}
cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc, cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc,
stratified, folds, shuffle) stratified, folds, shuffle)
@ -490,13 +516,13 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
if verbose_eval: if verbose_eval:
verbose_eval = 1 if verbose_eval is True else verbose_eval verbose_eval = 1 if verbose_eval is True else verbose_eval
callbacks.append( callbacks.append(
callback.EvaluationMonitor(period=verbose_eval, show_stdv=show_stdv) EvaluationMonitor(period=verbose_eval, show_stdv=show_stdv)
) )
if early_stopping_rounds: if early_stopping_rounds:
callbacks.append( callbacks.append(
callback.EarlyStopping(rounds=early_stopping_rounds, maximize=maximize) EarlyStopping(rounds=early_stopping_rounds, maximize=maximize)
) )
callbacks = callback.CallbackContainer( callbacks_container = CallbackContainer(
callbacks, callbacks,
metric=metric_fn, metric=metric_fn,
is_cv=True, is_cv=True,
@ -504,16 +530,16 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
) )
booster = _PackedBooster(cvfolds) booster = _PackedBooster(cvfolds)
callbacks.before_training(booster) callbacks_container.before_training(booster)
for i in range(num_boost_round): for i in range(num_boost_round):
if callbacks.before_iteration(booster, i, dtrain, None): if callbacks_container.before_iteration(booster, i, dtrain, None):
break break
booster.update(i, obj) booster.update(i, obj)
should_break = callbacks.after_iteration(booster, i, dtrain, None) should_break = callbacks_container.after_iteration(booster, i, dtrain, None)
res = callbacks.aggregated_cv res = callbacks_container.aggregated_cv
for key, mean, std in res: for key, mean, std in cast(List[Tuple[str, float, float]], res):
if key + '-mean' not in results: if key + '-mean' not in results:
results[key + '-mean'] = [] results[key + '-mean'] = []
if key + '-std' not in results: if key + '-std' not in results:
@ -532,6 +558,6 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
except ImportError: except ImportError:
pass pass
callbacks.after_training(booster) callbacks_container.after_training(booster)
return results return results