From 806c92c80ba646ed5b5ec90a12e592a44ccb7a9c Mon Sep 17 00:00:00 2001 From: Chengyang <22215950+bridgream@users.noreply.github.com> Date: Tue, 17 May 2022 10:14:09 -0400 Subject: [PATCH] Add Type Hints for Python Package (#7742) Co-authored-by: Chengyang Gu Co-authored-by: Jiamingy --- python-package/xgboost/_typing.py | 18 +- python-package/xgboost/callback.py | 38 +-- python-package/xgboost/compat.py | 108 +++++---- python-package/xgboost/config.py | 26 ++- python-package/xgboost/core.py | 69 +++--- python-package/xgboost/dask.py | 12 +- python-package/xgboost/data.py | 361 ++++++++++++++++------------- python-package/xgboost/plotting.py | 55 ++++- python-package/xgboost/sklearn.py | 23 +- python-package/xgboost/training.py | 118 ++++++---- 10 files changed, 486 insertions(+), 342 deletions(-) diff --git a/python-package/xgboost/_typing.py b/python-package/xgboost/_typing.py index 64ea9a0a2..b17f5ecb8 100644 --- a/python-package/xgboost/_typing.py +++ b/python-package/xgboost/_typing.py @@ -1,21 +1,32 @@ """Shared typing definition.""" import ctypes import os -from typing import Optional, Any, TypeVar, Union, Sequence +from typing import Any, TypeVar, Union, Type, Sequence, Callable, List, Dict # os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/dt.Frame/ # cudf.DataFrame/cupy.array/dlpack +import numpy as np + DataType = Any # xgboost accepts some other possible types in practice due to historical reason, which is # lesser tested. For now we encourage users to pass a simple list of string. -FeatureNames = Optional[Sequence[str]] -FeatureTypes = Optional[Sequence[str]] +FeatureInfo = Sequence[str] +FeatureNames = FeatureInfo +FeatureTypes = FeatureInfo +BoosterParam = Union[List, Dict] # better be sequence ArrayLike = Any PathLike = Union[str, os.PathLike] CupyT = ArrayLike # maybe need a stub for cupy arrays NumpyOrCupy = Any +NumpyDType = Union[str, Type[np.number]] +PandasDType = Any # real type is pandas.core.dtypes.base.ExtensionDtype + +FloatCompatible = Union[float, np.float32, np.float64] + +# callables +FPreProcCallable = Callable # ctypes # c_bst_ulong corresponds to bst_ulong defined in xgboost/c_api.h @@ -59,3 +70,4 @@ CNumericPtr = ctypes.pointer # template parameter _T = TypeVar("_T") +_F = TypeVar("_F", bound=Callable[..., Any]) diff --git a/python-package/xgboost/callback.py b/python-package/xgboost/callback.py index 32d408f3a..021ccd972 100644 --- a/python-package/xgboost/callback.py +++ b/python-package/xgboost/callback.py @@ -10,8 +10,7 @@ from abc import ABC import collections import os import pickle -from typing import Callable, List, Optional, Union, Dict, Tuple, TypeVar, cast -from typing import Sequence +from typing import Callable, List, Optional, Union, Dict, Tuple, TypeVar, cast, Sequence, Any import numpy from . 
import rabit @@ -24,11 +23,14 @@ __all__ = [ "EarlyStopping", "EvaluationMonitor", "TrainingCheckPoint", + "CallbackContainer" ] _Score = Union[float, Tuple[float, float]] _ScoreList = Union[List[float], List[Tuple[float, float]]] +_Model = Any # real type is Union[Booster, CVPack]; need more work + # pylint: disable=unused-argument class TrainingCallback(ABC): @@ -43,19 +45,19 @@ class TrainingCallback(ABC): def __init__(self) -> None: pass - def before_training(self, model): + def before_training(self, model: _Model) -> _Model: '''Run before training starts.''' return model - def after_training(self, model): + def after_training(self, model: _Model) -> _Model: '''Run after training is finished.''' return model - def before_iteration(self, model, epoch: int, evals_log: EvalsLog) -> bool: + def before_iteration(self, model: _Model, epoch: int, evals_log: EvalsLog) -> bool: '''Run before each iteration. Return True when training should stop.''' return False - def after_iteration(self, model, epoch: int, evals_log: EvalsLog) -> bool: + def after_iteration(self, model: _Model, epoch: int, evals_log: EvalsLog) -> bool: '''Run after each iteration. Return True when training should stop.''' return False @@ -140,7 +142,7 @@ class CallbackContainer: if self.is_cv: self.aggregated_cv = None - def before_training(self, model): + def before_training(self, model: _Model) -> _Model: '''Function called before training.''' for c in self.callbacks: model = c.before_training(model=model) @@ -151,7 +153,7 @@ class CallbackContainer: assert isinstance(model, Booster), msg return model - def after_training(self, model): + def after_training(self, model: _Model) -> _Model: '''Function called after training.''' for c in self.callbacks: model = c.after_training(model=model) @@ -182,7 +184,7 @@ class CallbackContainer: return model def before_iteration( - self, model, epoch: int, dtrain: DMatrix, evals: List[Tuple[DMatrix, str]] + self, model: _Model, epoch: int, dtrain: DMatrix, evals: Optional[List[Tuple[DMatrix, str]]] ) -> bool: '''Function called before training iteration.''' return any(c.before_iteration(model, epoch, self.history) @@ -220,7 +222,7 @@ class CallbackContainer: def after_iteration( self, - model, + model: _Model, epoch: int, dtrain: DMatrix, evals: Optional[List[Tuple[DMatrix, str]]], @@ -276,7 +278,7 @@ class LearningRateScheduler(TrainingCallback): super().__init__() def after_iteration( - self, model, epoch: int, evals_log: TrainingCallback.EvalsLog + self, model: _Model, epoch: int, evals_log: TrainingCallback.EvalsLog ) -> bool: model.set_param("learning_rate", self.learning_rates(epoch)) return False @@ -344,12 +346,12 @@ class EarlyStopping(TrainingCallback): self.starting_round: int = 0 super().__init__() - def before_training(self, model): + def before_training(self, model: _Model) -> _Model: self.starting_round = model.num_boosted_rounds() return model def _update_rounds( - self, score: _Score, name: str, metric: str, model, epoch: int + self, score: _Score, name: str, metric: str, model: _Model, epoch: int ) -> bool: def get_s(x: _Score) -> float: """get score if it's cross validation history.""" @@ -403,7 +405,7 @@ class EarlyStopping(TrainingCallback): return True return False - def after_iteration(self, model, epoch: int, + def after_iteration(self, model: _Model, epoch: int, evals_log: TrainingCallback.EvalsLog) -> bool: epoch += self.starting_round # training continuation msg = 'Must have at least 1 validation dataset for early stopping.' 
@@ -431,7 +433,7 @@ class EarlyStopping(TrainingCallback): score = data_log[metric_name][-1] return self._update_rounds(score, data_name, metric_name, model, epoch) - def after_training(self, model): + def after_training(self, model: _Model) -> _Model: try: if self.save_best: model = model[: int(model.attr("best_iteration")) + 1] @@ -477,7 +479,7 @@ class EvaluationMonitor(TrainingCallback): msg = f"\t{data + '-' + metric}:{score:.5f}" return msg - def after_iteration(self, model, epoch: int, + def after_iteration(self, model: _Model, epoch: int, evals_log: TrainingCallback.EvalsLog) -> bool: if not evals_log: return False @@ -503,7 +505,7 @@ class EvaluationMonitor(TrainingCallback): self._latest = msg return False - def after_training(self, model): + def after_training(self, model: _Model) -> _Model: if rabit.get_rank() == self.printer_rank and self._latest is not None: rabit.tracker_print(self._latest) return model @@ -544,7 +546,7 @@ class TrainingCheckPoint(TrainingCallback): self._epoch = 0 super().__init__() - def after_iteration(self, model, epoch: int, + def after_iteration(self, model: _Model, epoch: int, evals_log: TrainingCallback.EvalsLog) -> bool: if self._epoch == self._iterations: path = os.path.join(self._path, self._name + '_' + str(epoch) + diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py index 1967ffc8e..9727005b4 100644 --- a/python-package/xgboost/compat.py +++ b/python-package/xgboost/compat.py @@ -1,30 +1,32 @@ # coding: utf-8 # pylint: disable= invalid-name, unused-import """For compatibility and optional dependencies.""" -from typing import Any +from typing import Any, Type, Dict, Optional, List import sys import types import importlib.util import logging import numpy as np +from xgboost._typing import CStrPtr + assert (sys.version_info[0] == 3), 'Python 2 is no longer supported.' 
-def py_str(x): +def py_str(x: CStrPtr) -> str: """convert c string back to python string""" - return x.decode('utf-8') + return x.decode('utf-8') # type: ignore -def lazy_isinstance(instance, module, name): +def lazy_isinstance(instance: Type[object], module: str, name: str) -> bool: """Use string representation to identify a type.""" # Notice, we use .__class__ as opposed to type() in order # to support object proxies such as weakref.proxy cls = instance.__class__ - module = cls.__module__ == module - name = cls.__name__ == name - return module and name + is_same_module = cls.__module__ == module + has_same_name = cls.__name__ == name + return is_same_module and has_same_name # pandas @@ -37,53 +39,33 @@ try: except ImportError: MultiIndex = object - DataFrame: Any = object + DataFrame = object Series = object pandas_concat = None PANDAS_INSTALLED = False # sklearn try: - from sklearn.base import BaseEstimator - from sklearn.base import RegressorMixin, ClassifierMixin + from sklearn.base import ( + BaseEstimator as XGBModelBase, + RegressorMixin as XGBRegressorBase, + ClassifierMixin as XGBClassifierBase + ) from sklearn.preprocessing import LabelEncoder try: - from sklearn.model_selection import KFold, StratifiedKFold + from sklearn.model_selection import ( + KFold as XGBKFold, + StratifiedKFold as XGBStratifiedKFold + ) except ImportError: - from sklearn.cross_validation import KFold, StratifiedKFold + from sklearn.cross_validation import ( + KFold as XGBKFold, + StratifiedKFold as XGBStratifiedKFold + ) SKLEARN_INSTALLED = True - XGBModelBase = BaseEstimator - XGBRegressorBase = RegressorMixin - XGBClassifierBase = ClassifierMixin - - XGBKFold = KFold - XGBStratifiedKFold = StratifiedKFold - - class XGBoostLabelEncoder(LabelEncoder): - '''Label encoder with JSON serialization methods.''' - def to_json(self): - '''Returns a JSON compatible dictionary''' - meta = {} - for k, v in self.__dict__.items(): - if isinstance(v, np.ndarray): - meta[k] = v.tolist() - else: - meta[k] = v - return meta - - def from_json(self, doc): - # pylint: disable=attribute-defined-outside-init - '''Load the encoder back from a JSON compatible dict.''' - meta = {} - for k, v in doc.items(): - if k == 'classes_': - self.classes_ = np.array(v) - continue - meta[k] = v - self.__dict__.update(meta) except ImportError: SKLEARN_INSTALLED = False @@ -91,10 +73,34 @@ except ImportError: XGBModelBase = object XGBClassifierBase = object XGBRegressorBase = object + LabelEncoder = object XGBKFold = None XGBStratifiedKFold = None - XGBoostLabelEncoder = None + + +class XGBoostLabelEncoder(LabelEncoder): + '''Label encoder with JSON serialization methods.''' + def to_json(self) -> Dict: + '''Returns a JSON compatible dictionary''' + meta = {} + for k, v in self.__dict__.items(): + if isinstance(v, np.ndarray): + meta[k] = v.tolist() + else: + meta[k] = v + return meta + + def from_json(self, doc: Dict) -> None: + # pylint: disable=attribute-defined-outside-init + '''Load the encoder back from a JSON compatible dict.''' + meta = {} + for k, v in doc.items(): + if k == 'classes_': + self.classes_ = np.array(v) + continue + meta[k] = v + self.__dict__.update(meta) # dask @@ -113,7 +119,7 @@ try: SCIPY_INSTALLED = True except ImportError: scipy_sparse = False - scipy_csr: Any = object + scipy_csr = object SCIPY_INSTALLED = False @@ -136,15 +142,21 @@ class LazyLoader(types.ModuleType): """Lazily import a module, mainly to avoid pulling in large dependencies. 
""" - def __init__(self, local_name, parent_module_globals, name, warning=None): + def __init__( + self, + local_name: str, + parent_module_globals: Dict, + name: str, + warning: Optional[str] = None + ) -> None: self._local_name = local_name self._parent_module_globals = parent_module_globals self._warning = warning - self.module = None + self.module: Optional[types.ModuleType] = None super().__init__(name) - def _load(self): + def _load(self) -> types.ModuleType: """Load the module and insert it into the parent's globals.""" # Import the target module and insert it into the parent's namespace module = importlib.import_module(self.__name__) @@ -163,12 +175,12 @@ class LazyLoader(types.ModuleType): return module - def __getattr__(self, item): + def __getattr__(self, item: str) -> Any: if not self.module: self.module = self._load() return getattr(self.module, item) - def __dir__(self): + def __dir__(self) -> List[str]: if not self.module: self.module = self._load() return dir(self.module) diff --git a/python-package/xgboost/config.py b/python-package/xgboost/config.py index 427ea4ea3..35862def2 100644 --- a/python-package/xgboost/config.py +++ b/python-package/xgboost/config.py @@ -4,12 +4,20 @@ import ctypes import json from contextlib import contextmanager from functools import wraps +from typing import Optional, Callable, Any, Dict, cast, Iterator from .core import _LIB, _check_call, c_str, py_str +from ._typing import _F -def config_doc(*, header=None, extra_note=None, parameters=None, returns=None, - see_also=None): +def config_doc( + *, + header: Optional[str] = None, + extra_note: Optional[str] = None, + parameters: Optional[str] = None, + returns: Optional[str] = None, + see_also: Optional[str] = None +) -> Callable[[_F], _F]: """Decorator to format docstring for config functions. 
Parameters @@ -64,19 +72,19 @@ def config_doc(*, header=None, extra_note=None, parameters=None, returns=None, assert xgb.get_config()['verbosity'] == 2 # old value restored """ - def none_to_str(value): + def none_to_str(value: Optional[str]) -> str: return '' if value is None else value - def config_doc_decorator(func): + def config_doc_decorator(func: _F) -> _F: func.__doc__ = (doc_template.format(header=none_to_str(header), extra_note=none_to_str(extra_note)) + none_to_str(parameters) + none_to_str(returns) + none_to_str(common_example) + none_to_str(see_also)) @wraps(func) - def wrap(*args, **kwargs): + def wrap(*args: Any, **kwargs: Any) -> Any: return func(*args, **kwargs) - return wrap + return cast(_F, wrap) return config_doc_decorator @@ -89,7 +97,7 @@ def config_doc(*, header=None, extra_note=None, parameters=None, returns=None, new_config: Dict[str, Any] Keyword arguments representing the parameters and their values """) -def set_config(**new_config): +def set_config(**new_config: Any) -> None: config = json.dumps(new_config) _check_call(_LIB.XGBSetGlobalConfig(c_str(config))) @@ -103,7 +111,7 @@ def set_config(**new_config): args: Dict[str, Any] The list of global parameters and their values """) -def get_config(): +def get_config() -> Dict[str, Any]: config_str = ctypes.c_char_p() _check_call(_LIB.XGBGetGlobalConfig(ctypes.byref(config_str))) config = json.loads(py_str(config_str.value)) @@ -132,7 +140,7 @@ def get_config(): set_config: Set global XGBoost configuration get_config: Get current values of the global configuration """) -def config_context(**new_config): +def config_context(**new_config: Any) -> Iterator[None]: old_config = get_config().copy() set_config(**new_config) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index a94c9d767..35a5edd32 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -30,10 +30,12 @@ from ._typing import ( ArrayLike, CFloatPtr, NumpyOrCupy, - FeatureNames, + FeatureInfo, FeatureTypes, + FeatureNames, _T, CupyT, + BoosterParam ) @@ -273,7 +275,7 @@ def ctypes2numpy(cptr: CNumericPtr, length: int, dtype: Type[np.number]) -> np.n if not isinstance(cptr, ctypes.POINTER(ctype)): raise RuntimeError(f"expected {ctype} pointer") res = np.zeros(length, dtype=dtype) - if not ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]): + if not ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]): # type: ignore raise RuntimeError("memmove failed") return res @@ -310,7 +312,7 @@ def ctypes2buffer(cptr: CStrPtr, length: int) -> bytearray: raise RuntimeError('expected char pointer') res = bytearray(length) rptr = (ctypes.c_char * length).from_buffer(res) - if not ctypes.memmove(rptr, cptr, length): + if not ctypes.memmove(rptr, cptr, length): # type: ignore raise RuntimeError('memmove failed') return res @@ -434,8 +436,8 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes def data_handle( data: Any, *, - feature_names: FeatureNames = None, - feature_types: Optional[List[str]] = None, + feature_names: Optional[FeatureNames] = None, + feature_types: Optional[FeatureTypes] = None, **kwargs: Any, ) -> None: from .data import dispatch_proxy_set_data @@ -555,8 +557,8 @@ class DMatrix: # pylint: disable=too-many-instance-attributes base_margin: Optional[ArrayLike] = None, missing: Optional[float] = None, silent: bool = False, - feature_names: FeatureNames = None, - feature_types: FeatureTypes = None, + feature_names: Optional[FeatureNames] = None, + feature_types: 
Optional[FeatureTypes] = None, nthread: Optional[int] = None, group: Optional[ArrayLike] = None, qid: Optional[ArrayLike] = None, @@ -718,8 +720,8 @@ class DMatrix: # pylint: disable=too-many-instance-attributes qid: Optional[ArrayLike] = None, label_lower_bound: Optional[ArrayLike] = None, label_upper_bound: Optional[ArrayLike] = None, - feature_names: FeatureNames = None, - feature_types: Optional[List[str]] = None, + feature_names: Optional[FeatureNames] = None, + feature_types: Optional[FeatureTypes] = None, feature_weights: Optional[ArrayLike] = None ) -> None: """Set meta info for DMatrix. See doc string for :py:obj:`xgboost.DMatrix`.""" @@ -1000,7 +1002,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes return res @property - def feature_names(self) -> Optional[List[str]]: + def feature_names(self) -> Optional[FeatureNames]: """Get feature names (column labels). Returns @@ -1023,7 +1025,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes return feature_names @feature_names.setter - def feature_names(self, feature_names: FeatureNames) -> None: + def feature_names(self, feature_names: Optional[FeatureNames]) -> None: """Set feature names (column labels). Parameters @@ -1039,7 +1041,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes else: feature_names = [feature_names] except TypeError: - feature_names = [feature_names] + feature_names = [cast(str, feature_names)] if len(feature_names) != len(set(feature_names)): raise ValueError('feature_names must be unique') @@ -1069,8 +1071,13 @@ class DMatrix: # pylint: disable=too-many-instance-attributes self.feature_types = None @property - def feature_types(self) -> Optional[List[str]]: - """Get feature types. See :py:class:`DMatrix` for details.""" + def feature_types(self) -> Optional[FeatureTypes]: + """Get feature types (column types). 
+ + Returns + ------- + feature_types : list or None + """ length = c_bst_ulong() sarr = ctypes.POINTER(ctypes.c_char_p)() _check_call(_LIB.XGDMatrixGetStrFeatureInfo(self.handle, @@ -1111,7 +1118,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes else: feature_types = [feature_types] except TypeError: - feature_types = [feature_types] + feature_types = [cast(str, feature_types)] feature_types_bytes = [bytes(f, encoding='utf-8') for f in feature_types] c_feature_types = (ctypes.c_char_p * @@ -1203,8 +1210,8 @@ class DeviceQuantileDMatrix(DMatrix): base_margin: Optional[ArrayLike] = None, missing: Optional[float] = None, silent: bool = False, - feature_names: FeatureNames = None, - feature_types: Optional[List[str]] = None, + feature_names: Optional[FeatureNames] = None, + feature_types: Optional[FeatureTypes] = None, nthread: Optional[int] = None, max_bin: int = 256, group: Optional[ArrayLike] = None, @@ -1323,7 +1330,7 @@ def _get_booster_layer_trees(model: "Booster") -> Tuple[int, int]: return num_parallel_tree, num_groups -def _configure_metrics(params: Union[Dict, List]) -> Union[Dict, List]: +def _configure_metrics(params: BoosterParam) -> BoosterParam: if ( isinstance(params, dict) and "eval_metric" in params @@ -1349,7 +1356,7 @@ class Booster: def __init__( self, - params: Optional[Dict] = None, + params: Optional[BoosterParam] = None, cache: Optional[Sequence[DMatrix]] = None, model_file: Optional[Union["Booster", bytearray, os.PathLike, str]] = None ) -> None: @@ -1444,7 +1451,7 @@ class Booster: "Constrained features are not a subset of training data feature names" ) from e - def _configure_constraints(self, params: Union[List, Dict]) -> Union[List, Dict]: + def _configure_constraints(self, params: BoosterParam) -> BoosterParam: if isinstance(params, dict): value = params.get("monotone_constraints") if value is not None: @@ -1607,7 +1614,7 @@ class Booster: return py_str(ret.value) return None - def attributes(self) -> Dict[str, str]: + def attributes(self) -> Dict[str, Optional[str]]: """Get attributes stored in the Booster as a dictionary. Returns @@ -1639,7 +1646,7 @@ class Booster: _check_call(_LIB.XGBoosterSetAttr( self.handle, c_str(key), value)) - def _get_feature_info(self, field: str) -> Optional[List[str]]: + def _get_feature_info(self, field: str) -> Optional[FeatureInfo]: length = c_bst_ulong() sarr = ctypes.POINTER(ctypes.c_char_p)() if not hasattr(self, "handle") or self.handle is None: @@ -1652,7 +1659,7 @@ class Booster: feature_info = from_cstr_to_pystr(sarr, length) return feature_info if feature_info else None - def _set_feature_info(self, features: Optional[Sequence[str]], field: str) -> None: + def _set_feature_info(self, features: Optional[FeatureInfo], field: str) -> None: if features is not None: assert isinstance(features, list) feature_info_bytes = [bytes(f, encoding="utf-8") for f in features] @@ -1670,7 +1677,7 @@ class Booster: ) @property - def feature_types(self) -> Optional[List[str]]: + def feature_types(self) -> Optional[FeatureTypes]: """Feature types for this booster. Can be directly set by input data or by assignment. See :py:class:`DMatrix` for details. 
@@ -1678,11 +1685,11 @@ class Booster: return self._get_feature_info("feature_type") @feature_types.setter - def feature_types(self, features: Optional[List[str]]) -> None: + def feature_types(self, features: Optional[FeatureTypes]) -> None: self._set_feature_info(features, "feature_type") @property - def feature_names(self) -> Optional[List[str]]: + def feature_names(self) -> Optional[FeatureNames]: """Feature names for this booster. Can be directly set by input data or by assignment. @@ -1690,7 +1697,7 @@ class Booster: return self._get_feature_info("feature_name") @feature_names.setter - def feature_names(self, features: FeatureNames) -> None: + def feature_names(self, features: Optional[FeatureNames]) -> None: self._set_feature_info(features, "feature_name") def set_param( @@ -1711,7 +1718,7 @@ class Booster: params = params.items() elif isinstance(params, str) and value is not None: params = [(params, value)] - for key, val in params: + for key, val in cast(Iterable[Tuple[str, str]], params): if val is not None: _check_call(_LIB.XGBoosterSetParam(self.handle, c_str(key), c_str(str(val)))) @@ -2564,8 +2571,10 @@ class Booster: ) # Booster can't accept data with different feature names if self.feature_names != data.feature_names: - dat_missing = set(self.feature_names) - set(data.feature_names) - my_missing = set(data.feature_names) - set(self.feature_names) + dat_missing = set(cast(FeatureNames, self.feature_names)) - \ + set(cast(FeatureNames, data.feature_names)) + my_missing = set(cast(FeatureNames, data.feature_names)) - \ + set(cast(FeatureNames, self.feature_names)) msg = 'feature_names mismatch: {0} {1}' diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask.py index b54e26c9d..ee8ea1a5a 100644 --- a/python-package/xgboost/dask.py +++ b/python-package/xgboost/dask.py @@ -318,7 +318,7 @@ class DaskDMatrix: base_margin: Optional[_DaskCollection] = None, missing: float = None, silent: bool = False, # pylint: disable=unused-argument - feature_names: FeatureNames = None, + feature_names: Optional[FeatureNames] = None, feature_types: FeatureTypes = None, group: Optional[_DaskCollection] = None, qid: Optional[_DaskCollection] = None, @@ -594,7 +594,7 @@ class DaskPartitionIter(DataIter): # pylint: disable=R0902 qid: Optional[List[Any]] = None, label_lower_bound: Optional[List[Any]] = None, label_upper_bound: Optional[List[Any]] = None, - feature_names: FeatureNames = None, + feature_names: Optional[FeatureNames] = None, feature_types: Optional[Union[Any, List[Any]]] = None, ) -> None: self._data = data @@ -637,7 +637,7 @@ class DaskPartitionIter(DataIter): # pylint: disable=R0902 if self._iter == len(self._data): # Return 0 when there's no more batch. 
return 0 - feature_names: FeatureNames = None + feature_names: Optional[FeatureNames] = None if self._feature_names: feature_names = self._feature_names else: @@ -688,7 +688,7 @@ class DaskDeviceQuantileDMatrix(DaskDMatrix): base_margin: Optional[_DaskCollection] = None, missing: float = None, silent: bool = False, # disable=unused-argument - feature_names: FeatureNames = None, + feature_names: Optional[FeatureNames] = None, feature_types: Optional[Union[Any, List[Any]]] = None, max_bin: int = 256, group: Optional[_DaskCollection] = None, @@ -725,7 +725,7 @@ class DaskDeviceQuantileDMatrix(DaskDMatrix): def _create_device_quantile_dmatrix( - feature_names: FeatureNames, + feature_names: Optional[FeatureNames], feature_types: Optional[Union[Any, List[Any]]], feature_weights: Optional[Any], missing: float, @@ -766,7 +766,7 @@ def _create_device_quantile_dmatrix( def _create_dmatrix( - feature_names: FeatureNames, + feature_names: Optional[FeatureNames], feature_types: Optional[Union[Any, List[Any]]], feature_weights: Optional[Any], missing: float, diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index d21c97910..a0505e9c9 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -5,17 +5,26 @@ import ctypes import json import warnings import os -from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Type +from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Sequence, cast import numpy as np from .core import c_array, _LIB, _check_call, c_str from .core import _cuda_array_interface -from .core import DataIter, _ProxyDMatrix, DMatrix, FeatureNames -from ._typing import FeatureTypes +from .core import DataIter, _ProxyDMatrix, DMatrix from .compat import lazy_isinstance, DataFrame +from ._typing import ( + c_bst_ulong, + DataType, + FeatureTypes, + FeatureNames, + NumpyDType, + CupyT, + FloatCompatible, PandasDType +) -c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name +DispatchedDataBackendReturnType = Tuple[ + ctypes.c_void_p, Optional[FeatureNames], Optional[FeatureTypes]] CAT_T = "c" @@ -23,14 +32,14 @@ CAT_T = "c" _matrix_meta = {"base_margin", "label"} -def _warn_unused_missing(data, missing): +def _warn_unused_missing(data: DataType, missing: Optional[FloatCompatible]) -> None: if (missing is not None) and (not np.isnan(missing)): warnings.warn( '`missing` is not used for current input data type:' + str(type(data)), UserWarning) -def _check_complex(data): +def _check_complex(data: DataType) -> None: '''Test whether data is complex using `dtype` attribute.''' complex_dtypes = (np.complex128, np.complex64, np.cfloat, np.cdouble, np.clongdouble) @@ -38,16 +47,15 @@ def _check_complex(data): raise ValueError('Complex data not supported') -def _check_data_shape(data: Any) -> None: +def _check_data_shape(data: DataType) -> None: if hasattr(data, "shape") and len(data.shape) != 2: raise ValueError("Please reshape the input data into 2-dimensional matrix.") -def _is_scipy_csr(data): +def _is_scipy_csr(data: DataType) -> bool: try: - import scipy + import scipy.sparse except ImportError: - scipy = None return False return isinstance(data, scipy.sparse.csr_matrix) @@ -64,12 +72,12 @@ def _array_interface(data: np.ndarray) -> bytes: def _from_scipy_csr( - data, - missing, - nthread, - feature_names: FeatureNames, - feature_types: FeatureTypes, -): + data: DataType, + missing: FloatCompatible, + nthread: int, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], +) -> 
DispatchedDataBackendReturnType: """Initialize data from a CSR matrix.""" if len(data.indices) != len(data.data): raise ValueError( @@ -94,21 +102,20 @@ def _from_scipy_csr( return handle, feature_names, feature_types -def _is_scipy_csc(data): +def _is_scipy_csc(data: DataType) -> bool: try: - import scipy + import scipy.sparse except ImportError: - scipy = None return False return isinstance(data, scipy.sparse.csc_matrix) def _from_scipy_csc( - data, - missing, - feature_names: FeatureNames, - feature_types: FeatureTypes, -): + data: DataType, + missing: Optional[FloatCompatible], + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], +) -> DispatchedDataBackendReturnType: if len(data.indices) != len(data.data): raise ValueError(f"length mismatch: {len(data.indices)} vs {len(data.data)}") _warn_unused_missing(data, missing) @@ -124,27 +131,29 @@ def _from_scipy_csc( return handle, feature_names, feature_types -def _is_scipy_coo(data): +def _is_scipy_coo(data: DataType) -> bool: try: - import scipy + import scipy.sparse except ImportError: - scipy = None return False return isinstance(data, scipy.sparse.coo_matrix) -def _is_numpy_array(data): +def _is_numpy_array(data: DataType) -> bool: return isinstance(data, (np.ndarray, np.matrix)) -def _ensure_np_dtype(data, dtype) -> Tuple[np.ndarray, np.dtype]: +def _ensure_np_dtype( + data: DataType, + dtype: Optional[NumpyDType] +) -> Tuple[np.ndarray, Optional[NumpyDType]]: if data.dtype.hasobject or data.dtype in [np.float16, np.bool_]: data = data.astype(np.float32, copy=False) dtype = np.float32 return data, dtype -def _maybe_np_slice(data: np.ndarray, dtype) -> np.ndarray: +def _maybe_np_slice(data: DataType, dtype: Optional[NumpyDType]) -> np.ndarray: '''Handle numpy slice. This can be removed if we use __array_interface__. ''' try: @@ -159,12 +168,12 @@ def _maybe_np_slice(data: np.ndarray, dtype) -> np.ndarray: def _from_numpy_array( - data, - missing, - nthread, - feature_names: FeatureNames, - feature_types: FeatureTypes, -): + data: DataType, + missing: FloatCompatible, + nthread: int, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], +) -> DispatchedDataBackendReturnType: """Initialize data from a 2-D numpy matrix. """ @@ -189,7 +198,7 @@ def _from_numpy_array( return handle, feature_names, feature_types -def _is_pandas_df(data): +def _is_pandas_df(data: DataType) -> bool: try: import pandas as pd except ImportError: @@ -197,7 +206,7 @@ def _is_pandas_df(data): return isinstance(data, pd.DataFrame) -def _is_modin_df(data): +def _is_modin_df(data: DataType) -> bool: try: import modin.pandas as pd except ImportError: @@ -232,7 +241,7 @@ _ENABLE_CAT_ERR = ( ) -def _invalid_dataframe_dtype(data: Any) -> None: +def _invalid_dataframe_dtype(data: DataType) -> None: # pandas series has `dtypes` but it's just a single object # cudf series doesn't have `dtypes`. 
if hasattr(data, "dtypes") and hasattr(data.dtypes, "__iter__"): @@ -253,10 +262,10 @@ def _invalid_dataframe_dtype(data: Any) -> None: def _pandas_feature_info( data: DataFrame, meta: Optional[str], - feature_names: FeatureNames, - feature_types: FeatureTypes, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], enable_categorical: bool, -) -> Tuple[FeatureNames, FeatureTypes]: +) -> Tuple[Optional[FeatureNames], Optional[FeatureTypes]]: import pandas as pd from pandas.api.types import ( is_sparse, @@ -285,13 +294,13 @@ def _pandas_feature_info( return feature_names, feature_types -def is_nullable_dtype(dtype: Any) -> bool: +def is_nullable_dtype(dtype: PandasDType) -> bool: """Wether dtype is a pandas nullable type.""" from pandas.api.types import is_integer_dtype, is_bool_dtype # dtype: pd.core.arrays.numeric.NumericDtype nullable_alias = {"Int16", "Int32", "Int64"} is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias - # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`. + is_bool = is_bool_dtype(dtype) and dtype.name == "boolean" return is_int or is_bool @@ -331,11 +340,11 @@ def _pandas_cat_null(data: DataFrame) -> DataFrame: def _transform_pandas_df( data: DataFrame, enable_categorical: bool, - feature_names: FeatureNames = None, - feature_types: FeatureTypes = None, + feature_names: Optional[FeatureNames] = None, + feature_types: Optional[FeatureTypes] = None, meta: Optional[str] = None, - meta_type: Optional[str] = None, -) -> Tuple[np.ndarray, FeatureNames, FeatureTypes]: + meta_type: Optional[NumpyDType] = None, +) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]: from pandas.api.types import ( is_sparse, is_categorical_dtype, @@ -359,7 +368,7 @@ def _transform_pandas_df( if meta and len(data.columns) > 1 and meta not in _matrix_meta: raise ValueError(f"DataFrame for {meta} cannot have multiple columns") - dtype: Union[Type[np.floating], str] = meta_type if meta_type else np.float32 + dtype = meta_type if meta_type else np.float32 arr: np.ndarray = transformed.values if meta_type: arr = arr.astype(dtype) @@ -369,18 +378,18 @@ def _transform_pandas_df( def _from_pandas_df( data: DataFrame, enable_categorical: bool, - missing: float, + missing: FloatCompatible, nthread: int, - feature_names: FeatureNames, - feature_types: FeatureTypes, -) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]: + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], +) -> DispatchedDataBackendReturnType: data, feature_names, feature_types = _transform_pandas_df( data, enable_categorical, feature_names, feature_types ) return _from_numpy_array(data, missing, nthread, feature_names, feature_types) -def _is_pandas_series(data): +def _is_pandas_series(data: DataType) -> bool: try: import pandas as pd except ImportError: @@ -389,18 +398,21 @@ def _is_pandas_series(data): def _meta_from_pandas_series( - data, name: str, dtype: Optional[str], handle: ctypes.c_void_p + data: DataType, + name: str, + dtype: Optional[NumpyDType], + handle: ctypes.c_void_p ) -> None: """Help transform pandas series for meta data like labels""" data = data.values.astype('float') from pandas.api.types import is_sparse if is_sparse(data): - data = data.to_dense() + data = data.to_dense() # type: ignore assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1 _meta_from_numpy(data, name, dtype, handle) -def _is_modin_series(data): +def 
_is_modin_series(data: DataType) -> bool: try: import modin.pandas as pd except ImportError: @@ -409,13 +421,13 @@ def _is_modin_series(data): def _from_pandas_series( - data, - missing: float, + data: DataType, + missing: FloatCompatible, nthread: int, enable_categorical: bool, - feature_names: FeatureNames, - feature_types: FeatureTypes, -): + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], +) -> DispatchedDataBackendReturnType: from pandas.api.types import is_categorical_dtype if (data.dtype.name not in _pandas_dtype_mapper) and not ( @@ -433,7 +445,7 @@ def _from_pandas_series( ) -def _is_dt_df(data): +def _is_dt_df(data: DataType) -> bool: return lazy_isinstance(data, 'datatable', 'Frame') or \ lazy_isinstance(data, 'datatable', 'DataTable') @@ -443,12 +455,12 @@ _dt_type_mapper2 = {'bool': 'i', 'int': 'int', 'real': 'float'} def _transform_dt_df( - data, - feature_names: FeatureNames, - feature_types: FeatureTypes, - meta=None, - meta_type=None, -): + data: DataType, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], + meta: Optional[str] = None, + meta_type: Optional[NumpyDType] = None, +) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]: """Validate feature names and types if data table""" if meta and data.shape[1] > 1: raise ValueError('DataTable for meta info cannot have multiple columns') @@ -482,13 +494,13 @@ def _transform_dt_df( def _from_dt_df( - data, - missing, - nthread, - feature_names: FeatureNames, - feature_types: FeatureTypes, + data: DataType, + missing: Optional[FloatCompatible], + nthread: int, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], enable_categorical: bool, -) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]: +) -> DispatchedDataBackendReturnType: if enable_categorical: raise ValueError("categorical data in datatable is not supported yet.") data, feature_names, feature_types = _transform_dt_df( @@ -525,7 +537,7 @@ def _from_dt_df( return handle, feature_names, feature_types -def _is_arrow(data) -> bool: +def _is_arrow(data: DataType) -> bool: try: import pyarrow as pa from pyarrow import dataset as arrow_dataset @@ -571,13 +583,13 @@ def record_batch_data_iter(data_iter: Iterator) -> Callable: def _from_arrow( - data, - missing: float, + data: DataType, + missing: FloatCompatible, nthread: int, - feature_names: FeatureNames, - feature_types: FeatureTypes, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], enable_categorical: bool, -) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]: +) -> DispatchedDataBackendReturnType: import pyarrow as pa if not all( @@ -605,11 +617,11 @@ def _from_arrow( return handle, feature_names, feature_types -def _is_cudf_df(data) -> bool: +def _is_cudf_df(data: DataType) -> bool: return lazy_isinstance(data, "cudf.core.dataframe", "DataFrame") -def _cudf_array_interfaces(data, cat_codes: list) -> bytes: +def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes: """Extract CuDF __cuda_array_interface__. This is special as it returns a new list of data and a list of array interfaces. 
The data is list of categorical codes that caller can safely ignore, but have to keep their reference alive until usage of array @@ -645,11 +657,11 @@ def _cudf_array_interfaces(data, cat_codes: list) -> bytes: def _transform_cudf_df( - data, - feature_names: FeatureNames, - feature_types: FeatureTypes, + data: DataType, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], enable_categorical: bool, -): +) -> Tuple[ctypes.c_void_p, list, Optional[FeatureNames], Optional[FeatureTypes]]: try: from cudf.api.types import is_categorical_dtype except ImportError: @@ -709,13 +721,13 @@ def _transform_cudf_df( def _from_cudf_df( - data, - missing, - nthread, - feature_names: FeatureNames, - feature_types: FeatureTypes, + data: DataType, + missing: FloatCompatible, + nthread: int, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], enable_categorical: bool, -) -> Tuple[ctypes.c_void_p, Any, Any]: +) -> DispatchedDataBackendReturnType: data, cat_codes, feature_names, feature_types = _transform_cudf_df( data, feature_names, feature_types, enable_categorical ) @@ -732,7 +744,7 @@ def _from_cudf_df( return handle, feature_names, feature_types -def _is_cudf_ser(data): +def _is_cudf_ser(data: DataType) -> bool: try: import cudf except ImportError: @@ -740,13 +752,13 @@ def _is_cudf_ser(data): return isinstance(data, cudf.Series) -def _is_cupy_array(data: Any) -> bool: +def _is_cupy_array(data: DataType) -> bool: return lazy_isinstance(data, "cupy.core.core", "ndarray") or lazy_isinstance( data, "cupy._core.core", "ndarray" ) -def _transform_cupy_array(data): +def _transform_cupy_array(data: DataType) -> CupyT: import cupy # pylint: disable=import-error if not hasattr(data, '__cuda_array_interface__') and hasattr( data, '__array__'): @@ -757,12 +769,12 @@ def _transform_cupy_array(data): def _from_cupy_array( - data, - missing, - nthread, - feature_names: FeatureNames, - feature_types: FeatureTypes, -): + data: DataType, + missing: FloatCompatible, + nthread: int, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], +) -> DispatchedDataBackendReturnType: """Initialize DMatrix from cupy ndarray.""" data = _transform_cupy_array(data) interface_str = _cuda_array_interface(data) @@ -776,7 +788,7 @@ def _from_cupy_array( return handle, feature_names, feature_types -def _is_cupy_csr(data): +def _is_cupy_csr(data: DataType) -> bool: try: import cupyx except ImportError: @@ -784,7 +796,7 @@ def _is_cupy_csr(data): return isinstance(data, cupyx.scipy.sparse.csr_matrix) -def _is_cupy_csc(data): +def _is_cupy_csc(data: DataType) -> bool: try: import cupyx except ImportError: @@ -792,11 +804,11 @@ def _is_cupy_csc(data): return isinstance(data, cupyx.scipy.sparse.csc_matrix) -def _is_dlpack(data): +def _is_dlpack(data: DataType) -> bool: return 'PyCapsule' in str(type(data)) and "dltensor" in str(data) -def _transform_dlpack(data): +def _transform_dlpack(data: DataType) -> bool: from cupy import fromDlpack # pylint: disable=E0401 assert 'used_dltensor' not in str(data) data = fromDlpack(data) @@ -804,27 +816,27 @@ def _transform_dlpack(data): def _from_dlpack( - data, - missing, - nthread, - feature_names: FeatureNames, - feature_types: FeatureTypes, -): + data: DataType, + missing: FloatCompatible, + nthread: int, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], +) -> DispatchedDataBackendReturnType: data = _transform_dlpack(data) return _from_cupy_array(data, missing, nthread, feature_names, 
feature_types) -def _is_uri(data): +def _is_uri(data: DataType) -> bool: return isinstance(data, (str, os.PathLike)) def _from_uri( - data, - missing, - feature_names: FeatureNames, - feature_types: FeatureTypes, -): + data: DataType, + missing: Optional[FloatCompatible], + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], +) -> DispatchedDataBackendReturnType: _warn_unused_missing(data, missing) handle = ctypes.c_void_p() data = os.fspath(os.path.expanduser(data)) @@ -834,51 +846,51 @@ def _from_uri( return handle, feature_names, feature_types -def _is_list(data): +def _is_list(data: DataType) -> bool: return isinstance(data, list) def _from_list( - data, - missing, - n_threads, - feature_names: FeatureNames, - feature_types: FeatureTypes, -): + data: Sequence, + missing: FloatCompatible, + n_threads: int, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], +) -> DispatchedDataBackendReturnType: array = np.array(data) _check_data_shape(data) return _from_numpy_array(array, missing, n_threads, feature_names, feature_types) -def _is_tuple(data): +def _is_tuple(data: DataType) -> bool: return isinstance(data, tuple) def _from_tuple( - data, - missing, - n_threads, - feature_names: FeatureNames, - feature_types: FeatureTypes, -): + data: Sequence, + missing: FloatCompatible, + n_threads: int, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], +) -> DispatchedDataBackendReturnType: return _from_list(data, missing, n_threads, feature_names, feature_types) -def _is_iter(data): +def _is_iter(data: DataType) -> bool: return isinstance(data, DataIter) -def _has_array_protocol(data): +def _has_array_protocol(data: DataType) -> bool: return hasattr(data, '__array__') -def _convert_unknown_data(data): +def _convert_unknown_data(data: DataType) -> DataType: warnings.warn( f'Unknown data type: {type(data)}, trying to convert it to csr_matrix', UserWarning ) try: - import scipy + import scipy.sparse except ImportError: return None @@ -891,13 +903,13 @@ def _convert_unknown_data(data): def dispatch_data_backend( - data, - missing, - threads, - feature_names: FeatureNames, - feature_types: FeatureTypes, + data: DataType, + missing: FloatCompatible, # Or Optional[Float] + threads: int, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], enable_categorical: bool = False, -): +) -> DispatchedDataBackendReturnType: '''Dispatch data for DMatrix.''' if not _is_cudf_ser(data) and not _is_pandas_series(data): _check_data_shape(data) @@ -964,7 +976,7 @@ def dispatch_data_backend( raise TypeError('Not supported type for data.' 
+ str(type(data))) -def _to_data_type(dtype: str, name: str): +def _to_data_type(dtype: str, name: str) -> int: dtype_map = {'float32': 1, 'float64': 2, 'uint32': 3, 'uint64': 4} if dtype not in dtype_map: raise TypeError( @@ -973,7 +985,7 @@ def _to_data_type(dtype: str, name: str): return dtype_map[dtype] -def _validate_meta_shape(data: Any, name: str) -> None: +def _validate_meta_shape(data: DataType, name: str) -> None: if hasattr(data, "shape"): msg = f"Invalid shape: {data.shape} for {name}" if name in _matrix_meta: @@ -990,7 +1002,7 @@ def _validate_meta_shape(data: Any, name: str) -> None: def _meta_from_numpy( data: np.ndarray, field: str, - dtype: Optional[Union[np.dtype, str]], + dtype: Optional[NumpyDType], handle: ctypes.c_void_p, ) -> None: data, dtype = _ensure_np_dtype(data, dtype) @@ -1001,16 +1013,26 @@ def _meta_from_numpy( _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface_str)) -def _meta_from_list(data, field, dtype, handle): - data = np.array(data) - _meta_from_numpy(data, field, dtype, handle) +def _meta_from_list( + data: Sequence, + field: str, + dtype: Optional[NumpyDType], + handle: ctypes.c_void_p +) -> None: + data_np = np.array(data) + _meta_from_numpy(data_np, field, dtype, handle) -def _meta_from_tuple(data, field, dtype, handle): +def _meta_from_tuple( + data: Sequence, + field: str, + dtype: Optional[NumpyDType], + handle: ctypes.c_void_p +) -> None: return _meta_from_list(data, field, dtype, handle) -def _meta_from_cudf_df(data, field: str, handle: ctypes.c_void_p) -> None: +def _meta_from_cudf_df(data: DataType, field: str, handle: ctypes.c_void_p) -> None: if field not in _matrix_meta: _meta_from_cudf_series(data.iloc[:, 0], field, handle) else: @@ -1019,7 +1041,7 @@ def _meta_from_cudf_df(data, field: str, handle: ctypes.c_void_p) -> None: _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface)) -def _meta_from_cudf_series(data, field, handle): +def _meta_from_cudf_series(data: DataType, field: str, handle: ctypes.c_void_p) -> None: interface = bytes(json.dumps([data.__cuda_array_interface__], indent=2), 'utf-8') _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, @@ -1027,7 +1049,7 @@ def _meta_from_cudf_series(data, field, handle): interface)) -def _meta_from_cupy_array(data, field, handle): +def _meta_from_cupy_array(data: DataType, field: str, handle: ctypes.c_void_p) -> None: data = _transform_cupy_array(data) interface = bytes(json.dumps([data.__cuda_array_interface__], indent=2), 'utf-8') @@ -1036,14 +1058,22 @@ def _meta_from_cupy_array(data, field, handle): interface)) -def _meta_from_dt(data, field: str, dtype, handle: ctypes.c_void_p): +def _meta_from_dt( + data: DataType, + field: str, + dtype: Optional[NumpyDType], + handle: ctypes.c_void_p +) -> None: data, _, _ = _transform_dt_df(data, None, None, field, dtype) _meta_from_numpy(data, field, dtype, handle) def dispatch_meta_backend( - matrix: DMatrix, data, name: str, dtype: Optional[Union[str, np.dtype]] = None -): + matrix: DMatrix, + data: DataType, + name: str, + dtype: Optional[NumpyDType] = None +) -> None: '''Dispatch for meta info.''' handle = matrix.handle assert handle is not None @@ -1060,8 +1090,7 @@ def dispatch_meta_backend( _meta_from_numpy(data, name, dtype, handle) return if _is_pandas_df(data): - data, _, _ = _transform_pandas_df(data, False, meta=name, - meta_type=dtype) + data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype) _meta_from_numpy(data, name, dtype, handle) return if 
_is_pandas_series(data): @@ -1107,7 +1136,7 @@ class SingleBatchInternalIter(DataIter): # pylint: disable=R0902 area for meta info. ''' - def __init__(self, **kwargs: Any): + def __init__(self, **kwargs: Any) -> None: self.kwargs = kwargs self.it = 0 # pylint: disable=invalid-name super().__init__() @@ -1124,11 +1153,13 @@ class SingleBatchInternalIter(DataIter): # pylint: disable=R0902 def _proxy_transform( - data, - feature_names: FeatureNames, - feature_types: FeatureTypes, + data: DataType, + feature_names: Optional[FeatureNames], + feature_types: Optional[FeatureTypes], enable_categorical: bool, -): +) -> Tuple[ + Union[bool, ctypes.c_void_p, np.ndarray], + Optional[list], Optional[FeatureNames], Optional[FeatureTypes]]: if _is_cudf_df(data) or _is_cudf_ser(data): return _transform_cudf_df( data, feature_names, feature_types, enable_categorical @@ -1152,7 +1183,7 @@ def _proxy_transform( def dispatch_proxy_set_data( proxy: _ProxyDMatrix, - data: Any, + data: DataType, cat_codes: Optional[list], allow_host: bool, ) -> None: @@ -1162,11 +1193,11 @@ def dispatch_proxy_set_data( if _is_cudf_df(data): # pylint: disable=W0212 - proxy._set_data_from_cuda_columnar(data, cat_codes) + proxy._set_data_from_cuda_columnar(data, cast(List, cat_codes)) return if _is_cudf_ser(data): # pylint: disable=W0212 - proxy._set_data_from_cuda_columnar(data, cat_codes) + proxy._set_data_from_cuda_columnar(data, cast(List, cat_codes)) return if _is_cupy_array(data): proxy._set_data_from_cuda_interface(data) # pylint: disable=W0212 diff --git a/python-package/xgboost/plotting.py b/python-package/xgboost/plotting.py index 75159d104..85a8428bc 100644 --- a/python-package/xgboost/plotting.py +++ b/python-package/xgboost/plotting.py @@ -4,16 +4,34 @@ """Plotting Library.""" from io import BytesIO import json +from typing import Optional, Any + import numpy as np + +from ._typing import PathLike from .core import Booster from .sklearn import XGBModel +Axes = Any # real type is matplotlib.axes.Axes +GraphvizSource = Any # real type is graphviz.Source -def plot_importance(booster, ax=None, height=0.2, - xlim=None, ylim=None, title='Feature importance', - xlabel='F score', ylabel='Features', fmap='', - importance_type='weight', max_num_features=None, - grid=True, show_values=True, **kwargs): + +def plot_importance( + booster: Booster, + ax: Optional[Axes] = None, + height: float = 0.2, + xlim: Optional[tuple] = None, + ylim: Optional[tuple] = None, + title: str = "Feature importance", + xlabel: str = "F score", + ylabel: str = "Features", + fmap: PathLike = "", + importance_type: str = "weight", + max_num_features: Optional[int] = None, + grid: bool = True, + show_values: bool = True, + **kwargs: Any +) -> Axes: """Plot importance based on fitted trees. 
Parameters @@ -78,9 +96,9 @@ def plot_importance(booster, ax=None, height=0.2, tuples = [(k, importance[k]) for k in importance] if max_num_features is not None: # pylint: disable=invalid-unary-operand-type - tuples = sorted(tuples, key=lambda x: x[1])[-max_num_features:] + tuples = sorted(tuples, key=lambda _x: _x[1])[-max_num_features:] else: - tuples = sorted(tuples, key=lambda x: x[1]) + tuples = sorted(tuples, key=lambda _x: _x[1]) labels, values = zip(*tuples) if ax is None: @@ -120,9 +138,17 @@ def plot_importance(booster, ax=None, height=0.2, return ax -def to_graphviz(booster, fmap='', num_trees=0, rankdir=None, - yes_color=None, no_color=None, - condition_node_params=None, leaf_node_params=None, **kwargs): +def to_graphviz( + booster: Booster, + fmap: PathLike = "", + num_trees: int = 0, + rankdir: Optional[str] = None, + yes_color: Optional[str] = None, + no_color: Optional[str] = None, + condition_node_params: Optional[dict] = None, + leaf_node_params: Optional[dict] = None, + **kwargs: Any +) -> GraphvizSource: """Convert specified tree to graphviz instance. IPython can automatically plot the returned graphiz instance. Otherwise, you should call .render() method of the returned graphiz instance. @@ -212,7 +238,14 @@ def to_graphviz(booster, fmap='', num_trees=0, rankdir=None, return g -def plot_tree(booster, fmap='', num_trees=0, rankdir=None, ax=None, **kwargs): +def plot_tree( + booster: Booster, + fmap: PathLike = "", + num_trees: int = 0, + rankdir: Optional[str] = None, + ax: Optional[Axes] = None, + **kwargs: Any +) -> Axes: """Plot specified tree. Parameters diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index ae883e30e..f6b43d8de 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -4,8 +4,19 @@ import copy import warnings import json import os -from typing import Union, Optional, List, Dict, Callable, Tuple, Any, TypeVar, Type, cast -from typing import Sequence +from typing import ( + Union, + Optional, + List, + Dict, + Callable, + Sequence, + Tuple, + Any, + TypeVar, + Type, + cast, +) import numpy as np from .core import Booster, DMatrix, XGBoostError @@ -14,7 +25,7 @@ from .core import Metric from .training import train from .callback import TrainingCallback from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array -from ._typing import ArrayLike, FeatureTypes +from ._typing import ArrayLike, FeatureNames, FeatureTypes # Do not use class names on scikit-learn directly. Re-define the classes on # .compat to guarantee the behavior without scikit-learn @@ -401,7 +412,7 @@ def _wrap_evaluation_matrices( eval_qid: Optional[Sequence[Any]], create_dmatrix: Callable, enable_categorical: bool, - feature_types: FeatureTypes, + feature_types: Optional[FeatureTypes], ) -> Tuple[Any, List[Tuple[Any, str]]]: """Convert array_like evaluation matrices into DMatrix. Perform validation on the way. 
@@ -717,7 +728,7 @@ class XGBModel(XGBModelBase): return self._estimator_type # pylint: disable=no-member def save_model(self, fname: Union[str, os.PathLike]) -> None: - meta = {} + meta: Dict[str, Any] = {} for k, v in self.__dict__.items(): if k == '_le': meta['_le'] = self._le.to_json() @@ -1231,7 +1242,7 @@ class XGBModel(XGBModelBase): importance_type=self.importance_type if self.importance_type else dft() ) if b.feature_names is None: - feature_names = [f"f{i}" for i in range(self.n_features_in_)] + feature_names: FeatureNames = [f"f{i}" for i in range(self.n_features_in_)] else: feature_names = b.feature_names # gblinear returns all features so the `get` in next line is only for gbtree. diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py index 38567b6bf..2103303fb 100644 --- a/python-package/xgboost/training.py +++ b/python-package/xgboost/training.py @@ -5,20 +5,24 @@ import copy import os import warnings -from typing import Optional, Dict, Any, Union, Tuple, Sequence +from typing import Optional, Dict, Any, Union, Tuple, Sequence, List, cast, Iterable import numpy as np + +from .callback import TrainingCallback, CallbackContainer, EvaluationMonitor, EarlyStopping from .core import Booster, DMatrix, XGBoostError, _deprecate_positional_args from .core import Metric, Objective -from .compat import (SKLEARN_INSTALLED, XGBStratifiedKFold) -from . import callback +from .compat import SKLEARN_INSTALLED, XGBStratifiedKFold, DataFrame +from ._typing import _F, FPreProcCallable, BoosterParam + +_CVFolds = Sequence["CVPack"] def _assert_new_callback( - callbacks: Optional[Sequence[callback.TrainingCallback]] + callbacks: Optional[Sequence[TrainingCallback]] ) -> None: is_new_callback: bool = not callbacks or all( - isinstance(c, callback.TrainingCallback) for c in callbacks + isinstance(c, TrainingCallback) for c in callbacks ) if not is_new_callback: link = "https://xgboost.readthedocs.io/en/latest/python/callbacks.html" @@ -56,10 +60,10 @@ def train( feval: Optional[Metric] = None, maximize: Optional[bool] = None, early_stopping_rounds: Optional[int] = None, - evals_result: callback.TrainingCallback.EvalsLog = None, + evals_result: TrainingCallback.EvalsLog = None, verbose_eval: Optional[Union[bool, int]] = True, xgb_model: Optional[Union[str, os.PathLike, Booster, bytearray]] = None, - callbacks: Optional[Sequence[callback.TrainingCallback]] = None, + callbacks: Optional[Sequence[TrainingCallback]] = None, custom_metric: Optional[Metric] = None, ) -> Booster: """Train a booster with given parameters. @@ -159,12 +163,12 @@ def train( _assert_new_callback(callbacks) if verbose_eval: verbose_eval = 1 if verbose_eval is True else verbose_eval - callbacks.append(callback.EvaluationMonitor(period=verbose_eval)) + callbacks.append(EvaluationMonitor(period=verbose_eval)) if early_stopping_rounds: callbacks.append( - callback.EarlyStopping(rounds=early_stopping_rounds, maximize=maximize) + EarlyStopping(rounds=early_stopping_rounds, maximize=maximize) ) - cb_container = callback.CallbackContainer( + cb_container = CallbackContainer( callbacks, metric=metric_fn, # For old `feval` parameter, the behavior is unchanged. 
For the new @@ -194,71 +198,73 @@ def train( class CVPack: """"Auxiliary datastruct to hold one fold of CV.""" - def __init__(self, dtrain, dtest, param): + def __init__(self, dtrain: DMatrix, dtest: DMatrix, param: Optional[Union[Dict, List]]) -> None: """"Initialize the CVPack""" self.dtrain = dtrain self.dtest = dtest self.watchlist = [(dtrain, 'train'), (dtest, 'test')] self.bst = Booster(param, [dtrain, dtest]) - def __getattr__(self, name): - def _inner(*args, **kwargs): + def __getattr__(self, name: str) -> _F: + def _inner(*args: Any, **kwargs: Any) -> Any: return getattr(self.bst, name)(*args, **kwargs) - return _inner + return cast(_F, _inner) - def update(self, iteration, fobj): + def update(self, iteration: int, fobj: Optional[Objective]) -> None: """"Update the boosters for one iteration""" self.bst.update(self.dtrain, iteration, fobj) - def eval(self, iteration, feval, output_margin): + def eval(self, iteration: int, feval: Optional[Metric], output_margin: bool) -> str: """"Evaluate the CVPack for one iteration.""" return self.bst.eval_set(self.watchlist, iteration, feval, output_margin) class _PackedBooster: - def __init__(self, cvfolds) -> None: + def __init__(self, cvfolds: _CVFolds) -> None: self.cvfolds = cvfolds - def update(self, iteration, obj): + def update(self, iteration: int, obj: Optional[Objective]) -> None: '''Iterate through folds for update''' for fold in self.cvfolds: fold.update(iteration, obj) - def eval(self, iteration, feval, output_margin): + def eval(self, iteration: int, feval: Optional[Metric], output_margin: bool) -> List[str]: '''Iterate through folds for eval''' result = [f.eval(iteration, feval, output_margin) for f in self.cvfolds] return result - def set_attr(self, **kwargs): + def set_attr(self, **kwargs: Optional[str]) -> Any: '''Iterate through folds for setting attributes''' for f in self.cvfolds: f.bst.set_attr(**kwargs) - def attr(self, key): + def attr(self, key: str) -> Optional[str]: '''Redirect to booster attr.''' return self.cvfolds[0].bst.attr(key) - def set_param(self, params, value=None): + def set_param(self, + params: Union[Dict, Iterable[Tuple[str, Any]], str], + value: Optional[str] = None) -> None: """Iterate through folds for set_param""" for f in self.cvfolds: f.bst.set_param(params, value) - def num_boosted_rounds(self): + def num_boosted_rounds(self) -> int: '''Number of boosted rounds.''' return self.cvfolds[0].num_boosted_rounds() @property - def best_iteration(self): + def best_iteration(self) -> int: '''Get best_iteration''' - return int(self.cvfolds[0].bst.attr("best_iteration")) + return int(cast(int, self.cvfolds[0].bst.attr("best_iteration"))) @property - def best_score(self): + def best_score(self) -> float: """Get best_score.""" - return float(self.cvfolds[0].bst.attr("best_score")) + return float(cast(float, self.cvfolds[0].bst.attr("best_score"))) -def groups_to_rows(groups, boundaries): +def groups_to_rows(groups: List[np.ndarray], boundaries: np.ndarray) -> np.ndarray: """ Given group row boundaries, convert ground indexes to row indexes :param groups: list of groups for testing @@ -268,7 +274,9 @@ def groups_to_rows(groups, boundaries): return np.concatenate([np.arange(boundaries[g], boundaries[g+1]) for g in groups]) -def mkgroupfold(dall, nfold, param, evals=(), fpreproc=None, shuffle=True): +def mkgroupfold(dall: DMatrix, nfold: int, param: BoosterParam, + evals: Sequence[str] = (), fpreproc: FPreProcCallable = None, + shuffle: bool = True) -> List[CVPack]: """ Make n folds for cross-validation 
maintaining groups :return: cross-validation folds @@ -308,8 +316,10 @@ def mkgroupfold(dall, nfold, param, evals=(), fpreproc=None, shuffle=True): return ret -def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False, - folds=None, shuffle=True): +def mknfold(dall: DMatrix, nfold: int, param: BoosterParam, seed: int, + evals: Sequence[str] = (), fpreproc: FPreProcCallable = None, + stratified: bool = False, folds: XGBStratifiedKFold = None, shuffle: bool = True + ) -> List[CVPack]: """ Make an n-fold list of CVPack from random indices. """ @@ -362,11 +372,27 @@ def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False, return ret -def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None, - metrics=(), obj: Optional[Objective] = None, - feval=None, maximize=None, early_stopping_rounds=None, - fpreproc=None, as_pandas=True, verbose_eval=None, show_stdv=True, - seed=0, callbacks=None, shuffle=True, custom_metric: Optional[Metric] = None): +def cv( + params: BoosterParam, + dtrain: DMatrix, + num_boost_round: int = 10, + nfold: int = 3, + stratified: bool = False, + folds: XGBStratifiedKFold = None, + metrics: Sequence[str] = (), + obj: Optional[Objective] = None, + feval: Optional[Metric] = None, + maximize: bool = None, + early_stopping_rounds: int = None, + fpreproc: FPreProcCallable = None, + as_pandas: bool = True, + verbose_eval: Optional[Union[int, bool]] = None, + show_stdv: bool = True, + seed: int = 0, + callbacks: Sequence[TrainingCallback] = None, + shuffle: bool = True, + custom_metric: Optional[Metric] = None, +) -> Union[Dict[str, float], DataFrame]: # pylint: disable = invalid-name """Cross-validation with given parameters. @@ -477,7 +503,7 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None params.pop("eval_metric", None) - results = {} + results: Dict[str, List[float]] = {} cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc, stratified, folds, shuffle) @@ -490,13 +516,13 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None if verbose_eval: verbose_eval = 1 if verbose_eval is True else verbose_eval callbacks.append( - callback.EvaluationMonitor(period=verbose_eval, show_stdv=show_stdv) + EvaluationMonitor(period=verbose_eval, show_stdv=show_stdv) ) if early_stopping_rounds: callbacks.append( - callback.EarlyStopping(rounds=early_stopping_rounds, maximize=maximize) + EarlyStopping(rounds=early_stopping_rounds, maximize=maximize) ) - callbacks = callback.CallbackContainer( + callbacks_container = CallbackContainer( callbacks, metric=metric_fn, is_cv=True, @@ -504,16 +530,16 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None ) booster = _PackedBooster(cvfolds) - callbacks.before_training(booster) + callbacks_container.before_training(booster) for i in range(num_boost_round): - if callbacks.before_iteration(booster, i, dtrain, None): + if callbacks_container.before_iteration(booster, i, dtrain, None): break booster.update(i, obj) - should_break = callbacks.after_iteration(booster, i, dtrain, None) - res = callbacks.aggregated_cv - for key, mean, std in res: + should_break = callbacks_container.after_iteration(booster, i, dtrain, None) + res = callbacks_container.aggregated_cv + for key, mean, std in cast(List[Tuple[str, float, float]], res): if key + '-mean' not in results: results[key + '-mean'] = [] if key + '-std' not in results: @@ -532,6 +558,6 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, 
stratified=False, folds=None except ImportError: pass - callbacks.after_training(booster) + callbacks_container.after_training(booster) return results
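For a sense of how the annotations above are consumed downstream, here is a minimal sketch of typed user code driving the public API. The data set, hyper-parameters, and feature names are illustrative assumptions rather than anything taken from the patch; the calls themselves follow the signatures annotated above.

import numpy as np
import xgboost as xgb

# Illustrative toy data; shapes and values are arbitrary assumptions.
rng = np.random.default_rng(0)
X = rng.normal(size=(256, 4))
y = (X[:, 0] > 0).astype(np.float32)

# feature_names / feature_types are plain sequences of str, matching the
# Sequence[str]-based FeatureNames / FeatureTypes aliases added in _typing.py.
dtrain = xgb.DMatrix(
    X,
    label=y,
    feature_names=[f"f{i}" for i in range(X.shape[1])],
    feature_types=["float"] * X.shape[1],
)

# train() and the callback classes now carry annotations, so a checker such as
# mypy can validate the params mapping, the evals list, and the callback
# sequence passed here.
booster: xgb.Booster = xgb.train(
    {"objective": "binary:logistic", "max_depth": 3},
    dtrain,
    num_boost_round=20,
    evals=[(dtrain, "train")],
    callbacks=[xgb.callback.EarlyStopping(rounds=5)],
)
print(booster.num_boosted_rounds())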