Add Type Hints for Python Package (#7742)

Co-authored-by: Chengyang Gu <bridgream@gmail.com>
Co-authored-by: Jiamingy <jm.yuan@outlook.com>
Chengyang 2022-05-17 10:14:09 -04:00 committed by GitHub
parent 71d3b2e036
commit 806c92c80b
10 changed files with 486 additions and 342 deletions

View File

@ -1,21 +1,32 @@
"""Shared typing definition."""
import ctypes
import os
from typing import Optional, Any, TypeVar, Union, Sequence
from typing import Any, TypeVar, Union, Type, Sequence, Callable, List, Dict
# os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/dt.Frame/
# cudf.DataFrame/cupy.array/dlpack
import numpy as np
DataType = Any
# xgboost accepts some other possible types in practice due to historical reasons; these are
# less tested. For now we encourage users to pass a simple list of strings.
FeatureNames = Optional[Sequence[str]]
FeatureTypes = Optional[Sequence[str]]
FeatureInfo = Sequence[str]
FeatureNames = FeatureInfo
FeatureTypes = FeatureInfo
BoosterParam = Union[List, Dict]  # would ideally be a Sequence
ArrayLike = Any
PathLike = Union[str, os.PathLike]
CupyT = ArrayLike # maybe need a stub for cupy arrays
NumpyOrCupy = Any
NumpyDType = Union[str, Type[np.number]]
PandasDType = Any # real type is pandas.core.dtypes.base.ExtensionDtype
FloatCompatible = Union[float, np.float32, np.float64]
# callables
FPreProcCallable = Callable
# ctypes
# c_bst_ulong corresponds to bst_ulong defined in xgboost/c_api.h
@ -59,3 +70,4 @@ CNumericPtr = ctypes.pointer
# template parameter
_T = TypeVar("_T")
_F = TypeVar("_F", bound=Callable[..., Any])
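
A minimal usage sketch of the widened aliases (the describe_features helper is hypothetical, assuming this commit's module layout):

from typing import Optional
from xgboost._typing import FeatureNames, FeatureTypes

def describe_features(
    names: Optional[FeatureNames] = None,
    types: Optional[FeatureTypes] = None,
) -> str:
    # Both aliases are now plain Sequence[str]; callers spell out Optional.
    n = len(names) if names is not None else 0
    t = len(types) if types is not None else 0
    return f"{n} feature names, {t} feature types"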

View File

@ -10,8 +10,7 @@ from abc import ABC
import collections
import os
import pickle
from typing import Callable, List, Optional, Union, Dict, Tuple, TypeVar, cast
from typing import Sequence
from typing import Callable, List, Optional, Union, Dict, Tuple, TypeVar, cast, Sequence, Any
import numpy
from . import rabit
@ -24,11 +23,14 @@ __all__ = [
"EarlyStopping",
"EvaluationMonitor",
"TrainingCheckPoint",
"CallbackContainer"
]
_Score = Union[float, Tuple[float, float]]
_ScoreList = Union[List[float], List[Tuple[float, float]]]
_Model = Any # real type is Union[Booster, CVPack]; need more work
# pylint: disable=unused-argument
class TrainingCallback(ABC):
@ -43,19 +45,19 @@ class TrainingCallback(ABC):
def __init__(self) -> None:
pass
def before_training(self, model):
def before_training(self, model: _Model) -> _Model:
'''Run before training starts.'''
return model
def after_training(self, model):
def after_training(self, model: _Model) -> _Model:
'''Run after training is finished.'''
return model
def before_iteration(self, model, epoch: int, evals_log: EvalsLog) -> bool:
def before_iteration(self, model: _Model, epoch: int, evals_log: EvalsLog) -> bool:
'''Run before each iteration. Return True when training should stop.'''
return False
def after_iteration(self, model, epoch: int, evals_log: EvalsLog) -> bool:
def after_iteration(self, model: _Model, epoch: int, evals_log: EvalsLog) -> bool:
'''Run after each iteration. Return True when training should stop.'''
return False
@ -140,7 +142,7 @@ class CallbackContainer:
if self.is_cv:
self.aggregated_cv = None
def before_training(self, model):
def before_training(self, model: _Model) -> _Model:
'''Function called before training.'''
for c in self.callbacks:
model = c.before_training(model=model)
@ -151,7 +153,7 @@ class CallbackContainer:
assert isinstance(model, Booster), msg
return model
def after_training(self, model):
def after_training(self, model: _Model) -> _Model:
'''Function called after training.'''
for c in self.callbacks:
model = c.after_training(model=model)
@ -182,7 +184,7 @@ class CallbackContainer:
return model
def before_iteration(
self, model, epoch: int, dtrain: DMatrix, evals: List[Tuple[DMatrix, str]]
self, model: _Model, epoch: int, dtrain: DMatrix, evals: Optional[List[Tuple[DMatrix, str]]]
) -> bool:
'''Function called before training iteration.'''
return any(c.before_iteration(model, epoch, self.history)
@ -220,7 +222,7 @@ class CallbackContainer:
def after_iteration(
self,
model,
model: _Model,
epoch: int,
dtrain: DMatrix,
evals: Optional[List[Tuple[DMatrix, str]]],
@ -276,7 +278,7 @@ class LearningRateScheduler(TrainingCallback):
super().__init__()
def after_iteration(
self, model, epoch: int, evals_log: TrainingCallback.EvalsLog
self, model: _Model, epoch: int, evals_log: TrainingCallback.EvalsLog
) -> bool:
model.set_param("learning_rate", self.learning_rates(epoch))
return False
@ -344,12 +346,12 @@ class EarlyStopping(TrainingCallback):
self.starting_round: int = 0
super().__init__()
def before_training(self, model):
def before_training(self, model: _Model) -> _Model:
self.starting_round = model.num_boosted_rounds()
return model
def _update_rounds(
self, score: _Score, name: str, metric: str, model, epoch: int
self, score: _Score, name: str, metric: str, model: _Model, epoch: int
) -> bool:
def get_s(x: _Score) -> float:
"""get score if it's cross validation history."""
@ -403,7 +405,7 @@ class EarlyStopping(TrainingCallback):
return True
return False
def after_iteration(self, model, epoch: int,
def after_iteration(self, model: _Model, epoch: int,
evals_log: TrainingCallback.EvalsLog) -> bool:
epoch += self.starting_round # training continuation
msg = 'Must have at least 1 validation dataset for early stopping.'
@ -431,7 +433,7 @@ class EarlyStopping(TrainingCallback):
score = data_log[metric_name][-1]
return self._update_rounds(score, data_name, metric_name, model, epoch)
def after_training(self, model):
def after_training(self, model: _Model) -> _Model:
try:
if self.save_best:
model = model[: int(model.attr("best_iteration")) + 1]
@ -477,7 +479,7 @@ class EvaluationMonitor(TrainingCallback):
msg = f"\t{data + '-' + metric}:{score:.5f}"
return msg
def after_iteration(self, model, epoch: int,
def after_iteration(self, model: _Model, epoch: int,
evals_log: TrainingCallback.EvalsLog) -> bool:
if not evals_log:
return False
@ -503,7 +505,7 @@ class EvaluationMonitor(TrainingCallback):
self._latest = msg
return False
def after_training(self, model):
def after_training(self, model: _Model) -> _Model:
if rabit.get_rank() == self.printer_rank and self._latest is not None:
rabit.tracker_print(self._latest)
return model
@ -544,7 +546,7 @@ class TrainingCheckPoint(TrainingCallback):
self._epoch = 0
super().__init__()
def after_iteration(self, model, epoch: int,
def after_iteration(self, model: _Model, epoch: int,
evals_log: TrainingCallback.EvalsLog) -> bool:
if self._epoch == self._iterations:
path = os.path.join(self._path, self._name + '_' + str(epoch) +
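
A minimal custom callback written against the new annotations (a sketch; RoundLogger is hypothetical, and model stays untyped at the subclass level since _Model is a private alias of Any):

from xgboost.callback import TrainingCallback

class RoundLogger(TrainingCallback):
    """Print the epoch after every boosting round."""

    def after_iteration(self, model, epoch: int,
                        evals_log: TrainingCallback.EvalsLog) -> bool:
        print(f"finished round {epoch}")
        return False  # returning True would stop training

# e.g. xgb.train(params, dtrain, callbacks=[RoundLogger()])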

View File

@ -1,30 +1,32 @@
# coding: utf-8
# pylint: disable= invalid-name, unused-import
"""For compatibility and optional dependencies."""
from typing import Any
from typing import Any, Type, Dict, Optional, List
import sys
import types
import importlib.util
import logging
import numpy as np
from xgboost._typing import CStrPtr
assert (sys.version_info[0] == 3), 'Python 2 is no longer supported.'
def py_str(x):
def py_str(x: CStrPtr) -> str:
"""convert c string back to python string"""
return x.decode('utf-8')
return x.decode('utf-8') # type: ignore
def lazy_isinstance(instance, module, name):
def lazy_isinstance(instance: Type[object], module: str, name: str) -> bool:
"""Use string representation to identify a type."""
# Notice, we use .__class__ as opposed to type() in order
# to support object proxies such as weakref.proxy
cls = instance.__class__
module = cls.__module__ == module
name = cls.__name__ == name
return module and name
is_same_module = cls.__module__ == module
has_same_name = cls.__name__ == name
return is_same_module and has_same_name
# pandas
@ -37,34 +39,49 @@ try:
except ImportError:
MultiIndex = object
DataFrame: Any = object
DataFrame = object
Series = object
pandas_concat = None
PANDAS_INSTALLED = False
# sklearn
try:
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin, ClassifierMixin
from sklearn.base import (
BaseEstimator as XGBModelBase,
RegressorMixin as XGBRegressorBase,
ClassifierMixin as XGBClassifierBase
)
from sklearn.preprocessing import LabelEncoder
try:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import (
KFold as XGBKFold,
StratifiedKFold as XGBStratifiedKFold
)
except ImportError:
from sklearn.cross_validation import KFold, StratifiedKFold
from sklearn.cross_validation import (
KFold as XGBKFold,
StratifiedKFold as XGBStratifiedKFold
)
SKLEARN_INSTALLED = True
XGBModelBase = BaseEstimator
XGBRegressorBase = RegressorMixin
XGBClassifierBase = ClassifierMixin
except ImportError:
SKLEARN_INSTALLED = False
XGBKFold = KFold
XGBStratifiedKFold = StratifiedKFold
# used for compatibility without sklearn
XGBModelBase = object
XGBClassifierBase = object
XGBRegressorBase = object
LabelEncoder = object
class XGBoostLabelEncoder(LabelEncoder):
XGBKFold = None
XGBStratifiedKFold = None
class XGBoostLabelEncoder(LabelEncoder):
'''Label encoder with JSON serialization methods.'''
def to_json(self):
def to_json(self) -> Dict:
'''Returns a JSON compatible dictionary'''
meta = {}
for k, v in self.__dict__.items():
@ -74,7 +91,7 @@ try:
meta[k] = v
return meta
def from_json(self, doc):
def from_json(self, doc: Dict) -> None:
# pylint: disable=attribute-defined-outside-init
'''Load the encoder back from a JSON compatible dict.'''
meta = {}
@ -84,17 +101,6 @@ try:
continue
meta[k] = v
self.__dict__.update(meta)
except ImportError:
SKLEARN_INSTALLED = False
# used for compatibility without sklearn
XGBModelBase = object
XGBClassifierBase = object
XGBRegressorBase = object
XGBKFold = None
XGBStratifiedKFold = None
XGBoostLabelEncoder = None
# dask
@ -113,7 +119,7 @@ try:
SCIPY_INSTALLED = True
except ImportError:
scipy_sparse = False
scipy_csr: Any = object
scipy_csr = object
SCIPY_INSTALLED = False
@ -136,15 +142,21 @@ class LazyLoader(types.ModuleType):
"""Lazily import a module, mainly to avoid pulling in large dependencies.
"""
def __init__(self, local_name, parent_module_globals, name, warning=None):
def __init__(
self,
local_name: str,
parent_module_globals: Dict,
name: str,
warning: Optional[str] = None
) -> None:
self._local_name = local_name
self._parent_module_globals = parent_module_globals
self._warning = warning
self.module = None
self.module: Optional[types.ModuleType] = None
super().__init__(name)
def _load(self):
def _load(self) -> types.ModuleType:
"""Load the module and insert it into the parent's globals."""
# Import the target module and insert it into the parent's namespace
module = importlib.import_module(self.__name__)
@ -163,12 +175,12 @@ class LazyLoader(types.ModuleType):
return module
def __getattr__(self, item):
def __getattr__(self, item: str) -> Any:
if not self.module:
self.module = self._load()
return getattr(self.module, item)
def __dir__(self):
def __dir__(self) -> List[str]:
if not self.module:
self.module = self._load()
return dir(self.module)
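
A small usage sketch of LazyLoader under the new signature (np_lazy is an illustrative name):

from xgboost.compat import LazyLoader

# Nothing is imported here; the real `import numpy` runs on first attribute access.
np_lazy = LazyLoader("np_lazy", globals(), "numpy")

def zeros(n: int):
    return np_lazy.zeros(n)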

View File

@ -4,12 +4,20 @@ import ctypes
import json
from contextlib import contextmanager
from functools import wraps
from typing import Optional, Callable, Any, Dict, cast, Iterator
from .core import _LIB, _check_call, c_str, py_str
from ._typing import _F
def config_doc(*, header=None, extra_note=None, parameters=None, returns=None,
see_also=None):
def config_doc(
*,
header: Optional[str] = None,
extra_note: Optional[str] = None,
parameters: Optional[str] = None,
returns: Optional[str] = None,
see_also: Optional[str] = None
) -> Callable[[_F], _F]:
"""Decorator to format docstring for config functions.
Parameters
@ -64,19 +72,19 @@ def config_doc(*, header=None, extra_note=None, parameters=None, returns=None,
assert xgb.get_config()['verbosity'] == 2 # old value restored
"""
def none_to_str(value):
def none_to_str(value: Optional[str]) -> str:
return '' if value is None else value
def config_doc_decorator(func):
def config_doc_decorator(func: _F) -> _F:
func.__doc__ = (doc_template.format(header=none_to_str(header),
extra_note=none_to_str(extra_note))
+ none_to_str(parameters) + none_to_str(returns)
+ none_to_str(common_example) + none_to_str(see_also))
@wraps(func)
def wrap(*args, **kwargs):
def wrap(*args: Any, **kwargs: Any) -> Any:
return func(*args, **kwargs)
return wrap
return cast(_F, wrap)
return config_doc_decorator
@ -89,7 +97,7 @@ def config_doc(*, header=None, extra_note=None, parameters=None, returns=None,
new_config: Dict[str, Any]
Keyword arguments representing the parameters and their values
""")
def set_config(**new_config):
def set_config(**new_config: Any) -> None:
config = json.dumps(new_config)
_check_call(_LIB.XGBSetGlobalConfig(c_str(config)))
@ -103,7 +111,7 @@ def set_config(**new_config):
args: Dict[str, Any]
The list of global parameters and their values
""")
def get_config():
def get_config() -> Dict[str, Any]:
config_str = ctypes.c_char_p()
_check_call(_LIB.XGBGetGlobalConfig(ctypes.byref(config_str)))
config = json.loads(py_str(config_str.value))
@ -132,7 +140,7 @@ def get_config():
set_config: Set global XGBoost configuration
get_config: Get current values of the global configuration
""")
def config_context(**new_config):
def config_context(**new_config: Any) -> Iterator[None]:
old_config = get_config().copy()
set_config(**new_config)
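
A round-trip sketch of the now fully annotated config functions, mirroring the docstring example above:

import xgboost as xgb

xgb.set_config(verbosity=2)
assert xgb.get_config()["verbosity"] == 2

with xgb.config_context(verbosity=0):
    assert xgb.get_config()["verbosity"] == 0
assert xgb.get_config()["verbosity"] == 2  # old value restored on exit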

View File

@ -30,10 +30,12 @@ from ._typing import (
ArrayLike,
CFloatPtr,
NumpyOrCupy,
FeatureNames,
FeatureInfo,
FeatureTypes,
FeatureNames,
_T,
CupyT,
BoosterParam
)
@ -273,7 +275,7 @@ def ctypes2numpy(cptr: CNumericPtr, length: int, dtype: Type[np.number]) -> np.n
if not isinstance(cptr, ctypes.POINTER(ctype)):
raise RuntimeError(f"expected {ctype} pointer")
res = np.zeros(length, dtype=dtype)
if not ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]):
if not ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]): # type: ignore
raise RuntimeError("memmove failed")
return res
@ -310,7 +312,7 @@ def ctypes2buffer(cptr: CStrPtr, length: int) -> bytearray:
raise RuntimeError('expected char pointer')
res = bytearray(length)
rptr = (ctypes.c_char * length).from_buffer(res)
if not ctypes.memmove(rptr, cptr, length):
if not ctypes.memmove(rptr, cptr, length): # type: ignore
raise RuntimeError('memmove failed')
return res
@ -434,8 +436,8 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
def data_handle(
data: Any,
*,
feature_names: FeatureNames = None,
feature_types: Optional[List[str]] = None,
feature_names: Optional[FeatureNames] = None,
feature_types: Optional[FeatureTypes] = None,
**kwargs: Any,
) -> None:
from .data import dispatch_proxy_set_data
@ -555,8 +557,8 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
base_margin: Optional[ArrayLike] = None,
missing: Optional[float] = None,
silent: bool = False,
feature_names: FeatureNames = None,
feature_types: FeatureTypes = None,
feature_names: Optional[FeatureNames] = None,
feature_types: Optional[FeatureTypes] = None,
nthread: Optional[int] = None,
group: Optional[ArrayLike] = None,
qid: Optional[ArrayLike] = None,
@ -718,8 +720,8 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
qid: Optional[ArrayLike] = None,
label_lower_bound: Optional[ArrayLike] = None,
label_upper_bound: Optional[ArrayLike] = None,
feature_names: FeatureNames = None,
feature_types: Optional[List[str]] = None,
feature_names: Optional[FeatureNames] = None,
feature_types: Optional[FeatureTypes] = None,
feature_weights: Optional[ArrayLike] = None
) -> None:
"""Set meta info for DMatrix. See doc string for :py:obj:`xgboost.DMatrix`."""
@ -1000,7 +1002,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
return res
@property
def feature_names(self) -> Optional[List[str]]:
def feature_names(self) -> Optional[FeatureNames]:
"""Get feature names (column labels).
Returns
@ -1023,7 +1025,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
return feature_names
@feature_names.setter
def feature_names(self, feature_names: FeatureNames) -> None:
def feature_names(self, feature_names: Optional[FeatureNames]) -> None:
"""Set feature names (column labels).
Parameters
@ -1039,7 +1041,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
else:
feature_names = [feature_names]
except TypeError:
feature_names = [feature_names]
feature_names = [cast(str, feature_names)]
if len(feature_names) != len(set(feature_names)):
raise ValueError('feature_names must be unique')
@ -1069,8 +1071,13 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
self.feature_types = None
@property
def feature_types(self) -> Optional[List[str]]:
"""Get feature types. See :py:class:`DMatrix` for details."""
def feature_types(self) -> Optional[FeatureTypes]:
"""Get feature types (column types).
Returns
-------
feature_types : list or None
"""
length = c_bst_ulong()
sarr = ctypes.POINTER(ctypes.c_char_p)()
_check_call(_LIB.XGDMatrixGetStrFeatureInfo(self.handle,
@ -1111,7 +1118,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
else:
feature_types = [feature_types]
except TypeError:
feature_types = [feature_types]
feature_types = [cast(str, feature_types)]
feature_types_bytes = [bytes(f, encoding='utf-8')
for f in feature_types]
c_feature_types = (ctypes.c_char_p *
@ -1203,8 +1210,8 @@ class DeviceQuantileDMatrix(DMatrix):
base_margin: Optional[ArrayLike] = None,
missing: Optional[float] = None,
silent: bool = False,
feature_names: FeatureNames = None,
feature_types: Optional[List[str]] = None,
feature_names: Optional[FeatureNames] = None,
feature_types: Optional[FeatureTypes] = None,
nthread: Optional[int] = None,
max_bin: int = 256,
group: Optional[ArrayLike] = None,
@ -1323,7 +1330,7 @@ def _get_booster_layer_trees(model: "Booster") -> Tuple[int, int]:
return num_parallel_tree, num_groups
def _configure_metrics(params: Union[Dict, List]) -> Union[Dict, List]:
def _configure_metrics(params: BoosterParam) -> BoosterParam:
if (
isinstance(params, dict)
and "eval_metric" in params
@ -1349,7 +1356,7 @@ class Booster:
def __init__(
self,
params: Optional[Dict] = None,
params: Optional[BoosterParam] = None,
cache: Optional[Sequence[DMatrix]] = None,
model_file: Optional[Union["Booster", bytearray, os.PathLike, str]] = None
) -> None:
@ -1444,7 +1451,7 @@ class Booster:
"Constrained features are not a subset of training data feature names"
) from e
def _configure_constraints(self, params: Union[List, Dict]) -> Union[List, Dict]:
def _configure_constraints(self, params: BoosterParam) -> BoosterParam:
if isinstance(params, dict):
value = params.get("monotone_constraints")
if value is not None:
@ -1607,7 +1614,7 @@ class Booster:
return py_str(ret.value)
return None
def attributes(self) -> Dict[str, str]:
def attributes(self) -> Dict[str, Optional[str]]:
"""Get attributes stored in the Booster as a dictionary.
Returns
@ -1639,7 +1646,7 @@ class Booster:
_check_call(_LIB.XGBoosterSetAttr(
self.handle, c_str(key), value))
def _get_feature_info(self, field: str) -> Optional[List[str]]:
def _get_feature_info(self, field: str) -> Optional[FeatureInfo]:
length = c_bst_ulong()
sarr = ctypes.POINTER(ctypes.c_char_p)()
if not hasattr(self, "handle") or self.handle is None:
@ -1652,7 +1659,7 @@ class Booster:
feature_info = from_cstr_to_pystr(sarr, length)
return feature_info if feature_info else None
def _set_feature_info(self, features: Optional[Sequence[str]], field: str) -> None:
def _set_feature_info(self, features: Optional[FeatureInfo], field: str) -> None:
if features is not None:
assert isinstance(features, list)
feature_info_bytes = [bytes(f, encoding="utf-8") for f in features]
@ -1670,7 +1677,7 @@ class Booster:
)
@property
def feature_types(self) -> Optional[List[str]]:
def feature_types(self) -> Optional[FeatureTypes]:
"""Feature types for this booster. Can be directly set by input data or by
assignment. See :py:class:`DMatrix` for details.
@ -1678,11 +1685,11 @@ class Booster:
return self._get_feature_info("feature_type")
@feature_types.setter
def feature_types(self, features: Optional[List[str]]) -> None:
def feature_types(self, features: Optional[FeatureTypes]) -> None:
self._set_feature_info(features, "feature_type")
@property
def feature_names(self) -> Optional[List[str]]:
def feature_names(self) -> Optional[FeatureNames]:
"""Feature names for this booster. Can be directly set by input data or by
assignment.
@ -1690,7 +1697,7 @@ class Booster:
return self._get_feature_info("feature_name")
@feature_names.setter
def feature_names(self, features: FeatureNames) -> None:
def feature_names(self, features: Optional[FeatureNames]) -> None:
self._set_feature_info(features, "feature_name")
def set_param(
@ -1711,7 +1718,7 @@ class Booster:
params = params.items()
elif isinstance(params, str) and value is not None:
params = [(params, value)]
for key, val in params:
for key, val in cast(Iterable[Tuple[str, str]], params):
if val is not None:
_check_call(_LIB.XGBoosterSetParam(self.handle, c_str(key),
c_str(str(val))))
@ -2564,8 +2571,10 @@ class Booster:
)
# Booster can't accept data with different feature names
if self.feature_names != data.feature_names:
dat_missing = set(self.feature_names) - set(data.feature_names)
my_missing = set(data.feature_names) - set(self.feature_names)
dat_missing = set(cast(FeatureNames, self.feature_names)) - \
set(cast(FeatureNames, data.feature_names))
my_missing = set(cast(FeatureNames, data.feature_names)) - \
set(cast(FeatureNames, self.feature_names))
msg = 'feature_names mismatch: {0} {1}'
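
A usage sketch of the Sequence[str]-based feature info on DMatrix (illustrative data and names):

import numpy as np
import xgboost as xgb

X = np.random.rand(8, 3)
y = np.random.rand(8)

# Pass None to leave feature info unset; names must be unique strings.
dtrain = xgb.DMatrix(X, label=y,
                     feature_names=["f_a", "f_b", "f_c"],
                     feature_types=["float", "float", "float"])
assert dtrain.feature_names == ["f_a", "f_b", "f_c"]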

View File

@ -318,7 +318,7 @@ class DaskDMatrix:
base_margin: Optional[_DaskCollection] = None,
missing: float = None,
silent: bool = False, # pylint: disable=unused-argument
feature_names: FeatureNames = None,
feature_names: Optional[FeatureNames] = None,
feature_types: FeatureTypes = None,
group: Optional[_DaskCollection] = None,
qid: Optional[_DaskCollection] = None,
@ -594,7 +594,7 @@ class DaskPartitionIter(DataIter): # pylint: disable=R0902
qid: Optional[List[Any]] = None,
label_lower_bound: Optional[List[Any]] = None,
label_upper_bound: Optional[List[Any]] = None,
feature_names: FeatureNames = None,
feature_names: Optional[FeatureNames] = None,
feature_types: Optional[Union[Any, List[Any]]] = None,
) -> None:
self._data = data
@ -637,7 +637,7 @@ class DaskPartitionIter(DataIter): # pylint: disable=R0902
if self._iter == len(self._data):
# Return 0 when there's no more batch.
return 0
feature_names: FeatureNames = None
feature_names: Optional[FeatureNames] = None
if self._feature_names:
feature_names = self._feature_names
else:
@ -688,7 +688,7 @@ class DaskDeviceQuantileDMatrix(DaskDMatrix):
base_margin: Optional[_DaskCollection] = None,
missing: float = None,
silent: bool = False, # disable=unused-argument
feature_names: FeatureNames = None,
feature_names: Optional[FeatureNames] = None,
feature_types: Optional[Union[Any, List[Any]]] = None,
max_bin: int = 256,
group: Optional[_DaskCollection] = None,
@ -725,7 +725,7 @@ class DaskDeviceQuantileDMatrix(DaskDMatrix):
def _create_device_quantile_dmatrix(
feature_names: FeatureNames,
feature_names: Optional[FeatureNames],
feature_types: Optional[Union[Any, List[Any]]],
feature_weights: Optional[Any],
missing: float,
@ -766,7 +766,7 @@ def _create_device_quantile_dmatrix(
def _create_dmatrix(
feature_names: FeatureNames,
feature_names: Optional[FeatureNames],
feature_types: Optional[Union[Any, List[Any]]],
feature_weights: Optional[Any],
missing: float,
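
A hypothetical sketch of the same Optional[FeatureNames] pattern on the dask side, assuming a local cluster and dask[distributed] installed:

from dask import array as da
from distributed import Client, LocalCluster
from xgboost import dask as dxgb

with Client(LocalCluster(n_workers=2)) as client:
    X = da.random.random((1000, 4), chunks=(250, 4))
    y = da.random.random(1000, chunks=(250,))
    # feature_names defaults to None, matching the annotations above.
    dtrain = dxgb.DaskDMatrix(client, X, y,
                              feature_names=["a", "b", "c", "d"])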

View File

@ -5,17 +5,26 @@ import ctypes
import json
import warnings
import os
from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Type
from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Sequence, cast
import numpy as np
from .core import c_array, _LIB, _check_call, c_str
from .core import _cuda_array_interface
from .core import DataIter, _ProxyDMatrix, DMatrix, FeatureNames
from ._typing import FeatureTypes
from .core import DataIter, _ProxyDMatrix, DMatrix
from .compat import lazy_isinstance, DataFrame
from ._typing import (
c_bst_ulong,
DataType,
FeatureTypes,
FeatureNames,
NumpyDType,
CupyT,
FloatCompatible, PandasDType
)
c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name
DispatchedDataBackendReturnType = Tuple[
ctypes.c_void_p, Optional[FeatureNames], Optional[FeatureTypes]]
CAT_T = "c"
@ -23,14 +32,14 @@ CAT_T = "c"
_matrix_meta = {"base_margin", "label"}
def _warn_unused_missing(data, missing):
def _warn_unused_missing(data: DataType, missing: Optional[FloatCompatible]) -> None:
if (missing is not None) and (not np.isnan(missing)):
warnings.warn(
'`missing` is not used for current input data type:' +
str(type(data)), UserWarning)
def _check_complex(data):
def _check_complex(data: DataType) -> None:
'''Test whether data is complex using `dtype` attribute.'''
complex_dtypes = (np.complex128, np.complex64,
np.cfloat, np.cdouble, np.clongdouble)
@ -38,16 +47,15 @@ def _check_complex(data):
raise ValueError('Complex data not supported')
def _check_data_shape(data: Any) -> None:
def _check_data_shape(data: DataType) -> None:
if hasattr(data, "shape") and len(data.shape) != 2:
raise ValueError("Please reshape the input data into 2-dimensional matrix.")
def _is_scipy_csr(data):
def _is_scipy_csr(data: DataType) -> bool:
try:
import scipy
import scipy.sparse
except ImportError:
scipy = None
return False
return isinstance(data, scipy.sparse.csr_matrix)
@ -64,12 +72,12 @@ def _array_interface(data: np.ndarray) -> bytes:
def _from_scipy_csr(
data,
missing,
nthread,
feature_names: FeatureNames,
feature_types: FeatureTypes,
):
data: DataType,
missing: FloatCompatible,
nthread: int,
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
) -> DispatchedDataBackendReturnType:
"""Initialize data from a CSR matrix."""
if len(data.indices) != len(data.data):
raise ValueError(
@ -94,21 +102,20 @@ def _from_scipy_csr(
return handle, feature_names, feature_types
def _is_scipy_csc(data):
def _is_scipy_csc(data: DataType) -> bool:
try:
import scipy
import scipy.sparse
except ImportError:
scipy = None
return False
return isinstance(data, scipy.sparse.csc_matrix)
def _from_scipy_csc(
data,
missing,
feature_names: FeatureNames,
feature_types: FeatureTypes,
):
data: DataType,
missing: Optional[FloatCompatible],
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
) -> DispatchedDataBackendReturnType:
if len(data.indices) != len(data.data):
raise ValueError(f"length mismatch: {len(data.indices)} vs {len(data.data)}")
_warn_unused_missing(data, missing)
@ -124,27 +131,29 @@ def _from_scipy_csc(
return handle, feature_names, feature_types
def _is_scipy_coo(data):
def _is_scipy_coo(data: DataType) -> bool:
try:
import scipy
import scipy.sparse
except ImportError:
scipy = None
return False
return isinstance(data, scipy.sparse.coo_matrix)
def _is_numpy_array(data):
def _is_numpy_array(data: DataType) -> bool:
return isinstance(data, (np.ndarray, np.matrix))
def _ensure_np_dtype(data, dtype) -> Tuple[np.ndarray, np.dtype]:
def _ensure_np_dtype(
data: DataType,
dtype: Optional[NumpyDType]
) -> Tuple[np.ndarray, Optional[NumpyDType]]:
if data.dtype.hasobject or data.dtype in [np.float16, np.bool_]:
data = data.astype(np.float32, copy=False)
dtype = np.float32
return data, dtype
def _maybe_np_slice(data: np.ndarray, dtype) -> np.ndarray:
def _maybe_np_slice(data: DataType, dtype: Optional[NumpyDType]) -> np.ndarray:
'''Handle numpy slice. This can be removed if we use __array_interface__.
'''
try:
@ -159,12 +168,12 @@ def _maybe_np_slice(data: np.ndarray, dtype) -> np.ndarray:
def _from_numpy_array(
data,
missing,
nthread,
feature_names: FeatureNames,
feature_types: FeatureTypes,
):
data: DataType,
missing: FloatCompatible,
nthread: int,
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
) -> DispatchedDataBackendReturnType:
"""Initialize data from a 2-D numpy matrix.
"""
@ -189,7 +198,7 @@ def _from_numpy_array(
return handle, feature_names, feature_types
def _is_pandas_df(data):
def _is_pandas_df(data: DataType) -> bool:
try:
import pandas as pd
except ImportError:
@ -197,7 +206,7 @@ def _is_pandas_df(data):
return isinstance(data, pd.DataFrame)
def _is_modin_df(data):
def _is_modin_df(data: DataType) -> bool:
try:
import modin.pandas as pd
except ImportError:
@ -232,7 +241,7 @@ _ENABLE_CAT_ERR = (
)
def _invalid_dataframe_dtype(data: Any) -> None:
def _invalid_dataframe_dtype(data: DataType) -> None:
# pandas series has `dtypes` but it's just a single object
# cudf series doesn't have `dtypes`.
if hasattr(data, "dtypes") and hasattr(data.dtypes, "__iter__"):
@ -253,10 +262,10 @@ def _invalid_dataframe_dtype(data: Any) -> None:
def _pandas_feature_info(
data: DataFrame,
meta: Optional[str],
feature_names: FeatureNames,
feature_types: FeatureTypes,
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
enable_categorical: bool,
) -> Tuple[FeatureNames, FeatureTypes]:
) -> Tuple[Optional[FeatureNames], Optional[FeatureTypes]]:
import pandas as pd
from pandas.api.types import (
is_sparse,
@ -285,13 +294,13 @@ def _pandas_feature_info(
return feature_names, feature_types
def is_nullable_dtype(dtype: Any) -> bool:
def is_nullable_dtype(dtype: PandasDType) -> bool:
"""Wether dtype is a pandas nullable type."""
from pandas.api.types import is_integer_dtype, is_bool_dtype
# dtype: pd.core.arrays.numeric.NumericDtype
nullable_alias = {"Int16", "Int32", "Int64"}
is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias
# np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
return is_int or is_bool
@ -331,11 +340,11 @@ def _pandas_cat_null(data: DataFrame) -> DataFrame:
def _transform_pandas_df(
data: DataFrame,
enable_categorical: bool,
feature_names: FeatureNames = None,
feature_types: FeatureTypes = None,
feature_names: Optional[FeatureNames] = None,
feature_types: Optional[FeatureTypes] = None,
meta: Optional[str] = None,
meta_type: Optional[str] = None,
) -> Tuple[np.ndarray, FeatureNames, FeatureTypes]:
meta_type: Optional[NumpyDType] = None,
) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]:
from pandas.api.types import (
is_sparse,
is_categorical_dtype,
@ -359,7 +368,7 @@ def _transform_pandas_df(
if meta and len(data.columns) > 1 and meta not in _matrix_meta:
raise ValueError(f"DataFrame for {meta} cannot have multiple columns")
dtype: Union[Type[np.floating], str] = meta_type if meta_type else np.float32
dtype = meta_type if meta_type else np.float32
arr: np.ndarray = transformed.values
if meta_type:
arr = arr.astype(dtype)
@ -369,18 +378,18 @@ def _transform_pandas_df(
def _from_pandas_df(
data: DataFrame,
enable_categorical: bool,
missing: float,
missing: FloatCompatible,
nthread: int,
feature_names: FeatureNames,
feature_types: FeatureTypes,
) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]:
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
) -> DispatchedDataBackendReturnType:
data, feature_names, feature_types = _transform_pandas_df(
data, enable_categorical, feature_names, feature_types
)
return _from_numpy_array(data, missing, nthread, feature_names, feature_types)
def _is_pandas_series(data):
def _is_pandas_series(data: DataType) -> bool:
try:
import pandas as pd
except ImportError:
@ -389,18 +398,21 @@ def _is_pandas_series(data):
def _meta_from_pandas_series(
data, name: str, dtype: Optional[str], handle: ctypes.c_void_p
data: DataType,
name: str,
dtype: Optional[NumpyDType],
handle: ctypes.c_void_p
) -> None:
"""Help transform pandas series for meta data like labels"""
data = data.values.astype('float')
from pandas.api.types import is_sparse
if is_sparse(data):
data = data.to_dense()
data = data.to_dense() # type: ignore
assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1
_meta_from_numpy(data, name, dtype, handle)
def _is_modin_series(data):
def _is_modin_series(data: DataType) -> bool:
try:
import modin.pandas as pd
except ImportError:
@ -409,13 +421,13 @@ def _is_modin_series(data):
def _from_pandas_series(
data,
missing: float,
data: DataType,
missing: FloatCompatible,
nthread: int,
enable_categorical: bool,
feature_names: FeatureNames,
feature_types: FeatureTypes,
):
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
) -> DispatchedDataBackendReturnType:
from pandas.api.types import is_categorical_dtype
if (data.dtype.name not in _pandas_dtype_mapper) and not (
@ -433,7 +445,7 @@ def _from_pandas_series(
)
def _is_dt_df(data):
def _is_dt_df(data: DataType) -> bool:
return lazy_isinstance(data, 'datatable', 'Frame') or \
lazy_isinstance(data, 'datatable', 'DataTable')
@ -443,12 +455,12 @@ _dt_type_mapper2 = {'bool': 'i', 'int': 'int', 'real': 'float'}
def _transform_dt_df(
data,
feature_names: FeatureNames,
feature_types: FeatureTypes,
meta=None,
meta_type=None,
):
data: DataType,
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
meta: Optional[str] = None,
meta_type: Optional[NumpyDType] = None,
) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]:
"""Validate feature names and types if data table"""
if meta and data.shape[1] > 1:
raise ValueError('DataTable for meta info cannot have multiple columns')
@ -482,13 +494,13 @@ def _transform_dt_df(
def _from_dt_df(
data,
missing,
nthread,
feature_names: FeatureNames,
feature_types: FeatureTypes,
data: DataType,
missing: Optional[FloatCompatible],
nthread: int,
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
enable_categorical: bool,
) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]:
) -> DispatchedDataBackendReturnType:
if enable_categorical:
raise ValueError("categorical data in datatable is not supported yet.")
data, feature_names, feature_types = _transform_dt_df(
@ -525,7 +537,7 @@ def _from_dt_df(
return handle, feature_names, feature_types
def _is_arrow(data) -> bool:
def _is_arrow(data: DataType) -> bool:
try:
import pyarrow as pa
from pyarrow import dataset as arrow_dataset
@ -571,13 +583,13 @@ def record_batch_data_iter(data_iter: Iterator) -> Callable:
def _from_arrow(
data,
missing: float,
data: DataType,
missing: FloatCompatible,
nthread: int,
feature_names: FeatureNames,
feature_types: FeatureTypes,
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
enable_categorical: bool,
) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]:
) -> DispatchedDataBackendReturnType:
import pyarrow as pa
if not all(
@ -605,11 +617,11 @@ def _from_arrow(
return handle, feature_names, feature_types
def _is_cudf_df(data) -> bool:
def _is_cudf_df(data: DataType) -> bool:
return lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes:
"""Extract CuDF __cuda_array_interface__. This is special as it returns a new list of
data and a list of array interfaces. The data is a list of categorical codes that the
caller can safely ignore, but must keep their reference alive until usage of the array
@ -645,11 +657,11 @@ def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
def _transform_cudf_df(
data,
feature_names: FeatureNames,
feature_types: FeatureTypes,
data: DataType,
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
enable_categorical: bool,
):
) -> Tuple[ctypes.c_void_p, list, Optional[FeatureNames], Optional[FeatureTypes]]:
try:
from cudf.api.types import is_categorical_dtype
except ImportError:
@ -709,13 +721,13 @@ def _transform_cudf_df(
def _from_cudf_df(
data,
missing,
nthread,
feature_names: FeatureNames,
feature_types: FeatureTypes,
data: DataType,
missing: FloatCompatible,
nthread: int,
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
enable_categorical: bool,
) -> Tuple[ctypes.c_void_p, Any, Any]:
) -> DispatchedDataBackendReturnType:
data, cat_codes, feature_names, feature_types = _transform_cudf_df(
data, feature_names, feature_types, enable_categorical
)
@ -732,7 +744,7 @@ def _from_cudf_df(
return handle, feature_names, feature_types
def _is_cudf_ser(data):
def _is_cudf_ser(data: DataType) -> bool:
try:
import cudf
except ImportError:
@ -740,13 +752,13 @@ def _is_cudf_ser(data):
return isinstance(data, cudf.Series)
def _is_cupy_array(data: Any) -> bool:
def _is_cupy_array(data: DataType) -> bool:
return lazy_isinstance(data, "cupy.core.core", "ndarray") or lazy_isinstance(
data, "cupy._core.core", "ndarray"
)
def _transform_cupy_array(data):
def _transform_cupy_array(data: DataType) -> CupyT:
import cupy # pylint: disable=import-error
if not hasattr(data, '__cuda_array_interface__') and hasattr(
data, '__array__'):
@ -757,12 +769,12 @@ def _transform_cupy_array(data):
def _from_cupy_array(
data,
missing,
nthread,
feature_names: FeatureNames,
feature_types: FeatureTypes,
):
data: DataType,
missing: FloatCompatible,
nthread: int,
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
) -> DispatchedDataBackendReturnType:
"""Initialize DMatrix from cupy ndarray."""
data = _transform_cupy_array(data)
interface_str = _cuda_array_interface(data)
@ -776,7 +788,7 @@ def _from_cupy_array(
return handle, feature_names, feature_types
def _is_cupy_csr(data):
def _is_cupy_csr(data: DataType) -> bool:
try:
import cupyx
except ImportError:
@ -784,7 +796,7 @@ def _is_cupy_csr(data):
return isinstance(data, cupyx.scipy.sparse.csr_matrix)
def _is_cupy_csc(data):
def _is_cupy_csc(data: DataType) -> bool:
try:
import cupyx
except ImportError:
@ -792,11 +804,11 @@ def _is_cupy_csc(data):
return isinstance(data, cupyx.scipy.sparse.csc_matrix)
def _is_dlpack(data):
def _is_dlpack(data: DataType) -> bool:
return 'PyCapsule' in str(type(data)) and "dltensor" in str(data)
def _transform_dlpack(data):
def _transform_dlpack(data: DataType) -> CupyT:
from cupy import fromDlpack # pylint: disable=E0401
assert 'used_dltensor' not in str(data)
data = fromDlpack(data)
@ -804,27 +816,27 @@ def _transform_dlpack(data):
def _from_dlpack(
data,
missing,
nthread,
feature_names: FeatureNames,
feature_types: FeatureTypes,
):
data: DataType,
missing: FloatCompatible,
nthread: int,
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
) -> DispatchedDataBackendReturnType:
data = _transform_dlpack(data)
return _from_cupy_array(data, missing, nthread, feature_names,
feature_types)
def _is_uri(data):
def _is_uri(data: DataType) -> bool:
return isinstance(data, (str, os.PathLike))
def _from_uri(
data,
missing,
feature_names: FeatureNames,
feature_types: FeatureTypes,
):
data: DataType,
missing: Optional[FloatCompatible],
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
) -> DispatchedDataBackendReturnType:
_warn_unused_missing(data, missing)
handle = ctypes.c_void_p()
data = os.fspath(os.path.expanduser(data))
@ -834,51 +846,51 @@ def _from_uri(
return handle, feature_names, feature_types
def _is_list(data):
def _is_list(data: DataType) -> bool:
return isinstance(data, list)
def _from_list(
data,
missing,
n_threads,
feature_names: FeatureNames,
feature_types: FeatureTypes,
):
data: Sequence,
missing: FloatCompatible,
n_threads: int,
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
) -> DispatchedDataBackendReturnType:
array = np.array(data)
_check_data_shape(data)
return _from_numpy_array(array, missing, n_threads, feature_names, feature_types)
def _is_tuple(data):
def _is_tuple(data: DataType) -> bool:
return isinstance(data, tuple)
def _from_tuple(
data,
missing,
n_threads,
feature_names: FeatureNames,
feature_types: FeatureTypes,
):
data: Sequence,
missing: FloatCompatible,
n_threads: int,
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
) -> DispatchedDataBackendReturnType:
return _from_list(data, missing, n_threads, feature_names, feature_types)
def _is_iter(data):
def _is_iter(data: DataType) -> bool:
return isinstance(data, DataIter)
def _has_array_protocol(data):
def _has_array_protocol(data: DataType) -> bool:
return hasattr(data, '__array__')
def _convert_unknown_data(data):
def _convert_unknown_data(data: DataType) -> DataType:
warnings.warn(
f'Unknown data type: {type(data)}, trying to convert it to csr_matrix',
UserWarning
)
try:
import scipy
import scipy.sparse
except ImportError:
return None
@ -891,13 +903,13 @@ def _convert_unknown_data(data):
def dispatch_data_backend(
data,
missing,
threads,
feature_names: FeatureNames,
feature_types: FeatureTypes,
data: DataType,
missing: FloatCompatible, # Or Optional[Float]
threads: int,
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
enable_categorical: bool = False,
):
) -> DispatchedDataBackendReturnType:
'''Dispatch data for DMatrix.'''
if not _is_cudf_ser(data) and not _is_pandas_series(data):
_check_data_shape(data)
@ -964,7 +976,7 @@ def dispatch_data_backend(
raise TypeError('Not supported type for data.' + str(type(data)))
def _to_data_type(dtype: str, name: str):
def _to_data_type(dtype: str, name: str) -> int:
dtype_map = {'float32': 1, 'float64': 2, 'uint32': 3, 'uint64': 4}
if dtype not in dtype_map:
raise TypeError(
@ -973,7 +985,7 @@ def _to_data_type(dtype: str, name: str):
return dtype_map[dtype]
def _validate_meta_shape(data: Any, name: str) -> None:
def _validate_meta_shape(data: DataType, name: str) -> None:
if hasattr(data, "shape"):
msg = f"Invalid shape: {data.shape} for {name}"
if name in _matrix_meta:
@ -990,7 +1002,7 @@ def _validate_meta_shape(data: Any, name: str) -> None:
def _meta_from_numpy(
data: np.ndarray,
field: str,
dtype: Optional[Union[np.dtype, str]],
dtype: Optional[NumpyDType],
handle: ctypes.c_void_p,
) -> None:
data, dtype = _ensure_np_dtype(data, dtype)
@ -1001,16 +1013,26 @@ def _meta_from_numpy(
_check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface_str))
def _meta_from_list(data, field, dtype, handle):
data = np.array(data)
_meta_from_numpy(data, field, dtype, handle)
def _meta_from_list(
data: Sequence,
field: str,
dtype: Optional[NumpyDType],
handle: ctypes.c_void_p
) -> None:
data_np = np.array(data)
_meta_from_numpy(data_np, field, dtype, handle)
def _meta_from_tuple(data, field, dtype, handle):
def _meta_from_tuple(
data: Sequence,
field: str,
dtype: Optional[NumpyDType],
handle: ctypes.c_void_p
) -> None:
return _meta_from_list(data, field, dtype, handle)
def _meta_from_cudf_df(data, field: str, handle: ctypes.c_void_p) -> None:
def _meta_from_cudf_df(data: DataType, field: str, handle: ctypes.c_void_p) -> None:
if field not in _matrix_meta:
_meta_from_cudf_series(data.iloc[:, 0], field, handle)
else:
@ -1019,7 +1041,7 @@ def _meta_from_cudf_df(data, field: str, handle: ctypes.c_void_p) -> None:
_check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface))
def _meta_from_cudf_series(data, field, handle):
def _meta_from_cudf_series(data: DataType, field: str, handle: ctypes.c_void_p) -> None:
interface = bytes(json.dumps([data.__cuda_array_interface__],
indent=2), 'utf-8')
_check_call(_LIB.XGDMatrixSetInfoFromInterface(handle,
@ -1027,7 +1049,7 @@ def _meta_from_cudf_series(data, field, handle):
interface))
def _meta_from_cupy_array(data, field, handle):
def _meta_from_cupy_array(data: DataType, field: str, handle: ctypes.c_void_p) -> None:
data = _transform_cupy_array(data)
interface = bytes(json.dumps([data.__cuda_array_interface__],
indent=2), 'utf-8')
@ -1036,14 +1058,22 @@ def _meta_from_cupy_array(data, field, handle):
interface))
def _meta_from_dt(data, field: str, dtype, handle: ctypes.c_void_p):
def _meta_from_dt(
data: DataType,
field: str,
dtype: Optional[NumpyDType],
handle: ctypes.c_void_p
) -> None:
data, _, _ = _transform_dt_df(data, None, None, field, dtype)
_meta_from_numpy(data, field, dtype, handle)
def dispatch_meta_backend(
matrix: DMatrix, data, name: str, dtype: Optional[Union[str, np.dtype]] = None
):
matrix: DMatrix,
data: DataType,
name: str,
dtype: Optional[NumpyDType] = None
) -> None:
'''Dispatch for meta info.'''
handle = matrix.handle
assert handle is not None
@ -1060,8 +1090,7 @@ def dispatch_meta_backend(
_meta_from_numpy(data, name, dtype, handle)
return
if _is_pandas_df(data):
data, _, _ = _transform_pandas_df(data, False, meta=name,
meta_type=dtype)
data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype)
_meta_from_numpy(data, name, dtype, handle)
return
if _is_pandas_series(data):
@ -1107,7 +1136,7 @@ class SingleBatchInternalIter(DataIter): # pylint: disable=R0902
area for meta info.
'''
def __init__(self, **kwargs: Any):
def __init__(self, **kwargs: Any) -> None:
self.kwargs = kwargs
self.it = 0 # pylint: disable=invalid-name
super().__init__()
@ -1124,11 +1153,13 @@ class SingleBatchInternalIter(DataIter): # pylint: disable=R0902
def _proxy_transform(
data,
feature_names: FeatureNames,
feature_types: FeatureTypes,
data: DataType,
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
enable_categorical: bool,
):
) -> Tuple[
Union[bool, ctypes.c_void_p, np.ndarray],
Optional[list], Optional[FeatureNames], Optional[FeatureTypes]]:
if _is_cudf_df(data) or _is_cudf_ser(data):
return _transform_cudf_df(
data, feature_names, feature_types, enable_categorical
@ -1152,7 +1183,7 @@ def _proxy_transform(
def dispatch_proxy_set_data(
proxy: _ProxyDMatrix,
data: Any,
data: DataType,
cat_codes: Optional[list],
allow_host: bool,
) -> None:
@ -1162,11 +1193,11 @@ def dispatch_proxy_set_data(
if _is_cudf_df(data):
# pylint: disable=W0212
proxy._set_data_from_cuda_columnar(data, cat_codes)
proxy._set_data_from_cuda_columnar(data, cast(List, cat_codes))
return
if _is_cudf_ser(data):
# pylint: disable=W0212
proxy._set_data_from_cuda_columnar(data, cat_codes)
proxy._set_data_from_cuda_columnar(data, cast(List, cat_codes))
return
if _is_cupy_array(data):
proxy._set_data_from_cuda_interface(data) # pylint: disable=W0212
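
The _is_* predicates above all follow one pattern; a standalone sketch of it (_is_scipy_csr_like is hypothetical):

def _is_scipy_csr_like(data: object) -> bool:
    # Import inside the check so scipy stays an optional dependency.
    try:
        import scipy.sparse
    except ImportError:
        return False
    return isinstance(data, scipy.sparse.csr_matrix)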

View File

@ -4,16 +4,34 @@
"""Plotting Library."""
from io import BytesIO
import json
from typing import Optional, Any
import numpy as np
from ._typing import PathLike
from .core import Booster
from .sklearn import XGBModel
Axes = Any # real type is matplotlib.axes.Axes
GraphvizSource = Any # real type is graphviz.Source
def plot_importance(booster, ax=None, height=0.2,
xlim=None, ylim=None, title='Feature importance',
xlabel='F score', ylabel='Features', fmap='',
importance_type='weight', max_num_features=None,
grid=True, show_values=True, **kwargs):
def plot_importance(
booster: Booster,
ax: Optional[Axes] = None,
height: float = 0.2,
xlim: Optional[tuple] = None,
ylim: Optional[tuple] = None,
title: str = "Feature importance",
xlabel: str = "F score",
ylabel: str = "Features",
fmap: PathLike = "",
importance_type: str = "weight",
max_num_features: Optional[int] = None,
grid: bool = True,
show_values: bool = True,
**kwargs: Any
) -> Axes:
"""Plot importance based on fitted trees.
Parameters
@ -78,9 +96,9 @@ def plot_importance(booster, ax=None, height=0.2,
tuples = [(k, importance[k]) for k in importance]
if max_num_features is not None:
# pylint: disable=invalid-unary-operand-type
tuples = sorted(tuples, key=lambda x: x[1])[-max_num_features:]
tuples = sorted(tuples, key=lambda _x: _x[1])[-max_num_features:]
else:
tuples = sorted(tuples, key=lambda x: x[1])
tuples = sorted(tuples, key=lambda _x: _x[1])
labels, values = zip(*tuples)
if ax is None:
@ -120,9 +138,17 @@ def plot_importance(booster, ax=None, height=0.2,
return ax
def to_graphviz(booster, fmap='', num_trees=0, rankdir=None,
yes_color=None, no_color=None,
condition_node_params=None, leaf_node_params=None, **kwargs):
def to_graphviz(
booster: Booster,
fmap: PathLike = "",
num_trees: int = 0,
rankdir: Optional[str] = None,
yes_color: Optional[str] = None,
no_color: Optional[str] = None,
condition_node_params: Optional[dict] = None,
leaf_node_params: Optional[dict] = None,
**kwargs: Any
) -> GraphvizSource:
"""Convert specified tree to graphviz instance. IPython can automatically plot
the returned graphviz instance. Otherwise, you should call the .render() method
of the returned graphviz instance.
@ -212,7 +238,14 @@ def to_graphviz(booster, fmap='', num_trees=0, rankdir=None,
return g
def plot_tree(booster, fmap='', num_trees=0, rankdir=None, ax=None, **kwargs):
def plot_tree(
booster: Booster,
fmap: PathLike = "",
num_trees: int = 0,
rankdir: Optional[str] = None,
ax: Optional[Axes] = None,
**kwargs: Any
) -> Axes:
"""Plot specified tree.
Parameters
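
A usage sketch of the annotated plotting helpers, assuming matplotlib is installed (save_importance_plot is hypothetical):

import xgboost as xgb

def save_importance_plot(booster: xgb.Booster, path: str) -> None:
    # plot_importance returns a matplotlib Axes (aliased here as `Axes = Any`).
    ax = xgb.plot_importance(booster, max_num_features=10)
    ax.figure.savefig(path)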

View File

@ -4,8 +4,19 @@ import copy
import warnings
import json
import os
from typing import Union, Optional, List, Dict, Callable, Tuple, Any, TypeVar, Type, cast
from typing import Sequence
from typing import (
Union,
Optional,
List,
Dict,
Callable,
Sequence,
Tuple,
Any,
TypeVar,
Type,
cast,
)
import numpy as np
from .core import Booster, DMatrix, XGBoostError
@ -14,7 +25,7 @@ from .core import Metric
from .training import train
from .callback import TrainingCallback
from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array
from ._typing import ArrayLike, FeatureTypes
from ._typing import ArrayLike, FeatureNames, FeatureTypes
# Do not use class names from scikit-learn directly. Re-define the classes in
# .compat to guarantee the behavior without scikit-learn.
@ -401,7 +412,7 @@ def _wrap_evaluation_matrices(
eval_qid: Optional[Sequence[Any]],
create_dmatrix: Callable,
enable_categorical: bool,
feature_types: FeatureTypes,
feature_types: Optional[FeatureTypes],
) -> Tuple[Any, List[Tuple[Any, str]]]:
"""Convert array_like evaluation matrices into DMatrix. Perform validation on the way.
@ -717,7 +728,7 @@ class XGBModel(XGBModelBase):
return self._estimator_type # pylint: disable=no-member
def save_model(self, fname: Union[str, os.PathLike]) -> None:
meta = {}
meta: Dict[str, Any] = {}
for k, v in self.__dict__.items():
if k == '_le':
meta['_le'] = self._le.to_json()
@ -1231,7 +1242,7 @@ class XGBModel(XGBModelBase):
importance_type=self.importance_type if self.importance_type else dft()
)
if b.feature_names is None:
feature_names = [f"f{i}" for i in range(self.n_features_in_)]
feature_names: FeatureNames = [f"f{i}" for i in range(self.n_features_in_)]
else:
feature_names = b.feature_names
# gblinear returns all features, so the `get` in the next line is only for gbtree.
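
A sketch of the f"f{i}" fallback in action (illustrative data):

import numpy as np
from xgboost import XGBRegressor

X, y = np.random.rand(32, 4), np.random.rand(32)
model = XGBRegressor(n_estimators=4).fit(X, y)

# Without explicit names the booster synthesizes "f0", "f1", ... labels.
assert len(model.feature_importances_) == model.n_features_in_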

View File

@ -5,20 +5,24 @@
import copy
import os
import warnings
from typing import Optional, Dict, Any, Union, Tuple, Sequence
from typing import Optional, Dict, Any, Union, Tuple, Sequence, List, cast, Iterable
import numpy as np
from .callback import TrainingCallback, CallbackContainer, EvaluationMonitor, EarlyStopping
from .core import Booster, DMatrix, XGBoostError, _deprecate_positional_args
from .core import Metric, Objective
from .compat import (SKLEARN_INSTALLED, XGBStratifiedKFold)
from . import callback
from .compat import SKLEARN_INSTALLED, XGBStratifiedKFold, DataFrame
from ._typing import _F, FPreProcCallable, BoosterParam
_CVFolds = Sequence["CVPack"]
def _assert_new_callback(
callbacks: Optional[Sequence[callback.TrainingCallback]]
callbacks: Optional[Sequence[TrainingCallback]]
) -> None:
is_new_callback: bool = not callbacks or all(
isinstance(c, callback.TrainingCallback) for c in callbacks
isinstance(c, TrainingCallback) for c in callbacks
)
if not is_new_callback:
link = "https://xgboost.readthedocs.io/en/latest/python/callbacks.html"
@ -56,10 +60,10 @@ def train(
feval: Optional[Metric] = None,
maximize: Optional[bool] = None,
early_stopping_rounds: Optional[int] = None,
evals_result: callback.TrainingCallback.EvalsLog = None,
evals_result: TrainingCallback.EvalsLog = None,
verbose_eval: Optional[Union[bool, int]] = True,
xgb_model: Optional[Union[str, os.PathLike, Booster, bytearray]] = None,
callbacks: Optional[Sequence[callback.TrainingCallback]] = None,
callbacks: Optional[Sequence[TrainingCallback]] = None,
custom_metric: Optional[Metric] = None,
) -> Booster:
"""Train a booster with given parameters.
@ -159,12 +163,12 @@ def train(
_assert_new_callback(callbacks)
if verbose_eval:
verbose_eval = 1 if verbose_eval is True else verbose_eval
callbacks.append(callback.EvaluationMonitor(period=verbose_eval))
callbacks.append(EvaluationMonitor(period=verbose_eval))
if early_stopping_rounds:
callbacks.append(
callback.EarlyStopping(rounds=early_stopping_rounds, maximize=maximize)
EarlyStopping(rounds=early_stopping_rounds, maximize=maximize)
)
cb_container = callback.CallbackContainer(
cb_container = CallbackContainer(
callbacks,
metric=metric_fn,
# For old `feval` parameter, the behavior is unchanged. For the new
@ -194,71 +198,73 @@ def train(
class CVPack:
""""Auxiliary datastruct to hold one fold of CV."""
def __init__(self, dtrain, dtest, param):
def __init__(self, dtrain: DMatrix, dtest: DMatrix, param: Optional[Union[Dict, List]]) -> None:
""""Initialize the CVPack"""
self.dtrain = dtrain
self.dtest = dtest
self.watchlist = [(dtrain, 'train'), (dtest, 'test')]
self.bst = Booster(param, [dtrain, dtest])
def __getattr__(self, name):
def _inner(*args, **kwargs):
def __getattr__(self, name: str) -> _F:
def _inner(*args: Any, **kwargs: Any) -> Any:
return getattr(self.bst, name)(*args, **kwargs)
return _inner
return cast(_F, _inner)
def update(self, iteration, fobj):
def update(self, iteration: int, fobj: Optional[Objective]) -> None:
""""Update the boosters for one iteration"""
self.bst.update(self.dtrain, iteration, fobj)
def eval(self, iteration, feval, output_margin):
def eval(self, iteration: int, feval: Optional[Metric], output_margin: bool) -> str:
""""Evaluate the CVPack for one iteration."""
return self.bst.eval_set(self.watchlist, iteration, feval, output_margin)
class _PackedBooster:
def __init__(self, cvfolds) -> None:
def __init__(self, cvfolds: _CVFolds) -> None:
self.cvfolds = cvfolds
def update(self, iteration, obj):
def update(self, iteration: int, obj: Optional[Objective]) -> None:
'''Iterate through folds for update'''
for fold in self.cvfolds:
fold.update(iteration, obj)
def eval(self, iteration, feval, output_margin):
def eval(self, iteration: int, feval: Optional[Metric], output_margin: bool) -> List[str]:
'''Iterate through folds for eval'''
result = [f.eval(iteration, feval, output_margin) for f in self.cvfolds]
return result
def set_attr(self, **kwargs):
def set_attr(self, **kwargs: Optional[str]) -> Any:
'''Iterate through folds for setting attributes'''
for f in self.cvfolds:
f.bst.set_attr(**kwargs)
def attr(self, key):
def attr(self, key: str) -> Optional[str]:
'''Redirect to booster attr.'''
return self.cvfolds[0].bst.attr(key)
def set_param(self, params, value=None):
def set_param(self,
params: Union[Dict, Iterable[Tuple[str, Any]], str],
value: Optional[str] = None) -> None:
"""Iterate through folds for set_param"""
for f in self.cvfolds:
f.bst.set_param(params, value)
def num_boosted_rounds(self):
def num_boosted_rounds(self) -> int:
'''Number of boosted rounds.'''
return self.cvfolds[0].num_boosted_rounds()
@property
def best_iteration(self):
def best_iteration(self) -> int:
'''Get best_iteration'''
return int(self.cvfolds[0].bst.attr("best_iteration"))
return int(cast(int, self.cvfolds[0].bst.attr("best_iteration")))
@property
def best_score(self):
def best_score(self) -> float:
"""Get best_score."""
return float(self.cvfolds[0].bst.attr("best_score"))
return float(cast(float, self.cvfolds[0].bst.attr("best_score")))
def groups_to_rows(groups, boundaries):
def groups_to_rows(groups: List[np.ndarray], boundaries: np.ndarray) -> np.ndarray:
"""
Given group row boundaries, convert group indexes to row indexes
:param groups: list of groups for testing
@ -268,7 +274,9 @@ def groups_to_rows(groups, boundaries):
return np.concatenate([np.arange(boundaries[g], boundaries[g+1]) for g in groups])
def mkgroupfold(dall, nfold, param, evals=(), fpreproc=None, shuffle=True):
def mkgroupfold(dall: DMatrix, nfold: int, param: BoosterParam,
evals: Sequence[str] = (), fpreproc: FPreProcCallable = None,
shuffle: bool = True) -> List[CVPack]:
"""
Make n folds for cross-validation maintaining groups
:return: cross-validation folds
@ -308,8 +316,10 @@ def mkgroupfold(dall, nfold, param, evals=(), fpreproc=None, shuffle=True):
return ret
def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False,
folds=None, shuffle=True):
def mknfold(dall: DMatrix, nfold: int, param: BoosterParam, seed: int,
evals: Sequence[str] = (), fpreproc: FPreProcCallable = None,
stratified: bool = False, folds: XGBStratifiedKFold = None, shuffle: bool = True
) -> List[CVPack]:
"""
Make an n-fold list of CVPack from random indices.
"""
@ -362,11 +372,27 @@ def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False,
return ret
def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None,
metrics=(), obj: Optional[Objective] = None,
feval=None, maximize=None, early_stopping_rounds=None,
fpreproc=None, as_pandas=True, verbose_eval=None, show_stdv=True,
seed=0, callbacks=None, shuffle=True, custom_metric: Optional[Metric] = None):
def cv(
params: BoosterParam,
dtrain: DMatrix,
num_boost_round: int = 10,
nfold: int = 3,
stratified: bool = False,
folds: XGBStratifiedKFold = None,
metrics: Sequence[str] = (),
obj: Optional[Objective] = None,
feval: Optional[Metric] = None,
maximize: bool = None,
early_stopping_rounds: int = None,
fpreproc: FPreProcCallable = None,
as_pandas: bool = True,
verbose_eval: Optional[Union[int, bool]] = None,
show_stdv: bool = True,
seed: int = 0,
callbacks: Sequence[TrainingCallback] = None,
shuffle: bool = True,
custom_metric: Optional[Metric] = None,
) -> Union[Dict[str, float], DataFrame]:
# pylint: disable = invalid-name
"""Cross-validation with given parameters.
@ -477,7 +503,7 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
params.pop("eval_metric", None)
results = {}
results: Dict[str, List[float]] = {}
cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc,
stratified, folds, shuffle)
@ -490,13 +516,13 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
if verbose_eval:
verbose_eval = 1 if verbose_eval is True else verbose_eval
callbacks.append(
callback.EvaluationMonitor(period=verbose_eval, show_stdv=show_stdv)
EvaluationMonitor(period=verbose_eval, show_stdv=show_stdv)
)
if early_stopping_rounds:
callbacks.append(
callback.EarlyStopping(rounds=early_stopping_rounds, maximize=maximize)
EarlyStopping(rounds=early_stopping_rounds, maximize=maximize)
)
callbacks = callback.CallbackContainer(
callbacks_container = CallbackContainer(
callbacks,
metric=metric_fn,
is_cv=True,
@ -504,16 +530,16 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
)
booster = _PackedBooster(cvfolds)
callbacks.before_training(booster)
callbacks_container.before_training(booster)
for i in range(num_boost_round):
if callbacks.before_iteration(booster, i, dtrain, None):
if callbacks_container.before_iteration(booster, i, dtrain, None):
break
booster.update(i, obj)
should_break = callbacks.after_iteration(booster, i, dtrain, None)
res = callbacks.aggregated_cv
for key, mean, std in res:
should_break = callbacks_container.after_iteration(booster, i, dtrain, None)
res = callbacks_container.aggregated_cv
for key, mean, std in cast(List[Tuple[str, float, float]], res):
if key + '-mean' not in results:
results[key + '-mean'] = []
if key + '-std' not in results:
@ -532,6 +558,6 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
except ImportError:
pass
callbacks.after_training(booster)
callbacks_container.after_training(booster)
return results
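
A usage sketch of the annotated cv() return type (illustrative parameters):

import numpy as np
import xgboost as xgb

rng = np.random.RandomState(0)
dtrain = xgb.DMatrix(rng.rand(64, 4), label=rng.randint(2, size=64))

# Returns a pandas DataFrame when pandas is available, otherwise a dict of metric lists.
res = xgb.cv({"objective": "binary:logistic"}, dtrain,
             num_boost_round=5, nfold=3, seed=0)
print(res)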