Add Type Hints for Python Package (#7742)
Co-authored-by: Chengyang Gu <bridgream@gmail.com>
Co-authored-by: Jiamingy <jm.yuan@outlook.com>
This commit is contained in:
parent 71d3b2e036
commit 806c92c80b
@@ -1,21 +1,32 @@
 """Shared typing definition."""
 import ctypes
 import os
-from typing import Optional, Any, TypeVar, Union, Sequence
+from typing import Any, TypeVar, Union, Type, Sequence, Callable, List, Dict

 # os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/dt.Frame/
 # cudf.DataFrame/cupy.array/dlpack
+import numpy as np

 DataType = Any

 # xgboost accepts some other possible types in practice due to historical reason, which is
 # lesser tested.  For now we encourage users to pass a simple list of string.
-FeatureNames = Optional[Sequence[str]]
-FeatureTypes = Optional[Sequence[str]]
+FeatureInfo = Sequence[str]
+FeatureNames = FeatureInfo
+FeatureTypes = FeatureInfo
+BoosterParam = Union[List, Dict]  # better be sequence

 ArrayLike = Any
 PathLike = Union[str, os.PathLike]
 CupyT = ArrayLike  # maybe need a stub for cupy arrays
 NumpyOrCupy = Any
+NumpyDType = Union[str, Type[np.number]]
+PandasDType = Any  # real type is pandas.core.dtypes.base.ExtensionDtype
+
+FloatCompatible = Union[float, np.float32, np.float64]
+
+# callables
+FPreProcCallable = Callable

 # ctypes
 # c_bst_ulong corresponds to bst_ulong defined in xgboost/c_api.h
@@ -59,3 +70,4 @@ CNumericPtr = ctypes.pointer

 # template parameter
 _T = TypeVar("_T")
+_F = TypeVar("_F", bound=Callable[..., Any])
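These aliases are the backbone of the rest of the patch: `FeatureNames` and `FeatureTypes` both collapse to `Sequence[str]`, and optionality is now expressed at each use site as `Optional[FeatureNames]` instead of being baked into the alias itself. A minimal sketch of how downstream code consumes them (the function name here is hypothetical, for illustration only):

    from typing import Optional, Sequence

    FeatureInfo = Sequence[str]
    FeatureNames = FeatureInfo
    FeatureTypes = FeatureInfo


    def describe_features(
        names: Optional[FeatureNames], types: Optional[FeatureTypes]
    ) -> str:
        # Optionality lives at the call site, so mypy forces an explicit
        # None check before the sequences are used.
        if names is None or types is None:
            return "unnamed features"
        return ", ".join(f"{n}: {t}" for n, t in zip(names, types))


    print(describe_features(["age", "income"], ["int", "float"]))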
@@ -10,8 +10,7 @@ from abc import ABC
 import collections
 import os
 import pickle
-from typing import Callable, List, Optional, Union, Dict, Tuple, TypeVar, cast
-from typing import Sequence
+from typing import Callable, List, Optional, Union, Dict, Tuple, TypeVar, cast, Sequence, Any
 import numpy

 from . import rabit
@@ -24,11 +23,14 @@ __all__ = [
     "EarlyStopping",
     "EvaluationMonitor",
     "TrainingCheckPoint",
     "CallbackContainer"
 ]

+_Score = Union[float, Tuple[float, float]]
+_ScoreList = Union[List[float], List[Tuple[float, float]]]
+
+_Model = Any  # real type is Union[Booster, CVPack]; need more work


 # pylint: disable=unused-argument
 class TrainingCallback(ABC):
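`_Score` captures the two shapes a metric entry can take: a plain float for ordinary training, or a 2-tuple as produced by cross validation (value plus spread). The `get_s` helper that appears later in `EarlyStopping._update_rounds` is exactly this normalization; a self-contained sketch of the same idea:

    from typing import List, Tuple, Union

    _Score = Union[float, Tuple[float, float]]
    _ScoreList = Union[List[float], List[Tuple[float, float]]]


    def get_s(x: _Score) -> float:
        """Return the plain score, dropping the second element of a CV entry."""
        return x[0] if isinstance(x, tuple) else x


    assert get_s(0.31) == 0.31
    assert get_s((0.31, 0.02)) == 0.31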
@@ -43,19 +45,19 @@ class TrainingCallback(ABC):
     def __init__(self) -> None:
         pass

-    def before_training(self, model):
+    def before_training(self, model: _Model) -> _Model:
         '''Run before training starts.'''
         return model

-    def after_training(self, model):
+    def after_training(self, model: _Model) -> _Model:
         '''Run after training is finished.'''
         return model

-    def before_iteration(self, model, epoch: int, evals_log: EvalsLog) -> bool:
+    def before_iteration(self, model: _Model, epoch: int, evals_log: EvalsLog) -> bool:
         '''Run before each iteration.  Return True when training should stop.'''
         return False

-    def after_iteration(self, model, epoch: int, evals_log: EvalsLog) -> bool:
+    def after_iteration(self, model: _Model, epoch: int, evals_log: EvalsLog) -> bool:
         '''Run after each iteration.  Return True when training should stop.'''
         return False

@@ -140,7 +142,7 @@ class CallbackContainer:
         if self.is_cv:
             self.aggregated_cv = None

-    def before_training(self, model):
+    def before_training(self, model: _Model) -> _Model:
         '''Function called before training.'''
         for c in self.callbacks:
             model = c.before_training(model=model)
@@ -151,7 +153,7 @@ class CallbackContainer:
         assert isinstance(model, Booster), msg
         return model

-    def after_training(self, model):
+    def after_training(self, model: _Model) -> _Model:
         '''Function called after training.'''
         for c in self.callbacks:
             model = c.after_training(model=model)
@@ -182,7 +184,7 @@ class CallbackContainer:
         return model

     def before_iteration(
-        self, model, epoch: int, dtrain: DMatrix, evals: List[Tuple[DMatrix, str]]
+        self, model: _Model, epoch: int, dtrain: DMatrix, evals: Optional[List[Tuple[DMatrix, str]]]
     ) -> bool:
         '''Function called before training iteration.'''
         return any(c.before_iteration(model, epoch, self.history)
@@ -220,7 +222,7 @@ class CallbackContainer:

     def after_iteration(
         self,
-        model,
+        model: _Model,
         epoch: int,
         dtrain: DMatrix,
         evals: Optional[List[Tuple[DMatrix, str]]],
@@ -276,7 +278,7 @@ class LearningRateScheduler(TrainingCallback):
         super().__init__()

     def after_iteration(
-        self, model, epoch: int, evals_log: TrainingCallback.EvalsLog
+        self, model: _Model, epoch: int, evals_log: TrainingCallback.EvalsLog
     ) -> bool:
         model.set_param("learning_rate", self.learning_rates(epoch))
         return False
@@ -344,12 +346,12 @@ class EarlyStopping(TrainingCallback):
         self.starting_round: int = 0
         super().__init__()

-    def before_training(self, model):
+    def before_training(self, model: _Model) -> _Model:
         self.starting_round = model.num_boosted_rounds()
         return model

     def _update_rounds(
-        self, score: _Score, name: str, metric: str, model, epoch: int
+        self, score: _Score, name: str, metric: str, model: _Model, epoch: int
     ) -> bool:
         def get_s(x: _Score) -> float:
             """get score if it's cross validation history."""
@@ -403,7 +405,7 @@ class EarlyStopping(TrainingCallback):
                 return True
             return False

-    def after_iteration(self, model, epoch: int,
+    def after_iteration(self, model: _Model, epoch: int,
                         evals_log: TrainingCallback.EvalsLog) -> bool:
         epoch += self.starting_round  # training continuation
         msg = 'Must have at least 1 validation dataset for early stopping.'
@@ -431,7 +433,7 @@ class EarlyStopping(TrainingCallback):
         score = data_log[metric_name][-1]
         return self._update_rounds(score, data_name, metric_name, model, epoch)

-    def after_training(self, model):
+    def after_training(self, model: _Model) -> _Model:
         try:
             if self.save_best:
                 model = model[: int(model.attr("best_iteration")) + 1]
@@ -477,7 +479,7 @@ class EvaluationMonitor(TrainingCallback):
             msg = f"\t{data + '-' + metric}:{score:.5f}"
         return msg

-    def after_iteration(self, model, epoch: int,
+    def after_iteration(self, model: _Model, epoch: int,
                         evals_log: TrainingCallback.EvalsLog) -> bool:
         if not evals_log:
             return False
@@ -503,7 +505,7 @@ class EvaluationMonitor(TrainingCallback):
             self._latest = msg
         return False

-    def after_training(self, model):
+    def after_training(self, model: _Model) -> _Model:
         if rabit.get_rank() == self.printer_rank and self._latest is not None:
             rabit.tracker_print(self._latest)
         return model
@@ -544,7 +546,7 @@ class TrainingCheckPoint(TrainingCallback):
         self._epoch = 0
         super().__init__()

-    def after_iteration(self, model, epoch: int,
+    def after_iteration(self, model: _Model, epoch: int,
                         evals_log: TrainingCallback.EvalsLog) -> bool:
         if self._epoch == self._iterations:
             path = os.path.join(self._path, self._name + '_' + str(epoch) +
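With `TrainingCallback` fully annotated, user-defined callbacks can be type-checked as well. A minimal sketch of a typed custom callback (the class name and print format are illustrative, not part of the commit):

    from xgboost.callback import TrainingCallback


    class IterationLogger(TrainingCallback):
        """Log the latest value of every tracked metric every few rounds."""

        def __init__(self, period: int = 10) -> None:
            self.period = period
            super().__init__()

        def after_iteration(
            self, model, epoch: int, evals_log: TrainingCallback.EvalsLog
        ) -> bool:
            if epoch % self.period == 0:
                for data, metrics in evals_log.items():
                    for name, history in metrics.items():
                        print(f"[{epoch}] {data}-{name}: {history[-1]}")
            return False  # returning True would stop training

Such a callback is passed as usual, e.g. `xgboost.train(params, dtrain, evals=[(dvalid, "valid")], callbacks=[IterationLogger(5)])`.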
@@ -1,30 +1,32 @@
 # coding: utf-8
 # pylint: disable= invalid-name,  unused-import
 """For compatibility and optional dependencies."""
-from typing import Any
+from typing import Any, Type, Dict, Optional, List
 import sys
 import types
 import importlib.util
 import logging
 import numpy as np

+from xgboost._typing import CStrPtr
+
 assert (sys.version_info[0] == 3), 'Python 2 is no longer supported.'


-def py_str(x):
+def py_str(x: CStrPtr) -> str:
     """convert c string back to python string"""
-    return x.decode('utf-8')
+    return x.decode('utf-8')  # type: ignore


-def lazy_isinstance(instance, module, name):
+def lazy_isinstance(instance: Type[object], module: str, name: str) -> bool:
     """Use string representation to identify a type."""

     # Notice, we use .__class__ as opposed to type() in order
     # to support object proxies such as weakref.proxy
     cls = instance.__class__
-    module = cls.__module__ == module
-    name = cls.__name__ == name
-    return module and name
+    is_same_module = cls.__module__ == module
+    has_same_name = cls.__name__ == name
+    return is_same_module and has_same_name


 # pandas
@@ -37,53 +39,33 @@ try:
 except ImportError:
     MultiIndex = object
-    DataFrame: Any = object
+    DataFrame = object
     Series = object
     pandas_concat = None
     PANDAS_INSTALLED = False

 # sklearn
 try:
-    from sklearn.base import BaseEstimator
-    from sklearn.base import RegressorMixin, ClassifierMixin
+    from sklearn.base import (
+        BaseEstimator as XGBModelBase,
+        RegressorMixin as XGBRegressorBase,
+        ClassifierMixin as XGBClassifierBase
+    )
     from sklearn.preprocessing import LabelEncoder

     try:
-        from sklearn.model_selection import KFold, StratifiedKFold
+        from sklearn.model_selection import (
+            KFold as XGBKFold,
+            StratifiedKFold as XGBStratifiedKFold
+        )
     except ImportError:
-        from sklearn.cross_validation import KFold, StratifiedKFold
+        from sklearn.cross_validation import (
+            KFold as XGBKFold,
+            StratifiedKFold as XGBStratifiedKFold
+        )

     SKLEARN_INSTALLED = True

-    XGBModelBase = BaseEstimator
-    XGBRegressorBase = RegressorMixin
-    XGBClassifierBase = ClassifierMixin
-
-    XGBKFold = KFold
-    XGBStratifiedKFold = StratifiedKFold
-
-    class XGBoostLabelEncoder(LabelEncoder):
-        '''Label encoder with JSON serialization methods.'''
-        def to_json(self):
-            '''Returns a JSON compatible dictionary'''
-            meta = {}
-            for k, v in self.__dict__.items():
-                if isinstance(v, np.ndarray):
-                    meta[k] = v.tolist()
-                else:
-                    meta[k] = v
-            return meta
-
-        def from_json(self, doc):
-            # pylint: disable=attribute-defined-outside-init
-            '''Load the encoder back from a JSON compatible dict.'''
-            meta = {}
-            for k, v in doc.items():
-                if k == 'classes_':
-                    self.classes_ = np.array(v)
-                    continue
-                meta[k] = v
-            self.__dict__.update(meta)
 except ImportError:
     SKLEARN_INSTALLED = False

@@ -91,10 +73,34 @@ except ImportError:
     XGBModelBase = object
     XGBClassifierBase = object
     XGBRegressorBase = object
     LabelEncoder = object

     XGBKFold = None
     XGBStratifiedKFold = None
-    XGBoostLabelEncoder = None
+
+
+class XGBoostLabelEncoder(LabelEncoder):
+    '''Label encoder with JSON serialization methods.'''
+    def to_json(self) -> Dict:
+        '''Returns a JSON compatible dictionary'''
+        meta = {}
+        for k, v in self.__dict__.items():
+            if isinstance(v, np.ndarray):
+                meta[k] = v.tolist()
+            else:
+                meta[k] = v
+        return meta
+
+    def from_json(self, doc: Dict) -> None:
+        # pylint: disable=attribute-defined-outside-init
+        '''Load the encoder back from a JSON compatible dict.'''
+        meta = {}
+        for k, v in doc.items():
+            if k == 'classes_':
+                self.classes_ = np.array(v)
+                continue
+            meta[k] = v
+        self.__dict__.update(meta)


 # dask
@@ -113,7 +119,7 @@ try:
     SCIPY_INSTALLED = True
 except ImportError:
     scipy_sparse = False
-    scipy_csr: Any = object
+    scipy_csr = object
     SCIPY_INSTALLED = False


@@ -136,15 +142,21 @@ class LazyLoader(types.ModuleType):
     """Lazily import a module, mainly to avoid pulling in large dependencies.
     """

-    def __init__(self, local_name, parent_module_globals, name, warning=None):
+    def __init__(
+        self,
+        local_name: str,
+        parent_module_globals: Dict,
+        name: str,
+        warning: Optional[str] = None
+    ) -> None:
         self._local_name = local_name
         self._parent_module_globals = parent_module_globals
         self._warning = warning
-        self.module = None
+        self.module: Optional[types.ModuleType] = None

         super().__init__(name)

-    def _load(self):
+    def _load(self) -> types.ModuleType:
         """Load the module and insert it into the parent's globals."""
         # Import the target module and insert it into the parent's namespace
         module = importlib.import_module(self.__name__)
@@ -163,12 +175,12 @@ class LazyLoader(types.ModuleType):

         return module

-    def __getattr__(self, item):
+    def __getattr__(self, item: str) -> Any:
         if not self.module:
             self.module = self._load()
         return getattr(self.module, item)

-    def __dir__(self):
+    def __dir__(self) -> List[str]:
         if not self.module:
             self.module = self._load()
         return dir(self.module)
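`LazyLoader` defers the real import until the first attribute access, which is exactly why `self.module` starts out as `None` and is now annotated `Optional[types.ModuleType]`. A small usage sketch (standalone; `dask.dataframe` is just one example of a heavy optional dependency, and this mirrors how xgboost's dask module uses the class):

    from xgboost.compat import LazyLoader

    # Nothing is imported yet; the loader only records the names.
    dd = LazyLoader("dd", globals(), "dask.dataframe")


    def is_dask_frame(data: object) -> bool:
        # First attribute access triggers the actual import of dask.dataframe.
        return isinstance(data, dd.DataFrame)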
@@ -4,12 +4,20 @@ import ctypes
 import json
 from contextlib import contextmanager
 from functools import wraps
+from typing import Optional, Callable, Any, Dict, cast, Iterator

 from .core import _LIB, _check_call, c_str, py_str
+from ._typing import _F


-def config_doc(*, header=None, extra_note=None, parameters=None, returns=None,
-               see_also=None):
+def config_doc(
+    *,
+    header: Optional[str] = None,
+    extra_note: Optional[str] = None,
+    parameters: Optional[str] = None,
+    returns: Optional[str] = None,
+    see_also: Optional[str] = None
+) -> Callable[[_F], _F]:
     """Decorator to format docstring for config functions.

     Parameters
@@ -64,19 +72,19 @@ def config_doc(*, header=None, extra_note=None, parameters=None, returns=None,
     assert xgb.get_config()['verbosity'] == 2  # old value restored
     """

-    def none_to_str(value):
+    def none_to_str(value: Optional[str]) -> str:
         return '' if value is None else value

-    def config_doc_decorator(func):
+    def config_doc_decorator(func: _F) -> _F:
         func.__doc__ = (doc_template.format(header=none_to_str(header),
                                             extra_note=none_to_str(extra_note))
                         + none_to_str(parameters) + none_to_str(returns)
                         + none_to_str(common_example) + none_to_str(see_also))

         @wraps(func)
-        def wrap(*args, **kwargs):
+        def wrap(*args: Any, **kwargs: Any) -> Any:
             return func(*args, **kwargs)
-        return wrap
+        return cast(_F, wrap)
     return config_doc_decorator


@@ -89,7 +97,7 @@ def config_doc(*, header=None, extra_note=None, parameters=None, returns=None,
     new_config: Dict[str, Any]
         Keyword arguments representing the parameters and their values
     """)
-def set_config(**new_config):
+def set_config(**new_config: Any) -> None:
     config = json.dumps(new_config)
     _check_call(_LIB.XGBSetGlobalConfig(c_str(config)))

@@ -103,7 +111,7 @@ def set_config(**new_config):
     args: Dict[str, Any]
         The list of global parameters and their values
     """)
-def get_config():
+def get_config() -> Dict[str, Any]:
     config_str = ctypes.c_char_p()
     _check_call(_LIB.XGBGetGlobalConfig(ctypes.byref(config_str)))
     config = json.loads(py_str(config_str.value))
@@ -132,7 +140,7 @@ def get_config():
     set_config: Set global XGBoost configuration
     get_config: Get current values of the global configuration
     """)
-def config_context(**new_config):
+def config_context(**new_config: Any) -> Iterator[None]:
     old_config = get_config().copy()
     set_config(**new_config)
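The `_F = TypeVar("_F", bound=Callable[..., Any])` alias from the typing module is what lets `config_doc` return a decorator that preserves the decorated function's exact signature: the wrapper produced by `wraps` is `cast` back to `_F`, so callers of `set_config` and friends keep full type information. A reduced sketch of the same pattern (generic names, not from the commit):

    from functools import wraps
    from typing import Any, Callable, TypeVar, cast

    _F = TypeVar("_F", bound=Callable[..., Any])


    def attach_doc(doc: str) -> Callable[[_F], _F]:
        """Decorator factory that rewrites __doc__ without changing the signature."""
        def decorator(func: _F) -> _F:
            func.__doc__ = doc

            @wraps(func)
            def wrap(*args: Any, **kwargs: Any) -> Any:
                return func(*args, **kwargs)
            # wraps() yields a plain Callable; the cast restores the precise type.
            return cast(_F, wrap)
        return decorator


    @attach_doc("Configure the thing.")
    def set_thing(value: int) -> None:
        ...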
@@ -30,10 +30,12 @@ from ._typing import (
     ArrayLike,
     CFloatPtr,
     NumpyOrCupy,
-    FeatureNames,
+    FeatureInfo,
     FeatureTypes,
+    FeatureNames,
     _T,
     CupyT,
+    BoosterParam
 )


@@ -273,7 +275,7 @@ def ctypes2numpy(cptr: CNumericPtr, length: int, dtype: Type[np.number]) -> np.n
     if not isinstance(cptr, ctypes.POINTER(ctype)):
         raise RuntimeError(f"expected {ctype} pointer")
     res = np.zeros(length, dtype=dtype)
-    if not ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]):
+    if not ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]):  # type: ignore
         raise RuntimeError("memmove failed")
     return res
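The only change to `ctypes2numpy` (and to `ctypes2buffer` in the next hunk) is a `# type: ignore` on the `ctypes.memmove` call. This appears to be a stub limitation rather than a real bug: the call mixes a raw integer address (`res.ctypes.data`) with a `POINTER` object, which the ctypes type stubs cannot fully express, so the commit silences the line instead of restructuring working code. Roughly the situation being annotated, as a standalone sketch:

    import ctypes
    import numpy as np


    def copy_from_ptr(cptr: "ctypes._Pointer", length: int) -> np.ndarray:
        """Copy `length` float32 values from a C pointer into a fresh array.

        Sketch of the ctypes2numpy pattern; the ignore comment acknowledges
        that the ctypes stubs cannot fully type this call.
        """
        res = np.zeros(length, dtype=np.float32)
        if not ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]):  # type: ignore
            raise RuntimeError("memmove failed")
        return res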
@@ -310,7 +312,7 @@ def ctypes2buffer(cptr: CStrPtr, length: int) -> bytearray:
         raise RuntimeError('expected char pointer')
     res = bytearray(length)
     rptr = (ctypes.c_char * length).from_buffer(res)
-    if not ctypes.memmove(rptr, cptr, length):
+    if not ctypes.memmove(rptr, cptr, length):  # type: ignore
         raise RuntimeError('memmove failed')
     return res

@@ -434,8 +436,8 @@ class DataIter(ABC):  # pylint: disable=too-many-instance-attributes
         def data_handle(
             data: Any,
             *,
-            feature_names: FeatureNames = None,
-            feature_types: Optional[List[str]] = None,
+            feature_names: Optional[FeatureNames] = None,
+            feature_types: Optional[FeatureTypes] = None,
             **kwargs: Any,
         ) -> None:
             from .data import dispatch_proxy_set_data
@@ -555,8 +557,8 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
         base_margin: Optional[ArrayLike] = None,
         missing: Optional[float] = None,
         silent: bool = False,
-        feature_names: FeatureNames = None,
-        feature_types: FeatureTypes = None,
+        feature_names: Optional[FeatureNames] = None,
+        feature_types: Optional[FeatureTypes] = None,
         nthread: Optional[int] = None,
         group: Optional[ArrayLike] = None,
         qid: Optional[ArrayLike] = None,
@@ -718,8 +720,8 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
         qid: Optional[ArrayLike] = None,
         label_lower_bound: Optional[ArrayLike] = None,
         label_upper_bound: Optional[ArrayLike] = None,
-        feature_names: FeatureNames = None,
-        feature_types: Optional[List[str]] = None,
+        feature_names: Optional[FeatureNames] = None,
+        feature_types: Optional[FeatureTypes] = None,
         feature_weights: Optional[ArrayLike] = None
     ) -> None:
         """Set meta info for DMatrix.  See doc string for :py:obj:`xgboost.DMatrix`."""
@@ -1000,7 +1002,7 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
         return res

     @property
-    def feature_names(self) -> Optional[List[str]]:
+    def feature_names(self) -> Optional[FeatureNames]:
         """Get feature names (column labels).

         Returns
@@ -1023,7 +1025,7 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
         return feature_names

     @feature_names.setter
-    def feature_names(self, feature_names: FeatureNames) -> None:
+    def feature_names(self, feature_names: Optional[FeatureNames]) -> None:
         """Set feature names (column labels).

         Parameters
@@ -1039,7 +1041,7 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
             else:
                 feature_names = [feature_names]
         except TypeError:
-            feature_names = [feature_names]
+            feature_names = [cast(str, feature_names)]

         if len(feature_names) != len(set(feature_names)):
             raise ValueError('feature_names must be unique')
@@ -1069,8 +1071,13 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
             self.feature_types = None

     @property
-    def feature_types(self) -> Optional[List[str]]:
-        """Get feature types.  See :py:class:`DMatrix` for details."""
+    def feature_types(self) -> Optional[FeatureTypes]:
+        """Get feature types (column types).
+
+        Returns
+        -------
+        feature_types : list or None
+        """
         length = c_bst_ulong()
         sarr = ctypes.POINTER(ctypes.c_char_p)()
         _check_call(_LIB.XGDMatrixGetStrFeatureInfo(self.handle,
@@ -1111,7 +1118,7 @@ class DMatrix:  # pylint: disable=too-many-instance-attributes
             else:
                 feature_types = [feature_types]
         except TypeError:
-            feature_types = [feature_types]
+            feature_types = [cast(str, feature_types)]
         feature_types_bytes = [bytes(f, encoding='utf-8')
                                for f in feature_types]
         c_feature_types = (ctypes.c_char_p *
@@ -1203,8 +1210,8 @@ class DeviceQuantileDMatrix(DMatrix):
         base_margin: Optional[ArrayLike] = None,
         missing: Optional[float] = None,
         silent: bool = False,
-        feature_names: FeatureNames = None,
-        feature_types: Optional[List[str]] = None,
+        feature_names: Optional[FeatureNames] = None,
+        feature_types: Optional[FeatureTypes] = None,
         nthread: Optional[int] = None,
         max_bin: int = 256,
         group: Optional[ArrayLike] = None,
@@ -1323,7 +1330,7 @@ def _get_booster_layer_trees(model: "Booster") -> Tuple[int, int]:
     return num_parallel_tree, num_groups


-def _configure_metrics(params: Union[Dict, List]) -> Union[Dict, List]:
+def _configure_metrics(params: BoosterParam) -> BoosterParam:
     if (
         isinstance(params, dict)
         and "eval_metric" in params
@@ -1349,7 +1356,7 @@ class Booster:

     def __init__(
         self,
-        params: Optional[Dict] = None,
+        params: Optional[BoosterParam] = None,
         cache: Optional[Sequence[DMatrix]] = None,
         model_file: Optional[Union["Booster", bytearray, os.PathLike, str]] = None
     ) -> None:
@@ -1444,7 +1451,7 @@ class Booster:
                 "Constrained features are not a subset of training data feature names"
             ) from e

-    def _configure_constraints(self, params: Union[List, Dict]) -> Union[List, Dict]:
+    def _configure_constraints(self, params: BoosterParam) -> BoosterParam:
         if isinstance(params, dict):
             value = params.get("monotone_constraints")
             if value is not None:
@@ -1607,7 +1614,7 @@ class Booster:
             return py_str(ret.value)
         return None

-    def attributes(self) -> Dict[str, str]:
+    def attributes(self) -> Dict[str, Optional[str]]:
         """Get attributes stored in the Booster as a dictionary.

         Returns
@@ -1639,7 +1646,7 @@ class Booster:
         _check_call(_LIB.XGBoosterSetAttr(
             self.handle, c_str(key), value))

-    def _get_feature_info(self, field: str) -> Optional[List[str]]:
+    def _get_feature_info(self, field: str) -> Optional[FeatureInfo]:
         length = c_bst_ulong()
         sarr = ctypes.POINTER(ctypes.c_char_p)()
         if not hasattr(self, "handle") or self.handle is None:
@@ -1652,7 +1659,7 @@ class Booster:
         feature_info = from_cstr_to_pystr(sarr, length)
         return feature_info if feature_info else None

-    def _set_feature_info(self, features: Optional[Sequence[str]], field: str) -> None:
+    def _set_feature_info(self, features: Optional[FeatureInfo], field: str) -> None:
         if features is not None:
             assert isinstance(features, list)
             feature_info_bytes = [bytes(f, encoding="utf-8") for f in features]
@@ -1670,7 +1677,7 @@ class Booster:
         )

     @property
-    def feature_types(self) -> Optional[List[str]]:
+    def feature_types(self) -> Optional[FeatureTypes]:
         """Feature types for this booster.  Can be directly set by input data or by
         assignment.  See :py:class:`DMatrix` for details.

@@ -1678,11 +1685,11 @@ class Booster:
         return self._get_feature_info("feature_type")

     @feature_types.setter
-    def feature_types(self, features: Optional[List[str]]) -> None:
+    def feature_types(self, features: Optional[FeatureTypes]) -> None:
         self._set_feature_info(features, "feature_type")

     @property
-    def feature_names(self) -> Optional[List[str]]:
+    def feature_names(self) -> Optional[FeatureNames]:
         """Feature names for this booster.  Can be directly set by input data or by
         assignment.

@@ -1690,7 +1697,7 @@ class Booster:
         return self._get_feature_info("feature_name")

     @feature_names.setter
-    def feature_names(self, features: FeatureNames) -> None:
+    def feature_names(self, features: Optional[FeatureNames]) -> None:
         self._set_feature_info(features, "feature_name")

     def set_param(
@@ -1711,7 +1718,7 @@ class Booster:
             params = params.items()
         elif isinstance(params, str) and value is not None:
             params = [(params, value)]
-        for key, val in params:
+        for key, val in cast(Iterable[Tuple[str, str]], params):
             if val is not None:
                 _check_call(_LIB.XGBoosterSetParam(self.handle, c_str(key),
                                                    c_str(str(val))))
@@ -2564,8 +2571,10 @@ class Booster:
         )
         # Booster can't accept data with different feature names
         if self.feature_names != data.feature_names:
-            dat_missing = set(self.feature_names) - set(data.feature_names)
-            my_missing = set(data.feature_names) - set(self.feature_names)
+            dat_missing = set(cast(FeatureNames, self.feature_names)) - \
+                set(cast(FeatureNames, data.feature_names))
+            my_missing = set(cast(FeatureNames, data.feature_names)) - \
+                set(cast(FeatureNames, self.feature_names))

             msg = 'feature_names mismatch: {0} {1}'
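The `feature_names` mismatch hunk shows the cost of the property now returning `Optional[FeatureNames]`: inside the `if self.feature_names != data.feature_names` branch mypy still considers either side possibly `None`, so the commit uses `cast` to assert non-None before building the sets. `cast` is purely a static assertion with no runtime effect; an equivalent alternative is binding the values to locals and narrowing with an explicit check, as in this sketch:

    from typing import Optional, Sequence, Set

    FeatureNames = Sequence[str]


    def missing_features(
        mine: Optional[FeatureNames], theirs: Optional[FeatureNames]
    ) -> Set[str]:
        # Testing the locals lets mypy narrow Optional without cast();
        # this is the same non-None fact the cast() in the hunk asserts.
        if mine is None or theirs is None:
            return set()
        return set(mine) - set(theirs)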
@@ -318,7 +318,7 @@ class DaskDMatrix:
         base_margin: Optional[_DaskCollection] = None,
         missing: float = None,
         silent: bool = False,  # pylint: disable=unused-argument
-        feature_names: FeatureNames = None,
+        feature_names: Optional[FeatureNames] = None,
         feature_types: FeatureTypes = None,
         group: Optional[_DaskCollection] = None,
         qid: Optional[_DaskCollection] = None,
@@ -594,7 +594,7 @@ class DaskPartitionIter(DataIter):  # pylint: disable=R0902
         qid: Optional[List[Any]] = None,
         label_lower_bound: Optional[List[Any]] = None,
         label_upper_bound: Optional[List[Any]] = None,
-        feature_names: FeatureNames = None,
+        feature_names: Optional[FeatureNames] = None,
         feature_types: Optional[Union[Any, List[Any]]] = None,
     ) -> None:
         self._data = data
@@ -637,7 +637,7 @@ class DaskPartitionIter(DataIter):  # pylint: disable=R0902
         if self._iter == len(self._data):
             # Return 0 when there's no more batch.
             return 0
-        feature_names: FeatureNames = None
+        feature_names: Optional[FeatureNames] = None
        if self._feature_names:
             feature_names = self._feature_names
         else:
@@ -688,7 +688,7 @@ class DaskDeviceQuantileDMatrix(DaskDMatrix):
         base_margin: Optional[_DaskCollection] = None,
         missing: float = None,
         silent: bool = False,  # disable=unused-argument
-        feature_names: FeatureNames = None,
+        feature_names: Optional[FeatureNames] = None,
         feature_types: Optional[Union[Any, List[Any]]] = None,
         max_bin: int = 256,
         group: Optional[_DaskCollection] = None,
@@ -725,7 +725,7 @@ class DaskDeviceQuantileDMatrix(DaskDMatrix):


 def _create_device_quantile_dmatrix(
-    feature_names: FeatureNames,
+    feature_names: Optional[FeatureNames],
     feature_types: Optional[Union[Any, List[Any]]],
     feature_weights: Optional[Any],
     missing: float,
@@ -766,7 +766,7 @@ def _create_device_quantile_dmatrix(


 def _create_dmatrix(
-    feature_names: FeatureNames,
+    feature_names: Optional[FeatureNames],
     feature_types: Optional[Union[Any, List[Any]]],
     feature_weights: Optional[Any],
     missing: float,
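One annotation in these dask hunks is left untouched and is worth flagging: `missing: float = None` is not valid under PEP 484, since `None` is not a `float`; under mypy's no-implicit-optional behavior (the default in newer releases) it would have to be spelled `Optional[float]`. The commit only wraps `feature_names` here. The general rule, as a sketch:

    from typing import Optional

    # Rejected by strict mypy: default None does not match the declared type.
    # def make_matrix(missing: float = None) -> None: ...

    # What strict typing requires instead:
    def make_matrix(missing: Optional[float] = None) -> None:
        # Treat None as "use the library default".
        ...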
@@ -5,17 +5,26 @@ import ctypes
 import json
 import warnings
 import os
-from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Type
+from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Sequence, cast

 import numpy as np

 from .core import c_array, _LIB, _check_call, c_str
 from .core import _cuda_array_interface
-from .core import DataIter, _ProxyDMatrix, DMatrix, FeatureNames
-from ._typing import FeatureTypes
+from .core import DataIter, _ProxyDMatrix, DMatrix
 from .compat import lazy_isinstance, DataFrame
+from ._typing import (
+    c_bst_ulong,
+    DataType,
+    FeatureTypes,
+    FeatureNames,
+    NumpyDType,
+    CupyT,
+    FloatCompatible, PandasDType
+)

-c_bst_ulong = ctypes.c_uint64  # pylint: disable=invalid-name
+DispatchedDataBackendReturnType = Tuple[
+    ctypes.c_void_p, Optional[FeatureNames], Optional[FeatureTypes]]

 CAT_T = "c"
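The new `DispatchedDataBackendReturnType` alias pins down the contract shared by every `_from_*` constructor in this module: each returns the raw ctypes handle of the native DMatrix plus the (possibly `None`) feature names and types it inferred from the input. A toy backend showing the shape of that contract (the function body is illustrative only):

    import ctypes
    from typing import Optional, Sequence, Tuple

    FeatureNames = Sequence[str]
    FeatureTypes = Sequence[str]
    DispatchedDataBackendReturnType = Tuple[
        ctypes.c_void_p, Optional[FeatureNames], Optional[FeatureTypes]]


    def _from_toy_backend(
        feature_names: Optional[FeatureNames],
        feature_types: Optional[FeatureTypes],
    ) -> DispatchedDataBackendReturnType:
        handle = ctypes.c_void_p()  # would be filled in by a C API call
        return handle, feature_names, feature_types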
@@ -23,14 +32,14 @@ CAT_T = "c"
 _matrix_meta = {"base_margin", "label"}


-def _warn_unused_missing(data, missing):
+def _warn_unused_missing(data: DataType, missing: Optional[FloatCompatible]) -> None:
     if (missing is not None) and (not np.isnan(missing)):
         warnings.warn(
             '`missing` is not used for current input data type:' +
             str(type(data)), UserWarning)


-def _check_complex(data):
+def _check_complex(data: DataType) -> None:
     '''Test whether data is complex using `dtype` attribute.'''
     complex_dtypes = (np.complex128, np.complex64,
                       np.cfloat, np.cdouble, np.clongdouble)
@@ -38,16 +47,15 @@ def _check_complex(data):
         raise ValueError('Complex data not supported')


-def _check_data_shape(data: Any) -> None:
+def _check_data_shape(data: DataType) -> None:
     if hasattr(data, "shape") and len(data.shape) != 2:
         raise ValueError("Please reshape the input data into 2-dimensional matrix.")


-def _is_scipy_csr(data):
+def _is_scipy_csr(data: DataType) -> bool:
     try:
-        import scipy
+        import scipy.sparse
     except ImportError:
-        scipy = None
         return False
     return isinstance(data, scipy.sparse.csr_matrix)
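The `_is_scipy_*` predicates also change shape slightly: `import scipy` plus a dummy `scipy = None` fallback becomes a direct `import scipy.sparse` with an early `return False`. That satisfies the type checker (a module name is never rebound to `None`) and drops the unused binding. The pattern in isolation:

    def _is_scipy_csr_sketch(data: object) -> bool:
        # Import inside the function so scipy stays an optional dependency.
        try:
            import scipy.sparse
        except ImportError:
            return False  # scipy absent: data cannot be a scipy matrix
        return isinstance(data, scipy.sparse.csr_matrix)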
@@ -64,12 +72,12 @@ def _array_interface(data: np.ndarray) -> bytes:


 def _from_scipy_csr(
-    data,
-    missing,
-    nthread,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    data: DataType,
+    missing: FloatCompatible,
+    nthread: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     """Initialize data from a CSR matrix."""
     if len(data.indices) != len(data.data):
         raise ValueError(
@@ -94,21 +102,20 @@ def _from_scipy_csr(
     return handle, feature_names, feature_types


-def _is_scipy_csc(data):
+def _is_scipy_csc(data: DataType) -> bool:
     try:
-        import scipy
+        import scipy.sparse
     except ImportError:
-        scipy = None
         return False
     return isinstance(data, scipy.sparse.csc_matrix)


 def _from_scipy_csc(
-    data,
-    missing,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    data: DataType,
+    missing: Optional[FloatCompatible],
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     if len(data.indices) != len(data.data):
         raise ValueError(f"length mismatch: {len(data.indices)} vs {len(data.data)}")
     _warn_unused_missing(data, missing)
@@ -124,27 +131,29 @@ def _from_scipy_csc(
     return handle, feature_names, feature_types


-def _is_scipy_coo(data):
+def _is_scipy_coo(data: DataType) -> bool:
     try:
-        import scipy
+        import scipy.sparse
     except ImportError:
-        scipy = None
         return False
     return isinstance(data, scipy.sparse.coo_matrix)


-def _is_numpy_array(data):
+def _is_numpy_array(data: DataType) -> bool:
     return isinstance(data, (np.ndarray, np.matrix))


-def _ensure_np_dtype(data, dtype) -> Tuple[np.ndarray, np.dtype]:
+def _ensure_np_dtype(
+    data: DataType,
+    dtype: Optional[NumpyDType]
+) -> Tuple[np.ndarray, Optional[NumpyDType]]:
     if data.dtype.hasobject or data.dtype in [np.float16, np.bool_]:
         data = data.astype(np.float32, copy=False)
         dtype = np.float32
     return data, dtype


-def _maybe_np_slice(data: np.ndarray, dtype) -> np.ndarray:
+def _maybe_np_slice(data: DataType, dtype: Optional[NumpyDType]) -> np.ndarray:
     '''Handle numpy slice.  This can be removed if we use __array_interface__.
     '''
     try:
@@ -159,12 +168,12 @@ def _maybe_np_slice(data: np.ndarray, dtype) -> np.ndarray:


 def _from_numpy_array(
-    data,
-    missing,
-    nthread,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    data: DataType,
+    missing: FloatCompatible,
+    nthread: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     """Initialize data from a 2-D numpy matrix.

     """
@@ -189,7 +198,7 @@ def _from_numpy_array(
     return handle, feature_names, feature_types


-def _is_pandas_df(data):
+def _is_pandas_df(data: DataType) -> bool:
     try:
         import pandas as pd
     except ImportError:
@@ -197,7 +206,7 @@ def _is_pandas_df(data):
     return isinstance(data, pd.DataFrame)


-def _is_modin_df(data):
+def _is_modin_df(data: DataType) -> bool:
     try:
         import modin.pandas as pd
     except ImportError:
@@ -232,7 +241,7 @@ _ENABLE_CAT_ERR = (
 )


-def _invalid_dataframe_dtype(data: Any) -> None:
+def _invalid_dataframe_dtype(data: DataType) -> None:
     # pandas series has `dtypes` but it's just a single object
     # cudf series doesn't have `dtypes`.
     if hasattr(data, "dtypes") and hasattr(data.dtypes, "__iter__"):
@@ -253,10 +262,10 @@ def _invalid_dataframe_dtype(data: Any) -> None:
 def _pandas_feature_info(
     data: DataFrame,
     meta: Optional[str],
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
     enable_categorical: bool,
-) -> Tuple[FeatureNames, FeatureTypes]:
+) -> Tuple[Optional[FeatureNames], Optional[FeatureTypes]]:
     import pandas as pd
     from pandas.api.types import (
         is_sparse,
@@ -285,13 +294,13 @@ def _pandas_feature_info(
     return feature_names, feature_types


-def is_nullable_dtype(dtype: Any) -> bool:
+def is_nullable_dtype(dtype: PandasDType) -> bool:
     """Wether dtype is a pandas nullable type."""
     from pandas.api.types import is_integer_dtype, is_bool_dtype
     # dtype: pd.core.arrays.numeric.NumericDtype
     nullable_alias = {"Int16", "Int32", "Int64"}
     is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias
-    # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`.
+    # np.bool has alias `bool`, while pd.BooleanDtype has `bzoolean`.
     is_bool = is_bool_dtype(dtype) and dtype.name == "boolean"
     return is_int or is_bool

@@ -331,11 +340,11 @@ def _pandas_cat_null(data: DataFrame) -> DataFrame:
 def _transform_pandas_df(
     data: DataFrame,
     enable_categorical: bool,
-    feature_names: FeatureNames = None,
-    feature_types: FeatureTypes = None,
+    feature_names: Optional[FeatureNames] = None,
+    feature_types: Optional[FeatureTypes] = None,
     meta: Optional[str] = None,
-    meta_type: Optional[str] = None,
-) -> Tuple[np.ndarray, FeatureNames, FeatureTypes]:
+    meta_type: Optional[NumpyDType] = None,
+) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]:
     from pandas.api.types import (
         is_sparse,
         is_categorical_dtype,
@@ -359,7 +368,7 @@ def _transform_pandas_df(
     if meta and len(data.columns) > 1 and meta not in _matrix_meta:
         raise ValueError(f"DataFrame for {meta} cannot have multiple columns")

-    dtype: Union[Type[np.floating], str] = meta_type if meta_type else np.float32
+    dtype = meta_type if meta_type else np.float32
     arr: np.ndarray = transformed.values
     if meta_type:
         arr = arr.astype(dtype)
@@ -369,18 +378,18 @@ def _transform_pandas_df(
 def _from_pandas_df(
     data: DataFrame,
     enable_categorical: bool,
-    missing: float,
+    missing: FloatCompatible,
     nthread: int,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]:
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     data, feature_names, feature_types = _transform_pandas_df(
         data, enable_categorical, feature_names, feature_types
     )
     return _from_numpy_array(data, missing, nthread, feature_names, feature_types)


-def _is_pandas_series(data):
+def _is_pandas_series(data: DataType) -> bool:
     try:
         import pandas as pd
     except ImportError:
@@ -389,18 +398,21 @@ def _is_pandas_series(data):


 def _meta_from_pandas_series(
-    data, name: str, dtype: Optional[str], handle: ctypes.c_void_p
+    data: DataType,
+    name: str,
+    dtype: Optional[NumpyDType],
+    handle: ctypes.c_void_p
 ) -> None:
     """Help transform pandas series for meta data like labels"""
     data = data.values.astype('float')
     from pandas.api.types import is_sparse
     if is_sparse(data):
-        data = data.to_dense()
+        data = data.to_dense()  # type: ignore
     assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1
     _meta_from_numpy(data, name, dtype, handle)


-def _is_modin_series(data):
+def _is_modin_series(data: DataType) -> bool:
     try:
         import modin.pandas as pd
     except ImportError:
@@ -409,13 +421,13 @@ def _is_modin_series(data):


 def _from_pandas_series(
-    data,
-    missing: float,
+    data: DataType,
+    missing: FloatCompatible,
     nthread: int,
     enable_categorical: bool,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     from pandas.api.types import is_categorical_dtype

     if (data.dtype.name not in _pandas_dtype_mapper) and not (
@@ -433,7 +445,7 @@ def _from_pandas_series(
     )


-def _is_dt_df(data):
+def _is_dt_df(data: DataType) -> bool:
     return lazy_isinstance(data, 'datatable', 'Frame') or \
         lazy_isinstance(data, 'datatable', 'DataTable')

@@ -443,12 +455,12 @@ _dt_type_mapper2 = {'bool': 'i', 'int': 'int', 'real': 'float'}


 def _transform_dt_df(
-    data,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-    meta=None,
-    meta_type=None,
-):
+    data: DataType,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+    meta: Optional[str] = None,
+    meta_type: Optional[NumpyDType] = None,
+) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]:
     """Validate feature names and types if data table"""
     if meta and data.shape[1] > 1:
         raise ValueError('DataTable for meta info cannot have multiple columns')
@@ -482,13 +494,13 @@ def _transform_dt_df(


 def _from_dt_df(
-    data,
-    missing,
-    nthread,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
+    data: DataType,
+    missing: Optional[FloatCompatible],
+    nthread: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
     enable_categorical: bool,
-) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]:
+) -> DispatchedDataBackendReturnType:
     if enable_categorical:
         raise ValueError("categorical data in datatable is not supported yet.")
     data, feature_names, feature_types = _transform_dt_df(
@@ -525,7 +537,7 @@ def _from_dt_df(
     return handle, feature_names, feature_types


-def _is_arrow(data) -> bool:
+def _is_arrow(data: DataType) -> bool:
     try:
         import pyarrow as pa
         from pyarrow import dataset as arrow_dataset
@@ -571,13 +583,13 @@ def record_batch_data_iter(data_iter: Iterator) -> Callable:


 def _from_arrow(
-    data,
-    missing: float,
+    data: DataType,
+    missing: FloatCompatible,
     nthread: int,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
     enable_categorical: bool,
-) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]:
+) -> DispatchedDataBackendReturnType:
     import pyarrow as pa

     if not all(
@@ -605,11 +617,11 @@ def _from_arrow(
     return handle, feature_names, feature_types


-def _is_cudf_df(data) -> bool:
+def _is_cudf_df(data: DataType) -> bool:
     return lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")


-def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
+def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes:
     """Extract CuDF __cuda_array_interface__.  This is special as it returns a new list of
     data and a list of array interfaces.  The data is list of categorical codes that
     caller can safely ignore, but have to keep their reference alive until usage of array
@@ -645,11 +657,11 @@ def _cudf_array_interfaces(data, cat_codes: list) -> bytes:


 def _transform_cudf_df(
-    data,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
+    data: DataType,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
     enable_categorical: bool,
-):
+) -> Tuple[ctypes.c_void_p, list, Optional[FeatureNames], Optional[FeatureTypes]]:
     try:
         from cudf.api.types import is_categorical_dtype
     except ImportError:
@@ -709,13 +721,13 @@ def _transform_cudf_df(


 def _from_cudf_df(
-    data,
-    missing,
-    nthread,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
+    data: DataType,
+    missing: FloatCompatible,
+    nthread: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
     enable_categorical: bool,
-) -> Tuple[ctypes.c_void_p, Any, Any]:
+) -> DispatchedDataBackendReturnType:
     data, cat_codes, feature_names, feature_types = _transform_cudf_df(
         data, feature_names, feature_types, enable_categorical
     )
@@ -732,7 +744,7 @@ def _from_cudf_df(
     return handle, feature_names, feature_types


-def _is_cudf_ser(data):
+def _is_cudf_ser(data: DataType) -> bool:
     try:
         import cudf
     except ImportError:
@@ -740,13 +752,13 @@ def _is_cudf_ser(data):
     return isinstance(data, cudf.Series)


-def _is_cupy_array(data: Any) -> bool:
+def _is_cupy_array(data: DataType) -> bool:
     return lazy_isinstance(data, "cupy.core.core", "ndarray") or lazy_isinstance(
         data, "cupy._core.core", "ndarray"
     )


-def _transform_cupy_array(data):
+def _transform_cupy_array(data: DataType) -> CupyT:
     import cupy  # pylint: disable=import-error
     if not hasattr(data, '__cuda_array_interface__') and hasattr(
             data, '__array__'):
@@ -757,12 +769,12 @@ def _transform_cupy_array(data):


 def _from_cupy_array(
-    data,
-    missing,
-    nthread,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    data: DataType,
+    missing: FloatCompatible,
+    nthread: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     """Initialize DMatrix from cupy ndarray."""
     data = _transform_cupy_array(data)
     interface_str = _cuda_array_interface(data)
@@ -776,7 +788,7 @@ def _from_cupy_array(
     return handle, feature_names, feature_types


-def _is_cupy_csr(data):
+def _is_cupy_csr(data: DataType) -> bool:
     try:
         import cupyx
     except ImportError:
@@ -784,7 +796,7 @@ def _is_cupy_csr(data):
     return isinstance(data, cupyx.scipy.sparse.csr_matrix)


-def _is_cupy_csc(data):
+def _is_cupy_csc(data: DataType) -> bool:
     try:
         import cupyx
     except ImportError:
@@ -792,11 +804,11 @@ def _is_cupy_csc(data):
     return isinstance(data, cupyx.scipy.sparse.csc_matrix)


-def _is_dlpack(data):
+def _is_dlpack(data: DataType) -> bool:
     return 'PyCapsule' in str(type(data)) and "dltensor" in str(data)


-def _transform_dlpack(data):
+def _transform_dlpack(data: DataType) -> bool:
     from cupy import fromDlpack  # pylint: disable=E0401
     assert 'used_dltensor' not in str(data)
     data = fromDlpack(data)
@@ -804,27 +816,27 @@ def _transform_dlpack(data):


 def _from_dlpack(
-    data,
-    missing,
-    nthread,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    data: DataType,
+    missing: FloatCompatible,
+    nthread: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     data = _transform_dlpack(data)
     return _from_cupy_array(data, missing, nthread, feature_names,
                             feature_types)


-def _is_uri(data):
+def _is_uri(data: DataType) -> bool:
     return isinstance(data, (str, os.PathLike))


 def _from_uri(
-    data,
-    missing,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    data: DataType,
+    missing: Optional[FloatCompatible],
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     _warn_unused_missing(data, missing)
     handle = ctypes.c_void_p()
     data = os.fspath(os.path.expanduser(data))
@@ -834,51 +846,51 @@ def _from_uri(
     return handle, feature_names, feature_types


-def _is_list(data):
+def _is_list(data: DataType) -> bool:
     return isinstance(data, list)


 def _from_list(
-    data,
-    missing,
-    n_threads,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    data: Sequence,
+    missing: FloatCompatible,
+    n_threads: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     array = np.array(data)
     _check_data_shape(data)
     return _from_numpy_array(array, missing, n_threads, feature_names, feature_types)


-def _is_tuple(data):
+def _is_tuple(data: DataType) -> bool:
     return isinstance(data, tuple)


 def _from_tuple(
-    data,
-    missing,
-    n_threads,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
-):
+    data: Sequence,
+    missing: FloatCompatible,
+    n_threads: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
+) -> DispatchedDataBackendReturnType:
     return _from_list(data, missing, n_threads, feature_names, feature_types)


-def _is_iter(data):
+def _is_iter(data: DataType) -> bool:
     return isinstance(data, DataIter)


-def _has_array_protocol(data):
+def _has_array_protocol(data: DataType) -> bool:
     return hasattr(data, '__array__')


-def _convert_unknown_data(data):
+def _convert_unknown_data(data: DataType) -> DataType:
     warnings.warn(
         f'Unknown data type: {type(data)}, trying to convert it to csr_matrix',
         UserWarning
     )
     try:
         import scipy
         import scipy.sparse
     except ImportError:
         return None

@@ -891,13 +903,13 @@ def _convert_unknown_data(data):


 def dispatch_data_backend(
-    data,
-    missing,
-    threads,
-    feature_names: FeatureNames,
-    feature_types: FeatureTypes,
+    data: DataType,
+    missing: FloatCompatible,  # Or Optional[Float]
+    threads: int,
+    feature_names: Optional[FeatureNames],
+    feature_types: Optional[FeatureTypes],
     enable_categorical: bool = False,
-):
+) -> DispatchedDataBackendReturnType:
     '''Dispatch data for DMatrix.'''
     if not _is_cudf_ser(data) and not _is_pandas_series(data):
         _check_data_shape(data)
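`dispatch_data_backend` is the fan-out point: it walks the `_is_*` predicates in order and forwards to the matching `_from_*` constructor, so annotating its parameters (`data: DataType`, `missing: FloatCompatible`, `threads: int`) effectively types the whole ingestion path at once. A heavily compressed sketch of the dispatch structure (only two of the many branches, using the predicate and constructor names defined above):

    def dispatch_sketch(data, missing, threads, feature_names, feature_types):
        # Ordered checks; the first matching backend wins.
        if _is_scipy_csr(data):
            return _from_scipy_csr(data, missing, threads, feature_names, feature_types)
        if _is_numpy_array(data):
            return _from_numpy_array(data, missing, threads, feature_names, feature_types)
        raise TypeError('Not supported type for data.' + str(type(data)))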
@@ -964,7 +976,7 @@ def dispatch_data_backend(
     raise TypeError('Not supported type for data.' + str(type(data)))


-def _to_data_type(dtype: str, name: str):
+def _to_data_type(dtype: str, name: str) -> int:
     dtype_map = {'float32': 1, 'float64': 2, 'uint32': 3, 'uint64': 4}
     if dtype not in dtype_map:
         raise TypeError(
@@ -973,7 +985,7 @@ def _to_data_type(dtype: str, name: str):
     return dtype_map[dtype]


-def _validate_meta_shape(data: Any, name: str) -> None:
+def _validate_meta_shape(data: DataType, name: str) -> None:
     if hasattr(data, "shape"):
         msg = f"Invalid shape: {data.shape} for {name}"
         if name in _matrix_meta:
@@ -990,7 +1002,7 @@ def _validate_meta_shape(data: Any, name: str) -> None:
 def _meta_from_numpy(
     data: np.ndarray,
     field: str,
-    dtype: Optional[Union[np.dtype, str]],
+    dtype: Optional[NumpyDType],
     handle: ctypes.c_void_p,
 ) -> None:
     data, dtype = _ensure_np_dtype(data, dtype)
@@ -1001,16 +1013,26 @@ def _meta_from_numpy(
     _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface_str))


-def _meta_from_list(data, field, dtype, handle):
-    data = np.array(data)
-    _meta_from_numpy(data, field, dtype, handle)
+def _meta_from_list(
+    data: Sequence,
+    field: str,
+    dtype: Optional[NumpyDType],
+    handle: ctypes.c_void_p
+) -> None:
+    data_np = np.array(data)
+    _meta_from_numpy(data_np, field, dtype, handle)


-def _meta_from_tuple(data, field, dtype, handle):
+def _meta_from_tuple(
+    data: Sequence,
+    field: str,
+    dtype: Optional[NumpyDType],
+    handle: ctypes.c_void_p
+) -> None:
     return _meta_from_list(data, field, dtype, handle)


-def _meta_from_cudf_df(data, field: str, handle: ctypes.c_void_p) -> None:
+def _meta_from_cudf_df(data: DataType, field: str, handle: ctypes.c_void_p) -> None:
     if field not in _matrix_meta:
         _meta_from_cudf_series(data.iloc[:, 0], field, handle)
     else:
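A small but recurring mypy idiom appears in `_meta_from_list` above: the incoming `data: Sequence` is converted with `np.array` into a new variable `data_np` instead of being reassigned, because reassigning would give the parameter a type incompatible with its annotation. Sketch of the rule:

    from typing import Sequence

    import numpy as np


    def to_array(data: Sequence) -> np.ndarray:
        # `data = np.array(data)` would re-type the Sequence-annotated name
        # and fail type checking; a fresh name keeps both types precise.
        data_np = np.array(data)
        return data_np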
@ -1019,7 +1041,7 @@ def _meta_from_cudf_df(data, field: str, handle: ctypes.c_void_p) -> None:
|
||||
_check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface))
|
||||
|
||||
|
||||
def _meta_from_cudf_series(data, field, handle):
|
||||
def _meta_from_cudf_series(data: DataType, field: str, handle: ctypes.c_void_p) -> None:
|
||||
interface = bytes(json.dumps([data.__cuda_array_interface__],
|
||||
indent=2), 'utf-8')
|
||||
_check_call(_LIB.XGDMatrixSetInfoFromInterface(handle,
|
||||
@ -1027,7 +1049,7 @@ def _meta_from_cudf_series(data, field, handle):
|
||||
interface))
|
||||
|
||||
|
||||
def _meta_from_cupy_array(data, field, handle):
|
||||
def _meta_from_cupy_array(data: DataType, field: str, handle: ctypes.c_void_p) -> None:
|
||||
data = _transform_cupy_array(data)
|
||||
interface = bytes(json.dumps([data.__cuda_array_interface__],
|
||||
indent=2), 'utf-8')
|
||||
@ -1036,14 +1058,22 @@ def _meta_from_cupy_array(data, field, handle):
|
||||
interface))
|
||||
|
||||
|
||||
def _meta_from_dt(data, field: str, dtype, handle: ctypes.c_void_p):
|
||||
def _meta_from_dt(
|
||||
data: DataType,
|
||||
field: str,
|
||||
dtype: Optional[NumpyDType],
|
||||
handle: ctypes.c_void_p
|
||||
) -> None:
|
||||
data, _, _ = _transform_dt_df(data, None, None, field, dtype)
|
||||
_meta_from_numpy(data, field, dtype, handle)
|
||||
|
||||
|
||||
def dispatch_meta_backend(
|
||||
matrix: DMatrix, data, name: str, dtype: Optional[Union[str, np.dtype]] = None
|
||||
):
|
||||
matrix: DMatrix,
|
||||
data: DataType,
|
||||
name: str,
|
||||
dtype: Optional[NumpyDType] = None
|
||||
) -> None:
|
||||
'''Dispatch for meta info.'''
|
||||
handle = matrix.handle
|
||||
assert handle is not None
|
||||
@ -1060,8 +1090,7 @@ def dispatch_meta_backend(
|
||||
_meta_from_numpy(data, name, dtype, handle)
|
||||
return
|
||||
if _is_pandas_df(data):
|
||||
data, _, _ = _transform_pandas_df(data, False, meta=name,
|
||||
meta_type=dtype)
|
||||
data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype)
|
||||
_meta_from_numpy(data, name, dtype, handle)
|
||||
return
|
||||
if _is_pandas_series(data):
|
||||
@ -1107,7 +1136,7 @@ class SingleBatchInternalIter(DataIter):  # pylint: disable=R0902
    area for meta info.

    '''
    def __init__(self, **kwargs: Any):
    def __init__(self, **kwargs: Any) -> None:
        self.kwargs = kwargs
        self.it = 0  # pylint: disable=invalid-name
        super().__init__()
@ -1124,11 +1153,13 @@ class SingleBatchInternalIter(DataIter):  # pylint: disable=R0902


def _proxy_transform(
    data,
    feature_names: FeatureNames,
    feature_types: FeatureTypes,
    data: DataType,
    feature_names: Optional[FeatureNames],
    feature_types: Optional[FeatureTypes],
    enable_categorical: bool,
):
) -> Tuple[
    Union[bool, ctypes.c_void_p, np.ndarray],
    Optional[list], Optional[FeatureNames], Optional[FeatureTypes]]:
    if _is_cudf_df(data) or _is_cudf_ser(data):
        return _transform_cudf_df(
            data, feature_names, feature_types, enable_categorical
@ -1152,7 +1183,7 @@ def _proxy_transform(

def dispatch_proxy_set_data(
    proxy: _ProxyDMatrix,
    data: Any,
    data: DataType,
    cat_codes: Optional[list],
    allow_host: bool,
) -> None:
@ -1162,11 +1193,11 @@ def dispatch_proxy_set_data(

    if _is_cudf_df(data):
        # pylint: disable=W0212
        proxy._set_data_from_cuda_columnar(data, cat_codes)
        proxy._set_data_from_cuda_columnar(data, cast(List, cat_codes))
        return
    if _is_cudf_ser(data):
        # pylint: disable=W0212
        proxy._set_data_from_cuda_columnar(data, cat_codes)
        proxy._set_data_from_cuda_columnar(data, cast(List, cat_codes))
        return
    if _is_cupy_array(data):
        proxy._set_data_from_cuda_interface(data)  # pylint: disable=W0212
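The cast(List, cat_codes) calls are a mypy idiom, not a runtime conversion: callers guarantee cat_codes is not None by this point, and cast() only informs the checker. A self-contained sketch of the pattern (names here are illustrative, not from the diff):

from typing import List, Optional, cast

def consume(codes: List[int]) -> int:
    return sum(codes)

def dispatch(codes: Optional[List[int]]) -> int:
    # codes is known to be non-None on this path; cast() costs nothing at runtime.
    return consume(cast(List[int], codes))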
@ -4,16 +4,34 @@
"""Plotting Library."""
from io import BytesIO
import json
from typing import Optional, Any

import numpy as np

from ._typing import PathLike
from .core import Booster
from .sklearn import XGBModel

Axes = Any  # real type is matplotlib.axes.Axes
GraphvizSource = Any  # real type is graphviz.Source


def plot_importance(booster, ax=None, height=0.2,
                    xlim=None, ylim=None, title='Feature importance',
                    xlabel='F score', ylabel='Features', fmap='',
                    importance_type='weight', max_num_features=None,
                    grid=True, show_values=True, **kwargs):
def plot_importance(
    booster: Booster,
    ax: Optional[Axes] = None,
    height: float = 0.2,
    xlim: Optional[tuple] = None,
    ylim: Optional[tuple] = None,
    title: str = "Feature importance",
    xlabel: str = "F score",
    ylabel: str = "Features",
    fmap: PathLike = "",
    importance_type: str = "weight",
    max_num_features: Optional[int] = None,
    grid: bool = True,
    show_values: bool = True,
    **kwargs: Any
) -> Axes:
    """Plot importance based on fitted trees.

    Parameters
@ -78,9 +96,9 @@ def plot_importance(booster, ax=None, height=0.2,
    tuples = [(k, importance[k]) for k in importance]
    if max_num_features is not None:
        # pylint: disable=invalid-unary-operand-type
        tuples = sorted(tuples, key=lambda x: x[1])[-max_num_features:]
        tuples = sorted(tuples, key=lambda _x: _x[1])[-max_num_features:]
    else:
        tuples = sorted(tuples, key=lambda x: x[1])
        tuples = sorted(tuples, key=lambda _x: _x[1])
    labels, values = zip(*tuples)

    if ax is None:
@ -120,9 +138,17 @@ def plot_importance(booster, ax=None, height=0.2,
    return ax
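With the annotations in place, a typical call type-checks as written below. A usage sketch, assuming matplotlib is installed:

import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb

X, y = np.random.rand(100, 5), np.random.randint(0, 2, 100)
bst = xgb.train({"objective": "binary:logistic"}, xgb.DMatrix(X, label=y), num_boost_round=5)
# The return value is a matplotlib Axes; max_num_features is Optional[int].
ax = xgb.plot_importance(bst, importance_type="weight", max_num_features=10)
plt.show()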
def to_graphviz(booster, fmap='', num_trees=0, rankdir=None,
                yes_color=None, no_color=None,
                condition_node_params=None, leaf_node_params=None, **kwargs):
def to_graphviz(
    booster: Booster,
    fmap: PathLike = "",
    num_trees: int = 0,
    rankdir: Optional[str] = None,
    yes_color: Optional[str] = None,
    no_color: Optional[str] = None,
    condition_node_params: Optional[dict] = None,
    leaf_node_params: Optional[dict] = None,
    **kwargs: Any
) -> GraphvizSource:
    """Convert specified tree to graphviz instance. IPython can automatically plot
    the returned graphviz instance. Otherwise, you should call the .render() method
    of the returned graphviz instance.
@ -212,7 +238,14 @@ def to_graphviz(booster, fmap='', num_trees=0, rankdir=None,
    return g
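GraphvizSource = Any exists only because graphviz is an optional dependency; at runtime the returned object is a graphviz.Source. Continuing the sketch above (assumes the graphviz package is installed):

src = xgb.to_graphviz(bst, num_trees=0)
src.render("tree0")  # writes tree0 and tree0.pdf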
def plot_tree(booster, fmap='', num_trees=0, rankdir=None, ax=None, **kwargs):
def plot_tree(
    booster: Booster,
    fmap: PathLike = "",
    num_trees: int = 0,
    rankdir: Optional[str] = None,
    ax: Optional[Axes] = None,
    **kwargs: Any
) -> Axes:
    """Plot specified tree.

    Parameters
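plot_tree draws the same graphviz output onto a matplotlib Axes, so the two helpers compose. Continuing the sketch:

fig, ax = plt.subplots(figsize=(10, 6))
xgb.plot_tree(bst, num_trees=0, ax=ax)
plt.show()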
@ -4,8 +4,19 @@ import copy
import warnings
import json
import os
from typing import Union, Optional, List, Dict, Callable, Tuple, Any, TypeVar, Type, cast
from typing import Sequence
from typing import (
    Union,
    Optional,
    List,
    Dict,
    Callable,
    Sequence,
    Tuple,
    Any,
    TypeVar,
    Type,
    cast,
)
import numpy as np

from .core import Booster, DMatrix, XGBoostError
@ -14,7 +25,7 @@ from .core import Metric
from .training import train
from .callback import TrainingCallback
from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array
from ._typing import ArrayLike, FeatureTypes
from ._typing import ArrayLike, FeatureNames, FeatureTypes

# Do not use class names on scikit-learn directly.  Re-define the classes on
# .compat to guarantee the behavior without scikit-learn
@ -401,7 +412,7 @@ def _wrap_evaluation_matrices(
    eval_qid: Optional[Sequence[Any]],
    create_dmatrix: Callable,
    enable_categorical: bool,
    feature_types: FeatureTypes,
    feature_types: Optional[FeatureTypes],
) -> Tuple[Any, List[Tuple[Any, str]]]:
    """Convert array_like evaluation matrices into DMatrix. Perform validation on the way.

@ -717,7 +728,7 @@ class XGBModel(XGBModelBase):
        return self._estimator_type  # pylint: disable=no-member

    def save_model(self, fname: Union[str, os.PathLike]) -> None:
        meta = {}
        meta: Dict[str, Any] = {}
        for k, v in self.__dict__.items():
            if k == '_le':
                meta['_le'] = self._le.to_json()
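The Dict[str, Any] annotation on meta matters because the scikit-learn attributes stored there mix strings, numbers, and JSON blobs. Usage is unchanged; a sketch reusing the toy data above:

clf = xgb.XGBClassifier(n_estimators=5)
clf.fit(X, y)
clf.save_model("clf.json")  # booster plus sklearn metadata in one file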
@ -1231,7 +1242,7 @@ class XGBModel(XGBModelBase):
            importance_type=self.importance_type if self.importance_type else dft()
        )
        if b.feature_names is None:
            feature_names = [f"f{i}" for i in range(self.n_features_in_)]
            feature_names: FeatureNames = [f"f{i}" for i in range(self.n_features_in_)]
        else:
            feature_names = b.feature_names
        # gblinear returns all features so the `get` in next line is only for gbtree.
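The annotated fallback covers boosters trained without names: importances are then keyed f0, f1, and so on. A quick check, continuing the sketch:

print(clf.get_booster().feature_names)  # None when fit on a bare numpy array
print(clf.feature_importances_.shape)   # (5,): one entry per f0..f4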
@ -5,20 +5,24 @@
import copy
import os
import warnings
from typing import Optional, Dict, Any, Union, Tuple, Sequence
from typing import Optional, Dict, Any, Union, Tuple, Sequence, List, cast, Iterable

import numpy as np

from .callback import TrainingCallback, CallbackContainer, EvaluationMonitor, EarlyStopping
from .core import Booster, DMatrix, XGBoostError, _deprecate_positional_args
from .core import Metric, Objective
from .compat import (SKLEARN_INSTALLED, XGBStratifiedKFold)
from . import callback
from .compat import SKLEARN_INSTALLED, XGBStratifiedKFold, DataFrame
from ._typing import _F, FPreProcCallable, BoosterParam

_CVFolds = Sequence["CVPack"]


def _assert_new_callback(
    callbacks: Optional[Sequence[callback.TrainingCallback]]
    callbacks: Optional[Sequence[TrainingCallback]]
) -> None:
    is_new_callback: bool = not callbacks or all(
        isinstance(c, callback.TrainingCallback) for c in callbacks
        isinstance(c, TrainingCallback) for c in callbacks
    )
    if not is_new_callback:
        link = "https://xgboost.readthedocs.io/en/latest/python/callbacks.html"
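_assert_new_callback rejects the legacy function-style callbacks, so anything passed in must subclass TrainingCallback. A minimal sketch of a conforming callback (the class name is illustrative):

class PrintIteration(xgb.callback.TrainingCallback):
    """Print each epoch; returning False never stops training early."""
    def after_iteration(self, model, epoch, evals_log):
        print(f"finished epoch {epoch}")
        return False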
@ -56,10 +60,10 @@ def train(
    feval: Optional[Metric] = None,
    maximize: Optional[bool] = None,
    early_stopping_rounds: Optional[int] = None,
    evals_result: callback.TrainingCallback.EvalsLog = None,
    evals_result: TrainingCallback.EvalsLog = None,
    verbose_eval: Optional[Union[bool, int]] = True,
    xgb_model: Optional[Union[str, os.PathLike, Booster, bytearray]] = None,
    callbacks: Optional[Sequence[callback.TrainingCallback]] = None,
    callbacks: Optional[Sequence[TrainingCallback]] = None,
    custom_metric: Optional[Metric] = None,
) -> Booster:
    """Train a booster with given parameters.
@ -159,12 +163,12 @@ def train(
    _assert_new_callback(callbacks)
    if verbose_eval:
        verbose_eval = 1 if verbose_eval is True else verbose_eval
        callbacks.append(callback.EvaluationMonitor(period=verbose_eval))
        callbacks.append(EvaluationMonitor(period=verbose_eval))
    if early_stopping_rounds:
        callbacks.append(
            callback.EarlyStopping(rounds=early_stopping_rounds, maximize=maximize)
            EarlyStopping(rounds=early_stopping_rounds, maximize=maximize)
        )
    cb_container = callback.CallbackContainer(
    cb_container = CallbackContainer(
        callbacks,
        metric=metric_fn,
        # For old `feval` parameter, the behavior is unchanged.  For the new
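A usage sketch of the typed train entry point, reusing the toy data and the callback defined above; EvalsLog is the alias declared on TrainingCallback:

dtrain = xgb.DMatrix(X, label=y)
evals_result: xgb.callback.TrainingCallback.EvalsLog = {}
bst = xgb.train(
    {"objective": "binary:logistic"},
    dtrain,
    num_boost_round=20,
    evals=[(dtrain, "train")],
    evals_result=evals_result,
    callbacks=[PrintIteration(), xgb.callback.EarlyStopping(rounds=5)],
)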
@ -194,71 +198,73 @@ def train(

class CVPack:
    """Auxiliary datastruct to hold one fold of CV."""
    def __init__(self, dtrain, dtest, param):
    def __init__(self, dtrain: DMatrix, dtest: DMatrix, param: Optional[Union[Dict, List]]) -> None:
        """Initialize the CVPack."""
        self.dtrain = dtrain
        self.dtest = dtest
        self.watchlist = [(dtrain, 'train'), (dtest, 'test')]
        self.bst = Booster(param, [dtrain, dtest])

    def __getattr__(self, name):
        def _inner(*args, **kwargs):
    def __getattr__(self, name: str) -> _F:
        def _inner(*args: Any, **kwargs: Any) -> Any:
            return getattr(self.bst, name)(*args, **kwargs)
        return _inner
        return cast(_F, _inner)

    def update(self, iteration, fobj):
    def update(self, iteration: int, fobj: Optional[Objective]) -> None:
        """Update the boosters for one iteration."""
        self.bst.update(self.dtrain, iteration, fobj)

    def eval(self, iteration, feval, output_margin):
    def eval(self, iteration: int, feval: Optional[Metric], output_margin: bool) -> str:
        """Evaluate the CVPack for one iteration."""
        return self.bst.eval_set(self.watchlist, iteration, feval, output_margin)
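The __getattr__ trick leans on _F = TypeVar("_F", bound=Callable[..., Any]) from _typing; the cast has no runtime effect and only tells the checker the forwarded attribute is callable. A standalone sketch of the pattern (class and names are illustrative):

from typing import Any, Callable, TypeVar, cast

_F = TypeVar("_F", bound=Callable[..., Any])

class _Forwarder:
    def __init__(self, target: Any) -> None:
        self._target = target

    def __getattr__(self, name: str) -> _F:
        def _inner(*args: Any, **kwargs: Any) -> Any:
            # Delegate the call to the wrapped object.
            return getattr(self._target, name)(*args, **kwargs)
        return cast(_F, _inner)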
class _PackedBooster:
    def __init__(self, cvfolds) -> None:
    def __init__(self, cvfolds: _CVFolds) -> None:
        self.cvfolds = cvfolds

    def update(self, iteration, obj):
    def update(self, iteration: int, obj: Optional[Objective]) -> None:
        '''Iterate through folds for update'''
        for fold in self.cvfolds:
            fold.update(iteration, obj)

    def eval(self, iteration, feval, output_margin):
    def eval(self, iteration: int, feval: Optional[Metric], output_margin: bool) -> List[str]:
        '''Iterate through folds for eval'''
        result = [f.eval(iteration, feval, output_margin) for f in self.cvfolds]
        return result

    def set_attr(self, **kwargs):
    def set_attr(self, **kwargs: Optional[str]) -> Any:
        '''Iterate through folds for setting attributes'''
        for f in self.cvfolds:
            f.bst.set_attr(**kwargs)

    def attr(self, key):
    def attr(self, key: str) -> Optional[str]:
        '''Redirect to booster attr.'''
        return self.cvfolds[0].bst.attr(key)

    def set_param(self, params, value=None):
    def set_param(self,
                  params: Union[Dict, Iterable[Tuple[str, Any]], str],
                  value: Optional[str] = None) -> None:
        """Iterate through folds for set_param"""
        for f in self.cvfolds:
            f.bst.set_param(params, value)

    def num_boosted_rounds(self):
    def num_boosted_rounds(self) -> int:
        '''Number of boosted rounds.'''
        return self.cvfolds[0].num_boosted_rounds()

    @property
    def best_iteration(self):
    def best_iteration(self) -> int:
        '''Get best_iteration'''
        return int(self.cvfolds[0].bst.attr("best_iteration"))
        return int(cast(int, self.cvfolds[0].bst.attr("best_iteration")))

    @property
    def best_score(self):
    def best_score(self) -> float:
        """Get best_score."""
        return float(self.cvfolds[0].bst.attr("best_score"))
        return float(cast(float, self.cvfolds[0].bst.attr("best_score")))
def groups_to_rows(groups, boundaries):
def groups_to_rows(groups: List[np.ndarray], boundaries: np.ndarray) -> np.ndarray:
    """
    Given group row boundaries, convert group indexes to row indexes
    :param groups: list of groups for testing
@ -268,7 +274,9 @@ def groups_to_rows(groups, boundaries):
    return np.concatenate([np.arange(boundaries[g], boundaries[g+1]) for g in groups])
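A worked example of the boundary arithmetic: group g spans rows [boundaries[g], boundaries[g+1]), so selecting groups 0 and 2 below yields rows 0-2 and 5-8. The helper is module-level in xgboost.training; it is called directly here only for illustration:

import numpy as np
from xgboost.training import groups_to_rows

boundaries = np.array([0, 3, 5, 9])
print(groups_to_rows(np.array([0, 2]), boundaries))  # [0 1 2 5 6 7 8]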
def mkgroupfold(dall, nfold, param, evals=(), fpreproc=None, shuffle=True):
def mkgroupfold(dall: DMatrix, nfold: int, param: BoosterParam,
                evals: Sequence[str] = (), fpreproc: FPreProcCallable = None,
                shuffle: bool = True) -> List[CVPack]:
    """
    Make n folds for cross-validation maintaining groups
    :return: cross-validation folds
@ -308,8 +316,10 @@ def mkgroupfold(dall, nfold, param, evals=(), fpreproc=None, shuffle=True):
    return ret


def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False,
            folds=None, shuffle=True):
def mknfold(dall: DMatrix, nfold: int, param: BoosterParam, seed: int,
            evals: Sequence[str] = (), fpreproc: FPreProcCallable = None,
            stratified: bool = False, folds: XGBStratifiedKFold = None, shuffle: bool = True
) -> List[CVPack]:
    """
    Make an n-fold list of CVPack from random indices.
    """
@ -362,11 +372,27 @@ def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False,
    return ret
def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None,
       metrics=(), obj: Optional[Objective] = None,
       feval=None, maximize=None, early_stopping_rounds=None,
       fpreproc=None, as_pandas=True, verbose_eval=None, show_stdv=True,
       seed=0, callbacks=None, shuffle=True, custom_metric: Optional[Metric] = None):
def cv(
    params: BoosterParam,
    dtrain: DMatrix,
    num_boost_round: int = 10,
    nfold: int = 3,
    stratified: bool = False,
    folds: XGBStratifiedKFold = None,
    metrics: Sequence[str] = (),
    obj: Optional[Objective] = None,
    feval: Optional[Metric] = None,
    maximize: bool = None,
    early_stopping_rounds: int = None,
    fpreproc: FPreProcCallable = None,
    as_pandas: bool = True,
    verbose_eval: Optional[Union[int, bool]] = None,
    show_stdv: bool = True,
    seed: int = 0,
    callbacks: Sequence[TrainingCallback] = None,
    shuffle: bool = True,
    custom_metric: Optional[Metric] = None,
) -> Union[Dict[str, float], DataFrame]:
    # pylint: disable = invalid-name
    """Cross-validation with given parameters.
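The Union[Dict[str, float], DataFrame] return tracks as_pandas: with pandas installed, the per-round means and standard deviations come back as a DataFrame. A usage sketch with the toy DMatrix from above:

res = xgb.cv(
    {"objective": "binary:logistic"},
    dtrain,
    num_boost_round=10,
    nfold=3,
    metrics=("logloss",),
    seed=0,
)
print(res["test-logloss-mean"].iloc[-1])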
@ -477,7 +503,7 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None

    params.pop("eval_metric", None)

    results = {}
    results: Dict[str, List[float]] = {}
    cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc,
                      stratified, folds, shuffle)

@ -490,13 +516,13 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
    if verbose_eval:
        verbose_eval = 1 if verbose_eval is True else verbose_eval
        callbacks.append(
            callback.EvaluationMonitor(period=verbose_eval, show_stdv=show_stdv)
            EvaluationMonitor(period=verbose_eval, show_stdv=show_stdv)
        )
    if early_stopping_rounds:
        callbacks.append(
            callback.EarlyStopping(rounds=early_stopping_rounds, maximize=maximize)
            EarlyStopping(rounds=early_stopping_rounds, maximize=maximize)
        )
    callbacks = callback.CallbackContainer(
    callbacks_container = CallbackContainer(
        callbacks,
        metric=metric_fn,
        is_cv=True,
@ -504,16 +530,16 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
    )

    booster = _PackedBooster(cvfolds)
    callbacks.before_training(booster)
    callbacks_container.before_training(booster)

    for i in range(num_boost_round):
        if callbacks.before_iteration(booster, i, dtrain, None):
        if callbacks_container.before_iteration(booster, i, dtrain, None):
            break
        booster.update(i, obj)

        should_break = callbacks.after_iteration(booster, i, dtrain, None)
        res = callbacks.aggregated_cv
        for key, mean, std in res:
        should_break = callbacks_container.after_iteration(booster, i, dtrain, None)
        res = callbacks_container.aggregated_cv
        for key, mean, std in cast(List[Tuple[str, float, float]], res):
            if key + '-mean' not in results:
                results[key + '-mean'] = []
            if key + '-std' not in results:
@ -532,6 +558,6 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
        except ImportError:
            pass

    callbacks.after_training(booster)
    callbacks_container.after_training(booster)

    return results