Add type hints to core.py (#7707)

Co-authored-by: Chengyang Gu <bridgream@gmail.com>
Co-authored-by: jiamingy <jm.yuan@outlook.com>
Chengyang 2022-03-23 09:12:14 -04:00 committed by GitHub
parent 66cb4afc6c
commit c92ab2ce49
5 changed files with 279 additions and 200 deletions

python-package/xgboost/_typing.py (new file)

@ -0,0 +1,60 @@
"""Shared typing definition."""
import ctypes
import os
from typing import Optional, List, Any, TypeVar, Union
# os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/dt.Frame/
# cudf.DataFrame/cupy.array/dlpack
DataType = Any
# XGBoost accepts some other possible types in practice due to historical reasons; these are
# less tested. For now we encourage users to pass a simple list of strings.
FeatureNames = Optional[List[str]]
ArrayLike = Any
PathLike = Union[str, os.PathLike]
CupyT = ArrayLike # maybe need a stub for cupy arrays
NumpyOrCupy = Any
# ctypes
# c_bst_ulong corresponds to bst_ulong defined in xgboost/c_api.h
c_bst_ulong = ctypes.c_uint64 # pylint: disable=C0103
CTypeT = Union[
ctypes.c_void_p,
ctypes.c_char_p,
ctypes.c_int,
ctypes.c_float,
ctypes.c_uint,
ctypes.c_size_t,
]
# supported numeric types
CNumeric = Union[
ctypes.c_float,
ctypes.c_double,
ctypes.c_uint,
ctypes.c_uint64,
ctypes.c_int32,
ctypes.c_int64,
]
# c pointer types
# The real types should be the parameterized forms below, as defined in typeshed,
# but those annotations are only valid inside a .pyi stub file.
# c_str_ptr_t = ctypes.pointer[ctypes.c_char]
CStrPtr = ctypes.pointer
# c_str_pptr_t = ctypes.pointer[ctypes.c_char_p]
CStrPptr = ctypes.pointer
# c_float_ptr_t = ctypes.pointer[ctypes.c_float]
CFloatPtr = ctypes.pointer
# c_numeric_ptr_t = Union[
# ctypes.pointer[ctypes.c_float], ctypes.pointer[ctypes.c_double],
# ctypes.pointer[ctypes.c_uint], ctypes.pointer[ctypes.c_uint64],
# ctypes.pointer[ctypes.c_int32], ctypes.pointer[ctypes.c_int64]
# ]
CNumericPtr = ctypes.pointer
# template parameter
_T = TypeVar("_T")

python-package/xgboost/core.py

@ -4,7 +4,7 @@
from abc import ABC, abstractmethod
from collections.abc import Mapping
from typing import List, Optional, Any, Union, Dict, TypeVar
from typing import Callable, Tuple, cast, Sequence
from typing import Callable, Tuple, cast, Sequence, Type, Iterable
import ctypes
import os
import re
@ -17,22 +17,30 @@ from inspect import signature, Parameter
import numpy as np
import scipy.sparse
from .compat import (STRING_TYPES, DataFrame, py_str, PANDAS_INSTALLED,
lazy_isinstance)
from .compat import STRING_TYPES, DataFrame, py_str, PANDAS_INSTALLED, lazy_isinstance
from .libpath import find_lib_path
# c_bst_ulong corresponds to bst_ulong defined in xgboost/c_api.h
c_bst_ulong = ctypes.c_uint64
# XGBoost accepts some other possible types in practice due to historical reasons; these are
# less tested. For now we encourage users to pass a simple list of strings.
FeatNamesT = Optional[List[str]]
from ._typing import (
CStrPptr,
c_bst_ulong,
CNumeric,
DataType,
CNumericPtr,
CStrPtr,
CTypeT,
ArrayLike,
CFloatPtr,
NumpyOrCupy,
FeatureNames,
_T,
CupyT,
)
class XGBoostError(ValueError):
"""Error thrown by xgboost trainer."""
def from_pystr_to_cstr(data: Union[str, List[str]]):
def from_pystr_to_cstr(data: Union[str, List[str]]) -> Union[bytes, CStrPptr]:
"""Convert a Python str or list of Python str to C pointer
Parameters
@ -44,14 +52,14 @@ def from_pystr_to_cstr(data: Union[str, List[str]]):
if isinstance(data, str):
return bytes(data, "utf-8")
if isinstance(data, list):
pointers = (ctypes.c_char_p * len(data))()
data = [bytes(d, 'utf-8') for d in data]
pointers[:] = data
pointers: ctypes.pointer = (ctypes.c_char_p * len(data))()
data_as_bytes = [bytes(d, 'utf-8') for d in data]
pointers[:] = data_as_bytes
return pointers
raise TypeError()
def from_cstr_to_pystr(data, length) -> List[str]:
def from_cstr_to_pystr(data: CStrPptr, length: c_bst_ulong) -> List[str]:
"""Revert C pointer to Python str
Parameters
@ -64,9 +72,9 @@ def from_cstr_to_pystr(data, length) -> List[str]:
res = []
for i in range(length.value):
try:
res.append(str(data[i].decode('ascii')))
res.append(str(data[i].decode('ascii'))) # type: ignore
except UnicodeDecodeError:
res.append(str(data[i].decode('utf-8')))
res.append(str(data[i].decode('utf-8'))) # type: ignore
return res
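
A standalone restatement of the round trip these two helpers now annotate; to_cstr and to_pystr are local stand-ins, not xgboost's own names:

import ctypes
from typing import List

def to_cstr(data: List[str]) -> ctypes.Array:
    # Mirrors the list branch of from_pystr_to_cstr above.
    pointers = (ctypes.c_char_p * len(data))()
    pointers[:] = [bytes(d, "utf-8") for d in data]
    return pointers

def to_pystr(data: ctypes.Array, length: int) -> List[str]:
    # Mirrors from_cstr_to_pystr, minus the ascii fast path.
    return [data[i].decode("utf-8") for i in range(length)]

names = ["feat_0", "feat_1"]
assert to_pystr(to_cstr(names), len(names)) == names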
@ -91,7 +99,7 @@ def _convert_ntree_limit(
return iteration_range
def _expect(expectations, got):
def _expect(expectations: Sequence[Type], got: Type) -> str:
"""Translate input error into string.
Parameters
@ -130,7 +138,7 @@ def _load_lib() -> ctypes.CDLL:
lib_paths = find_lib_path()
if not lib_paths:
# This happens only when building document.
return None # type: ignore
return None # type: ignore
try:
pathBackup = os.environ['PATH'].split(os.pathsep)
except KeyError:
@ -167,7 +175,7 @@ Likely causes:
Error message(s): {os_error_list}
""")
lib.XGBGetLastError.restype = ctypes.c_char_p
lib.callback = _get_log_callback_func()
lib.callback = _get_log_callback_func() # type: ignore
if lib.XGBRegisterLogCallback(lib.callback) != 0:
raise XGBoostError(lib.XGBGetLastError())
return lib
@ -192,7 +200,7 @@ def _check_call(ret: int) -> None:
raise XGBoostError(py_str(_LIB.XGBGetLastError()))
def _has_categorical(booster: "Booster", data: Any) -> bool:
def _has_categorical(booster: "Booster", data: DataType) -> bool:
"""Check whether the booster and input data for prediction contain categorical data.
"""
@ -224,8 +232,8 @@ def build_info() -> dict:
return res
def _numpy2ctypes_type(dtype):
_NUMPY_TO_CTYPES_MAPPING = {
def _numpy2ctypes_type(dtype: Type[np.number]) -> Type[CNumeric]:
_NUMPY_TO_CTYPES_MAPPING: Dict[Type[np.number], Type[CNumeric]] = {
np.float32: ctypes.c_float,
np.float64: ctypes.c_double,
np.uint32: ctypes.c_uint,
@ -242,7 +250,7 @@ def _numpy2ctypes_type(dtype):
return _NUMPY_TO_CTYPES_MAPPING[dtype]
def _cuda_array_interface(data) -> bytes:
def _cuda_array_interface(data: DataType) -> bytes:
assert (
data.dtype.hasobject is False
), "Input data contains `object` dtype. Expecting numeric data."
@ -253,9 +261,9 @@ def _cuda_array_interface(data) -> bytes:
return interface_str
def ctypes2numpy(cptr, length, dtype) -> np.ndarray:
def ctypes2numpy(cptr: CNumericPtr, length: int, dtype: Type[np.number]) -> np.ndarray:
"""Convert a ctypes pointer array to a numpy array."""
ctype = _numpy2ctypes_type(dtype)
ctype: Type[CNumeric] = _numpy2ctypes_type(dtype)
if not isinstance(cptr, ctypes.POINTER(ctype)):
raise RuntimeError(f"expected {ctype} pointer")
res = np.zeros(length, dtype=dtype)
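
For reference, a self-contained sketch of the copy ctypes2numpy performs, using a locally built C float buffer rather than one returned from the XGBoost C API:

import ctypes
import numpy as np

buf = (ctypes.c_float * 4)(1.0, 2.0, 3.0, 4.0)
cptr = ctypes.cast(buf, ctypes.POINTER(ctypes.c_float))  # plays the CNumericPtr role

res = np.zeros(4, dtype=np.float32)
# Copy length * itemsize bytes from the C buffer into the numpy array.
ctypes.memmove(res.ctypes.data, cptr, res.nbytes)
print(res)  # [1. 2. 3. 4.]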
@ -264,7 +272,7 @@ def ctypes2numpy(cptr, length, dtype) -> np.ndarray:
return res
def ctypes2cupy(cptr, length, dtype):
def ctypes2cupy(cptr: CNumericPtr, length: int, dtype: Type[np.number]) -> CupyT:
"""Convert a ctypes pointer array to a cupy array."""
# pylint: disable=import-error
import cupy
@ -290,7 +298,7 @@ def ctypes2cupy(cptr, length, dtype):
return arr
def ctypes2buffer(cptr, length) -> bytearray:
def ctypes2buffer(cptr: CStrPtr, length: int) -> bytearray:
"""Convert ctypes pointer to buffer type."""
if not isinstance(cptr, ctypes.POINTER(ctypes.c_char)):
raise RuntimeError('expected char pointer')
@ -301,25 +309,30 @@ def ctypes2buffer(cptr, length) -> bytearray:
return res
def c_str(string):
def c_str(string: str) -> ctypes.c_char_p:
"""Convert a python string to cstring."""
return ctypes.c_char_p(string.encode('utf-8'))
def c_array(ctype, values):
def c_array(ctype: Type[CTypeT], values: ArrayLike) -> ctypes.Array:
"""Convert a python string to c array."""
if isinstance(values, np.ndarray) and values.dtype.itemsize == ctypes.sizeof(ctype):
return (ctype * len(values)).from_buffer_copy(values)
return (ctype * len(values))(*values)
def _prediction_output(shape, dims, predts, is_cuda):
arr_shape: np.ndarray = ctypes2numpy(shape, dims.value, np.uint64)
def _prediction_output(
shape: CNumericPtr,
dims: c_bst_ulong,
predts: CFloatPtr,
is_cuda: bool
) -> NumpyOrCupy:
arr_shape = ctypes2numpy(shape, dims.value, np.uint64)
length = int(np.prod(arr_shape))
if is_cuda:
arr_predict = ctypes2cupy(predts, length, np.float32)
else:
arr_predict: np.ndarray = ctypes2numpy(predts, length, np.float32)
arr_predict = ctypes2numpy(predts, length, np.float32)
arr_predict = arr_predict.reshape(arr_shape)
return arr_predict
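
Two small self-contained sketches of the logic above: the two branches of c_array, and the reshape step in _prediction_output (both restated locally rather than imported from xgboost):

import ctypes
import numpy as np

def c_array(ctype, values):
    # Same logic as above: buffer copy for matching numpy item sizes,
    # element-wise construction for everything else.
    if isinstance(values, np.ndarray) and values.dtype.itemsize == ctypes.sizeof(ctype):
        return (ctype * len(values)).from_buffer_copy(values)
    return (ctype * len(values))(*values)

fast = c_array(ctypes.c_float, np.array([1.0, 2.0], dtype=np.float32))
slow = c_array(ctypes.c_float, [1.0, 2.0])
print(fast[0], slow[1])  # 1.0 2.0

# The reshape step in _prediction_output: the C API hands back a flat
# prediction buffer plus a shape array, which numpy reassembles.
arr_shape = np.array([2, 3], dtype=np.uint64)
flat = np.arange(6, dtype=np.float32)
print(flat.reshape(arr_shape))  # 2x3 prediction matrix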
@ -415,7 +428,7 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
def data_handle(
data: Any,
*,
feature_names: FeatNamesT = None,
feature_names: FeatureNames = None,
feature_types: Optional[List[str]] = None,
**kwargs: Any,
) -> None:
@ -472,7 +485,7 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
# Nicolas Tresegnie
# Sylvain Marie
# License: BSD 3 clause
def _deprecate_positional_args(f):
def _deprecate_positional_args(f: Callable[..., _T]) -> Callable[..., _T]:
"""Decorator for methods that issues warnings for positional arguments
Using the keyword-only argument syntax in pep 3102, arguments after the
@ -496,7 +509,7 @@ def _deprecate_positional_args(f):
kwonly_args.append(name)
@wraps(f)
def inner_f(*args, **kwargs):
def inner_f(*args: Any, **kwargs: Any) -> _T:
extra_args = len(args) - len(all_args)
if extra_args > 0:
# ignore first 'self' argument for instance methods
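
Usage shape of the decorator, assuming the patched package is importable; make_matrix is a hypothetical function, and the warning class follows the scikit-learn implementation this is adapted from:

import warnings
from xgboost.core import _deprecate_positional_args

@_deprecate_positional_args
def make_matrix(data, *, missing: float = 0.0) -> float:
    return missing

make_matrix([1, 2], missing=1.0)    # clean keyword call
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    make_matrix([1, 2], 1.0)        # keyword-only argument passed positionally
    assert caught                   # the decorator warned instead of raising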
@ -529,21 +542,21 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
@_deprecate_positional_args
def __init__(
self,
data,
label=None,
data: DataType,
label: Optional[ArrayLike] = None,
*,
weight=None,
base_margin=None,
weight: Optional[ArrayLike] = None,
base_margin: Optional[ArrayLike] = None,
missing: Optional[float] = None,
silent=False,
feature_names: FeatNamesT = None,
silent: bool = False,
feature_names: FeatureNames = None,
feature_types: Optional[List[str]] = None,
nthread: Optional[int] = None,
group=None,
qid=None,
label_lower_bound=None,
label_upper_bound=None,
feature_weights=None,
group: Optional[ArrayLike] = None,
qid: Optional[ArrayLike] = None,
label_lower_bound: Optional[ArrayLike] = None,
label_upper_bound: Optional[ArrayLike] = None,
feature_weights: Optional[ArrayLike] = None,
enable_categorical: bool = False,
) -> None:
"""Parameters
@ -658,7 +671,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
"nthread": self.nthread,
"cache_prefix": it.cache_prefix if it.cache_prefix else "",
}
args = from_pystr_to_cstr(json.dumps(args))
args_cstr = from_pystr_to_cstr(json.dumps(args))
handle = ctypes.c_void_p()
reset_callback, next_callback = it.get_callbacks(
True, enable_categorical
@ -668,7 +681,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
it.proxy.handle,
reset_callback,
next_callback,
args,
args_cstr,
ctypes.byref(handle),
)
it.reraise()
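
With the constructor typed as above, a plain numpy construction now type-checks end to end; behaviour is unchanged. A sketch, assuming an installed xgboost with this patch:

import numpy as np
import xgboost as xgb

X = np.random.rand(8, 3).astype(np.float32)   # DataType
y = np.random.rand(8).astype(np.float32)      # Optional[ArrayLike]
dtrain = xgb.DMatrix(X, label=y, weight=np.ones(8),
                     feature_names=["a", "b", "c"])  # FeatureNames
print(dtrain.num_row(), dtrain.num_col())  # 8 3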
@ -685,16 +698,16 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
def set_info(
self,
*,
label=None,
weight=None,
base_margin=None,
group=None,
qid=None,
label_lower_bound=None,
label_upper_bound=None,
feature_names: FeatNamesT = None,
label: Optional[ArrayLike] = None,
weight: Optional[ArrayLike] = None,
base_margin: Optional[ArrayLike] = None,
group: Optional[ArrayLike] = None,
qid: Optional[ArrayLike] = None,
label_lower_bound: Optional[ArrayLike] = None,
label_upper_bound: Optional[ArrayLike] = None,
feature_names: FeatureNames = None,
feature_types: Optional[List[str]] = None,
feature_weights=None
feature_weights: Optional[ArrayLike] = None
) -> None:
"""Set meta info for DMatrix. See doc string for :py:obj:`xgboost.DMatrix`."""
from .data import dispatch_meta_backend
@ -763,7 +776,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
ctypes.byref(ret)))
return ctypes2numpy(ret, length.value, np.uint32)
def set_float_info(self, field: str, data) -> None:
def set_float_info(self, field: str, data: ArrayLike) -> None:
"""Set float type property into the DMatrix.
Parameters
@ -777,7 +790,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
from .data import dispatch_meta_backend
dispatch_meta_backend(self, data, field, 'float')
def set_float_info_npy2d(self, field: str, data) -> None:
def set_float_info_npy2d(self, field: str, data: ArrayLike) -> None:
"""Set float type property into the DMatrix
for numpy 2d array input
@ -792,7 +805,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
from .data import dispatch_meta_backend
dispatch_meta_backend(self, data, field, 'float')
def set_uint_info(self, field: str, data) -> None:
def set_uint_info(self, field: str, data: ArrayLike) -> None:
"""Set uint type property into the DMatrix.
Parameters
@ -806,7 +819,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
from .data import dispatch_meta_backend
dispatch_meta_backend(self, data, field, 'uint32')
def save_binary(self, fname, silent=True) -> None:
def save_binary(self, fname: Union[str, os.PathLike], silent: bool = True) -> None:
"""Save DMatrix to an XGBoost buffer. Saved binary can be later loaded
by providing the path to :py:func:`xgboost.DMatrix` as input.
@ -822,7 +835,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
c_str(fname),
ctypes.c_int(silent)))
def set_label(self, label) -> None:
def set_label(self, label: ArrayLike) -> None:
"""Set label of dmatrix
Parameters
@ -833,7 +846,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
from .data import dispatch_meta_backend
dispatch_meta_backend(self, label, 'label', 'float')
def set_weight(self, weight) -> None:
def set_weight(self, weight: ArrayLike) -> None:
"""Set weight of each instance.
Parameters
@ -852,7 +865,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
from .data import dispatch_meta_backend
dispatch_meta_backend(self, weight, 'weight', 'float')
def set_base_margin(self, margin) -> None:
def set_base_margin(self, margin: ArrayLike) -> None:
"""Set base margin of booster to start from.
This can be used to specify a prediction value of existing model to be
@ -869,7 +882,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
from .data import dispatch_meta_backend
dispatch_meta_backend(self, margin, 'base_margin', 'float')
def set_group(self, group) -> None:
def set_group(self, group: ArrayLike) -> None:
"""Set group size of DMatrix (used for ranking).
Parameters
@ -997,7 +1010,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
return feature_names
@feature_names.setter
def feature_names(self, feature_names: FeatNamesT) -> None:
def feature_names(self, feature_names: FeatureNames) -> None:
"""Set feature names (column labels).
Parameters
@ -1026,9 +1039,9 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
not any(x in f for x in set(('[', ']', '<')))
for f in feature_names):
raise ValueError('feature_names must be string, and may not contain [, ] or <')
c_feature_names = [bytes(f, encoding='utf-8') for f in feature_names]
feature_names_bytes = [bytes(f, encoding='utf-8') for f in feature_names]
c_feature_names = (ctypes.c_char_p *
len(c_feature_names))(*c_feature_names)
len(feature_names_bytes))(*feature_names_bytes)
_check_call(_LIB.XGDMatrixSetStrFeatureInfo(
self.handle, c_str('feature_name'),
c_feature_names,
@ -1091,10 +1104,10 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
feature_types = [feature_types]
except TypeError:
feature_types = [feature_types]
c_feature_types = [bytes(f, encoding='utf-8')
feature_types_bytes = [bytes(f, encoding='utf-8')
for f in feature_types]
c_feature_types = (ctypes.c_char_p *
len(c_feature_types))(*c_feature_types)
len(feature_types_bytes))(*feature_types_bytes)
_check_call(_LIB.XGDMatrixSetStrFeatureInfo(
self.handle, c_str('feature_type'),
c_feature_types,
@ -1118,11 +1131,11 @@ class _ProxyDMatrix(DMatrix):
"""
def __init__(self): # pylint: disable=super-init-not-called
def __init__(self) -> None: # pylint: disable=super-init-not-called
self.handle = ctypes.c_void_p()
_check_call(_LIB.XGProxyDMatrixCreate(ctypes.byref(self.handle)))
def _set_data_from_cuda_interface(self, data) -> None:
def _set_data_from_cuda_interface(self, data: DataType) -> None:
"""Set data from CUDA array interface."""
interface = data.__cuda_array_interface__
interface_str = bytes(json.dumps(interface, indent=2), "utf-8")
@ -1130,14 +1143,14 @@ class _ProxyDMatrix(DMatrix):
_LIB.XGProxyDMatrixSetDataCudaArrayInterface(self.handle, interface_str)
)
def _set_data_from_cuda_columnar(self, data, cat_codes: list) -> None:
def _set_data_from_cuda_columnar(self, data: DataType, cat_codes: list) -> None:
"""Set data from CUDA columnar format."""
from .data import _cudf_array_interfaces
interfaces_str = _cudf_array_interfaces(data, cat_codes)
_check_call(_LIB.XGProxyDMatrixSetDataCudaColumnar(self.handle, interfaces_str))
def _set_data_from_array(self, data: np.ndarray):
def _set_data_from_array(self, data: np.ndarray) -> None:
"""Set data from numpy array."""
from .data import _array_interface
@ -1145,7 +1158,7 @@ class _ProxyDMatrix(DMatrix):
_LIB.XGProxyDMatrixSetDataDense(self.handle, _array_interface(data))
)
def _set_data_from_csr(self, csr):
def _set_data_from_csr(self, csr: scipy.sparse.csr_matrix) -> None:
"""Set data from scipy csr"""
from .data import _array_interface
@ -1175,24 +1188,24 @@ class DeviceQuantileDMatrix(DMatrix):
@_deprecate_positional_args
def __init__( # pylint: disable=super-init-not-called
self,
data,
label=None,
data: DataType,
label: Optional[ArrayLike] = None,
*,
weight=None,
base_margin=None,
missing=None,
silent=False,
feature_names: FeatNamesT = None,
feature_types=None,
weight: Optional[ArrayLike] = None,
base_margin: Optional[ArrayLike] = None,
missing: Optional[float] = None,
silent: bool = False,
feature_names: FeatureNames = None,
feature_types: Optional[List[str]] = None,
nthread: Optional[int] = None,
max_bin: int = 256,
group=None,
qid=None,
label_lower_bound=None,
label_upper_bound=None,
feature_weights=None,
group: Optional[ArrayLike] = None,
qid: Optional[ArrayLike] = None,
label_lower_bound: Optional[ArrayLike] = None,
label_upper_bound: Optional[ArrayLike] = None,
feature_weights: Optional[ArrayLike] = None,
enable_categorical: bool = False,
):
) -> None:
self.max_bin = max_bin
self.missing = missing if missing is not None else np.nan
self.nthread = nthread if nthread is not None else 1
@ -1223,7 +1236,7 @@ class DeviceQuantileDMatrix(DMatrix):
enable_categorical=enable_categorical,
)
def _init(self, data, enable_categorical: bool, **meta) -> None:
def _init(self, data: DataType, enable_categorical: bool, **meta: Any) -> None:
from .data import (
_is_dlpack,
_transform_dlpack,
@ -1304,9 +1317,10 @@ def _configure_metrics(params: Union[Dict, List]) -> Union[Dict, List]:
params = dict((k, v) for k, v in params.items())
eval_metrics = params["eval_metric"]
params.pop("eval_metric", None)
params = list(params.items())
params_list = list(params.items())
for eval_metric in eval_metrics:
params += [("eval_metric", eval_metric)]
params_list += [("eval_metric", eval_metric)]
return params_list
return params
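
What the params_list rename fixes, restated standalone: params starts as a dict and ends as a list of tuples, so reusing the name params conflated two types:

params = {"objective": "binary:logistic", "eval_metric": ["auc", "logloss"]}
eval_metrics = params.pop("eval_metric")
params_list = list(params.items())
for eval_metric in eval_metrics:
    params_list += [("eval_metric", eval_metric)]
print(params_list)
# [('objective', 'binary:logistic'), ('eval_metric', 'auc'), ('eval_metric', 'logloss')]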
@ -1417,7 +1431,7 @@ class Booster:
"Constrained features are not a subset of training data feature names"
) from e
def _configure_constraints(self, params: Union[Dict, List]) -> Union[Dict, List]:
def _configure_constraints(self, params: Union[List, Dict]) -> Union[List, Dict]:
if isinstance(params, dict):
value = params.get("monotone_constraints")
if value:
@ -1546,7 +1560,7 @@ class Booster:
def __copy__(self) -> "Booster":
return self.__deepcopy__(None)
def __deepcopy__(self, _) -> "Booster":
def __deepcopy__(self, _: Any) -> "Booster":
'''Return a copy of booster.'''
return Booster(model_file=self)
@ -1629,8 +1643,8 @@ class Booster:
def _set_feature_info(self, features: Optional[List[str]], field: str) -> None:
if features is not None:
assert isinstance(features, list)
c_feature_info = [bytes(f, encoding="utf-8") for f in features]
c_feature_info = (ctypes.c_char_p * len(c_feature_info))(*c_feature_info)
feature_info_bytes = [bytes(f, encoding="utf-8") for f in features]
c_feature_info = (ctypes.c_char_p * len(feature_info_bytes))(*feature_info_bytes)
_check_call(
_LIB.XGBoosterSetStrFeatureInfo(
self.handle, c_str(field), c_feature_info, c_bst_ulong(len(features))
@ -1664,10 +1678,14 @@ class Booster:
return self._get_feature_info("feature_name")
@feature_names.setter
def feature_names(self, features: FeatNamesT) -> None:
def feature_names(self, features: FeatureNames) -> None:
self._set_feature_info(features, "feature_name")
def set_param(self, params, value=None):
def set_param(
self,
params: Union[Dict, Iterable[Tuple[str, Any]], str],
value: Optional[str] = None
) -> None:
"""Set parameters into the Booster.
Parameters
@ -1966,14 +1984,14 @@ class Booster:
def inplace_predict(
self,
data: Any,
data: DataType,
iteration_range: Tuple[int, int] = (0, 0),
predict_type: str = "value",
missing: float = np.nan,
validate_features: bool = True,
base_margin: Any = None,
strict_shape: bool = False
):
) -> NumpyOrCupy:
"""Run prediction in-place, Unlike :py:meth:`predict` method, inplace prediction does not
cache the prediction result.
@ -2232,11 +2250,11 @@ class Booster:
raise TypeError('Unknown file type: ', fname)
if self.attr("best_iteration") is not None:
self.best_iteration = int(self.attr("best_iteration"))
self.best_iteration = int(self.attr("best_iteration")) # type: ignore
if self.attr("best_score") is not None:
self.best_score = float(self.attr("best_score"))
self.best_score = float(self.attr("best_score")) # type: ignore
if self.attr("best_ntree_limit") is not None:
self.best_ntree_limit = int(self.attr("best_ntree_limit"))
self.best_ntree_limit = int(self.attr("best_ntree_limit")) # type: ignore
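
Why these lines need # type: ignore, restated standalone: attr returns Optional[str], and mypy cannot carry the None check across a second call (attr here is a local stand-in for Booster.attr):

from typing import Optional

def attr(name: str) -> Optional[str]:  # stand-in for Booster.attr
    return {"best_iteration": "42"}.get(name)

if attr("best_iteration") is not None:
    # The second call returns Optional[str] again as far as mypy can
    # tell, hence the ignore on the int() conversion.
    best_iteration = int(attr("best_iteration"))  # type: ignore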
def num_boosted_rounds(self) -> int:
'''Get number of boosted rounds. For gblinear this is reset to 0 after
@ -2255,7 +2273,8 @@ class Booster:
_check_call(_LIB.XGBoosterGetNumFeature(self.handle, ctypes.byref(features)))
return features.value
def dump_model(self, fout, fmap='', with_stats=False, dump_format="text"):
def dump_model(self, fout: Union[str, os.PathLike], fmap: Union[str, os.PathLike] = '',
with_stats: bool = False, dump_format: str = "text") -> None:
"""Dump model into a text or JSON file. Unlike :py:meth:`save_model`, the
output format is primarily used for visualization or interpretation,
hence it's more human readable but cannot be loaded back to XGBoost.
@ -2274,24 +2293,25 @@ class Booster:
if isinstance(fout, (STRING_TYPES, os.PathLike)):
fout = os.fspath(os.path.expanduser(fout))
# pylint: disable=consider-using-with
fout = open(fout, 'w', encoding="utf-8")
fout_obj = open(fout, 'w', encoding="utf-8")
need_close = True
else:
fout_obj = fout
need_close = False
ret = self.get_dump(fmap, with_stats, dump_format)
if dump_format == 'json':
fout.write('[\n')
fout_obj.write('[\n')
for i, _ in enumerate(ret):
fout.write(ret[i])
fout_obj.write(ret[i])
if i < len(ret) - 1:
fout.write(",\n")
fout.write('\n]')
fout_obj.write(",\n")
fout_obj.write('\n]')
else:
for i, _ in enumerate(ret):
fout.write(f"booster[{i}]:\n")
fout.write(ret[i])
fout_obj.write(f"booster[{i}]:\n")
fout_obj.write(ret[i])
if need_close:
fout.close()
fout_obj.close()
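
Both dump formats the rewritten body handles, exercised through the path branch (fout_obj is opened and closed internally). A sketch, assuming an installed xgboost with this patch:

import numpy as np
import xgboost as xgb

dtrain = xgb.DMatrix(np.random.rand(10, 2), label=np.random.rand(10))
booster = xgb.train({"max_depth": 1}, dtrain, num_boost_round=1)
booster.dump_model("dump.txt")                       # text dump
booster.dump_model("dump.json", dump_format="json")  # JSON list of trees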
def get_dump(
self,
@ -2438,11 +2458,11 @@ class Booster:
tree_ids = []
node_ids = []
fids = []
splits = []
categories: List[Optional[float]] = []
y_directs = []
n_directs = []
missings = []
splits: List[Union[float, str]] = []
categories: List[Union[Optional[float], List[str]]] = []
y_directs: List[Union[float, str]] = []
n_directs: List[Union[float, str]] = []
missings: List[Union[float, str]] = []
gains = []
covers = []
@ -2483,9 +2503,9 @@ class Booster:
# categorical
parse = fid[0].split(":")
cats = parse[1][1:-1] # strip the {}
cats = cats.split(",")
cats_split = cats.split(",")
splits.append(float("NAN"))
categories.append(cats if cats else None)
categories.append(cats_split if cats_split else None)
else:
raise ValueError("Failed to parse model text dump.")
stats = re.split('=|,', fid[1])

python-package/xgboost/dask.py

@ -57,7 +57,7 @@ from .compat import lazy_isinstance
from .core import DMatrix, DeviceQuantileDMatrix, Booster, _expect, DataIter
from .core import Objective, Metric
from .core import _deprecate_positional_args, _has_categorical
from .data import FeatNamesT
from .data import FeatureNames
from .training import train as worker_train
from .tracker import RabitTracker, get_host_ip
from .sklearn import XGBModel, XGBClassifier, XGBRegressorBase, XGBClassifierBase
@ -326,7 +326,7 @@ class DaskDMatrix:
base_margin: Optional[_DaskCollection] = None,
missing: float = None,
silent: bool = False, # pylint: disable=unused-argument
feature_names: FeatNamesT = None,
feature_names: FeatureNames = None,
feature_types: Optional[List[str]] = None,
group: Optional[_DaskCollection] = None,
qid: Optional[_DaskCollection] = None,
@ -602,7 +602,7 @@ class DaskPartitionIter(DataIter): # pylint: disable=R0902
qid: Optional[List[Any]] = None,
label_lower_bound: Optional[List[Any]] = None,
label_upper_bound: Optional[List[Any]] = None,
feature_names: FeatNamesT = None,
feature_names: FeatureNames = None,
feature_types: Optional[Union[Any, List[Any]]] = None,
) -> None:
self._data = data
@ -645,7 +645,7 @@ class DaskPartitionIter(DataIter): # pylint: disable=R0902
if self._iter == len(self._data):
# Return 0 when there's no more batch.
return 0
feature_names: FeatNamesT = None
feature_names: FeatureNames = None
if self._feature_names:
feature_names = self._feature_names
else:
@ -696,7 +696,7 @@ class DaskDeviceQuantileDMatrix(DaskDMatrix):
base_margin: Optional[_DaskCollection] = None,
missing: float = None,
silent: bool = False, # disable=unused-argument
feature_names: FeatNamesT = None,
feature_names: FeatureNames = None,
feature_types: Optional[Union[Any, List[Any]]] = None,
max_bin: int = 256,
group: Optional[_DaskCollection] = None,
@ -733,7 +733,7 @@ class DaskDeviceQuantileDMatrix(DaskDMatrix):
def _create_device_quantile_dmatrix(
feature_names: FeatNamesT,
feature_names: FeatureNames,
feature_types: Optional[Union[Any, List[Any]]],
feature_weights: Optional[Any],
missing: float,
@ -774,7 +774,7 @@ def _create_device_quantile_dmatrix(
def _create_dmatrix(
feature_names: FeatNamesT,
feature_names: FeatureNames,
feature_types: Optional[Union[Any, List[Any]]],
feature_weights: Optional[Any],
missing: float,

python-package/xgboost/data.py

@ -12,7 +12,7 @@ import numpy as np
from .core import c_array, _LIB, _check_call, c_str
from .core import _cuda_array_interface
from .core import DataIter, _ProxyDMatrix, DMatrix, FeatNamesT
from .core import DataIter, _ProxyDMatrix, DMatrix, FeatureNames
from .compat import lazy_isinstance, DataFrame
c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name
@ -69,7 +69,7 @@ def _from_scipy_csr(
data,
missing,
nthread,
feature_names: FeatNamesT,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
):
"""Initialize data from a CSR matrix."""
@ -108,7 +108,7 @@ def _is_scipy_csc(data):
def _from_scipy_csc(
data,
missing,
feature_names: FeatNamesT,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
):
if len(data.indices) != len(data.data):
@ -164,7 +164,7 @@ def _from_numpy_array(
data,
missing,
nthread,
feature_names: FeatNamesT,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
):
"""Initialize data from a 2-D numpy matrix.
@ -245,11 +245,11 @@ be set to `True`.""" + err
def _transform_pandas_df(
data: DataFrame,
enable_categorical: bool,
feature_names: FeatNamesT = None,
feature_names: FeatureNames = None,
feature_types: Optional[List[str]] = None,
meta: Optional[str] = None,
meta_type: Optional[str] = None,
) -> Tuple[np.ndarray, FeatNamesT, Optional[List[str]]]:
) -> Tuple[np.ndarray, FeatureNames, Optional[List[str]]]:
import pandas as pd
from pandas.api.types import is_sparse, is_categorical_dtype
@ -313,9 +313,9 @@ def _from_pandas_df(
enable_categorical: bool,
missing: float,
nthread: int,
feature_names: FeatNamesT,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
) -> Tuple[ctypes.c_void_p, FeatNamesT, Optional[List[str]]]:
) -> Tuple[ctypes.c_void_p, FeatureNames, Optional[List[str]]]:
data, feature_names, feature_types = _transform_pandas_df(
data, enable_categorical, feature_names, feature_types
)
@ -355,7 +355,7 @@ def _from_pandas_series(
missing: float,
nthread: int,
enable_categorical: bool,
feature_names: FeatNamesT,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
):
from pandas.api.types import is_categorical_dtype
@ -386,7 +386,7 @@ _dt_type_mapper2 = {'bool': 'i', 'int': 'int', 'real': 'float'}
def _transform_dt_df(
data,
feature_names: FeatNamesT,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
meta=None,
meta_type=None,
@ -427,10 +427,10 @@ def _from_dt_df(
data,
missing,
nthread,
feature_names: FeatNamesT,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
enable_categorical: bool,
) -> Tuple[ctypes.c_void_p, FeatNamesT, Optional[List[str]]]:
) -> Tuple[ctypes.c_void_p, FeatureNames, Optional[List[str]]]:
if enable_categorical:
raise ValueError("categorical data in datatable is not supported yet.")
data, feature_names, feature_types = _transform_dt_df(
@ -594,7 +594,7 @@ def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
def _transform_cudf_df(
data,
feature_names: FeatNamesT,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
enable_categorical: bool,
):
@ -660,7 +660,7 @@ def _from_cudf_df(
data,
missing,
nthread,
feature_names: FeatNamesT,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
enable_categorical: bool,
) -> Tuple[ctypes.c_void_p, Any, Any]:
@ -710,7 +710,7 @@ def _from_cupy_array(
data,
missing,
nthread,
feature_names: FeatNamesT,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
):
"""Initialize DMatrix from cupy ndarray."""
@ -757,7 +757,7 @@ def _from_dlpack(
data,
missing,
nthread,
feature_names: FeatNamesT,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
):
data = _transform_dlpack(data)
@ -772,7 +772,7 @@ def _is_uri(data):
def _from_uri(
data,
missing,
feature_names: FeatNamesT,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
):
_warn_unused_missing(data, missing)
@ -792,7 +792,7 @@ def _from_list(
data,
missing,
n_threads,
feature_names: FeatNamesT,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
):
array = np.array(data)
@ -808,7 +808,7 @@ def _from_tuple(
data,
missing,
n_threads,
feature_names: FeatNamesT,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
):
return _from_list(data, missing, n_threads, feature_names, feature_types)
@ -844,7 +844,7 @@ def dispatch_data_backend(
data,
missing,
threads,
feature_names: FeatNamesT,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
enable_categorical: bool = False,
):
@ -1076,7 +1076,7 @@ class SingleBatchInternalIter(DataIter): # pylint: disable=R0902
def _proxy_transform(
data,
feature_names: FeatNamesT,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
enable_categorical: bool,
):

python-package/xgboost/sklearn.py

@ -14,6 +14,7 @@ from .core import Metric
from .training import train
from .callback import TrainingCallback
from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array
from ._typing import ArrayLike
# Do not use class names on scikit-learn directly. Re-define the classes on
# .compat to guarantee the behavior without scikit-learn
@ -25,8 +26,6 @@ from .compat import (
XGBoostLabelEncoder,
)
array_like = Any
class XGBRankerMixIn: # pylint: disable=too-few-public-methods
"""MixIn for ranking, defines the _estimator_type usually defined in scikit-learn base
@ -862,19 +861,19 @@ class XGBModel(XGBModelBase):
@_deprecate_positional_args
def fit(
self,
X: array_like,
y: array_like,
X: ArrayLike,
y: ArrayLike,
*,
sample_weight: Optional[array_like] = None,
base_margin: Optional[array_like] = None,
eval_set: Optional[Sequence[Tuple[array_like, array_like]]] = None,
sample_weight: Optional[ArrayLike] = None,
base_margin: Optional[ArrayLike] = None,
eval_set: Optional[Sequence[Tuple[ArrayLike, ArrayLike]]] = None,
eval_metric: Optional[Union[str, Sequence[str], Metric]] = None,
early_stopping_rounds: Optional[int] = None,
verbose: Optional[bool] = True,
xgb_model: Optional[Union[Booster, str, "XGBModel"]] = None,
sample_weight_eval_set: Optional[Sequence[array_like]] = None,
base_margin_eval_set: Optional[Sequence[array_like]] = None,
feature_weights: Optional[array_like] = None,
sample_weight_eval_set: Optional[Sequence[ArrayLike]] = None,
base_margin_eval_set: Optional[Sequence[ArrayLike]] = None,
feature_weights: Optional[ArrayLike] = None,
callbacks: Optional[Sequence[TrainingCallback]] = None
) -> "XGBModel":
# pylint: disable=invalid-name,attribute-defined-outside-init
@ -1001,11 +1000,11 @@ class XGBModel(XGBModelBase):
def predict(
self,
X: array_like,
X: ArrayLike,
output_margin: bool = False,
ntree_limit: Optional[int] = None,
validate_features: bool = True,
base_margin: Optional[array_like] = None,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
) -> np.ndarray:
"""Predict with `X`. If the model is trained with early stopping, then `best_iteration`
@ -1077,7 +1076,7 @@ class XGBModel(XGBModelBase):
)
def apply(
self, X: array_like,
self, X: ArrayLike,
ntree_limit: int = 0,
iteration_range: Optional[Tuple[int, int]] = None
) -> np.ndarray:
@ -1317,19 +1316,19 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
@_deprecate_positional_args
def fit(
self,
X: array_like,
y: array_like,
X: ArrayLike,
y: ArrayLike,
*,
sample_weight: Optional[array_like] = None,
base_margin: Optional[array_like] = None,
eval_set: Optional[Sequence[Tuple[array_like, array_like]]] = None,
sample_weight: Optional[ArrayLike] = None,
base_margin: Optional[ArrayLike] = None,
eval_set: Optional[Sequence[Tuple[ArrayLike, ArrayLike]]] = None,
eval_metric: Optional[Union[str, Sequence[str], Metric]] = None,
early_stopping_rounds: Optional[int] = None,
verbose: Optional[bool] = True,
xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
sample_weight_eval_set: Optional[Sequence[array_like]] = None,
base_margin_eval_set: Optional[Sequence[array_like]] = None,
feature_weights: Optional[array_like] = None,
sample_weight_eval_set: Optional[Sequence[ArrayLike]] = None,
base_margin_eval_set: Optional[Sequence[ArrayLike]] = None,
feature_weights: Optional[ArrayLike] = None,
callbacks: Optional[Sequence[TrainingCallback]] = None
) -> "XGBClassifier":
# pylint: disable = attribute-defined-outside-init,too-many-statements
@ -1425,11 +1424,11 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
def predict(
self,
X: array_like,
X: ArrayLike,
output_margin: bool = False,
ntree_limit: Optional[int] = None,
validate_features: bool = True,
base_margin: Optional[array_like] = None,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
) -> np.ndarray:
class_probs = super().predict(
@ -1464,10 +1463,10 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
def predict_proba(
self,
X: array_like,
X: ArrayLike,
ntree_limit: Optional[int] = None,
validate_features: bool = True,
base_margin: Optional[array_like] = None,
base_margin: Optional[ArrayLike] = None,
iteration_range: Optional[Tuple[int, int]] = None,
) -> np.ndarray:
""" Predict the probability of each `X` example being of a given class.
@ -1558,19 +1557,19 @@ class XGBRFClassifier(XGBClassifier):
@_deprecate_positional_args
def fit(
self,
X: array_like,
y: array_like,
X: ArrayLike,
y: ArrayLike,
*,
sample_weight: Optional[array_like] = None,
base_margin: Optional[array_like] = None,
eval_set: Optional[Sequence[Tuple[array_like, array_like]]] = None,
sample_weight: Optional[ArrayLike] = None,
base_margin: Optional[ArrayLike] = None,
eval_set: Optional[Sequence[Tuple[ArrayLike, ArrayLike]]] = None,
eval_metric: Optional[Union[str, Sequence[str], Metric]] = None,
early_stopping_rounds: Optional[int] = None,
verbose: Optional[bool] = True,
xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
sample_weight_eval_set: Optional[Sequence[array_like]] = None,
base_margin_eval_set: Optional[Sequence[array_like]] = None,
feature_weights: Optional[array_like] = None,
sample_weight_eval_set: Optional[Sequence[ArrayLike]] = None,
base_margin_eval_set: Optional[Sequence[ArrayLike]] = None,
feature_weights: Optional[ArrayLike] = None,
callbacks: Optional[Sequence[TrainingCallback]] = None
) -> "XGBRFClassifier":
args = {k: v for k, v in locals().items() if k not in ("self", "__class__")}
@ -1630,19 +1629,19 @@ class XGBRFRegressor(XGBRegressor):
@_deprecate_positional_args
def fit(
self,
X: array_like,
y: array_like,
X: ArrayLike,
y: ArrayLike,
*,
sample_weight: Optional[array_like] = None,
base_margin: Optional[array_like] = None,
eval_set: Optional[Sequence[Tuple[array_like, array_like]]] = None,
sample_weight: Optional[ArrayLike] = None,
base_margin: Optional[ArrayLike] = None,
eval_set: Optional[Sequence[Tuple[ArrayLike, ArrayLike]]] = None,
eval_metric: Optional[Union[str, Sequence[str], Metric]] = None,
early_stopping_rounds: Optional[int] = None,
verbose: Optional[bool] = True,
xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
sample_weight_eval_set: Optional[Sequence[array_like]] = None,
base_margin_eval_set: Optional[Sequence[array_like]] = None,
feature_weights: Optional[array_like] = None,
sample_weight_eval_set: Optional[Sequence[ArrayLike]] = None,
base_margin_eval_set: Optional[Sequence[ArrayLike]] = None,
feature_weights: Optional[ArrayLike] = None,
callbacks: Optional[Sequence[TrainingCallback]] = None
) -> "XGBRFRegressor":
args = {k: v for k, v in locals().items() if k not in ("self", "__class__")}
@ -1705,23 +1704,23 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
@_deprecate_positional_args
def fit(
self,
X: array_like,
y: array_like,
X: ArrayLike,
y: ArrayLike,
*,
group: Optional[array_like] = None,
qid: Optional[array_like] = None,
sample_weight: Optional[array_like] = None,
base_margin: Optional[array_like] = None,
eval_set: Optional[Sequence[Tuple[array_like, array_like]]] = None,
eval_group: Optional[Sequence[array_like]] = None,
eval_qid: Optional[Sequence[array_like]] = None,
group: Optional[ArrayLike] = None,
qid: Optional[ArrayLike] = None,
sample_weight: Optional[ArrayLike] = None,
base_margin: Optional[ArrayLike] = None,
eval_set: Optional[Sequence[Tuple[ArrayLike, ArrayLike]]] = None,
eval_group: Optional[Sequence[ArrayLike]] = None,
eval_qid: Optional[Sequence[ArrayLike]] = None,
eval_metric: Optional[Union[str, Sequence[str], Metric]] = None,
early_stopping_rounds: Optional[int] = None,
verbose: Optional[bool] = False,
xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
sample_weight_eval_set: Optional[Sequence[array_like]] = None,
base_margin_eval_set: Optional[Sequence[array_like]] = None,
feature_weights: Optional[array_like] = None,
sample_weight_eval_set: Optional[Sequence[ArrayLike]] = None,
base_margin_eval_set: Optional[Sequence[ArrayLike]] = None,
feature_weights: Optional[ArrayLike] = None,
callbacks: Optional[Sequence[TrainingCallback]] = None
) -> "XGBRanker":
# pylint: disable = attribute-defined-outside-init,arguments-differ
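
On the sklearn surface, the ArrayLike annotations above accept any numpy-compatible input, plain ndarrays being the simplest case. A sketch, assuming an installed xgboost with this patch:

import numpy as np
from xgboost import XGBClassifier

X = np.random.rand(20, 4)             # ArrayLike
y = np.random.randint(2, size=20)     # ArrayLike
clf = XGBClassifier(n_estimators=2)
clf.fit(X, y, eval_set=[(X, y)], verbose=False)
print(clf.predict(X[:3]))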