Typehint for Sklearn. (#6799)

Jiaming Yuan 2021-04-14 06:55:21 +08:00 committed by GitHub
parent 3d919db0c0
commit dee5ef2dfd
11 changed files with 335 additions and 262 deletions


@@ -243,7 +243,7 @@ jobs:
           architecture: 'x64'
       - name: Install Python packages
         run: |
-          python -m pip install wheel setuptools mypy dask[complete] distributed
+          python -m pip install wheel setuptools mypy pandas dask[complete] distributed
      - name: Run mypy
        run: |
          make mypy


@@ -91,8 +91,9 @@ endif
 # If any of the dask tests failed, contributor won't see the other error.
 mypy:
 	cd python-package; \
-	mypy ./xgboost/dask.py ../tests/python/test_with_dask.py --follow-imports=silent; \
-	mypy ../tests/python-gpu/test_gpu_with_dask.py --follow-imports=silent; \
+	mypy ./xgboost/dask.py && \
+	mypy ../tests/python-gpu/test_gpu_with_dask.py && \
+	mypy ./xgboost/sklearn.py || exit 1; \
 	mypy . || true ;

 clean:
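The `&&` chain makes each mypy invocation gate the next, `|| exit 1` fails the target if any gated check fails, and the final `mypy . || true` stays advisory. For running the same per-file checks without make, mypy also exposes a small programmatic API; a sketch, with an illustrative file list that is not part of this commit:

    from mypy import api

    # api.run returns (stdout, stderr, exit_status), mirroring the CLI.
    for target in ("xgboost/dask.py", "xgboost/sklearn.py"):
        stdout, stderr, status = api.run([target])
        if status != 0:
            print(stdout)
            raise SystemExit(status)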


@@ -3,4 +3,5 @@ description-file = README.rst

 [mypy]
 ignore_missing_imports = True
 disallow_untyped_defs = True
+follow_imports = silent
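For context: `disallow_untyped_defs = True` makes mypy reject any function definition without annotations, and the new `follow_imports = silent` type-checks imported modules while suppressing errors reported inside them. A minimal illustration of what the first option flags:

    def untyped(x):            # flagged: "Function is missing a type annotation"
        return x * 2

    def typed(x: int) -> int:  # accepted under disallow_untyped_defs
        return x * 2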


@@ -276,6 +276,9 @@ class TrainingCallback(ABC):

     .. versionadded:: 1.3.0
     '''
+
+    EvalsLog = Dict[str, Dict[str, Union[List[float], List[Tuple[float, float]]]]]
+
     def __init__(self):
         pass
@@ -287,13 +290,11 @@ class TrainingCallback(ABC):
         '''Run after training is finished.'''
         return model

-    def before_iteration(self, model, epoch: int,
-                         evals_log: 'CallbackContainer.EvalsLog') -> bool:
+    def before_iteration(self, model, epoch: int, evals_log: EvalsLog) -> bool:
         '''Run before each iteration. Return True when training should stop.'''
         return False

-    def after_iteration(self, model, epoch: int,
-                        evals_log: 'CallbackContainer.EvalsLog') -> bool:
+    def after_iteration(self, model, epoch: int, evals_log: EvalsLog) -> bool:
         '''Run after each iteration. Return True when training should stop.'''
         return False
@@ -351,7 +352,7 @@ class CallbackContainer:
     '''
-    EvalsLog = Dict[str, Dict[str, Union[List[float], List[Tuple[float, float]]]]]
+    EvalsLog = TrainingCallback.EvalsLog

     def __init__(self,
                  callbacks: List[TrainingCallback],
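With the alias hoisted onto TrainingCallback, user callbacks can annotate evals_log directly. A minimal sketch of a custom callback, assuming only the public xgboost.callback API; the dataset/metric names and threshold are illustrative, and metric values are assumed to be plain floats:

    import xgboost as xgb

    class ThresholdStop(xgb.callback.TrainingCallback):
        """Stop training once a watched metric drops below a threshold."""

        def __init__(self, data: str, metric: str, threshold: float) -> None:
            self.data = data
            self.metric = metric
            self.threshold = threshold
            super().__init__()

        def after_iteration(
            self, model, epoch: int,
            evals_log: xgb.callback.TrainingCallback.EvalsLog,
        ) -> bool:
            # evals_log maps dataset name -> metric name -> per-iteration history.
            history = evals_log.get(self.data, {}).get(self.metric, [])
            # Returning True stops training.
            return bool(history) and history[-1] < self.threshold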


@@ -1,6 +1,7 @@
 # coding: utf-8
 # pylint: disable= invalid-name, unused-import
 """For compatibility and optional dependencies."""
+from typing import Any
 import sys
 import types
 import importlib.util
@@ -36,7 +37,7 @@ except ImportError:
     MultiIndex = object
     Int64Index = object
-    DataFrame = object
+    DataFrame: Any = object
     Series = object
     pandas_concat = None
     PANDAS_INSTALLED = False
@@ -109,10 +110,12 @@ except pkg_resources.DistributionNotFound:
 try:
     import sparse
     import scipy.sparse as scipy_sparse
+    from scipy.sparse import csr_matrix as scipy_csr
     SCIPY_INSTALLED = True
 except ImportError:
     sparse = False
     scipy_sparse = False
+    scipy_csr: Any = object
     SCIPY_INSTALLED = False
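The explicit `: Any` annotations are what make the fallback assignments type-check: without them, mypy rejects rebinding an imported class to `object`. The guarded-import pattern in isolation, as a sketch using only the names from this hunk:

    from typing import Any

    try:
        from scipy.sparse import csr_matrix as scipy_csr
        SCIPY_INSTALLED = True
    except ImportError:
        # `Any` tells mypy this name deliberately has no single type.
        scipy_csr: Any = object
        SCIPY_INSTALLED = False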


@@ -96,7 +96,11 @@ def from_cstr_to_pystr(data, length) -> List[str]:
     return res

-def _convert_ntree_limit(booster, ntree_limit, iteration_range):
+def _convert_ntree_limit(
+    booster: "Booster",
+    ntree_limit: Optional[int],
+    iteration_range: Optional[Tuple[int, int]]
+) -> Optional[Tuple[int, int]]:
     if ntree_limit is not None and ntree_limit != 0:
         warnings.warn(
             "ntree_limit is deprecated, use `iteration_range` or model "
@@ -1234,7 +1238,7 @@ class Booster(object):
             params += [('eval_metric', eval_metric)]
         return params

-    def _transform_monotone_constrains(self, value: Union[dict, str]) -> str:
+    def _transform_monotone_constrains(self, value: Union[Dict[str, int], str]) -> str:
         if isinstance(value, str):
             return value
@@ -1246,7 +1250,7 @@ class Booster(object):
         return '(' + ','.join([str(value.get(feature_name, 0))
                                for feature_name in self.feature_names]) + ')'

-    def _transform_interaction_constraints(self, value: Union[list, str]) -> str:
+    def _transform_interaction_constraints(
+        self, value: Union[List[Tuple[str]], str]
+    ) -> str:
         if isinstance(value, str):
             return value
@@ -1447,7 +1453,7 @@ class Booster(object):
         attr_names = from_cstr_to_pystr(sarr, length)
         return {n: self.attr(n) for n in attr_names}

-    def set_attr(self, **kwargs):
+    def set_attr(self, **kwargs: Optional[str]) -> None:
         """Set the attribute of the Booster.

         Parameters
@@ -1971,7 +1977,7 @@ class Booster(object):
             "Data type:" + str(type(data)) + " not supported by inplace prediction."
         )

-    def save_model(self, fname):
+    def save_model(self, fname: Union[str, os.PathLike]):
         """Save the model to a file.

         The model is saved in an XGBoost internal format which is universal among the

@@ -1028,7 +1028,8 @@ async def _direct_predict_impl(  # pylint: disable=too-many-branches
         # Somehow dask fail to infer output shape change for 2-dim prediction, and
         # `chunks = (None, output_shape[1])` doesn't work due to None is not
         # supported in map_blocks.
-        chunks = list(data.chunks)
+        chunks: Optional[List[Tuple]] = list(data.chunks)
+        assert isinstance(chunks, list)
         chunks[1] = (output_shape[1], )
     else:
         chunks = None
@@ -1633,7 +1634,7 @@ class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
         )
         if callable(self.objective):
-            obj = _objective_decorator(self.objective)
+            obj: Optional[Callable] = _objective_decorator(self.objective)
         else:
             obj = None
         model, metric, params = self._configure_fit(
@@ -1734,7 +1735,7 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
             params["objective"] = "binary:logistic"
         if callable(self.objective):
-            obj = _objective_decorator(self.objective)
+            obj: Optional[Callable] = _objective_decorator(self.objective)
         else:
             obj = None
         model, metric, params = self._configure_fit(
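Annotating the first assignment, as both hunks do, is the standard way to make mypy unify branches that bind different types to the same name. The pattern in isolation, with a stand-in for the private _objective_decorator:

    from typing import Callable, Optional

    def wrap(fn: Callable) -> Callable:
        # Stand-in for xgboost's private _objective_decorator.
        return fn

    def pick_objective(objective: object) -> Optional[Callable]:
        if callable(objective):
            # Annotate here so mypy sees Optional[Callable] in both branches.
            obj: Optional[Callable] = wrap(objective)
        else:
            obj = None
        return obj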

File diff suppressed because it is too large.


@@ -269,7 +269,9 @@ class TestDistributedGPU:
     @pytest.mark.skipif(**tm.no_dask())
     @pytest.mark.skipif(**tm.no_dask_cuda())
     @pytest.mark.parametrize("model", ["boosting"])
-    def test_dask_classifier(self, model, local_cuda_cluster: LocalCUDACluster) -> None:
+    def test_dask_classifier(
+        self, model: str, local_cuda_cluster: LocalCUDACluster
+    ) -> None:
         import dask_cudf
         with Client(local_cuda_cluster) as client:
             X_, y_, w_ = generate_array(with_weights=True)


@@ -60,25 +60,25 @@ class TestInteractionConstraints:
     def test_interaction_constraints_feature_names(self):
         with pytest.raises(ValueError):
             constraints = [('feature_0', 'feature_1')]
             self.run_interaction_constraints(tree_method='exact',
                                              interaction_constraints=constraints)

         with pytest.raises(ValueError):
             constraints = [('feature_0', 'feature_3')]
             feature_names = ['feature_0', 'feature_1', 'feature_2']
             self.run_interaction_constraints(tree_method='exact',
                                              feature_names=feature_names,
                                              interaction_constraints=constraints)

         constraints = [('feature_0', 'feature_1')]
         feature_names = ['feature_0', 'feature_1', 'feature_2']
         self.run_interaction_constraints(tree_method='exact',
                                          feature_names=feature_names,
                                          interaction_constraints=constraints)

     @pytest.mark.skipif(**tm.no_sklearn())
     def training_accuracy(self, tree_method):
         """Test accuracy, reused by GPU tests."""
         from sklearn.metrics import accuracy_score
         dtrain = xgboost.DMatrix(dpath + 'agaricus.txt.train?indexing_mode=1')
         dtest = xgboost.DMatrix(dpath + 'agaricus.txt.test?indexing_mode=1')
@@ -101,11 +101,6 @@ class TestInteractionConstraints:
         pred_dtest = (bst.predict(dtest) < 0.5)
         assert accuracy_score(dtest.get_label(), pred_dtest) < 0.1

-    def test_hist_training_accuracy(self):
-        self.training_accuracy(tree_method='hist')
-
-    def test_exact_training_accuracy(self):
-        self.training_accuracy(tree_method='exact')
-
-    def test_approx_training_accuracy(self):
-        self.training_accuracy(tree_method='approx')
+    @pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"])
+    def test_hist_training_accuracy(self, tree_method):
+        self.training_accuracy(tree_method=tree_method)
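The parametrized replacement collapses three near-identical test methods into one; pytest expands it into a case per tree method. A standalone sketch of the mechanism:

    import pytest

    @pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"])
    def test_tree_method_accepted(tree_method: str) -> None:
        # pytest runs this once per value, reported e.g. as
        # test_tree_method_accepted[hist].
        assert tree_method in {"hist", "approx", "exact"}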


@@ -22,14 +22,14 @@ def is_correctly_constrained(learner, feature_names=None):
     for i in range(n):
         fixed_x = fixed_xs_values[i] * np.ones((n, 1))
         monotonically_increasing_x = np.column_stack((variable_x, fixed_x))
         monotonically_increasing_dset = xgb.DMatrix(monotonically_increasing_x,
                                                     feature_names=feature_names)
         monotonically_increasing_y = learner.predict(
             monotonically_increasing_dset
         )

         monotonically_decreasing_x = np.column_stack((fixed_x, variable_x))
         monotonically_decreasing_dset = xgb.DMatrix(monotonically_decreasing_x,
                                                     feature_names=feature_names)
         monotonically_decreasing_y = learner.predict(
             monotonically_decreasing_dset
         )
@@ -105,7 +105,7 @@ class TestMonotoneConstraints:
     @pytest.mark.parametrize('format', [dict, list])
     def test_monotone_constraints_feature_names(self, format):
         # next check monotonicity when initializing monotone_constraints by feature names
         params = {
             'tree_method': 'hist', 'verbosity': 1,
@@ -119,13 +119,13 @@ class TestMonotoneConstraints:
         with pytest.raises(ValueError):
             xgb.train(params, training_dset)

-        feature_names =[ 'feature_0', 'feature_2']
+        feature_names = ['feature_0', 'feature_2']
         training_dset_w_feature_names = xgb.DMatrix(x, label=y, feature_names=feature_names)
         with pytest.raises(ValueError):
             xgb.train(params, training_dset_w_feature_names)

-        feature_names =[ 'feature_0', 'feature_1']
+        feature_names = ['feature_0', 'feature_1']
         training_dset_w_feature_names = xgb.DMatrix(x, label=y, feature_names=feature_names)
         constrained_learner = xgb.train(
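These tests exercise the dict form that the newly typed _transform_monotone_constrains accepts as Dict[str, int]: feature name mapped to constraint direction. A small usage sketch; the data and parameters are illustrative:

    import numpy as np
    import xgboost as xgb

    rng = np.random.RandomState(0)
    X = rng.rand(100, 2)
    y = X[:, 0] - X[:, 1]
    dtrain = xgb.DMatrix(X, label=y, feature_names=['feature_0', 'feature_1'])
    params = {
        'tree_method': 'hist',
        # +1: non-decreasing in feature_0; -1: non-increasing in feature_1.
        'monotone_constraints': {'feature_0': 1, 'feature_1': -1},
    }
    booster = xgb.train(params, dtrain, num_boost_round=5)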