Typehint for Sklearn. (#6799)

Jiaming Yuan authored on 2021-04-14 06:55:21 +08:00, committed by GitHub
parent 3d919db0c0
commit dee5ef2dfd
11 changed files with 335 additions and 262 deletions


@@ -243,7 +243,7 @@ jobs:
       architecture: 'x64'
     - name: Install Python packages
       run: |
-        python -m pip install wheel setuptools mypy dask[complete] distributed
+        python -m pip install wheel setuptools mypy pandas dask[complete] distributed
     - name: Run mypy
       run: |
         make mypy


@@ -91,8 +91,9 @@ endif
 # If any of the dask tests failed, contributor won't see the other error.
 mypy:
 	cd python-package; \
-	mypy ./xgboost/dask.py ../tests/python/test_with_dask.py --follow-imports=silent; \
-	mypy ../tests/python-gpu/test_gpu_with_dask.py --follow-imports=silent; \
+	mypy ./xgboost/dask.py && \
+	mypy ../tests/python-gpu/test_gpu_with_dask.py && \
+	mypy ./xgboost/sklearn.py || exit 1; \
 	mypy . || true ;
 clean:


@@ -3,4 +3,5 @@ description-file = README.rst
 [mypy]
 ignore_missing_imports = True
 disallow_untyped_defs = True
+follow_imports = silent
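
With `follow_imports = silent`, mypy still follows imports and uses the imported modules' signatures, but suppresses errors reported inside modules that were not passed on the command line (replacing the per-invocation `--follow-imports=silent` flag dropped from the Makefile above). A minimal sketch of the effect, using hypothetical file names:

```python
# helper.py (hypothetical) -- errors inside this imported module are
# suppressed under follow_imports = silent, but its signatures still
# inform the check of files that import it.
def double(x: int) -> int:
    return x * 2
```

```python
# main.py (hypothetical) -- passed to mypy directly, so the bad call
# below is still flagged: double() was typed via the followed import.
from helper import double

double("3")  # error: Argument 1 to "double" has incompatible type "str"
```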


@@ -276,6 +276,9 @@ class TrainingCallback(ABC):
     .. versionadded:: 1.3.0
     '''
+
+    EvalsLog = Dict[str, Dict[str, Union[List[float], List[Tuple[float, float]]]]]
+
     def __init__(self):
         pass
@@ -287,13 +290,11 @@ class TrainingCallback(ABC):
         '''Run after training is finished.'''
         return model

-    def before_iteration(self, model, epoch: int,
-                         evals_log: 'CallbackContainer.EvalsLog') -> bool:
+    def before_iteration(self, model, epoch: int, evals_log: EvalsLog) -> bool:
         '''Run before each iteration. Return True when training should stop.'''
         return False

-    def after_iteration(self, model, epoch: int,
-                        evals_log: 'CallbackContainer.EvalsLog') -> bool:
+    def after_iteration(self, model, epoch: int, evals_log: EvalsLog) -> bool:
         '''Run after each iteration. Return True when training should stop.'''
         return False
@@ -351,7 +352,7 @@ class CallbackContainer:
     '''
-    EvalsLog = Dict[str, Dict[str, Union[List[float], List[Tuple[float, float]]]]]
+    EvalsLog = TrainingCallback.EvalsLog

     def __init__(self,
                  callbacks: List[TrainingCallback],
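
Hoisting `EvalsLog` onto `TrainingCallback` lets user-defined callbacks annotate their hooks without the string-quoted `'CallbackContainer.EvalsLog'` forward reference. A minimal sketch of a custom callback using the alias (the callback itself is hypothetical):

```python
import xgboost as xgb
from xgboost.callback import TrainingCallback


class StopBelowRMSE(TrainingCallback):
    """Hypothetical callback: stop once training RMSE drops below a threshold."""

    def __init__(self, threshold: float) -> None:
        super().__init__()
        self.threshold = threshold

    def after_iteration(
        self, model: xgb.Booster, epoch: int, evals_log: TrainingCallback.EvalsLog
    ) -> bool:
        # evals_log maps dataset name -> metric name -> recorded history;
        # "train" must match a name given in xgb.train(..., evals=...).
        history = evals_log.get("train", {}).get("rmse", [])
        return bool(history) and history[-1] < self.threshold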


@@ -1,6 +1,7 @@
 # coding: utf-8
 # pylint: disable= invalid-name, unused-import
 """For compatibility and optional dependencies."""
+from typing import Any
 import sys
 import types
 import importlib.util
@@ -36,7 +37,7 @@ except ImportError:
     MultiIndex = object
     Int64Index = object
-    DataFrame = object
+    DataFrame: Any = object
     Series = object
     pandas_concat = None
     PANDAS_INSTALLED = False
@@ -109,10 +110,12 @@ except pkg_resources.DistributionNotFound:
 try:
     import sparse
     import scipy.sparse as scipy_sparse
+    from scipy.sparse import csr_matrix as scipy_csr
     SCIPY_INSTALLED = True
 except ImportError:
     sparse = False
     scipy_sparse = False
+    scipy_csr: Any = object
     SCIPY_INSTALLED = False
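
The `: Any` annotations are what let the fallback assignments type-check: without them, mypy rejects rebinding a name imported as a class to a plain `object`. The same guarded-import idiom in isolation, mirroring the pattern in the diff and assuming pandas may be absent:

```python
from typing import Any

try:
    from pandas import DataFrame
    PANDAS_INSTALLED = True
except ImportError:
    # Placeholder so later isinstance()/attribute checks have a name to
    # reference; `Any` keeps mypy from flagging the rebinding.
    DataFrame: Any = object
    PANDAS_INSTALLED = False
```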


@@ -96,7 +96,11 @@ def from_cstr_to_pystr(data, length) -> List[str]:
     return res

-def _convert_ntree_limit(booster, ntree_limit, iteration_range):
+def _convert_ntree_limit(
+    booster: "Booster",
+    ntree_limit: Optional[int],
+    iteration_range: Optional[Tuple[int, int]]
+) -> Optional[Tuple[int, int]]:
     if ntree_limit is not None and ntree_limit != 0:
         warnings.warn(
             "ntree_limit is deprecated, use `iteration_range` or model "
@@ -1234,7 +1238,7 @@ class Booster(object):
             params += [('eval_metric', eval_metric)]
         return params

-    def _transform_monotone_constrains(self, value: Union[dict, str]) -> str:
+    def _transform_monotone_constrains(self, value: Union[Dict[str, int], str]) -> str:
         if isinstance(value, str):
             return value
@@ -1246,7 +1250,9 @@ class Booster(object):
         return '(' + ','.join([str(value.get(feature_name, 0))
                                for feature_name in self.feature_names]) + ')'

-    def _transform_interaction_constraints(self, value: Union[list, str]) -> str:
+    def _transform_interaction_constraints(
+        self, value: Union[List[Tuple[str]], str]
+    ) -> str:
         if isinstance(value, str):
             return value
@@ -1447,7 +1453,7 @@ class Booster(object):
         attr_names = from_cstr_to_pystr(sarr, length)
         return {n: self.attr(n) for n in attr_names}

-    def set_attr(self, **kwargs):
+    def set_attr(self, **kwargs: Optional[str]) -> None:
         """Set the attribute of the Booster.

         Parameters
@@ -1971,7 +1977,7 @@ class Booster(object):
             "Data type:" + str(type(data)) + " not supported by inplace prediction."
         )

-    def save_model(self, fname):
+    def save_model(self, fname: Union[str, os.PathLike]):
         """Save the model to a file.

         The model is saved in an XGBoost internal format which is universal among the
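
The new signatures document existing behaviour: `set_attr` takes string values (or `None` to delete a key) and `save_model` accepts any path-like object. A quick usage sketch, assuming random numpy data for illustration:

```python
import pathlib

import numpy as np
import xgboost as xgb

X, y = np.random.randn(100, 4), np.random.randn(100)
booster = xgb.train({"objective": "reg:squarederror"},
                    xgb.DMatrix(X, label=y), num_boost_round=5)

# Attributes are string-valued; passing None removes the key.
booster.set_attr(run_id="experiment-42")
assert booster.attr("run_id") == "experiment-42"
booster.set_attr(run_id=None)

# fname may be a str or an os.PathLike such as pathlib.Path.
booster.save_model(pathlib.Path("model.json"))
```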


@@ -1028,7 +1028,8 @@ async def _direct_predict_impl(  # pylint: disable=too-many-branches
         # Somehow dask fail to infer output shape change for 2-dim prediction, and
         # `chunks = (None, output_shape[1])` doesn't work due to None is not
         # supported in map_blocks.
-        chunks = list(data.chunks)
+        chunks: Optional[List[Tuple]] = list(data.chunks)
+        assert isinstance(chunks, list)
         chunks[1] = (output_shape[1], )
     else:
         chunks = None
@@ -1633,7 +1634,7 @@ class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
         )
         if callable(self.objective):
-            obj = _objective_decorator(self.objective)
+            obj: Optional[Callable] = _objective_decorator(self.objective)
         else:
             obj = None
         model, metric, params = self._configure_fit(
@@ -1734,7 +1735,7 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
             params["objective"] = "binary:logistic"
         if callable(self.objective):
-            obj = _objective_decorator(self.objective)
+            obj: Optional[Callable] = _objective_decorator(self.objective)
         else:
             obj = None
         model, metric, params = self._configure_fit(
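
Annotating `obj` at its first assignment is the standard way to make mypy unify both branches as `Optional[Callable]`; left unannotated, the first branch would infer the narrower callable type and the later `obj = None` would be rejected. The pattern in isolation (hypothetical helper, same shape as the diff):

```python
from typing import Callable, Optional


def resolve_objective(objective: object) -> Optional[Callable]:
    if callable(objective):
        # The annotation on the first assignment fixes the variable's
        # declared type, so `obj = None` below is accepted by mypy.
        obj: Optional[Callable] = objective
    else:
        obj = None
    return obj
```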

File diff suppressed because it is too large.


@@ -269,7 +269,9 @@ class TestDistributedGPU:
     @pytest.mark.skipif(**tm.no_dask())
     @pytest.mark.skipif(**tm.no_dask_cuda())
     @pytest.mark.parametrize("model", ["boosting"])
-    def test_dask_classifier(self, model, local_cuda_cluster: LocalCUDACluster) -> None:
+    def test_dask_classifier(
+        self, model: str, local_cuda_cluster: LocalCUDACluster
+    ) -> None:
         import dask_cudf

         with Client(local_cuda_cluster) as client:
             X_, y_, w_ = generate_array(with_weights=True)


@@ -60,25 +60,25 @@ class TestInteractionConstraints:
     def test_interaction_constraints_feature_names(self):
         with pytest.raises(ValueError):
             constraints = [('feature_0', 'feature_1')]
             self.run_interaction_constraints(tree_method='exact',
                                              interaction_constraints=constraints)

         with pytest.raises(ValueError):
             constraints = [('feature_0', 'feature_3')]
             feature_names = ['feature_0', 'feature_1', 'feature_2']
             self.run_interaction_constraints(tree_method='exact',
                                              feature_names=feature_names,
                                              interaction_constraints=constraints)

         constraints = [('feature_0', 'feature_1')]
         feature_names = ['feature_0', 'feature_1', 'feature_2']
         self.run_interaction_constraints(tree_method='exact',
                                          feature_names=feature_names,
                                          interaction_constraints=constraints)

     @pytest.mark.skipif(**tm.no_sklearn())
     def training_accuracy(self, tree_method):
+        """Test accuracy, reused by GPU tests."""
         from sklearn.metrics import accuracy_score
         dtrain = xgboost.DMatrix(dpath + 'agaricus.txt.train?indexing_mode=1')
         dtest = xgboost.DMatrix(dpath + 'agaricus.txt.test?indexing_mode=1')
@@ -101,11 +101,6 @@ class TestInteractionConstraints:
         pred_dtest = (bst.predict(dtest) < 0.5)
         assert accuracy_score(dtest.get_label(), pred_dtest) < 0.1

-    def test_hist_training_accuracy(self):
-        self.training_accuracy(tree_method='hist')
-
-    def test_exact_training_accuracy(self):
-        self.training_accuracy(tree_method='exact')
-
-    def test_approx_training_accuracy(self):
-        self.training_accuracy(tree_method='approx')
+    @pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"])
+    def test_hist_training_accuracy(self, tree_method):
+        self.training_accuracy(tree_method=tree_method)
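
For reference, the behaviour these tests exercise: interaction constraints can name features directly once the `DMatrix` carries `feature_names`, and names absent from the data raise `ValueError`. A hedged sketch with random data and parameters chosen only for illustration:

```python
import numpy as np
import xgboost as xgb

X, y = np.random.randn(200, 3), np.random.randn(200)
dtrain = xgb.DMatrix(
    X, label=y, feature_names=["feature_0", "feature_1", "feature_2"]
)

params = {
    "tree_method": "exact",
    # Tuples of feature names are translated to column indices internally
    # (see _transform_interaction_constraints in the core.py hunk above).
    "interaction_constraints": [("feature_0", "feature_1")],
}
booster = xgb.train(params, dtrain, num_boost_round=10)
```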


@@ -22,14 +22,14 @@ def is_correctly_constrained(learner, feature_names=None):
     for i in range(n):
         fixed_x = fixed_xs_values[i] * np.ones((n, 1))
         monotonically_increasing_x = np.column_stack((variable_x, fixed_x))
         monotonically_increasing_dset = xgb.DMatrix(monotonically_increasing_x,
                                                     feature_names=feature_names)
         monotonically_increasing_y = learner.predict(
             monotonically_increasing_dset
         )

         monotonically_decreasing_x = np.column_stack((fixed_x, variable_x))
         monotonically_decreasing_dset = xgb.DMatrix(monotonically_decreasing_x,
                                                     feature_names=feature_names)
         monotonically_decreasing_y = learner.predict(
             monotonically_decreasing_dset
@@ -105,7 +105,7 @@ class TestMonotoneConstraints:
     @pytest.mark.parametrize('format', [dict, list])
     def test_monotone_constraints_feature_names(self, format):
         # next check monotonicity when initializing monotone_constraints by feature names
         params = {
             'tree_method': 'hist', 'verbosity': 1,
@@ -119,13 +119,13 @@ class TestMonotoneConstraints:
         with pytest.raises(ValueError):
             xgb.train(params, training_dset)

-        feature_names =[ 'feature_0', 'feature_2']
+        feature_names = ['feature_0', 'feature_2']
         training_dset_w_feature_names = xgb.DMatrix(x, label=y, feature_names=feature_names)

         with pytest.raises(ValueError):
             xgb.train(params, training_dset_w_feature_names)

-        feature_names =[ 'feature_0', 'feature_1']
+        feature_names = ['feature_0', 'feature_1']
         training_dset_w_feature_names = xgb.DMatrix(x, label=y, feature_names=feature_names)

         constrained_learner = xgb.train(