Typehint for Sklearn. (#6799)

Jiaming Yuan 2021-04-14 06:55:21 +08:00 committed by GitHub
parent 3d919db0c0
commit dee5ef2dfd
11 changed files with 335 additions and 262 deletions


@@ -243,7 +243,7 @@ jobs:
           architecture: 'x64'
       - name: Install Python packages
         run: |
-          python -m pip install wheel setuptools mypy dask[complete] distributed
+          python -m pip install wheel setuptools mypy pandas dask[complete] distributed
      - name: Run mypy
        run: |
          make mypy


@@ -91,8 +91,9 @@ endif
 # If any of the dask tests failed, contributor won't see the other error.
 mypy:
 	cd python-package; \
-	mypy ./xgboost/dask.py ../tests/python/test_with_dask.py --follow-imports=silent; \
-	mypy ../tests/python-gpu/test_gpu_with_dask.py --follow-imports=silent; \
+	mypy ./xgboost/dask.py && \
+	mypy ../tests/python-gpu/test_gpu_with_dask.py && \
+	mypy ./xgboost/sklearn.py || exit 1; \
 	mypy . || true ;

 clean:
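The `&&` chain makes each mypy invocation gate the next, `|| exit 1` fails the target if any gated check fails, and the final `mypy . || true` stays advisory. For running the same per-file checks without make, mypy also exposes a small programmatic API; a sketch, with an illustrative file list that is not part of this commit:

    from mypy import api

    # api.run returns (stdout, stderr, exit_status), mirroring the CLI.
    for target in ("xgboost/dask.py", "xgboost/sklearn.py"):
        stdout, stderr, status = api.run([target])
        if status != 0:
            print(stdout)
            raise SystemExit(status)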


@@ -3,4 +3,5 @@ description-file = README.rst

 [mypy]
 ignore_missing_imports = True
 disallow_untyped_defs = True
+follow_imports = silent
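For context: `disallow_untyped_defs = True` makes mypy reject any function definition without annotations, and the new `follow_imports = silent` type-checks imported modules while suppressing errors reported inside them. A minimal illustration of what the first option flags:

    def untyped(x):            # flagged: "Function is missing a type annotation"
        return x * 2

    def typed(x: int) -> int:  # accepted under disallow_untyped_defs
        return x * 2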


@@ -276,6 +276,9 @@ class TrainingCallback(ABC):

     .. versionadded:: 1.3.0
     '''
+
+    EvalsLog = Dict[str, Dict[str, Union[List[float], List[Tuple[float, float]]]]]
+
     def __init__(self):
         pass
@@ -287,13 +290,11 @@ class TrainingCallback(ABC):
         '''Run after training is finished.'''
         return model

-    def before_iteration(self, model, epoch: int,
-                         evals_log: 'CallbackContainer.EvalsLog') -> bool:
+    def before_iteration(self, model, epoch: int, evals_log: EvalsLog) -> bool:
         '''Run before each iteration. Return True when training should stop.'''
         return False

-    def after_iteration(self, model, epoch: int,
-                        evals_log: 'CallbackContainer.EvalsLog') -> bool:
+    def after_iteration(self, model, epoch: int, evals_log: EvalsLog) -> bool:
         '''Run after each iteration. Return True when training should stop.'''
         return False
@@ -351,7 +352,7 @@ class CallbackContainer:
     '''
-    EvalsLog = Dict[str, Dict[str, Union[List[float], List[Tuple[float, float]]]]]
+    EvalsLog = TrainingCallback.EvalsLog

     def __init__(self,
                  callbacks: List[TrainingCallback],
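With the alias hoisted onto TrainingCallback, user callbacks can annotate evals_log directly. A minimal sketch of a custom callback, assuming only the public xgboost.callback API; the dataset/metric names and threshold are illustrative, and metric values are assumed to be plain floats:

    import xgboost as xgb

    class ThresholdStop(xgb.callback.TrainingCallback):
        """Stop training once a watched metric drops below a threshold."""

        def __init__(self, data: str, metric: str, threshold: float) -> None:
            self.data = data
            self.metric = metric
            self.threshold = threshold
            super().__init__()

        def after_iteration(
            self, model, epoch: int,
            evals_log: xgb.callback.TrainingCallback.EvalsLog,
        ) -> bool:
            # evals_log maps dataset name -> metric name -> per-iteration history.
            history = evals_log.get(self.data, {}).get(self.metric, [])
            # Returning True stops training.
            return bool(history) and history[-1] < self.threshold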


@@ -1,6 +1,7 @@
 # coding: utf-8
 # pylint: disable= invalid-name, unused-import
 """For compatibility and optional dependencies."""
+from typing import Any
 import sys
 import types
 import importlib.util
@@ -36,7 +37,7 @@ except ImportError:
     MultiIndex = object
     Int64Index = object
-    DataFrame = object
+    DataFrame: Any = object
     Series = object
     pandas_concat = None
     PANDAS_INSTALLED = False
@@ -109,10 +110,12 @@ except pkg_resources.DistributionNotFound:
 try:
     import sparse
     import scipy.sparse as scipy_sparse
+    from scipy.sparse import csr_matrix as scipy_csr
     SCIPY_INSTALLED = True
 except ImportError:
     sparse = False
     scipy_sparse = False
+    scipy_csr: Any = object
     SCIPY_INSTALLED = False
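The explicit `: Any` annotations are what make the fallback assignments type-check: without them, mypy rejects rebinding an imported class to `object`. The guarded-import pattern in isolation, as a sketch using only the names from this hunk:

    from typing import Any

    try:
        from scipy.sparse import csr_matrix as scipy_csr
        SCIPY_INSTALLED = True
    except ImportError:
        # `Any` tells mypy this name deliberately has no single type.
        scipy_csr: Any = object
        SCIPY_INSTALLED = False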


@@ -96,7 +96,11 @@ def from_cstr_to_pystr(data, length) -> List[str]:
     return res

-def _convert_ntree_limit(booster, ntree_limit, iteration_range):
+def _convert_ntree_limit(
+    booster: "Booster",
+    ntree_limit: Optional[int],
+    iteration_range: Optional[Tuple[int, int]]
+) -> Optional[Tuple[int, int]]:
     if ntree_limit is not None and ntree_limit != 0:
         warnings.warn(
             "ntree_limit is deprecated, use `iteration_range` or model "
@@ -1234,7 +1238,7 @@ class Booster(object):
             params += [('eval_metric', eval_metric)]
         return params

-    def _transform_monotone_constrains(self, value: Union[dict, str]) -> str:
+    def _transform_monotone_constrains(self, value: Union[Dict[str, int], str]) -> str:
         if isinstance(value, str):
             return value
@@ -1246,7 +1250,7 @@ class Booster(object):
         return '(' + ','.join([str(value.get(feature_name, 0))
                                for feature_name in self.feature_names]) + ')'

-    def _transform_interaction_constraints(self, value: Union[list, str]) -> str:
+    def _transform_interaction_constraints(
+        self, value: Union[List[Tuple[str]], str]
+    ) -> str:
         if isinstance(value, str):
             return value
@@ -1447,7 +1453,7 @@ class Booster(object):
         attr_names = from_cstr_to_pystr(sarr, length)
         return {n: self.attr(n) for n in attr_names}

-    def set_attr(self, **kwargs):
+    def set_attr(self, **kwargs: Optional[str]) -> None:
         """Set the attribute of the Booster.

         Parameters
@@ -1971,7 +1977,7 @@ class Booster(object):
             "Data type:" + str(type(data)) + " not supported by inplace prediction."
         )

-    def save_model(self, fname):
+    def save_model(self, fname: Union[str, os.PathLike]):
         """Save the model to a file.

         The model is saved in an XGBoost internal format which is universal among the

@@ -1028,7 +1028,8 @@ async def _direct_predict_impl(  # pylint: disable=too-many-branches
         # Somehow dask fail to infer output shape change for 2-dim prediction, and
         # `chunks = (None, output_shape[1])` doesn't work due to None is not
         # supported in map_blocks.
-        chunks = list(data.chunks)
+        chunks: Optional[List[Tuple]] = list(data.chunks)
+        assert isinstance(chunks, list)
         chunks[1] = (output_shape[1], )
     else:
         chunks = None
@@ -1633,7 +1634,7 @@ class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
         )
         if callable(self.objective):
-            obj = _objective_decorator(self.objective)
+            obj: Optional[Callable] = _objective_decorator(self.objective)
         else:
             obj = None
         model, metric, params = self._configure_fit(
@@ -1734,7 +1735,7 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
             params["objective"] = "binary:logistic"
         if callable(self.objective):
-            obj = _objective_decorator(self.objective)
+            obj: Optional[Callable] = _objective_decorator(self.objective)
         else:
             obj = None
         model, metric, params = self._configure_fit(
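Annotating the first assignment, as both hunks do, is the standard way to make mypy unify branches that bind different types to the same name. The pattern in isolation, with a stand-in for the private _objective_decorator:

    from typing import Callable, Optional

    def wrap(fn: Callable) -> Callable:
        # Stand-in for xgboost's private _objective_decorator.
        return fn

    def pick_objective(objective: object) -> Optional[Callable]:
        if callable(objective):
            # Annotate here so mypy sees Optional[Callable] in both branches.
            obj: Optional[Callable] = wrap(objective)
        else:
            obj = None
        return obj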

File diff suppressed because it is too large.


@@ -269,7 +269,9 @@ class TestDistributedGPU:
     @pytest.mark.skipif(**tm.no_dask())
     @pytest.mark.skipif(**tm.no_dask_cuda())
     @pytest.mark.parametrize("model", ["boosting"])
-    def test_dask_classifier(self, model, local_cuda_cluster: LocalCUDACluster) -> None:
+    def test_dask_classifier(
+        self, model: str, local_cuda_cluster: LocalCUDACluster
+    ) -> None:
         import dask_cudf
         with Client(local_cuda_cluster) as client:
             X_, y_, w_ = generate_array(with_weights=True)


@@ -60,25 +60,25 @@ class TestInteractionConstraints:
     def test_interaction_constraints_feature_names(self):
         with pytest.raises(ValueError):
             constraints = [('feature_0', 'feature_1')]
             self.run_interaction_constraints(tree_method='exact',
                                              interaction_constraints=constraints)

         with pytest.raises(ValueError):
             constraints = [('feature_0', 'feature_3')]
             feature_names = ['feature_0', 'feature_1', 'feature_2']
             self.run_interaction_constraints(tree_method='exact',
                                              feature_names=feature_names,
                                              interaction_constraints=constraints)

         constraints = [('feature_0', 'feature_1')]
         feature_names = ['feature_0', 'feature_1', 'feature_2']
         self.run_interaction_constraints(tree_method='exact',
                                          feature_names=feature_names,
                                          interaction_constraints=constraints)

     @pytest.mark.skipif(**tm.no_sklearn())
     def training_accuracy(self, tree_method):
         """Test accuracy, reused by GPU tests."""
         from sklearn.metrics import accuracy_score
         dtrain = xgboost.DMatrix(dpath + 'agaricus.txt.train?indexing_mode=1')
         dtest = xgboost.DMatrix(dpath + 'agaricus.txt.test?indexing_mode=1')
@@ -101,11 +101,6 @@ class TestInteractionConstraints:
         pred_dtest = (bst.predict(dtest) < 0.5)
         assert accuracy_score(dtest.get_label(), pred_dtest) < 0.1

-    def test_hist_training_accuracy(self):
-        self.training_accuracy(tree_method='hist')
-
-    def test_exact_training_accuracy(self):
-        self.training_accuracy(tree_method='exact')
-
-    def test_approx_training_accuracy(self):
-        self.training_accuracy(tree_method='approx')
+    @pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"])
+    def test_hist_training_accuracy(self, tree_method):
+        self.training_accuracy(tree_method=tree_method)
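The parametrized replacement collapses three near-identical test methods into one; pytest expands it into a case per tree method. A standalone sketch of the mechanism:

    import pytest

    @pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"])
    def test_tree_method_accepted(tree_method: str) -> None:
        # pytest runs this once per value, reported e.g. as
        # test_tree_method_accepted[hist].
        assert tree_method in {"hist", "approx", "exact"}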


@@ -22,14 +22,14 @@ def is_correctly_constrained(learner, feature_names=None):
     for i in range(n):
         fixed_x = fixed_xs_values[i] * np.ones((n, 1))
         monotonically_increasing_x = np.column_stack((variable_x, fixed_x))
         monotonically_increasing_dset = xgb.DMatrix(monotonically_increasing_x,
                                                     feature_names=feature_names)
         monotonically_increasing_y = learner.predict(
             monotonically_increasing_dset
         )

         monotonically_decreasing_x = np.column_stack((fixed_x, variable_x))
         monotonically_decreasing_dset = xgb.DMatrix(monotonically_decreasing_x,
                                                     feature_names=feature_names)
         monotonically_decreasing_y = learner.predict(
             monotonically_decreasing_dset
         )
@@ -105,7 +105,7 @@ class TestMonotoneConstraints:
     @pytest.mark.parametrize('format', [dict, list])
     def test_monotone_constraints_feature_names(self, format):
         # next check monotonicity when initializing monotone_constraints by feature names
         params = {
             'tree_method': 'hist', 'verbosity': 1,
@@ -119,13 +119,13 @@ class TestMonotoneConstraints:
         with pytest.raises(ValueError):
             xgb.train(params, training_dset)

-        feature_names =[ 'feature_0', 'feature_2']
+        feature_names = ['feature_0', 'feature_2']
         training_dset_w_feature_names = xgb.DMatrix(x, label=y, feature_names=feature_names)
         with pytest.raises(ValueError):
             xgb.train(params, training_dset_w_feature_names)

-        feature_names =[ 'feature_0', 'feature_1']
+        feature_names = ['feature_0', 'feature_1']
         training_dset_w_feature_names = xgb.DMatrix(x, label=y, feature_names=feature_names)
         constrained_learner = xgb.train(
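These tests exercise the dict form that the newly typed _transform_monotone_constrains accepts as Dict[str, int]: feature name mapped to constraint direction. A small usage sketch; the data and parameters are illustrative:

    import numpy as np
    import xgboost as xgb

    rng = np.random.RandomState(0)
    X = rng.rand(100, 2)
    y = X[:, 0] - X[:, 1]
    dtrain = xgb.DMatrix(X, label=y, feature_names=['feature_0', 'feature_1'])
    params = {
        'tree_method': 'hist',
        # +1: non-decreasing in feature_0; -1: non-increasing in feature_1.
        'monotone_constraints': {'feature_0': 1, 'feature_1': -1},
    }
    booster = xgb.train(params, dtrain, num_boost_round=5)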