[breaking] Bump Python requirement to 3.10. (#10434)

- Bump the Python requirement to 3.10.
- Fix type hints.
- Use loky to avoid a deadlock.
- Work around a cupy-numpy compatibility issue on Windows caused by the `safe` casting rule.
- Simplify the repartitioning logic to avoid Dask errors.
Jiaming Yuan, 2024-07-30 17:31:06 +08:00, committed by GitHub
parent 757aafc131
commit 827d0e8edb
33 changed files with 284 additions and 286 deletions

----------------------------------------

@@ -14,7 +14,7 @@ authors = [
     { name = "Jiaming Yuan", email = "jm.yuan@outlook.com" }
 ]
 version = "2.2.0-dev"
-requires-python = ">=3.8"
+requires-python = ">=3.10"
 license = { text = "Apache-2.0" }
 classifiers = [
     "License :: OSI Approved :: Apache Software License",
@@ -22,8 +22,6 @@ classifiers = [
     "Operating System :: OS Independent",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.8",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",

----------------------------------------

@@ -14,6 +14,7 @@ from collections.abc import Mapping
 from enum import IntEnum, unique
 from functools import wraps
 from inspect import Parameter, signature
+from types import EllipsisType
 from typing import (
     Any,
     Callable,
@@ -1826,7 +1827,7 @@ class Booster:
         state["handle"] = handle
         self.__dict__.update(state)

-    def __getitem__(self, val: Union[Integer, tuple, slice]) -> "Booster":
+    def __getitem__(self, val: Union[Integer, tuple, slice, EllipsisType]) -> "Booster":
         """Get a slice of the tree-based model.

         .. versionadded:: 1.3.0
@@ -1835,21 +1836,20 @@
         # convert to slice for all other types
         if isinstance(val, (np.integer, int)):
             val = slice(int(val), int(val + 1))
-        if isinstance(val, type(Ellipsis)):
+        if isinstance(val, EllipsisType):
             val = slice(0, 0)
         if isinstance(val, tuple):
             raise ValueError("Only supports slicing through 1 dimension.")
         # All supported types are now slice
-        # FIXME(jiamingy): Use `types.EllipsisType` once Python 3.10 is used.
         if not isinstance(val, slice):
-            msg = _expect((int, slice, np.integer, type(Ellipsis)), type(val))
+            msg = _expect((int, slice, np.integer, EllipsisType), type(val))
             raise TypeError(msg)
-        if isinstance(val.start, type(Ellipsis)) or val.start is None:
+        if isinstance(val.start, EllipsisType) or val.start is None:
             start = 0
         else:
             start = val.start
-        if isinstance(val.stop, type(Ellipsis)) or val.stop is None:
+        if isinstance(val.stop, EllipsisType) or val.stop is None:
             stop = 0
         else:
             stop = val.stop
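
With the annotation widened to include `EllipsisType`, tree slicing accepts `...` directly, as it already did at runtime. A quick usage sketch (the training setup here is illustrative, not from the commit):

    import numpy as np
    import xgboost as xgb

    X, y = np.random.rand(128, 4), np.random.rand(128)
    booster = xgb.train({}, xgb.DMatrix(X, label=y), num_boost_round=8)

    first = booster[:4]  # trees [0, 4)
    one = booster[3]     # a single tree, returned as a Booster
    full = booster[...]  # Ellipsis selects the whole model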

----------------------------------------

@@ -292,7 +292,7 @@ class DaskDMatrix:
     @_deprecate_positional_args
     def __init__(
         self,
-        client: "distributed.Client",
+        client: Optional["distributed.Client"],
         data: _DataT,
         label: Optional[_DaskCollection] = None,
         *,
@@ -663,7 +663,7 @@ class DaskQuantileDMatrix(DaskDMatrix):
     @_deprecate_positional_args
     def __init__(
         self,
-        client: "distributed.Client",
+        client: Optional["distributed.Client"],
         data: _DataT,
         label: Optional[_DaskCollection] = None,
         *,
@@ -674,7 +674,7 @@ class DaskQuantileDMatrix(DaskDMatrix):
         feature_names: Optional[FeatureNames] = None,
         feature_types: Optional[Union[Any, List[Any]]] = None,
         max_bin: Optional[int] = None,
-        ref: Optional[DMatrix] = None,
+        ref: Optional[DaskDMatrix] = None,
         group: Optional[_DaskCollection] = None,
         qid: Optional[_DaskCollection] = None,
         label_lower_bound: Optional[_DaskCollection] = None,
@@ -1832,8 +1832,8 @@ class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
         sample_weight: Optional[_DaskCollection] = None,
         base_margin: Optional[_DaskCollection] = None,
         eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
-        verbose: Union[int, bool] = True,
-        xgb_model: Optional[Union[Booster, XGBModel]] = None,
+        verbose: Optional[Union[int, bool]] = True,
+        xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
         sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
         base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
         feature_weights: Optional[_DaskCollection] = None,
@@ -1940,8 +1940,8 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
         sample_weight: Optional[_DaskCollection] = None,
         base_margin: Optional[_DaskCollection] = None,
         eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
-        verbose: Union[int, bool] = True,
-        xgb_model: Optional[Union[Booster, XGBModel]] = None,
+        verbose: Optional[Union[int, bool]] = True,
+        xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
         sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
         base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
         feature_weights: Optional[_DaskCollection] = None,
@@ -2122,8 +2122,8 @@ class DaskXGBRanker(DaskScikitLearnBase, XGBRankerMixIn):
         eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
         eval_group: Optional[Sequence[_DaskCollection]] = None,
         eval_qid: Optional[Sequence[_DaskCollection]] = None,
-        verbose: Union[int, bool] = False,
-        xgb_model: Optional[Union[XGBModel, Booster]] = None,
+        verbose: Optional[Union[int, bool]] = False,
+        xgb_model: Optional[Union[XGBModel, str, Booster]] = None,
         sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
         base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
         feature_weights: Optional[_DaskCollection] = None,
@@ -2185,8 +2185,8 @@ class DaskXGBRFRegressor(DaskXGBRegressor):
         sample_weight: Optional[_DaskCollection] = None,
         base_margin: Optional[_DaskCollection] = None,
         eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
-        verbose: Union[int, bool] = True,
-        xgb_model: Optional[Union[Booster, XGBModel]] = None,
+        verbose: Optional[Union[int, bool]] = True,
+        xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
         sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
         base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
         feature_weights: Optional[_DaskCollection] = None,
@@ -2246,8 +2246,8 @@ class DaskXGBRFClassifier(DaskXGBClassifier):
         sample_weight: Optional[_DaskCollection] = None,
         base_margin: Optional[_DaskCollection] = None,
         eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
-        verbose: Union[int, bool] = True,
-        xgb_model: Optional[Union[Booster, XGBModel]] = None,
+        verbose: Optional[Union[int, bool]] = True,
+        xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
         sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
         base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
         feature_weights: Optional[_DaskCollection] = None,
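
Two of these hints document behavior the code already had: `client` falls back to the default Dask client when `None`, and `DaskQuantileDMatrix.ref` expects another Dask matrix rather than a plain `DMatrix`. A hedged sketch of both (the cluster setup and data are illustrative, not from the diff):

    from dask import array as da
    from distributed import Client, LocalCluster
    from xgboost import dask as dxgb

    with LocalCluster(n_workers=2) as cluster, Client(cluster) as client:
        X = da.random.random((1000, 8), chunks=(100, 8))
        y = da.random.random(1000, chunks=100)

        # client=None now type-checks; the default client is picked up.
        Xy = dxgb.DaskQuantileDMatrix(None, X, y)
        # ref must be another Dask matrix, matching the tightened hint.
        Xy_valid = dxgb.DaskQuantileDMatrix(None, X, y, ref=Xy)

        dxgb.train(None, {"tree_method": "hist"}, Xy, evals=[(Xy_valid, "valid")])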

----------------------------------------

@@ -5,7 +5,17 @@ import ctypes
 import json
 import os
 import warnings
-from typing import Any, Callable, List, Optional, Sequence, Tuple, cast
+from typing import (
+    Any,
+    Callable,
+    List,
+    Optional,
+    Sequence,
+    Tuple,
+    TypeGuard,
+    Union,
+    cast,
+)

 import numpy as np
@@ -212,7 +222,7 @@ def is_scipy_coo(data: DataType) -> bool:
     return is_array or is_matrix


-def _is_np_array_like(data: DataType) -> bool:
+def _is_np_array_like(data: DataType) -> TypeGuard[np.ndarray]:
     return hasattr(data, "__array_interface__")
@@ -241,7 +251,7 @@ def _maybe_np_slice(data: DataType, dtype: Optional[NumpyDType]) -> np.ndarray:
 def _from_numpy_array(
-    data: DataType,
+    data: np.ndarray,
     missing: FloatCompatible,
     nthread: int,
     feature_names: Optional[FeatureNames],
@@ -266,7 +276,7 @@
     return handle, feature_names, feature_types


-def _is_pandas_df(data: DataType) -> bool:
+def _is_pandas_df(data: DataType) -> TypeGuard[DataFrame]:
     try:
         import pandas as pd
     except ImportError:
@@ -1057,12 +1067,12 @@ def _from_dlpack(
     return _from_cupy_array(data, missing, nthread, feature_names, feature_types)


-def _is_uri(data: DataType) -> bool:
+def _is_uri(data: DataType) -> TypeGuard[Union[str, os.PathLike]]:
     return isinstance(data, (str, os.PathLike))


 def _from_uri(
-    data: DataType,
+    data: Union[str, os.PathLike],
     missing: Optional[FloatCompatible],
     feature_names: Optional[FeatureNames],
     feature_types: Optional[FeatureTypes],
@@ -1080,7 +1090,7 @@
     return handle, feature_names, feature_types


-def _is_list(data: DataType) -> bool:
+def _is_list(data: DataType) -> TypeGuard[list]:
     return isinstance(data, list)
@@ -1099,7 +1109,7 @@
     )


-def _is_tuple(data: DataType) -> bool:
+def _is_tuple(data: DataType) -> TypeGuard[tuple]:
     return isinstance(data, tuple)
@@ -1116,7 +1126,7 @@
     )


-def _is_iter(data: DataType) -> bool:
+def _is_iter(data: DataType) -> TypeGuard[DataIter]:
     return isinstance(data, DataIter)
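
`TypeGuard` landed in `typing` in Python 3.10, which is why the bump lets these predicates import it directly. Returning `TypeGuard[T]` instead of `bool` lets static checkers narrow the argument after a positive check, which in turn allows `_from_numpy_array` and `_from_uri` to take concrete types instead of `DataType`. A self-contained sketch of the pattern with illustrative names:

    from typing import Any, TypeGuard

    import numpy as np

    def is_ndarray(data: Any) -> TypeGuard[np.ndarray]:
        # When this returns True, checkers treat `data` as np.ndarray at the
        # call site, so no cast() is needed in the dispatcher below.
        return isinstance(data, np.ndarray)

    def dispatch(data: Any) -> int:
        if is_ndarray(data):
            return data.ndim  # `data` is already narrowed to np.ndarray
        raise TypeError(f"Unsupported type: {type(data)}")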

----------------------------------------

@@ -6,14 +6,12 @@ change without notice.
 # pylint: disable=invalid-name,missing-function-docstring,import-error
 import gc
 import importlib.util
-import multiprocessing
 import os
 import platform
 import queue
 import socket
 import sys
 import threading
-from concurrent.futures import ThreadPoolExecutor
 from contextlib import contextmanager
 from io import StringIO
 from platform import system
@@ -46,6 +44,7 @@ from xgboost.testing.data import (
     get_digits,
     get_sparse,
     make_batches,
+    make_sparse_regression,
     memory,
 )
@@ -115,6 +114,10 @@ def no_dask() -> PytestSkip:
     return no_mod("dask")


+def no_loky() -> PytestSkip:
+    return no_mod("loky")
+
+
 def no_dask_ml() -> PytestSkip:
     if sys.platform.startswith("win"):
         return {"reason": "Unsupported platform.", "condition": True}
@@ -136,7 +139,14 @@ def no_arrow() -> PytestSkip:
 def no_modin() -> PytestSkip:
-    return no_mod("modin")
+    try:
+        import modin.pandas as md
+
+        md.DataFrame([[1, 2.0, True], [2, 3.0, False]], columns=["a", "b", "c"])
+    except ImportError:
+        return {"reason": "Failed import modin.", "condition": True}
+    return {"reason": "Failed import modin.", "condition": False}


 def no_dt() -> PytestSkip:
@@ -487,94 +497,6 @@ def _cat_sampled_from() -> strategies.SearchStrategy:
 categorical_dataset_strategy: strategies.SearchStrategy = _cat_sampled_from()


-# pylint: disable=too-many-locals
-@memory.cache
-def make_sparse_regression(
-    n_samples: int, n_features: int, sparsity: float, as_dense: bool
-) -> Tuple[Union[sparse.csr_matrix, np.ndarray], np.ndarray]:
-    """Make sparse matrix.
-
-    Parameters
-    ----------
-    as_dense:
-        Return the matrix as np.ndarray with missing values filled by NaN
-
-    """
-    if not hasattr(np.random, "default_rng"):
-        rng = np.random.RandomState(1994)
-        X = sparse.random(
-            m=n_samples,
-            n=n_features,
-            density=1.0 - sparsity,
-            random_state=rng,
-            format="csr",
-        )
-        y = rng.normal(loc=0.0, scale=1.0, size=n_samples)
-        return X, y
-
-    # Use multi-thread to speed up the generation, convenient if you use this function
-    # for benchmarking.
-    n_threads = min(multiprocessing.cpu_count(), n_features)
-
-    def random_csc(t_id: int) -> sparse.csc_matrix:
-        rng = np.random.default_rng(1994 * t_id)
-        thread_size = n_features // n_threads
-        if t_id == n_threads - 1:
-            n_features_tloc = n_features - t_id * thread_size
-        else:
-            n_features_tloc = thread_size
-
-        X = sparse.random(
-            m=n_samples,
-            n=n_features_tloc,
-            density=1.0 - sparsity,
-            random_state=rng,
-        ).tocsc()
-        y = np.zeros((n_samples, 1))
-
-        for i in range(X.shape[1]):
-            size = X.indptr[i + 1] - X.indptr[i]
-            if size != 0:
-                y += X[:, i].toarray() * rng.random((n_samples, 1)) * 0.2
-
-        return X, y
-
-    futures = []
-    with ThreadPoolExecutor(max_workers=n_threads) as executor:
-        for i in range(n_threads):
-            futures.append(executor.submit(random_csc, i))
-
-        X_results = []
-        y_results = []
-        for f in futures:
-            X, y = f.result()
-            X_results.append(X)
-            y_results.append(y)
-
-    assert len(y_results) == n_threads
-
-    csr: sparse.csr_matrix = sparse.hstack(X_results, format="csr")
-    y = np.asarray(y_results)
-    y = y.reshape((y.shape[0], y.shape[1])).T
-    y = np.sum(y, axis=1)
-
-    assert csr.shape[0] == n_samples
-    assert csr.shape[1] == n_features
-    assert y.shape[0] == n_samples
-
-    if as_dense:
-        arr = csr.toarray()
-        assert arr.shape[0] == n_samples
-        assert arr.shape[1] == n_features
-        arr[arr == 0] = np.nan
-        return arr, y
-
-    return csr, y
-
-
 sparse_datasets_strategy = strategies.sampled_from(
     [
         TestDataset(
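
The new `no_loky` skip hook supports the switch away from raw `multiprocessing` mentioned in the commit message. For readers unfamiliar with loky: its reusable executor spawns fresh worker processes rather than forking, which sidesteps fork-after-thread deadlocks. A minimal sketch of its API (assuming loky is installed; not taken from the test suite):

    from loky import get_reusable_executor

    def square(x: int) -> int:
        return x * x

    if __name__ == "__main__":
        # The executor is cached and reused across calls.
        executor = get_reusable_executor(max_workers=2)
        print(list(executor.map(square, range(8))))  # [0, 1, 4, 9, ...]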

----------------------------------------

@@ -1,7 +1,9 @@
 # pylint: disable=invalid-name
 """Utilities for data generation."""
+import multiprocessing
 import os
 import zipfile
+from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
 from typing import (
     TYPE_CHECKING,
@@ -523,7 +525,7 @@ def make_batches(  # pylint: disable=too-many-arguments,too-many-locals
     if use_cupy:
         import cupy  # pylint: disable=import-error

-        rng = cupy.random.RandomState(random_state)
+        rng = cupy.random.RandomState(np.uint64(random_state))
     else:
         rng = np.random.RandomState(random_state)
     for i in range(n_batches):
@@ -843,3 +845,90 @@ def run_base_margin_info(
     base_margin = X.reshape(2, 5, 2, 5)
     with pytest.raises(ValueError, match=r".*base_margin.*"):
         Xy.set_base_margin(base_margin)
+
+
+# pylint: disable=too-many-locals
+@memory.cache
+def make_sparse_regression(
+    n_samples: int, n_features: int, sparsity: float, as_dense: bool
+) -> Tuple[Union[sparse.csr_matrix, np.ndarray], np.ndarray]:
+    """Make sparse matrix.
+
+    Parameters
+    ----------
+    as_dense:
+        Return the matrix as np.ndarray with missing values filled by NaN
+
+    """
+    if not hasattr(np.random, "default_rng"):
+        rng = np.random.RandomState(1994)
+        X = sparse.random(
+            m=n_samples,
+            n=n_features,
+            density=1.0 - sparsity,
+            random_state=rng,
+            format="csr",
+        )
+        y = rng.normal(loc=0.0, scale=1.0, size=n_samples)
+        return X, y
+
+    # Use multi-thread to speed up the generation, convenient if you use this function
+    # for benchmarking.
+    n_threads = min(multiprocessing.cpu_count(), n_features)
+
+    def random_csc(t_id: int) -> sparse.csc_matrix:
+        rng = np.random.default_rng(1994 * t_id)
+        thread_size = n_features // n_threads
+        if t_id == n_threads - 1:
+            n_features_tloc = n_features - t_id * thread_size
+        else:
+            n_features_tloc = thread_size
+
+        X = sparse.random(
+            m=n_samples,
+            n=n_features_tloc,
+            density=1.0 - sparsity,
+            random_state=rng,
+        ).tocsc()
+        y = np.zeros((n_samples, 1))
+
+        for i in range(X.shape[1]):
+            size = X.indptr[i + 1] - X.indptr[i]
+            if size != 0:
+                y += X[:, i].toarray() * rng.random((n_samples, 1)) * 0.2
+
+        return X, y
+
+    futures = []
+    with ThreadPoolExecutor(max_workers=n_threads) as executor:
+        for i in range(n_threads):
+            futures.append(executor.submit(random_csc, i))
+
+        X_results = []
+        y_results = []
+        for f in futures:
+            X, y = f.result()
+            X_results.append(X)
+            y_results.append(y)
+
+    assert len(y_results) == n_threads
+
+    csr: sparse.csr_matrix = sparse.hstack(X_results, format="csr")
+    y = np.asarray(y_results)
+    y = y.reshape((y.shape[0], y.shape[1])).T
+    y = np.sum(y, axis=1)
+
+    assert csr.shape[0] == n_samples
+    assert csr.shape[1] == n_features
+    assert y.shape[0] == n_samples
+
+    if as_dense:
+        arr = csr.toarray()
+        assert arr.shape[0] == n_samples
+        assert arr.shape[1] == n_features
+        arr[arr == 0] = np.nan
+        return arr, y
+
+    return csr, y
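
The `np.uint64(random_state)` cast in `make_batches` is the Windows workaround from the commit message: the default integer there is 32-bit, which trips cupy's `safe` casting rule. For the relocated `make_sparse_regression`, a short usage sketch (shapes follow the assertions above):

    from xgboost.testing.data import make_sparse_regression

    # Sparse output: a scipy CSR matrix plus a dense target vector.
    X, y = make_sparse_regression(n_samples=1024, n_features=16, sparsity=0.8, as_dense=False)
    assert X.shape == (1024, 16) and y.shape == (1024,)

    # Dense output: zero entries come back as NaN, i.e. missing to XGBoost.
    Xd, _ = make_sparse_regression(1024, 16, sparsity=0.8, as_dense=True)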

----------------------------------------

@@ -198,7 +198,7 @@ class CVPack:
     def __init__(
         self, dtrain: DMatrix, dtest: DMatrix, param: Optional[Union[Dict, List]]
     ) -> None:
-        """ "Initialize the CVPack"""
+        """Initialize the CVPack."""
         self.dtrain = dtrain
         self.dtest = dtest
         self.watchlist = [(dtrain, "train"), (dtest, "test")]
@@ -277,7 +277,7 @@ class _PackedBooster:
         self.set_attr(best_score=score)


-def groups_to_rows(groups: List[np.ndarray], boundaries: np.ndarray) -> np.ndarray:
+def groups_to_rows(groups: np.ndarray, boundaries: np.ndarray) -> np.ndarray:
     """
     Given group row boundaries, convert ground indexes to row indexes
     :param groups: list of groups for testing
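
For intuition about `groups_to_rows`: given cumulative group boundaries, each group index expands to the rows that group spans. A small worked example, assuming `boundaries[i]:boundaries[i + 1]` delimits group `i` (an equivalent expansion for illustration, not the function's actual implementation):

    import numpy as np

    boundaries = np.array([0, 3, 5, 9])  # groups cover rows [0:3), [3:5), [5:9)
    groups = np.array([0, 2])

    rows = np.concatenate(
        [np.arange(boundaries[g], boundaries[g + 1]) for g in groups]
    )
    print(rows)  # [0 1 2 5 6 7 8]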