[breaking] Bump Python requirement to 3.10. (#10434)

- Bump the Python requirement to 3.10.
- Fix type hints.
- Use loky to avoid a deadlock.
- Work around a cupy-numpy compatibility issue on Windows caused by the `safe` casting rule.
- Simplify the repartitioning logic to avoid Dask errors.
Jiaming Yuan, 2024-07-30 17:31:06 +08:00, committed by GitHub
parent 757aafc131
commit 827d0e8edb
33 changed files with 284 additions and 286 deletions

----------------------------------------

@@ -14,7 +14,7 @@ authors = [
     { name = "Jiaming Yuan", email = "jm.yuan@outlook.com" }
 ]
 version = "2.2.0-dev"
-requires-python = ">=3.8"
+requires-python = ">=3.10"
 license = { text = "Apache-2.0" }
 classifiers = [
     "License :: OSI Approved :: Apache Software License",
@@ -22,8 +22,6 @@ classifiers = [
     "Operating System :: OS Independent",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.8",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",

----------------------------------------

@@ -14,6 +14,7 @@ from collections.abc import Mapping
 from enum import IntEnum, unique
 from functools import wraps
 from inspect import Parameter, signature
+from types import EllipsisType
 from typing import (
     Any,
     Callable,
@@ -1826,7 +1827,7 @@ class Booster:
         state["handle"] = handle
         self.__dict__.update(state)

-    def __getitem__(self, val: Union[Integer, tuple, slice]) -> "Booster":
+    def __getitem__(self, val: Union[Integer, tuple, slice, EllipsisType]) -> "Booster":
         """Get a slice of the tree-based model.

         .. versionadded:: 1.3.0
@@ -1835,21 +1836,20 @@
         # convert to slice for all other types
         if isinstance(val, (np.integer, int)):
             val = slice(int(val), int(val + 1))
-        if isinstance(val, type(Ellipsis)):
+        if isinstance(val, EllipsisType):
             val = slice(0, 0)
         if isinstance(val, tuple):
             raise ValueError("Only supports slicing through 1 dimension.")
         # All supported types are now slice
-        # FIXME(jiamingy): Use `types.EllipsisType` once Python 3.10 is used.
         if not isinstance(val, slice):
-            msg = _expect((int, slice, np.integer, type(Ellipsis)), type(val))
+            msg = _expect((int, slice, np.integer, EllipsisType), type(val))
             raise TypeError(msg)
-        if isinstance(val.start, type(Ellipsis)) or val.start is None:
+        if isinstance(val.start, EllipsisType) or val.start is None:
             start = 0
         else:
             start = val.start
-        if isinstance(val.stop, type(Ellipsis)) or val.stop is None:
+        if isinstance(val.stop, EllipsisType) or val.stop is None:
             stop = 0
         else:
             stop = val.stop
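
With the annotation widened to include `EllipsisType`, tree slicing accepts `...` directly, as it already did at runtime. A quick usage sketch (the training setup here is illustrative, not from the commit):

    import numpy as np
    import xgboost as xgb

    X, y = np.random.rand(128, 4), np.random.rand(128)
    booster = xgb.train({}, xgb.DMatrix(X, label=y), num_boost_round=8)

    first = booster[:4]  # trees [0, 4)
    one = booster[3]     # a single tree, returned as a Booster
    full = booster[...]  # Ellipsis selects the whole model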

----------------------------------------

@@ -292,7 +292,7 @@ class DaskDMatrix:
     @_deprecate_positional_args
     def __init__(
         self,
-        client: "distributed.Client",
+        client: Optional["distributed.Client"],
         data: _DataT,
         label: Optional[_DaskCollection] = None,
         *,
@@ -663,7 +663,7 @@ class DaskQuantileDMatrix(DaskDMatrix):
     @_deprecate_positional_args
     def __init__(
         self,
-        client: "distributed.Client",
+        client: Optional["distributed.Client"],
         data: _DataT,
         label: Optional[_DaskCollection] = None,
         *,
@@ -674,7 +674,7 @@ class DaskQuantileDMatrix(DaskDMatrix):
         feature_names: Optional[FeatureNames] = None,
         feature_types: Optional[Union[Any, List[Any]]] = None,
         max_bin: Optional[int] = None,
-        ref: Optional[DMatrix] = None,
+        ref: Optional[DaskDMatrix] = None,
         group: Optional[_DaskCollection] = None,
         qid: Optional[_DaskCollection] = None,
         label_lower_bound: Optional[_DaskCollection] = None,
@@ -1832,8 +1832,8 @@ class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
         sample_weight: Optional[_DaskCollection] = None,
         base_margin: Optional[_DaskCollection] = None,
         eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
-        verbose: Union[int, bool] = True,
-        xgb_model: Optional[Union[Booster, XGBModel]] = None,
+        verbose: Optional[Union[int, bool]] = True,
+        xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
         sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
         base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
         feature_weights: Optional[_DaskCollection] = None,
@@ -1940,8 +1940,8 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
         sample_weight: Optional[_DaskCollection] = None,
         base_margin: Optional[_DaskCollection] = None,
         eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
-        verbose: Union[int, bool] = True,
-        xgb_model: Optional[Union[Booster, XGBModel]] = None,
+        verbose: Optional[Union[int, bool]] = True,
+        xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
         sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
         base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
         feature_weights: Optional[_DaskCollection] = None,
@@ -2122,8 +2122,8 @@ class DaskXGBRanker(DaskScikitLearnBase, XGBRankerMixIn):
         eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
         eval_group: Optional[Sequence[_DaskCollection]] = None,
         eval_qid: Optional[Sequence[_DaskCollection]] = None,
-        verbose: Union[int, bool] = False,
-        xgb_model: Optional[Union[XGBModel, Booster]] = None,
+        verbose: Optional[Union[int, bool]] = False,
+        xgb_model: Optional[Union[XGBModel, str, Booster]] = None,
         sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
         base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
         feature_weights: Optional[_DaskCollection] = None,
@@ -2185,8 +2185,8 @@ class DaskXGBRFRegressor(DaskXGBRegressor):
         sample_weight: Optional[_DaskCollection] = None,
         base_margin: Optional[_DaskCollection] = None,
         eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
-        verbose: Union[int, bool] = True,
-        xgb_model: Optional[Union[Booster, XGBModel]] = None,
+        verbose: Optional[Union[int, bool]] = True,
+        xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
         sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
         base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
         feature_weights: Optional[_DaskCollection] = None,
@@ -2246,8 +2246,8 @@ class DaskXGBRFClassifier(DaskXGBClassifier):
         sample_weight: Optional[_DaskCollection] = None,
         base_margin: Optional[_DaskCollection] = None,
         eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
-        verbose: Union[int, bool] = True,
-        xgb_model: Optional[Union[Booster, XGBModel]] = None,
+        verbose: Optional[Union[int, bool]] = True,
+        xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
         sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
         base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
         feature_weights: Optional[_DaskCollection] = None,
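
Two of these hints document behavior the code already had: `client` falls back to the default Dask client when `None`, and `DaskQuantileDMatrix.ref` expects another Dask matrix rather than a plain `DMatrix`. A hedged sketch of both (the cluster setup and data are illustrative, not from the diff):

    from dask import array as da
    from distributed import Client, LocalCluster
    from xgboost import dask as dxgb

    with LocalCluster(n_workers=2) as cluster, Client(cluster) as client:
        X = da.random.random((1000, 8), chunks=(100, 8))
        y = da.random.random(1000, chunks=100)

        # client=None now type-checks; the default client is picked up.
        Xy = dxgb.DaskQuantileDMatrix(None, X, y)
        # ref must be another Dask matrix, matching the tightened hint.
        Xy_valid = dxgb.DaskQuantileDMatrix(None, X, y, ref=Xy)

        dxgb.train(None, {"tree_method": "hist"}, Xy, evals=[(Xy_valid, "valid")])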

----------------------------------------

@@ -5,7 +5,17 @@ import ctypes
 import json
 import os
 import warnings
-from typing import Any, Callable, List, Optional, Sequence, Tuple, cast
+from typing import (
+    Any,
+    Callable,
+    List,
+    Optional,
+    Sequence,
+    Tuple,
+    TypeGuard,
+    Union,
+    cast,
+)

 import numpy as np
@@ -212,7 +222,7 @@ def is_scipy_coo(data: DataType) -> bool:
     return is_array or is_matrix


-def _is_np_array_like(data: DataType) -> bool:
+def _is_np_array_like(data: DataType) -> TypeGuard[np.ndarray]:
     return hasattr(data, "__array_interface__")
@@ -241,7 +251,7 @@ def _maybe_np_slice(data: DataType, dtype: Optional[NumpyDType]) -> np.ndarray:
 def _from_numpy_array(
-    data: DataType,
+    data: np.ndarray,
     missing: FloatCompatible,
     nthread: int,
     feature_names: Optional[FeatureNames],
@@ -266,7 +276,7 @@
     return handle, feature_names, feature_types


-def _is_pandas_df(data: DataType) -> bool:
+def _is_pandas_df(data: DataType) -> TypeGuard[DataFrame]:
     try:
         import pandas as pd
     except ImportError:
@@ -1057,12 +1067,12 @@ def _from_dlpack(
     return _from_cupy_array(data, missing, nthread, feature_names, feature_types)


-def _is_uri(data: DataType) -> bool:
+def _is_uri(data: DataType) -> TypeGuard[Union[str, os.PathLike]]:
     return isinstance(data, (str, os.PathLike))


 def _from_uri(
-    data: DataType,
+    data: Union[str, os.PathLike],
     missing: Optional[FloatCompatible],
     feature_names: Optional[FeatureNames],
     feature_types: Optional[FeatureTypes],
@@ -1080,7 +1090,7 @@
     return handle, feature_names, feature_types


-def _is_list(data: DataType) -> bool:
+def _is_list(data: DataType) -> TypeGuard[list]:
     return isinstance(data, list)
@@ -1099,7 +1109,7 @@
     )


-def _is_tuple(data: DataType) -> bool:
+def _is_tuple(data: DataType) -> TypeGuard[tuple]:
     return isinstance(data, tuple)
@@ -1116,7 +1126,7 @@
     )


-def _is_iter(data: DataType) -> bool:
+def _is_iter(data: DataType) -> TypeGuard[DataIter]:
     return isinstance(data, DataIter)
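
`TypeGuard` landed in `typing` in Python 3.10, which is why the bump lets these predicates import it directly. Returning `TypeGuard[T]` instead of `bool` lets static checkers narrow the argument after a positive check, which in turn allows `_from_numpy_array` and `_from_uri` to take concrete types instead of `DataType`. A self-contained sketch of the pattern with illustrative names:

    from typing import Any, TypeGuard

    import numpy as np

    def is_ndarray(data: Any) -> TypeGuard[np.ndarray]:
        # When this returns True, checkers treat `data` as np.ndarray at the
        # call site, so no cast() is needed in the dispatcher below.
        return isinstance(data, np.ndarray)

    def dispatch(data: Any) -> int:
        if is_ndarray(data):
            return data.ndim  # `data` is already narrowed to np.ndarray
        raise TypeError(f"Unsupported type: {type(data)}")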

----------------------------------------

@@ -6,14 +6,12 @@ change without notice.
 # pylint: disable=invalid-name,missing-function-docstring,import-error
 import gc
 import importlib.util
-import multiprocessing
 import os
 import platform
 import queue
 import socket
 import sys
 import threading
-from concurrent.futures import ThreadPoolExecutor
 from contextlib import contextmanager
 from io import StringIO
 from platform import system
@@ -46,6 +44,7 @@ from xgboost.testing.data import (
     get_digits,
     get_sparse,
     make_batches,
+    make_sparse_regression,
     memory,
 )
@@ -115,6 +114,10 @@ def no_dask() -> PytestSkip:
     return no_mod("dask")


+def no_loky() -> PytestSkip:
+    return no_mod("loky")
+
+
 def no_dask_ml() -> PytestSkip:
     if sys.platform.startswith("win"):
         return {"reason": "Unsupported platform.", "condition": True}
@@ -136,7 +139,14 @@ def no_arrow() -> PytestSkip:
 def no_modin() -> PytestSkip:
-    return no_mod("modin")
+    try:
+        import modin.pandas as md
+
+        md.DataFrame([[1, 2.0, True], [2, 3.0, False]], columns=["a", "b", "c"])
+    except ImportError:
+        return {"reason": "Failed import modin.", "condition": True}
+    return {"reason": "Failed import modin.", "condition": False}


 def no_dt() -> PytestSkip:
@@ -487,94 +497,6 @@ def _cat_sampled_from() -> strategies.SearchStrategy:
 categorical_dataset_strategy: strategies.SearchStrategy = _cat_sampled_from()


-# pylint: disable=too-many-locals
-@memory.cache
-def make_sparse_regression(
-    n_samples: int, n_features: int, sparsity: float, as_dense: bool
-) -> Tuple[Union[sparse.csr_matrix, np.ndarray], np.ndarray]:
-    """Make sparse matrix.
-
-    Parameters
-    ----------
-    as_dense:
-        Return the matrix as np.ndarray with missing values filled by NaN
-
-    """
-    if not hasattr(np.random, "default_rng"):
-        rng = np.random.RandomState(1994)
-        X = sparse.random(
-            m=n_samples,
-            n=n_features,
-            density=1.0 - sparsity,
-            random_state=rng,
-            format="csr",
-        )
-        y = rng.normal(loc=0.0, scale=1.0, size=n_samples)
-        return X, y
-
-    # Use multi-thread to speed up the generation, convenient if you use this function
-    # for benchmarking.
-    n_threads = min(multiprocessing.cpu_count(), n_features)
-
-    def random_csc(t_id: int) -> sparse.csc_matrix:
-        rng = np.random.default_rng(1994 * t_id)
-        thread_size = n_features // n_threads
-        if t_id == n_threads - 1:
-            n_features_tloc = n_features - t_id * thread_size
-        else:
-            n_features_tloc = thread_size
-
-        X = sparse.random(
-            m=n_samples,
-            n=n_features_tloc,
-            density=1.0 - sparsity,
-            random_state=rng,
-        ).tocsc()
-        y = np.zeros((n_samples, 1))
-
-        for i in range(X.shape[1]):
-            size = X.indptr[i + 1] - X.indptr[i]
-            if size != 0:
-                y += X[:, i].toarray() * rng.random((n_samples, 1)) * 0.2
-
-        return X, y
-
-    futures = []
-    with ThreadPoolExecutor(max_workers=n_threads) as executor:
-        for i in range(n_threads):
-            futures.append(executor.submit(random_csc, i))
-
-        X_results = []
-        y_results = []
-        for f in futures:
-            X, y = f.result()
-            X_results.append(X)
-            y_results.append(y)
-
-    assert len(y_results) == n_threads
-
-    csr: sparse.csr_matrix = sparse.hstack(X_results, format="csr")
-    y = np.asarray(y_results)
-    y = y.reshape((y.shape[0], y.shape[1])).T
-    y = np.sum(y, axis=1)
-
-    assert csr.shape[0] == n_samples
-    assert csr.shape[1] == n_features
-    assert y.shape[0] == n_samples
-
-    if as_dense:
-        arr = csr.toarray()
-        assert arr.shape[0] == n_samples
-        assert arr.shape[1] == n_features
-        arr[arr == 0] = np.nan
-        return arr, y
-
-    return csr, y
-
-
 sparse_datasets_strategy = strategies.sampled_from(
     [
         TestDataset(
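
The new `no_loky` skip hook supports the switch away from raw `multiprocessing` mentioned in the commit message. For readers unfamiliar with loky: its reusable executor spawns fresh worker processes rather than forking, which sidesteps fork-after-thread deadlocks. A minimal sketch of its API (assuming loky is installed; not taken from the test suite):

    from loky import get_reusable_executor

    def square(x: int) -> int:
        return x * x

    if __name__ == "__main__":
        # The executor is cached and reused across calls.
        executor = get_reusable_executor(max_workers=2)
        print(list(executor.map(square, range(8))))  # [0, 1, 4, 9, ...]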

----------------------------------------

@@ -1,7 +1,9 @@
 # pylint: disable=invalid-name
 """Utilities for data generation."""
+import multiprocessing
 import os
 import zipfile
+from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
 from typing import (
     TYPE_CHECKING,
@@ -523,7 +525,7 @@ def make_batches(  # pylint: disable=too-many-arguments,too-many-locals
     if use_cupy:
         import cupy  # pylint: disable=import-error

-        rng = cupy.random.RandomState(random_state)
+        rng = cupy.random.RandomState(np.uint64(random_state))
     else:
         rng = np.random.RandomState(random_state)
     for i in range(n_batches):
@@ -843,3 +845,90 @@ def run_base_margin_info(
     base_margin = X.reshape(2, 5, 2, 5)
     with pytest.raises(ValueError, match=r".*base_margin.*"):
         Xy.set_base_margin(base_margin)
+
+
+# pylint: disable=too-many-locals
+@memory.cache
+def make_sparse_regression(
+    n_samples: int, n_features: int, sparsity: float, as_dense: bool
+) -> Tuple[Union[sparse.csr_matrix, np.ndarray], np.ndarray]:
+    """Make sparse matrix.
+
+    Parameters
+    ----------
+    as_dense:
+        Return the matrix as np.ndarray with missing values filled by NaN
+
+    """
+    if not hasattr(np.random, "default_rng"):
+        rng = np.random.RandomState(1994)
+        X = sparse.random(
+            m=n_samples,
+            n=n_features,
+            density=1.0 - sparsity,
+            random_state=rng,
+            format="csr",
+        )
+        y = rng.normal(loc=0.0, scale=1.0, size=n_samples)
+        return X, y
+
+    # Use multi-thread to speed up the generation, convenient if you use this function
+    # for benchmarking.
+    n_threads = min(multiprocessing.cpu_count(), n_features)
+
+    def random_csc(t_id: int) -> sparse.csc_matrix:
+        rng = np.random.default_rng(1994 * t_id)
+        thread_size = n_features // n_threads
+        if t_id == n_threads - 1:
+            n_features_tloc = n_features - t_id * thread_size
+        else:
+            n_features_tloc = thread_size
+
+        X = sparse.random(
+            m=n_samples,
+            n=n_features_tloc,
+            density=1.0 - sparsity,
+            random_state=rng,
+        ).tocsc()
+        y = np.zeros((n_samples, 1))
+
+        for i in range(X.shape[1]):
+            size = X.indptr[i + 1] - X.indptr[i]
+            if size != 0:
+                y += X[:, i].toarray() * rng.random((n_samples, 1)) * 0.2
+
+        return X, y
+
+    futures = []
+    with ThreadPoolExecutor(max_workers=n_threads) as executor:
+        for i in range(n_threads):
+            futures.append(executor.submit(random_csc, i))
+
+        X_results = []
+        y_results = []
+        for f in futures:
+            X, y = f.result()
+            X_results.append(X)
+            y_results.append(y)
+
+    assert len(y_results) == n_threads
+
+    csr: sparse.csr_matrix = sparse.hstack(X_results, format="csr")
+    y = np.asarray(y_results)
+    y = y.reshape((y.shape[0], y.shape[1])).T
+    y = np.sum(y, axis=1)
+
+    assert csr.shape[0] == n_samples
+    assert csr.shape[1] == n_features
+    assert y.shape[0] == n_samples
+
+    if as_dense:
+        arr = csr.toarray()
+        assert arr.shape[0] == n_samples
+        assert arr.shape[1] == n_features
+        arr[arr == 0] = np.nan
+        return arr, y
+
+    return csr, y
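
The `np.uint64(random_state)` cast in `make_batches` is the Windows workaround from the commit message: the default integer there is 32-bit, which trips cupy's `safe` casting rule. For the relocated `make_sparse_regression`, a short usage sketch (shapes follow the assertions above):

    from xgboost.testing.data import make_sparse_regression

    # Sparse output: a scipy CSR matrix plus a dense target vector.
    X, y = make_sparse_regression(n_samples=1024, n_features=16, sparsity=0.8, as_dense=False)
    assert X.shape == (1024, 16) and y.shape == (1024,)

    # Dense output: zero entries come back as NaN, i.e. missing to XGBoost.
    Xd, _ = make_sparse_regression(1024, 16, sparsity=0.8, as_dense=True)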

----------------------------------------

@@ -198,7 +198,7 @@ class CVPack:
     def __init__(
         self, dtrain: DMatrix, dtest: DMatrix, param: Optional[Union[Dict, List]]
     ) -> None:
-        """ "Initialize the CVPack"""
+        """Initialize the CVPack."""
         self.dtrain = dtrain
         self.dtest = dtest
         self.watchlist = [(dtrain, "train"), (dtest, "test")]
@@ -277,7 +277,7 @@ class _PackedBooster:
         self.set_attr(best_score=score)


-def groups_to_rows(groups: List[np.ndarray], boundaries: np.ndarray) -> np.ndarray:
+def groups_to_rows(groups: np.ndarray, boundaries: np.ndarray) -> np.ndarray:
     """
     Given group row boundaries, convert ground indexes to row indexes
     :param groups: list of groups for testing
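
For intuition about `groups_to_rows`: given cumulative group boundaries, each group index expands to the rows that group spans. A small worked example, assuming `boundaries[i]:boundaries[i + 1]` delimits group `i` (an equivalent expansion for illustration, not the function's actual implementation):

    import numpy as np

    boundaries = np.array([0, 3, 5, 9])  # groups cover rows [0:3), [3:5), [5:9)
    groups = np.array([0, 2])

    rows = np.concatenate(
        [np.arange(boundaries[g], boundaries[g + 1]) for g in groups]
    )
    print(rows)  # [0 1 2 5 6 7 8]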