[breaking] Bump Python requirement to 3.10. (#10434)
- Bump the Python requirement.
- Fix type hints.
- Use loky to avoid deadlock.
- Work around a cupy-numpy compatibility issue on Windows caused by the `safe` casting rule.
- Simplify the repartitioning logic to avoid dask errors.
This commit is contained in:
@@ -14,7 +14,7 @@ authors = [
|
||||
{ name = "Jiaming Yuan", email = "jm.yuan@outlook.com" }
|
||||
]
|
||||
version = "2.2.0-dev"
|
||||
requires-python = ">=3.8"
|
||||
requires-python = ">=3.10"
|
||||
license = { text = "Apache-2.0" }
|
||||
classifiers = [
|
||||
"License :: OSI Approved :: Apache Software License",
|
||||
@@ -22,8 +22,6 @@ classifiers = [
|
||||
"Operating System :: OS Independent",
|
||||
"Programming Language :: Python",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.8",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
|
||||
@@ -14,6 +14,7 @@ from collections.abc import Mapping
|
||||
from enum import IntEnum, unique
|
||||
from functools import wraps
|
||||
from inspect import Parameter, signature
|
||||
from types import EllipsisType
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
@@ -1826,7 +1827,7 @@ class Booster:
|
||||
state["handle"] = handle
|
||||
self.__dict__.update(state)
|
||||
|
||||
def __getitem__(self, val: Union[Integer, tuple, slice]) -> "Booster":
|
||||
def __getitem__(self, val: Union[Integer, tuple, slice, EllipsisType]) -> "Booster":
|
||||
"""Get a slice of the tree-based model.
|
||||
|
||||
.. versionadded:: 1.3.0
|
||||
@@ -1835,21 +1836,20 @@ class Booster:
|
||||
# convert to slice for all other types
|
||||
if isinstance(val, (np.integer, int)):
|
||||
val = slice(int(val), int(val + 1))
|
||||
if isinstance(val, type(Ellipsis)):
|
||||
if isinstance(val, EllipsisType):
|
||||
val = slice(0, 0)
|
||||
if isinstance(val, tuple):
|
||||
raise ValueError("Only supports slicing through 1 dimension.")
|
||||
# All supported types are now slice
|
||||
# FIXME(jiamingy): Use `types.EllipsisType` once Python 3.10 is used.
|
||||
if not isinstance(val, slice):
|
||||
msg = _expect((int, slice, np.integer, type(Ellipsis)), type(val))
|
||||
msg = _expect((int, slice, np.integer, EllipsisType), type(val))
|
||||
raise TypeError(msg)
|
||||
|
||||
if isinstance(val.start, type(Ellipsis)) or val.start is None:
|
||||
if isinstance(val.start, EllipsisType) or val.start is None:
|
||||
start = 0
|
||||
else:
|
||||
start = val.start
|
||||
if isinstance(val.stop, type(Ellipsis)) or val.stop is None:
|
||||
if isinstance(val.stop, EllipsisType) or val.stop is None:
|
||||
stop = 0
|
||||
else:
|
||||
stop = val.stop
|
||||
|
||||
@@ -292,7 +292,7 @@ class DaskDMatrix:
|
||||
@_deprecate_positional_args
|
||||
def __init__(
|
||||
self,
|
||||
client: "distributed.Client",
|
||||
client: Optional["distributed.Client"],
|
||||
data: _DataT,
|
||||
label: Optional[_DaskCollection] = None,
|
||||
*,
|
||||
@@ -663,7 +663,7 @@ class DaskQuantileDMatrix(DaskDMatrix):
|
||||
@_deprecate_positional_args
|
||||
def __init__(
|
||||
self,
|
||||
client: "distributed.Client",
|
||||
client: Optional["distributed.Client"],
|
||||
data: _DataT,
|
||||
label: Optional[_DaskCollection] = None,
|
||||
*,
|
||||
@@ -674,7 +674,7 @@ class DaskQuantileDMatrix(DaskDMatrix):
|
||||
feature_names: Optional[FeatureNames] = None,
|
||||
feature_types: Optional[Union[Any, List[Any]]] = None,
|
||||
max_bin: Optional[int] = None,
|
||||
ref: Optional[DMatrix] = None,
|
||||
ref: Optional[DaskDMatrix] = None,
|
||||
group: Optional[_DaskCollection] = None,
|
||||
qid: Optional[_DaskCollection] = None,
|
||||
label_lower_bound: Optional[_DaskCollection] = None,
|
||||
@@ -1832,8 +1832,8 @@ class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
|
||||
sample_weight: Optional[_DaskCollection] = None,
|
||||
base_margin: Optional[_DaskCollection] = None,
|
||||
eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
|
||||
verbose: Union[int, bool] = True,
|
||||
xgb_model: Optional[Union[Booster, XGBModel]] = None,
|
||||
verbose: Optional[Union[int, bool]] = True,
|
||||
xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
|
||||
sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
|
||||
base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
|
||||
feature_weights: Optional[_DaskCollection] = None,
|
||||
@@ -1940,8 +1940,8 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
|
||||
sample_weight: Optional[_DaskCollection] = None,
|
||||
base_margin: Optional[_DaskCollection] = None,
|
||||
eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
|
||||
verbose: Union[int, bool] = True,
|
||||
xgb_model: Optional[Union[Booster, XGBModel]] = None,
|
||||
verbose: Optional[Union[int, bool]] = True,
|
||||
xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
|
||||
sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
|
||||
base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
|
||||
feature_weights: Optional[_DaskCollection] = None,
|
||||
@@ -2122,8 +2122,8 @@ class DaskXGBRanker(DaskScikitLearnBase, XGBRankerMixIn):
|
||||
eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
|
||||
eval_group: Optional[Sequence[_DaskCollection]] = None,
|
||||
eval_qid: Optional[Sequence[_DaskCollection]] = None,
|
||||
verbose: Union[int, bool] = False,
|
||||
xgb_model: Optional[Union[XGBModel, Booster]] = None,
|
||||
verbose: Optional[Union[int, bool]] = False,
|
||||
xgb_model: Optional[Union[XGBModel, str, Booster]] = None,
|
||||
sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
|
||||
base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
|
||||
feature_weights: Optional[_DaskCollection] = None,
|
||||
@@ -2185,8 +2185,8 @@ class DaskXGBRFRegressor(DaskXGBRegressor):
|
||||
sample_weight: Optional[_DaskCollection] = None,
|
||||
base_margin: Optional[_DaskCollection] = None,
|
||||
eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
|
||||
verbose: Union[int, bool] = True,
|
||||
xgb_model: Optional[Union[Booster, XGBModel]] = None,
|
||||
verbose: Optional[Union[int, bool]] = True,
|
||||
xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
|
||||
sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
|
||||
base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
|
||||
feature_weights: Optional[_DaskCollection] = None,
|
||||
@@ -2246,8 +2246,8 @@ class DaskXGBRFClassifier(DaskXGBClassifier):
|
||||
sample_weight: Optional[_DaskCollection] = None,
|
||||
base_margin: Optional[_DaskCollection] = None,
|
||||
eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
|
||||
verbose: Union[int, bool] = True,
|
||||
xgb_model: Optional[Union[Booster, XGBModel]] = None,
|
||||
verbose: Optional[Union[int, bool]] = True,
|
||||
xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
|
||||
sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
|
||||
base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
|
||||
feature_weights: Optional[_DaskCollection] = None,
|
||||
|
||||
@@ -5,7 +5,17 @@ import ctypes
|
||||
import json
|
||||
import os
|
||||
import warnings
|
||||
from typing import Any, Callable, List, Optional, Sequence, Tuple, cast
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
TypeGuard,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
@@ -212,7 +222,7 @@ def is_scipy_coo(data: DataType) -> bool:
|
||||
return is_array or is_matrix
|
||||
|
||||
|
||||
def _is_np_array_like(data: DataType) -> bool:
|
||||
def _is_np_array_like(data: DataType) -> TypeGuard[np.ndarray]:
|
||||
return hasattr(data, "__array_interface__")
|
||||
|
||||
|
||||
@@ -241,7 +251,7 @@ def _maybe_np_slice(data: DataType, dtype: Optional[NumpyDType]) -> np.ndarray:
|
||||
|
||||
|
||||
def _from_numpy_array(
|
||||
data: DataType,
|
||||
data: np.ndarray,
|
||||
missing: FloatCompatible,
|
||||
nthread: int,
|
||||
feature_names: Optional[FeatureNames],
|
||||
@@ -266,7 +276,7 @@ def _from_numpy_array(
|
||||
return handle, feature_names, feature_types
|
||||
|
||||
|
||||
def _is_pandas_df(data: DataType) -> bool:
|
||||
def _is_pandas_df(data: DataType) -> TypeGuard[DataFrame]:
|
||||
try:
|
||||
import pandas as pd
|
||||
except ImportError:
|
||||
@@ -1057,12 +1067,12 @@ def _from_dlpack(
|
||||
return _from_cupy_array(data, missing, nthread, feature_names, feature_types)
|
||||
|
||||
|
||||
def _is_uri(data: DataType) -> bool:
|
||||
def _is_uri(data: DataType) -> TypeGuard[Union[str, os.PathLike]]:
|
||||
return isinstance(data, (str, os.PathLike))
|
||||
|
||||
|
||||
def _from_uri(
|
||||
data: DataType,
|
||||
data: Union[str, os.PathLike],
|
||||
missing: Optional[FloatCompatible],
|
||||
feature_names: Optional[FeatureNames],
|
||||
feature_types: Optional[FeatureTypes],
|
||||
@@ -1080,7 +1090,7 @@ def _from_uri(
|
||||
return handle, feature_names, feature_types
|
||||
|
||||
|
||||
def _is_list(data: DataType) -> bool:
|
||||
def _is_list(data: DataType) -> TypeGuard[list]:
|
||||
return isinstance(data, list)
|
||||
|
||||
|
||||
@@ -1099,7 +1109,7 @@ def _from_list(
|
||||
)
|
||||
|
||||
|
||||
def _is_tuple(data: DataType) -> bool:
|
||||
def _is_tuple(data: DataType) -> TypeGuard[tuple]:
|
||||
return isinstance(data, tuple)
|
||||
|
||||
|
||||
@@ -1116,7 +1126,7 @@ def _from_tuple(
|
||||
)
|
||||
|
||||
|
||||
def _is_iter(data: DataType) -> bool:
|
||||
def _is_iter(data: DataType) -> TypeGuard[DataIter]:
|
||||
return isinstance(data, DataIter)
|
||||
|
||||
|
||||
|
||||
@@ -6,14 +6,12 @@ change without notice.
|
||||
# pylint: disable=invalid-name,missing-function-docstring,import-error
|
||||
import gc
|
||||
import importlib.util
|
||||
import multiprocessing
|
||||
import os
|
||||
import platform
|
||||
import queue
|
||||
import socket
|
||||
import sys
|
||||
import threading
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from contextlib import contextmanager
|
||||
from io import StringIO
|
||||
from platform import system
|
||||
@@ -46,6 +44,7 @@ from xgboost.testing.data import (
|
||||
get_digits,
|
||||
get_sparse,
|
||||
make_batches,
|
||||
make_sparse_regression,
|
||||
memory,
|
||||
)
|
||||
|
||||
@@ -115,6 +114,10 @@ def no_dask() -> PytestSkip:
|
||||
return no_mod("dask")
|
||||
|
||||
|
||||
def no_loky() -> PytestSkip:
|
||||
return no_mod("loky")
|
||||
|
||||
|
||||
def no_dask_ml() -> PytestSkip:
|
||||
if sys.platform.startswith("win"):
|
||||
return {"reason": "Unsupported platform.", "condition": True}
|
||||
@@ -136,7 +139,14 @@ def no_arrow() -> PytestSkip:
|
||||
|
||||
|
||||
def no_modin() -> PytestSkip:
|
||||
return no_mod("modin")
|
||||
try:
|
||||
import modin.pandas as md
|
||||
|
||||
md.DataFrame([[1, 2.0, True], [2, 3.0, False]], columns=["a", "b", "c"])
|
||||
|
||||
except ImportError:
|
||||
return {"reason": "Failed import modin.", "condition": True}
|
||||
return {"reason": "Failed import modin.", "condition": True}
|
||||
|
||||
|
||||
def no_dt() -> PytestSkip:
|
||||
@@ -487,94 +497,6 @@ def _cat_sampled_from() -> strategies.SearchStrategy:
|
||||
|
||||
categorical_dataset_strategy: strategies.SearchStrategy = _cat_sampled_from()
|
||||
|
||||
|
||||
# pylint: disable=too-many-locals
|
||||
@memory.cache
|
||||
def make_sparse_regression(
|
||||
n_samples: int, n_features: int, sparsity: float, as_dense: bool
|
||||
) -> Tuple[Union[sparse.csr_matrix], np.ndarray]:
|
||||
"""Make sparse matrix.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
as_dense:
|
||||
|
||||
Return the matrix as np.ndarray with missing values filled by NaN
|
||||
|
||||
"""
|
||||
if not hasattr(np.random, "default_rng"):
|
||||
rng = np.random.RandomState(1994)
|
||||
X = sparse.random(
|
||||
m=n_samples,
|
||||
n=n_features,
|
||||
density=1.0 - sparsity,
|
||||
random_state=rng,
|
||||
format="csr",
|
||||
)
|
||||
y = rng.normal(loc=0.0, scale=1.0, size=n_samples)
|
||||
return X, y
|
||||
|
||||
# Use multi-thread to speed up the generation, convenient if you use this function
|
||||
# for benchmarking.
|
||||
n_threads = min(multiprocessing.cpu_count(), n_features)
|
||||
|
||||
def random_csc(t_id: int) -> sparse.csc_matrix:
|
||||
rng = np.random.default_rng(1994 * t_id)
|
||||
thread_size = n_features // n_threads
|
||||
if t_id == n_threads - 1:
|
||||
n_features_tloc = n_features - t_id * thread_size
|
||||
else:
|
||||
n_features_tloc = thread_size
|
||||
|
||||
X = sparse.random(
|
||||
m=n_samples,
|
||||
n=n_features_tloc,
|
||||
density=1.0 - sparsity,
|
||||
random_state=rng,
|
||||
).tocsc()
|
||||
y = np.zeros((n_samples, 1))
|
||||
|
||||
for i in range(X.shape[1]):
|
||||
size = X.indptr[i + 1] - X.indptr[i]
|
||||
if size != 0:
|
||||
y += X[:, i].toarray() * rng.random((n_samples, 1)) * 0.2
|
||||
|
||||
return X, y
|
||||
|
||||
futures = []
|
||||
with ThreadPoolExecutor(max_workers=n_threads) as executor:
|
||||
for i in range(n_threads):
|
||||
futures.append(executor.submit(random_csc, i))
|
||||
|
||||
X_results = []
|
||||
y_results = []
|
||||
for f in futures:
|
||||
X, y = f.result()
|
||||
X_results.append(X)
|
||||
y_results.append(y)
|
||||
|
||||
assert len(y_results) == n_threads
|
||||
|
||||
csr: sparse.csr_matrix = sparse.hstack(X_results, format="csr")
|
||||
y = np.asarray(y_results)
|
||||
y = y.reshape((y.shape[0], y.shape[1])).T
|
||||
y = np.sum(y, axis=1)
|
||||
|
||||
assert csr.shape[0] == n_samples
|
||||
assert csr.shape[1] == n_features
|
||||
assert y.shape[0] == n_samples
|
||||
|
||||
if as_dense:
|
||||
arr = csr.toarray()
|
||||
assert arr.shape[0] == n_samples
|
||||
assert arr.shape[1] == n_features
|
||||
arr[arr == 0] = np.nan
|
||||
return arr, y
|
||||
|
||||
return csr, y
|
||||
|
||||
|
||||
sparse_datasets_strategy = strategies.sampled_from(
|
||||
[
|
||||
TestDataset(
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
# pylint: disable=invalid-name
|
||||
"""Utilities for data generation."""
|
||||
import multiprocessing
|
||||
import os
|
||||
import zipfile
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from dataclasses import dataclass
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
@@ -523,7 +525,7 @@ def make_batches( # pylint: disable=too-many-arguments,too-many-locals
|
||||
if use_cupy:
|
||||
import cupy # pylint: disable=import-error
|
||||
|
||||
rng = cupy.random.RandomState(random_state)
|
||||
rng = cupy.random.RandomState(np.uint64(random_state))
|
||||
else:
|
||||
rng = np.random.RandomState(random_state)
|
||||
for i in range(n_batches):
|
||||
@@ -843,3 +845,90 @@ def run_base_margin_info(
|
||||
base_margin = X.reshape(2, 5, 2, 5)
|
||||
with pytest.raises(ValueError, match=r".*base_margin.*"):
|
||||
Xy.set_base_margin(base_margin)
|
||||
|
||||
|
||||
# pylint: disable=too-many-locals
|
||||
@memory.cache
|
||||
def make_sparse_regression(
|
||||
n_samples: int, n_features: int, sparsity: float, as_dense: bool
|
||||
) -> Tuple[Union[sparse.csr_matrix], np.ndarray]:
|
||||
"""Make sparse matrix.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
as_dense:
|
||||
|
||||
Return the matrix as np.ndarray with missing values filled by NaN
|
||||
|
||||
"""
|
||||
if not hasattr(np.random, "default_rng"):
|
||||
rng = np.random.RandomState(1994)
|
||||
X = sparse.random(
|
||||
m=n_samples,
|
||||
n=n_features,
|
||||
density=1.0 - sparsity,
|
||||
random_state=rng,
|
||||
format="csr",
|
||||
)
|
||||
y = rng.normal(loc=0.0, scale=1.0, size=n_samples)
|
||||
return X, y
|
||||
|
||||
# Use multi-thread to speed up the generation, convenient if you use this function
|
||||
# for benchmarking.
|
||||
n_threads = min(multiprocessing.cpu_count(), n_features)
|
||||
|
||||
def random_csc(t_id: int) -> sparse.csc_matrix:
|
||||
rng = np.random.default_rng(1994 * t_id)
|
||||
thread_size = n_features // n_threads
|
||||
if t_id == n_threads - 1:
|
||||
n_features_tloc = n_features - t_id * thread_size
|
||||
else:
|
||||
n_features_tloc = thread_size
|
||||
|
||||
X = sparse.random(
|
||||
m=n_samples,
|
||||
n=n_features_tloc,
|
||||
density=1.0 - sparsity,
|
||||
random_state=rng,
|
||||
).tocsc()
|
||||
y = np.zeros((n_samples, 1))
|
||||
|
||||
for i in range(X.shape[1]):
|
||||
size = X.indptr[i + 1] - X.indptr[i]
|
||||
if size != 0:
|
||||
y += X[:, i].toarray() * rng.random((n_samples, 1)) * 0.2
|
||||
|
||||
return X, y
|
||||
|
||||
futures = []
|
||||
with ThreadPoolExecutor(max_workers=n_threads) as executor:
|
||||
for i in range(n_threads):
|
||||
futures.append(executor.submit(random_csc, i))
|
||||
|
||||
X_results = []
|
||||
y_results = []
|
||||
for f in futures:
|
||||
X, y = f.result()
|
||||
X_results.append(X)
|
||||
y_results.append(y)
|
||||
|
||||
assert len(y_results) == n_threads
|
||||
|
||||
csr: sparse.csr_matrix = sparse.hstack(X_results, format="csr")
|
||||
y = np.asarray(y_results)
|
||||
y = y.reshape((y.shape[0], y.shape[1])).T
|
||||
y = np.sum(y, axis=1)
|
||||
|
||||
assert csr.shape[0] == n_samples
|
||||
assert csr.shape[1] == n_features
|
||||
assert y.shape[0] == n_samples
|
||||
|
||||
if as_dense:
|
||||
arr = csr.toarray()
|
||||
assert arr.shape[0] == n_samples
|
||||
assert arr.shape[1] == n_features
|
||||
arr[arr == 0] = np.nan
|
||||
return arr, y
|
||||
|
||||
return csr, y
|
||||
|
||||
@@ -198,7 +198,7 @@ class CVPack:
|
||||
def __init__(
|
||||
self, dtrain: DMatrix, dtest: DMatrix, param: Optional[Union[Dict, List]]
|
||||
) -> None:
|
||||
""" "Initialize the CVPack"""
|
||||
"""Initialize the CVPack."""
|
||||
self.dtrain = dtrain
|
||||
self.dtest = dtest
|
||||
self.watchlist = [(dtrain, "train"), (dtest, "test")]
|
||||
@@ -277,7 +277,7 @@ class _PackedBooster:
|
||||
self.set_attr(best_score=score)
|
||||
|
||||
|
||||
def groups_to_rows(groups: List[np.ndarray], boundaries: np.ndarray) -> np.ndarray:
|
||||
def groups_to_rows(groups: np.ndarray, boundaries: np.ndarray) -> np.ndarray:
|
||||
"""
|
||||
Given group row boundaries, convert group indexes to row indexes
|
||||
:param groups: list of groups for testing
|
||||
|
||||
Reference in New Issue
Block a user