[breaking] Bump Python requirement to 3.10. (#10434)

- Bump the Python requirement.
- Fix type hints.
- Use loky to avoid deadlocks.
- Work around a cupy-numpy compatibility issue on Windows caused by the `safe` casting rule.
- Simplify the repartitioning logic to avoid dask errors.
Jiaming Yuan 2024-07-30 17:31:06 +08:00 committed by GitHub
parent 757aafc131
commit 827d0e8edb
GPG Key ID: B5690EEEBB952194
33 changed files with 284 additions and 286 deletions

View File

@@ -74,7 +74,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest]
-        python-version: ["3.8"]
+        python-version: ["3.10"]
     steps:
       - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29  # v4.1.6
         with:
@@ -116,7 +116,7 @@ jobs:
       fail-fast: false
       matrix:
         os: ["ubuntu-latest"]
-        python-version: ["3.8"]
+        python-version: ["3.10"]
     steps:
       - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29  # v4.1.6
         with:
@@ -182,7 +182,7 @@ jobs:
           submodules: 'true'
       - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f  # v5.1.1
         with:
-          python-version: "3.8"
+          python-version: "3.10"
           architecture: 'x64'
       - name: Install Python packages
        run: |

View File

@@ -84,7 +84,7 @@ jobs:
     strategy:
       matrix:
         os: [macos-13, windows-latest]
-        python-version: ["3.8"]
+        python-version: ["3.10"]
     steps:
       - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29  # v4.1.6
         with:
@@ -174,7 +174,7 @@ jobs:
     strategy:
       matrix:
         config:
-          - {os: windows-latest, python-version: '3.8'}
+          - {os: windows-latest, python-version: '3.10'}
     steps:
       - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29  # v4.1.6
@@ -218,7 +218,7 @@ jobs:
     strategy:
       matrix:
         config:
-          - {os: ubuntu-latest, python-version: "3.8"}
+          - {os: ubuntu-latest, python-version: "3.10"}
     steps:
       - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29  # v4.1.6
@@ -271,7 +271,7 @@ jobs:
     strategy:
       matrix:
         config:
-          - {os: ubuntu-latest, python-version: "3.8"}
+          - {os: ubuntu-latest, python-version: "3.10"}
     steps:
       - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29  # v4.1.6
@@ -318,10 +318,10 @@ jobs:
       with:
         submodules: 'true'
-    - name: Set up Python 3.8
+    - name: Set up Python 3.10
       uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f  # v5.1.1
       with:
-        python-version: 3.8
+        python-version: "3.10"
     - name: Install ninja
       run: |

View File

@@ -36,7 +36,7 @@ jobs:
         with:
           miniforge-variant: Mambaforge
           miniforge-version: latest
-          python-version: 3.9
+          python-version: "3.10"
           use-mamba: true
       - name: Build wheels
         run: bash tests/ci_build/build_python_wheels.sh ${{ matrix.platform_id }} ${{ github.sha }}

View File

@@ -86,7 +86,7 @@ jobs:
       - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f  # v5.1.1
         with:
-          python-version: "3.8"
+          python-version: "3.10"
           architecture: 'x64'
       - uses: r-lib/actions/setup-tinytex@v2

View File

@@ -106,8 +106,8 @@ plt.figure(figsize=(12, 13))
 bst = xgb.train(
     params,
     dmat,
-    15,
-    [(dmat, "train")],
+    num_boost_round=15,
+    evals=[(dmat, "train")],
     evals_result=res,
     callbacks=[PlotIntermediateModel()],
 )
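
Note on this hunk: the demo now passes `num_boost_round` and `evals` by keyword instead of by position, matching the keyword-first style used across the Python package. A minimal self-contained sketch of the same call shape (the `params`/`dmat`/`res` objects here are stand-ins, not the demo's):

```python
import numpy as np
import xgboost as xgb

# Stand-ins for the demo's `params`, `dmat`, and `res`.
dmat = xgb.DMatrix(np.random.randn(100, 10), label=np.random.randn(100))
res: dict = {}
bst = xgb.train(
    {"max_depth": 2},
    dmat,
    num_boost_round=15,        # was the bare positional `15`
    evals=[(dmat, "train")],   # was the bare positional evaluation list
    evals_result=res,
)
```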

View File

@@ -42,7 +42,7 @@ class IterForDMatrixDemo(xgboost.core.DataIter):
         """
         self.rows = ROWS_PER_BATCH
         self.cols = COLS
-        rng = cupy.random.RandomState(1994)
+        rng = cupy.random.RandomState(numpy.uint64(1994))
         self._data = [rng.randn(self.rows, self.cols)] * BATCHES
         self._labels = [rng.randn(self.rows)] * BATCHES
         self._weights = [rng.uniform(size=self.rows)] * BATCHES
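
Note on this hunk: the seed is wrapped in `numpy.uint64` as part of the cupy-numpy workaround named in the commit message; on Windows, a plain Python int seed can be rejected by recent cupy under NumPy's `safe` casting rule, presumably because the platform's default integer type does not safely cast to the unsigned type the generator expects. A minimal sketch of the pattern:

```python
import cupy
import numpy

# `cupy.random.RandomState(1994)` may raise a casting error on Windows;
# an explicitly unsigned seed avoids the `safe`-rule cast entirely.
rng = cupy.random.RandomState(numpy.uint64(1994))
data = rng.randn(4, 2)
```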

View File

@@ -8,7 +8,7 @@ This directory contains a demo of Horizontal Federated Learning using
 To run the demo, first build XGBoost with the federated learning plugin enabled (see the
 [README](../../../plugin/federated/README.md)).

-Install NVFlare (note that currently NVFlare only supports Python 3.8):
+Install NVFlare:
 ```shell
 pip install nvflare
 ```

View File

@@ -8,7 +8,7 @@ This directory contains a demo of Vertical Federated Learning using
 To run the demo, first build XGBoost with the federated learning plugin enabled (see the
 [README](../../../plugin/federated/README.md)).

-Install NVFlare (note that currently NVFlare only supports Python 3.8):
+Install NVFlare:
 ```shell
 pip install nvflare
 ```

View File

@@ -286,7 +286,7 @@ latex_documents = [
 ]

 intersphinx_mapping = {
-    "python": ("https://docs.python.org/3.8", None),
+    "python": ("https://docs.python.org/3.10", None),
     "numpy": ("https://numpy.org/doc/stable/", None),
     "scipy": ("https://docs.scipy.org/doc/scipy/", None),
     "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),

View File

@@ -14,7 +14,7 @@ authors = [
     { name = "Jiaming Yuan", email = "jm.yuan@outlook.com" }
 ]
 version = "2.2.0-dev"
-requires-python = ">=3.8"
+requires-python = ">=3.10"
 license = { text = "Apache-2.0" }
 classifiers = [
     "License :: OSI Approved :: Apache Software License",
@@ -22,8 +22,6 @@ classifiers = [
     "Operating System :: OS Independent",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.8",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",

View File

@@ -14,6 +14,7 @@ from collections.abc import Mapping
 from enum import IntEnum, unique
 from functools import wraps
 from inspect import Parameter, signature
+from types import EllipsisType
 from typing import (
     Any,
     Callable,
@@ -1826,7 +1827,7 @@ class Booster:
         state["handle"] = handle
         self.__dict__.update(state)

-    def __getitem__(self, val: Union[Integer, tuple, slice]) -> "Booster":
+    def __getitem__(self, val: Union[Integer, tuple, slice, EllipsisType]) -> "Booster":
         """Get a slice of the tree-based model.

         .. versionadded:: 1.3.0
@@ -1835,21 +1836,20 @@ class Booster:
         # convert to slice for all other types
         if isinstance(val, (np.integer, int)):
             val = slice(int(val), int(val + 1))
-        if isinstance(val, type(Ellipsis)):
+        if isinstance(val, EllipsisType):
             val = slice(0, 0)
         if isinstance(val, tuple):
             raise ValueError("Only supports slicing through 1 dimension.")
         # All supported types are now slice
-        # FIXME(jiamingy): Use `types.EllipsisType` once Python 3.10 is used.
         if not isinstance(val, slice):
-            msg = _expect((int, slice, np.integer, type(Ellipsis)), type(val))
+            msg = _expect((int, slice, np.integer, EllipsisType), type(val))
             raise TypeError(msg)
-        if isinstance(val.start, type(Ellipsis)) or val.start is None:
+        if isinstance(val.start, EllipsisType) or val.start is None:
             start = 0
         else:
             start = val.start
-        if isinstance(val.stop, type(Ellipsis)) or val.stop is None:
+        if isinstance(val.stop, EllipsisType) or val.stop is None:
             stop = 0
         else:
             stop = val.stop
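
Note on this hunk: `types.EllipsisType` is only usable like this on Python 3.10+, which is why the old code compared against `type(Ellipsis)` and carried the FIXME that this hunk removes. A small self-contained sketch of the normalization `Booster.__getitem__` performs (the `normalize` helper is illustrative, not XGBoost API):

```python
from types import EllipsisType
from typing import Union


def normalize(val: Union[int, slice, EllipsisType]) -> slice:
    """Illustrative mirror of the normalization in Booster.__getitem__."""
    if isinstance(val, int):
        return slice(val, val + 1)
    if isinstance(val, EllipsisType):
        return slice(0, 0)  # the Booster's sentinel for "all trees"
    return val


assert normalize(3) == slice(3, 4)
assert normalize(...) == slice(0, 0)  # i.e. `booster[...]` keeps every round
```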

View File

@@ -292,7 +292,7 @@ class DaskDMatrix:
     @_deprecate_positional_args
     def __init__(
         self,
-        client: "distributed.Client",
+        client: Optional["distributed.Client"],
         data: _DataT,
         label: Optional[_DaskCollection] = None,
         *,
@@ -663,7 +663,7 @@ class DaskQuantileDMatrix(DaskDMatrix):
     @_deprecate_positional_args
     def __init__(
         self,
-        client: "distributed.Client",
+        client: Optional["distributed.Client"],
         data: _DataT,
         label: Optional[_DaskCollection] = None,
         *,
@@ -674,7 +674,7 @@ class DaskQuantileDMatrix(DaskDMatrix):
         feature_names: Optional[FeatureNames] = None,
         feature_types: Optional[Union[Any, List[Any]]] = None,
         max_bin: Optional[int] = None,
-        ref: Optional[DMatrix] = None,
+        ref: Optional[DaskDMatrix] = None,
         group: Optional[_DaskCollection] = None,
         qid: Optional[_DaskCollection] = None,
         label_lower_bound: Optional[_DaskCollection] = None,
@@ -1832,8 +1832,8 @@ class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
         sample_weight: Optional[_DaskCollection] = None,
         base_margin: Optional[_DaskCollection] = None,
         eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
-        verbose: Union[int, bool] = True,
-        xgb_model: Optional[Union[Booster, XGBModel]] = None,
+        verbose: Optional[Union[int, bool]] = True,
+        xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
         sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
         base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
         feature_weights: Optional[_DaskCollection] = None,
@@ -1940,8 +1940,8 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
         sample_weight: Optional[_DaskCollection] = None,
         base_margin: Optional[_DaskCollection] = None,
         eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
-        verbose: Union[int, bool] = True,
-        xgb_model: Optional[Union[Booster, XGBModel]] = None,
+        verbose: Optional[Union[int, bool]] = True,
+        xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
         sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
         base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
         feature_weights: Optional[_DaskCollection] = None,
@@ -2122,8 +2122,8 @@ class DaskXGBRanker(DaskScikitLearnBase, XGBRankerMixIn):
         eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
         eval_group: Optional[Sequence[_DaskCollection]] = None,
         eval_qid: Optional[Sequence[_DaskCollection]] = None,
-        verbose: Union[int, bool] = False,
-        xgb_model: Optional[Union[XGBModel, Booster]] = None,
+        verbose: Optional[Union[int, bool]] = False,
+        xgb_model: Optional[Union[XGBModel, str, Booster]] = None,
         sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
         base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
         feature_weights: Optional[_DaskCollection] = None,
@@ -2185,8 +2185,8 @@ class DaskXGBRFRegressor(DaskXGBRegressor):
         sample_weight: Optional[_DaskCollection] = None,
         base_margin: Optional[_DaskCollection] = None,
         eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
-        verbose: Union[int, bool] = True,
-        xgb_model: Optional[Union[Booster, XGBModel]] = None,
+        verbose: Optional[Union[int, bool]] = True,
+        xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
         sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
         base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
         feature_weights: Optional[_DaskCollection] = None,
@@ -2246,8 +2246,8 @@ class DaskXGBRFClassifier(DaskXGBClassifier):
         sample_weight: Optional[_DaskCollection] = None,
         base_margin: Optional[_DaskCollection] = None,
         eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]] = None,
-        verbose: Union[int, bool] = True,
-        xgb_model: Optional[Union[Booster, XGBModel]] = None,
+        verbose: Optional[Union[int, bool]] = True,
+        xgb_model: Optional[Union[Booster, str, XGBModel]] = None,
         sample_weight_eval_set: Optional[Sequence[_DaskCollection]] = None,
         base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None,
         feature_weights: Optional[_DaskCollection] = None,
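
Note on these hunks: the hints document behaviour rather than change it; `client` may be `None` (the current default Dask client is looked up), `ref` takes a Dask-level matrix rather than a core `DMatrix`, and `xgb_model` also accepts a model path as `str`. A minimal sketch of the first two, assuming a `distributed` client is active:

```python
import dask.array as da
from distributed import Client, LocalCluster
from xgboost import dask as dxgb

with LocalCluster(n_workers=2) as cluster, Client(cluster) as client:
    X = da.random.random((1000, 10), chunks=(100, 10))
    y = da.random.random(1000, chunks=100)
    # `client=None` now type-checks: the current default client is used.
    Xy = dxgb.DaskQuantileDMatrix(None, X, y)
    # The validation matrix shares the training matrix's quantile cuts
    # through `ref`, which is a Dask matrix rather than a core DMatrix.
    Xy_valid = dxgb.DaskQuantileDMatrix(None, X, y, ref=Xy)
```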

View File

@@ -5,7 +5,17 @@ import ctypes
 import json
 import os
 import warnings
-from typing import Any, Callable, List, Optional, Sequence, Tuple, cast
+from typing import (
+    Any,
+    Callable,
+    List,
+    Optional,
+    Sequence,
+    Tuple,
+    TypeGuard,
+    Union,
+    cast,
+)

 import numpy as np
@@ -212,7 +222,7 @@ def is_scipy_coo(data: DataType) -> bool:
     return is_array or is_matrix


-def _is_np_array_like(data: DataType) -> bool:
+def _is_np_array_like(data: DataType) -> TypeGuard[np.ndarray]:
     return hasattr(data, "__array_interface__")
@@ -241,7 +251,7 @@ def _maybe_np_slice(data: DataType, dtype: Optional[NumpyDType]) -> np.ndarray:
 def _from_numpy_array(
-    data: DataType,
+    data: np.ndarray,
     missing: FloatCompatible,
     nthread: int,
     feature_names: Optional[FeatureNames],
@@ -266,7 +276,7 @@ def _from_numpy_array(
     return handle, feature_names, feature_types


-def _is_pandas_df(data: DataType) -> bool:
+def _is_pandas_df(data: DataType) -> TypeGuard[DataFrame]:
     try:
         import pandas as pd
     except ImportError:
@@ -1057,12 +1067,12 @@ def _from_dlpack(
     return _from_cupy_array(data, missing, nthread, feature_names, feature_types)


-def _is_uri(data: DataType) -> bool:
+def _is_uri(data: DataType) -> TypeGuard[Union[str, os.PathLike]]:
     return isinstance(data, (str, os.PathLike))


 def _from_uri(
-    data: DataType,
+    data: Union[str, os.PathLike],
     missing: Optional[FloatCompatible],
     feature_names: Optional[FeatureNames],
     feature_types: Optional[FeatureTypes],
@@ -1080,7 +1090,7 @@ def _from_uri(
     return handle, feature_names, feature_types


-def _is_list(data: DataType) -> bool:
+def _is_list(data: DataType) -> TypeGuard[list]:
     return isinstance(data, list)
@@ -1099,7 +1109,7 @@ def _from_list(
     )


-def _is_tuple(data: DataType) -> bool:
+def _is_tuple(data: DataType) -> TypeGuard[tuple]:
     return isinstance(data, tuple)
@@ -1116,7 +1126,7 @@ def _from_tuple(
     )


-def _is_iter(data: DataType) -> bool:
+def _is_iter(data: DataType) -> TypeGuard[DataIter]:
     return isinstance(data, DataIter)
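
Note on these hunks: `typing.TypeGuard`, added to the standard library in Python 3.10, is what lets these `_is_*` predicates narrow `data` from the broad `DataType` alias to a concrete type, so the matching `_from_*` functions can take precise parameter types. A self-contained sketch of the pattern (names here are illustrative):

```python
import os
from typing import Any, TypeGuard, Union


def is_uri(data: Any) -> TypeGuard[Union[str, os.PathLike]]:
    return isinstance(data, (str, os.PathLike))


def from_uri(data: Union[str, os.PathLike]) -> str:
    return os.fspath(data)


def dispatch(data: Any) -> str:
    if is_uri(data):
        # mypy narrows `data` to `str | os.PathLike` inside this branch.
        return from_uri(data)
    raise TypeError(str(type(data)))
```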

View File

@@ -6,14 +6,12 @@ change without notice.
 # pylint: disable=invalid-name,missing-function-docstring,import-error
 import gc
 import importlib.util
-import multiprocessing
 import os
 import platform
 import queue
 import socket
 import sys
 import threading
-from concurrent.futures import ThreadPoolExecutor
 from contextlib import contextmanager
 from io import StringIO
 from platform import system
@@ -46,6 +44,7 @@ from xgboost.testing.data import (
     get_digits,
     get_sparse,
     make_batches,
+    make_sparse_regression,
     memory,
 )
@@ -115,6 +114,10 @@ def no_dask() -> PytestSkip:
     return no_mod("dask")


+def no_loky() -> PytestSkip:
+    return no_mod("loky")
+
+
 def no_dask_ml() -> PytestSkip:
     if sys.platform.startswith("win"):
         return {"reason": "Unsupported platform.", "condition": True}
@@ -136,7 +139,14 @@ def no_arrow() -> PytestSkip:
 def no_modin() -> PytestSkip:
-    return no_mod("modin")
+    try:
+        import modin.pandas as md
+
+        md.DataFrame([[1, 2.0, True], [2, 3.0, False]], columns=["a", "b", "c"])
+    except ImportError:
+        return {"reason": "Failed import modin.", "condition": True}
+    return {"reason": "Failed import modin.", "condition": False}


 def no_dt() -> PytestSkip:
@@ -487,94 +497,6 @@ def _cat_sampled_from() -> strategies.SearchStrategy:
 categorical_dataset_strategy: strategies.SearchStrategy = _cat_sampled_from()

-
-# pylint: disable=too-many-locals
-@memory.cache
-def make_sparse_regression(
-    n_samples: int, n_features: int, sparsity: float, as_dense: bool
-) -> Tuple[Union[sparse.csr_matrix], np.ndarray]:
-    """Make sparse matrix.
-
-    Parameters
-    ----------
-
-    as_dense:
-
-      Return the matrix as np.ndarray with missing values filled by NaN
-
-    """
-    if not hasattr(np.random, "default_rng"):
-        rng = np.random.RandomState(1994)
-        X = sparse.random(
-            m=n_samples,
-            n=n_features,
-            density=1.0 - sparsity,
-            random_state=rng,
-            format="csr",
-        )
-        y = rng.normal(loc=0.0, scale=1.0, size=n_samples)
-        return X, y
-
-    # Use multi-thread to speed up the generation, convenient if you use this function
-    # for benchmarking.
-    n_threads = min(multiprocessing.cpu_count(), n_features)
-
-    def random_csc(t_id: int) -> sparse.csc_matrix:
-        rng = np.random.default_rng(1994 * t_id)
-        thread_size = n_features // n_threads
-        if t_id == n_threads - 1:
-            n_features_tloc = n_features - t_id * thread_size
-        else:
-            n_features_tloc = thread_size
-
-        X = sparse.random(
-            m=n_samples,
-            n=n_features_tloc,
-            density=1.0 - sparsity,
-            random_state=rng,
-        ).tocsc()
-        y = np.zeros((n_samples, 1))
-
-        for i in range(X.shape[1]):
-            size = X.indptr[i + 1] - X.indptr[i]
-            if size != 0:
-                y += X[:, i].toarray() * rng.random((n_samples, 1)) * 0.2
-
-        return X, y
-
-    futures = []
-    with ThreadPoolExecutor(max_workers=n_threads) as executor:
-        for i in range(n_threads):
-            futures.append(executor.submit(random_csc, i))
-
-    X_results = []
-    y_results = []
-    for f in futures:
-        X, y = f.result()
-        X_results.append(X)
-        y_results.append(y)
-
-    assert len(y_results) == n_threads
-
-    csr: sparse.csr_matrix = sparse.hstack(X_results, format="csr")
-    y = np.asarray(y_results)
-    y = y.reshape((y.shape[0], y.shape[1])).T
-    y = np.sum(y, axis=1)
-
-    assert csr.shape[0] == n_samples
-    assert csr.shape[1] == n_features
-    assert y.shape[0] == n_samples
-
-    if as_dense:
-        arr = csr.toarray()
-        assert arr.shape[0] == n_samples
-        assert arr.shape[1] == n_features
-        arr[arr == 0] = np.nan
-        return arr, y
-
-    return csr, y
-
-
 sparse_datasets_strategy = strategies.sampled_from(
     [
         TestDataset(

View File

@@ -1,7 +1,9 @@
 # pylint: disable=invalid-name
 """Utilities for data generation."""
+import multiprocessing
 import os
 import zipfile
+from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
 from typing import (
     TYPE_CHECKING,
@@ -523,7 +525,7 @@ def make_batches(  # pylint: disable=too-many-arguments,too-many-locals
     if use_cupy:
         import cupy  # pylint: disable=import-error

-        rng = cupy.random.RandomState(random_state)
+        rng = cupy.random.RandomState(np.uint64(random_state))
     else:
         rng = np.random.RandomState(random_state)
     for i in range(n_batches):
@@ -843,3 +845,90 @@ def run_base_margin_info(
     base_margin = X.reshape(2, 5, 2, 5)
     with pytest.raises(ValueError, match=r".*base_margin.*"):
         Xy.set_base_margin(base_margin)
+
+
+# pylint: disable=too-many-locals
+@memory.cache
+def make_sparse_regression(
+    n_samples: int, n_features: int, sparsity: float, as_dense: bool
+) -> Tuple[Union[sparse.csr_matrix], np.ndarray]:
+    """Make sparse matrix.
+
+    Parameters
+    ----------
+
+    as_dense:
+
+      Return the matrix as np.ndarray with missing values filled by NaN
+
+    """
+    if not hasattr(np.random, "default_rng"):
+        rng = np.random.RandomState(1994)
+        X = sparse.random(
+            m=n_samples,
+            n=n_features,
+            density=1.0 - sparsity,
+            random_state=rng,
+            format="csr",
+        )
+        y = rng.normal(loc=0.0, scale=1.0, size=n_samples)
+        return X, y
+
+    # Use multi-thread to speed up the generation, convenient if you use this function
+    # for benchmarking.
+    n_threads = min(multiprocessing.cpu_count(), n_features)
+
+    def random_csc(t_id: int) -> sparse.csc_matrix:
+        rng = np.random.default_rng(1994 * t_id)
+        thread_size = n_features // n_threads
+        if t_id == n_threads - 1:
+            n_features_tloc = n_features - t_id * thread_size
+        else:
+            n_features_tloc = thread_size
+
+        X = sparse.random(
+            m=n_samples,
+            n=n_features_tloc,
+            density=1.0 - sparsity,
+            random_state=rng,
+        ).tocsc()
+        y = np.zeros((n_samples, 1))
+
+        for i in range(X.shape[1]):
+            size = X.indptr[i + 1] - X.indptr[i]
+            if size != 0:
+                y += X[:, i].toarray() * rng.random((n_samples, 1)) * 0.2
+
+        return X, y
+
+    futures = []
+    with ThreadPoolExecutor(max_workers=n_threads) as executor:
+        for i in range(n_threads):
+            futures.append(executor.submit(random_csc, i))
+
+    X_results = []
+    y_results = []
+    for f in futures:
+        X, y = f.result()
+        X_results.append(X)
+        y_results.append(y)
+
+    assert len(y_results) == n_threads
+
+    csr: sparse.csr_matrix = sparse.hstack(X_results, format="csr")
+    y = np.asarray(y_results)
+    y = y.reshape((y.shape[0], y.shape[1])).T
+    y = np.sum(y, axis=1)
+
+    assert csr.shape[0] == n_samples
+    assert csr.shape[1] == n_features
+    assert y.shape[0] == n_samples
+
+    if as_dense:
+        arr = csr.toarray()
+        assert arr.shape[0] == n_samples
+        assert arr.shape[1] == n_features
+        arr[arr == 0] = np.nan
+        return arr, y
+
+    return csr, y
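
Note on this hunk: the function body is unchanged; it only moves from `xgboost.testing` into `xgboost.testing.data` (and is re-exported, per the import hunk above). A small usage sketch, assuming the new location:

```python
from xgboost.testing.data import make_sparse_regression

# A 1024 x 32 CSR matrix with roughly 60% of the entries missing.
X, y = make_sparse_regression(
    n_samples=1024, n_features=32, sparsity=0.6, as_dense=False
)
assert X.shape == (1024, 32) and y.shape == (1024,)
```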

View File

@@ -198,7 +198,7 @@ class CVPack:
     def __init__(
         self, dtrain: DMatrix, dtest: DMatrix, param: Optional[Union[Dict, List]]
     ) -> None:
-        """ "Initialize the CVPack"""
+        """Initialize the CVPack."""
         self.dtrain = dtrain
         self.dtest = dtest
         self.watchlist = [(dtrain, "train"), (dtest, "test")]
@@ -277,7 +277,7 @@ class _PackedBooster:
         self.set_attr(best_score=score)


-def groups_to_rows(groups: List[np.ndarray], boundaries: np.ndarray) -> np.ndarray:
+def groups_to_rows(groups: np.ndarray, boundaries: np.ndarray) -> np.ndarray:
     """
     Given group row boundaries, convert ground indexes to row indexes
     :param groups: list of groups for testing

View File

@@ -27,7 +27,8 @@ RUN \
         "nccl>=${NCCL_SHORT_VER}" \
         dask \
         dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
-        numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
+        numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel \
+        python-kubernetes urllib3 graphviz hypothesis loky \
         "pyspark>=3.4.0" cloudpickle cuda-python && \
     mamba clean --all --yes && \
     conda run --no-capture-output -n gpu_test pip install buildkite-test-collector

View File

@@ -30,7 +30,8 @@ RUN \
         "nccl>=${NCCL_SHORT_VER}" \
         dask \
         "dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \
-        numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
+        numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel \
+        python-kubernetes urllib3 graphviz hypothesis loky \
         "pyspark>=3.4.0" cloudpickle cuda-python && \
     mamba clean --all --yes && \
     conda run --no-capture-output -n gpu_test pip install buildkite-test-collector

View File

@@ -2,7 +2,7 @@ name: aarch64_test
 channels:
 - conda-forge
 dependencies:
-- python=3.8
+- python=3.10
 - pip
 - wheel
 - pytest
@@ -26,7 +26,7 @@ dependencies:
 - awscli
 - numba
 - llvmlite
-- cffi
+- loky
 - pyarrow
 - pyspark>=3.4.0
 - cloudpickle

View File

@@ -2,7 +2,7 @@ name: linux_cpu_test
 channels:
 - conda-forge
 dependencies:
-- python=3.8
+- python=3.10
 - cmake
 - c-compiler
 - cxx-compiler
@@ -33,7 +33,7 @@ dependencies:
 - boto3
 - awscli
 - py-ubjson
-- cffi
+- loky
 - pyarrow
 - protobuf
 - cloudpickle

View File

@@ -3,7 +3,7 @@ channels:
 - conda-forge
 - https://software.repos.intel.com/python/conda/
 dependencies:
-- python=3.8
+- python=3.10
 - cmake
 - c-compiler
 - cxx-compiler

View File

@@ -2,7 +2,7 @@ name: macos_test
 channels:
 - conda-forge
 dependencies:
-- python=3.8
+- python=3.10
 - pip
 - wheel
 - pyyaml
@@ -32,7 +32,7 @@ dependencies:
 - jsonschema
 - boto3
 - awscli
-- cffi
+- loky
 - pyarrow
 - pyspark>=3.4.0
 - cloudpickle

View File

@@ -2,11 +2,11 @@ name: python_lint
 channels:
 - conda-forge
 dependencies:
-- python=3.8
+- python=3.10
 - pylint<3.2.4  # https://github.com/pylint-dev/pylint/issues/9751
 - wheel
 - setuptools
-- mypy>=0.981
+- mypy
 - numpy
 - scipy
 - pandas

View File

@@ -3,7 +3,7 @@ name: sdist_test
 channels:
 - conda-forge
 dependencies:
-- python=3.8
+- python=3.10
 - pip
 - wheel
 - cmake

View File

@@ -2,7 +2,7 @@ name: win64_env
 channels:
 - conda-forge
 dependencies:
-- python=3.8
+- python=3.10
 - wheel
 - numpy
 - scipy
@@ -18,5 +18,5 @@ dependencies:
 - python-graphviz
 - pip
 - py-ubjson
-- cffi
+- loky
 - pyarrow

View File

@@ -2,7 +2,7 @@ name: win64_env
 channels:
 - conda-forge
 dependencies:
-- python=3.8
+- python=3.10
 - numpy
 - scipy
 - matplotlib
@@ -12,9 +12,9 @@ dependencies:
 - boto3
 - hypothesis
 - jsonschema
-- cupy
+- cupy>=13.2
 - python-graphviz
 - pip
 - py-ubjson
-- cffi
+- loky
 - pyarrow

View File

@@ -20,7 +20,7 @@ class TestQuantileDMatrix:
     def test_dmatrix_feature_weights(self) -> None:
         import cupy as cp

-        rng = cp.random.RandomState(1994)
+        rng = cp.random.RandomState(np.uint64(1994))
         data = rng.randn(5, 5)
         m = xgb.DMatrix(data)
@@ -146,7 +146,7 @@ class TestQuantileDMatrix:
     def test_metainfo(self) -> None:
         import cupy as cp

-        rng = cp.random.RandomState(1994)
+        rng = cp.random.RandomState(np.uint64(1994))
         rows = 10
         cols = 3
@@ -170,7 +170,7 @@ class TestQuantileDMatrix:
     def test_ref_dmatrix(self) -> None:
         import cupy as cp

-        rng = cp.random.RandomState(1994)
+        rng = cp.random.RandomState(np.uint64(1994))
         self.cputest.run_ref_dmatrix(rng, "gpu_hist", False)

View File

@@ -66,7 +66,7 @@ def _test_from_cupy(DMatrixT):
 def _test_cupy_training(DMatrixT):
     np.random.seed(1)
-    cp.random.seed(1)
+    cp.random.seed(np.uint64(1))
     X = cp.random.randn(50, 10, dtype="float32")
     y = cp.random.randn(50, dtype="float32")
     weights = np.random.random(50) + 1
@@ -131,7 +131,7 @@ def _test_cupy_metainfo(DMatrixT):
 @pytest.mark.skipif(**tm.no_sklearn())
 def test_cupy_training_with_sklearn():
     np.random.seed(1)
-    cp.random.seed(1)
+    cp.random.seed(np.uint64(1))
     X = cp.random.randn(50, 10, dtype="float32")
     y = (cp.random.randn(50, dtype="float32") > 0).astype("int8")
     weights = np.random.random(50) + 1
@@ -210,7 +210,7 @@ class TestFromCupy:
     @pytest.mark.skipif(**tm.no_cupy())
     def test_qid(self):
-        rng = cp.random.RandomState(1994)
+        rng = cp.random.RandomState(np.uint64(1994))
         rows = 100
         cols = 10
         X, y = rng.randn(rows, cols), rng.randn(rows)

View File

@@ -226,7 +226,7 @@ class TestGPUPredict:
         cols = 10
         missing = 11  # set to integer for testing

-        cp_rng = cp.random.RandomState(1994)
+        cp_rng = cp.random.RandomState(np.uint64(1994))
         cp.random.set_random_state(cp_rng)
         X = cp.random.randn(rows, cols)
@@ -546,7 +546,7 @@ class TestGPUPredict:
         rows = 1000
         cols = 10
-        rng = cp.random.RandomState(1994)
+        rng = cp.random.RandomState(np.uint64(1994))
         orig = rng.randint(low=0, high=127, size=rows * cols).reshape(rows, cols)
         y = rng.randint(low=0, high=127, size=rows)
         dtrain = xgb.DMatrix(orig, label=y)
@@ -576,10 +576,10 @@ class TestGPUPredict:
         # boolean
         orig = cp.random.binomial(1, 0.5, size=rows * cols).reshape(rows, cols)
         predt_orig = booster.inplace_predict(orig)
-        for dtype in [cp.bool8, cp.bool_]:
-            X = cp.array(orig, dtype=dtype)
-            predt = booster.inplace_predict(X)
-            cp.testing.assert_allclose(predt, predt_orig)
+        X = cp.array(orig, dtype=cp.bool_)
+        predt = booster.inplace_predict(X)
+        cp.testing.assert_allclose(predt, predt_orig)

         # unsupported types
         for dtype in [
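
Note on this hunk: `cp.bool8` was an alias of `cp.bool_` that CuPy dropped (the Windows GPU environment file above now pins `cupy>=13.2`, where only `cp.bool_` remains), so the loop over both names collapses to a single dtype. A minimal sketch:

```python
import cupy as cp

orig = cp.random.binomial(1, 0.5, size=20).reshape(10, 2)
# `cp.bool8` no longer exists in CuPy 13; `cp.bool_` is the supported name.
X = cp.array(orig, dtype=cp.bool_)
assert X.dtype == cp.bool_
```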

View File

@@ -425,8 +425,8 @@ class TestModels:
         np.testing.assert_allclose(merged, single, atol=1e-6)

     @pytest.mark.skipif(**tm.no_sklearn())
-    @pytest.mark.parametrize("booster", ["gbtree", "dart"])
-    def test_slice(self, booster):
+    @pytest.mark.parametrize("booster_name", ["gbtree", "dart"])
+    def test_slice(self, booster_name: str) -> None:
         from sklearn.datasets import make_classification

         num_classes = 3
@@ -442,7 +442,7 @@ class TestModels:
                 "num_parallel_tree": num_parallel_tree,
                 "subsample": 0.5,
                 "num_class": num_classes,
-                "booster": booster,
+                "booster": booster_name,
                 "objective": "multi:softprob",
             },
             num_boost_round=num_boost_round,
@@ -452,6 +452,8 @@ class TestModels:
         assert len(booster.get_dump()) == total_trees

+        assert booster[...].num_boosted_rounds() == num_boost_round
+
         self.run_slice(
             booster, dtrain, num_parallel_tree, num_classes, num_boost_round, False
         )

View File

@@ -1,44 +1,46 @@
-import multiprocessing
 import socket
 import sys
 from threading import Thread

 import numpy as np
 import pytest
+from loky import get_reusable_executor

 import xgboost as xgb
 from xgboost import RabitTracker, build_info, federated
 from xgboost import testing as tm


-def run_rabit_worker(rabit_env, world_size):
+def run_rabit_worker(rabit_env: dict, world_size: int) -> int:
     with xgb.collective.CommunicatorContext(**rabit_env):
         assert xgb.collective.get_world_size() == world_size
         assert xgb.collective.is_distributed()
         assert xgb.collective.get_processor_name() == socket.gethostname()
         ret = xgb.collective.broadcast("test1234", 0)
         assert str(ret) == "test1234"
-        ret = xgb.collective.allreduce(np.asarray([1, 2, 3]), xgb.collective.Op.SUM)
-        assert np.array_equal(ret, np.asarray([2, 4, 6]))
+        reduced = xgb.collective.allreduce(np.asarray([1, 2, 3]), xgb.collective.Op.SUM)
+        assert np.array_equal(reduced, np.asarray([2, 4, 6]))
+    return 0


+@pytest.mark.skipif(**tm.no_loky())
 def test_rabit_communicator() -> None:
     world_size = 2
     tracker = RabitTracker(host_ip="127.0.0.1", n_workers=world_size)
     tracker.start()
     workers = []
-    for _ in range(world_size):
-        worker = multiprocessing.Process(
-            target=run_rabit_worker, args=(tracker.worker_args(), world_size)
-        )
-        workers.append(worker)
-        worker.start()
-    for worker in workers:
-        worker.join()
-        assert worker.exitcode == 0
+
+    with get_reusable_executor(max_workers=world_size) as pool:
+        for _ in range(world_size):
+            worker = pool.submit(
+                run_rabit_worker, rabit_env=tracker.worker_args(), world_size=world_size
+            )
+            workers.append(worker)
+
+        for worker in workers:
+            assert worker.result() == 0


-def run_federated_worker(port: int, world_size: int, rank: int) -> None:
+def run_federated_worker(port: int, world_size: int, rank: int) -> int:
     with xgb.collective.CommunicatorContext(
         dmlc_communicator="federated",
         federated_server_address=f"localhost:{port}",
@@ -52,30 +54,28 @@ def run_federated_worker(port: int, world_size: int, rank: int) -> int:
         assert str(bret) == "test1234"
         aret = xgb.collective.allreduce(np.asarray([1, 2, 3]), xgb.collective.Op.SUM)
         assert np.array_equal(aret, np.asarray([2, 4, 6]))
+    return 0


 @pytest.mark.skipif(**tm.skip_win())
+@pytest.mark.skipif(**tm.no_loky())
 def test_federated_communicator():
     if not build_info()["USE_FEDERATED"]:
         pytest.skip("XGBoost not built with federated learning enabled")

     port = 9091
     world_size = 2
-    tracker = multiprocessing.Process(
-        target=federated.run_federated_server,
-        kwargs={"port": port, "n_workers": world_size, "blocking": False},
-    )
-    tracker.start()
-    if not tracker.is_alive():
-        raise Exception("Error starting Federated Learning server")
+    with get_reusable_executor(max_workers=world_size + 1) as pool:
+        kwargs = {"port": port, "n_workers": world_size, "blocking": False}
+        tracker = pool.submit(federated.run_federated_server, **kwargs)
+        if not tracker.running():
+            raise RuntimeError("Error starting Federated Learning server")

-    workers = []
-    for rank in range(world_size):
-        worker = multiprocessing.Process(
-            target=run_federated_worker, args=(port, world_size, rank)
-        )
-        workers.append(worker)
-        worker.start()
-    for worker in workers:
-        worker.join()
-        assert worker.exitcode == 0
+        workers = []
+        for rank in range(world_size):
+            worker = pool.submit(
+                run_federated_worker, port=port, world_size=world_size, rank=rank
+            )
+            workers.append(worker)
+        for worker in workers:
+            assert worker.result() == 0
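
Note on this hunk: `loky`'s reusable process pool replaces bare `multiprocessing.Process`, which per the commit message could deadlock; as a bonus, futures re-raise worker exceptions instead of reporting only an exit code. A minimal self-contained sketch of the pattern the tests now use:

```python
from loky import get_reusable_executor


def work(rank: int) -> int:
    return rank * 2


with get_reusable_executor(max_workers=2) as pool:
    futures = [pool.submit(work, rank=r) for r in range(2)]
    # `.result()` re-raises any exception from the worker process.
    assert [f.result() for f in futures] == [0, 2]
```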

View File

@@ -1,8 +1,9 @@
-"""Copyright 2019-2023, XGBoost contributors"""
+"""Copyright 2019-2024, XGBoost contributors"""

 import asyncio
 import json
 from collections import OrderedDict
+from copy import copy
 from inspect import signature
 from typing import Any, Dict, Type, TypeVar
@@ -53,15 +54,13 @@ except ImportError:
 def run_with_dask_dataframe(DMatrixT: Type, client: Client) -> None:
     import cupy as cp
-    import dask_cudf

     cp.cuda.runtime.setDevice(0)
     _X, _y, _ = generate_array()

-    X = dd.from_dask_array(_X)
-    y = dd.from_dask_array(_y)
-    X = X.map_partitions(cudf.from_pandas)
-    y = y.map_partitions(cudf.from_pandas)
+    X = dd.from_dask_array(_X).to_backend("cudf")
+    y = dd.from_dask_array(_y).to_backend("cudf")

     dtrain = DMatrixT(client, X, y)
     out = dxgb.train(
@@ -216,18 +215,22 @@ def test_tree_stats() -> None:
 class TestDistributedGPU:
     @pytest.mark.skipif(**tm.no_cudf())
     def test_boost_from_prediction(self, local_cuda_client: Client) -> None:
-        import cudf
+        import dask_cudf
         from sklearn.datasets import load_breast_cancer, load_iris

         X_, y_ = load_breast_cancer(return_X_y=True)

-        X = dd.from_array(X_, chunksize=100).map_partitions(cudf.from_pandas)
-        y = dd.from_array(y_, chunksize=100).map_partitions(cudf.from_pandas)
-        run_boost_from_prediction(X, y, "hist", "cuda", local_cuda_client)
+        X = dd.from_array(X_, chunksize=100).to_backend("cudf")
+        y = dd.from_array(y_, chunksize=100).to_backend("cudf")
+        divisions = copy(X.divisions)
+        run_boost_from_prediction(X, y, "hist", "cuda", local_cuda_client, divisions)

         X_, y_ = load_iris(return_X_y=True)
-        X = dd.from_array(X_, chunksize=50).map_partitions(cudf.from_pandas)
-        y = dd.from_array(y_, chunksize=50).map_partitions(cudf.from_pandas)
-        run_boost_from_prediction_multi_class(X, y, "hist", "cuda", local_cuda_client)
+        X = dd.from_array(X_, chunksize=50).to_backend("cudf")
+        y = dd.from_array(y_, chunksize=50).to_backend("cudf")
+        divisions = copy(X.divisions)
+        run_boost_from_prediction_multi_class(
+            X, y, "hist", "cuda", local_cuda_client, divisions
+        )

     def test_init_estimation(self, local_cuda_client: Client) -> None:
         check_init_estimation("hist", "cuda", local_cuda_client)
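
Note on this hunk: `DataFrame.to_backend("cudf")` is the one-call replacement for mapping `cudf.from_pandas` over every partition. A minimal sketch, assuming dask-cudf and a GPU are available:

```python
import dask.array as da
import dask.dataframe as dd

X = dd.from_dask_array(da.random.random((1000, 4), chunks=(100, 4)))
# One call converts every partition, replacing
# X.map_partitions(cudf.from_pandas).
X = X.to_backend("cudf")
```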

View File

@@ -1,4 +1,4 @@
-"""Copyright 2019-2022 XGBoost contributors"""
+"""Copyright 2019-2024, XGBoost contributors"""

 import asyncio
 import json
@@ -7,12 +7,24 @@ import pickle
 import socket
 import tempfile
 from concurrent.futures import ThreadPoolExecutor
+from copy import copy
 from functools import partial
 from itertools import starmap
 from math import ceil
 from operator import attrgetter, getitem
 from pathlib import Path
-from typing import Any, Dict, Generator, Literal, Optional, Tuple, Type, TypeVar, Union
+from typing import (
+    Any,
+    Dict,
+    Generator,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Type,
+    TypeVar,
+    Union,
+)

 import hypothesis
 import numpy as np
@@ -133,34 +145,6 @@ def generate_array(
     return X, y, None


-def deterministic_persist_per_worker(
-    df: dd.DataFrame, client: "Client"
-) -> dd.DataFrame:
-    # Got this script from https://github.com/dmlc/xgboost/issues/7927
-    # Query workers
-    n_workers = len(client.cluster.workers)
-    workers = map(attrgetter("worker_address"), client.cluster.workers.values())
-
-    # Slice data into roughly equal partitions
-    subpartition_size = ceil(df.npartitions / n_workers)
-    subpartition_divisions = range(
-        0, df.npartitions + subpartition_size, subpartition_size
-    )
-    subpartition_slices = starmap(slice, sliding_window(2, subpartition_divisions))
-    subpartitions = map(partial(getitem, df.partitions), subpartition_slices)
-
-    # Persist each subpartition on each worker
-    # Rebuild dataframe from persisted subpartitions
-    df2 = dd.concat(
-        [
-            sp.persist(workers=w, allow_other_workers=False)
-            for sp, w in zip(subpartitions, workers)
-        ]
-    )
-
-    return df2
-
-
 Margin = TypeVar("Margin", dd.DataFrame, dd.Series, None)
@@ -169,30 +153,14 @@ def deterministic_repartition(
     X: dd.DataFrame,
     y: dd.Series,
     m: Margin,
+    divisions,
 ) -> Tuple[dd.DataFrame, dd.Series, Margin]:
-    # force repartition the data to avoid non-deterministic result
-    if any(X.map_partitions(lambda x: _is_cudf_df(x)).compute()):
-        # dask_cudf seems to be doing fine for now
-        return X, y, m
-
-    X["_y"] = y
-    if m is not None:
-        if isinstance(m, dd.DataFrame):
-            m_columns = m.columns
-            X = dd.concat([X, m], join="outer", axis=1)
-        else:
-            m_columns = ["_m"]
-            X["_m"] = m
-
-    X = deterministic_persist_per_worker(X, client)
-    y = X["_y"]
-    X = X[X.columns.difference(["_y"])]
-
-    if m is not None:
-        m = X[m_columns]
-        X = X[X.columns.difference(m_columns)]
-
-    return X, y, m
+    X, y, margin = (
+        dd.repartition(X, divisions=divisions, force=True),
+        dd.repartition(y, divisions=divisions, force=True),
+        dd.repartition(m, divisions=divisions, force=True) if m is not None else None,
+    )
+    return X, y, margin


 @pytest.mark.parametrize("to_frame", [True, False])
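
Note on this hunk: capturing `X.divisions` once and forcing every collection back onto those divisions is what now makes the partitioning deterministic, replacing the per-worker persist trick deleted above. A minimal CPU-only sketch of the same idea:

```python
from copy import copy

import dask.dataframe as dd
import pandas as pd

df = dd.from_pandas(pd.DataFrame({"a": range(100)}), npartitions=4)
divisions = copy(df.divisions)
# `force=True` lets the new divisions differ from the current ones.
df = dd.repartition(df, divisions=divisions, force=True)
```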
@@ -218,10 +186,10 @@ def test_xgbclassifier_classes_type_and_value(to_frame: bool, client: "Client"):
 def test_from_dask_dataframe() -> None:
     with LocalCluster(n_workers=kWorkers, dashboard_address=":0") as cluster:
         with Client(cluster) as client:
-            X, y, _ = generate_array()
+            X_, y_, _ = generate_array()

-            X = dd.from_dask_array(X)
-            y = dd.from_dask_array(y)
+            X = dd.from_dask_array(X_)
+            y = dd.from_dask_array(y_)

             dtrain = DaskDMatrix(client, X, y)
             booster = xgb.dask.train(client, {}, dtrain, num_boost_round=2)["booster"]
@@ -456,6 +424,7 @@ def run_boost_from_prediction_multi_class(
     tree_method: str,
     device: str,
     client: "Client",
+    divisions: List[int],
 ) -> None:
     model_0 = xgb.dask.DaskXGBClassifier(
         learning_rate=0.3,
@@ -464,7 +433,7 @@ def run_boost_from_prediction_multi_class(
         max_bin=768,
         device=device,
     )
-    X, y, _ = deterministic_repartition(client, X, y, None)
+    X, y, _ = deterministic_repartition(client, X, y, None, divisions)
     model_0.fit(X=X, y=y)
     margin = xgb.dask.inplace_predict(
         client, model_0.get_booster(), X, predict_type="margin"
@@ -478,7 +447,7 @@ def run_boost_from_prediction_multi_class(
         max_bin=768,
         device=device,
     )
-    X, y, margin = deterministic_repartition(client, X, y, margin)
+    X, y, margin = deterministic_repartition(client, X, y, margin, divisions)
     model_1.fit(X=X, y=y, base_margin=margin)
     predictions_1 = xgb.dask.predict(
         client,
@@ -494,7 +463,7 @@ def run_boost_from_prediction_multi_class(
         max_bin=768,
         device=device,
     )
-    X, y, _ = deterministic_repartition(client, X, y, None)
+    X, y, _ = deterministic_repartition(client, X, y, None, divisions)
     model_2.fit(X=X, y=y)
     predictions_2 = xgb.dask.inplace_predict(
         client, model_2.get_booster(), X, predict_type="margin"
@@ -517,6 +486,7 @@ def run_boost_from_prediction(
     tree_method: str,
     device: str,
     client: "Client",
+    divisions: List[int],
 ) -> None:
     X, y = client.persist([X, y])
@@ -527,7 +497,7 @@ def run_boost_from_prediction(
         max_bin=512,
         device=device,
     )
-    X, y, _ = deterministic_repartition(client, X, y, None)
+    X, y, _ = deterministic_repartition(client, X, y, None, divisions)
     model_0.fit(X=X, y=y)
     margin: dd.Series = model_0.predict(X, output_margin=True)
@@ -538,9 +508,9 @@ def run_boost_from_prediction(
         max_bin=512,
         device=device,
     )
-    X, y, margin = deterministic_repartition(client, X, y, margin)
+    X, y, margin = deterministic_repartition(client, X, y, margin, divisions)
     model_1.fit(X=X, y=y, base_margin=margin)
-    X, y, margin = deterministic_repartition(client, X, y, margin)
+    X, y, margin = deterministic_repartition(client, X, y, margin, divisions)
     predictions_1: dd.Series = model_1.predict(X, base_margin=margin)

     model_2 = xgb.dask.DaskXGBClassifier(
@@ -550,7 +520,7 @@ def run_boost_from_prediction(
         max_bin=512,
         device=device,
     )
-    X, y, _ = deterministic_repartition(client, X, y, None)
+    X, y, _ = deterministic_repartition(client, X, y, None, divisions)
     model_2.fit(X=X, y=y)
     predictions_2: dd.Series = model_2.predict(X)
@@ -563,13 +533,13 @@ def run_boost_from_prediction(
     np.testing.assert_allclose(predt_1, predt_2, atol=1e-5)

     margined = xgb.dask.DaskXGBClassifier(n_estimators=4)
-    X, y, margin = deterministic_repartition(client, X, y, margin)
+    X, y, margin = deterministic_repartition(client, X, y, margin, divisions)
     margined.fit(
         X=X, y=y, base_margin=margin, eval_set=[(X, y)], base_margin_eval_set=[margin]
     )

     unmargined = xgb.dask.DaskXGBClassifier(n_estimators=4)
-    X, y, margin = deterministic_repartition(client, X, y, margin)
+    X, y, margin = deterministic_repartition(client, X, y, margin, divisions)
     unmargined.fit(X=X, y=y, eval_set=[(X, y)], base_margin=margin)

     margined_res = margined.evals_result()["validation_0"]["logloss"]
@@ -587,11 +557,13 @@ def test_boost_from_prediction(tree_method: str, client: "Client") -> None:
     X_, y_ = load_breast_cancer(return_X_y=True)
     X, y = dd.from_array(X_, chunksize=200), dd.from_array(y_, chunksize=200)
-    run_boost_from_prediction(X, y, tree_method, "cpu", client)
+    divisions = copy(X.divisions)
+    run_boost_from_prediction(X, y, tree_method, "cpu", client, divisions)

     X_, y_ = load_digits(return_X_y=True)
     X, y = dd.from_array(X_, chunksize=100), dd.from_array(y_, chunksize=100)
-    run_boost_from_prediction_multi_class(X, y, tree_method, "cpu", client)
+    divisions = copy(X.divisions)
+    run_boost_from_prediction_multi_class(X, y, tree_method, "cpu", client, divisions)


 def test_inplace_predict(client: "Client") -> None:
@@ -1594,7 +1566,7 @@ class TestWithDask:
     def test_empty_quantile_dmatrix(self, client: Client) -> None:
         X, y = make_categorical(client, 2, 30, 13)
         X_valid, y_valid = make_categorical(client, 10000, 30, 13)
-        X_valid, y_valid, _ = deterministic_repartition(client, X_valid, y_valid, None)
+        divisions = copy(X_valid.divisions)

         Xy = xgb.dask.DaskQuantileDMatrix(client, X, y, enable_categorical=True)
         Xy_valid = xgb.dask.DaskQuantileDMatrix(