[breaking] Bump Python requirement to 3.10. (#10434)
- Bump the Python requirement. - Fix type hints. - Use loky to avoid deadlock. - Work around a cupy-numpy compatibility issue on Windows caused by the `safe` casting rule. - Simplify the repartitioning logic to avoid dask errors.
This commit is contained in:
@@ -27,7 +27,8 @@ RUN \
|
||||
"nccl>=${NCCL_SHORT_VER}" \
|
||||
dask \
|
||||
dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
|
||||
numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
|
||||
numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel \
|
||||
python-kubernetes urllib3 graphviz hypothesis loky \
|
||||
"pyspark>=3.4.0" cloudpickle cuda-python && \
|
||||
mamba clean --all --yes && \
|
||||
conda run --no-capture-output -n gpu_test pip install buildkite-test-collector
|
||||
|
||||
@@ -30,7 +30,8 @@ RUN \
|
||||
"nccl>=${NCCL_SHORT_VER}" \
|
||||
dask \
|
||||
"dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \
|
||||
numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
|
||||
numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel \
|
||||
python-kubernetes urllib3 graphviz hypothesis loky \
|
||||
"pyspark>=3.4.0" cloudpickle cuda-python && \
|
||||
mamba clean --all --yes && \
|
||||
conda run --no-capture-output -n gpu_test pip install buildkite-test-collector
|
||||
|
||||
@@ -2,7 +2,7 @@ name: aarch64_test
|
||||
channels:
|
||||
- conda-forge
|
||||
dependencies:
|
||||
- python=3.8
|
||||
- python=3.10
|
||||
- pip
|
||||
- wheel
|
||||
- pytest
|
||||
@@ -26,7 +26,7 @@ dependencies:
|
||||
- awscli
|
||||
- numba
|
||||
- llvmlite
|
||||
- cffi
|
||||
- loky
|
||||
- pyarrow
|
||||
- pyspark>=3.4.0
|
||||
- cloudpickle
|
||||
|
||||
@@ -2,7 +2,7 @@ name: linux_cpu_test
|
||||
channels:
|
||||
- conda-forge
|
||||
dependencies:
|
||||
- python=3.8
|
||||
- python=3.10
|
||||
- cmake
|
||||
- c-compiler
|
||||
- cxx-compiler
|
||||
@@ -33,7 +33,7 @@ dependencies:
|
||||
- boto3
|
||||
- awscli
|
||||
- py-ubjson
|
||||
- cffi
|
||||
- loky
|
||||
- pyarrow
|
||||
- protobuf
|
||||
- cloudpickle
|
||||
|
||||
@@ -3,7 +3,7 @@ channels:
|
||||
- conda-forge
|
||||
- https://software.repos.intel.com/python/conda/
|
||||
dependencies:
|
||||
- python=3.8
|
||||
- python=3.10
|
||||
- cmake
|
||||
- c-compiler
|
||||
- cxx-compiler
|
||||
|
||||
@@ -2,7 +2,7 @@ name: macos_test
|
||||
channels:
|
||||
- conda-forge
|
||||
dependencies:
|
||||
- python=3.8
|
||||
- python=3.10
|
||||
- pip
|
||||
- wheel
|
||||
- pyyaml
|
||||
@@ -32,7 +32,7 @@ dependencies:
|
||||
- jsonschema
|
||||
- boto3
|
||||
- awscli
|
||||
- cffi
|
||||
- loky
|
||||
- pyarrow
|
||||
- pyspark>=3.4.0
|
||||
- cloudpickle
|
||||
|
||||
@@ -2,11 +2,11 @@ name: python_lint
|
||||
channels:
|
||||
- conda-forge
|
||||
dependencies:
|
||||
- python=3.8
|
||||
- python=3.10
|
||||
- pylint<3.2.4 # https://github.com/pylint-dev/pylint/issues/9751
|
||||
- wheel
|
||||
- setuptools
|
||||
- mypy>=0.981
|
||||
- mypy
|
||||
- numpy
|
||||
- scipy
|
||||
- pandas
|
||||
|
||||
@@ -3,7 +3,7 @@ name: sdist_test
|
||||
channels:
|
||||
- conda-forge
|
||||
dependencies:
|
||||
- python=3.8
|
||||
- python=3.10
|
||||
- pip
|
||||
- wheel
|
||||
- cmake
|
||||
|
||||
@@ -2,7 +2,7 @@ name: win64_env
|
||||
channels:
|
||||
- conda-forge
|
||||
dependencies:
|
||||
- python=3.8
|
||||
- python=3.10
|
||||
- wheel
|
||||
- numpy
|
||||
- scipy
|
||||
@@ -18,5 +18,5 @@ dependencies:
|
||||
- python-graphviz
|
||||
- pip
|
||||
- py-ubjson
|
||||
- cffi
|
||||
- loky
|
||||
- pyarrow
|
||||
|
||||
@@ -2,7 +2,7 @@ name: win64_env
|
||||
channels:
|
||||
- conda-forge
|
||||
dependencies:
|
||||
- python=3.8
|
||||
- python=3.10
|
||||
- numpy
|
||||
- scipy
|
||||
- matplotlib
|
||||
@@ -12,9 +12,9 @@ dependencies:
|
||||
- boto3
|
||||
- hypothesis
|
||||
- jsonschema
|
||||
- cupy
|
||||
- cupy>=13.2
|
||||
- python-graphviz
|
||||
- pip
|
||||
- py-ubjson
|
||||
- cffi
|
||||
- loky
|
||||
- pyarrow
|
||||
|
||||
@@ -20,7 +20,7 @@ class TestQuantileDMatrix:
|
||||
def test_dmatrix_feature_weights(self) -> None:
|
||||
import cupy as cp
|
||||
|
||||
rng = cp.random.RandomState(1994)
|
||||
rng = cp.random.RandomState(np.uint64(1994))
|
||||
data = rng.randn(5, 5)
|
||||
m = xgb.DMatrix(data)
|
||||
|
||||
@@ -146,7 +146,7 @@ class TestQuantileDMatrix:
|
||||
def test_metainfo(self) -> None:
|
||||
import cupy as cp
|
||||
|
||||
rng = cp.random.RandomState(1994)
|
||||
rng = cp.random.RandomState(np.uint64(1994))
|
||||
|
||||
rows = 10
|
||||
cols = 3
|
||||
@@ -170,7 +170,7 @@ class TestQuantileDMatrix:
|
||||
def test_ref_dmatrix(self) -> None:
|
||||
import cupy as cp
|
||||
|
||||
rng = cp.random.RandomState(1994)
|
||||
rng = cp.random.RandomState(np.uint64(1994))
|
||||
self.cputest.run_ref_dmatrix(rng, "gpu_hist", False)
|
||||
|
||||
@given(
|
||||
|
||||
@@ -66,7 +66,7 @@ def _test_from_cupy(DMatrixT):
|
||||
|
||||
def _test_cupy_training(DMatrixT):
|
||||
np.random.seed(1)
|
||||
cp.random.seed(1)
|
||||
cp.random.seed(np.uint64(1))
|
||||
X = cp.random.randn(50, 10, dtype="float32")
|
||||
y = cp.random.randn(50, dtype="float32")
|
||||
weights = np.random.random(50) + 1
|
||||
@@ -131,7 +131,7 @@ def _test_cupy_metainfo(DMatrixT):
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
|
||||
def test_cupy_training_with_sklearn():
|
||||
np.random.seed(1)
|
||||
cp.random.seed(1)
|
||||
cp.random.seed(np.uint64(1))
|
||||
X = cp.random.randn(50, 10, dtype="float32")
|
||||
y = (cp.random.randn(50, dtype="float32") > 0).astype("int8")
|
||||
weights = np.random.random(50) + 1
|
||||
@@ -210,7 +210,7 @@ class TestFromCupy:
|
||||
|
||||
@pytest.mark.skipif(**tm.no_cupy())
|
||||
def test_qid(self):
|
||||
rng = cp.random.RandomState(1994)
|
||||
rng = cp.random.RandomState(np.uint64(1994))
|
||||
rows = 100
|
||||
cols = 10
|
||||
X, y = rng.randn(rows, cols), rng.randn(rows)
|
||||
|
||||
@@ -226,7 +226,7 @@ class TestGPUPredict:
|
||||
cols = 10
|
||||
missing = 11 # set to integer for testing
|
||||
|
||||
cp_rng = cp.random.RandomState(1994)
|
||||
cp_rng = cp.random.RandomState(np.uint64(1994))
|
||||
cp.random.set_random_state(cp_rng)
|
||||
|
||||
X = cp.random.randn(rows, cols)
|
||||
@@ -546,7 +546,7 @@ class TestGPUPredict:
|
||||
|
||||
rows = 1000
|
||||
cols = 10
|
||||
rng = cp.random.RandomState(1994)
|
||||
rng = cp.random.RandomState(np.uint64(1994))
|
||||
orig = rng.randint(low=0, high=127, size=rows * cols).reshape(rows, cols)
|
||||
y = rng.randint(low=0, high=127, size=rows)
|
||||
dtrain = xgb.DMatrix(orig, label=y)
|
||||
@@ -576,10 +576,10 @@ class TestGPUPredict:
|
||||
# boolean
|
||||
orig = cp.random.binomial(1, 0.5, size=rows * cols).reshape(rows, cols)
|
||||
predt_orig = booster.inplace_predict(orig)
|
||||
for dtype in [cp.bool8, cp.bool_]:
|
||||
X = cp.array(orig, dtype=dtype)
|
||||
predt = booster.inplace_predict(X)
|
||||
cp.testing.assert_allclose(predt, predt_orig)
|
||||
|
||||
X = cp.array(orig, dtype=cp.bool_)
|
||||
predt = booster.inplace_predict(X)
|
||||
cp.testing.assert_allclose(predt, predt_orig)
|
||||
|
||||
# unsupported types
|
||||
for dtype in [
|
||||
|
||||
@@ -425,8 +425,8 @@ class TestModels:
|
||||
np.testing.assert_allclose(merged, single, atol=1e-6)
|
||||
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
|
||||
@pytest.mark.parametrize("booster", ["gbtree", "dart"])
|
||||
def test_slice(self, booster):
|
||||
@pytest.mark.parametrize("booster_name", ["gbtree", "dart"])
|
||||
def test_slice(self, booster_name: str) -> None:
|
||||
from sklearn.datasets import make_classification
|
||||
|
||||
num_classes = 3
|
||||
@@ -442,7 +442,7 @@ class TestModels:
|
||||
"num_parallel_tree": num_parallel_tree,
|
||||
"subsample": 0.5,
|
||||
"num_class": num_classes,
|
||||
"booster": booster,
|
||||
"booster": booster_name,
|
||||
"objective": "multi:softprob",
|
||||
},
|
||||
num_boost_round=num_boost_round,
|
||||
@@ -452,6 +452,8 @@ class TestModels:
|
||||
|
||||
assert len(booster.get_dump()) == total_trees
|
||||
|
||||
assert booster[...].num_boosted_rounds() == num_boost_round
|
||||
|
||||
self.run_slice(
|
||||
booster, dtrain, num_parallel_tree, num_classes, num_boost_round, False
|
||||
)
|
||||
|
||||
@@ -1,44 +1,46 @@
|
||||
import multiprocessing
|
||||
import socket
|
||||
import sys
|
||||
from threading import Thread
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from loky import get_reusable_executor
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import RabitTracker, build_info, federated
|
||||
from xgboost import testing as tm
|
||||
|
||||
|
||||
def run_rabit_worker(rabit_env, world_size):
|
||||
def run_rabit_worker(rabit_env: dict, world_size: int) -> int:
|
||||
with xgb.collective.CommunicatorContext(**rabit_env):
|
||||
assert xgb.collective.get_world_size() == world_size
|
||||
assert xgb.collective.is_distributed()
|
||||
assert xgb.collective.get_processor_name() == socket.gethostname()
|
||||
ret = xgb.collective.broadcast("test1234", 0)
|
||||
assert str(ret) == "test1234"
|
||||
ret = xgb.collective.allreduce(np.asarray([1, 2, 3]), xgb.collective.Op.SUM)
|
||||
assert np.array_equal(ret, np.asarray([2, 4, 6]))
|
||||
reduced = xgb.collective.allreduce(np.asarray([1, 2, 3]), xgb.collective.Op.SUM)
|
||||
assert np.array_equal(reduced, np.asarray([2, 4, 6]))
|
||||
return 0
|
||||
|
||||
|
||||
@pytest.mark.skipif(**tm.no_loky())
|
||||
def test_rabit_communicator() -> None:
|
||||
world_size = 2
|
||||
tracker = RabitTracker(host_ip="127.0.0.1", n_workers=world_size)
|
||||
tracker.start()
|
||||
workers = []
|
||||
for _ in range(world_size):
|
||||
worker = multiprocessing.Process(
|
||||
target=run_rabit_worker, args=(tracker.worker_args(), world_size)
|
||||
)
|
||||
workers.append(worker)
|
||||
worker.start()
|
||||
for worker in workers:
|
||||
worker.join()
|
||||
assert worker.exitcode == 0
|
||||
with get_reusable_executor(max_workers=world_size) as pool:
|
||||
for _ in range(world_size):
|
||||
worker = pool.submit(
|
||||
run_rabit_worker, rabit_env=tracker.worker_args(), world_size=world_size
|
||||
)
|
||||
workers.append(worker)
|
||||
|
||||
for worker in workers:
|
||||
assert worker.result() == 0
|
||||
|
||||
|
||||
def run_federated_worker(port: int, world_size: int, rank: int) -> None:
|
||||
def run_federated_worker(port: int, world_size: int, rank: int) -> int:
|
||||
with xgb.collective.CommunicatorContext(
|
||||
dmlc_communicator="federated",
|
||||
federated_server_address=f"localhost:{port}",
|
||||
@@ -52,30 +54,28 @@ def run_federated_worker(port: int, world_size: int, rank: int) -> None:
|
||||
assert str(bret) == "test1234"
|
||||
aret = xgb.collective.allreduce(np.asarray([1, 2, 3]), xgb.collective.Op.SUM)
|
||||
assert np.array_equal(aret, np.asarray([2, 4, 6]))
|
||||
return 0
|
||||
|
||||
|
||||
@pytest.mark.skipif(**tm.skip_win())
|
||||
@pytest.mark.skipif(**tm.no_loky())
|
||||
def test_federated_communicator():
|
||||
if not build_info()["USE_FEDERATED"]:
|
||||
pytest.skip("XGBoost not built with federated learning enabled")
|
||||
|
||||
port = 9091
|
||||
world_size = 2
|
||||
tracker = multiprocessing.Process(
|
||||
target=federated.run_federated_server,
|
||||
kwargs={"port": port, "n_workers": world_size, "blocking": False},
|
||||
)
|
||||
tracker.start()
|
||||
if not tracker.is_alive():
|
||||
raise Exception("Error starting Federated Learning server")
|
||||
with get_reusable_executor(max_workers=world_size+1) as pool:
|
||||
kwargs={"port": port, "n_workers": world_size, "blocking": False}
|
||||
tracker = pool.submit(federated.run_federated_server, **kwargs)
|
||||
if not tracker.running():
|
||||
raise RuntimeError("Error starting Federated Learning server")
|
||||
|
||||
workers = []
|
||||
for rank in range(world_size):
|
||||
worker = multiprocessing.Process(
|
||||
target=run_federated_worker, args=(port, world_size, rank)
|
||||
)
|
||||
workers.append(worker)
|
||||
worker.start()
|
||||
for worker in workers:
|
||||
worker.join()
|
||||
assert worker.exitcode == 0
|
||||
workers = []
|
||||
for rank in range(world_size):
|
||||
worker = pool.submit(
|
||||
run_federated_worker, port=port, world_size=world_size, rank=rank
|
||||
)
|
||||
workers.append(worker)
|
||||
for worker in workers:
|
||||
assert worker.result() == 0
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
"""Copyright 2019-2023, XGBoost contributors"""
|
||||
"""Copyright 2019-2024, XGBoost contributors"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from collections import OrderedDict
|
||||
from copy import copy
|
||||
from inspect import signature
|
||||
from typing import Any, Dict, Type, TypeVar
|
||||
|
||||
@@ -53,15 +54,13 @@ except ImportError:
|
||||
|
||||
def run_with_dask_dataframe(DMatrixT: Type, client: Client) -> None:
|
||||
import cupy as cp
|
||||
import dask_cudf
|
||||
|
||||
cp.cuda.runtime.setDevice(0)
|
||||
_X, _y, _ = generate_array()
|
||||
|
||||
X = dd.from_dask_array(_X)
|
||||
y = dd.from_dask_array(_y)
|
||||
|
||||
X = X.map_partitions(cudf.from_pandas)
|
||||
y = y.map_partitions(cudf.from_pandas)
|
||||
X = dd.from_dask_array(_X).to_backend("cudf")
|
||||
y = dd.from_dask_array(_y).to_backend("cudf")
|
||||
|
||||
dtrain = DMatrixT(client, X, y)
|
||||
out = dxgb.train(
|
||||
@@ -216,18 +215,22 @@ def test_tree_stats() -> None:
|
||||
class TestDistributedGPU:
|
||||
@pytest.mark.skipif(**tm.no_cudf())
|
||||
def test_boost_from_prediction(self, local_cuda_client: Client) -> None:
|
||||
import cudf
|
||||
import dask_cudf
|
||||
from sklearn.datasets import load_breast_cancer, load_iris
|
||||
|
||||
X_, y_ = load_breast_cancer(return_X_y=True)
|
||||
X = dd.from_array(X_, chunksize=100).map_partitions(cudf.from_pandas)
|
||||
y = dd.from_array(y_, chunksize=100).map_partitions(cudf.from_pandas)
|
||||
run_boost_from_prediction(X, y, "hist", "cuda", local_cuda_client)
|
||||
X = dd.from_array(X_, chunksize=100).to_backend("cudf")
|
||||
y = dd.from_array(y_, chunksize=100).to_backend("cudf")
|
||||
divisions = copy(X.divisions)
|
||||
run_boost_from_prediction(X, y, "hist", "cuda", local_cuda_client, divisions)
|
||||
|
||||
X_, y_ = load_iris(return_X_y=True)
|
||||
X = dd.from_array(X_, chunksize=50).map_partitions(cudf.from_pandas)
|
||||
y = dd.from_array(y_, chunksize=50).map_partitions(cudf.from_pandas)
|
||||
run_boost_from_prediction_multi_class(X, y, "hist", "cuda", local_cuda_client)
|
||||
X = dd.from_array(X_, chunksize=50).to_backend("cudf")
|
||||
y = dd.from_array(y_, chunksize=50).to_backend("cudf")
|
||||
divisions = copy(X.divisions)
|
||||
run_boost_from_prediction_multi_class(
|
||||
X, y, "hist", "cuda", local_cuda_client, divisions
|
||||
)
|
||||
|
||||
def test_init_estimation(self, local_cuda_client: Client) -> None:
|
||||
check_init_estimation("hist", "cuda", local_cuda_client)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
"""Copyright 2019-2022 XGBoost contributors"""
|
||||
"""Copyright 2019-2024, XGBoost contributors"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
@@ -7,12 +7,24 @@ import pickle
|
||||
import socket
|
||||
import tempfile
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from copy import copy
|
||||
from functools import partial
|
||||
from itertools import starmap
|
||||
from math import ceil
|
||||
from operator import attrgetter, getitem
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Generator, Literal, Optional, Tuple, Type, TypeVar, Union
|
||||
from typing import (
|
||||
Any,
|
||||
Dict,
|
||||
Generator,
|
||||
List,
|
||||
Literal,
|
||||
Optional,
|
||||
Tuple,
|
||||
Type,
|
||||
TypeVar,
|
||||
Union,
|
||||
)
|
||||
|
||||
import hypothesis
|
||||
import numpy as np
|
||||
@@ -133,34 +145,6 @@ def generate_array(
|
||||
return X, y, None
|
||||
|
||||
|
||||
def deterministic_persist_per_worker(
|
||||
df: dd.DataFrame, client: "Client"
|
||||
) -> dd.DataFrame:
|
||||
# Got this script from https://github.com/dmlc/xgboost/issues/7927
|
||||
# Query workers
|
||||
n_workers = len(client.cluster.workers)
|
||||
workers = map(attrgetter("worker_address"), client.cluster.workers.values())
|
||||
|
||||
# Slice data into roughly equal partitions
|
||||
subpartition_size = ceil(df.npartitions / n_workers)
|
||||
subpartition_divisions = range(
|
||||
0, df.npartitions + subpartition_size, subpartition_size
|
||||
)
|
||||
subpartition_slices = starmap(slice, sliding_window(2, subpartition_divisions))
|
||||
subpartitions = map(partial(getitem, df.partitions), subpartition_slices)
|
||||
|
||||
# Persist each subpartition on each worker
|
||||
# Rebuild dataframe from persisted subpartitions
|
||||
df2 = dd.concat(
|
||||
[
|
||||
sp.persist(workers=w, allow_other_workers=False)
|
||||
for sp, w in zip(subpartitions, workers)
|
||||
]
|
||||
)
|
||||
|
||||
return df2
|
||||
|
||||
|
||||
Margin = TypeVar("Margin", dd.DataFrame, dd.Series, None)
|
||||
|
||||
|
||||
@@ -169,30 +153,14 @@ def deterministic_repartition(
|
||||
X: dd.DataFrame,
|
||||
y: dd.Series,
|
||||
m: Margin,
|
||||
divisions,
|
||||
) -> Tuple[dd.DataFrame, dd.Series, Margin]:
|
||||
# force repartition the data to avoid non-deterministic result
|
||||
if any(X.map_partitions(lambda x: _is_cudf_df(x)).compute()):
|
||||
# dask_cudf seems to be doing fine for now
|
||||
return X, y, m
|
||||
|
||||
X["_y"] = y
|
||||
if m is not None:
|
||||
if isinstance(m, dd.DataFrame):
|
||||
m_columns = m.columns
|
||||
X = dd.concat([X, m], join="outer", axis=1)
|
||||
else:
|
||||
m_columns = ["_m"]
|
||||
X["_m"] = m
|
||||
|
||||
X = deterministic_persist_per_worker(X, client)
|
||||
|
||||
y = X["_y"]
|
||||
X = X[X.columns.difference(["_y"])]
|
||||
if m is not None:
|
||||
m = X[m_columns]
|
||||
X = X[X.columns.difference(m_columns)]
|
||||
|
||||
return X, y, m
|
||||
X, y, margin = (
|
||||
dd.repartition(X, divisions=divisions, force=True),
|
||||
dd.repartition(y, divisions=divisions, force=True),
|
||||
dd.repartition(m, divisions=divisions, force=True) if m is not None else None,
|
||||
)
|
||||
return X, y, margin
|
||||
|
||||
|
||||
@pytest.mark.parametrize("to_frame", [True, False])
|
||||
@@ -218,10 +186,10 @@ def test_xgbclassifier_classes_type_and_value(to_frame: bool, client: "Client"):
|
||||
def test_from_dask_dataframe() -> None:
|
||||
with LocalCluster(n_workers=kWorkers, dashboard_address=":0") as cluster:
|
||||
with Client(cluster) as client:
|
||||
X, y, _ = generate_array()
|
||||
X_, y_, _ = generate_array()
|
||||
|
||||
X = dd.from_dask_array(X)
|
||||
y = dd.from_dask_array(y)
|
||||
X = dd.from_dask_array(X_)
|
||||
y = dd.from_dask_array(y_)
|
||||
|
||||
dtrain = DaskDMatrix(client, X, y)
|
||||
booster = xgb.dask.train(client, {}, dtrain, num_boost_round=2)["booster"]
|
||||
@@ -456,6 +424,7 @@ def run_boost_from_prediction_multi_class(
|
||||
tree_method: str,
|
||||
device: str,
|
||||
client: "Client",
|
||||
divisions: List[int],
|
||||
) -> None:
|
||||
model_0 = xgb.dask.DaskXGBClassifier(
|
||||
learning_rate=0.3,
|
||||
@@ -464,7 +433,7 @@ def run_boost_from_prediction_multi_class(
|
||||
max_bin=768,
|
||||
device=device,
|
||||
)
|
||||
X, y, _ = deterministic_repartition(client, X, y, None)
|
||||
X, y, _ = deterministic_repartition(client, X, y, None, divisions)
|
||||
model_0.fit(X=X, y=y)
|
||||
margin = xgb.dask.inplace_predict(
|
||||
client, model_0.get_booster(), X, predict_type="margin"
|
||||
@@ -478,7 +447,7 @@ def run_boost_from_prediction_multi_class(
|
||||
max_bin=768,
|
||||
device=device,
|
||||
)
|
||||
X, y, margin = deterministic_repartition(client, X, y, margin)
|
||||
X, y, margin = deterministic_repartition(client, X, y, margin, divisions)
|
||||
model_1.fit(X=X, y=y, base_margin=margin)
|
||||
predictions_1 = xgb.dask.predict(
|
||||
client,
|
||||
@@ -494,7 +463,7 @@ def run_boost_from_prediction_multi_class(
|
||||
max_bin=768,
|
||||
device=device,
|
||||
)
|
||||
X, y, _ = deterministic_repartition(client, X, y, None)
|
||||
X, y, _ = deterministic_repartition(client, X, y, None, divisions)
|
||||
model_2.fit(X=X, y=y)
|
||||
predictions_2 = xgb.dask.inplace_predict(
|
||||
client, model_2.get_booster(), X, predict_type="margin"
|
||||
@@ -517,6 +486,7 @@ def run_boost_from_prediction(
|
||||
tree_method: str,
|
||||
device: str,
|
||||
client: "Client",
|
||||
divisions: List[int],
|
||||
) -> None:
|
||||
X, y = client.persist([X, y])
|
||||
|
||||
@@ -527,7 +497,7 @@ def run_boost_from_prediction(
|
||||
max_bin=512,
|
||||
device=device,
|
||||
)
|
||||
X, y, _ = deterministic_repartition(client, X, y, None)
|
||||
X, y, _ = deterministic_repartition(client, X, y, None, divisions)
|
||||
model_0.fit(X=X, y=y)
|
||||
margin: dd.Series = model_0.predict(X, output_margin=True)
|
||||
|
||||
@@ -538,9 +508,9 @@ def run_boost_from_prediction(
|
||||
max_bin=512,
|
||||
device=device,
|
||||
)
|
||||
X, y, margin = deterministic_repartition(client, X, y, margin)
|
||||
X, y, margin = deterministic_repartition(client, X, y, margin, divisions)
|
||||
model_1.fit(X=X, y=y, base_margin=margin)
|
||||
X, y, margin = deterministic_repartition(client, X, y, margin)
|
||||
X, y, margin = deterministic_repartition(client, X, y, margin, divisions)
|
||||
predictions_1: dd.Series = model_1.predict(X, base_margin=margin)
|
||||
|
||||
model_2 = xgb.dask.DaskXGBClassifier(
|
||||
@@ -550,7 +520,7 @@ def run_boost_from_prediction(
|
||||
max_bin=512,
|
||||
device=device,
|
||||
)
|
||||
X, y, _ = deterministic_repartition(client, X, y, None)
|
||||
X, y, _ = deterministic_repartition(client, X, y, None, divisions)
|
||||
model_2.fit(X=X, y=y)
|
||||
predictions_2: dd.Series = model_2.predict(X)
|
||||
|
||||
@@ -563,13 +533,13 @@ def run_boost_from_prediction(
|
||||
np.testing.assert_allclose(predt_1, predt_2, atol=1e-5)
|
||||
|
||||
margined = xgb.dask.DaskXGBClassifier(n_estimators=4)
|
||||
X, y, margin = deterministic_repartition(client, X, y, margin)
|
||||
X, y, margin = deterministic_repartition(client, X, y, margin, divisions)
|
||||
margined.fit(
|
||||
X=X, y=y, base_margin=margin, eval_set=[(X, y)], base_margin_eval_set=[margin]
|
||||
)
|
||||
|
||||
unmargined = xgb.dask.DaskXGBClassifier(n_estimators=4)
|
||||
X, y, margin = deterministic_repartition(client, X, y, margin)
|
||||
X, y, margin = deterministic_repartition(client, X, y, margin, divisions)
|
||||
unmargined.fit(X=X, y=y, eval_set=[(X, y)], base_margin=margin)
|
||||
|
||||
margined_res = margined.evals_result()["validation_0"]["logloss"]
|
||||
@@ -587,11 +557,13 @@ def test_boost_from_prediction(tree_method: str, client: "Client") -> None:
|
||||
|
||||
X_, y_ = load_breast_cancer(return_X_y=True)
|
||||
X, y = dd.from_array(X_, chunksize=200), dd.from_array(y_, chunksize=200)
|
||||
run_boost_from_prediction(X, y, tree_method, "cpu", client)
|
||||
divisions = copy(X.divisions)
|
||||
run_boost_from_prediction(X, y, tree_method, "cpu", client, divisions)
|
||||
|
||||
X_, y_ = load_digits(return_X_y=True)
|
||||
X, y = dd.from_array(X_, chunksize=100), dd.from_array(y_, chunksize=100)
|
||||
run_boost_from_prediction_multi_class(X, y, tree_method, "cpu", client)
|
||||
divisions = copy(X.divisions)
|
||||
run_boost_from_prediction_multi_class(X, y, tree_method, "cpu", client, divisions)
|
||||
|
||||
|
||||
def test_inplace_predict(client: "Client") -> None:
|
||||
@@ -1594,7 +1566,7 @@ class TestWithDask:
|
||||
def test_empty_quantile_dmatrix(self, client: Client) -> None:
|
||||
X, y = make_categorical(client, 2, 30, 13)
|
||||
X_valid, y_valid = make_categorical(client, 10000, 30, 13)
|
||||
X_valid, y_valid, _ = deterministic_repartition(client, X_valid, y_valid, None)
|
||||
divisions = copy(X_valid.divisions)
|
||||
|
||||
Xy = xgb.dask.DaskQuantileDMatrix(client, X, y, enable_categorical=True)
|
||||
Xy_valid = xgb.dask.DaskQuantileDMatrix(
|
||||
|
||||
Reference in New Issue
Block a user