[dask] Use distributed.MultiLock (#6743)
* [dask] Use `distributed.MultiLock`. This enables training multiple models in parallel.
* Conditionally import `MultiLock`.
* Use async train directly in the scikit-learn interface.
* Use `worker_client` when available.
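For orientation, the snippet below sketches the usage pattern these changes are meant to unlock: several `fit` calls submitted from one client and trained concurrently, with `distributed.MultiLock` probed up front the same way the new tests do. It is an illustrative sketch rather than code from this commit; the dataset, chunk size, and `n_models` count are arbitrary choices for the example.

import dask.dataframe as dd
import xgboost as xgb
from distributed import Client, LocalCluster
from sklearn.datasets import load_digits

try:
    # MultiLock only exists in newer releases of distributed; the new
    # tests skip when it is missing, and this sketch does the same.
    from distributed import MultiLock  # NOQA
except ImportError:
    raise SystemExit("distributed.MultiLock is not available")

if __name__ == "__main__":
    with LocalCluster(n_workers=4) as cluster, Client(cluster) as client:
        X_, y_ = load_digits(return_X_y=True)
        X = dd.from_array(X_, chunksize=32)
        y = dd.from_array(y_, chunksize=32)

        n_models = 3  # arbitrary number of concurrent training jobs
        futures = []
        for i in range(n_models):
            clf = xgb.dask.DaskXGBClassifier(
                n_estimators=i + 1, use_label_encoder=False
            )
            # Each fit is submitted as its own task; the MultiLock taken
            # inside xgboost.dask keeps the concurrent jobs from clashing
            # on shared workers, so the submits can overlap safely.
            futures.append(client.submit(clf.fit, X, y, pure=False))

        models = client.gather(futures)
        for i, clf in enumerate(models):
            assert clf.get_booster().num_boosted_rounds() == i + 1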
@@ -27,7 +27,7 @@ def run_rabit_ops(client, n_workers):
     from xgboost.dask import RabitContext, _get_rabit_args
     from xgboost import rabit

-    workers = list(_get_client_workers(client).keys())
+    workers = _get_client_workers(client)
     rabit_args = client.sync(_get_rabit_args, len(workers), client)
     assert not rabit.is_distributed()
     n_workers_from_dask = len(workers)
@@ -9,6 +9,7 @@ import scipy
 import json
 from typing import List, Tuple, Dict, Optional, Type, Any
 import asyncio
+from concurrent.futures import ThreadPoolExecutor
 import tempfile
 from sklearn.datasets import make_classification
 import sklearn
@@ -43,9 +44,9 @@ kCols = 10
 kWorkers = 5


-def _get_client_workers(client: "Client") -> Dict[str, Dict]:
+def _get_client_workers(client: "Client") -> List[str]:
     workers = client.scheduler_info()['workers']
-    return workers
+    return list(workers.keys())


 def generate_array(
@@ -646,7 +647,7 @@ def test_with_asyncio() -> None:


 async def generate_concurrent_trainings() -> None:
-    async def train():
+    async def train() -> None:
         async with LocalCluster(n_workers=2,
                                 threads_per_worker=1,
                                 asynchronous=True,
@@ -967,7 +968,7 @@ class TestWithDask:

         with LocalCluster(n_workers=4) as cluster:
             with Client(cluster) as client:
-                workers = list(_get_client_workers(client).keys())
+                workers = _get_client_workers(client)
                 rabit_args = client.sync(
                     xgb.dask._get_rabit_args, len(workers), client)
                 futures = client.map(runit,
@@ -1000,7 +1001,7 @@ class TestWithDask:
     def test_n_workers(self) -> None:
         with LocalCluster(n_workers=2) as cluster:
             with Client(cluster) as client:
-                workers = list(_get_client_workers(client).keys())
+                workers = _get_client_workers(client)
                 from sklearn.datasets import load_breast_cancer
                 X, y = load_breast_cancer(return_X_y=True)
                 dX = client.submit(da.from_array, X, workers=[workers[0]]).result()
@@ -1090,7 +1091,7 @@ class TestWithDask:
         X, y, _ = generate_array()
         n_partitions = X.npartitions
         m = xgb.dask.DaskDMatrix(client, X, y)
-        workers = list(_get_client_workers(client).keys())
+        workers = _get_client_workers(client)
         rabit_args = client.sync(xgb.dask._get_rabit_args, len(workers), client)
         n_workers = len(workers)

@@ -1285,6 +1286,82 @@ def test_dask_unsupported_features(client: "Client") -> None:
     )


+def test_parallel_submits(client: "Client") -> None:
+    """Test running multiple trainings simultaneously from a single client."""
+    try:
+        from distributed import MultiLock  # NOQA
+    except ImportError:
+        pytest.skip("`distributed.MultiLock` is not available")
+
+    from sklearn.datasets import load_digits
+
+    futures = []
+    workers = _get_client_workers(client)
+    n_submits = len(workers)
+    for i in range(n_submits):
+        X_, y_ = load_digits(return_X_y=True)
+        X = dd.from_array(X_, chunksize=32)
+        y = dd.from_array(y_, chunksize=32)
+        cls = xgb.dask.DaskXGBClassifier(
+            verbosity=1,
+            n_estimators=i + 1,
+            eval_metric="merror",
+            use_label_encoder=False,
+        )
+        f = client.submit(cls.fit, X, y, pure=False)
+        futures.append(f)
+
+    classifiers = client.gather(futures)
+    assert len(classifiers) == n_submits
+    for i, cls in enumerate(classifiers):
+        assert cls.get_booster().num_boosted_rounds() == i + 1
+
+
+def test_parallel_submit_multi_clients() -> None:
+    """Test running multiple trainings simultaneously from multiple clients."""
+    try:
+        from distributed import MultiLock  # NOQA
+    except ImportError:
+        pytest.skip("`distributed.MultiLock` is not available")
+
+    from sklearn.datasets import load_digits
+
+    with LocalCluster(n_workers=4) as cluster:
+        with Client(cluster) as client:
+            workers = _get_client_workers(client)
+
+        n_submits = len(workers)
+        assert n_submits == 4
+        futures = []
+
+        for i in range(n_submits):
+            client = Client(cluster)
+            X_, y_ = load_digits(return_X_y=True)
+            X_ += 1.0
+            X = dd.from_array(X_, chunksize=32)
+            y = dd.from_array(y_, chunksize=32)
+            cls = xgb.dask.DaskXGBClassifier(
+                verbosity=1,
+                n_estimators=i + 1,
+                eval_metric="merror",
+                use_label_encoder=False,
+            )
+            f = client.submit(cls.fit, X, y, pure=False)
+            futures.append((client, f))
+
+        t_futures = []
+        with ThreadPoolExecutor(max_workers=16) as e:
+            for i in range(n_submits):
+                def _() -> xgb.dask.DaskXGBClassifier:
+                    return futures[i][0].compute(futures[i][1]).result()
+
+                f = e.submit(_)
+                t_futures.append(f)
+
+        for i, f in enumerate(t_futures):
+            assert f.result().get_booster().num_boosted_rounds() == i + 1
+
+
 class TestDaskCallbacks:
     @pytest.mark.skipif(**tm.no_sklearn())
     def test_early_stopping(self, client: "Client") -> None: