merge latest changes
This commit is contained in:
@@ -12,6 +12,7 @@ from hypothesis._settings import duration
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
from xgboost.collective import CommunicatorContext
|
||||
from xgboost.testing.params import hist_parameter_strategy
|
||||
|
||||
pytestmark = [
|
||||
@@ -572,6 +573,73 @@ def test_with_asyncio(local_cuda_client: Client) -> None:
|
||||
assert isinstance(output["history"], dict)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    condition=not (xgb.build_info()["USE_DLOPEN_NCCL"] or xgb.build_info()["USE_DLOPEN_RCCL"]),
    reason="Not compiled with dlopen.",
)
def test_invalid_nccl(local_cuda_client: Client) -> None:
    """A bogus ``dmlc_nccl_path`` must surface a ValueError pointing at pip install.

    Each dask worker enters a communicator context configured with a
    non-existent NCCL path, then triggers a collective op via
    ``QuantileDMatrix`` construction and checks the raised message.
    """
    client = local_cuda_client
    worker_addrs = tm.get_client_workers(client)
    rabit_args = client.sync(
        dxgb._get_rabit_args, len(worker_addrs), dxgb._get_dask_config(), client
    )

    def per_worker(worker_id: int) -> None:
        comm_ctx = CommunicatorContext(dmlc_nccl_path="foo", **rabit_args)
        X, y, w = tm.make_regression(n_samples=10, n_features=10, use_cupy=True)

        # dlopen of "foo" fails; the error should tell the user how to install NCCL.
        with comm_ctx, pytest.raises(ValueError, match=r"pip install"):
            xgb.QuantileDMatrix(X, y, weight=w)

    futures = client.map(per_worker, range(len(worker_addrs)), workers=worker_addrs)
    client.gather(futures)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    condition=not (xgb.build_info()["USE_DLOPEN_NCCL"] or xgb.build_info()["USE_DLOPEN_RCCL"]),
    reason="Not compiled with dlopen.",
)
@pytest.mark.parametrize("tree_method", ["hist", "approx"])
def test_nccl_load(local_cuda_client: Client, tree_method: str) -> None:
    """NCCL must be dlopen-ed only for distributed training, never single-node."""
    X, y, w = tm.make_regression(128, 16, use_cupy=True)

    def fit_quantile_model() -> None:
        xgb.XGBRegressor(
            device="cuda",
            tree_method=tree_method,
            objective="reg:quantileerror",
            verbosity=2,
            quantile_alpha=[0.2, 0.8],
        ).fit(X, y, sample_weight=w)

    # no nccl load when using single-node.
    with tm.captured_output() as (out, err):
        fit_quantile_model()
        assert "NCCL" not in out.getvalue()
        assert "NCCL" not in err.getvalue()

    client = local_cuda_client
    worker_addrs = tm.get_client_workers(client)
    rabit_args = client.sync(
        dxgb._get_rabit_args, len(worker_addrs), dxgb._get_dask_config(), client
    )

    # nccl is loaded
    def per_worker(worker_id: int) -> None:
        # FIXME(jiamingy): https://github.com/dmlc/xgboost/issues/9147
        from xgboost.core import _LIB, _register_log_callback

        _register_log_callback(_LIB)

        with CommunicatorContext(**rabit_args), tm.captured_output() as (out, err):
            fit_quantile_model()
            assert "Loaded shared NCCL" in out.getvalue(), out.getvalue()

    futures = client.map(per_worker, range(len(worker_addrs)), workers=worker_addrs)
    client.gather(futures)
|
||||
|
||||
|
||||
async def run_from_dask_array_asyncio(scheduler_address: str) -> dxgb.TrainReturnT:
|
||||
async with Client(scheduler_address, asynchronous=True) as client:
|
||||
import cupy as cp
|
||||
|
||||
@@ -1931,6 +1931,7 @@ class TestWithDask:
|
||||
cls.client = client
|
||||
cls.fit(X, y)
|
||||
predt_0 = cls.predict(X)
|
||||
proba_0 = cls.predict_proba(X)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
path = os.path.join(tmpdir, "model.pkl")
|
||||
@@ -1940,7 +1941,9 @@ class TestWithDask:
|
||||
with open(path, "rb") as fd:
|
||||
cls = pickle.load(fd)
|
||||
predt_1 = cls.predict(X)
|
||||
proba_1 = cls.predict_proba(X)
|
||||
np.testing.assert_allclose(predt_0.compute(), predt_1.compute())
|
||||
np.testing.assert_allclose(proba_0.compute(), proba_1.compute())
|
||||
|
||||
path = os.path.join(tmpdir, "cls.json")
|
||||
cls.save_model(path)
|
||||
@@ -1949,16 +1952,20 @@ class TestWithDask:
|
||||
cls.load_model(path)
|
||||
assert cls.n_classes_ == 10
|
||||
predt_2 = cls.predict(X)
|
||||
proba_2 = cls.predict_proba(X)
|
||||
|
||||
np.testing.assert_allclose(predt_0.compute(), predt_2.compute())
|
||||
np.testing.assert_allclose(proba_0.compute(), proba_2.compute())
|
||||
|
||||
# Use single node to load
|
||||
cls = xgb.XGBClassifier()
|
||||
cls.load_model(path)
|
||||
assert cls.n_classes_ == 10
|
||||
predt_3 = cls.predict(X_)
|
||||
proba_3 = cls.predict_proba(X_)
|
||||
|
||||
np.testing.assert_allclose(predt_0.compute(), predt_3)
|
||||
np.testing.assert_allclose(proba_0.compute(), proba_3)
|
||||
|
||||
|
||||
def test_dask_unsupported_features(client: "Client") -> None:
|
||||
|
||||
@@ -8,6 +8,7 @@ from typing import Generator, Sequence, Type
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from pyspark import SparkConf
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
@@ -932,6 +933,113 @@ class TestPySparkLocal:
|
||||
model_loaded.set_device("cuda")
|
||||
assert model_loaded._run_on_gpu()
|
||||
|
||||
def test_skip_stage_level_scheduling(self) -> None:
    """Stage-level scheduling is engaged only for a valid standalone-GPU setup.

    Exercises ``_skip_stage_level_scheduling`` across the spark version,
    CPU/GPU mode, cluster manager, and every relevant conf key, asserting
    which combinations must skip (True) or apply (False) the scheduling.
    """

    def build_conf(master, pairs):
        # Assemble a SparkConf from a master URL plus key/value settings.
        conf = SparkConf().setMaster(master)
        for key, value in pairs.items():
            conf = conf.set(key, value)
        return conf

    good_conf = build_conf(
        "spark://foo",
        {
            "spark.executor.cores": "12",
            "spark.task.cpus": "1",
            "spark.executor.resource.gpu.amount": "1",
            "spark.task.resource.gpu.amount": "0.08",
        },
    )

    cpu_classifier = SparkXGBClassifier(use_gpu=False)
    gpu_classifier = SparkXGBClassifier(use_gpu=True)

    # the correct configurations should not skip stage-level scheduling
    assert not gpu_classifier._skip_stage_level_scheduling("3.4.0", good_conf)

    # spark version < 3.4.0
    assert gpu_classifier._skip_stage_level_scheduling("3.3.0", good_conf)

    # not run on GPU
    assert cpu_classifier._skip_stage_level_scheduling("3.4.0", good_conf)

    # spark.executor.cores is not set
    bad_conf = build_conf(
        "spark://foo",
        {
            "spark.task.cpus": "1",
            "spark.executor.resource.gpu.amount": "1",
            "spark.task.resource.gpu.amount": "0.08",
        },
    )
    assert gpu_classifier._skip_stage_level_scheduling("3.4.0", bad_conf)

    # spark.executor.cores=1
    bad_conf = build_conf(
        "spark://foo",
        {
            "spark.executor.cores": "1",
            "spark.task.cpus": "1",
            "spark.executor.resource.gpu.amount": "1",
            "spark.task.resource.gpu.amount": "0.08",
        },
    )
    assert gpu_classifier._skip_stage_level_scheduling("3.4.0", bad_conf)

    # spark.executor.resource.gpu.amount is not set
    bad_conf = build_conf(
        "spark://foo",
        {
            "spark.executor.cores": "12",
            "spark.task.cpus": "1",
            "spark.task.resource.gpu.amount": "0.08",
        },
    )
    assert gpu_classifier._skip_stage_level_scheduling("3.4.0", bad_conf)

    # spark.executor.resource.gpu.amount>1
    bad_conf = build_conf(
        "spark://foo",
        {
            "spark.executor.cores": "12",
            "spark.task.cpus": "1",
            "spark.executor.resource.gpu.amount": "2",
            "spark.task.resource.gpu.amount": "0.08",
        },
    )
    assert gpu_classifier._skip_stage_level_scheduling("3.4.0", bad_conf)

    # spark.task.resource.gpu.amount is not set
    bad_conf = build_conf(
        "spark://foo",
        {
            "spark.executor.cores": "12",
            "spark.task.cpus": "1",
            "spark.executor.resource.gpu.amount": "1",
        },
    )
    assert not gpu_classifier._skip_stage_level_scheduling("3.4.0", bad_conf)

    # spark.task.resource.gpu.amount=1
    bad_conf = build_conf(
        "spark://foo",
        {
            "spark.executor.cores": "12",
            "spark.task.cpus": "1",
            "spark.executor.resource.gpu.amount": "1",
            "spark.task.resource.gpu.amount": "1",
        },
    )
    assert gpu_classifier._skip_stage_level_scheduling("3.4.0", bad_conf)

    # yarn
    bad_conf = build_conf(
        "yarn",
        {
            "spark.executor.cores": "12",
            "spark.task.cpus": "1",
            "spark.executor.resource.gpu.amount": "1",
            "spark.task.resource.gpu.amount": "1",
        },
    )
    assert gpu_classifier._skip_stage_level_scheduling("3.4.0", bad_conf)

    # k8s
    bad_conf = build_conf(
        "k8s://",
        {
            "spark.executor.cores": "12",
            "spark.task.cpus": "1",
            "spark.executor.resource.gpu.amount": "1",
            "spark.task.resource.gpu.amount": "1",
        },
    )
    assert gpu_classifier._skip_stage_level_scheduling("3.4.0", bad_conf)
|
||||
|
||||
|
||||
class XgboostLocalTest(SparkTestCase):
|
||||
def setUp(self):
|
||||
|
||||
Reference in New Issue
Block a user