[dask] Extend tree stats tests. (#7128)
* Add tests to GPU. * Assert cover in children sums up to the parent.
This commit is contained in:
parent
778135f657
commit
e88ac9cc54
@ -28,6 +28,7 @@ from test_with_dask import _get_client_workers # noqa
|
|||||||
from test_with_dask import generate_array # noqa
|
from test_with_dask import generate_array # noqa
|
||||||
from test_with_dask import kCols as random_cols # noqa
|
from test_with_dask import kCols as random_cols # noqa
|
||||||
from test_with_dask import suppress # noqa
|
from test_with_dask import suppress # noqa
|
||||||
|
from test_with_dask import run_tree_stats # noqa
|
||||||
import testing as tm # noqa
|
import testing as tm # noqa
|
||||||
|
|
||||||
|
|
||||||
@ -493,6 +494,17 @@ class TestDistributedGPU:
|
|||||||
for rn, drn in zip(ranker_names, dranker_names):
|
for rn, drn in zip(ranker_names, dranker_names):
|
||||||
assert rn == drn
|
assert rn == drn
|
||||||
|
|
||||||
|
def test_tree_stats(self) -> None:
|
||||||
|
with LocalCUDACluster(n_workers=1) as cluster:
|
||||||
|
with Client(cluster) as client:
|
||||||
|
local = run_tree_stats(client, "gpu_hist")
|
||||||
|
|
||||||
|
with LocalCUDACluster(n_workers=2) as cluster:
|
||||||
|
with Client(cluster) as client:
|
||||||
|
distributed = run_tree_stats(client, "gpu_hist")
|
||||||
|
|
||||||
|
assert local == distributed
|
||||||
|
|
||||||
def run_quantile(self, name: str, local_cuda_cluster: LocalCUDACluster) -> None:
|
def run_quantile(self, name: str, local_cuda_cluster: LocalCUDACluster) -> None:
|
||||||
if sys.platform.startswith("win"):
|
if sys.platform.startswith("win"):
|
||||||
pytest.skip("Skipping dask tests on Windows")
|
pytest.skip("Skipping dask tests on Windows")
|
||||||
|
|||||||
@ -1494,36 +1494,60 @@ def test_parallel_submits(client: "Client") -> None:
|
|||||||
for i, cls in enumerate(classifiers):
|
for i, cls in enumerate(classifiers):
|
||||||
assert cls.get_booster().num_boosted_rounds() == i + 1
|
assert cls.get_booster().num_boosted_rounds() == i + 1
|
||||||
|
|
||||||
@pytest.mark.parametrize("tree_method", ["hist", "approx"])
|
|
||||||
def test_hist_root_stats_with_different_num_worker(tree_method: str) -> None:
|
|
||||||
"""assert that different workers count dosn't affect summ statistic's on root"""
|
|
||||||
def dask_train(n_workers, X, y, num_obs, num_features):
|
|
||||||
cluster = LocalCluster(n_workers=n_workers)
|
|
||||||
client = Client(cluster)
|
|
||||||
|
|
||||||
chunk_size = num_obs/n_workers
|
def run_tree_stats(client: Client, tree_method: str) -> str:
|
||||||
|
"""assert that different workers count dosn't affect summ statistic's on root"""
|
||||||
|
|
||||||
|
def dask_train(X, y, num_obs, num_features):
|
||||||
|
chunk_size = 100
|
||||||
X = da.from_array(X, chunks=(chunk_size, num_features))
|
X = da.from_array(X, chunks=(chunk_size, num_features))
|
||||||
y = da.from_array(y.reshape(num_obs, 1), chunks=(chunk_size, 1))
|
y = da.from_array(y.reshape(num_obs, 1), chunks=(chunk_size, 1))
|
||||||
dtrain = xgb.dask.DaskDMatrix(client, X, y)
|
dtrain = xgb.dask.DaskDMatrix(client, X, y)
|
||||||
|
|
||||||
output = xgb.dask.train(
|
output = xgb.dask.train(
|
||||||
client,
|
client,
|
||||||
{"verbosity": 0, "tree_method": tree_method, "objective": "reg:squarederror", 'max_depth': 2},
|
{
|
||||||
|
"verbosity": 0,
|
||||||
|
"tree_method": tree_method,
|
||||||
|
"objective": "reg:squarederror",
|
||||||
|
"max_depth": 3,
|
||||||
|
},
|
||||||
dtrain,
|
dtrain,
|
||||||
num_boost_round=1
|
num_boost_round=1,
|
||||||
)
|
)
|
||||||
dump_model = output['booster'].get_dump(with_stats=True)
|
dump_model = output["booster"].get_dump(with_stats=True, dump_format="json")[0]
|
||||||
client.shutdown()
|
return json.loads(dump_model)
|
||||||
return dump_model
|
|
||||||
|
|
||||||
num_obs = 1000
|
num_obs = 1000
|
||||||
num_features = 10
|
num_features = 10
|
||||||
X, y = make_regression(num_obs, num_features, random_state=777)
|
X, y = make_regression(num_obs, num_features, random_state=777)
|
||||||
first_model = dask_train(1, X, y, num_obs, num_features)[0]
|
model = dask_train(X, y, num_obs, num_features)
|
||||||
second_model = dask_train(2, X, y, num_obs, num_features)[0]
|
|
||||||
first_summ_stats = first_model[first_model.find('cover='):first_model.find('\n')]
|
# asserts children have correct cover.
|
||||||
second_summ_stats = second_model[second_model.find('cover='):second_model.find('\n')]
|
stack = [model]
|
||||||
assert first_summ_stats == second_summ_stats
|
while stack:
|
||||||
|
node: dict = stack.pop()
|
||||||
|
if "leaf" in node.keys():
|
||||||
|
continue
|
||||||
|
cover = 0
|
||||||
|
for c in node["children"]:
|
||||||
|
cover += c["cover"]
|
||||||
|
stack.append(c)
|
||||||
|
assert cover == node["cover"]
|
||||||
|
|
||||||
|
return model["cover"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("tree_method", ["hist", "approx"])
|
||||||
|
def test_tree_stats(tree_method: str) -> None:
|
||||||
|
with LocalCluster(n_workers=1) as cluster:
|
||||||
|
with Client(cluster) as client:
|
||||||
|
local = run_tree_stats(client, tree_method)
|
||||||
|
with LocalCluster(n_workers=2) as cluster:
|
||||||
|
with Client(cluster) as client:
|
||||||
|
distributed = run_tree_stats(client, tree_method)
|
||||||
|
|
||||||
|
assert local == distributed
|
||||||
|
|
||||||
|
|
||||||
def test_parallel_submit_multi_clients() -> None:
|
def test_parallel_submit_multi_clients() -> None:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user