[dask] Extend tree stats tests. (#7128)

* Add tests to GPU. * Assert cover in children sums up to the parent.
2021-07-27 12:22:13 +08:00 · 2021-07-27 12:22:13 +08:00 · e88ac9cc54
commit e88ac9cc54
parent 778135f657
2 changed files with 54 additions and 18 deletions
--- a/tests/python-gpu/test_gpu_with_dask.py
+++ b/tests/python-gpu/test_gpu_with_dask.py
@ -28,6 +28,7 @@ from test_with_dask import _get_client_workers        # noqa
 from test_with_dask import generate_array             # noqa
 from test_with_dask import kCols as random_cols       # noqa
 from test_with_dask import suppress                   # noqa
 from test_with_dask import run_tree_stats             # noqa
 import testing as tm                                  # noqa
@ -493,6 +494,17 @@ class TestDistributedGPU:
        for rn, drn in zip(ranker_names, dranker_names):
            assert rn == drn
    def test_tree_stats(self) -> None:
        with LocalCUDACluster(n_workers=1) as cluster:
            with Client(cluster) as client:
                local = run_tree_stats(client, "gpu_hist")
        with LocalCUDACluster(n_workers=2) as cluster:
            with Client(cluster) as client:
                distributed = run_tree_stats(client, "gpu_hist")
        assert local == distributed
    def run_quantile(self, name: str, local_cuda_cluster: LocalCUDACluster) -> None:
        if sys.platform.startswith("win"):
            pytest.skip("Skipping dask tests on Windows")
--- a/tests/python/test_with_dask.py
+++ b/tests/python/test_with_dask.py
@ -1494,36 +1494,60 @@ def test_parallel_submits(client: "Client") -> None:
    for i, cls in enumerate(classifiers):
        assert cls.get_booster().num_boosted_rounds() == i + 1
@pytest.mark.parametrize("tree_method", ["hist", "approx"])
 def test_hist_root_stats_with_different_num_worker(tree_method: str) -> None:
    """assert that different workers count dosn't affect summ statistic's on root"""
    def dask_train(n_workers, X, y, num_obs, num_features):
        cluster = LocalCluster(n_workers=n_workers)
        client = Client(cluster)
-        chunk_size = num_obs/n_workers
+def run_tree_stats(client: Client, tree_method: str) -> str:
    """assert that different workers count dosn't affect summ statistic's on root"""
    def dask_train(X, y, num_obs, num_features):
        chunk_size = 100
        X = da.from_array(X, chunks=(chunk_size, num_features))
        y = da.from_array(y.reshape(num_obs, 1), chunks=(chunk_size, 1))
        dtrain = xgb.dask.DaskDMatrix(client, X, y)
        output = xgb.dask.train(
            client,
-            {"verbosity": 0, "tree_method": tree_method, "objective": "reg:squarederror", 'max_depth': 2},
+            {
                "verbosity": 0,
                "tree_method": tree_method,
                "objective": "reg:squarederror",
                "max_depth": 3,
            },
            dtrain,
-            num_boost_round=1
+            num_boost_round=1,
        )
-        dump_model = output['booster'].get_dump(with_stats=True)
+        dump_model = output["booster"].get_dump(with_stats=True, dump_format="json")[0]
-        client.shutdown()
+        return json.loads(dump_model)
        return dump_model
    num_obs = 1000
    num_features = 10
    X, y = make_regression(num_obs, num_features, random_state=777)
-    first_model = dask_train(1, X, y, num_obs, num_features)[0]
+    model = dask_train(X, y, num_obs, num_features)
-    second_model = dask_train(2, X, y, num_obs, num_features)[0]
+
-    first_summ_stats = first_model[first_model.find('cover='):first_model.find('\n')]
+    # asserts children have correct cover.
-    second_summ_stats = second_model[second_model.find('cover='):second_model.find('\n')]
+    stack = [model]
-    assert first_summ_stats == second_summ_stats
+    while stack:
        node: dict = stack.pop()
        if "leaf" in node.keys():
            continue
        cover = 0
        for c in node["children"]:
            cover += c["cover"]
            stack.append(c)
        assert cover == node["cover"]
    return model["cover"]
@pytest.mark.parametrize("tree_method", ["hist", "approx"])
 def test_tree_stats(tree_method: str) -> None:
    with LocalCluster(n_workers=1) as cluster:
        with Client(cluster) as client:
            local = run_tree_stats(client, tree_method)
    with LocalCluster(n_workers=2) as cluster:
        with Client(cluster) as client:
            distributed = run_tree_stats(client, tree_method)
    assert local == distributed
 def test_parallel_submit_multi_clients() -> None: