From 8b3ecfca255ed99525ced4e183656561ce3764c3 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Mon, 28 Mar 2022 21:20:50 +0800
Subject: [PATCH] Mitigate flaky tests. (#7749)

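* Skip the non-increasing RMSE check in the data iterator (external memory)
  test when subsample is used.
* Increase the number of bins (`max_bin`) for the boost from prediction tests
  and drop the explicit `random_state=0`. The larger bin count mitigates the
  effect of non-deterministic partitioning.
* Pass `print_blob=True` to the hypothesis `settings` of the property-based
  tests so that a failing example is printed in a reproducible form.
* Fix the typo in the helper name `run_boost_from_prediction_multi_clasas`
  (now `run_boost_from_prediction_multi_class`).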
---
 tests/python-gpu/test_gpu_data_iterator.py |  2 +-
 tests/python-gpu/test_gpu_linear.py        |  4 +--
 tests/python-gpu/test_gpu_prediction.py    | 10 +++---
 tests/python-gpu/test_gpu_updaters.py      | 10 +++---
 tests/python-gpu/test_gpu_with_dask.py     |  6 ++--
 tests/python/test_data_iterator.py         |  8 +++--
 tests/python/test_linear.py                |  8 ++---
 tests/python/test_updaters.py              |  8 ++---
 tests/python/test_with_dask.py             | 40 ++++++++++++----------
 9 files changed, 50 insertions(+), 46 deletions(-)

diff --git a/tests/python-gpu/test_gpu_data_iterator.py b/tests/python-gpu/test_gpu_data_iterator.py
index 2975834f9..f4eaab15e 100644
--- a/tests/python-gpu/test_gpu_data_iterator.py
+++ b/tests/python-gpu/test_gpu_data_iterator.py
@@ -22,7 +22,7 @@ def test_gpu_single_batch() -> None:
     strategies.integers(0, 13),
     strategies.booleans(),
 )
-@settings(deadline=None)
+@settings(deadline=None, print_blob=True)
 def test_gpu_data_iterator(
     n_samples_per_batch: int, n_features: int, n_batches: int, subsample: bool
 ) -> None:
diff --git a/tests/python-gpu/test_gpu_linear.py b/tests/python-gpu/test_gpu_linear.py
index 9791169f8..af8fe1bbe 100644
--- a/tests/python-gpu/test_gpu_linear.py
+++ b/tests/python-gpu/test_gpu_linear.py
@@ -30,7 +30,7 @@ def train_result(param, dmat, num_rounds):
 class TestGPULinear:
     @given(parameter_strategy, strategies.integers(10, 50),
            tm.dataset_strategy)
-    @settings(deadline=None)
+    @settings(deadline=None, print_blob=True)
     def test_gpu_coordinate(self, param, num_rounds, dataset):
         assume(len(dataset.y) > 0)
         param['updater'] = 'gpu_coord_descent'
@@ -45,7 +45,7 @@ class TestGPULinear:
     @given(parameter_strategy, strategies.integers(10, 50),
            tm.dataset_strategy, strategies.floats(1e-5, 1.0),
            strategies.floats(1e-5, 1.0))
-    @settings(deadline=None)
+    @settings(deadline=None, print_blob=True)
     def test_gpu_coordinate_regularised(self, param, num_rounds, dataset, alpha, lambd):
         assume(len(dataset.y) > 0)
         param['updater'] = 'gpu_coord_descent'
diff --git a/tests/python-gpu/test_gpu_prediction.py b/tests/python-gpu/test_gpu_prediction.py
index d098b6b4a..38f4db07d 100644
--- a/tests/python-gpu/test_gpu_prediction.py
+++ b/tests/python-gpu/test_gpu_prediction.py
@@ -247,7 +247,7 @@ class TestGPUPredict:
 
     @given(strategies.integers(1, 10),
            tm.dataset_strategy, shap_parameter_strategy)
-    @settings(deadline=None)
+    @settings(deadline=None, print_blob=True)
     def test_shap(self, num_rounds, dataset, param):
         param.update({"predictor": "gpu_predictor", "gpu_id": 0})
         param = dataset.set_params(param)
@@ -261,7 +261,7 @@ class TestGPUPredict:
 
     @given(strategies.integers(1, 10),
            tm.dataset_strategy, shap_parameter_strategy)
-    @settings(deadline=None, max_examples=20)
+    @settings(deadline=None, max_examples=20, print_blob=True)
     def test_shap_interactions(self, num_rounds, dataset, param):
         param.update({"predictor": "gpu_predictor", "gpu_id": 0})
         param = dataset.set_params(param)
@@ -312,14 +312,14 @@ class TestGPUPredict:
         np.testing.assert_equal(cpu_leaf, gpu_leaf)
 
     @given(predict_parameter_strategy, tm.dataset_strategy)
-    @settings(deadline=None)
+    @settings(deadline=None, print_blob=True)
     def test_predict_leaf_gbtree(self, param, dataset):
         param['booster'] = 'gbtree'
         param['tree_method'] = 'gpu_hist'
         self.run_predict_leaf_booster(param, 10, dataset)
 
     @given(predict_parameter_strategy, tm.dataset_strategy)
-    @settings(deadline=None)
+    @settings(deadline=None, print_blob=True)
     def test_predict_leaf_dart(self, param, dataset):
         param['booster'] = 'dart'
         param['tree_method'] = 'gpu_hist'
@@ -330,7 +330,7 @@ class TestGPUPredict:
     @given(df=data_frames([column('x0', elements=strategies.integers(min_value=0, max_value=3)),
                            column('x1', elements=strategies.integers(min_value=0, max_value=5))],
                           index=range_indexes(min_size=20, max_size=50)))
-    @settings(deadline=None)
+    @settings(deadline=None, print_blob=True)
     def test_predict_categorical_split(self, df):
         from sklearn.metrics import mean_squared_error
 
diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py
index 5c5d19644..a3427b566 100644
--- a/tests/python-gpu/test_gpu_updaters.py
+++ b/tests/python-gpu/test_gpu_updaters.py
@@ -46,7 +46,7 @@ class TestGPUUpdaters:
     cputest = test_up.TestTreeMethod()
 
     @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
-    @settings(deadline=None)
+    @settings(deadline=None, print_blob=True)
     def test_gpu_hist(self, param, num_rounds, dataset):
         param["tree_method"] = "gpu_hist"
         param = dataset.set_params(param)
@@ -56,7 +56,7 @@ class TestGPUUpdaters:
 
     @given(strategies.integers(10, 400), strategies.integers(3, 8),
            strategies.integers(1, 2), strategies.integers(4, 7))
-    @settings(deadline=None)
+    @settings(deadline=None, print_blob=True)
     @pytest.mark.skipif(**tm.no_pandas())
     def test_categorical(self, rows, cols, rounds, cats):
         self.cputest.run_categorical_basic(rows, cols, rounds, cats, "gpu_hist")
@@ -76,7 +76,7 @@ class TestGPUUpdaters:
     @pytest.mark.skipif(**tm.no_cupy())
     @given(parameter_strategy, strategies.integers(1, 20),
            tm.dataset_strategy)
-    @settings(deadline=None)
+    @settings(deadline=None, print_blob=True)
     def test_gpu_hist_device_dmatrix(self, param, num_rounds, dataset):
         # We cannot handle empty dataset yet
         assume(len(dataset.y) > 0)
@@ -88,7 +88,7 @@ class TestGPUUpdaters:
 
     @given(parameter_strategy, strategies.integers(1, 20),
            tm.dataset_strategy)
-    @settings(deadline=None)
+    @settings(deadline=None, print_blob=True)
     def test_external_memory(self, param, num_rounds, dataset):
         # We cannot handle empty dataset yet
         assume(len(dataset.y) > 0)
@@ -127,7 +127,7 @@ class TestGPUUpdaters:
 
     @pytest.mark.mgpu
     @given(tm.dataset_strategy, strategies.integers(0, 10))
-    @settings(deadline=None, max_examples=10)
+    @settings(deadline=None, max_examples=10, print_blob=True)
     def test_specified_gpu_id_gpu_update(self, dataset, gpu_id):
         param = {'tree_method': 'gpu_hist', 'gpu_id': gpu_id}
         param = dataset.set_params(param)
diff --git a/tests/python-gpu/test_gpu_with_dask.py b/tests/python-gpu/test_gpu_with_dask.py
index cb8b6a8e7..1f0339e91 100644
--- a/tests/python-gpu/test_gpu_with_dask.py
+++ b/tests/python-gpu/test_gpu_with_dask.py
@@ -27,7 +27,7 @@ from test_with_dask import run_empty_dmatrix_reg      # noqa
 from test_with_dask import run_empty_dmatrix_auc      # noqa
 from test_with_dask import run_auc                    # noqa
 from test_with_dask import run_boost_from_prediction  # noqa
-from test_with_dask import run_boost_from_prediction_multi_clasas  # noqa
+from test_with_dask import run_boost_from_prediction_multi_class  # noqa
 from test_with_dask import run_dask_classifier        # noqa
 from test_with_dask import run_empty_dmatrix_cls      # noqa
 from test_with_dask import _get_client_workers        # noqa
@@ -216,7 +216,7 @@ def test_boost_from_prediction(local_cuda_cluster: LocalCUDACluster) -> None:
         X_, y_ = load_digits(return_X_y=True)
         X = dd.from_array(X_, chunksize=100).map_partitions(cudf.from_pandas)
         y = dd.from_array(y_, chunksize=100).map_partitions(cudf.from_pandas)
-        run_boost_from_prediction_multi_clasas(X, y, "gpu_hist", client)
+        run_boost_from_prediction_multi_class(X, y, "gpu_hist", client)
 
 
 class TestDistributedGPU:
@@ -231,7 +231,7 @@ class TestDistributedGPU:
         num_rounds=strategies.integers(1, 20),
         dataset=tm.dataset_strategy,
     )
-    @settings(deadline=duration(seconds=120), suppress_health_check=suppress)
+    @settings(deadline=duration(seconds=120), suppress_health_check=suppress, print_blob=True)
     @pytest.mark.skipif(**tm.no_cupy())
     @pytest.mark.parametrize(
         "local_cuda_cluster", [{"n_workers": 2}], indirect=["local_cuda_cluster"]
diff --git a/tests/python/test_data_iterator.py b/tests/python/test_data_iterator.py
index e4254bb9e..233a3a4d0 100644
--- a/tests/python/test_data_iterator.py
+++ b/tests/python/test_data_iterator.py
@@ -108,7 +108,8 @@ def run_data_iterator(
         evals_result=results_from_it,
         verbose_eval=False,
     )
-    assert non_increasing(results_from_it["Train"]["rmse"])
+    if not subsample:
+        assert non_increasing(results_from_it["Train"]["rmse"])
 
     X, y = it.as_arrays()
     Xy = xgb.DMatrix(X, y)
@@ -125,7 +126,8 @@ def run_data_iterator(
         verbose_eval=False,
     )
     arr_predt = from_arrays.predict(Xy)
-    assert non_increasing(results_from_arrays["Train"]["rmse"])
+    if not subsample:
+        assert non_increasing(results_from_arrays["Train"]["rmse"])
 
     rtol = 1e-2
     # CPU sketching is more memory efficient but less consistent due to small chunks
@@ -146,7 +148,7 @@ def run_data_iterator(
     strategies.integers(0, 13),
     strategies.booleans(),
 )
-@settings(deadline=None)
+@settings(deadline=None, print_blob=True)
 def test_data_iterator(
     n_samples_per_batch: int,
     n_features: int,
diff --git a/tests/python/test_linear.py b/tests/python/test_linear.py
index 635048ddb..2ea3e44dd 100644
--- a/tests/python/test_linear.py
+++ b/tests/python/test_linear.py
@@ -26,7 +26,7 @@ def train_result(param, dmat, num_rounds):
 class TestLinear:
     @given(parameter_strategy, strategies.integers(10, 50),
            tm.dataset_strategy, coord_strategy)
-    @settings(deadline=None)
+    @settings(deadline=None, print_blob=True)
     def test_coordinate(self, param, num_rounds, dataset, coord_param):
         param['updater'] = 'coord_descent'
         param.update(coord_param)
@@ -41,7 +41,7 @@ class TestLinear:
     @given(parameter_strategy, strategies.integers(10, 50),
            tm.dataset_strategy, coord_strategy, strategies.floats(1e-5, 1.0),
            strategies.floats(1e-5, 1.0))
-    @settings(deadline=None)
+    @settings(deadline=None, print_blob=True)
     def test_coordinate_regularised(self, param, num_rounds, dataset, coord_param, alpha, lambd):
         param['updater'] = 'coord_descent'
         param['alpha'] = alpha
@@ -54,7 +54,7 @@ class TestLinear:
 
     @given(parameter_strategy, strategies.integers(10, 50),
            tm.dataset_strategy)
-    @settings(deadline=None)
+    @settings(deadline=None, print_blob=True)
     def test_shotgun(self, param, num_rounds, dataset):
         param['updater'] = 'shotgun'
         param = dataset.set_params(param)
@@ -71,7 +71,7 @@ class TestLinear:
     @given(parameter_strategy, strategies.integers(10, 50),
            tm.dataset_strategy, strategies.floats(1e-5, 1.0),
            strategies.floats(1e-5, 1.0))
-    @settings(deadline=None)
+    @settings(deadline=None, print_blob=True)
     def test_shotgun_regularised(self, param, num_rounds, dataset, alpha, lambd):
         param['updater'] = 'shotgun'
         param['alpha'] = alpha
diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py
index b73736c69..cdf40d843 100644
--- a/tests/python/test_updaters.py
+++ b/tests/python/test_updaters.py
@@ -38,7 +38,7 @@ def train_result(param, dmat, num_rounds):
 class TestTreeMethod:
     @given(exact_parameter_strategy, strategies.integers(1, 20),
            tm.dataset_strategy)
-    @settings(deadline=None)
+    @settings(deadline=None, print_blob=True)
     def test_exact(self, param, num_rounds, dataset):
         param['tree_method'] = 'exact'
         param = dataset.set_params(param)
@@ -51,7 +51,7 @@ class TestTreeMethod:
         strategies.integers(1, 20),
         tm.dataset_strategy,
     )
-    @settings(deadline=None)
+    @settings(deadline=None, print_blob=True)
     def test_approx(self, param, hist_param, num_rounds, dataset):
         param["tree_method"] = "approx"
         param = dataset.set_params(param)
@@ -86,7 +86,7 @@ class TestTreeMethod:
 
     @given(exact_parameter_strategy, hist_parameter_strategy, strategies.integers(1, 20),
            tm.dataset_strategy)
-    @settings(deadline=None)
+    @settings(deadline=None, print_blob=True)
     def test_hist(self, param, hist_param, num_rounds, dataset):
         param['tree_method'] = 'hist'
         param = dataset.set_params(param)
@@ -241,7 +241,7 @@ class TestTreeMethod:
 
     @given(strategies.integers(10, 400), strategies.integers(3, 8),
            strategies.integers(1, 2), strategies.integers(4, 7))
-    @settings(deadline=None)
+    @settings(deadline=None, print_blob=True)
     @pytest.mark.skipif(**tm.no_pandas())
     def test_categorical(self, rows, cols, rounds, cats):
         self.run_categorical_basic(rows, cols, rounds, cats, "approx")
diff --git a/tests/python/test_with_dask.py b/tests/python/test_with_dask.py
index 9a68f4453..484c92e3d 100644
--- a/tests/python/test_with_dask.py
+++ b/tests/python/test_with_dask.py
@@ -337,14 +337,14 @@ def test_dask_predict_shape_infer(client: "Client") -> None:
     assert prediction.shape[1] == 3
 
 
-def run_boost_from_prediction_multi_clasas(
+def run_boost_from_prediction_multi_class(
     X: xgb.dask._DaskCollection,
     y: xgb.dask._DaskCollection,
     tree_method: str,
-    client: "Client"
+    client: "Client",
 ) -> None:
     model_0 = xgb.dask.DaskXGBClassifier(
-        learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method
+        learning_rate=0.3, n_estimators=4, tree_method=tree_method, max_bin=768
     )
     model_0.fit(X=X, y=y)
     margin = xgb.dask.inplace_predict(
@@ -352,18 +352,18 @@ def run_boost_from_prediction_multi_clasas(
     )
 
     model_1 = xgb.dask.DaskXGBClassifier(
-        learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method
+        learning_rate=0.3, n_estimators=4, tree_method=tree_method, max_bin=768
     )
     model_1.fit(X=X, y=y, base_margin=margin)
     predictions_1 = xgb.dask.predict(
         client,
         model_1.get_booster(),
         xgb.dask.DaskDMatrix(client, X, base_margin=margin),
-        output_margin=True
+        output_margin=True,
     )
 
     model_2 = xgb.dask.DaskXGBClassifier(
-        learning_rate=0.3, random_state=0, n_estimators=8, tree_method=tree_method
+        learning_rate=0.3, n_estimators=8, tree_method=tree_method, max_bin=768
     )
     model_2.fit(X=X, y=y)
     predictions_2 = xgb.dask.inplace_predict(
@@ -382,26 +382,29 @@ def run_boost_from_prediction_multi_clasas(
 
 
 def run_boost_from_prediction(
-    X: xgb.dask._DaskCollection, y: xgb.dask._DaskCollection, tree_method: str, client: "Client"
+    X: xgb.dask._DaskCollection,
+    y: xgb.dask._DaskCollection,
+    tree_method: str,
+    client: "Client",
 ) -> None:
     X = client.persist(X)
     y = client.persist(y)
 
     model_0 = xgb.dask.DaskXGBClassifier(
-        learning_rate=0.3, random_state=0, n_estimators=4,
-        tree_method=tree_method)
+        learning_rate=0.3, n_estimators=4, tree_method=tree_method, max_bin=512
+    )
     model_0.fit(X=X, y=y)
     margin = model_0.predict(X, output_margin=True)
 
     model_1 = xgb.dask.DaskXGBClassifier(
-        learning_rate=0.3, random_state=0, n_estimators=4,
-        tree_method=tree_method)
+        learning_rate=0.3, n_estimators=4, tree_method=tree_method, max_bin=512
+    )
     model_1.fit(X=X, y=y, base_margin=margin)
     predictions_1 = model_1.predict(X, base_margin=margin)
 
     cls_2 = xgb.dask.DaskXGBClassifier(
-        learning_rate=0.3, random_state=0, n_estimators=8,
-        tree_method=tree_method)
+        learning_rate=0.3, n_estimators=8, tree_method=tree_method, max_bin=512
+    )
     cls_2.fit(X=X, y=y)
     predictions_2 = cls_2.predict(X)
 
@@ -415,8 +418,8 @@ def run_boost_from_prediction(
     unmargined = xgb.dask.DaskXGBClassifier(n_estimators=4)
     unmargined.fit(X=X, y=y, eval_set=[(X, y)], base_margin=margin)
 
-    margined_res = margined.evals_result()['validation_0']['logloss']
-    unmargined_res = unmargined.evals_result()['validation_0']['logloss']
+    margined_res = margined.evals_result()["validation_0"]["logloss"]
+    unmargined_res = unmargined.evals_result()["validation_0"]["logloss"]
 
     assert len(margined_res) == len(unmargined_res)
     for i in range(len(margined_res)):
@@ -429,12 +432,11 @@ def test_boost_from_prediction(tree_method: str, client: "Client") -> None:
     from sklearn.datasets import load_breast_cancer, load_digits
     X_, y_ = load_breast_cancer(return_X_y=True)
     X, y = dd.from_array(X_, chunksize=200), dd.from_array(y_, chunksize=200)
-
     run_boost_from_prediction(X, y, tree_method, client)
 
     X_, y_ = load_digits(return_X_y=True)
     X, y = dd.from_array(X_, chunksize=100), dd.from_array(y_, chunksize=100)
-    run_boost_from_prediction_multi_clasas(X, y, tree_method, client)
+    run_boost_from_prediction_multi_class(X, y, tree_method, client)
 
 
 def test_inplace_predict(client: "Client") -> None:
@@ -1292,7 +1294,7 @@ class TestWithDask:
 
     @given(params=hist_parameter_strategy,
            dataset=tm.dataset_strategy)
-    @settings(deadline=None, suppress_health_check=suppress)
+    @settings(deadline=None, suppress_health_check=suppress, print_blob=True)
     def test_hist(
             self, params: Dict, dataset: tm.TestDataset, client: "Client"
     ) -> None:
@@ -1301,7 +1303,7 @@ class TestWithDask:
 
     @given(params=exact_parameter_strategy,
            dataset=tm.dataset_strategy)
-    @settings(deadline=None, suppress_health_check=suppress)
+    @settings(deadline=None, suppress_health_check=suppress, print_blob=True)
     def test_approx(
             self, client: "Client", params: Dict, dataset: tm.TestDataset
     ) -> None: