Extract dask and spark tests into distributed tests. (#8395)

- Move test files.
- Run spark and dask separately to prevent conflicts.
- Gather common code into the testing module.
2022-10-28 16:24:32 +08:00
parent f73520bfff
commit cfd2a9f872
34 changed files with 405 additions and 337 deletions
--- a/tests/python/test_predict.py
+++ b/tests/python/test_predict.py
@@ -5,6 +5,7 @@ import numpy as np
 import pandas as pd
 import pytest
 from scipy import sparse
+from xgboost.testing.shared import validate_leaf_output

 import xgboost as xgb
 from xgboost import testing as tm
@@ -26,16 +27,6 @@ def run_threaded_predict(X, rows, predict_func):
        assert f.result()


-def verify_leaf_output(leaf: np.ndarray, num_parallel_tree: int):
-    for i in range(leaf.shape[0]):     # n_samples
-        for j in range(leaf.shape[1]):  # n_rounds
-            for k in range(leaf.shape[2]):    # n_classes
-                tree_group = leaf[i, j, k, :]
-                assert tree_group.shape[0] == num_parallel_tree
-                # No sampling, all trees within forest are the same
-                assert np.all(tree_group == tree_group[0])
-
-
 def run_predict_leaf(predictor):
    rows = 100
    cols = 4
@@ -67,7 +58,7 @@ def run_predict_leaf(predictor):
    assert leaf.shape[2] == classes
    assert leaf.shape[3] == num_parallel_tree

-    verify_leaf_output(leaf, num_parallel_tree)
+    validate_leaf_output(leaf, num_parallel_tree)

    ntree_limit = 2
    sliced = booster.predict(
--- a/tests/python/test_spark/init.py
+++ b/tests/python/test_spark/init.py
--- a/tests/python/test_spark/test_data.py
+++ b/tests/python/test_spark/test_data.py
@@ -1,160 +0,0 @@
-import sys
-from typing import List
-
-import numpy as np
-import pandas as pd
-import pytest
-
-from xgboost import testing as tm
-
-if tm.no_spark()["condition"]:
-    pytest.skip(msg=tm.no_spark()["reason"], allow_module_level=True)
-if sys.platform.startswith("win") or sys.platform.startswith("darwin"):
-    pytest.skip("Skipping PySpark tests on Windows", allow_module_level=True)
-
-from xgboost.spark.data import (
-    _read_csr_matrix_from_unwrapped_spark_vec,
-    alias,
-    create_dmatrix_from_partitions,
-    stack_series,
-)
-
-from xgboost import DMatrix, QuantileDMatrix
-
-
-def test_stack() -> None:
-    a = pd.DataFrame({"a": [[1, 2], [3, 4]]})
-    b = stack_series(a["a"])
-    assert b.shape == (2, 2)
-
-    a = pd.DataFrame({"a": [[1], [3]]})
-    b = stack_series(a["a"])
-    assert b.shape == (2, 1)
-
-    a = pd.DataFrame({"a": [np.array([1, 2]), np.array([3, 4])]})
-    b = stack_series(a["a"])
-    assert b.shape == (2, 2)
-
-    a = pd.DataFrame({"a": [np.array([1]), np.array([3])]})
-    b = stack_series(a["a"])
-    assert b.shape == (2, 1)
-
-
-def run_dmatrix_ctor(is_feature_cols: bool, is_qdm: bool, on_gpu: bool) -> None:
-    rng = np.random.default_rng(0)
-    dfs: List[pd.DataFrame] = []
-    n_features = 16
-    n_samples_per_batch = 16
-    n_batches = 10
-    feature_types = ["float"] * n_features
-
-    for i in range(n_batches):
-        X = rng.normal(loc=0, size=256).reshape(n_samples_per_batch, n_features)
-        y = rng.normal(loc=0, size=n_samples_per_batch)
-        m = rng.normal(loc=0, size=n_samples_per_batch)
-        w = rng.normal(loc=0.5, scale=0.5, size=n_samples_per_batch)
-        w -= w.min()
-
-        valid = rng.binomial(n=1, p=0.5, size=16).astype(np.bool_)
-
-        df = pd.DataFrame(
-            {alias.label: y, alias.margin: m, alias.weight: w, alias.valid: valid}
-        )
-        if is_feature_cols:
-            for j in range(X.shape[1]):
-                df[f"feat-{j}"] = pd.Series(X[:, j])
-        else:
-            df[alias.data] = pd.Series(list(X))
-        dfs.append(df)
-
-    kwargs = {"feature_types": feature_types}
-    device_id = 0 if on_gpu else None
-    cols = [f"feat-{i}" for i in range(n_features)]
-    feature_cols = cols if is_feature_cols else None
-    train_Xy, valid_Xy = create_dmatrix_from_partitions(
-        iter(dfs),
-        feature_cols,
-        gpu_id=device_id,
-        use_qdm=is_qdm,
-        kwargs=kwargs,
-        enable_sparse_data_optim=False,
-        has_validation_col=True,
-    )
-
-    if is_qdm:
-        assert isinstance(train_Xy, QuantileDMatrix)
-        assert isinstance(valid_Xy, QuantileDMatrix)
-    else:
-        assert not isinstance(train_Xy, QuantileDMatrix)
-        assert isinstance(train_Xy, DMatrix)
-        assert not isinstance(valid_Xy, QuantileDMatrix)
-        assert isinstance(valid_Xy, DMatrix)
-
-    assert valid_Xy is not None
-    assert valid_Xy.num_row() + train_Xy.num_row() == n_samples_per_batch * n_batches
-    assert train_Xy.num_col() == n_features
-    assert valid_Xy.num_col() == n_features
-
-    df = pd.concat(dfs, axis=0)
-    df_train = df.loc[~df[alias.valid], :]
-    df_valid = df.loc[df[alias.valid], :]
-
-    assert df_train.shape[0] == train_Xy.num_row()
-    assert df_valid.shape[0] == valid_Xy.num_row()
-
-    # margin
-    np.testing.assert_allclose(
-        df_train[alias.margin].to_numpy(), train_Xy.get_base_margin()
-    )
-    np.testing.assert_allclose(
-        df_valid[alias.margin].to_numpy(), valid_Xy.get_base_margin()
-    )
-    # weight
-    np.testing.assert_allclose(df_train[alias.weight].to_numpy(), train_Xy.get_weight())
-    np.testing.assert_allclose(df_valid[alias.weight].to_numpy(), valid_Xy.get_weight())
-    # label
-    np.testing.assert_allclose(df_train[alias.label].to_numpy(), train_Xy.get_label())
-    np.testing.assert_allclose(df_valid[alias.label].to_numpy(), valid_Xy.get_label())
-
-    np.testing.assert_equal(train_Xy.feature_types, feature_types)
-    np.testing.assert_equal(valid_Xy.feature_types, feature_types)
-
-
-@pytest.mark.parametrize(
-    "is_feature_cols,is_qdm",
-    [(True, True), (True, False), (False, True), (False, False)],
-)
-def test_dmatrix_ctor(is_feature_cols: bool, is_qdm: bool) -> None:
-    run_dmatrix_ctor(is_feature_cols, is_qdm, on_gpu=False)
-
-
-def test_read_csr_matrix_from_unwrapped_spark_vec() -> None:
-    from scipy.sparse import csr_matrix
-
-    pd1 = pd.DataFrame(
-        {
-            "featureVectorType": [0, 1, 1, 0],
-            "featureVectorSize": [3, None, None, 3],
-            "featureVectorIndices": [
-                np.array([0, 2], dtype=np.int32),
-                None,
-                None,
-                np.array([1, 2], dtype=np.int32),
-            ],
-            "featureVectorValues": [
-                np.array([3.0, 0.0], dtype=np.float64),
-                np.array([13.0, 14.0, 0.0], dtype=np.float64),
-                np.array([0.0, 24.0, 25.0], dtype=np.float64),
-                np.array([0.0, 35.0], dtype=np.float64),
-            ],
-        }
-    )
-    sm = _read_csr_matrix_from_unwrapped_spark_vec(pd1)
-    assert isinstance(sm, csr_matrix)
-
-    np.testing.assert_array_equal(
-        sm.data, [3.0, 0.0, 13.0, 14.0, 0.0, 0.0, 24.0, 25.0, 0.0, 35.0]
-    )
-    np.testing.assert_array_equal(sm.indptr, [0, 2, 5, 8, 10])
-    np.testing.assert_array_equal(sm.indices, [0, 2, 0, 1, 2, 0, 1, 2, 1, 2])
-    assert sm.shape == (4, 3)
--- a/tests/python/test_spark/test_spark_local.py
+++ b/tests/python/test_spark/test_spark_local.py
--- a/tests/python/test_spark/test_spark_local_cluster.py
+++ b/tests/python/test_spark/test_spark_local_cluster.py
@@ -1,452 +0,0 @@
-import json
-import os
-import random
-import sys
-import uuid
-
-import numpy as np
-import pytest
-
-from xgboost import testing as tm
-
-if tm.no_spark()["condition"]:
-    pytest.skip(msg=tm.no_spark()["reason"], allow_module_level=True)
-if sys.platform.startswith("win") or sys.platform.startswith("darwin"):
-    pytest.skip("Skipping PySpark tests on Windows", allow_module_level=True)
-
-from pyspark.ml.linalg import Vectors
-from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
-from xgboost.spark.utils import _get_max_num_concurrent_tasks
-
-from .utils import SparkLocalClusterTestCase
-
-
-class XgboostLocalClusterTestCase(SparkLocalClusterTestCase):
-    def setUp(self):
-        random.seed(2020)
-
-        self.n_workers = _get_max_num_concurrent_tasks(self.session.sparkContext)
-        # The following code use xgboost python library to train xgb model and predict.
-        #
-        # >>> import numpy as np
-        # >>> import xgboost
-        # >>> X = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]])
-        # >>> y = np.array([0, 1])
-        # >>> reg1 = xgboost.XGBRegressor()
-        # >>> reg1.fit(X, y)
-        # >>> reg1.predict(X)
-        # array([8.8363886e-04, 9.9911636e-01], dtype=float32)
-        # >>> def custom_lr(boosting_round, num_boost_round):
-        # ...     return 1.0 / (boosting_round + 1)
-        # ...
-        # >>> reg1.fit(X, y, callbacks=[xgboost.callback.reset_learning_rate(custom_lr)])
-        # >>> reg1.predict(X)
-        # array([0.02406833, 0.97593164], dtype=float32)
-        # >>> reg2 = xgboost.XGBRegressor(max_depth=5, n_estimators=10)
-        # >>> reg2.fit(X, y)
-        # >>> reg2.predict(X, ntree_limit=5)
-        # array([0.22185263, 0.77814734], dtype=float32)
-        self.reg_params = {"max_depth": 5, "n_estimators": 10, "ntree_limit": 5}
-        self.reg_df_train = self.session.createDataFrame(
-            [
-                (Vectors.dense(1.0, 2.0, 3.0), 0),
-                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
-            ],
-            ["features", "label"],
-        )
-        self.reg_df_test = self.session.createDataFrame(
-            [
-                (Vectors.dense(1.0, 2.0, 3.0), 0.0, 0.2219, 0.02406),
-                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1.0, 0.7781, 0.9759),
-            ],
-            [
-                "features",
-                "expected_prediction",
-                "expected_prediction_with_params",
-                "expected_prediction_with_callbacks",
-            ],
-        )
-
-        # Distributed section
-        # Binary classification
-        self.cls_df_train_distributed = self.session.createDataFrame(
-            [
-                (Vectors.dense(1.0, 2.0, 3.0), 0),
-                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
-                (Vectors.dense(4.0, 5.0, 6.0), 0),
-                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1),
-            ]
-            * 100,
-            ["features", "label"],
-        )
-        self.cls_df_test_distributed = self.session.createDataFrame(
-            [
-                (Vectors.dense(1.0, 2.0, 3.0), 0, [0.9949826, 0.0050174]),
-                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, [0.0050174, 0.9949826]),
-                (Vectors.dense(4.0, 5.0, 6.0), 0, [0.9949826, 0.0050174]),
-                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, [0.0050174, 0.9949826]),
-            ],
-            ["features", "expected_label", "expected_probability"],
-        )
-        # Binary classification with different num_estimators
-        self.cls_df_test_distributed_lower_estimators = self.session.createDataFrame(
-            [
-                (Vectors.dense(1.0, 2.0, 3.0), 0, [0.9735, 0.0265]),
-                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, [0.0265, 0.9735]),
-                (Vectors.dense(4.0, 5.0, 6.0), 0, [0.9735, 0.0265]),
-                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, [0.0265, 0.9735]),
-            ],
-            ["features", "expected_label", "expected_probability"],
-        )
-
-        # Multiclass classification
-        self.cls_df_train_distributed_multiclass = self.session.createDataFrame(
-            [
-                (Vectors.dense(1.0, 2.0, 3.0), 0),
-                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
-                (Vectors.dense(4.0, 5.0, 6.0), 0),
-                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 2),
-            ]
-            * 100,
-            ["features", "label"],
-        )
-        self.cls_df_test_distributed_multiclass = self.session.createDataFrame(
-            [
-                (Vectors.dense(1.0, 2.0, 3.0), 0, [4.294563, -2.449409, -2.449409]),
-                (
-                    Vectors.sparse(3, {1: 1.0, 2: 5.5}),
-                    1,
-                    [-2.3796105, 3.669014, -2.449409],
-                ),
-                (Vectors.dense(4.0, 5.0, 6.0), 0, [4.294563, -2.449409, -2.449409]),
-                (
-                    Vectors.sparse(3, {1: 6.0, 2: 7.5}),
-                    2,
-                    [-2.3796105, -2.449409, 3.669014],
-                ),
-            ],
-            ["features", "expected_label", "expected_margins"],
-        )
-
-        # Regression
-        self.reg_df_train_distributed = self.session.createDataFrame(
-            [
-                (Vectors.dense(1.0, 2.0, 3.0), 0),
-                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
-                (Vectors.dense(4.0, 5.0, 6.0), 0),
-                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 2),
-            ]
-            * 100,
-            ["features", "label"],
-        )
-        self.reg_df_test_distributed = self.session.createDataFrame(
-            [
-                (Vectors.dense(1.0, 2.0, 3.0), 1.533e-04),
-                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 9.999e-01),
-                (Vectors.dense(4.0, 5.0, 6.0), 1.533e-04),
-                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1.999e00),
-            ],
-            ["features", "expected_label"],
-        )
-
-        # Adding weight and validation
-        self.clf_params_with_eval_dist = {
-            "validation_indicator_col": "isVal",
-            "early_stopping_rounds": 1,
-            "eval_metric": "logloss",
-        }
-        self.clf_params_with_weight_dist = {"weight_col": "weight"}
-        self.cls_df_train_distributed_with_eval_weight = self.session.createDataFrame(
-            [
-                (Vectors.dense(1.0, 2.0, 3.0), 0, False, 1.0),
-                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, False, 2.0),
-                (Vectors.dense(4.0, 5.0, 6.0), 0, True, 1.0),
-                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, True, 2.0),
-            ]
-            * 100,
-            ["features", "label", "isVal", "weight"],
-        )
-        self.cls_df_test_distributed_with_eval_weight = self.session.createDataFrame(
-            [
-                (
-                    Vectors.dense(1.0, 2.0, 3.0),
-                    [0.9955, 0.0044],
-                    [0.9904, 0.0096],
-                    [0.9903, 0.0097],
-                ),
-            ],
-            [
-                "features",
-                "expected_prob_with_weight",
-                "expected_prob_with_eval",
-                "expected_prob_with_weight_and_eval",
-            ],
-        )
-        self.clf_best_score_eval = 0.009677
-        self.clf_best_score_weight_and_eval = 0.006626
-
-        self.reg_params_with_eval_dist = {
-            "validation_indicator_col": "isVal",
-            "early_stopping_rounds": 1,
-            "eval_metric": "rmse",
-        }
-        self.reg_params_with_weight_dist = {"weight_col": "weight"}
-        self.reg_df_train_distributed_with_eval_weight = self.session.createDataFrame(
-            [
-                (Vectors.dense(1.0, 2.0, 3.0), 0, False, 1.0),
-                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, False, 2.0),
-                (Vectors.dense(4.0, 5.0, 6.0), 0, True, 1.0),
-                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, True, 2.0),
-            ]
-            * 100,
-            ["features", "label", "isVal", "weight"],
-        )
-        self.reg_df_test_distributed_with_eval_weight = self.session.createDataFrame(
-            [
-                (Vectors.dense(1.0, 2.0, 3.0), 4.583e-05, 5.239e-05, 6.03e-05),
-                (
-                    Vectors.sparse(3, {1: 1.0, 2: 5.5}),
-                    9.9997e-01,
-                    9.99947e-01,
-                    9.9995e-01,
-                ),
-            ],
-            [
-                "features",
-                "expected_prediction_with_weight",
-                "expected_prediction_with_eval",
-                "expected_prediction_with_weight_and_eval",
-            ],
-        )
-        self.reg_best_score_eval = 5.239e-05
-        self.reg_best_score_weight_and_eval = 4.810e-05
-
-    def test_regressor_basic_with_params(self):
-        regressor = SparkXGBRegressor(**self.reg_params)
-        model = regressor.fit(self.reg_df_train)
-        pred_result = model.transform(self.reg_df_test).collect()
-        for row in pred_result:
-            self.assertTrue(
-                np.isclose(
-                    row.prediction, row.expected_prediction_with_params, atol=1e-3
-                )
-            )
-
-    def test_callbacks(self):
-        from xgboost.callback import LearningRateScheduler
-
-        path = os.path.join(self.tempdir, str(uuid.uuid4()))
-
-        def custom_learning_rate(boosting_round):
-            return 1.0 / (boosting_round + 1)
-
-        cb = [LearningRateScheduler(custom_learning_rate)]
-        regressor = SparkXGBRegressor(callbacks=cb)
-
-        # Test the save/load of the estimator instead of the model, since
-        # the callbacks param only exists in the estimator but not in the model
-        regressor.save(path)
-        regressor = SparkXGBRegressor.load(path)
-
-        model = regressor.fit(self.reg_df_train)
-        pred_result = model.transform(self.reg_df_test).collect()
-        for row in pred_result:
-            self.assertTrue(
-                np.isclose(
-                    row.prediction, row.expected_prediction_with_callbacks, atol=1e-3
-                )
-            )
-
-    def test_classifier_distributed_basic(self):
-        classifier = SparkXGBClassifier(num_workers=self.n_workers, n_estimators=100)
-        model = classifier.fit(self.cls_df_train_distributed)
-        pred_result = model.transform(self.cls_df_test_distributed).collect()
-        for row in pred_result:
-            self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3))
-            self.assertTrue(
-                np.allclose(row.expected_probability, row.probability, atol=1e-3)
-            )
-
-    def test_classifier_distributed_multiclass(self):
-        # There is no built-in multiclass option for external storage
-        classifier = SparkXGBClassifier(num_workers=self.n_workers, n_estimators=100)
-        model = classifier.fit(self.cls_df_train_distributed_multiclass)
-        pred_result = model.transform(self.cls_df_test_distributed_multiclass).collect()
-        for row in pred_result:
-            self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3))
-            self.assertTrue(
-                np.allclose(row.expected_margins, row.rawPrediction, atol=1e-3)
-            )
-
-    def test_regressor_distributed_basic(self):
-        regressor = SparkXGBRegressor(num_workers=self.n_workers, n_estimators=100)
-        model = regressor.fit(self.reg_df_train_distributed)
-        pred_result = model.transform(self.reg_df_test_distributed).collect()
-        for row in pred_result:
-            self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3))
-
-    def test_classifier_distributed_weight_eval(self):
-        # with weight
-        classifier = SparkXGBClassifier(
-            num_workers=self.n_workers,
-            n_estimators=100,
-            **self.clf_params_with_weight_dist
-        )
-        model = classifier.fit(self.cls_df_train_distributed_with_eval_weight)
-        pred_result = model.transform(
-            self.cls_df_test_distributed_with_eval_weight
-        ).collect()
-        for row in pred_result:
-            self.assertTrue(
-                np.allclose(row.probability, row.expected_prob_with_weight, atol=1e-3)
-            )
-
-        # with eval only
-        classifier = SparkXGBClassifier(
-            num_workers=self.n_workers,
-            n_estimators=100,
-            **self.clf_params_with_eval_dist
-        )
-        model = classifier.fit(self.cls_df_train_distributed_with_eval_weight)
-        pred_result = model.transform(
-            self.cls_df_test_distributed_with_eval_weight
-        ).collect()
-        for row in pred_result:
-            self.assertTrue(
-                np.allclose(row.probability, row.expected_prob_with_eval, atol=1e-3)
-            )
-        assert np.isclose(
-            float(model.get_booster().attributes()["best_score"]),
-            self.clf_best_score_eval,
-            rtol=1e-3,
-        )
-
-        # with both weight and eval
-        classifier = SparkXGBClassifier(
-            num_workers=self.n_workers,
-            n_estimators=100,
-            **self.clf_params_with_eval_dist,
-            **self.clf_params_with_weight_dist
-        )
-        model = classifier.fit(self.cls_df_train_distributed_with_eval_weight)
-        pred_result = model.transform(
-            self.cls_df_test_distributed_with_eval_weight
-        ).collect()
-        for row in pred_result:
-            self.assertTrue(
-                np.allclose(
-                    row.probability, row.expected_prob_with_weight_and_eval, atol=1e-3
-                )
-            )
-        np.isclose(
-            float(model.get_booster().attributes()["best_score"]),
-            self.clf_best_score_weight_and_eval,
-            rtol=1e-3,
-        )
-
-    def test_regressor_distributed_weight_eval(self):
-        # with weight
-        regressor = SparkXGBRegressor(
-            num_workers=self.n_workers,
-            n_estimators=100,
-            **self.reg_params_with_weight_dist
-        )
-        model = regressor.fit(self.reg_df_train_distributed_with_eval_weight)
-        pred_result = model.transform(
-            self.reg_df_test_distributed_with_eval_weight
-        ).collect()
-        for row in pred_result:
-            self.assertTrue(
-                np.isclose(
-                    row.prediction, row.expected_prediction_with_weight, atol=1e-3
-                )
-            )
-        # with eval only
-        regressor = SparkXGBRegressor(
-            num_workers=self.n_workers,
-            n_estimators=100,
-            **self.reg_params_with_eval_dist
-        )
-        model = regressor.fit(self.reg_df_train_distributed_with_eval_weight)
-        pred_result = model.transform(
-            self.reg_df_test_distributed_with_eval_weight
-        ).collect()
-        for row in pred_result:
-            self.assertTrue(
-                np.isclose(row.prediction, row.expected_prediction_with_eval, atol=1e-3)
-            )
-        assert np.isclose(
-            float(model.get_booster().attributes()["best_score"]),
-            self.reg_best_score_eval,
-            rtol=1e-3,
-        )
-        # with both weight and eval
-        regressor = SparkXGBRegressor(
-            num_workers=self.n_workers,
-            n_estimators=100,
-            use_external_storage=False,
-            **self.reg_params_with_eval_dist,
-            **self.reg_params_with_weight_dist
-        )
-        model = regressor.fit(self.reg_df_train_distributed_with_eval_weight)
-        pred_result = model.transform(
-            self.reg_df_test_distributed_with_eval_weight
-        ).collect()
-        for row in pred_result:
-            self.assertTrue(
-                np.isclose(
-                    row.prediction,
-                    row.expected_prediction_with_weight_and_eval,
-                    atol=1e-3,
-                )
-            )
-        assert np.isclose(
-            float(model.get_booster().attributes()["best_score"]),
-            self.reg_best_score_weight_and_eval,
-            rtol=1e-3,
-        )
-
-    def test_num_estimators(self):
-        classifier = SparkXGBClassifier(num_workers=self.n_workers, n_estimators=10)
-        model = classifier.fit(self.cls_df_train_distributed)
-        pred_result = model.transform(
-            self.cls_df_test_distributed_lower_estimators
-        ).collect()
-        print(pred_result)
-        for row in pred_result:
-            self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3))
-            self.assertTrue(
-                np.allclose(row.expected_probability, row.probability, atol=1e-3)
-            )
-
-    def test_distributed_params(self):
-        classifier = SparkXGBClassifier(num_workers=self.n_workers, max_depth=7)
-        model = classifier.fit(self.cls_df_train_distributed)
-        self.assertTrue(hasattr(classifier, "max_depth"))
-        self.assertEqual(classifier.getOrDefault(classifier.max_depth), 7)
-        booster_config = json.loads(model.get_booster().save_config())
-        max_depth = booster_config["learner"]["gradient_booster"]["updater"][
-            "grow_histmaker"
-        ]["train_param"]["max_depth"]
-        self.assertEqual(int(max_depth), 7)
-
-    def test_repartition(self):
-        # The following test case has a few partitioned datasets that are either
-        # well partitioned relative to the number of workers that the user wants
-        # or poorly partitioned. We only want to repartition when the dataset
-        # is poorly partitioned so _repartition_needed is true in those instances.
-
-        classifier = SparkXGBClassifier(num_workers=self.n_workers)
-        basic = self.cls_df_train_distributed
-        self.assertTrue(classifier._repartition_needed(basic))
-        bad_repartitioned = basic.repartition(self.n_workers + 1)
-        self.assertTrue(classifier._repartition_needed(bad_repartitioned))
-        good_repartitioned = basic.repartition(self.n_workers)
-        self.assertFalse(classifier._repartition_needed(good_repartitioned))
-
-        # Now testing if force_repartition returns True regardless of whether the data is well partitioned
-        classifier = SparkXGBClassifier(
-            num_workers=self.n_workers, force_repartition=True
-        )
-        good_repartitioned = basic.repartition(self.n_workers)
-        self.assertTrue(classifier._repartition_needed(good_repartitioned))
--- a/tests/python/test_spark/utils.py
+++ b/tests/python/test_spark/utils.py
@@ -1,145 +0,0 @@
-import contextlib
-import logging
-import shutil
-import sys
-import tempfile
-import unittest
-
-import pytest
-from six import StringIO
-
-from xgboost import testing as tm
-
-if tm.no_spark()["condition"]:
-    pytest.skip(msg=tm.no_spark()["reason"], allow_module_level=True)
-if sys.platform.startswith("win") or sys.platform.startswith("darwin"):
-    pytest.skip("Skipping PySpark tests on Windows", allow_module_level=True)
-
-from pyspark.sql import SparkSession, SQLContext
-from xgboost.spark.utils import _get_default_params_from_func
-
-
-class UtilsTest(unittest.TestCase):
-    def test_get_default_params(self):
-        class Foo:
-            def func1(self, x, y, key1=None, key2="val2", key3=0, key4=None):
-                pass
-
-        unsupported_params = {"key2", "key4"}
-        expected_default_params = {
-            "key1": None,
-            "key3": 0,
-        }
-        actual_default_params = _get_default_params_from_func(
-            Foo.func1, unsupported_params
-        )
-        self.assertEqual(
-            len(expected_default_params.keys()), len(actual_default_params.keys())
-        )
-        for k, v in actual_default_params.items():
-            self.assertEqual(expected_default_params[k], v)
-
-
-@contextlib.contextmanager
-def patch_stdout():
-    """patch stdout and give an output"""
-    sys_stdout = sys.stdout
-    io_out = StringIO()
-    sys.stdout = io_out
-    try:
-        yield io_out
-    finally:
-        sys.stdout = sys_stdout
-
-
-@contextlib.contextmanager
-def patch_logger(name):
-    """patch logger and give an output"""
-    io_out = StringIO()
-    log = logging.getLogger(name)
-    handler = logging.StreamHandler(io_out)
-    log.addHandler(handler)
-    try:
-        yield io_out
-    finally:
-        log.removeHandler(handler)
-
-
-class TestTempDir(object):
-    @classmethod
-    def make_tempdir(cls):
-        """
-        :param dir: Root directory in which to create the temp directory
-        """
-        cls.tempdir = tempfile.mkdtemp(prefix="sparkdl_tests")
-
-    @classmethod
-    def remove_tempdir(cls):
-        shutil.rmtree(cls.tempdir)
-
-
-class TestSparkContext(object):
-    @classmethod
-    def setup_env(cls, spark_config):
-        builder = SparkSession.builder.appName("xgboost spark python API Tests")
-        for k, v in spark_config.items():
-            builder.config(k, v)
-        spark = builder.getOrCreate()
-        logging.getLogger("pyspark").setLevel(logging.INFO)
-
-        cls.sc = spark.sparkContext
-        cls.session = spark
-
-    @classmethod
-    def tear_down_env(cls):
-        cls.session.stop()
-        cls.session = None
-        cls.sc.stop()
-        cls.sc = None
-
-
-class SparkTestCase(TestSparkContext, TestTempDir, unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.setup_env(
-            {
-                "spark.master": "local[4]",
-                "spark.python.worker.reuse": "false",
-                "spark.driver.host": "127.0.0.1",
-                "spark.task.maxFailures": "1",
-                "spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled": "false",
-                "spark.sql.pyspark.jvmStacktrace.enabled": "true",
-            }
-        )
-        cls.make_tempdir()
-
-    @classmethod
-    def tearDownClass(cls):
-        cls.remove_tempdir()
-        cls.tear_down_env()
-
-
-class SparkLocalClusterTestCase(TestSparkContext, TestTempDir, unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.setup_env(
-            {
-                "spark.master": "local-cluster[2, 2, 1024]",
-                "spark.python.worker.reuse": "false",
-                "spark.driver.host": "127.0.0.1",
-                "spark.task.maxFailures": "1",
-                "spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled": "false",
-                "spark.sql.pyspark.jvmStacktrace.enabled": "true",
-                "spark.cores.max": "4",
-                "spark.task.cpus": "1",
-                "spark.executor.cores": "2",
-            }
-        )
-        cls.make_tempdir()
-        # We run a dummy job so that we block until the workers have connected to the master
-        cls.sc.parallelize(range(4), 4).barrier().mapPartitions(lambda _: []).collect()
-
-    @classmethod
-    def tearDownClass(cls):
-        cls.remove_tempdir()
-        cls.tear_down_env()
--- a/tests/python/test_tracker.py
+++ b/tests/python/test_tracker.py
@@ -7,6 +7,7 @@ import pytest
 import xgboost as xgb
 from xgboost import RabitTracker
 from xgboost import testing as tm
+from xgboost import collective

 if sys.platform.startswith("win"):
    pytest.skip("Skipping dask tests on Windows", allow_module_level=True)
@@ -21,12 +22,9 @@ def test_rabit_tracker():


 def run_rabit_ops(client, n_workers):
-    from test_with_dask import _get_client_workers
    from xgboost.dask import CommunicatorContext, _get_dask_config, _get_rabit_args

-    from xgboost import collective
-
-    workers = _get_client_workers(client)
+    workers = tm.get_client_workers(client)
    rabit_args = client.sync(_get_rabit_args, len(workers), _get_dask_config(), client)
    assert not collective.is_distributed()
    n_workers_from_dask = len(workers)
@@ -76,7 +74,6 @@ def test_rabit_ops_ipv6():

 def test_rank_assignment() -> None:
    from distributed import Client, LocalCluster
-    from test_with_dask import _get_client_workers

    def local_test(worker_id):
        with xgb.dask.CommunicatorContext(**args) as ctx:
@@ -89,7 +86,7 @@ def test_rank_assignment() -> None:

    with LocalCluster(n_workers=8) as cluster:
        with Client(cluster) as client:
-            workers = _get_client_workers(client)
+            workers = tm.get_client_workers(client)
            args = client.sync(
                xgb.dask._get_rabit_args,
                len(workers),
--- a/tests/python/test_updaters.py
+++ b/tests/python/test_updaters.py
@@ -8,36 +8,10 @@ from hypothesis import given, note, settings, strategies

 import xgboost as xgb
 from xgboost import testing as tm
-
-exact_parameter_strategy = strategies.fixed_dictionaries({
-    'nthread': strategies.integers(1, 4),
-    'max_depth': strategies.integers(1, 11),
-    'min_child_weight': strategies.floats(0.5, 2.0),
-    'alpha': strategies.floats(1e-5, 2.0),
-    'lambda': strategies.floats(1e-5, 2.0),
-    'eta': strategies.floats(0.01, 0.5),
-    'gamma': strategies.floats(1e-5, 2.0),
-    'seed': strategies.integers(0, 10),
-    # We cannot enable subsampling as the training loss can increase
-    # 'subsample': strategies.floats(0.5, 1.0),
-    'colsample_bytree': strategies.floats(0.5, 1.0),
-    'colsample_bylevel': strategies.floats(0.5, 1.0),
-})
-
-hist_parameter_strategy = strategies.fixed_dictionaries({
-    'max_depth': strategies.integers(1, 11),
-    'max_leaves': strategies.integers(0, 1024),
-    'max_bin': strategies.integers(2, 512),
-    'grow_policy': strategies.sampled_from(['lossguide', 'depthwise']),
-}).filter(lambda x: (x['max_depth'] > 0 or x['max_leaves'] > 0) and (
-    x['max_depth'] > 0 or x['grow_policy'] == 'lossguide'))
-
-
-cat_parameter_strategy = strategies.fixed_dictionaries(
-    {
-        "max_cat_to_onehot": strategies.integers(1, 128),
-        "max_cat_threshold": strategies.integers(1, 128),
-    }
+from xgboost.testing.params import (
+    exact_parameter_strategy,
+    hist_parameter_strategy,
+    cat_parameter_strategy,
 )


--- a/tests/python/test_with_dask.py
+++ b/tests/python/test_with_dask.py
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -1,5 +1,3 @@
-import collections
-import importlib.util
 import json
 import os
 import random
@@ -9,6 +7,7 @@ from typing import Callable, Optional
 import numpy as np
 import pytest
 from sklearn.utils.estimator_checks import parametrize_with_checks
+from xgboost.testing.shared import get_feature_weights, validate_data_initialization

 import xgboost as xgb
 from xgboost import testing as tm
@@ -1031,45 +1030,6 @@ def test_pandas_input():
    np.testing.assert_allclose(np.array(clf_isotonic.classes_), np.array([0, 1]))


-def run_feature_weights(X, y, fw, tree_method, model=xgb.XGBRegressor):
-    with tempfile.TemporaryDirectory() as tmpdir:
-        colsample_bynode = 0.5
-        reg = model(tree_method=tree_method, colsample_bynode=colsample_bynode)
-
-        reg.fit(X, y, feature_weights=fw)
-        model_path = os.path.join(tmpdir, 'model.json')
-        reg.save_model(model_path)
-        with open(model_path) as fd:
-            model = json.load(fd)
-
-        parser_path = os.path.join(
-            tm.demo_dir(__file__), "json-model", "json_parser.py"
-        )
-        spec = importlib.util.spec_from_file_location("JsonParser",
-                                                      parser_path)
-        foo = importlib.util.module_from_spec(spec)
-        spec.loader.exec_module(foo)
-        model = foo.Model(model)
-        splits = {}
-        total_nodes = 0
-        for tree in model.trees:
-            n_nodes = len(tree.nodes)
-            total_nodes += n_nodes
-            for n in range(n_nodes):
-                if tree.is_leaf(n):
-                    continue
-                if splits.get(tree.split_index(n), None) is None:
-                    splits[tree.split_index(n)] = 1
-                else:
-                    splits[tree.split_index(n)] += 1
-
-        od = collections.OrderedDict(sorted(splits.items()))
-        tuples = [(k, v) for k, v in od.items()]
-        k, v = list(zip(*tuples))
-        w = np.polyfit(k, v, deg=1)
-        return w
-
-
@pytest.mark.parametrize("tree_method", ["approx", "hist"])
 def test_feature_weights(tree_method):
    kRows = 512
@@ -1080,12 +1040,18 @@ def test_feature_weights(tree_method):
    fw = np.ones(shape=(kCols,))
    for i in range(kCols):
        fw[i] *= float(i)
-    poly_increasing = run_feature_weights(X, y, fw, tree_method, xgb.XGBRegressor)
+
+    parser_path = os.path.join(tm.demo_dir(__file__), "json-model", "json_parser.py")
+    poly_increasing = get_feature_weights(
+        X, y, fw, parser_path, tree_method, xgb.XGBRegressor
+    )

    fw = np.ones(shape=(kCols,))
    for i in range(kCols):
        fw[i] *= float(kCols - i)
-    poly_decreasing = run_feature_weights(X, y, fw, tree_method, xgb.XGBRegressor)
+    poly_decreasing = get_feature_weights(
+        X, y, fw, parser_path, tree_method, xgb.XGBRegressor
+    )

    # Approximated test, this is dependent on the implementation of random
    # number generator in std library.
@@ -1219,33 +1185,10 @@ def test_multilabel_classification() -> None:
    assert predt.dtype == np.int64


-def run_data_initialization(DMatrix, model, X, y):
-    """Assert that we don't create duplicated DMatrix."""
-
-    old_init = DMatrix.__init__
-    count = [0]
-
-    def new_init(self, **kwargs):
-        count[0] += 1
-        return old_init(self, **kwargs)
-
-    DMatrix.__init__ = new_init
-    model(n_estimators=1).fit(X, y, eval_set=[(X, y)])
-
-    assert count[0] == 1
-    count[0] = 0                # only 1 DMatrix is created.
-
-    y_copy = y.copy()
-    model(n_estimators=1).fit(X, y, eval_set=[(X, y_copy)])
-    assert count[0] == 2        # a different Python object is considered different
-
-    DMatrix.__init__ = old_init
-
-
 def test_data_initialization():
    from sklearn.datasets import load_digits
    X, y = load_digits(return_X_y=True)
-    run_data_initialization(xgb.DMatrix, xgb.XGBClassifier, X, y)
+    validate_data_initialization(xgb.DMatrix, xgb.XGBClassifier, X, y)


@parametrize_with_checks([xgb.XGBRegressor()])