Extract dask and spark test into distributed test. (#8395)

- Move test files.
- Run spark and dask separately to prevent conflicts.
- Gather common code into the testing module.
2022-10-28 16:24:32 +08:00
parent f73520bfff
commit cfd2a9f872
34 changed files with 405 additions and 337 deletions
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -1,5 +1,3 @@
-import collections
-import importlib.util
 import json
 import os
 import random
@@ -9,6 +7,7 @@ from typing import Callable, Optional
 import numpy as np
 import pytest
 from sklearn.utils.estimator_checks import parametrize_with_checks
+from xgboost.testing.shared import get_feature_weights, validate_data_initialization

 import xgboost as xgb
 from xgboost import testing as tm
@@ -1031,45 +1030,6 @@ def test_pandas_input():
    np.testing.assert_allclose(np.array(clf_isotonic.classes_), np.array([0, 1]))


-def run_feature_weights(X, y, fw, tree_method, model=xgb.XGBRegressor):
-    with tempfile.TemporaryDirectory() as tmpdir:
-        colsample_bynode = 0.5
-        reg = model(tree_method=tree_method, colsample_bynode=colsample_bynode)
-
-        reg.fit(X, y, feature_weights=fw)
-        model_path = os.path.join(tmpdir, 'model.json')
-        reg.save_model(model_path)
-        with open(model_path) as fd:
-            model = json.load(fd)
-
-        parser_path = os.path.join(
-            tm.demo_dir(__file__), "json-model", "json_parser.py"
-        )
-        spec = importlib.util.spec_from_file_location("JsonParser",
-                                                      parser_path)
-        foo = importlib.util.module_from_spec(spec)
-        spec.loader.exec_module(foo)
-        model = foo.Model(model)
-        splits = {}
-        total_nodes = 0
-        for tree in model.trees:
-            n_nodes = len(tree.nodes)
-            total_nodes += n_nodes
-            for n in range(n_nodes):
-                if tree.is_leaf(n):
-                    continue
-                if splits.get(tree.split_index(n), None) is None:
-                    splits[tree.split_index(n)] = 1
-                else:
-                    splits[tree.split_index(n)] += 1
-
-        od = collections.OrderedDict(sorted(splits.items()))
-        tuples = [(k, v) for k, v in od.items()]
-        k, v = list(zip(*tuples))
-        w = np.polyfit(k, v, deg=1)
-        return w
-
-
@pytest.mark.parametrize("tree_method", ["approx", "hist"])
 def test_feature_weights(tree_method):
    kRows = 512
@@ -1080,12 +1040,18 @@ def test_feature_weights(tree_method):
    fw = np.ones(shape=(kCols,))
    for i in range(kCols):
        fw[i] *= float(i)
-    poly_increasing = run_feature_weights(X, y, fw, tree_method, xgb.XGBRegressor)
+
+    parser_path = os.path.join(tm.demo_dir(__file__), "json-model", "json_parser.py")
+    poly_increasing = get_feature_weights(
+        X, y, fw, parser_path, tree_method, xgb.XGBRegressor
+    )

    fw = np.ones(shape=(kCols,))
    for i in range(kCols):
        fw[i] *= float(kCols - i)
-    poly_decreasing = run_feature_weights(X, y, fw, tree_method, xgb.XGBRegressor)
+    poly_decreasing = get_feature_weights(
+        X, y, fw, parser_path, tree_method, xgb.XGBRegressor
+    )

    # Approxmated test, this is dependent on the implementation of random
    # number generator in std library.
@@ -1219,33 +1185,10 @@ def test_multilabel_classification() -> None:
    assert predt.dtype == np.int64


-def run_data_initialization(DMatrix, model, X, y):
-    """Assert that we don't create duplicated DMatrix."""
-
-    old_init = DMatrix.__init__
-    count = [0]
-
-    def new_init(self, **kwargs):
-        count[0] += 1
-        return old_init(self, **kwargs)
-
-    DMatrix.__init__ = new_init
-    model(n_estimators=1).fit(X, y, eval_set=[(X, y)])
-
-    assert count[0] == 1
-    count[0] = 0                # only 1 DMatrix is created.
-
-    y_copy = y.copy()
-    model(n_estimators=1).fit(X, y, eval_set=[(X, y_copy)])
-    assert count[0] == 2        # a different Python object is considered different
-
-    DMatrix.__init__ = old_init
-
-
 def test_data_initialization():
    from sklearn.datasets import load_digits
    X, y = load_digits(return_X_y=True)
-    run_data_initialization(xgb.DMatrix, xgb.XGBClassifier, X, y)
+    validate_data_initialization(xgb.DMatrix, xgb.XGBClassifier, X, y)


@parametrize_with_checks([xgb.XGBRegressor()])