Extract dask and spark test into distributed test. (#8395)

- Move test files.
- Run spark and dask separately to prevent conflicts.
- Gather common code into the testing module.
This commit is contained in:
Jiaming Yuan
2022-10-28 16:24:32 +08:00
committed by GitHub
parent f73520bfff
commit cfd2a9f872
34 changed files with 405 additions and 337 deletions

View File

@@ -1,5 +1,3 @@
import collections
import importlib.util
import json
import os
import random
@@ -9,6 +7,7 @@ from typing import Callable, Optional
import numpy as np
import pytest
from sklearn.utils.estimator_checks import parametrize_with_checks
from xgboost.testing.shared import get_feature_weights, validate_data_initialization
import xgboost as xgb
from xgboost import testing as tm
@@ -1031,45 +1030,6 @@ def test_pandas_input():
np.testing.assert_allclose(np.array(clf_isotonic.classes_), np.array([0, 1]))
def run_feature_weights(X, y, fw, tree_method, model=xgb.XGBRegressor):
with tempfile.TemporaryDirectory() as tmpdir:
colsample_bynode = 0.5
reg = model(tree_method=tree_method, colsample_bynode=colsample_bynode)
reg.fit(X, y, feature_weights=fw)
model_path = os.path.join(tmpdir, 'model.json')
reg.save_model(model_path)
with open(model_path) as fd:
model = json.load(fd)
parser_path = os.path.join(
tm.demo_dir(__file__), "json-model", "json_parser.py"
)
spec = importlib.util.spec_from_file_location("JsonParser",
parser_path)
foo = importlib.util.module_from_spec(spec)
spec.loader.exec_module(foo)
model = foo.Model(model)
splits = {}
total_nodes = 0
for tree in model.trees:
n_nodes = len(tree.nodes)
total_nodes += n_nodes
for n in range(n_nodes):
if tree.is_leaf(n):
continue
if splits.get(tree.split_index(n), None) is None:
splits[tree.split_index(n)] = 1
else:
splits[tree.split_index(n)] += 1
od = collections.OrderedDict(sorted(splits.items()))
tuples = [(k, v) for k, v in od.items()]
k, v = list(zip(*tuples))
w = np.polyfit(k, v, deg=1)
return w
@pytest.mark.parametrize("tree_method", ["approx", "hist"])
def test_feature_weights(tree_method):
kRows = 512
@@ -1080,12 +1040,18 @@ def test_feature_weights(tree_method):
fw = np.ones(shape=(kCols,))
for i in range(kCols):
fw[i] *= float(i)
poly_increasing = run_feature_weights(X, y, fw, tree_method, xgb.XGBRegressor)
parser_path = os.path.join(tm.demo_dir(__file__), "json-model", "json_parser.py")
poly_increasing = get_feature_weights(
X, y, fw, parser_path, tree_method, xgb.XGBRegressor
)
fw = np.ones(shape=(kCols,))
for i in range(kCols):
fw[i] *= float(kCols - i)
poly_decreasing = run_feature_weights(X, y, fw, tree_method, xgb.XGBRegressor)
poly_decreasing = get_feature_weights(
X, y, fw, parser_path, tree_method, xgb.XGBRegressor
)
# Approxmated test, this is dependent on the implementation of random
# number generator in std library.
@@ -1219,33 +1185,10 @@ def test_multilabel_classification() -> None:
assert predt.dtype == np.int64
def run_data_initialization(DMatrix, model, X, y):
"""Assert that we don't create duplicated DMatrix."""
old_init = DMatrix.__init__
count = [0]
def new_init(self, **kwargs):
count[0] += 1
return old_init(self, **kwargs)
DMatrix.__init__ = new_init
model(n_estimators=1).fit(X, y, eval_set=[(X, y)])
assert count[0] == 1
count[0] = 0 # only 1 DMatrix is created.
y_copy = y.copy()
model(n_estimators=1).fit(X, y, eval_set=[(X, y_copy)])
assert count[0] == 2 # a different Python object is considered different
DMatrix.__init__ = old_init
def test_data_initialization():
from sklearn.datasets import load_digits
X, y = load_digits(return_X_y=True)
run_data_initialization(xgb.DMatrix, xgb.XGBClassifier, X, y)
validate_data_initialization(xgb.DMatrix, xgb.XGBClassifier, X, y)
@parametrize_with_checks([xgb.XGBRegressor()])