Extract dask and spark tests into distributed tests. (#8395)

- Move test files.
- Run spark and dask separately to prevent conflicts.
- Gather common code into the testing module.
This commit is contained in:
Jiaming Yuan
2022-10-28 16:24:32 +08:00
committed by GitHub
parent f73520bfff
commit cfd2a9f872
34 changed files with 405 additions and 337 deletions

View File

@@ -5,6 +5,7 @@ import numpy as np
import pandas as pd
import pytest
from scipy import sparse
from xgboost.testing.shared import validate_leaf_output
import xgboost as xgb
from xgboost import testing as tm
@@ -26,16 +27,6 @@ def run_threaded_predict(X, rows, predict_func):
assert f.result()
def verify_leaf_output(leaf: np.ndarray, num_parallel_tree: int) -> None:
    """Check that each forest in a leaf prediction is internally consistent.

    ``leaf`` is indexed as (n_samples, n_rounds, n_classes, forest); since
    no row subsampling is used, every tree inside one forest must produce
    the same leaf index.
    """
    n_samples, n_rounds, n_classes, _ = leaf.shape
    for sample in range(n_samples):
        for boosting_round in range(n_rounds):
            for klass in range(n_classes):
                forest = leaf[sample, boosting_round, klass, :]
                assert forest.shape[0] == num_parallel_tree
                # No sampling, all trees within a forest are the same.
                assert np.all(forest == forest[0])
def run_predict_leaf(predictor):
rows = 100
cols = 4
@@ -67,7 +58,7 @@ def run_predict_leaf(predictor):
assert leaf.shape[2] == classes
assert leaf.shape[3] == num_parallel_tree
verify_leaf_output(leaf, num_parallel_tree)
validate_leaf_output(leaf, num_parallel_tree)
ntree_limit = 2
sliced = booster.predict(

View File

@@ -1,160 +0,0 @@
import sys
from typing import List
import numpy as np
import pandas as pd
import pytest
from xgboost import testing as tm
if tm.no_spark()["condition"]:
pytest.skip(msg=tm.no_spark()["reason"], allow_module_level=True)
if sys.platform.startswith("win") or sys.platform.startswith("darwin"):
pytest.skip("Skipping PySpark tests on Windows", allow_module_level=True)
from xgboost.spark.data import (
_read_csr_matrix_from_unwrapped_spark_vec,
alias,
create_dmatrix_from_partitions,
stack_series,
)
from xgboost import DMatrix, QuantileDMatrix
def test_stack() -> None:
    """stack_series must stack list- or array-valued cells into a 2-d array."""
    cases = [
        ([[1, 2], [3, 4]], (2, 2)),
        ([[1], [3]], (2, 1)),
        ([np.array([1, 2]), np.array([3, 4])], (2, 2)),
        ([np.array([1]), np.array([3])], (2, 1)),
    ]
    for values, expected_shape in cases:
        frame = pd.DataFrame({"a": values})
        stacked = stack_series(frame["a"])
        assert stacked.shape == expected_shape
def run_dmatrix_ctor(is_feature_cols: bool, is_qdm: bool, on_gpu: bool) -> None:
    """Build DMatrix objects from partitioned spark data and validate them.

    Creates ``n_batches`` synthetic partitions carrying label, base margin,
    weight and a validation indicator, feeds them through
    ``create_dmatrix_from_partitions`` and checks that the resulting train
    and validation matrices agree with the input frames.

    Parameters
    ----------
    is_feature_cols :
        Store each feature in its own column instead of one array column.
    is_qdm :
        Request a ``QuantileDMatrix`` instead of a plain ``DMatrix``.
    on_gpu :
        Pass GPU ordinal 0 to the constructor.
    """
    rng = np.random.default_rng(0)
    dfs: List[pd.DataFrame] = []
    n_features = 16
    n_samples_per_batch = 16
    n_batches = 10
    feature_types = ["float"] * n_features
    for _ in range(n_batches):
        # Sizes were previously hard-coded (256 and 16); derive them from
        # the batch dimensions so they stay in sync if those change.
        X = rng.normal(loc=0, size=n_samples_per_batch * n_features).reshape(
            n_samples_per_batch, n_features
        )
        y = rng.normal(loc=0, size=n_samples_per_batch)
        m = rng.normal(loc=0, size=n_samples_per_batch)
        w = rng.normal(loc=0.5, scale=0.5, size=n_samples_per_batch)
        w -= w.min()  # weights must be non-negative
        valid = rng.binomial(n=1, p=0.5, size=n_samples_per_batch).astype(np.bool_)
        df = pd.DataFrame(
            {alias.label: y, alias.margin: m, alias.weight: w, alias.valid: valid}
        )
        if is_feature_cols:
            for j in range(X.shape[1]):
                df[f"feat-{j}"] = pd.Series(X[:, j])
        else:
            df[alias.data] = pd.Series(list(X))
        dfs.append(df)

    kwargs = {"feature_types": feature_types}
    device_id = 0 if on_gpu else None
    cols = [f"feat-{i}" for i in range(n_features)]
    feature_cols = cols if is_feature_cols else None
    train_Xy, valid_Xy = create_dmatrix_from_partitions(
        iter(dfs),
        feature_cols,
        gpu_id=device_id,
        use_qdm=is_qdm,
        kwargs=kwargs,
        enable_sparse_data_optim=False,
        has_validation_col=True,
    )
    if is_qdm:
        assert isinstance(train_Xy, QuantileDMatrix)
        assert isinstance(valid_Xy, QuantileDMatrix)
    else:
        assert not isinstance(train_Xy, QuantileDMatrix)
        assert isinstance(train_Xy, DMatrix)
        assert not isinstance(valid_Xy, QuantileDMatrix)
        assert isinstance(valid_Xy, DMatrix)

    assert valid_Xy is not None
    assert valid_Xy.num_row() + train_Xy.num_row() == n_samples_per_batch * n_batches
    assert train_Xy.num_col() == n_features
    assert valid_Xy.num_col() == n_features

    # Rows are routed to train/valid by the validation indicator column.
    df = pd.concat(dfs, axis=0)
    df_train = df.loc[~df[alias.valid], :]
    df_valid = df.loc[df[alias.valid], :]
    assert df_train.shape[0] == train_Xy.num_row()
    assert df_valid.shape[0] == valid_Xy.num_row()
    # margin
    np.testing.assert_allclose(
        df_train[alias.margin].to_numpy(), train_Xy.get_base_margin()
    )
    np.testing.assert_allclose(
        df_valid[alias.margin].to_numpy(), valid_Xy.get_base_margin()
    )
    # weight
    np.testing.assert_allclose(df_train[alias.weight].to_numpy(), train_Xy.get_weight())
    np.testing.assert_allclose(df_valid[alias.weight].to_numpy(), valid_Xy.get_weight())
    # label
    np.testing.assert_allclose(df_train[alias.label].to_numpy(), train_Xy.get_label())
    np.testing.assert_allclose(df_valid[alias.label].to_numpy(), valid_Xy.get_label())
    np.testing.assert_equal(train_Xy.feature_types, feature_types)
    np.testing.assert_equal(valid_Xy.feature_types, feature_types)
@pytest.mark.parametrize(
    "is_feature_cols,is_qdm",
    [(True, True), (True, False), (False, True), (False, False)],
)
def test_dmatrix_ctor(is_feature_cols: bool, is_qdm: bool) -> None:
    """CPU-only driver over every feature-layout / QDM combination."""
    run_dmatrix_ctor(is_feature_cols, is_qdm, on_gpu=False)
def test_read_csr_matrix_from_unwrapped_spark_vec() -> None:
    """Mixed sparse/dense spark vectors must round-trip into one csr_matrix."""
    from scipy.sparse import csr_matrix

    # Rows 0 and 3 are sparse vectors (type 0), rows 1 and 2 dense (type 1).
    unwrapped = pd.DataFrame(
        {
            "featureVectorType": [0, 1, 1, 0],
            "featureVectorSize": [3, None, None, 3],
            "featureVectorIndices": [
                np.array([0, 2], dtype=np.int32),
                None,
                None,
                np.array([1, 2], dtype=np.int32),
            ],
            "featureVectorValues": [
                np.array([3.0, 0.0], dtype=np.float64),
                np.array([13.0, 14.0, 0.0], dtype=np.float64),
                np.array([0.0, 24.0, 25.0], dtype=np.float64),
                np.array([0.0, 35.0], dtype=np.float64),
            ],
        }
    )
    matrix = _read_csr_matrix_from_unwrapped_spark_vec(unwrapped)
    assert isinstance(matrix, csr_matrix)
    np.testing.assert_array_equal(
        matrix.data, [3.0, 0.0, 13.0, 14.0, 0.0, 0.0, 24.0, 25.0, 0.0, 35.0]
    )
    np.testing.assert_array_equal(matrix.indptr, [0, 2, 5, 8, 10])
    np.testing.assert_array_equal(matrix.indices, [0, 2, 0, 1, 2, 0, 1, 2, 1, 2])
    assert matrix.shape == (4, 3)

File diff suppressed because it is too large Load Diff

View File

@@ -1,452 +0,0 @@
import json
import os
import random
import sys
import uuid
import numpy as np
import pytest
from xgboost import testing as tm
if tm.no_spark()["condition"]:
pytest.skip(msg=tm.no_spark()["reason"], allow_module_level=True)
if sys.platform.startswith("win") or sys.platform.startswith("darwin"):
pytest.skip("Skipping PySpark tests on Windows", allow_module_level=True)
from pyspark.ml.linalg import Vectors
from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
from xgboost.spark.utils import _get_max_num_concurrent_tasks
from .utils import SparkLocalClusterTestCase
class XgboostLocalClusterTestCase(SparkLocalClusterTestCase):
def setUp(self):
    """Build the shared dataframes and parameter dicts used by every test."""
    random.seed(2020)
    self.n_workers = _get_max_num_concurrent_tasks(self.session.sparkContext)
    # Reference predictions were produced with the plain (single node)
    # xgboost python package:
    #
    # >>> import numpy as np
    # >>> import xgboost
    # >>> X = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]])
    # >>> y = np.array([0, 1])
    # >>> reg1 = xgboost.XGBRegressor()
    # >>> reg1.fit(X, y)
    # >>> reg1.predict(X)
    # array([8.8363886e-04, 9.9911636e-01], dtype=float32)
    # >>> def custom_lr(boosting_round, num_boost_round):
    # ...     return 1.0 / (boosting_round + 1)
    # >>> reg1.fit(X, y, callbacks=[xgboost.callback.reset_learning_rate(custom_lr)])
    # >>> reg1.predict(X)
    # array([0.02406833, 0.97593164], dtype=float32)
    # >>> reg2 = xgboost.XGBRegressor(max_depth=5, n_estimators=10)
    # >>> reg2.fit(X, y)
    # >>> reg2.predict(X, ntree_limit=5)
    # array([0.22185263, 0.77814734], dtype=float32)
    self.reg_params = {"max_depth": 5, "n_estimators": 10, "ntree_limit": 5}
    self.reg_df_train = self.session.createDataFrame(
        [
            (Vectors.dense(1.0, 2.0, 3.0), 0),
            (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
        ],
        ["features", "label"],
    )
    self.reg_df_test = self.session.createDataFrame(
        [
            (Vectors.dense(1.0, 2.0, 3.0), 0.0, 0.2219, 0.02406),
            (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1.0, 0.7781, 0.9759),
        ],
        [
            "features",
            "expected_prediction",
            "expected_prediction_with_params",
            "expected_prediction_with_callbacks",
        ],
    )

    # ----- Distributed section -----
    # Binary classification.
    binary_rows = [
        (Vectors.dense(1.0, 2.0, 3.0), 0),
        (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
        (Vectors.dense(4.0, 5.0, 6.0), 0),
        (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1),
    ]
    self.cls_df_train_distributed = self.session.createDataFrame(
        binary_rows * 100, ["features", "label"]
    )
    self.cls_df_test_distributed = self.session.createDataFrame(
        [
            (Vectors.dense(1.0, 2.0, 3.0), 0, [0.9949826, 0.0050174]),
            (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, [0.0050174, 0.9949826]),
            (Vectors.dense(4.0, 5.0, 6.0), 0, [0.9949826, 0.0050174]),
            (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, [0.0050174, 0.9949826]),
        ],
        ["features", "expected_label", "expected_probability"],
    )
    # Binary classification with a lower number of estimators.
    self.cls_df_test_distributed_lower_estimators = self.session.createDataFrame(
        [
            (Vectors.dense(1.0, 2.0, 3.0), 0, [0.9735, 0.0265]),
            (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, [0.0265, 0.9735]),
            (Vectors.dense(4.0, 5.0, 6.0), 0, [0.9735, 0.0265]),
            (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, [0.0265, 0.9735]),
        ],
        ["features", "expected_label", "expected_probability"],
    )
    # Multiclass classification.
    multiclass_rows = [
        (Vectors.dense(1.0, 2.0, 3.0), 0),
        (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
        (Vectors.dense(4.0, 5.0, 6.0), 0),
        (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 2),
    ]
    self.cls_df_train_distributed_multiclass = self.session.createDataFrame(
        multiclass_rows * 100, ["features", "label"]
    )
    self.cls_df_test_distributed_multiclass = self.session.createDataFrame(
        [
            (Vectors.dense(1.0, 2.0, 3.0), 0, [4.294563, -2.449409, -2.449409]),
            (
                Vectors.sparse(3, {1: 1.0, 2: 5.5}),
                1,
                [-2.3796105, 3.669014, -2.449409],
            ),
            (Vectors.dense(4.0, 5.0, 6.0), 0, [4.294563, -2.449409, -2.449409]),
            (
                Vectors.sparse(3, {1: 6.0, 2: 7.5}),
                2,
                [-2.3796105, -2.449409, 3.669014],
            ),
        ],
        ["features", "expected_label", "expected_margins"],
    )
    # Regression (same training rows as the multiclass task).
    self.reg_df_train_distributed = self.session.createDataFrame(
        multiclass_rows * 100, ["features", "label"]
    )
    self.reg_df_test_distributed = self.session.createDataFrame(
        [
            (Vectors.dense(1.0, 2.0, 3.0), 1.533e-04),
            (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 9.999e-01),
            (Vectors.dense(4.0, 5.0, 6.0), 1.533e-04),
            (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1.999e00),
        ],
        ["features", "expected_label"],
    )
    # Variants with sample weight and a validation set.
    self.clf_params_with_eval_dist = {
        "validation_indicator_col": "isVal",
        "early_stopping_rounds": 1,
        "eval_metric": "logloss",
    }
    self.clf_params_with_weight_dist = {"weight_col": "weight"}
    eval_weight_rows = [
        (Vectors.dense(1.0, 2.0, 3.0), 0, False, 1.0),
        (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, False, 2.0),
        (Vectors.dense(4.0, 5.0, 6.0), 0, True, 1.0),
        (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, True, 2.0),
    ]
    self.cls_df_train_distributed_with_eval_weight = self.session.createDataFrame(
        eval_weight_rows * 100, ["features", "label", "isVal", "weight"]
    )
    self.cls_df_test_distributed_with_eval_weight = self.session.createDataFrame(
        [
            (
                Vectors.dense(1.0, 2.0, 3.0),
                [0.9955, 0.0044],
                [0.9904, 0.0096],
                [0.9903, 0.0097],
            ),
        ],
        [
            "features",
            "expected_prob_with_weight",
            "expected_prob_with_eval",
            "expected_prob_with_weight_and_eval",
        ],
    )
    self.clf_best_score_eval = 0.009677
    self.clf_best_score_weight_and_eval = 0.006626
    self.reg_params_with_eval_dist = {
        "validation_indicator_col": "isVal",
        "early_stopping_rounds": 1,
        "eval_metric": "rmse",
    }
    self.reg_params_with_weight_dist = {"weight_col": "weight"}
    self.reg_df_train_distributed_with_eval_weight = self.session.createDataFrame(
        eval_weight_rows * 100, ["features", "label", "isVal", "weight"]
    )
    self.reg_df_test_distributed_with_eval_weight = self.session.createDataFrame(
        [
            (Vectors.dense(1.0, 2.0, 3.0), 4.583e-05, 5.239e-05, 6.03e-05),
            (
                Vectors.sparse(3, {1: 1.0, 2: 5.5}),
                9.9997e-01,
                9.99947e-01,
                9.9995e-01,
            ),
        ],
        [
            "features",
            "expected_prediction_with_weight",
            "expected_prediction_with_eval",
            "expected_prediction_with_weight_and_eval",
        ],
    )
    self.reg_best_score_eval = 5.239e-05
    self.reg_best_score_weight_and_eval = 4.810e-05
def test_regressor_basic_with_params(self):
    """Training with explicit params must reproduce the reference predictions."""
    model = SparkXGBRegressor(**self.reg_params).fit(self.reg_df_train)
    for row in model.transform(self.reg_df_test).collect():
        self.assertTrue(
            np.isclose(
                row.prediction, row.expected_prediction_with_params, atol=1e-3
            )
        )
def test_callbacks(self):
    """Callbacks must survive a save/load round trip of the estimator."""
    from xgboost.callback import LearningRateScheduler

    def custom_learning_rate(boosting_round):
        return 1.0 / (boosting_round + 1)

    save_path = os.path.join(self.tempdir, str(uuid.uuid4()))
    estimator = SparkXGBRegressor(
        callbacks=[LearningRateScheduler(custom_learning_rate)]
    )
    # Save/load the estimator rather than the model: the callbacks param
    # exists only on the estimator, not on the fitted model.
    estimator.save(save_path)
    estimator = SparkXGBRegressor.load(save_path)
    model = estimator.fit(self.reg_df_train)
    for row in model.transform(self.reg_df_test).collect():
        self.assertTrue(
            np.isclose(
                row.prediction, row.expected_prediction_with_callbacks, atol=1e-3
            )
        )
def test_classifier_distributed_basic(self):
    """Distributed binary classification matches the reference output."""
    clf = SparkXGBClassifier(num_workers=self.n_workers, n_estimators=100)
    model = clf.fit(self.cls_df_train_distributed)
    for row in model.transform(self.cls_df_test_distributed).collect():
        self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3))
        self.assertTrue(
            np.allclose(row.expected_probability, row.probability, atol=1e-3)
        )
def test_classifier_distributed_multiclass(self):
    """Distributed multiclass classification matches the reference output."""
    # There is no built-in multiclass option for external storage.
    clf = SparkXGBClassifier(num_workers=self.n_workers, n_estimators=100)
    model = clf.fit(self.cls_df_train_distributed_multiclass)
    for row in model.transform(self.cls_df_test_distributed_multiclass).collect():
        self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3))
        self.assertTrue(
            np.allclose(row.expected_margins, row.rawPrediction, atol=1e-3)
        )
def test_regressor_distributed_basic(self):
    """Distributed regression matches the reference predictions."""
    reg = SparkXGBRegressor(num_workers=self.n_workers, n_estimators=100)
    model = reg.fit(self.reg_df_train_distributed)
    for row in model.transform(self.reg_df_test_distributed).collect():
        self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3))
def test_classifier_distributed_weight_eval(self):
    """Classifier training with sample weights and/or an evaluation set."""
    # with weight
    classifier = SparkXGBClassifier(
        num_workers=self.n_workers,
        n_estimators=100,
        **self.clf_params_with_weight_dist
    )
    model = classifier.fit(self.cls_df_train_distributed_with_eval_weight)
    pred_result = model.transform(
        self.cls_df_test_distributed_with_eval_weight
    ).collect()
    for row in pred_result:
        self.assertTrue(
            np.allclose(row.probability, row.expected_prob_with_weight, atol=1e-3)
        )
    # with eval only
    classifier = SparkXGBClassifier(
        num_workers=self.n_workers,
        n_estimators=100,
        **self.clf_params_with_eval_dist
    )
    model = classifier.fit(self.cls_df_train_distributed_with_eval_weight)
    pred_result = model.transform(
        self.cls_df_test_distributed_with_eval_weight
    ).collect()
    for row in pred_result:
        self.assertTrue(
            np.allclose(row.probability, row.expected_prob_with_eval, atol=1e-3)
        )
    assert np.isclose(
        float(model.get_booster().attributes()["best_score"]),
        self.clf_best_score_eval,
        rtol=1e-3,
    )
    # with both weight and eval
    classifier = SparkXGBClassifier(
        num_workers=self.n_workers,
        n_estimators=100,
        **self.clf_params_with_eval_dist,
        **self.clf_params_with_weight_dist
    )
    model = classifier.fit(self.cls_df_train_distributed_with_eval_weight)
    pred_result = model.transform(
        self.cls_df_test_distributed_with_eval_weight
    ).collect()
    for row in pred_result:
        self.assertTrue(
            np.allclose(
                row.probability, row.expected_prob_with_weight_and_eval, atol=1e-3
            )
        )
    # Fix: the np.isclose result was previously discarded, so this
    # best_score check never actually asserted anything.
    assert np.isclose(
        float(model.get_booster().attributes()["best_score"]),
        self.clf_best_score_weight_and_eval,
        rtol=1e-3,
    )
def test_regressor_distributed_weight_eval(self):
    """Regressor training with sample weights and/or an evaluation set."""
    # with weight only
    reg = SparkXGBRegressor(
        num_workers=self.n_workers,
        n_estimators=100,
        **self.reg_params_with_weight_dist
    )
    model = reg.fit(self.reg_df_train_distributed_with_eval_weight)
    rows = model.transform(self.reg_df_test_distributed_with_eval_weight).collect()
    for row in rows:
        self.assertTrue(
            np.isclose(
                row.prediction, row.expected_prediction_with_weight, atol=1e-3
            )
        )
    # with eval only
    reg = SparkXGBRegressor(
        num_workers=self.n_workers,
        n_estimators=100,
        **self.reg_params_with_eval_dist
    )
    model = reg.fit(self.reg_df_train_distributed_with_eval_weight)
    rows = model.transform(self.reg_df_test_distributed_with_eval_weight).collect()
    for row in rows:
        self.assertTrue(
            np.isclose(row.prediction, row.expected_prediction_with_eval, atol=1e-3)
        )
    assert np.isclose(
        float(model.get_booster().attributes()["best_score"]),
        self.reg_best_score_eval,
        rtol=1e-3,
    )
    # with both weight and eval
    # NOTE(review): use_external_storage is only passed here, not in the
    # classifier counterpart — confirm the parameter is still supported.
    reg = SparkXGBRegressor(
        num_workers=self.n_workers,
        n_estimators=100,
        use_external_storage=False,
        **self.reg_params_with_eval_dist,
        **self.reg_params_with_weight_dist
    )
    model = reg.fit(self.reg_df_train_distributed_with_eval_weight)
    rows = model.transform(self.reg_df_test_distributed_with_eval_weight).collect()
    for row in rows:
        self.assertTrue(
            np.isclose(
                row.prediction,
                row.expected_prediction_with_weight_and_eval,
                atol=1e-3,
            )
        )
    assert np.isclose(
        float(model.get_booster().attributes()["best_score"]),
        self.reg_best_score_weight_and_eval,
        rtol=1e-3,
    )
def test_num_estimators(self):
    """A lower n_estimators count must match its dedicated expectations."""
    classifier = SparkXGBClassifier(num_workers=self.n_workers, n_estimators=10)
    model = classifier.fit(self.cls_df_train_distributed)
    pred_result = model.transform(
        self.cls_df_test_distributed_lower_estimators
    ).collect()
    # Removed a leftover debug print of the collected predictions.
    for row in pred_result:
        self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3))
        self.assertTrue(
            np.allclose(row.expected_probability, row.probability, atol=1e-3)
        )
def test_distributed_params(self):
    """Booster config must reflect params passed to the spark estimator."""
    clf = SparkXGBClassifier(num_workers=self.n_workers, max_depth=7)
    model = clf.fit(self.cls_df_train_distributed)
    self.assertTrue(hasattr(clf, "max_depth"))
    self.assertEqual(clf.getOrDefault(clf.max_depth), 7)
    config = json.loads(model.get_booster().save_config())
    trained_depth = config["learner"]["gradient_booster"]["updater"][
        "grow_histmaker"
    ]["train_param"]["max_depth"]
    self.assertEqual(int(trained_depth), 7)
def test_repartition(self):
    """Repartition only when the data is poorly partitioned, unless forced."""
    clf = SparkXGBClassifier(num_workers=self.n_workers)
    base = self.cls_df_train_distributed
    # Default partitioning relative to the worker count needs repartitioning,
    # as does a mismatched partition count ...
    self.assertTrue(clf._repartition_needed(base))
    self.assertTrue(clf._repartition_needed(base.repartition(self.n_workers + 1)))
    # ... but a well partitioned dataset does not.
    well_partitioned = base.repartition(self.n_workers)
    self.assertFalse(clf._repartition_needed(well_partitioned))
    # force_repartition=True repartitions regardless of the current layout.
    forced = SparkXGBClassifier(num_workers=self.n_workers, force_repartition=True)
    self.assertTrue(forced._repartition_needed(well_partitioned))

View File

@@ -1,145 +0,0 @@
import contextlib
import logging
import shutil
import sys
import tempfile
import unittest
import pytest
from six import StringIO
from xgboost import testing as tm
if tm.no_spark()["condition"]:
pytest.skip(msg=tm.no_spark()["reason"], allow_module_level=True)
# PySpark tests are skipped on both Windows and macOS; keep the skip
# message in sync with the condition (it previously mentioned only Windows).
if sys.platform.startswith("win") or sys.platform.startswith("darwin"):
    pytest.skip("Skipping PySpark tests on Windows and macOS", allow_module_level=True)
from pyspark.sql import SparkSession, SQLContext
from xgboost.spark.utils import _get_default_params_from_func
class UtilsTest(unittest.TestCase):
    """Tests for the small helpers in xgboost.spark.utils."""

    def test_get_default_params(self):
        class Foo:
            def func1(self, x, y, key1=None, key2="val2", key3=0, key4=None):
                pass

        unsupported_params = {"key2", "key4"}
        # Only keyword defaults that are not explicitly unsupported survive.
        expected = {"key1": None, "key3": 0}
        actual = _get_default_params_from_func(Foo.func1, unsupported_params)
        self.assertEqual(len(expected.keys()), len(actual.keys()))
        for name, value in actual.items():
            self.assertEqual(expected[name], value)
@contextlib.contextmanager
def patch_stdout():
    """Temporarily replace sys.stdout with a StringIO and yield it."""
    saved = sys.stdout
    sys.stdout = captured = StringIO()
    try:
        yield captured
    finally:
        # Always restore the real stdout, even if the body raised.
        sys.stdout = saved
@contextlib.contextmanager
def patch_logger(name):
    """Attach a StringIO stream handler to logger *name* and yield the buffer."""
    captured = StringIO()
    target = logging.getLogger(name)
    handler = logging.StreamHandler(captured)
    target.addHandler(handler)
    try:
        yield captured
    finally:
        # Detach the handler so the logger is left untouched.
        target.removeHandler(handler)
class TestTempDir(object):
    """Mixin that manages a class-level temporary directory."""

    @classmethod
    def make_tempdir(cls):
        """Create the temp directory; stored on the class as ``tempdir``."""
        cls.tempdir = tempfile.mkdtemp(prefix="sparkdl_tests")

    @classmethod
    def remove_tempdir(cls):
        """Recursively delete the directory created by make_tempdir."""
        shutil.rmtree(cls.tempdir)
class TestSparkContext(object):
    """Mixin that owns a SparkSession / SparkContext pair for a test class."""

    @classmethod
    def setup_env(cls, spark_config):
        builder = SparkSession.builder.appName("xgboost spark python API Tests")
        for key, value in spark_config.items():
            builder.config(key, value)
        session = builder.getOrCreate()
        logging.getLogger("pyspark").setLevel(logging.INFO)
        cls.sc = session.sparkContext
        cls.session = session

    @classmethod
    def tear_down_env(cls):
        cls.session.stop()
        cls.session = None
        cls.sc.stop()
        cls.sc = None
class SparkTestCase(TestSparkContext, TestTempDir, unittest.TestCase):
    """Base class running tests against a local[4] Spark master."""

    @classmethod
    def setUpClass(cls):
        config = {
            "spark.master": "local[4]",
            "spark.python.worker.reuse": "false",
            "spark.driver.host": "127.0.0.1",
            "spark.task.maxFailures": "1",
            "spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled": "false",
            "spark.sql.pyspark.jvmStacktrace.enabled": "true",
        }
        cls.setup_env(config)
        cls.make_tempdir()

    @classmethod
    def tearDownClass(cls):
        cls.remove_tempdir()
        cls.tear_down_env()
class SparkLocalClusterTestCase(TestSparkContext, TestTempDir, unittest.TestCase):
    """Base class running tests against a 2-executor local Spark cluster."""

    @classmethod
    def setUpClass(cls):
        config = {
            "spark.master": "local-cluster[2, 2, 1024]",
            "spark.python.worker.reuse": "false",
            "spark.driver.host": "127.0.0.1",
            "spark.task.maxFailures": "1",
            "spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled": "false",
            "spark.sql.pyspark.jvmStacktrace.enabled": "true",
            "spark.cores.max": "4",
            "spark.task.cpus": "1",
            "spark.executor.cores": "2",
        }
        cls.setup_env(config)
        cls.make_tempdir()
        # Run a dummy barrier job so that we block until all workers have
        # connected to the master.
        cls.sc.parallelize(range(4), 4).barrier().mapPartitions(lambda _: []).collect()

    @classmethod
    def tearDownClass(cls):
        cls.remove_tempdir()
        cls.tear_down_env()

View File

@@ -7,6 +7,7 @@ import pytest
import xgboost as xgb
from xgboost import RabitTracker
from xgboost import testing as tm
from xgboost import collective
if sys.platform.startswith("win"):
pytest.skip("Skipping dask tests on Windows", allow_module_level=True)
@@ -21,12 +22,9 @@ def test_rabit_tracker():
def run_rabit_ops(client, n_workers):
from test_with_dask import _get_client_workers
from xgboost.dask import CommunicatorContext, _get_dask_config, _get_rabit_args
from xgboost import collective
workers = _get_client_workers(client)
workers = tm.get_client_workers(client)
rabit_args = client.sync(_get_rabit_args, len(workers), _get_dask_config(), client)
assert not collective.is_distributed()
n_workers_from_dask = len(workers)
@@ -76,7 +74,6 @@ def test_rabit_ops_ipv6():
def test_rank_assignment() -> None:
from distributed import Client, LocalCluster
from test_with_dask import _get_client_workers
def local_test(worker_id):
with xgb.dask.CommunicatorContext(**args) as ctx:
@@ -89,7 +86,7 @@ def test_rank_assignment() -> None:
with LocalCluster(n_workers=8) as cluster:
with Client(cluster) as client:
workers = _get_client_workers(client)
workers = tm.get_client_workers(client)
args = client.sync(
xgb.dask._get_rabit_args,
len(workers),

View File

@@ -8,36 +8,10 @@ from hypothesis import given, note, settings, strategies
import xgboost as xgb
from xgboost import testing as tm
exact_parameter_strategy = strategies.fixed_dictionaries({
'nthread': strategies.integers(1, 4),
'max_depth': strategies.integers(1, 11),
'min_child_weight': strategies.floats(0.5, 2.0),
'alpha': strategies.floats(1e-5, 2.0),
'lambda': strategies.floats(1e-5, 2.0),
'eta': strategies.floats(0.01, 0.5),
'gamma': strategies.floats(1e-5, 2.0),
'seed': strategies.integers(0, 10),
# We cannot enable subsampling as the training loss can increase
# 'subsample': strategies.floats(0.5, 1.0),
'colsample_bytree': strategies.floats(0.5, 1.0),
'colsample_bylevel': strategies.floats(0.5, 1.0),
})
hist_parameter_strategy = strategies.fixed_dictionaries({
'max_depth': strategies.integers(1, 11),
'max_leaves': strategies.integers(0, 1024),
'max_bin': strategies.integers(2, 512),
'grow_policy': strategies.sampled_from(['lossguide', 'depthwise']),
}).filter(lambda x: (x['max_depth'] > 0 or x['max_leaves'] > 0) and (
x['max_depth'] > 0 or x['grow_policy'] == 'lossguide'))
cat_parameter_strategy = strategies.fixed_dictionaries(
{
"max_cat_to_onehot": strategies.integers(1, 128),
"max_cat_threshold": strategies.integers(1, 128),
}
from xgboost.testing.params import (
exact_parameter_strategy,
hist_parameter_strategy,
cat_parameter_strategy,
)

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,3 @@
import collections
import importlib.util
import json
import os
import random
@@ -9,6 +7,7 @@ from typing import Callable, Optional
import numpy as np
import pytest
from sklearn.utils.estimator_checks import parametrize_with_checks
from xgboost.testing.shared import get_feature_weights, validate_data_initialization
import xgboost as xgb
from xgboost import testing as tm
@@ -1031,45 +1030,6 @@ def test_pandas_input():
np.testing.assert_allclose(np.array(clf_isotonic.classes_), np.array([0, 1]))
def run_feature_weights(X, y, fw, tree_method, model=xgb.XGBRegressor):
    """Fit with feature weights and return a linear fit of split counts.

    Trains with ``colsample_bynode=0.5``, dumps the model to JSON, parses it
    with the demo json parser, counts how often each feature is used for a
    split and returns the degree-1 polyfit of count against feature index.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        reg = model(tree_method=tree_method, colsample_bynode=0.5)
        reg.fit(X, y, feature_weights=fw)
        model_path = os.path.join(tmpdir, "model.json")
        reg.save_model(model_path)
        with open(model_path) as fd:
            model_doc = json.load(fd)

        # Load the demo json parser that lives next to the demo models.
        parser_path = os.path.join(
            tm.demo_dir(__file__), "json-model", "json_parser.py"
        )
        spec = importlib.util.spec_from_file_location("JsonParser", parser_path)
        foo = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(foo)
        parsed = foo.Model(model_doc)

        # Count how often each feature index is chosen for a split.
        splits = {}
        total_nodes = 0
        for tree in parsed.trees:
            n_nodes = len(tree.nodes)
            total_nodes += n_nodes
            for node in range(n_nodes):
                if tree.is_leaf(node):
                    continue
                index = tree.split_index(node)
                splits[index] = splits.get(index, 0) + 1

        ordered = collections.OrderedDict(sorted(splits.items()))
        k, v = list(zip(*[(key, count) for key, count in ordered.items()]))
        return np.polyfit(k, v, deg=1)
@pytest.mark.parametrize("tree_method", ["approx", "hist"])
def test_feature_weights(tree_method):
kRows = 512
@@ -1080,12 +1040,18 @@ def test_feature_weights(tree_method):
fw = np.ones(shape=(kCols,))
for i in range(kCols):
fw[i] *= float(i)
poly_increasing = run_feature_weights(X, y, fw, tree_method, xgb.XGBRegressor)
parser_path = os.path.join(tm.demo_dir(__file__), "json-model", "json_parser.py")
poly_increasing = get_feature_weights(
X, y, fw, parser_path, tree_method, xgb.XGBRegressor
)
fw = np.ones(shape=(kCols,))
for i in range(kCols):
fw[i] *= float(kCols - i)
poly_decreasing = run_feature_weights(X, y, fw, tree_method, xgb.XGBRegressor)
poly_decreasing = get_feature_weights(
X, y, fw, parser_path, tree_method, xgb.XGBRegressor
)
# Approxmated test, this is dependent on the implementation of random
# number generator in std library.
@@ -1219,33 +1185,10 @@ def test_multilabel_classification() -> None:
assert predt.dtype == np.int64
def run_data_initialization(DMatrix, model, X, y):
    """Assert that fitting does not create duplicated DMatrix objects."""
    original_init = DMatrix.__init__
    counter = [0]

    def counting_init(self, **kwargs):
        counter[0] += 1
        return original_init(self, **kwargs)

    DMatrix.__init__ = counting_init
    model(n_estimators=1).fit(X, y, eval_set=[(X, y)])
    # Train and eval share the very same (X, y): only one DMatrix is built.
    assert counter[0] == 1

    counter[0] = 0
    y_copy = y.copy()
    model(n_estimators=1).fit(X, y, eval_set=[(X, y_copy)])
    # A different python object is considered different data.
    assert counter[0] == 2

    DMatrix.__init__ = original_init
def test_data_initialization():
from sklearn.datasets import load_digits
X, y = load_digits(return_X_y=True)
run_data_initialization(xgb.DMatrix, xgb.XGBClassifier, X, y)
validate_data_initialization(xgb.DMatrix, xgb.XGBClassifier, X, y)
@parametrize_with_checks([xgb.XGBRegressor()])