Extract dask and spark test into distributed test. (#8395)
- Move test files. - Run spark and dask separately to prevent conflicts. - Gather common code into the testing module.
This commit is contained in:
0
tests/test_distributed/test_with_spark/__init__.py
Normal file
0
tests/test_distributed/test_with_spark/__init__.py
Normal file
156
tests/test_distributed/test_with_spark/test_data.py
Normal file
156
tests/test_distributed/test_with_spark/test_data.py
Normal file
@@ -0,0 +1,156 @@
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from xgboost import testing as tm
|
||||
|
||||
pytestmark = [pytest.mark.skipif(**tm.no_spark())]
|
||||
|
||||
from xgboost.spark.data import (
|
||||
_read_csr_matrix_from_unwrapped_spark_vec,
|
||||
alias,
|
||||
create_dmatrix_from_partitions,
|
||||
stack_series,
|
||||
)
|
||||
|
||||
from xgboost import DMatrix, QuantileDMatrix
|
||||
|
||||
|
||||
def test_stack() -> None:
    """``stack_series`` turns a series of equal-length rows into a 2-d array."""
    cases = [
        ([[1, 2], [3, 4]], (2, 2)),
        ([[1], [3]], (2, 1)),
        ([np.array([1, 2]), np.array([3, 4])], (2, 2)),
        ([np.array([1]), np.array([3])], (2, 1)),
    ]
    # Both plain lists and ndarrays must stack to the same shape.
    for rows, expected_shape in cases:
        series = pd.DataFrame({"a": rows})["a"]
        stacked = stack_series(series)
        assert stacked.shape == expected_shape
|
||||
|
||||
|
||||
def run_dmatrix_ctor(is_feature_cols: bool, is_qdm: bool, on_gpu: bool) -> None:
    """Exercise ``create_dmatrix_from_partitions`` and validate the result.

    Builds ``n_batches`` partition-like dataframes carrying label, margin,
    weight and a boolean validation indicator (plus features either as one
    array column or as per-feature columns), constructs train/valid matrices
    from them, and checks that the matrix types, row/column counts and the
    label/weight/margin round-trips all match the input frames.

    Parameters
    ----------
    is_feature_cols :
        If True, features are stored as individual ``feat-{i}`` columns;
        otherwise as a single array column under ``alias.data``.
    is_qdm :
        Whether a ``QuantileDMatrix`` should be constructed.
    on_gpu :
        Whether to pass a GPU ordinal (0) as ``gpu_id``.
    """
    rng = np.random.default_rng(0)
    dfs: List[pd.DataFrame] = []
    n_features = 16
    n_samples_per_batch = 16
    n_batches = 10
    feature_types = ["float"] * n_features

    for _ in range(n_batches):
        # Derive sizes from the parameters instead of hard-coding 256/16 so
        # that changing n_features/n_samples_per_batch cannot silently break
        # the reshape or the validation mask below.
        X = rng.normal(loc=0, size=n_samples_per_batch * n_features).reshape(
            n_samples_per_batch, n_features
        )
        y = rng.normal(loc=0, size=n_samples_per_batch)
        m = rng.normal(loc=0, size=n_samples_per_batch)
        w = rng.normal(loc=0.5, scale=0.5, size=n_samples_per_batch)
        # Weights must be non-negative.
        w -= w.min()

        # Random train/valid split indicator for each row.
        valid = rng.binomial(n=1, p=0.5, size=n_samples_per_batch).astype(np.bool_)

        # ``alias`` holds the internal column names used by the spark data
        # adapter (label/margin/weight/valid/data).
        df = pd.DataFrame(
            {alias.label: y, alias.margin: m, alias.weight: w, alias.valid: valid}
        )
        if is_feature_cols:
            for j in range(X.shape[1]):
                df[f"feat-{j}"] = pd.Series(X[:, j])
        else:
            df[alias.data] = pd.Series(list(X))
        dfs.append(df)

    kwargs = {"feature_types": feature_types}
    device_id = 0 if on_gpu else None
    cols = [f"feat-{i}" for i in range(n_features)]
    feature_cols = cols if is_feature_cols else None
    train_Xy, valid_Xy = create_dmatrix_from_partitions(
        iter(dfs),
        feature_cols,
        gpu_id=device_id,
        use_qdm=is_qdm,
        kwargs=kwargs,
        enable_sparse_data_optim=False,
        has_validation_col=True,
    )

    # The requested matrix flavour must be honoured for both matrices.
    if is_qdm:
        assert isinstance(train_Xy, QuantileDMatrix)
        assert isinstance(valid_Xy, QuantileDMatrix)
    else:
        assert not isinstance(train_Xy, QuantileDMatrix)
        assert isinstance(train_Xy, DMatrix)
        assert not isinstance(valid_Xy, QuantileDMatrix)
        assert isinstance(valid_Xy, DMatrix)

    # Every input row ends up in exactly one of the two matrices.
    assert valid_Xy is not None
    assert valid_Xy.num_row() + train_Xy.num_row() == n_samples_per_batch * n_batches
    assert train_Xy.num_col() == n_features
    assert valid_Xy.num_col() == n_features

    # Re-derive the expected split from the concatenated input frames.
    df = pd.concat(dfs, axis=0)
    df_train = df.loc[~df[alias.valid], :]
    df_valid = df.loc[df[alias.valid], :]

    assert df_train.shape[0] == train_Xy.num_row()
    assert df_valid.shape[0] == valid_Xy.num_row()

    # margin
    np.testing.assert_allclose(
        df_train[alias.margin].to_numpy(), train_Xy.get_base_margin()
    )
    np.testing.assert_allclose(
        df_valid[alias.margin].to_numpy(), valid_Xy.get_base_margin()
    )
    # weight
    np.testing.assert_allclose(df_train[alias.weight].to_numpy(), train_Xy.get_weight())
    np.testing.assert_allclose(df_valid[alias.weight].to_numpy(), valid_Xy.get_weight())
    # label
    np.testing.assert_allclose(df_train[alias.label].to_numpy(), train_Xy.get_label())
    np.testing.assert_allclose(df_valid[alias.label].to_numpy(), valid_Xy.get_label())

    np.testing.assert_equal(train_Xy.feature_types, feature_types)
    np.testing.assert_equal(valid_Xy.feature_types, feature_types)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "is_feature_cols,is_qdm",
    [(True, True), (True, False), (False, True), (False, False)],
)
def test_dmatrix_ctor(is_feature_cols: bool, is_qdm: bool) -> None:
    """CPU variant of the DMatrix construction check.

    Runs :func:`run_dmatrix_ctor` over the full feature-layout x matrix-type
    grid with ``on_gpu=False``.
    """
    run_dmatrix_ctor(is_feature_cols, is_qdm, on_gpu=False)
|
||||
|
||||
|
||||
def test_read_csr_matrix_from_unwrapped_spark_vec() -> None:
    """Unwrapped Spark ML vectors should concatenate into a single CSR matrix.

    The input frame mimics the column layout produced by unwrapping Spark ML
    vector structs.  From the data below: rows with ``featureVectorType`` 0
    carry explicit indices plus a size (sparse form), rows with type 1 carry
    only a dense value array — TODO confirm the type codes against
    ``_read_csr_matrix_from_unwrapped_spark_vec``'s implementation.
    """
    from scipy.sparse import csr_matrix

    pd1 = pd.DataFrame(
        {
            "featureVectorType": [0, 1, 1, 0],
            "featureVectorSize": [3, None, None, 3],
            "featureVectorIndices": [
                np.array([0, 2], dtype=np.int32),
                None,
                None,
                np.array([1, 2], dtype=np.int32),
            ],
            "featureVectorValues": [
                np.array([3.0, 0.0], dtype=np.float64),
                np.array([13.0, 14.0, 0.0], dtype=np.float64),
                np.array([0.0, 24.0, 25.0], dtype=np.float64),
                np.array([0.0, 35.0], dtype=np.float64),
            ],
        }
    )
    sm = _read_csr_matrix_from_unwrapped_spark_vec(pd1)
    assert isinstance(sm, csr_matrix)

    # Stored values are kept verbatim — explicit zeros are NOT dropped.
    np.testing.assert_array_equal(
        sm.data, [3.0, 0.0, 13.0, 14.0, 0.0, 0.0, 24.0, 25.0, 0.0, 35.0]
    )
    # Row pointers: 2 entries for row 0, 3 for rows 1-2 (dense), 2 for row 3.
    np.testing.assert_array_equal(sm.indptr, [0, 2, 5, 8, 10])
    np.testing.assert_array_equal(sm.indices, [0, 2, 0, 1, 2, 0, 1, 2, 1, 2])
    assert sm.shape == (4, 3)
|
||||
1125
tests/test_distributed/test_with_spark/test_spark_local.py
Normal file
1125
tests/test_distributed/test_with_spark/test_spark_local.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,449 @@
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
import uuid
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from xgboost import testing as tm
|
||||
|
||||
pytestmark = pytest.mark.skipif(**tm.no_spark())
|
||||
|
||||
from pyspark.ml.linalg import Vectors
|
||||
from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
|
||||
from xgboost.spark.utils import _get_max_num_concurrent_tasks
|
||||
|
||||
from .utils import SparkLocalClusterTestCase
|
||||
|
||||
|
||||
class XgboostLocalClusterTestCase(SparkLocalClusterTestCase):
    """Distributed SparkXGB estimator tests run on a local Spark cluster.

    ``setUp`` builds small train/test dataframes whose expected predictions
    were precomputed with the plain xgboost Python API (see the transcript
    comment below); each test fits an estimator and compares Spark-side
    predictions against those constants.

    Fixes applied in review:
    - ``test_classifier_distributed_weight_eval``: the final ``np.isclose``
      result was discarded (no ``assert``), making the best-score check a
      no-op; the ``assert`` is restored to match every sibling method.
    - ``test_num_estimators``: removed a leftover debug ``print``.
    """

    def setUp(self):
        random.seed(2020)

        self.n_workers = _get_max_num_concurrent_tasks(self.session.sparkContext)
        # The following code use xgboost python library to train xgb model and predict.
        #
        # >>> import numpy as np
        # >>> import xgboost
        # >>> X = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]])
        # >>> y = np.array([0, 1])
        # >>> reg1 = xgboost.XGBRegressor()
        # >>> reg1.fit(X, y)
        # >>> reg1.predict(X)
        # array([8.8363886e-04, 9.9911636e-01], dtype=float32)
        # >>> def custom_lr(boosting_round, num_boost_round):
        # ... return 1.0 / (boosting_round + 1)
        # ...
        # >>> reg1.fit(X, y, callbacks=[xgboost.callback.reset_learning_rate(custom_lr)])
        # >>> reg1.predict(X)
        # array([0.02406833, 0.97593164], dtype=float32)
        # >>> reg2 = xgboost.XGBRegressor(max_depth=5, n_estimators=10)
        # >>> reg2.fit(X, y)
        # >>> reg2.predict(X, ntree_limit=5)
        # array([0.22185263, 0.77814734], dtype=float32)
        self.reg_params = {"max_depth": 5, "n_estimators": 10, "ntree_limit": 5}
        self.reg_df_train = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
            ],
            ["features", "label"],
        )
        self.reg_df_test = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0.0, 0.2219, 0.02406),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1.0, 0.7781, 0.9759),
            ],
            [
                "features",
                "expected_prediction",
                "expected_prediction_with_params",
                "expected_prediction_with_callbacks",
            ],
        )

        # Distributed section
        # Binary classification
        self.cls_df_train_distributed = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
                (Vectors.dense(4.0, 5.0, 6.0), 0),
                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1),
            ]
            * 100,
            ["features", "label"],
        )
        self.cls_df_test_distributed = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0, [0.9949826, 0.0050174]),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, [0.0050174, 0.9949826]),
                (Vectors.dense(4.0, 5.0, 6.0), 0, [0.9949826, 0.0050174]),
                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, [0.0050174, 0.9949826]),
            ],
            ["features", "expected_label", "expected_probability"],
        )
        # Binary classification with different num_estimators
        self.cls_df_test_distributed_lower_estimators = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0, [0.9735, 0.0265]),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, [0.0265, 0.9735]),
                (Vectors.dense(4.0, 5.0, 6.0), 0, [0.9735, 0.0265]),
                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, [0.0265, 0.9735]),
            ],
            ["features", "expected_label", "expected_probability"],
        )

        # Multiclass classification
        self.cls_df_train_distributed_multiclass = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
                (Vectors.dense(4.0, 5.0, 6.0), 0),
                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 2),
            ]
            * 100,
            ["features", "label"],
        )
        self.cls_df_test_distributed_multiclass = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0, [4.294563, -2.449409, -2.449409]),
                (
                    Vectors.sparse(3, {1: 1.0, 2: 5.5}),
                    1,
                    [-2.3796105, 3.669014, -2.449409],
                ),
                (Vectors.dense(4.0, 5.0, 6.0), 0, [4.294563, -2.449409, -2.449409]),
                (
                    Vectors.sparse(3, {1: 6.0, 2: 7.5}),
                    2,
                    [-2.3796105, -2.449409, 3.669014],
                ),
            ],
            ["features", "expected_label", "expected_margins"],
        )

        # Regression
        self.reg_df_train_distributed = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
                (Vectors.dense(4.0, 5.0, 6.0), 0),
                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 2),
            ]
            * 100,
            ["features", "label"],
        )
        self.reg_df_test_distributed = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 1.533e-04),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 9.999e-01),
                (Vectors.dense(4.0, 5.0, 6.0), 1.533e-04),
                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1.999e00),
            ],
            ["features", "expected_label"],
        )

        # Adding weight and validation
        self.clf_params_with_eval_dist = {
            "validation_indicator_col": "isVal",
            "early_stopping_rounds": 1,
            "eval_metric": "logloss",
        }
        self.clf_params_with_weight_dist = {"weight_col": "weight"}
        self.cls_df_train_distributed_with_eval_weight = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0, False, 1.0),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, False, 2.0),
                (Vectors.dense(4.0, 5.0, 6.0), 0, True, 1.0),
                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, True, 2.0),
            ]
            * 100,
            ["features", "label", "isVal", "weight"],
        )
        self.cls_df_test_distributed_with_eval_weight = self.session.createDataFrame(
            [
                (
                    Vectors.dense(1.0, 2.0, 3.0),
                    [0.9955, 0.0044],
                    [0.9904, 0.0096],
                    [0.9903, 0.0097],
                ),
            ],
            [
                "features",
                "expected_prob_with_weight",
                "expected_prob_with_eval",
                "expected_prob_with_weight_and_eval",
            ],
        )
        self.clf_best_score_eval = 0.009677
        self.clf_best_score_weight_and_eval = 0.006626

        self.reg_params_with_eval_dist = {
            "validation_indicator_col": "isVal",
            "early_stopping_rounds": 1,
            "eval_metric": "rmse",
        }
        self.reg_params_with_weight_dist = {"weight_col": "weight"}
        self.reg_df_train_distributed_with_eval_weight = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0, False, 1.0),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, False, 2.0),
                (Vectors.dense(4.0, 5.0, 6.0), 0, True, 1.0),
                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, True, 2.0),
            ]
            * 100,
            ["features", "label", "isVal", "weight"],
        )
        self.reg_df_test_distributed_with_eval_weight = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 4.583e-05, 5.239e-05, 6.03e-05),
                (
                    Vectors.sparse(3, {1: 1.0, 2: 5.5}),
                    9.9997e-01,
                    9.99947e-01,
                    9.9995e-01,
                ),
            ],
            [
                "features",
                "expected_prediction_with_weight",
                "expected_prediction_with_eval",
                "expected_prediction_with_weight_and_eval",
            ],
        )
        self.reg_best_score_eval = 5.239e-05
        self.reg_best_score_weight_and_eval = 4.810e-05

    def test_regressor_basic_with_params(self):
        """Regressor honours explicit params; predictions match precomputed values."""
        regressor = SparkXGBRegressor(**self.reg_params)
        model = regressor.fit(self.reg_df_train)
        pred_result = model.transform(self.reg_df_test).collect()
        for row in pred_result:
            self.assertTrue(
                np.isclose(
                    row.prediction, row.expected_prediction_with_params, atol=1e-3
                )
            )

    def test_callbacks(self):
        """Callbacks survive estimator save/load and affect training."""
        from xgboost.callback import LearningRateScheduler

        path = os.path.join(self.tempdir, str(uuid.uuid4()))

        def custom_learning_rate(boosting_round):
            return 1.0 / (boosting_round + 1)

        cb = [LearningRateScheduler(custom_learning_rate)]
        regressor = SparkXGBRegressor(callbacks=cb)

        # Test the save/load of the estimator instead of the model, since
        # the callbacks param only exists in the estimator but not in the model
        regressor.save(path)
        regressor = SparkXGBRegressor.load(path)

        model = regressor.fit(self.reg_df_train)
        pred_result = model.transform(self.reg_df_test).collect()
        for row in pred_result:
            self.assertTrue(
                np.isclose(
                    row.prediction, row.expected_prediction_with_callbacks, atol=1e-3
                )
            )

    def test_classifier_distributed_basic(self):
        """Multi-worker binary classifier reproduces the expected probabilities."""
        classifier = SparkXGBClassifier(num_workers=self.n_workers, n_estimators=100)
        model = classifier.fit(self.cls_df_train_distributed)
        pred_result = model.transform(self.cls_df_test_distributed).collect()
        for row in pred_result:
            self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3))
            self.assertTrue(
                np.allclose(row.expected_probability, row.probability, atol=1e-3)
            )

    def test_classifier_distributed_multiclass(self):
        """Multi-worker multiclass classifier: labels and raw margins match."""
        # There is no built-in multiclass option for external storage
        classifier = SparkXGBClassifier(num_workers=self.n_workers, n_estimators=100)
        model = classifier.fit(self.cls_df_train_distributed_multiclass)
        pred_result = model.transform(self.cls_df_test_distributed_multiclass).collect()
        for row in pred_result:
            self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3))
            self.assertTrue(
                np.allclose(row.expected_margins, row.rawPrediction, atol=1e-3)
            )

    def test_regressor_distributed_basic(self):
        """Multi-worker regressor reproduces the expected predictions."""
        regressor = SparkXGBRegressor(num_workers=self.n_workers, n_estimators=100)
        model = regressor.fit(self.reg_df_train_distributed)
        pred_result = model.transform(self.reg_df_test_distributed).collect()
        for row in pred_result:
            self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3))

    def test_classifier_distributed_weight_eval(self):
        """Classifier with weight column and/or validation set + early stopping."""
        # with weight
        classifier = SparkXGBClassifier(
            num_workers=self.n_workers,
            n_estimators=100,
            **self.clf_params_with_weight_dist
        )
        model = classifier.fit(self.cls_df_train_distributed_with_eval_weight)
        pred_result = model.transform(
            self.cls_df_test_distributed_with_eval_weight
        ).collect()
        for row in pred_result:
            self.assertTrue(
                np.allclose(row.probability, row.expected_prob_with_weight, atol=1e-3)
            )

        # with eval only
        classifier = SparkXGBClassifier(
            num_workers=self.n_workers,
            n_estimators=100,
            **self.clf_params_with_eval_dist
        )
        model = classifier.fit(self.cls_df_train_distributed_with_eval_weight)
        pred_result = model.transform(
            self.cls_df_test_distributed_with_eval_weight
        ).collect()
        for row in pred_result:
            self.assertTrue(
                np.allclose(row.probability, row.expected_prob_with_eval, atol=1e-3)
            )
        assert np.isclose(
            float(model.get_booster().attributes()["best_score"]),
            self.clf_best_score_eval,
            rtol=1e-3,
        )

        # with both weight and eval
        classifier = SparkXGBClassifier(
            num_workers=self.n_workers,
            n_estimators=100,
            **self.clf_params_with_eval_dist,
            **self.clf_params_with_weight_dist
        )
        model = classifier.fit(self.cls_df_train_distributed_with_eval_weight)
        pred_result = model.transform(
            self.cls_df_test_distributed_with_eval_weight
        ).collect()
        for row in pred_result:
            self.assertTrue(
                np.allclose(
                    row.probability, row.expected_prob_with_weight_and_eval, atol=1e-3
                )
            )
        # BUG FIX: the np.isclose result was previously discarded, so the
        # best-score check never ran; assert it like the eval-only case above.
        assert np.isclose(
            float(model.get_booster().attributes()["best_score"]),
            self.clf_best_score_weight_and_eval,
            rtol=1e-3,
        )

    def test_regressor_distributed_weight_eval(self):
        """Regressor with weight column and/or validation set + early stopping."""
        # with weight
        regressor = SparkXGBRegressor(
            num_workers=self.n_workers,
            n_estimators=100,
            **self.reg_params_with_weight_dist
        )
        model = regressor.fit(self.reg_df_train_distributed_with_eval_weight)
        pred_result = model.transform(
            self.reg_df_test_distributed_with_eval_weight
        ).collect()
        for row in pred_result:
            self.assertTrue(
                np.isclose(
                    row.prediction, row.expected_prediction_with_weight, atol=1e-3
                )
            )
        # with eval only
        regressor = SparkXGBRegressor(
            num_workers=self.n_workers,
            n_estimators=100,
            **self.reg_params_with_eval_dist
        )
        model = regressor.fit(self.reg_df_train_distributed_with_eval_weight)
        pred_result = model.transform(
            self.reg_df_test_distributed_with_eval_weight
        ).collect()
        for row in pred_result:
            self.assertTrue(
                np.isclose(row.prediction, row.expected_prediction_with_eval, atol=1e-3)
            )
        assert np.isclose(
            float(model.get_booster().attributes()["best_score"]),
            self.reg_best_score_eval,
            rtol=1e-3,
        )
        # with both weight and eval
        # NOTE(review): ``use_external_storage`` is not passed by any sibling
        # test — confirm SparkXGBRegressor still accepts this parameter.
        regressor = SparkXGBRegressor(
            num_workers=self.n_workers,
            n_estimators=100,
            use_external_storage=False,
            **self.reg_params_with_eval_dist,
            **self.reg_params_with_weight_dist
        )
        model = regressor.fit(self.reg_df_train_distributed_with_eval_weight)
        pred_result = model.transform(
            self.reg_df_test_distributed_with_eval_weight
        ).collect()
        for row in pred_result:
            self.assertTrue(
                np.isclose(
                    row.prediction,
                    row.expected_prediction_with_weight_and_eval,
                    atol=1e-3,
                )
            )
        assert np.isclose(
            float(model.get_booster().attributes()["best_score"]),
            self.reg_best_score_weight_and_eval,
            rtol=1e-3,
        )

    def test_num_estimators(self):
        """Fewer estimators (10) yield the matching lower-confidence expectations."""
        classifier = SparkXGBClassifier(num_workers=self.n_workers, n_estimators=10)
        model = classifier.fit(self.cls_df_train_distributed)
        pred_result = model.transform(
            self.cls_df_test_distributed_lower_estimators
        ).collect()
        for row in pred_result:
            self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3))
            self.assertTrue(
                np.allclose(row.expected_probability, row.probability, atol=1e-3)
            )

    def test_distributed_params(self):
        """Estimator params propagate into the trained booster's config."""
        classifier = SparkXGBClassifier(num_workers=self.n_workers, max_depth=7)
        model = classifier.fit(self.cls_df_train_distributed)
        self.assertTrue(hasattr(classifier, "max_depth"))
        self.assertEqual(classifier.getOrDefault(classifier.max_depth), 7)
        booster_config = json.loads(model.get_booster().save_config())
        max_depth = booster_config["learner"]["gradient_booster"]["updater"][
            "grow_histmaker"
        ]["train_param"]["max_depth"]
        self.assertEqual(int(max_depth), 7)

    def test_repartition(self):
        """_repartition_needed is True only when partitioning mismatches workers."""
        # The following test case has a few partitioned datasets that are either
        # well partitioned relative to the number of workers that the user wants
        # or poorly partitioned. We only want to repartition when the dataset
        # is poorly partitioned so _repartition_needed is true in those instances.

        classifier = SparkXGBClassifier(num_workers=self.n_workers)
        basic = self.cls_df_train_distributed
        self.assertTrue(classifier._repartition_needed(basic))
        bad_repartitioned = basic.repartition(self.n_workers + 1)
        self.assertTrue(classifier._repartition_needed(bad_repartitioned))
        good_repartitioned = basic.repartition(self.n_workers)
        self.assertFalse(classifier._repartition_needed(good_repartitioned))

        # Now testing if force_repartition returns True regardless of whether the data is well partitioned
        classifier = SparkXGBClassifier(
            num_workers=self.n_workers, force_repartition=True
        )
        good_repartitioned = basic.repartition(self.n_workers)
        self.assertTrue(classifier._repartition_needed(good_repartitioned))
|
||||
143
tests/test_distributed/test_with_spark/utils.py
Normal file
143
tests/test_distributed/test_with_spark/utils.py
Normal file
@@ -0,0 +1,143 @@
|
||||
import contextlib
|
||||
import logging
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
from six import StringIO
|
||||
|
||||
from xgboost import testing as tm
|
||||
|
||||
pytestmark = [pytest.mark.skipif(**tm.no_spark())]
|
||||
|
||||
|
||||
from pyspark.sql import SparkSession, SQLContext
|
||||
from xgboost.spark.utils import _get_default_params_from_func
|
||||
|
||||
|
||||
class UtilsTest(unittest.TestCase):
    """Unit tests for helpers in ``xgboost.spark.utils``."""

    def test_get_default_params(self):
        """Only supported keyword defaults are extracted from a function."""

        class Foo:
            def func1(self, x, y, key1=None, key2="val2", key3=0, key4=None):
                pass

        unsupported = {"key2", "key4"}
        expected = {
            "key1": None,
            "key3": 0,
        }
        actual = _get_default_params_from_func(Foo.func1, unsupported)
        # Same number of entries, and every extracted default matches.
        self.assertEqual(len(expected.keys()), len(actual.keys()))
        for name, default in actual.items():
            self.assertEqual(expected[name], default)
|
||||
|
||||
|
||||
@contextlib.contextmanager
def patch_stdout():
    """Temporarily redirect ``sys.stdout`` into an in-memory buffer.

    Yields
    ------
    io.StringIO
        Buffer capturing everything printed inside the ``with`` block.
    """
    # Use the stdlib StringIO rather than the third-party ``six`` shim; six
    # exists only for Python 2 compatibility, which this codebase no longer
    # needs.
    from io import StringIO

    sys_stdout = sys.stdout
    io_out = StringIO()
    sys.stdout = io_out
    try:
        yield io_out
    finally:
        # Restore stdout even if the managed block raises.
        sys.stdout = sys_stdout
|
||||
|
||||
|
||||
@contextlib.contextmanager
def patch_logger(name):
    """Attach a temporary stream handler to logger *name* and capture output.

    Parameters
    ----------
    name :
        Name of the logger to capture (as passed to ``logging.getLogger``).

    Yields
    ------
    io.StringIO
        Buffer receiving every record emitted through the logger while the
        context is active.
    """
    # Stdlib replacement for ``six.StringIO`` (Python 2 compat shim).
    from io import StringIO

    io_out = StringIO()
    log = logging.getLogger(name)
    handler = logging.StreamHandler(io_out)
    log.addHandler(handler)
    try:
        yield io_out
    finally:
        # Detach the handler so repeated uses don't duplicate output.
        log.removeHandler(handler)
|
||||
|
||||
|
||||
class TestTempDir(object):
    """Mixin that manages a class-scoped temporary directory (``cls.tempdir``)."""

    @classmethod
    def make_tempdir(cls):
        """Create a fresh temporary directory and store its path on the class.

        The directory name is prefixed with ``sparkdl_tests`` and is created
        under the system default temp location.  (The previous docstring
        documented a nonexistent ``dir`` parameter; removed.)
        """
        cls.tempdir = tempfile.mkdtemp(prefix="sparkdl_tests")

    @classmethod
    def remove_tempdir(cls):
        """Recursively delete the directory created by :meth:`make_tempdir`."""
        shutil.rmtree(cls.tempdir)
|
||||
|
||||
|
||||
class TestSparkContext(object):
    """Mixin that owns a shared SparkSession/SparkContext for a test class.

    Stores the session as ``cls.session`` and its context as ``cls.sc``;
    subclasses call :meth:`setup_env` in ``setUpClass`` and
    :meth:`tear_down_env` in ``tearDownClass``.
    """

    @classmethod
    def setup_env(cls, spark_config):
        """Build (or reuse) a SparkSession configured from ``spark_config``.

        :param spark_config: dict of Spark conf key -> value strings applied
            to the session builder before ``getOrCreate``.
        """
        builder = SparkSession.builder.appName("xgboost spark python API Tests")
        # Builder.config mutates the builder in place, so the return value
        # can be ignored here.
        for k, v in spark_config.items():
            builder.config(k, v)
        spark = builder.getOrCreate()
        logging.getLogger("pyspark").setLevel(logging.INFO)

        cls.sc = spark.sparkContext
        cls.session = spark

    @classmethod
    def tear_down_env(cls):
        """Stop the session and context and drop the class-level references."""
        cls.session.stop()
        cls.session = None
        # NOTE(review): SparkSession.stop likely already stops the underlying
        # context; the explicit sc.stop() is kept as defensive cleanup.
        cls.sc.stop()
        cls.sc = None
|
||||
|
||||
|
||||
class SparkTestCase(TestSparkContext, TestTempDir, unittest.TestCase):
    """Test case backed by a single-JVM ``local[4]`` Spark master plus tempdir."""

    @classmethod
    def setUpClass(cls):
        # maxFailures=1 makes task errors surface immediately; the two SQL
        # flags keep full JVM / Python tracebacks for easier debugging.
        cls.setup_env(
            {
                "spark.master": "local[4]",
                "spark.python.worker.reuse": "false",
                "spark.driver.host": "127.0.0.1",
                "spark.task.maxFailures": "1",
                "spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled": "false",
                "spark.sql.pyspark.jvmStacktrace.enabled": "true",
            }
        )
        cls.make_tempdir()

    @classmethod
    def tearDownClass(cls):
        # Remove the tempdir first; tear_down_env stops the Spark session.
        cls.remove_tempdir()
        cls.tear_down_env()
|
||||
|
||||
|
||||
class SparkLocalClusterTestCase(TestSparkContext, TestTempDir, unittest.TestCase):
    """Test case backed by a ``local-cluster`` Spark master plus tempdir.

    ``local-cluster[2, 2, 1024]`` presumably means 2 workers with 2 cores and
    1024 MB each — confirm against the Spark master-URL documentation.
    """

    @classmethod
    def setUpClass(cls):
        cls.setup_env(
            {
                "spark.master": "local-cluster[2, 2, 1024]",
                "spark.python.worker.reuse": "false",
                "spark.driver.host": "127.0.0.1",
                "spark.task.maxFailures": "1",
                "spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled": "false",
                "spark.sql.pyspark.jvmStacktrace.enabled": "true",
                "spark.cores.max": "4",
                "spark.task.cpus": "1",
                "spark.executor.cores": "2",
            }
        )
        cls.make_tempdir()
        # We run a dummy job so that we block until the workers have connected to the master
        cls.sc.parallelize(range(4), 4).barrier().mapPartitions(lambda _: []).collect()

    @classmethod
    def tearDownClass(cls):
        # Remove the tempdir first; tear_down_env stops the Spark session.
        cls.remove_tempdir()
        cls.tear_down_env()
|
||||
Reference in New Issue
Block a user