PySpark XGBoost integration (#8020)
Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu>
Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
@@ -10,7 +10,7 @@ RUN \
     apt-get install -y software-properties-common && \
     add-apt-repository ppa:ubuntu-toolchain-r/test && \
     apt-get update && \
-    apt-get install -y tar unzip wget git build-essential doxygen graphviz llvm libasan2 libidn11 ninja-build gcc-8 g++-8 && \
+    apt-get install -y tar unzip wget git build-essential doxygen graphviz llvm libasan2 libidn11 ninja-build gcc-8 g++-8 openjdk-8-jdk-headless && \
     # CMake
     wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
     bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
@@ -24,6 +24,7 @@ ENV CXX=g++-8
 ENV CPP=cpp-8
 
 ENV GOSU_VERSION 1.10
+ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
 
 # Create new Conda environment
 COPY conda_env/cpu_test.yml /scripts/
@@ -10,7 +10,7 @@ SHELL ["/bin/bash", "-c"]   # Use Bash as shell
 RUN \
     apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub && \
     apt-get update && \
-    apt-get install -y wget unzip bzip2 libgomp1 build-essential && \
+    apt-get install -y wget unzip bzip2 libgomp1 build-essential openjdk-8-jdk-headless && \
     # Python
     wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
     bash Miniconda3.sh -b -p /opt/python
@@ -19,11 +19,14 @@ ENV PATH=/opt/python/bin:$PATH
 
 # Create new Conda environment with cuDF, Dask, and cuPy
 RUN \
-    conda create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
+    conda install -c conda-forge mamba && \
+    mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
         python=3.8 cudf=22.04* rmm=22.04* cudatoolkit=$CUDA_VERSION_ARG dask dask-cuda=22.04* dask-cudf=22.04* cupy \
-        numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis
+        numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
+        pyspark cloudpickle cuda-python=11.7.0
 
 ENV GOSU_VERSION 1.10
+ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
 
 # Install lightweight sudo (not bound to TTY)
 RUN set -ex; \
@@ -28,6 +28,8 @@ dependencies:
 - llvmlite
 - cffi
 - pyarrow
+- pyspark
+- cloudpickle
 - pip:
   - shap
   - awscli

@@ -36,6 +36,8 @@ dependencies:
 - cffi
 - pyarrow
 - protobuf<=3.20
+- pyspark
+- cloudpickle
 - pip:
   - shap
   - ipython  # required by shap at import time.

@@ -35,6 +35,8 @@ dependencies:
 - py-ubjson
 - cffi
 - pyarrow
+- pyspark
+- cloudpickle
 - pip:
   - sphinx_rtd_theme
   - datatable
@@ -34,6 +34,18 @@ function install_xgboost {
   fi
 }
 
+function setup_pyspark_envs {
+  export PYSPARK_DRIVER_PYTHON=`which python`
+  export PYSPARK_PYTHON=`which python`
+  export SPARK_TESTING=1
+}
+
+function unset_pyspark_envs {
+  unset PYSPARK_DRIVER_PYTHON
+  unset PYSPARK_PYTHON
+  unset SPARK_TESTING
+}
+
 function uninstall_xgboost {
   pip uninstall -y xgboost
 }
@@ -43,14 +55,18 @@ case "$suite" in
   gpu)
     source activate gpu_test
     install_xgboost
+    setup_pyspark_envs
     pytest -v -s -rxXs --fulltrace --durations=0 -m "not mgpu" ${args} tests/python-gpu
+    unset_pyspark_envs
     uninstall_xgboost
     ;;
 
   mgpu)
     source activate gpu_test
     install_xgboost
+    setup_pyspark_envs
    pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/python-gpu
+    unset_pyspark_envs
 
     cd tests/distributed
     ./runtests-gpu.sh
@@ -61,7 +77,9 @@ case "$suite" in
     source activate cpu_test
     install_xgboost
     export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1
+    setup_pyspark_envs
     pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python
+    unset_pyspark_envs
     cd tests/distributed
     ./runtests.sh
     uninstall_xgboost
@@ -70,7 +88,9 @@ case "$suite" in
   cpu-arm64)
     source activate aarch64_test
     install_xgboost
+    setup_pyspark_envs
     pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python/test_basic.py tests/python/test_basic_models.py tests/python/test_model_compatibility.py
+    unset_pyspark_envs
     uninstall_xgboost
     ;;
@@ -44,13 +44,15 @@ def pytest_addoption(parser):
 
 
 def pytest_collection_modifyitems(config, items):
-    if config.getoption('--use-rmm-pool'):
+    if config.getoption("--use-rmm-pool"):
         blocklist = [
-            'python-gpu/test_gpu_demos.py::test_dask_training',
-            'python-gpu/test_gpu_prediction.py::TestGPUPredict::test_shap',
-            'python-gpu/test_gpu_linear.py::TestGPULinear'
+            "python-gpu/test_gpu_demos.py::test_dask_training",
+            "python-gpu/test_gpu_prediction.py::TestGPUPredict::test_shap",
+            "python-gpu/test_gpu_linear.py::TestGPULinear",
         ]
-        skip_mark = pytest.mark.skip(reason='This test is not run when --use-rmm-pool flag is active')
+        skip_mark = pytest.mark.skip(
+            reason="This test is not run when --use-rmm-pool flag is active"
+        )
         for item in items:
             if any(item.nodeid.startswith(x) for x in blocklist):
                 item.add_marker(skip_mark)
@@ -58,5 +60,9 @@ def pytest_collection_modifyitems(config, items):
     # mark dask tests as `mgpu`.
     mgpu_mark = pytest.mark.mgpu
     for item in items:
-        if item.nodeid.startswith("python-gpu/test_gpu_with_dask.py"):
+        if item.nodeid.startswith(
+            "python-gpu/test_gpu_with_dask.py"
+        ) or item.nodeid.startswith(
+            "python-gpu/test_spark_with_gpu/test_spark_with_gpu.py"
+        ):
             item.add_marker(mgpu_mark)
3  tests/python-gpu/test_spark_with_gpu/discover_gpu.sh  Executable file
@@ -0,0 +1,3 @@
#!/bin/bash

echo "{\"name\":\"gpu\",\"addresses\":[\"0\",\"1\",\"2\",\"3\"]}"
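
The three-line script above implements Spark's GPU resource-discovery contract: it prints a single JSON resource descriptor (a name plus a list of addresses), which lets the test cluster below pretend one worker owns four GPUs. For context, a minimal sketch, assuming only the standard PySpark 3.x TaskContext API, of how a task reads the address Spark assigned to it; the key "gpu" matches the "name" field emitted by discover_gpu.sh, and wiring the address into an XGBoost worker is this sketch's assumption, not code from this commit:

    from pyspark import TaskContext

    def report_gpu(_):
        # ResourceInformation for the "gpu" resource declared via the discovery script
        addrs = TaskContext.get().resources()["gpu"].addresses
        # e.g. use int(addrs[0]) as the device ordinal for this task's trainer
        yield addrs[0]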
120  tests/python-gpu/test_spark_with_gpu/test_spark_with_gpu.py  Normal file
@@ -0,0 +1,120 @@
import sys

import logging
import pytest
import sklearn

sys.path.append("tests/python")
import testing as tm

if tm.no_spark()["condition"]:
    pytest.skip(msg=tm.no_spark()["reason"], allow_module_level=True)
if sys.platform.startswith("win"):
    pytest.skip("Skipping PySpark tests on Windows", allow_module_level=True)


from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from xgboost.spark import SparkXGBRegressor, SparkXGBClassifier


@pytest.fixture(scope="module", autouse=True)
def spark_session_with_gpu():
    spark_config = {
        "spark.master": "local-cluster[1, 4, 1024]",
        "spark.python.worker.reuse": "false",
        "spark.driver.host": "127.0.0.1",
        "spark.task.maxFailures": "1",
        "spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled": "false",
        "spark.sql.pyspark.jvmStacktrace.enabled": "true",
        "spark.cores.max": "4",
        "spark.task.cpus": "1",
        "spark.executor.cores": "4",
        "spark.worker.resource.gpu.amount": "4",
        "spark.task.resource.gpu.amount": "1",
        "spark.executor.resource.gpu.amount": "4",
        "spark.worker.resource.gpu.discoveryScript": "tests/python-gpu/test_spark_with_gpu/discover_gpu.sh",
    }
    builder = SparkSession.builder.appName("xgboost spark python API Tests with GPU")
    for k, v in spark_config.items():
        builder.config(k, v)
    spark = builder.getOrCreate()
    logging.getLogger("pyspark").setLevel(logging.INFO)
    # We run a dummy job so that we block until the workers have connected to the master
    spark.sparkContext.parallelize(range(4), 4).barrier().mapPartitions(
        lambda _: []
    ).collect()
    yield spark
    spark.stop()


@pytest.fixture
def spark_iris_dataset(spark_session_with_gpu):
    spark = spark_session_with_gpu
    data = sklearn.datasets.load_iris()
    train_rows = [
        (Vectors.dense(features), float(label))
        for features, label in zip(data.data[0::2], data.target[0::2])
    ]
    train_df = spark.createDataFrame(
        spark.sparkContext.parallelize(train_rows, 4), ["features", "label"]
    )
    test_rows = [
        (Vectors.dense(features), float(label))
        for features, label in zip(data.data[1::2], data.target[1::2])
    ]
    test_df = spark.createDataFrame(
        spark.sparkContext.parallelize(test_rows, 4), ["features", "label"]
    )
    return train_df, test_df


@pytest.fixture
def spark_diabetes_dataset(spark_session_with_gpu):
    spark = spark_session_with_gpu
    data = sklearn.datasets.load_diabetes()
    train_rows = [
        (Vectors.dense(features), float(label))
        for features, label in zip(data.data[0::2], data.target[0::2])
    ]
    train_df = spark.createDataFrame(
        spark.sparkContext.parallelize(train_rows, 4), ["features", "label"]
    )
    test_rows = [
        (Vectors.dense(features), float(label))
        for features, label in zip(data.data[1::2], data.target[1::2])
    ]
    test_df = spark.createDataFrame(
        spark.sparkContext.parallelize(test_rows, 4), ["features", "label"]
    )
    return train_df, test_df


def test_sparkxgb_classifier_with_gpu(spark_iris_dataset):
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    classifier = SparkXGBClassifier(
        use_gpu=True,
        num_workers=4,
    )
    train_df, test_df = spark_iris_dataset
    model = classifier.fit(train_df)
    pred_result_df = model.transform(test_df)
    evaluator = MulticlassClassificationEvaluator(metricName="f1")
    f1 = evaluator.evaluate(pred_result_df)
    assert f1 >= 0.97


def test_sparkxgb_regressor_with_gpu(spark_diabetes_dataset):
    from pyspark.ml.evaluation import RegressionEvaluator

    regressor = SparkXGBRegressor(
        use_gpu=True,
        num_workers=4,
    )
    train_df, test_df = spark_diabetes_dataset
    model = regressor.fit(train_df)
    pred_result_df = model.transform(test_df)
    evaluator = RegressionEvaluator(metricName="rmse")
    rmse = evaluator.evaluate(pred_result_df)
    assert rmse <= 65.0
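
The two tests above drive the new estimators through a GPU-backed local cluster; the same public API runs unchanged on CPU. A minimal self-contained sketch against a plain local session, assuming only pyspark plus the xgboost.spark package added by this PR, with hypothetical toy data:

    from pyspark.sql import SparkSession
    from pyspark.ml.linalg import Vectors
    from xgboost.spark import SparkXGBClassifier

    spark = SparkSession.builder.master("local[2]").getOrCreate()
    # Toy two-class dataset in the (features, label) schema the estimator expects
    df = spark.createDataFrame(
        [(Vectors.dense(1.0, 2.0, 3.0), 0), (Vectors.dense(0.0, 1.0, 5.5), 1)] * 50,
        ["features", "label"],
    )
    model = SparkXGBClassifier(num_workers=2).fit(df)
    model.transform(df).select("prediction", "probability").show(5)
    spark.stop()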
0  tests/python/test_spark/__init__.py  Normal file
168  tests/python/test_spark/test_data.py  Normal file
@@ -0,0 +1,168 @@
import sys
import tempfile
import shutil

import pytest
import numpy as np
import pandas as pd

import testing as tm

if tm.no_spark()["condition"]:
    pytest.skip(msg=tm.no_spark()["reason"], allow_module_level=True)
if sys.platform.startswith("win") or sys.platform.startswith("darwin"):
    pytest.skip("Skipping PySpark tests on Windows and MacOS", allow_module_level=True)

from xgboost.spark.data import (
    _row_tuple_list_to_feature_matrix_y_w,
    _convert_partition_data_to_dmatrix,
)

from xgboost import DMatrix, XGBClassifier
from xgboost.training import train as worker_train
from .utils import SparkTestCase
import logging

logging.getLogger("py4j").setLevel(logging.INFO)


class DataTest(SparkTestCase):
    def test_sparse_dense_vector(self):
        def row_tup_iter(data):
            pdf = pd.DataFrame(data)
            yield pdf

        expected_ndarray = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]])
        data = {"values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]]}
        feature_matrix, y, w, _ = _row_tuple_list_to_feature_matrix_y_w(
            list(row_tup_iter(data)),
            train=False,
            has_weight=False,
            has_fit_base_margin=False,
            has_predict_base_margin=False,
        )
        self.assertIsNone(y)
        self.assertIsNone(w)
        self.assertTrue(np.allclose(feature_matrix, expected_ndarray))

        data["label"] = [1, 0]
        feature_matrix, y, w, _ = _row_tuple_list_to_feature_matrix_y_w(
            row_tup_iter(data),
            train=True,
            has_weight=False,
            has_fit_base_margin=False,
            has_predict_base_margin=False,
        )
        self.assertIsNone(w)
        self.assertTrue(np.allclose(feature_matrix, expected_ndarray))
        self.assertTrue(np.array_equal(y, np.array(data["label"])))

        data["weight"] = [0.2, 0.8]
        feature_matrix, y, w, _ = _row_tuple_list_to_feature_matrix_y_w(
            list(row_tup_iter(data)),
            train=True,
            has_weight=True,
            has_fit_base_margin=False,
            has_predict_base_margin=False,
        )
        self.assertTrue(np.allclose(feature_matrix, expected_ndarray))
        self.assertTrue(np.array_equal(y, np.array(data["label"])))
        self.assertTrue(np.array_equal(w, np.array(data["weight"])))

    def test_dmatrix_creator(self):

        # This function acts as a pseudo-itertools.chain()
        def row_tup_iter(data):
            pdf = pd.DataFrame(data)
            yield pdf

        # Standard testing DMatrix creation
        expected_features = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100)
        expected_labels = np.array([1, 0] * 100)
        expected_dmatrix = DMatrix(data=expected_features, label=expected_labels)

        data = {
            "values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100,
            "label": [1, 0] * 100,
        }
        output_dmatrix = _convert_partition_data_to_dmatrix(
            [pd.DataFrame(data)],
            has_weight=False,
            has_validation=False,
            has_base_margin=False,
        )
        # You can't compare DMatrix outputs, so the only way is to predict on the two separate DMatrices using
        # the same classifier and making sure the outputs are equal
        model = XGBClassifier()
        model.fit(expected_features, expected_labels)
        expected_preds = model.get_booster().predict(expected_dmatrix)
        output_preds = model.get_booster().predict(output_dmatrix)
        self.assertTrue(np.allclose(expected_preds, output_preds, atol=1e-3))

        # DMatrix creation with weights
        expected_weight = np.array([0.2, 0.8] * 100)
        expected_dmatrix = DMatrix(
            data=expected_features, label=expected_labels, weight=expected_weight
        )

        data["weight"] = [0.2, 0.8] * 100
        output_dmatrix = _convert_partition_data_to_dmatrix(
            [pd.DataFrame(data)],
            has_weight=True,
            has_validation=False,
            has_base_margin=False,
        )

        model.fit(expected_features, expected_labels, sample_weight=expected_weight)
        expected_preds = model.get_booster().predict(expected_dmatrix)
        output_preds = model.get_booster().predict(output_dmatrix)
        self.assertTrue(np.allclose(expected_preds, output_preds, atol=1e-3))

    def test_external_storage(self):
        # Instantiating base data (features, labels)
        features = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100)
        labels = np.array([1, 0] * 100)
        normal_dmatrix = DMatrix(features, labels)
        test_dmatrix = DMatrix(features)

        data = {
            "values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100,
            "label": [1, 0] * 100,
        }

        # Creating the dmatrix based on storage
        temporary_path = tempfile.mkdtemp()
        storage_dmatrix = _convert_partition_data_to_dmatrix(
            [pd.DataFrame(data)],
            has_weight=False,
            has_validation=False,
            has_base_margin=False,
        )

        # Testing without weights
        normal_booster = worker_train({}, normal_dmatrix)
        storage_booster = worker_train({}, storage_dmatrix)
        normal_preds = normal_booster.predict(test_dmatrix)
        storage_preds = storage_booster.predict(test_dmatrix)
        self.assertTrue(np.allclose(normal_preds, storage_preds, atol=1e-3))
        shutil.rmtree(temporary_path)

        # Testing weights
        weights = np.array([0.2, 0.8] * 100)
        normal_dmatrix = DMatrix(data=features, label=labels, weight=weights)
        data["weight"] = [0.2, 0.8] * 100

        temporary_path = tempfile.mkdtemp()
        storage_dmatrix = _convert_partition_data_to_dmatrix(
            [pd.DataFrame(data)],
            has_weight=True,
            has_validation=False,
            has_base_margin=False,
        )

        normal_booster = worker_train({}, normal_dmatrix)
        storage_booster = worker_train({}, storage_dmatrix)
        normal_preds = normal_booster.predict(test_dmatrix)
        storage_preds = storage_booster.predict(test_dmatrix)
        self.assertTrue(np.allclose(normal_preds, storage_preds, atol=1e-3))
        shutil.rmtree(temporary_path)
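
The tests above pin down `_convert_partition_data_to_dmatrix` by comparing booster predictions, since DMatrix objects cannot be compared directly. Conceptually, the unweighted path amounts to stacking the partition's pandas frames and building one DMatrix; the following is an illustrative sketch of that idea, not the library implementation:

    import numpy as np
    import pandas as pd
    from xgboost import DMatrix

    def partition_to_dmatrix(pdf_iter):
        # Concatenate all pandas chunks produced for one Spark partition
        pdf = pd.concat(list(pdf_iter), ignore_index=True)
        features = np.stack(pdf["values"].to_list())
        label = pdf["label"].to_numpy() if "label" in pdf.columns else None
        weight = pdf["weight"].to_numpy() if "weight" in pdf.columns else None
        return DMatrix(features, label=label, weight=weight)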
971  tests/python/test_spark/test_spark_local.py  Normal file
@@ -0,0 +1,971 @@
import sys
import logging
import random
import uuid

import numpy as np
import pytest

import testing as tm

if tm.no_spark()["condition"]:
    pytest.skip(msg=tm.no_spark()["reason"], allow_module_level=True)
if sys.platform.startswith("win") or sys.platform.startswith("darwin"):
    pytest.skip("Skipping PySpark tests on Windows and MacOS", allow_module_level=True)

from pyspark.ml.functions import vector_to_array
from pyspark.sql import functions as spark_sql_func
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.evaluation import (
    BinaryClassificationEvaluator,
    MulticlassClassificationEvaluator,
)
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from xgboost.spark import (
    SparkXGBClassifier,
    SparkXGBClassifierModel,
    SparkXGBRegressor,
    SparkXGBRegressorModel,
)
from .utils import SparkTestCase
from xgboost import XGBClassifier, XGBRegressor
from xgboost.spark.core import _non_booster_params

logging.getLogger("py4j").setLevel(logging.INFO)


class XgboostLocalTest(SparkTestCase):
    def setUp(self):
        logging.getLogger().setLevel("INFO")
        random.seed(2020)

        # The following code uses the xgboost python library to train an xgb model and predict.
        #
        # >>> import numpy as np
        # >>> import xgboost
        # >>> X = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]])
        # >>> y = np.array([0, 1])
        # >>> reg1 = xgboost.XGBRegressor()
        # >>> reg1.fit(X, y)
        # >>> reg1.predict(X)
        # array([8.8375784e-04, 9.9911624e-01], dtype=float32)
        # >>> def custom_lr(boosting_round):
        # ...     return 1.0 / (boosting_round + 1)
        # ...
        # >>> reg1.fit(X, y, callbacks=[xgboost.callback.LearningRateScheduler(custom_lr)])
        # >>> reg1.predict(X)
        # array([0.02406844, 0.9759315 ], dtype=float32)
        # >>> reg2 = xgboost.XGBRegressor(max_depth=5, n_estimators=10)
        # >>> reg2.fit(X, y)
        # >>> reg2.predict(X, ntree_limit=5)
        # array([0.22185266, 0.77814734], dtype=float32)
        self.reg_params = {"max_depth": 5, "n_estimators": 10, "ntree_limit": 5}
        self.reg_df_train = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
            ],
            ["features", "label"],
        )
        self.reg_df_test = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0.0, 0.2219, 0.02406),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1.0, 0.7781, 0.9759),
            ],
            [
                "features",
                "expected_prediction",
                "expected_prediction_with_params",
                "expected_prediction_with_callbacks",
            ],
        )

        # >>> X = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]])
        # >>> y = np.array([0, 1])
        # >>> cl1 = xgboost.XGBClassifier()
        # >>> cl1.fit(X, y)
        # >>> cl1.predict(X)
        # array([0, 0])
        # >>> cl1.predict_proba(X)
        # array([[0.5, 0.5],
        #        [0.5, 0.5]], dtype=float32)
        # >>> cl2 = xgboost.XGBClassifier(max_depth=5, n_estimators=10, scale_pos_weight=4)
        # >>> cl2.fit(X, y)
        # >>> cl2.predict(X)
        # array([1, 1])
        # >>> cl2.predict_proba(X)
        # array([[0.27574146, 0.72425854],
        #        [0.27574146, 0.72425854]], dtype=float32)
        self.cls_params = {"max_depth": 5, "n_estimators": 10, "scale_pos_weight": 4}

        cls_df_train_data = [
            (Vectors.dense(1.0, 2.0, 3.0), 0),
            (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
        ]
        self.cls_df_train = self.session.createDataFrame(
            cls_df_train_data, ["features", "label"]
        )
        self.cls_df_train_large = self.session.createDataFrame(
            cls_df_train_data * 100, ["features", "label"]
        )
        self.cls_df_test = self.session.createDataFrame(
            [
                (
                    Vectors.dense(1.0, 2.0, 3.0),
                    0,
                    [0.5, 0.5],
                    1,
                    [0.27574146, 0.72425854],
                ),
                (
                    Vectors.sparse(3, {1: 1.0, 2: 5.5}),
                    0,
                    [0.5, 0.5],
                    1,
                    [0.27574146, 0.72425854],
                ),
            ],
            [
                "features",
                "expected_prediction",
                "expected_probability",
                "expected_prediction_with_params",
                "expected_probability_with_params",
            ],
        )

        # kwargs test (using the above data, train, we get the same results)
        self.cls_params_kwargs = {"tree_method": "approx", "sketch_eps": 0.03}

        # >>> X = np.array([[1.0, 2.0, 3.0], [1.0, 2.0, 4.0], [0.0, 1.0, 5.5], [-1.0, -2.0, 1.0]])
        # >>> y = np.array([0, 0, 1, 2])
        # >>> cl = xgboost.XGBClassifier()
        # >>> cl.fit(X, y)
        # >>> cl.predict_proba(np.array([[1.0, 2.0, 3.0]]))
        # array([[0.5374299 , 0.23128504, 0.23128504]], dtype=float32)
        multi_cls_df_train_data = [
            (Vectors.dense(1.0, 2.0, 3.0), 0),
            (Vectors.dense(1.0, 2.0, 4.0), 0),
            (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
            (Vectors.dense(-1.0, -2.0, 1.0), 2),
        ]
        self.multi_cls_df_train = self.session.createDataFrame(
            multi_cls_df_train_data, ["features", "label"]
        )
        self.multi_cls_df_train_large = self.session.createDataFrame(
            multi_cls_df_train_data * 100, ["features", "label"]
        )
        self.multi_cls_df_test = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), [0.5374, 0.2312, 0.2312]),
            ],
            ["features", "expected_probability"],
        )

        # Test regressor with weight and eval set
        # >>> import numpy as np
        # >>> import xgboost
        # >>> X = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5], [4.0, 5.0, 6.0], [0.0, 6.0, 7.5]])
        # >>> w = np.array([1.0, 2.0, 1.0, 2.0])
        # >>> y = np.array([0, 1, 2, 3])
        # >>> reg1 = xgboost.XGBRegressor()
        # >>> reg1.fit(X, y, sample_weight=w)
        # >>> reg1.predict(X)
        # >>> array([1.0679445e-03, 1.0000550e+00, ...
        # >>> X_train = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]])
        # >>> X_val = np.array([[4.0, 5.0, 6.0], [0.0, 6.0, 7.5]])
        # >>> y_train = np.array([0, 1])
        # >>> y_val = np.array([2, 3])
        # >>> w_train = np.array([1.0, 2.0])
        # >>> w_val = np.array([1.0, 2.0])
        # >>> reg2 = xgboost.XGBRegressor()
        # >>> reg2.fit(X_train, y_train, eval_set=[(X_val, y_val)],
        # >>>          early_stopping_rounds=1, eval_metric='rmse')
        # >>> reg2.predict(X)
        # >>> array([8.8370638e-04, 9.9911624e-01, ...
        # >>> reg2.best_score
        # 2.0000002682208837
        # >>> reg3 = xgboost.XGBRegressor()
        # >>> reg3.fit(X_train, y_train, sample_weight=w_train, eval_set=[(X_val, y_val)],
        # >>>          sample_weight_eval_set=[w_val],
        # >>>          early_stopping_rounds=1, eval_metric='rmse')
        # >>> reg3.predict(X)
        # >>> array([0.03155671, 0.98874104,...
        # >>> reg3.best_score
        # 1.9970891552124017
        self.reg_df_train_with_eval_weight = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0, False, 1.0),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, False, 2.0),
                (Vectors.dense(4.0, 5.0, 6.0), 2, True, 1.0),
                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 3, True, 2.0),
            ],
            ["features", "label", "isVal", "weight"],
        )
        self.reg_params_with_eval = {
            "validation_indicator_col": "isVal",
            "early_stopping_rounds": 1,
            "eval_metric": "rmse",
        }
        self.reg_df_test_with_eval_weight = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0.001068, 0.00088, 0.03155),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1.000055, 0.9991, 0.9887),
            ],
            [
                "features",
                "expected_prediction_with_weight",
                "expected_prediction_with_eval",
                "expected_prediction_with_weight_and_eval",
            ],
        )
        self.reg_with_eval_best_score = 2.0
        self.reg_with_eval_and_weight_best_score = 1.997

        # Test classifier with weight and eval set
        # >>> import numpy as np
        # >>> import xgboost
        # >>> X = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5], [4.0, 5.0, 6.0], [0.0, 6.0, 7.5]])
        # >>> w = np.array([1.0, 2.0, 1.0, 2.0])
        # >>> y = np.array([0, 1, 0, 1])
        # >>> cls1 = xgboost.XGBClassifier()
        # >>> cls1.fit(X, y, sample_weight=w)
        # >>> cls1.predict_proba(X)
        # array([[0.3333333, 0.6666667],...
        # >>> X_train = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]])
        # >>> X_val = np.array([[4.0, 5.0, 6.0], [0.0, 6.0, 7.5]])
        # >>> y_train = np.array([0, 1])
        # >>> y_val = np.array([0, 1])
        # >>> w_train = np.array([1.0, 2.0])
        # >>> w_val = np.array([1.0, 2.0])
        # >>> cls2 = xgboost.XGBClassifier()
        # >>> cls2.fit(X_train, y_train, eval_set=[(X_val, y_val)],
        # >>>          early_stopping_rounds=1, eval_metric='logloss')
        # >>> cls2.predict_proba(X)
        # array([[0.5, 0.5],...
        # >>> cls2.best_score
        # 0.6931
        # >>> cls3 = xgboost.XGBClassifier()
        # >>> cls3.fit(X_train, y_train, sample_weight=w_train, eval_set=[(X_val, y_val)],
        # >>>          sample_weight_eval_set=[w_val],
        # >>>          early_stopping_rounds=1, eval_metric='logloss')
        # >>> cls3.predict_proba(X)
        # array([[0.3344962, 0.6655038],...
        # >>> cls3.best_score
        # 0.6365
        self.cls_df_train_with_eval_weight = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0, False, 1.0),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, False, 2.0),
                (Vectors.dense(4.0, 5.0, 6.0), 0, True, 1.0),
                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, True, 2.0),
            ],
            ["features", "label", "isVal", "weight"],
        )
        self.cls_params_with_eval = {
            "validation_indicator_col": "isVal",
            "early_stopping_rounds": 1,
            "eval_metric": "logloss",
        }
        self.cls_df_test_with_eval_weight = self.session.createDataFrame(
            [
                (
                    Vectors.dense(1.0, 2.0, 3.0),
                    [0.3333, 0.6666],
                    [0.5, 0.5],
                    [0.3097, 0.6903],
                ),
            ],
            [
                "features",
                "expected_prob_with_weight",
                "expected_prob_with_eval",
                "expected_prob_with_weight_and_eval",
            ],
        )
        self.cls_with_eval_best_score = 0.6931
        self.cls_with_eval_and_weight_best_score = 0.6378

        # Test classifier both with and without base margin
        # >>> import numpy as np
        # >>> import xgboost
        # >>> X = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5], [4.0, 5.0, 6.0], [0.0, 6.0, 7.5]])
        # >>> w = np.array([1.0, 2.0, 1.0, 2.0])
        # >>> y = np.array([0, 1, 0, 1])
        # >>> base_margin = np.array([1, 0, 0, 1])
        #
        # This is without the base margin
        # >>> cls1 = xgboost.XGBClassifier()
        # >>> cls1.fit(X, y, sample_weight=w)
        # >>> cls1.predict_proba(np.array([[1.0, 2.0, 3.0]]))
        # array([[0.3333333, 0.6666667]], dtype=float32)
        # >>> cls1.predict(np.array([[1.0, 2.0, 3.0]]))
        # array([1])
        #
        # This is with the same base margin for predict
        # >>> cls2 = xgboost.XGBClassifier()
        # >>> cls2.fit(X, y, sample_weight=w, base_margin=base_margin)
        # >>> cls2.predict_proba(np.array([[1.0, 2.0, 3.0]]), base_margin=[0])
        # array([[0.44142532, 0.5585747 ]], dtype=float32)
        # >>> cls2.predict(np.array([[1.0, 2.0, 3.0]]), base_margin=[0])
        # array([1])
        #
        # This is with a different base margin for predict
        # >>> cls2 = xgboost.XGBClassifier()
        # >>> cls2.fit(X, y, sample_weight=w, base_margin=base_margin)
        # >>> cls2.predict_proba(np.array([[1.0, 2.0, 3.0]]), base_margin=[1])
        # array([[0.2252, 0.7747]], dtype=float32)
        # >>> cls2.predict(np.array([[1.0, 2.0, 3.0]]), base_margin=[0])
        # array([1])
        self.cls_df_train_without_base_margin = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0, 1.0),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, 2.0),
                (Vectors.dense(4.0, 5.0, 6.0), 0, 1.0),
                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, 2.0),
            ],
            ["features", "label", "weight"],
        )
        self.cls_df_test_without_base_margin = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), [0.3333, 0.6666], 1),
            ],
            [
                "features",
                "expected_prob_without_base_margin",
                "expected_prediction_without_base_margin",
            ],
        )

        self.cls_df_train_with_same_base_margin = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0, 1.0, 1),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, 2.0, 0),
                (Vectors.dense(4.0, 5.0, 6.0), 0, 1.0, 0),
                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, 2.0, 1),
            ],
            ["features", "label", "weight", "base_margin"],
        )
        self.cls_df_test_with_same_base_margin = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0, [0.4415, 0.5585], 1),
            ],
            [
                "features",
                "base_margin",
                "expected_prob_with_base_margin",
                "expected_prediction_with_base_margin",
            ],
        )

        self.cls_df_train_with_different_base_margin = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0, 1.0, 1),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, 2.0, 0),
                (Vectors.dense(4.0, 5.0, 6.0), 0, 1.0, 0),
                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, 2.0, 1),
            ],
            ["features", "label", "weight", "base_margin"],
        )
        self.cls_df_test_with_different_base_margin = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 1, [0.2252, 0.7747], 1),
            ],
            [
                "features",
                "base_margin",
                "expected_prob_with_base_margin",
                "expected_prediction_with_base_margin",
            ],
        )

    def get_local_tmp_dir(self):
        return self.tempdir + str(uuid.uuid4())

    def test_regressor_params_basic(self):
        py_reg = SparkXGBRegressor()
        self.assertTrue(hasattr(py_reg, "n_estimators"))
        self.assertEqual(py_reg.n_estimators.parent, py_reg.uid)
        self.assertFalse(hasattr(py_reg, "gpu_id"))
        self.assertEqual(py_reg.getOrDefault(py_reg.n_estimators), 100)
        py_reg2 = SparkXGBRegressor(n_estimators=200)
        self.assertEqual(py_reg2.getOrDefault(py_reg2.n_estimators), 200)
        py_reg3 = py_reg2.copy({py_reg2.max_depth: 10})
        self.assertEqual(py_reg3.getOrDefault(py_reg3.n_estimators), 200)
        self.assertEqual(py_reg3.getOrDefault(py_reg3.max_depth), 10)

    def test_classifier_params_basic(self):
        py_cls = SparkXGBClassifier()
        self.assertTrue(hasattr(py_cls, "n_estimators"))
        self.assertEqual(py_cls.n_estimators.parent, py_cls.uid)
        self.assertFalse(hasattr(py_cls, "gpu_id"))
        self.assertEqual(py_cls.getOrDefault(py_cls.n_estimators), 100)
        py_cls2 = SparkXGBClassifier(n_estimators=200)
        self.assertEqual(py_cls2.getOrDefault(py_cls2.n_estimators), 200)
        py_cls3 = py_cls2.copy({py_cls2.max_depth: 10})
        self.assertEqual(py_cls3.getOrDefault(py_cls3.n_estimators), 200)
        self.assertEqual(py_cls3.getOrDefault(py_cls3.max_depth), 10)

    def test_classifier_kwargs_basic(self):
        py_cls = SparkXGBClassifier(**self.cls_params_kwargs)
        self.assertTrue(hasattr(py_cls, "n_estimators"))
        self.assertEqual(py_cls.n_estimators.parent, py_cls.uid)
        self.assertFalse(hasattr(py_cls, "gpu_id"))
        self.assertTrue(hasattr(py_cls, "arbitrary_params_dict"))
        expected_kwargs = {"sketch_eps": 0.03}
        self.assertEqual(
            py_cls.getOrDefault(py_cls.arbitrary_params_dict), expected_kwargs
        )

        # Testing overwritten params
        py_cls = SparkXGBClassifier()
        py_cls.setParams(x=1, y=2)
        py_cls.setParams(y=3, z=4)
        xgb_params = py_cls._gen_xgb_params_dict()
        assert xgb_params["x"] == 1
        assert xgb_params["y"] == 3
        assert xgb_params["z"] == 4

    def test_param_alias(self):
        py_cls = SparkXGBClassifier(features_col="f1", label_col="l1")
        self.assertEqual(py_cls.getOrDefault(py_cls.featuresCol), "f1")
        self.assertEqual(py_cls.getOrDefault(py_cls.labelCol), "l1")
        with pytest.raises(
            ValueError, match="Please use param name features_col instead"
        ):
            SparkXGBClassifier(featuresCol="f1")

    def test_gpu_param_setting(self):
        py_cls = SparkXGBClassifier(use_gpu=True)
        train_params = py_cls._get_distributed_train_params(self.cls_df_train)
        assert train_params["tree_method"] == "gpu_hist"

    @staticmethod
    def test_param_value_converter():
        py_cls = SparkXGBClassifier(missing=np.float64(1.0), sketch_eps=np.float64(0.3))
        # don't check by isinstance(v, float) because for a numpy scalar it will also return True
        assert py_cls.getOrDefault(py_cls.missing).__class__.__name__ == "float"
        assert (
            py_cls.getOrDefault(py_cls.arbitrary_params_dict)[
                "sketch_eps"
            ].__class__.__name__
            == "float64"
        )

    def test_regressor_basic(self):
        regressor = SparkXGBRegressor()
        model = regressor.fit(self.reg_df_train)
        pred_result = model.transform(self.reg_df_test).collect()
        for row in pred_result:
            self.assertTrue(
                np.isclose(row.prediction, row.expected_prediction, atol=1e-3)
            )

    def test_classifier_basic(self):
        classifier = SparkXGBClassifier()
        model = classifier.fit(self.cls_df_train)
        pred_result = model.transform(self.cls_df_test).collect()
        for row in pred_result:
            self.assertEqual(row.prediction, row.expected_prediction)
            self.assertTrue(
                np.allclose(row.probability, row.expected_probability, rtol=1e-3)
            )

    def test_multi_classifier(self):
        classifier = SparkXGBClassifier()
        model = classifier.fit(self.multi_cls_df_train)
        pred_result = model.transform(self.multi_cls_df_test).collect()
        for row in pred_result:
            self.assertTrue(
                np.allclose(row.probability, row.expected_probability, rtol=1e-3)
            )

    def _check_sub_dict_match(self, sub_dist, whole_dict, excluding_keys):
        for k in sub_dist:
            if k not in excluding_keys:
                self.assertTrue(k in whole_dict, f"check on {k} failed")
                self.assertEqual(sub_dist[k], whole_dict[k], f"check on {k} failed")

    def test_regressor_with_params(self):
        regressor = SparkXGBRegressor(**self.reg_params)
        all_params = dict(
            **(regressor._gen_xgb_params_dict()),
            **(regressor._gen_fit_params_dict()),
            **(regressor._gen_predict_params_dict()),
        )
        self._check_sub_dict_match(
            self.reg_params, all_params, excluding_keys=_non_booster_params
        )

        model = regressor.fit(self.reg_df_train)
        all_params = dict(
            **(model._gen_xgb_params_dict()),
            **(model._gen_fit_params_dict()),
            **(model._gen_predict_params_dict()),
        )
        self._check_sub_dict_match(
            self.reg_params, all_params, excluding_keys=_non_booster_params
        )
        pred_result = model.transform(self.reg_df_test).collect()
        for row in pred_result:
            self.assertTrue(
                np.isclose(
                    row.prediction, row.expected_prediction_with_params, atol=1e-3
                )
            )

    def test_classifier_with_params(self):
        classifier = SparkXGBClassifier(**self.cls_params)
        all_params = dict(
            **(classifier._gen_xgb_params_dict()),
            **(classifier._gen_fit_params_dict()),
            **(classifier._gen_predict_params_dict()),
        )
        self._check_sub_dict_match(
            self.cls_params, all_params, excluding_keys=_non_booster_params
        )

        model = classifier.fit(self.cls_df_train)
        all_params = dict(
            **(model._gen_xgb_params_dict()),
            **(model._gen_fit_params_dict()),
            **(model._gen_predict_params_dict()),
        )
        self._check_sub_dict_match(
            self.cls_params, all_params, excluding_keys=_non_booster_params
        )
        pred_result = model.transform(self.cls_df_test).collect()
        for row in pred_result:
            self.assertEqual(row.prediction, row.expected_prediction_with_params)
            self.assertTrue(
                np.allclose(
                    row.probability, row.expected_probability_with_params, rtol=1e-3
                )
            )

    def test_regressor_model_save_load(self):
        path = "file:" + self.get_local_tmp_dir()
        regressor = SparkXGBRegressor(**self.reg_params)
        model = regressor.fit(self.reg_df_train)
        model.save(path)
        loaded_model = SparkXGBRegressorModel.load(path)
        self.assertEqual(model.uid, loaded_model.uid)
        for k, v in self.reg_params.items():
            self.assertEqual(loaded_model.getOrDefault(k), v)

        pred_result = loaded_model.transform(self.reg_df_test).collect()
        for row in pred_result:
            self.assertTrue(
                np.isclose(
                    row.prediction, row.expected_prediction_with_params, atol=1e-3
                )
            )

        with self.assertRaisesRegex(AssertionError, "Expected class name"):
            SparkXGBClassifierModel.load(path)

    def test_classifier_model_save_load(self):
        path = "file:" + self.get_local_tmp_dir()
        classifier = SparkXGBClassifier(**self.cls_params)
        model = classifier.fit(self.cls_df_train)
        model.save(path)
        loaded_model = SparkXGBClassifierModel.load(path)
        self.assertEqual(model.uid, loaded_model.uid)
        for k, v in self.cls_params.items():
            self.assertEqual(loaded_model.getOrDefault(k), v)

        pred_result = loaded_model.transform(self.cls_df_test).collect()
        for row in pred_result:
            self.assertTrue(
                np.allclose(
                    row.probability, row.expected_probability_with_params, atol=1e-3
                )
            )

        with self.assertRaisesRegex(AssertionError, "Expected class name"):
            SparkXGBRegressorModel.load(path)

    @staticmethod
    def _get_params_map(params_kv, estimator):
        return {getattr(estimator, k): v for k, v in params_kv.items()}

    def test_regressor_model_pipeline_save_load(self):
        path = "file:" + self.get_local_tmp_dir()
        regressor = SparkXGBRegressor()
        pipeline = Pipeline(stages=[regressor])
        pipeline = pipeline.copy(extra=self._get_params_map(self.reg_params, regressor))
        model = pipeline.fit(self.reg_df_train)
        model.save(path)

        loaded_model = PipelineModel.load(path)
        for k, v in self.reg_params.items():
            self.assertEqual(loaded_model.stages[0].getOrDefault(k), v)

        pred_result = loaded_model.transform(self.reg_df_test).collect()
        for row in pred_result:
            self.assertTrue(
                np.isclose(
                    row.prediction, row.expected_prediction_with_params, atol=1e-3
                )
            )

    def test_classifier_model_pipeline_save_load(self):
        path = "file:" + self.get_local_tmp_dir()
        classifier = SparkXGBClassifier()
        pipeline = Pipeline(stages=[classifier])
        pipeline = pipeline.copy(
            extra=self._get_params_map(self.cls_params, classifier)
        )
        model = pipeline.fit(self.cls_df_train)
        model.save(path)

        loaded_model = PipelineModel.load(path)
        for k, v in self.cls_params.items():
            self.assertEqual(loaded_model.stages[0].getOrDefault(k), v)

        pred_result = loaded_model.transform(self.cls_df_test).collect()
        for row in pred_result:
            self.assertTrue(
                np.allclose(
                    row.probability, row.expected_probability_with_params, atol=1e-3
                )
            )

    def test_classifier_with_cross_validator(self):
        xgb_classifer = SparkXGBClassifier()
        paramMaps = ParamGridBuilder().addGrid(xgb_classifer.max_depth, [1, 2]).build()
        cvBin = CrossValidator(
            estimator=xgb_classifer,
            estimatorParamMaps=paramMaps,
            evaluator=BinaryClassificationEvaluator(),
            seed=1,
        )
        cvBinModel = cvBin.fit(self.cls_df_train_large)
        cvBinModel.transform(self.cls_df_test)
        cvMulti = CrossValidator(
            estimator=xgb_classifer,
            estimatorParamMaps=paramMaps,
            evaluator=MulticlassClassificationEvaluator(),
            seed=1,
        )
        cvMultiModel = cvMulti.fit(self.multi_cls_df_train_large)
        cvMultiModel.transform(self.multi_cls_df_test)

    def test_callbacks(self):
        from xgboost.callback import LearningRateScheduler

        path = self.get_local_tmp_dir()

        def custom_learning_rate(boosting_round):
            return 1.0 / (boosting_round + 1)

        cb = [LearningRateScheduler(custom_learning_rate)]
        regressor = SparkXGBRegressor(callbacks=cb)

        # Test the save/load of the estimator instead of the model, since
        # the callbacks param only exists in the estimator but not in the model
        regressor.save(path)
        regressor = SparkXGBRegressor.load(path)

        model = regressor.fit(self.reg_df_train)
        pred_result = model.transform(self.reg_df_test).collect()
        for row in pred_result:
            self.assertTrue(
                np.isclose(
                    row.prediction, row.expected_prediction_with_callbacks, atol=1e-3
                )
            )

    def test_train_with_initial_model(self):
        path = self.get_local_tmp_dir()
        reg1 = SparkXGBRegressor(**self.reg_params)
        model = reg1.fit(self.reg_df_train)
        init_booster = model.get_booster()
        reg2 = SparkXGBRegressor(max_depth=2, n_estimators=2, xgb_model=init_booster)
        model21 = reg2.fit(self.reg_df_train)
        pred_res21 = model21.transform(self.reg_df_test).collect()
        reg2.save(path)
        reg2 = SparkXGBRegressor.load(path)
        self.assertTrue(reg2.getOrDefault(reg2.xgb_model) is not None)
        model22 = reg2.fit(self.reg_df_train)
        pred_res22 = model22.transform(self.reg_df_test).collect()
        # Test that the transform result is the same for the original and the loaded model
        for row1, row2 in zip(pred_res21, pred_res22):
            self.assertTrue(np.isclose(row1.prediction, row2.prediction, atol=1e-3))

    def test_classifier_with_base_margin(self):
        cls_without_base_margin = SparkXGBClassifier(weight_col="weight")
        model_without_base_margin = cls_without_base_margin.fit(
            self.cls_df_train_without_base_margin
        )
        pred_result_without_base_margin = model_without_base_margin.transform(
            self.cls_df_test_without_base_margin
        ).collect()
        for row in pred_result_without_base_margin:
            self.assertTrue(
                np.isclose(
                    row.prediction,
                    row.expected_prediction_without_base_margin,
                    atol=1e-3,
                )
            )
            np.testing.assert_allclose(
                row.probability, row.expected_prob_without_base_margin, atol=1e-3
            )

        cls_with_same_base_margin = SparkXGBClassifier(
            weight_col="weight", base_margin_col="base_margin"
        )
        model_with_same_base_margin = cls_with_same_base_margin.fit(
            self.cls_df_train_with_same_base_margin
        )
        pred_result_with_same_base_margin = model_with_same_base_margin.transform(
            self.cls_df_test_with_same_base_margin
        ).collect()
        for row in pred_result_with_same_base_margin:
            self.assertTrue(
                np.isclose(
                    row.prediction, row.expected_prediction_with_base_margin, atol=1e-3
                )
            )
            np.testing.assert_allclose(
                row.probability, row.expected_prob_with_base_margin, atol=1e-3
            )

        cls_with_different_base_margin = SparkXGBClassifier(
            weight_col="weight", base_margin_col="base_margin"
        )
        model_with_different_base_margin = cls_with_different_base_margin.fit(
            self.cls_df_train_with_different_base_margin
        )
        pred_result_with_different_base_margin = (
            model_with_different_base_margin.transform(
                self.cls_df_test_with_different_base_margin
            ).collect()
        )
        for row in pred_result_with_different_base_margin:
            self.assertTrue(
                np.isclose(
                    row.prediction, row.expected_prediction_with_base_margin, atol=1e-3
                )
            )
            np.testing.assert_allclose(
                row.probability, row.expected_prob_with_base_margin, atol=1e-3
            )

    def test_regressor_with_weight_eval(self):
        # with weight
        regressor_with_weight = SparkXGBRegressor(weight_col="weight")
        model_with_weight = regressor_with_weight.fit(
            self.reg_df_train_with_eval_weight
        )
        pred_result_with_weight = model_with_weight.transform(
            self.reg_df_test_with_eval_weight
        ).collect()
        for row in pred_result_with_weight:
            self.assertTrue(
                np.isclose(
                    row.prediction, row.expected_prediction_with_weight, atol=1e-3
                )
            )
        # with eval
        regressor_with_eval = SparkXGBRegressor(**self.reg_params_with_eval)
        model_with_eval = regressor_with_eval.fit(self.reg_df_train_with_eval_weight)
        self.assertTrue(
            np.isclose(
                model_with_eval._xgb_sklearn_model.best_score,
                self.reg_with_eval_best_score,
                atol=1e-3,
            ),
            f"Expected best score: {self.reg_with_eval_best_score}, "
            f"but got {model_with_eval._xgb_sklearn_model.best_score}",
        )
        pred_result_with_eval = model_with_eval.transform(
            self.reg_df_test_with_eval_weight
        ).collect()
        for row in pred_result_with_eval:
            self.assertTrue(
                np.isclose(
                    row.prediction, row.expected_prediction_with_eval, atol=1e-3
                ),
                f"Expected prediction {row.expected_prediction_with_eval}, "
                f"but got {row.prediction}",
            )
        # with weight and eval
        regressor_with_weight_eval = SparkXGBRegressor(
            weight_col="weight", **self.reg_params_with_eval
        )
        model_with_weight_eval = regressor_with_weight_eval.fit(
            self.reg_df_train_with_eval_weight
        )
        pred_result_with_weight_eval = model_with_weight_eval.transform(
            self.reg_df_test_with_eval_weight
        ).collect()
        self.assertTrue(
            np.isclose(
                model_with_weight_eval._xgb_sklearn_model.best_score,
                self.reg_with_eval_and_weight_best_score,
                atol=1e-3,
            )
        )
        for row in pred_result_with_weight_eval:
            self.assertTrue(
                np.isclose(
                    row.prediction,
                    row.expected_prediction_with_weight_and_eval,
                    atol=1e-3,
                )
            )

    def test_classifier_with_weight_eval(self):
        # with weight
        classifier_with_weight = SparkXGBClassifier(weight_col="weight")
        model_with_weight = classifier_with_weight.fit(
            self.cls_df_train_with_eval_weight
        )
        pred_result_with_weight = model_with_weight.transform(
            self.cls_df_test_with_eval_weight
        ).collect()
        for row in pred_result_with_weight:
            self.assertTrue(
                np.allclose(row.probability, row.expected_prob_with_weight, atol=1e-3)
            )
        # with eval
        classifier_with_eval = SparkXGBClassifier(**self.cls_params_with_eval)
        model_with_eval = classifier_with_eval.fit(self.cls_df_train_with_eval_weight)
        self.assertTrue(
            np.isclose(
                model_with_eval._xgb_sklearn_model.best_score,
                self.cls_with_eval_best_score,
                atol=1e-3,
            )
        )
        pred_result_with_eval = model_with_eval.transform(
            self.cls_df_test_with_eval_weight
        ).collect()
        for row in pred_result_with_eval:
            self.assertTrue(
                np.allclose(row.probability, row.expected_prob_with_eval, atol=1e-3)
            )
        # with weight and eval
        # Added scale_pos_weight because in 1.4.2, the original answer returns 0.5 which
        # doesn't really indicate this working correctly.
        classifier_with_weight_eval = SparkXGBClassifier(
            weight_col="weight", scale_pos_weight=4, **self.cls_params_with_eval
        )
        model_with_weight_eval = classifier_with_weight_eval.fit(
            self.cls_df_train_with_eval_weight
        )
        pred_result_with_weight_eval = model_with_weight_eval.transform(
            self.cls_df_test_with_eval_weight
        ).collect()
        self.assertTrue(
            np.isclose(
                model_with_weight_eval._xgb_sklearn_model.best_score,
                self.cls_with_eval_and_weight_best_score,
                atol=1e-3,
            )
        )
        for row in pred_result_with_weight_eval:
            self.assertTrue(
                np.allclose(
                    row.probability, row.expected_prob_with_weight_and_eval, atol=1e-3
                )
            )

    def test_num_workers_param(self):
        regressor = SparkXGBRegressor(num_workers=-1)
        self.assertRaises(ValueError, regressor._validate_params)
        classifier = SparkXGBClassifier(num_workers=0)
        self.assertRaises(ValueError, classifier._validate_params)

    def test_use_gpu_param(self):
        classifier = SparkXGBClassifier(use_gpu=True, tree_method="exact")
        self.assertRaises(ValueError, classifier._validate_params)
        regressor = SparkXGBRegressor(use_gpu=True, tree_method="exact")
        self.assertRaises(ValueError, regressor._validate_params)
        regressor = SparkXGBRegressor(use_gpu=True, tree_method="gpu_hist")
        regressor = SparkXGBRegressor(use_gpu=True)
        classifier = SparkXGBClassifier(use_gpu=True, tree_method="gpu_hist")
        classifier = SparkXGBClassifier(use_gpu=True)

    def test_convert_to_sklearn_model(self):
        classifier = SparkXGBClassifier(
            n_estimators=200, missing=2.0, max_depth=3, sketch_eps=0.5
        )
        clf_model = classifier.fit(self.cls_df_train)

        regressor = SparkXGBRegressor(
            n_estimators=200, missing=2.0, max_depth=3, sketch_eps=0.5
        )
        reg_model = regressor.fit(self.reg_df_train)

        # Check that regardless of what booster, _convert_to_model converts to the correct class type
        sklearn_classifier = classifier._convert_to_sklearn_model(
            clf_model.get_booster()
        )
        assert isinstance(sklearn_classifier, XGBClassifier)
        assert sklearn_classifier.n_estimators == 200
        assert sklearn_classifier.missing == 2.0
        assert sklearn_classifier.max_depth == 3
        assert sklearn_classifier.get_params()["sketch_eps"] == 0.5

        sklearn_regressor = regressor._convert_to_sklearn_model(reg_model.get_booster())
        assert isinstance(sklearn_regressor, XGBRegressor)
        assert sklearn_regressor.n_estimators == 200
        assert sklearn_regressor.missing == 2.0
        assert sklearn_regressor.max_depth == 3
        assert sklearn_regressor.get_params()["sketch_eps"] == 0.5

    def test_feature_importances(self):
        reg1 = SparkXGBRegressor(**self.reg_params)
        model = reg1.fit(self.reg_df_train)
        booster = model.get_booster()
        self.assertEqual(model.get_feature_importances(), booster.get_score())
        self.assertEqual(
            model.get_feature_importances(importance_type="gain"),
            booster.get_score(importance_type="gain"),
        )

    def test_regressor_array_col_as_feature(self):
        train_dataset = self.reg_df_train.withColumn(
            "features", vector_to_array(spark_sql_func.col("features"))
        )
        test_dataset = self.reg_df_test.withColumn(
            "features", vector_to_array(spark_sql_func.col("features"))
        )
        regressor = SparkXGBRegressor()
        model = regressor.fit(train_dataset)
        pred_result = model.transform(test_dataset).collect()
        for row in pred_result:
            self.assertTrue(
                np.isclose(row.prediction, row.expected_prediction, atol=1e-3)
            )

    def test_classifier_array_col_as_feature(self):
        train_dataset = self.cls_df_train.withColumn(
            "features", vector_to_array(spark_sql_func.col("features"))
        )
        test_dataset = self.cls_df_test.withColumn(
            "features", vector_to_array(spark_sql_func.col("features"))
        )
        classifier = SparkXGBClassifier()
        model = classifier.fit(train_dataset)

        pred_result = model.transform(test_dataset).collect()
        for row in pred_result:
            self.assertEqual(row.prediction, row.expected_prediction)
            self.assertTrue(
                np.allclose(row.probability, row.expected_probability, rtol=1e-3)
            )

    def test_classifier_with_feature_names_types_weights(self):
        classifier = SparkXGBClassifier(
            feature_names=["a1", "a2", "a3"],
            feature_types=["i", "int", "float"],
            feature_weights=[2.0, 5.0, 3.0],
        )
        model = classifier.fit(self.cls_df_train)
        model.transform(self.cls_df_test).collect()
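
Several of the tests above pull the trained booster out of the fitted Spark model via `get_booster()` to compare against single-node xgboost. The same hook allows cheap driver-side scoring without a Spark job; a small sketch, assuming `model` is a fitted SparkXGBRegressorModel or SparkXGBClassifierModel from this PR:

    import numpy as np
    from xgboost import DMatrix

    def predict_on_driver(model, rows):
        booster = model.get_booster()  # plain xgboost.Booster
        return booster.predict(DMatrix(np.asarray(rows, dtype=np.float32)))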
450
tests/python/test_spark/test_spark_local_cluster.py
Normal file
450
tests/python/test_spark/test_spark_local_cluster.py
Normal file
@@ -0,0 +1,450 @@
|
||||
import sys
|
||||
import random
|
||||
import json
|
||||
import uuid
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
import testing as tm
|
||||
|
||||
if tm.no_spark()["condition"]:
|
||||
pytest.skip(msg=tm.no_spark()["reason"], allow_module_level=True)
|
||||
if sys.platform.startswith("win") or sys.platform.startswith("darwin"):
|
||||
pytest.skip("Skipping PySpark tests on Windows", allow_module_level=True)
|
||||
|
||||
from .utils import SparkLocalClusterTestCase
|
||||
from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
|
||||
from xgboost.spark.utils import _get_max_num_concurrent_tasks
|
||||
from pyspark.ml.linalg import Vectors
|
||||
|
||||
|
||||
class XgboostLocalClusterTestCase(SparkLocalClusterTestCase):
|
||||
def setUp(self):
|
||||
random.seed(2020)
|
||||
|
||||
self.n_workers = _get_max_num_concurrent_tasks(self.session.sparkContext)
        # The following code uses the xgboost Python library to train a model
        # and predict; this is how the expected values below were produced.
        #
        # >>> import numpy as np
        # >>> import xgboost
        # >>> X = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]])
        # >>> y = np.array([0, 1])
        # >>> reg1 = xgboost.XGBRegressor()
        # >>> reg1.fit(X, y)
        # >>> reg1.predict(X)
        # array([8.8363886e-04, 9.9911636e-01], dtype=float32)
        # >>> def custom_lr(boosting_round):
        # ...     return 1.0 / (boosting_round + 1)
        # ...
        # >>> reg1.fit(
        # ...     X, y, callbacks=[xgboost.callback.LearningRateScheduler(custom_lr)]
        # ... )
        # >>> reg1.predict(X)
        # array([0.02406833, 0.97593164], dtype=float32)
        # >>> reg2 = xgboost.XGBRegressor(max_depth=5, n_estimators=10)
        # >>> reg2.fit(X, y)
        # >>> reg2.predict(X, ntree_limit=5)
        # array([0.22185263, 0.77814734], dtype=float32)
        self.reg_params = {"max_depth": 5, "n_estimators": 10, "ntree_limit": 5}
        self.reg_df_train = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
            ],
            ["features", "label"],
        )
        self.reg_df_test = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0.0, 0.2219, 0.02406),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1.0, 0.7781, 0.9759),
            ],
            [
                "features",
                "expected_prediction",
                "expected_prediction_with_params",
                "expected_prediction_with_callbacks",
            ],
        )

        # Distributed section
        # Binary classification
        self.cls_df_train_distributed = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
                (Vectors.dense(4.0, 5.0, 6.0), 0),
                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1),
            ]
            * 100,
            ["features", "label"],
        )
        self.cls_df_test_distributed = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0, [0.9949826, 0.0050174]),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, [0.0050174, 0.9949826]),
                (Vectors.dense(4.0, 5.0, 6.0), 0, [0.9949826, 0.0050174]),
                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, [0.0050174, 0.9949826]),
            ],
            ["features", "expected_label", "expected_probability"],
        )
        # Binary classification with different num_estimators
        self.cls_df_test_distributed_lower_estimators = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0, [0.9735, 0.0265]),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, [0.0265, 0.9735]),
                (Vectors.dense(4.0, 5.0, 6.0), 0, [0.9735, 0.0265]),
                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, [0.0265, 0.9735]),
            ],
            ["features", "expected_label", "expected_probability"],
        )

        # Multiclass classification
        self.cls_df_train_distributed_multiclass = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
                (Vectors.dense(4.0, 5.0, 6.0), 0),
                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 2),
            ]
            * 100,
            ["features", "label"],
        )
        self.cls_df_test_distributed_multiclass = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0, [4.294563, -2.449409, -2.449409]),
                (
                    Vectors.sparse(3, {1: 1.0, 2: 5.5}),
                    1,
                    [-2.3796105, 3.669014, -2.449409],
                ),
                (Vectors.dense(4.0, 5.0, 6.0), 0, [4.294563, -2.449409, -2.449409]),
                (
                    Vectors.sparse(3, {1: 6.0, 2: 7.5}),
                    2,
                    [-2.3796105, -2.449409, 3.669014],
                ),
            ],
            ["features", "expected_label", "expected_margins"],
        )

        # Regression
        self.reg_df_train_distributed = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
                (Vectors.dense(4.0, 5.0, 6.0), 0),
                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 2),
            ]
            * 100,
            ["features", "label"],
        )
        self.reg_df_test_distributed = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 1.533e-04),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 9.999e-01),
                (Vectors.dense(4.0, 5.0, 6.0), 1.533e-04),
                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1.999e00),
            ],
            ["features", "expected_label"],
        )

        # Adding weight and validation
        self.clf_params_with_eval_dist = {
            "validation_indicator_col": "isVal",
            "early_stopping_rounds": 1,
            "eval_metric": "logloss",
        }
        self.clf_params_with_weight_dist = {"weight_col": "weight"}
        self.cls_df_train_distributed_with_eval_weight = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0, False, 1.0),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, False, 2.0),
                (Vectors.dense(4.0, 5.0, 6.0), 0, True, 1.0),
                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, True, 2.0),
            ]
            * 100,
            ["features", "label", "isVal", "weight"],
        )
        self.cls_df_test_distributed_with_eval_weight = self.session.createDataFrame(
            [
                (
                    Vectors.dense(1.0, 2.0, 3.0),
                    [0.9955, 0.0044],
                    [0.9904, 0.0096],
                    [0.9903, 0.0097],
                ),
            ],
            [
                "features",
                "expected_prob_with_weight",
                "expected_prob_with_eval",
                "expected_prob_with_weight_and_eval",
            ],
        )
        self.clf_best_score_eval = 0.009677
        self.clf_best_score_weight_and_eval = 0.006626

        self.reg_params_with_eval_dist = {
            "validation_indicator_col": "isVal",
            "early_stopping_rounds": 1,
            "eval_metric": "rmse",
        }
        self.reg_params_with_weight_dist = {"weight_col": "weight"}
        self.reg_df_train_distributed_with_eval_weight = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 0, False, 1.0),
                (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, False, 2.0),
                (Vectors.dense(4.0, 5.0, 6.0), 0, True, 1.0),
                (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, True, 2.0),
            ]
            * 100,
            ["features", "label", "isVal", "weight"],
        )
        self.reg_df_test_distributed_with_eval_weight = self.session.createDataFrame(
            [
                (Vectors.dense(1.0, 2.0, 3.0), 4.583e-05, 5.239e-05, 6.03e-05),
                (
                    Vectors.sparse(3, {1: 1.0, 2: 5.5}),
                    9.9997e-01,
                    9.99947e-01,
                    9.9995e-01,
                ),
            ],
            [
                "features",
                "expected_prediction_with_weight",
                "expected_prediction_with_eval",
                "expected_prediction_with_weight_and_eval",
            ],
        )
        self.reg_best_score_eval = 5.239e-05
        self.reg_best_score_weight_and_eval = 4.810e-05
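        # The expected values above were presumably produced with single-node
        # xgboost: validation_indicator_col splits rows into train (isVal ==
        # False) and eval (isVal == True) sets, and weight_col supplies per-row
        # sample weights. A hedged sketch (X_train, y_train, w_train, X_val,
        # y_val are hypothetical placeholders):
        #
        #   import xgboost
        #   clf = xgboost.XGBClassifier(n_estimators=100)
        #   clf.fit(
        #       X_train, y_train, sample_weight=w_train,
        #       eval_set=[(X_val, y_val)],
        #       early_stopping_rounds=1, eval_metric="logloss",
        #   )
        #   clf.get_booster().attributes()["best_score"]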

    def test_regressor_basic_with_params(self):
        regressor = SparkXGBRegressor(**self.reg_params)
        model = regressor.fit(self.reg_df_train)
        pred_result = model.transform(self.reg_df_test).collect()
        for row in pred_result:
            self.assertTrue(
                np.isclose(
                    row.prediction, row.expected_prediction_with_params, atol=1e-3
                )
            )

    def test_callbacks(self):
        from xgboost.callback import LearningRateScheduler

        path = os.path.join(self.tempdir, str(uuid.uuid4()))

        def custom_learning_rate(boosting_round):
            return 1.0 / (boosting_round + 1)

        cb = [LearningRateScheduler(custom_learning_rate)]
        regressor = SparkXGBRegressor(callbacks=cb)

        # Test the save/load of the estimator instead of the model, since
        # the callbacks param only exists in the estimator but not in the model
        regressor.save(path)
        regressor = SparkXGBRegressor.load(path)

        model = regressor.fit(self.reg_df_train)
        pred_result = model.transform(self.reg_df_test).collect()
        for row in pred_result:
            self.assertTrue(
                np.isclose(
                    row.prediction, row.expected_prediction_with_callbacks, atol=1e-3
                )
            )

    def test_classifier_distributed_basic(self):
        classifier = SparkXGBClassifier(num_workers=self.n_workers, n_estimators=100)
        model = classifier.fit(self.cls_df_train_distributed)
        pred_result = model.transform(self.cls_df_test_distributed).collect()
        for row in pred_result:
            self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3))
            self.assertTrue(
                np.allclose(row.expected_probability, row.probability, atol=1e-3)
            )

    def test_classifier_distributed_multiclass(self):
        # There is no built-in multiclass option for external storage
        classifier = SparkXGBClassifier(num_workers=self.n_workers, n_estimators=100)
        model = classifier.fit(self.cls_df_train_distributed_multiclass)
        pred_result = model.transform(self.cls_df_test_distributed_multiclass).collect()
        for row in pred_result:
            self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3))
            self.assertTrue(
                np.allclose(row.expected_margins, row.rawPrediction, atol=1e-3)
            )

    def test_regressor_distributed_basic(self):
        regressor = SparkXGBRegressor(num_workers=self.n_workers, n_estimators=100)
        model = regressor.fit(self.reg_df_train_distributed)
        pred_result = model.transform(self.reg_df_test_distributed).collect()
        for row in pred_result:
            self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3))

    def test_classifier_distributed_weight_eval(self):
        # with weight
        classifier = SparkXGBClassifier(
            num_workers=self.n_workers,
            n_estimators=100,
            **self.clf_params_with_weight_dist
        )
        model = classifier.fit(self.cls_df_train_distributed_with_eval_weight)
        pred_result = model.transform(
            self.cls_df_test_distributed_with_eval_weight
        ).collect()
        for row in pred_result:
            self.assertTrue(
                np.allclose(row.probability, row.expected_prob_with_weight, atol=1e-3)
            )

        # with eval only
        classifier = SparkXGBClassifier(
            num_workers=self.n_workers,
            n_estimators=100,
            **self.clf_params_with_eval_dist
        )
        model = classifier.fit(self.cls_df_train_distributed_with_eval_weight)
        pred_result = model.transform(
            self.cls_df_test_distributed_with_eval_weight
        ).collect()
        for row in pred_result:
            self.assertTrue(
                np.allclose(row.probability, row.expected_prob_with_eval, atol=1e-3)
            )
        assert np.isclose(
            float(model.get_booster().attributes()["best_score"]),
            self.clf_best_score_eval,
            rtol=1e-3,
        )

        # with both weight and eval
        classifier = SparkXGBClassifier(
            num_workers=self.n_workers,
            n_estimators=100,
            **self.clf_params_with_eval_dist,
            **self.clf_params_with_weight_dist
        )
        model = classifier.fit(self.cls_df_train_distributed_with_eval_weight)
        pred_result = model.transform(
            self.cls_df_test_distributed_with_eval_weight
        ).collect()
        for row in pred_result:
            self.assertTrue(
                np.allclose(
                    row.probability, row.expected_prob_with_weight_and_eval, atol=1e-3
                )
            )
        # The bare np.isclose(...) here had no effect; assert its result so the
        # check actually runs.
        assert np.isclose(
            float(model.get_booster().attributes()["best_score"]),
            self.clf_best_score_weight_and_eval,
            rtol=1e-3,
        )

    def test_regressor_distributed_weight_eval(self):
        # with weight
        regressor = SparkXGBRegressor(
            num_workers=self.n_workers,
            n_estimators=100,
            **self.reg_params_with_weight_dist
        )
        model = regressor.fit(self.reg_df_train_distributed_with_eval_weight)
        pred_result = model.transform(
            self.reg_df_test_distributed_with_eval_weight
        ).collect()
        for row in pred_result:
            self.assertTrue(
                np.isclose(
                    row.prediction, row.expected_prediction_with_weight, atol=1e-3
                )
            )
        # with eval only
        regressor = SparkXGBRegressor(
            num_workers=self.n_workers,
            n_estimators=100,
            **self.reg_params_with_eval_dist
        )
        model = regressor.fit(self.reg_df_train_distributed_with_eval_weight)
        pred_result = model.transform(
            self.reg_df_test_distributed_with_eval_weight
        ).collect()
        for row in pred_result:
            self.assertTrue(
                np.isclose(row.prediction, row.expected_prediction_with_eval, atol=1e-3)
            )
        assert np.isclose(
            float(model.get_booster().attributes()["best_score"]),
            self.reg_best_score_eval,
            rtol=1e-3,
        )
        # with both weight and eval
        regressor = SparkXGBRegressor(
            num_workers=self.n_workers,
            n_estimators=100,
            use_external_storage=False,
            **self.reg_params_with_eval_dist,
            **self.reg_params_with_weight_dist
        )
        model = regressor.fit(self.reg_df_train_distributed_with_eval_weight)
        pred_result = model.transform(
            self.reg_df_test_distributed_with_eval_weight
        ).collect()
        for row in pred_result:
            self.assertTrue(
                np.isclose(
                    row.prediction,
                    row.expected_prediction_with_weight_and_eval,
                    atol=1e-3,
                )
            )
        assert np.isclose(
            float(model.get_booster().attributes()["best_score"]),
            self.reg_best_score_weight_and_eval,
            rtol=1e-3,
        )

    def test_num_estimators(self):
        classifier = SparkXGBClassifier(num_workers=self.n_workers, n_estimators=10)
        model = classifier.fit(self.cls_df_train_distributed)
        pred_result = model.transform(
            self.cls_df_test_distributed_lower_estimators
        ).collect()
        for row in pred_result:
            self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3))
            self.assertTrue(
                np.allclose(row.expected_probability, row.probability, atol=1e-3)
            )

    def test_distributed_params(self):
        classifier = SparkXGBClassifier(num_workers=self.n_workers, max_depth=7)
        model = classifier.fit(self.cls_df_train_distributed)
        self.assertTrue(hasattr(classifier, "max_depth"))
        self.assertEqual(classifier.getOrDefault(classifier.max_depth), 7)
        booster_config = json.loads(model.get_booster().save_config())
        max_depth = booster_config["learner"]["gradient_booster"]["updater"][
            "grow_histmaker"
        ]["train_param"]["max_depth"]
        self.assertEqual(int(max_depth), 7)
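        # save_config() returns the booster's full JSON configuration; the
        # lookup above assumes a fragment shaped roughly like this (values are
        # serialized as strings, hence the int() cast):
        #
        #   {"learner": {"gradient_booster": {"updater": {
        #       "grow_histmaker": {"train_param": {"max_depth": "7"}}}}}}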

    def test_repartition(self):
        # The following test case covers datasets that are either well
        # partitioned relative to the number of workers the user wants or
        # poorly partitioned. We only want to repartition when the dataset
        # is poorly partitioned, so _repartition_needed must be true only in
        # those instances.

        classifier = SparkXGBClassifier(num_workers=self.n_workers)
        basic = self.cls_df_train_distributed
        self.assertTrue(classifier._repartition_needed(basic))
        bad_repartitioned = basic.repartition(self.n_workers + 1)
        self.assertTrue(classifier._repartition_needed(bad_repartitioned))
        good_repartitioned = basic.repartition(self.n_workers)
        self.assertFalse(classifier._repartition_needed(good_repartitioned))

        # Now test that force_repartition makes _repartition_needed return True
        # regardless of whether the data is already well partitioned
        classifier = SparkXGBClassifier(
            num_workers=self.n_workers, force_repartition=True
        )
        good_repartitioned = basic.repartition(self.n_workers)
        self.assertTrue(classifier._repartition_needed(good_repartitioned))
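    # The assertions above imply a decision rule of roughly this shape; a
    # hedged sketch only, not the actual xgboost.spark implementation:
    #
    #   def _repartition_needed(self, dataset):
    #       if self.getOrDefault(self.force_repartition):
    #           return True
    #       return dataset.rdd.getNumPartitions() != self.getOrDefault(self.num_workers)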

tests/python/test_spark/utils.py (new file, 148 lines)
@@ -0,0 +1,148 @@
import contextlib
import logging
import shutil
import sys
import tempfile

import unittest
import pytest

from io import StringIO

import testing as tm

if tm.no_spark()["condition"]:
    pytest.skip(msg=tm.no_spark()["reason"], allow_module_level=True)
if sys.platform.startswith("win") or sys.platform.startswith("darwin"):
    pytest.skip("Skipping PySpark tests on Windows and macOS", allow_module_level=True)

from pyspark.sql import SparkSession

from xgboost.spark.utils import _get_default_params_from_func


class UtilsTest(unittest.TestCase):
    def test_get_default_params(self):
        class Foo:
            def func1(self, x, y, key1=None, key2="val2", key3=0, key4=None):
                pass

        unsupported_params = {"key2", "key4"}
        expected_default_params = {
            "key1": None,
            "key3": 0,
        }
        actual_default_params = _get_default_params_from_func(
            Foo.func1, unsupported_params
        )
        self.assertEqual(
            len(expected_default_params.keys()), len(actual_default_params.keys())
        )
        for k, v in actual_default_params.items():
            self.assertEqual(expected_default_params[k], v)
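    # A minimal sketch of what the helper under test could look like, built on
    # inspect.signature; an assumption for illustration, not necessarily the
    # actual xgboost.spark.utils implementation:
    #
    #   import inspect
    #
    #   def _get_default_params_from_func(func, unsupported_set):
    #       return {
    #           name: param.default
    #           for name, param in inspect.signature(func).parameters.items()
    #           if param.default is not inspect.Parameter.empty
    #           and name not in unsupported_set
    #       }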


@contextlib.contextmanager
def patch_stdout():
    """Patch sys.stdout and yield the captured output buffer."""
    sys_stdout = sys.stdout
    io_out = StringIO()
    sys.stdout = io_out
    try:
        yield io_out
    finally:
        sys.stdout = sys_stdout


@contextlib.contextmanager
def patch_logger(name):
    """Patch the named logger and yield the captured output buffer."""
    io_out = StringIO()
    log = logging.getLogger(name)
    handler = logging.StreamHandler(io_out)
    log.addHandler(handler)
    try:
        yield io_out
    finally:
        log.removeHandler(handler)
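# Typical intended usage of the capture helpers above, assuming a hypothetical
# logger name and a hypothetical call that logs through it:
#
#   with patch_logger("xgboost.spark") as output:
#       run_something_that_logs()
#       assert "expected message" in output.getvalue()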


class TestTempDir(object):
    @classmethod
    def make_tempdir(cls):
        """Create a class-level temporary directory for the tests."""
        cls.tempdir = tempfile.mkdtemp(prefix="sparkdl_tests")

    @classmethod
    def remove_tempdir(cls):
        shutil.rmtree(cls.tempdir)


class TestSparkContext(object):
    @classmethod
    def setup_env(cls, spark_config):
        builder = SparkSession.builder.appName("xgboost spark python API Tests")
        for k, v in spark_config.items():
            builder.config(k, v)
        spark = builder.getOrCreate()
        logging.getLogger("pyspark").setLevel(logging.INFO)

        cls.sc = spark.sparkContext
        cls.session = spark

    @classmethod
    def tear_down_env(cls):
        cls.session.stop()
        cls.session = None
        cls.sc.stop()
        cls.sc = None


class SparkTestCase(TestSparkContext, TestTempDir, unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.setup_env(
            {
                "spark.master": "local[2]",
                "spark.python.worker.reuse": "false",
                "spark.driver.host": "127.0.0.1",
                "spark.task.maxFailures": "1",
                "spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled": "false",
                "spark.sql.pyspark.jvmStacktrace.enabled": "true",
            }
        )
        cls.make_tempdir()

    @classmethod
    def tearDownClass(cls):
        cls.remove_tempdir()
        cls.tear_down_env()


class SparkLocalClusterTestCase(TestSparkContext, TestTempDir, unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.setup_env(
            {
                "spark.master": "local-cluster[2, 2, 1024]",
                "spark.python.worker.reuse": "false",
                "spark.driver.host": "127.0.0.1",
                "spark.task.maxFailures": "1",
                "spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled": "false",
                "spark.sql.pyspark.jvmStacktrace.enabled": "true",
                "spark.cores.max": "4",
                "spark.task.cpus": "1",
                "spark.executor.cores": "2",
            }
        )
        cls.make_tempdir()
        # Run a dummy barrier job so that we block until the workers have
        # connected to the master
        cls.sc.parallelize(range(4), 4).barrier().mapPartitions(lambda _: []).collect()

    @classmethod
    def tearDownClass(cls):
        cls.remove_tempdir()
        cls.tear_down_env()

@@ -56,6 +56,15 @@ def no_dask():
    return {"condition": not DASK_INSTALLED, "reason": "Dask is not installed"}


def no_spark():
    try:
        import pyspark  # noqa
        SPARK_INSTALLED = True
    except ImportError:
        SPARK_INSTALLED = False
    return {"condition": not SPARK_INSTALLED, "reason": "Spark is not installed"}


def no_pandas():
    return {"condition": not PANDAS_INSTALLED, "reason": "Pandas is not installed."}