PySpark XGBoost integration (#8020)

Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu>
Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
This commit is contained in:
WeichenXu
2022-07-13 13:11:18 +08:00
committed by GitHub
parent 8959622836
commit 176fec8789
25 changed files with 3650 additions and 12 deletions

View File

@@ -44,13 +44,15 @@ def pytest_addoption(parser):
def pytest_collection_modifyitems(config, items):
if config.getoption('--use-rmm-pool'):
if config.getoption("--use-rmm-pool"):
blocklist = [
'python-gpu/test_gpu_demos.py::test_dask_training',
'python-gpu/test_gpu_prediction.py::TestGPUPredict::test_shap',
'python-gpu/test_gpu_linear.py::TestGPULinear'
"python-gpu/test_gpu_demos.py::test_dask_training",
"python-gpu/test_gpu_prediction.py::TestGPUPredict::test_shap",
"python-gpu/test_gpu_linear.py::TestGPULinear",
]
skip_mark = pytest.mark.skip(reason='This test is not run when --use-rmm-pool flag is active')
skip_mark = pytest.mark.skip(
reason="This test is not run when --use-rmm-pool flag is active"
)
for item in items:
if any(item.nodeid.startswith(x) for x in blocklist):
item.add_marker(skip_mark)
@@ -58,5 +60,9 @@ def pytest_collection_modifyitems(config, items):
# mark dask tests as `mgpu`.
mgpu_mark = pytest.mark.mgpu
for item in items:
if item.nodeid.startswith("python-gpu/test_gpu_with_dask.py"):
if item.nodeid.startswith(
"python-gpu/test_gpu_with_dask.py"
) or item.nodeid.startswith(
"python-gpu/test_spark_with_gpu/test_spark_with_gpu.py"
):
item.add_marker(mgpu_mark)

View File

@@ -0,0 +1,3 @@
#!/bin/bash
echo "{\"name\":\"gpu\",\"addresses\":[\"0\",\"1\",\"2\",\"3\"]}"

View File

@@ -0,0 +1,120 @@
import sys
import logging
import pytest
import sklearn
sys.path.append("tests/python")
import testing as tm
if tm.no_dask()["condition"]:
pytest.skip(msg=tm.no_spark()["reason"], allow_module_level=True)
if sys.platform.startswith("win"):
pytest.skip("Skipping PySpark tests on Windows", allow_module_level=True)
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from xgboost.spark import SparkXGBRegressor, SparkXGBClassifier
@pytest.fixture(scope="module", autouse=True)
def spark_session_with_gpu():
spark_config = {
"spark.master": "local-cluster[1, 4, 1024]",
"spark.python.worker.reuse": "false",
"spark.driver.host": "127.0.0.1",
"spark.task.maxFailures": "1",
"spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled": "false",
"spark.sql.pyspark.jvmStacktrace.enabled": "true",
"spark.cores.max": "4",
"spark.task.cpus": "1",
"spark.executor.cores": "4",
"spark.worker.resource.gpu.amount": "4",
"spark.task.resource.gpu.amount": "1",
"spark.executor.resource.gpu.amount": "4",
"spark.worker.resource.gpu.discoveryScript": "tests/python-gpu/test_spark_with_gpu/discover_gpu.sh",
}
builder = SparkSession.builder.appName("xgboost spark python API Tests with GPU")
for k, v in spark_config.items():
builder.config(k, v)
spark = builder.getOrCreate()
logging.getLogger("pyspark").setLevel(logging.INFO)
# We run a dummy job so that we block until the workers have connected to the master
spark.sparkContext.parallelize(range(4), 4).barrier().mapPartitions(
lambda _: []
).collect()
yield spark
spark.stop()
@pytest.fixture
def spark_iris_dataset(spark_session_with_gpu):
spark = spark_session_with_gpu
data = sklearn.datasets.load_iris()
train_rows = [
(Vectors.dense(features), float(label))
for features, label in zip(data.data[0::2], data.target[0::2])
]
train_df = spark.createDataFrame(
spark.sparkContext.parallelize(train_rows, 4), ["features", "label"]
)
test_rows = [
(Vectors.dense(features), float(label))
for features, label in zip(data.data[1::2], data.target[1::2])
]
test_df = spark.createDataFrame(
spark.sparkContext.parallelize(test_rows, 4), ["features", "label"]
)
return train_df, test_df
@pytest.fixture
def spark_diabetes_dataset(spark_session_with_gpu):
spark = spark_session_with_gpu
data = sklearn.datasets.load_diabetes()
train_rows = [
(Vectors.dense(features), float(label))
for features, label in zip(data.data[0::2], data.target[0::2])
]
train_df = spark.createDataFrame(
spark.sparkContext.parallelize(train_rows, 4), ["features", "label"]
)
test_rows = [
(Vectors.dense(features), float(label))
for features, label in zip(data.data[1::2], data.target[1::2])
]
test_df = spark.createDataFrame(
spark.sparkContext.parallelize(test_rows, 4), ["features", "label"]
)
return train_df, test_df
def test_sparkxgb_classifier_with_gpu(spark_iris_dataset):
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
classifier = SparkXGBClassifier(
use_gpu=True,
num_workers=4,
)
train_df, test_df = spark_iris_dataset
model = classifier.fit(train_df)
pred_result_df = model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(metricName="f1")
f1 = evaluator.evaluate(pred_result_df)
assert f1 >= 0.97
def test_sparkxgb_regressor_with_gpu(spark_diabetes_dataset):
from pyspark.ml.evaluation import RegressionEvaluator
regressor = SparkXGBRegressor(
use_gpu=True,
num_workers=4,
)
train_df, test_df = spark_diabetes_dataset
model = regressor.fit(train_df)
pred_result_df = model.transform(test_df)
evaluator = RegressionEvaluator(metricName="rmse")
rmse = evaluator.evaluate(pred_result_df)
assert rmse <= 65.0