[pyspark] Handle the device parameter in pyspark. (#9390)
- Handle the new `device` parameter in PySpark.
- Deprecate the old `use_gpu` parameter.
This commit is contained in:
@@ -154,7 +154,7 @@ def spark_diabetes_dataset_feature_cols(spark_session_with_gpu):
|
||||
def test_sparkxgb_classifier_with_gpu(spark_iris_dataset):
|
||||
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
|
||||
|
||||
classifier = SparkXGBClassifier(use_gpu=True, num_workers=num_workers)
|
||||
classifier = SparkXGBClassifier(device="cuda", num_workers=num_workers)
|
||||
train_df, test_df = spark_iris_dataset
|
||||
model = classifier.fit(train_df)
|
||||
pred_result_df = model.transform(test_df)
|
||||
@@ -169,7 +169,7 @@ def test_sparkxgb_classifier_feature_cols_with_gpu(spark_iris_dataset_feature_co
|
||||
train_df, test_df, feature_names = spark_iris_dataset_feature_cols
|
||||
|
||||
classifier = SparkXGBClassifier(
|
||||
features_col=feature_names, use_gpu=True, num_workers=num_workers
|
||||
features_col=feature_names, device="cuda", num_workers=num_workers
|
||||
)
|
||||
|
||||
model = classifier.fit(train_df)
|
||||
@@ -185,7 +185,7 @@ def test_cv_sparkxgb_classifier_feature_cols_with_gpu(spark_iris_dataset_feature
|
||||
train_df, test_df, feature_names = spark_iris_dataset_feature_cols
|
||||
|
||||
classifier = SparkXGBClassifier(
|
||||
features_col=feature_names, use_gpu=True, num_workers=num_workers
|
||||
features_col=feature_names, device="cuda", num_workers=num_workers
|
||||
)
|
||||
grid = ParamGridBuilder().addGrid(classifier.max_depth, [6, 8]).build()
|
||||
evaluator = MulticlassClassificationEvaluator(metricName="f1")
|
||||
@@ -197,11 +197,24 @@ def test_cv_sparkxgb_classifier_feature_cols_with_gpu(spark_iris_dataset_feature
|
||||
f1 = evaluator.evaluate(pred_result_df)
|
||||
assert f1 >= 0.97
|
||||
|
||||
clf = SparkXGBClassifier(
|
||||
features_col=feature_names, use_gpu=True, num_workers=num_workers
|
||||
)
|
||||
grid = ParamGridBuilder().addGrid(clf.max_depth, [6, 8]).build()
|
||||
evaluator = MulticlassClassificationEvaluator(metricName="f1")
|
||||
cv = CrossValidator(
|
||||
estimator=clf, evaluator=evaluator, estimatorParamMaps=grid, numFolds=3
|
||||
)
|
||||
cvModel = cv.fit(train_df)
|
||||
pred_result_df = cvModel.transform(test_df)
|
||||
f1 = evaluator.evaluate(pred_result_df)
|
||||
assert f1 >= 0.97
|
||||
|
||||
|
||||
def test_sparkxgb_regressor_with_gpu(spark_diabetes_dataset):
|
||||
from pyspark.ml.evaluation import RegressionEvaluator
|
||||
|
||||
regressor = SparkXGBRegressor(use_gpu=True, num_workers=num_workers)
|
||||
regressor = SparkXGBRegressor(device="cuda", num_workers=num_workers)
|
||||
train_df, test_df = spark_diabetes_dataset
|
||||
model = regressor.fit(train_df)
|
||||
pred_result_df = model.transform(test_df)
|
||||
@@ -215,7 +228,7 @@ def test_sparkxgb_regressor_feature_cols_with_gpu(spark_diabetes_dataset_feature
|
||||
|
||||
train_df, test_df, feature_names = spark_diabetes_dataset_feature_cols
|
||||
regressor = SparkXGBRegressor(
|
||||
features_col=feature_names, use_gpu=True, num_workers=num_workers
|
||||
features_col=feature_names, device="cuda", num_workers=num_workers
|
||||
)
|
||||
|
||||
model = regressor.fit(train_df)
|
||||
|
||||
@@ -741,11 +741,6 @@ class TestPySparkLocal:
|
||||
with pytest.raises(ValueError, match="early_stopping_rounds"):
|
||||
classifier.fit(clf_data.cls_df_train)
|
||||
|
||||
def test_gpu_param_setting(self, clf_data: ClfData) -> None:
    """With ``use_gpu=True`` the distributed train params select ``gpu_hist``."""
    gpu_classifier = SparkXGBClassifier(use_gpu=True)
    distributed_params = gpu_classifier._get_distributed_train_params(
        clf_data.cls_df_train
    )
    assert distributed_params["tree_method"] == "gpu_hist"
|
||||
|
||||
def test_classifier_with_list_eval_metric(self, clf_data: ClfData) -> None:
|
||||
classifier = SparkXGBClassifier(eval_metric=["auc", "rmse"])
|
||||
model = classifier.fit(clf_data.cls_df_train)
|
||||
@@ -756,6 +751,53 @@ class TestPySparkLocal:
|
||||
model = classifier.fit(clf_data.cls_df_train)
|
||||
model.transform(clf_data.cls_df_test).collect()
|
||||
|
||||
def test_regressor_params_basic(self) -> None:
    """Check default, constructor-supplied and copied params of the regressor.

    ``gpu_id`` must not be exposed as an attribute, while ``device`` must be.
    """
    py_reg = SparkXGBRegressor()
    assert hasattr(py_reg, "n_estimators")
    assert py_reg.n_estimators.parent == py_reg.uid
    assert not hasattr(py_reg, "gpu_id")
    assert hasattr(py_reg, "device")
    assert py_reg.getOrDefault(py_reg.n_estimators) == 100
    # Use ``==`` here: ``assert value, expected`` treats ``expected`` as the
    # failure message and only checks that ``value`` is truthy.
    assert py_reg.getOrDefault(getattr(py_reg, "objective")) == "reg:squarederror"

    py_reg2 = SparkXGBRegressor(n_estimators=200)
    assert py_reg2.getOrDefault(getattr(py_reg2, "n_estimators")) == 200

    # Copying with an extra param keeps the existing value and adds the new one.
    py_reg3 = py_reg2.copy({getattr(py_reg2, "max_depth"): 10})
    assert py_reg3.getOrDefault(getattr(py_reg3, "n_estimators")) == 200
    assert py_reg3.getOrDefault(getattr(py_reg3, "max_depth")) == 10
|
||||
|
||||
def test_classifier_params_basic(self) -> None:
    """Check default, constructor-supplied and copied params of the classifier.

    ``gpu_id`` must not be exposed as an attribute, while ``device`` must be.
    """
    py_clf = SparkXGBClassifier()
    assert hasattr(py_clf, "n_estimators")
    assert py_clf.n_estimators.parent == py_clf.uid
    assert not hasattr(py_clf, "gpu_id")
    assert hasattr(py_clf, "device")

    assert py_clf.getOrDefault(py_clf.n_estimators) == 100
    assert py_clf.getOrDefault(getattr(py_clf, "objective")) is None

    py_clf2 = SparkXGBClassifier(n_estimators=200)
    assert py_clf2.getOrDefault(getattr(py_clf2, "n_estimators")) == 200

    # Copying with an extra param keeps the existing value and adds the new one.
    py_clf3 = py_clf2.copy({getattr(py_clf2, "max_depth"): 10})
    assert py_clf3.getOrDefault(getattr(py_clf3, "n_estimators")) == 200
    # Use ``==`` here: ``assert value, 10`` would treat ``10`` as the failure
    # message and only check that ``value`` is truthy.
    assert py_clf3.getOrDefault(getattr(py_clf3, "max_depth")) == 10
|
||||
|
||||
def test_classifier_kwargs_basic(self, clf_data: ClfData) -> None:
    """Check a classifier built from ``clf_data.cls_params`` and param overwriting."""
    from_kwargs = SparkXGBClassifier(**clf_data.cls_params)
    assert hasattr(from_kwargs, "n_estimators")
    assert from_kwargs.n_estimators.parent == from_kwargs.uid
    assert not hasattr(from_kwargs, "gpu_id")
    assert hasattr(from_kwargs, "device")
    assert hasattr(from_kwargs, "arbitrary_params_dict")
    assert from_kwargs.getOrDefault(from_kwargs.arbitrary_params_dict) == {}

    # Testing overwritten params
    overwritten = SparkXGBClassifier()
    overwritten.setParams(x=1, y=2)
    overwritten.setParams(y=3, z=4)
    merged = overwritten._gen_xgb_params_dict()
    # The second setParams call wins for ``y``; ``x`` and ``z`` are both kept.
    for key, expected in (("x", 1), ("y", 3), ("z", 4)):
        assert merged[key] == expected
|
||||
|
||||
def test_regressor_model_save_load(self, reg_data: RegData) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
path = "file:" + tmpdir
|
||||
@@ -826,6 +868,24 @@ class TestPySparkLocal:
|
||||
)
|
||||
assert_model_compatible(model.stages[0], tmpdir)
|
||||
|
||||
def test_device_param(self, reg_data: RegData, clf_data: ClfData) -> None:
    """Check which ``tree_method`` values are accepted together with ``device="cuda"``.

    Fitting with ``tree_method="exact"`` must raise a ``ValueError`` matching
    "not supported on GPU"; ``gpu_hist`` and an unset ``tree_method`` must pass
    ``_validate_params``.
    """
    exact_clf = SparkXGBClassifier(device="cuda", tree_method="exact")
    with pytest.raises(ValueError, match="not supported on GPU"):
        exact_clf.fit(clf_data.cls_df_train)

    exact_reg = SparkXGBRegressor(device="cuda", tree_method="exact")
    with pytest.raises(ValueError, match="not supported on GPU"):
        exact_reg.fit(reg_data.reg_df_train)

    # Same order as before: regressor first, then classifier; for each one
    # the explicit ``gpu_hist`` variant is validated before the default one.
    for estimator_cls in (SparkXGBRegressor, SparkXGBClassifier):
        estimator_cls(device="cuda", tree_method="gpu_hist")._validate_params()
        estimator_cls(device="cuda")._validate_params()
|
||||
|
||||
|
||||
class XgboostLocalTest(SparkTestCase):
|
||||
def setUp(self):
|
||||
@@ -1020,55 +1080,6 @@ class XgboostLocalTest(SparkTestCase):
|
||||
assert sklearn_regressor.max_depth == 3
|
||||
assert sklearn_regressor.get_params()["sketch_eps"] == 0.5
|
||||
|
||||
def test_regressor_params_basic(self):
    """Check default, constructor-supplied and copied params of the regressor.

    Neither ``gpu_id`` nor ``device`` is expected to be exposed as an attribute.
    """
    default_reg = SparkXGBRegressor()
    self.assertTrue(hasattr(default_reg, "n_estimators"))
    self.assertEqual(default_reg.n_estimators.parent, default_reg.uid)
    for absent in ("gpu_id", "device"):
        self.assertFalse(hasattr(default_reg, absent))
    self.assertEqual(default_reg.getOrDefault(default_reg.n_estimators), 100)
    self.assertEqual(
        default_reg.getOrDefault(default_reg.objective), "reg:squarederror"
    )

    configured_reg = SparkXGBRegressor(n_estimators=200)
    self.assertEqual(configured_reg.getOrDefault(configured_reg.n_estimators), 200)

    # Copying with an extra param keeps the existing value and adds the new one.
    copied_reg = configured_reg.copy({configured_reg.max_depth: 10})
    self.assertEqual(copied_reg.getOrDefault(copied_reg.n_estimators), 200)
    self.assertEqual(copied_reg.getOrDefault(copied_reg.max_depth), 10)
|
||||
|
||||
def test_classifier_params_basic(self):
    """Check default, constructor-supplied and copied params of the classifier.

    Neither ``gpu_id`` nor ``device`` is expected to be exposed as an attribute.
    """
    default_cls = SparkXGBClassifier()
    self.assertTrue(hasattr(default_cls, "n_estimators"))
    self.assertEqual(default_cls.n_estimators.parent, default_cls.uid)
    for absent in ("gpu_id", "device"):
        self.assertFalse(hasattr(default_cls, absent))
    self.assertEqual(default_cls.getOrDefault(default_cls.n_estimators), 100)
    self.assertEqual(default_cls.getOrDefault(default_cls.objective), None)

    configured_cls = SparkXGBClassifier(n_estimators=200)
    self.assertEqual(configured_cls.getOrDefault(configured_cls.n_estimators), 200)

    # Copying with an extra param keeps the existing value and adds the new one.
    copied_cls = configured_cls.copy({configured_cls.max_depth: 10})
    self.assertEqual(copied_cls.getOrDefault(copied_cls.n_estimators), 200)
    self.assertEqual(copied_cls.getOrDefault(copied_cls.max_depth), 10)
|
||||
|
||||
def test_classifier_kwargs_basic(self):
    """Check a classifier built from ``self.cls_params_kwargs`` and param overwriting."""
    from_kwargs = SparkXGBClassifier(**self.cls_params_kwargs)
    self.assertTrue(hasattr(from_kwargs, "n_estimators"))
    self.assertEqual(from_kwargs.n_estimators.parent, from_kwargs.uid)
    for absent in ("gpu_id", "device"):
        self.assertFalse(hasattr(from_kwargs, absent))
    self.assertTrue(hasattr(from_kwargs, "arbitrary_params_dict"))
    self.assertEqual(
        from_kwargs.getOrDefault(from_kwargs.arbitrary_params_dict),
        {"sketch_eps": 0.03},
    )

    # Testing overwritten params
    overwritten = SparkXGBClassifier()
    overwritten.setParams(x=1, y=2)
    overwritten.setParams(y=3, z=4)
    merged = overwritten._gen_xgb_params_dict()
    # The second setParams call wins for ``y``; ``x`` and ``z`` are both kept.
    for key, expected in (("x", 1), ("y", 3), ("z", 4)):
        assert merged[key] == expected
|
||||
|
||||
def test_param_alias(self):
|
||||
py_cls = SparkXGBClassifier(features_col="f1", label_col="l1")
|
||||
self.assertEqual(py_cls.getOrDefault(py_cls.featuresCol), "f1")
|
||||
@@ -1200,16 +1211,6 @@ class XgboostLocalTest(SparkTestCase):
|
||||
classifier = SparkXGBClassifier(num_workers=0)
|
||||
self.assertRaises(ValueError, classifier._validate_params)
|
||||
|
||||
def test_use_gpu_param(self):
    """Check which ``tree_method`` values are accepted together with ``use_gpu=True``.

    ``tree_method="exact"`` must make ``_validate_params`` raise ``ValueError``.
    ``gpu_hist`` and an unset ``tree_method`` are expected to validate cleanly.
    """
    classifier = SparkXGBClassifier(use_gpu=True, tree_method="exact")
    self.assertRaises(ValueError, classifier._validate_params)
    regressor = SparkXGBRegressor(use_gpu=True, tree_method="exact")
    self.assertRaises(ValueError, regressor._validate_params)

    # Previously these estimators were only constructed and then overwritten,
    # so the accepted configurations were never checked. Validate each one,
    # mirroring what test_device_param does for ``device="cuda"``.
    # NOTE(review): assumes validation passes in the local test session - confirm.
    regressor = SparkXGBRegressor(use_gpu=True, tree_method="gpu_hist")
    regressor._validate_params()
    regressor = SparkXGBRegressor(use_gpu=True)
    regressor._validate_params()
    classifier = SparkXGBClassifier(use_gpu=True, tree_method="gpu_hist")
    classifier._validate_params()
    classifier = SparkXGBClassifier(use_gpu=True)
    classifier._validate_params()
|
||||
|
||||
def test_feature_importances(self):
|
||||
reg1 = SparkXGBRegressor(**self.reg_params)
|
||||
model = reg1.fit(self.reg_df_train)
|
||||
|
||||
Reference in New Issue
Block a user