Refactor PySpark tests. (#8605)

- Convert the classifier tests from unittest to pytest.
- Replace hardcoded expected predictions with values computed during fixture setup.
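
The shape of the conversion, as a minimal self-contained sketch (toy names, not from this diff): state that previously lived in unittest's setUp() moves into a pytest fixture that yields a namedtuple, and each test receives it through pytest's fixture injection.

    from collections import namedtuple

    import pytest

    # Toy fixture mirroring the pattern adopted below; values are arbitrary.
    Data = namedtuple("Data", ("params", "train"))

    @pytest.fixture
    def data():
        # Expected values are computed at setup time rather than hardcoded.
        yield Data(params={"max_depth": 5}, train=[([1.0, 2.0, 3.0], 0)])

    def test_params(data):
        # pytest matches the argument name against the fixture above.
        assert data.params["max_depth"] == 5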
Jiaming Yuan 2023-01-04 17:05:16 +08:00 committed by GitHub
parent fa44a33ee6
commit d308124910


@@ -1,9 +1,10 @@
import glob
import logging
import random
import tempfile
import uuid
from collections import namedtuple
from typing import Generator, Sequence, Type

import numpy as np
import pytest
@@ -248,6 +249,87 @@ def clf_with_weight(
)
ClfData = namedtuple(
"ClfData", ("cls_params", "cls_df_train", "cls_df_train_large", "cls_df_test")
)
@pytest.fixture
def clf_data(spark: SparkSession) -> Generator[ClfData, None, None]:
cls_params = {"max_depth": 5, "n_estimators": 10, "scale_pos_weight": 4}
X = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]])
y = np.array([0, 1])
# Expected results from a classifier trained with default parameters.
cl1 = xgb.XGBClassifier()
cl1.fit(X, y)
predt0 = cl1.predict(X)
proba0: np.ndarray = cl1.predict_proba(X)
# Expected results from a classifier trained with cls_params applied.
cl2 = xgb.XGBClassifier(max_depth=5, n_estimators=10, scale_pos_weight=4)
cl2.fit(X, y)
predt1 = cl2.predict(X)
proba1: np.ndarray = cl2.predict_proba(X)
# Convert the NumPy arrays to PySpark DataFrames.
cls_df_train_data = [
(Vectors.dense(X[0, :]), int(y[0])),
(Vectors.sparse(3, {1: float(X[1, 1]), 2: float(X[1, 2])}), int(y[1])),
]
cls_df_train = spark.createDataFrame(cls_df_train_data, ["features", "label"])
cls_df_train_large = spark.createDataFrame(
cls_df_train_data * 100, ["features", "label"]
)
cls_df_test = spark.createDataFrame(
[
(
Vectors.dense(X[0, :]),
int(predt0[0]),
proba0[0, :].tolist(),
int(predt1[0]),
proba1[0, :].tolist(),
),
(
Vectors.sparse(3, {1: 1.0, 2: 5.5}),
int(predt0[1]),
proba0[1, :].tolist(),
int(predt1[1]),
proba1[1, :].tolist(),
),
],
[
"features",
"expected_prediction",
"expected_probability",
"expected_prediction_with_params",
"expected_probability_with_params",
],
)
yield ClfData(cls_params, cls_df_train, cls_df_train_large, cls_df_test)
def assert_model_compatible(model: XGBModel, model_path: str) -> None:
"""Check that the booster Spark persisted matches the in-memory model."""
bst = xgb.Booster()
# Spark writes the raw booster as a partitioned file under the model
# directory; the recursive glob finds part-00000 wherever it landed.
path = glob.glob(f"{model_path}/**/model/part-00000", recursive=True)[0]
bst.load_model(path)
np.testing.assert_equal(
np.array(model.get_booster().save_raw("json")), np.array(bst.save_raw("json"))
)
def check_sub_dict_match(
sub_dict: dict, whole_dict: dict, excluding_keys: Sequence[str]
) -> None:
"""Assert every non-excluded key of sub_dict appears in whole_dict with an equal value."""
for k in sub_dict:
if k not in excluding_keys:
assert k in whole_dict, f"check on {k} failed"
assert sub_dict[k] == whole_dict[k], f"check on {k} failed"
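
A quick usage sketch of the helper above (arbitrary dicts): keys named in excluding_keys are skipped, and every other key of the sub-dict must appear in the whole dict with an equal value.

    # Passes: "b" is excluded, "a" matches, and extra keys like "c" are fine.
    check_sub_dict_match({"a": 1, "b": 2}, {"a": 1, "c": 3}, excluding_keys=("b",))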
def get_params_map(params_kv: dict, estimator: Type) -> dict:
return {getattr(estimator, k): v for k, v in params_kv.items()}
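
For context, a hedged usage sketch (parameter values are arbitrary): Pipeline.copy(extra=...) expects a dict keyed by pyspark Param objects rather than plain strings, so this helper resolves each name to the matching Param attribute on the estimator, as the pipeline save/load tests below do.

    from pyspark.ml import Pipeline
    from xgboost.spark import SparkXGBClassifier

    classifier = SparkXGBClassifier()
    # The string keys become classifier.max_depth / classifier.n_estimators.
    extra = get_params_map({"max_depth": 5, "n_estimators": 10}, classifier)
    pipeline = Pipeline(stages=[classifier]).copy(extra=extra)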
class TestPySparkLocal:
def test_regressor_with_weight_eval(self, reg_with_weight: RegWithWeight) -> None:
# with weight
@@ -350,10 +432,161 @@ class TestPySparkLocal:
)
for row in pred_result_with_weight_eval:
np.testing.assert_allclose(
row.probability, row.expected_prob_with_weight_and_eval, atol=1e-3
)
def test_classifier_model_save_load(self, clf_data: ClfData) -> None:
with tempfile.TemporaryDirectory() as tmpdir:
# Spark ML writers take a URI, hence the explicit file: scheme.
path = "file:" + tmpdir
clf = SparkXGBClassifier(**clf_data.cls_params)
model = clf.fit(clf_data.cls_df_train)
model.save(path)
loaded_model = SparkXGBClassifierModel.load(path)
assert model.uid == loaded_model.uid
for k, v in clf_data.cls_params.items():
assert loaded_model.getOrDefault(k) == v
pred_result = loaded_model.transform(clf_data.cls_df_test).collect()
for row in pred_result:
np.testing.assert_allclose(
row.probability, row.expected_probability_with_params, atol=1e-3
)
with pytest.raises(AssertionError, match="Expected class name"):
SparkXGBRegressorModel.load(path)
assert_model_compatible(model, tmpdir)
def test_classifier_basic(self, clf_data: ClfData) -> None:
classifier = SparkXGBClassifier()
model = classifier.fit(clf_data.cls_df_train)
pred_result = model.transform(clf_data.cls_df_test).collect()
for row in pred_result:
np.testing.assert_equal(row.prediction, row.expected_prediction)
np.testing.assert_allclose(
row.probability, row.expected_probability, rtol=1e-3
)
def test_classifier_with_params(self, clf_data: ClfData) -> None:
classifier = SparkXGBClassifier(**clf_data.cls_params)
all_params = dict(
**(classifier._gen_xgb_params_dict()),
**(classifier._gen_fit_params_dict()),
**(classifier._gen_predict_params_dict()),
)
check_sub_dict_match(
clf_data.cls_params, all_params, excluding_keys=_non_booster_params
)
model = classifier.fit(clf_data.cls_df_train)
all_params = dict(
**(model._gen_xgb_params_dict()),
**(model._gen_fit_params_dict()),
**(model._gen_predict_params_dict()),
)
check_sub_dict_match(
clf_data.cls_params, all_params, excluding_keys=_non_booster_params
)
pred_result = model.transform(clf_data.cls_df_test).collect()
for row in pred_result:
np.testing.assert_equal(row.prediction, row.expected_prediction_with_params)
np.testing.assert_allclose(
row.probability, row.expected_probability_with_params, rtol=1e-3
)
def test_classifier_model_pipeline_save_load(self, clf_data: ClfData) -> None:
with tempfile.TemporaryDirectory() as tmpdir:
path = "file:" + tmpdir
classifier = SparkXGBClassifier()
pipeline = Pipeline(stages=[classifier])
pipeline = pipeline.copy(
extra=get_params_map(clf_data.cls_params, classifier)
)
model = pipeline.fit(clf_data.cls_df_train)
model.save(path)
loaded_model = PipelineModel.load(path)
for k, v in clf_data.cls_params.items():
assert loaded_model.stages[0].getOrDefault(k) == v
pred_result = loaded_model.transform(clf_data.cls_df_test).collect()
for row in pred_result:
np.testing.assert_allclose(
row.probability, row.expected_probability_with_params, atol=1e-3
)
assert_model_compatible(model.stages[0], tmpdir)
def test_classifier_with_cross_validator(self, clf_data: ClfData) -> None:
xgb_classifier = SparkXGBClassifier(n_estimators=1)
paramMaps = ParamGridBuilder().addGrid(xgb_classifier.max_depth, [1, 2]).build()
cvBin = CrossValidator(
estimator=xgb_classifier,
estimatorParamMaps=paramMaps,
evaluator=BinaryClassificationEvaluator(),
seed=1,
parallelism=4,
numFolds=2,
)
cvBinModel = cvBin.fit(clf_data.cls_df_train_large)
cvBinModel.transform(clf_data.cls_df_test)
def test_convert_to_sklearn_model_clf(self, clf_data: ClfData) -> None:
classifier = SparkXGBClassifier(
n_estimators=200, missing=2.0, max_depth=3, sketch_eps=0.5
)
clf_model = classifier.fit(clf_data.cls_df_train)
# Check that, regardless of the booster, _convert_to_sklearn_model
# converts to the correct class type.
sklearn_classifier = classifier._convert_to_sklearn_model(
clf_model.get_booster().save_raw("json"),
clf_model.get_booster().save_config(),
)
assert isinstance(sklearn_classifier, XGBClassifier)
assert sklearn_classifier.n_estimators == 200
assert sklearn_classifier.missing == 2.0
assert sklearn_classifier.max_depth == 3
assert sklearn_classifier.get_params()["sketch_eps"] == 0.5
def test_classifier_array_col_as_feature(self, clf_data: ClfData) -> None:
train_dataset = clf_data.cls_df_train.withColumn(
"features", vector_to_array(spark_sql_func.col("features"))
)
test_dataset = clf_data.cls_df_test.withColumn(
"features", vector_to_array(spark_sql_func.col("features"))
)
classifier = SparkXGBClassifier()
model = classifier.fit(train_dataset)
pred_result = model.transform(test_dataset).collect()
for row in pred_result:
np.testing.assert_equal(row.prediction, row.expected_prediction)
np.testing.assert_allclose(
row.probability, row.expected_probability, rtol=1e-3
)
def test_classifier_with_feature_names_types_weights(
self, clf_data: ClfData
) -> None:
classifier = SparkXGBClassifier(
feature_names=["a1", "a2", "a3"],
feature_types=["i", "int", "float"],
feature_weights=[2.0, 5.0, 3.0],
)
model = classifier.fit(clf_data.cls_df_train)
model.transform(clf_data.cls_df_test).collect()
def test_early_stop_param_validation(self, clf_data: ClfData) -> None:
classifier = SparkXGBClassifier(early_stopping_rounds=1)
with pytest.raises(ValueError, match="early_stopping_rounds"):
classifier.fit(clf_data.cls_df_train)
def test_gpu_param_setting(self, clf_data: ClfData) -> None:
py_cls = SparkXGBClassifier(use_gpu=True)
train_params = py_cls._get_distributed_train_params(clf_data.cls_df_train)
assert train_params["tree_method"] == "gpu_hist"
class XgboostLocalTest(SparkTestCase):
def setUp(self):
@@ -406,60 +639,6 @@ class XgboostLocalTest(SparkTestCase):
],
)
# >>> X = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]])
# >>> y = np.array([0, 1])
# >>> cl1 = xgboost.XGBClassifier()
# >>> cl1.fit(X, y)
# >>> cl1.predict(X)
# array([0, 0])
# >>> cl1.predict_proba(X)
# array([[0.5, 0.5],
# [0.5, 0.5]], dtype=float32)
# >>> cl2 = xgboost.XGBClassifier(max_depth=5, n_estimators=10, scale_pos_weight=4)
# >>> cl2.fit(X, y)
# >>> cl2.predict(X)
# array([1, 1])
# >>> cl2.predict_proba(X)
# array([[0.27574146, 0.72425854 ],
# [0.27574146, 0.72425854 ]], dtype=float32)
self.cls_params = {"max_depth": 5, "n_estimators": 10, "scale_pos_weight": 4}
cls_df_train_data = [
(Vectors.dense(1.0, 2.0, 3.0), 0),
(Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
]
self.cls_df_train = self.session.createDataFrame(
cls_df_train_data, ["features", "label"]
)
self.cls_df_train_large = self.session.createDataFrame(
cls_df_train_data * 100, ["features", "label"]
)
self.cls_df_test = self.session.createDataFrame(
[
(
Vectors.dense(1.0, 2.0, 3.0),
0,
[0.5, 0.5],
1,
[0.27574146, 0.72425854],
),
(
Vectors.sparse(3, {1: 1.0, 2: 5.5}),
0,
[0.5, 0.5],
1,
[0.27574146, 0.72425854],
),
],
[
"features",
"expected_prediction",
"expected_probability",
"expected_prediction_with_params",
"expected_probability_with_params",
],
)
# kwargs test (using the above data, train, we get the same results)
self.cls_params_kwargs = {"tree_method": "approx", "sketch_eps": 0.03}
@@ -610,6 +789,22 @@ class XgboostLocalTest(SparkTestCase):
bst.load_model(path)
self.assertEqual(model.get_booster().save_raw("json"), bst.save_raw("json"))
def test_convert_to_sklearn_model_reg(self) -> None:
regressor = SparkXGBRegressor(
n_estimators=200, missing=2.0, max_depth=3, sketch_eps=0.5
)
reg_model = regressor.fit(self.reg_df_train)
sklearn_regressor = regressor._convert_to_sklearn_model(
reg_model.get_booster().save_raw("json"),
reg_model.get_booster().save_config(),
)
assert isinstance(sklearn_regressor, XGBRegressor)
assert sklearn_regressor.n_estimators == 200
assert sklearn_regressor.missing == 2.0
assert sklearn_regressor.max_depth == 3
assert sklearn_regressor.get_params()["sketch_eps"] == 0.5
def test_regressor_params_basic(self):
py_reg = SparkXGBRegressor()
self.assertTrue(hasattr(py_reg, "n_estimators"))
@@ -665,11 +860,6 @@
):
SparkXGBClassifier(featuresCol="f1")
def test_gpu_param_setting(self):
py_cls = SparkXGBClassifier(use_gpu=True)
train_params = py_cls._get_distributed_train_params(self.cls_df_train)
assert train_params["tree_method"] == "gpu_hist"
@staticmethod
def test_param_value_converter():
py_cls = SparkXGBClassifier(missing=np.float64(1.0), sketch_eps=np.float64(0.3))
@@ -691,16 +881,6 @@ class XgboostLocalTest(SparkTestCase):
np.isclose(row.prediction, row.expected_prediction, atol=1e-3)
)
def test_classifier_basic(self):
classifier = SparkXGBClassifier()
model = classifier.fit(self.cls_df_train)
pred_result = model.transform(self.cls_df_test).collect()
for row in pred_result:
self.assertEqual(row.prediction, row.expected_prediction)
self.assertTrue(
np.allclose(row.probability, row.expected_probability, rtol=1e-3)
)
def test_multi_classifier(self):
classifier = SparkXGBClassifier()
model = classifier.fit(self.multi_cls_df_train)
@@ -710,12 +890,6 @@ class XgboostLocalTest(SparkTestCase):
np.allclose(row.probability, row.expected_probability, rtol=1e-3)
)
def _check_sub_dict_match(self, sub_dist, whole_dict, excluding_keys):
for k in sub_dist:
if k not in excluding_keys:
self.assertTrue(k in whole_dict, f"check on {k} failed")
self.assertEqual(sub_dist[k], whole_dict[k], f"check on {k} failed")
def test_regressor_with_params(self):
regressor = SparkXGBRegressor(**self.reg_params)
all_params = dict(
@@ -723,7 +897,7 @@ class XgboostLocalTest(SparkTestCase):
**(regressor._gen_fit_params_dict()),
**(regressor._gen_predict_params_dict()),
)
check_sub_dict_match(
self.reg_params, all_params, excluding_keys=_non_booster_params
)
@@ -733,7 +907,7 @@ class XgboostLocalTest(SparkTestCase):
**(model._gen_fit_params_dict()),
**(model._gen_predict_params_dict()),
)
check_sub_dict_match(
self.reg_params, all_params, excluding_keys=_non_booster_params
)
pred_result = model.transform(self.reg_df_test).collect()
@@ -744,35 +918,6 @@ class XgboostLocalTest(SparkTestCase):
)
)
def test_classifier_with_params(self):
classifier = SparkXGBClassifier(**self.cls_params)
all_params = dict(
**(classifier._gen_xgb_params_dict()),
**(classifier._gen_fit_params_dict()),
**(classifier._gen_predict_params_dict()),
)
self._check_sub_dict_match(
self.cls_params, all_params, excluding_keys=_non_booster_params
)
model = classifier.fit(self.cls_df_train)
all_params = dict(
**(model._gen_xgb_params_dict()),
**(model._gen_fit_params_dict()),
**(model._gen_predict_params_dict()),
)
self._check_sub_dict_match(
self.cls_params, all_params, excluding_keys=_non_booster_params
)
pred_result = model.transform(self.cls_df_test).collect()
for row in pred_result:
self.assertEqual(row.prediction, row.expected_prediction_with_params)
self.assertTrue(
np.allclose(
row.probability, row.expected_probability_with_params, rtol=1e-3
)
)
def test_regressor_model_save_load(self):
tmp_dir = self.get_local_tmp_dir()
path = "file:" + tmp_dir
@@ -797,40 +942,12 @@ class XgboostLocalTest(SparkTestCase):
self.assert_model_compatible(model, tmp_dir)
def test_classifier_model_save_load(self):
tmp_dir = self.get_local_tmp_dir()
path = "file:" + tmp_dir
regressor = SparkXGBClassifier(**self.cls_params)
model = regressor.fit(self.cls_df_train)
model.save(path)
loaded_model = SparkXGBClassifierModel.load(path)
self.assertEqual(model.uid, loaded_model.uid)
for k, v in self.cls_params.items():
self.assertEqual(loaded_model.getOrDefault(k), v)
pred_result = loaded_model.transform(self.cls_df_test).collect()
for row in pred_result:
self.assertTrue(
np.allclose(
row.probability, row.expected_probability_with_params, atol=1e-3
)
)
with self.assertRaisesRegex(AssertionError, "Expected class name"):
SparkXGBRegressorModel.load(path)
self.assert_model_compatible(model, tmp_dir)
@staticmethod
def _get_params_map(params_kv, estimator):
return {getattr(estimator, k): v for k, v in params_kv.items()}
def test_regressor_model_pipeline_save_load(self):
tmp_dir = self.get_local_tmp_dir()
path = "file:" + tmp_dir
regressor = SparkXGBRegressor()
pipeline = Pipeline(stages=[regressor])
pipeline = pipeline.copy(extra=get_params_map(self.reg_params, regressor))
model = pipeline.fit(self.reg_df_train)
model.save(path)
@@ -847,44 +964,6 @@ class XgboostLocalTest(SparkTestCase):
)
self.assert_model_compatible(model.stages[0], tmp_dir)
def test_classifier_model_pipeline_save_load(self):
tmp_dir = self.get_local_tmp_dir()
path = "file:" + tmp_dir
classifier = SparkXGBClassifier()
pipeline = Pipeline(stages=[classifier])
pipeline = pipeline.copy(
extra=self._get_params_map(self.cls_params, classifier)
)
model = pipeline.fit(self.cls_df_train)
model.save(path)
loaded_model = PipelineModel.load(path)
for k, v in self.cls_params.items():
self.assertEqual(loaded_model.stages[0].getOrDefault(k), v)
pred_result = loaded_model.transform(self.cls_df_test).collect()
for row in pred_result:
self.assertTrue(
np.allclose(
row.probability, row.expected_probability_with_params, atol=1e-3
)
)
self.assert_model_compatible(model.stages[0], tmp_dir)
def test_classifier_with_cross_validator(self):
xgb_classifer = SparkXGBClassifier(n_estimators=1)
paramMaps = ParamGridBuilder().addGrid(xgb_classifer.max_depth, [1, 2]).build()
cvBin = CrossValidator(
estimator=xgb_classifer,
estimatorParamMaps=paramMaps,
evaluator=BinaryClassificationEvaluator(),
seed=1,
parallelism=4,
numFolds=2,
)
cvBinModel = cvBin.fit(self.cls_df_train_large)
cvBinModel.transform(self.cls_df_test)
def test_callbacks(self):
from xgboost.callback import LearningRateScheduler
@@ -1003,38 +1082,6 @@ class XgboostLocalTest(SparkTestCase):
classifier = SparkXGBClassifier(use_gpu=True, tree_method="gpu_hist")
classifier = SparkXGBClassifier(use_gpu=True)
def test_convert_to_sklearn_model(self):
classifier = SparkXGBClassifier(
n_estimators=200, missing=2.0, max_depth=3, sketch_eps=0.5
)
clf_model = classifier.fit(self.cls_df_train)
regressor = SparkXGBRegressor(
n_estimators=200, missing=2.0, max_depth=3, sketch_eps=0.5
)
reg_model = regressor.fit(self.reg_df_train)
# Check that regardless of what booster, _convert_to_model converts to the correct class type
sklearn_classifier = classifier._convert_to_sklearn_model(
clf_model.get_booster().save_raw("json"),
clf_model.get_booster().save_config(),
)
assert isinstance(sklearn_classifier, XGBClassifier)
assert sklearn_classifier.n_estimators == 200
assert sklearn_classifier.missing == 2.0
assert sklearn_classifier.max_depth == 3
assert sklearn_classifier.get_params()["sketch_eps"] == 0.5
sklearn_regressor = regressor._convert_to_sklearn_model(
reg_model.get_booster().save_raw("json"),
reg_model.get_booster().save_config(),
)
assert isinstance(sklearn_regressor, XGBRegressor)
assert sklearn_regressor.n_estimators == 200
assert sklearn_regressor.missing == 2.0
assert sklearn_regressor.max_depth == 3
assert sklearn_classifier.get_params()["sketch_eps"] == 0.5
def test_feature_importances(self):
reg1 = SparkXGBRegressor(**self.reg_params)
model = reg1.fit(self.reg_df_train)
@@ -1060,32 +1107,6 @@ class XgboostLocalTest(SparkTestCase):
np.isclose(row.prediction, row.expected_prediction, atol=1e-3)
)
def test_classifier_array_col_as_feature(self):
train_dataset = self.cls_df_train.withColumn(
"features", vector_to_array(spark_sql_func.col("features"))
)
test_dataset = self.cls_df_test.withColumn(
"features", vector_to_array(spark_sql_func.col("features"))
)
classifier = SparkXGBClassifier()
model = classifier.fit(train_dataset)
pred_result = model.transform(test_dataset).collect()
for row in pred_result:
self.assertEqual(row.prediction, row.expected_prediction)
self.assertTrue(
np.allclose(row.probability, row.expected_probability, rtol=1e-3)
)
def test_classifier_with_feature_names_types_weights(self):
classifier = SparkXGBClassifier(
feature_names=["a1", "a2", "a3"],
feature_types=["i", "int", "float"],
feature_weights=[2.0, 5.0, 3.0],
)
model = classifier.fit(self.cls_df_train)
model.transform(self.cls_df_test).collect()
def test_regressor_with_sparse_optim(self):
regressor = SparkXGBRegressor(missing=0.0)
model = regressor.fit(self.reg_df_sparse_train)
@@ -1192,11 +1213,6 @@ class XgboostLocalTest(SparkTestCase):
classifier = SparkXGBClassifier(num_workers=4, tree_method=tree_method)
classifier.fit(data_trans)
def test_early_stop_param_validation(self):
classifier = SparkXGBClassifier(early_stopping_rounds=1)
with pytest.raises(ValueError, match="early_stopping_rounds"):
classifier.fit(self.cls_df_train)
def test_unsupported_params(self):
with pytest.raises(ValueError, match="evals_result"):
SparkXGBClassifier(evals_result={})