[pyspark] fix empty data issue when constructing DMatrix (#8245)

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
2022-09-20 16:43:20 +08:00
parent 70df36c99c
commit 520586ffa7
5 changed files with 86 additions and 7 deletions
--- a/tests/python/test_spark/test_data.py
+++ b/tests/python/test_spark/test_data.py
@@ -68,11 +68,11 @@ def run_dmatrix_ctor(is_dqm: bool) -> None:
    if is_dqm:
        cols = [f"feat-{i}" for i in range(n_features)]
        train_Xy, valid_Xy = create_dmatrix_from_partitions(
-            iter(dfs), cols, 0, kwargs, False
+            iter(dfs), cols, 0, kwargs, False, True
        )
    else:
        train_Xy, valid_Xy = create_dmatrix_from_partitions(
-            iter(dfs), None, None, kwargs, False
+            iter(dfs), None, None, kwargs, False, True
        )

    assert valid_Xy is not None
--- a/tests/python/test_spark/test_spark_local.py
+++ b/tests/python/test_spark/test_spark_local.py
@@ -17,6 +17,7 @@ from pyspark.ml.evaluation import (
    BinaryClassificationEvaluator,
    MulticlassClassificationEvaluator,
 )
+from pyspark.ml.feature import VectorAssembler
 from pyspark.ml.functions import vector_to_array
 from pyspark.ml.linalg import Vectors
 from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
@@ -1058,3 +1059,65 @@ class XgboostLocalTest(SparkTestCase):

        for row in pred_result:
            assert np.isclose(row.prediction, row.expected_prediction, rtol=1e-3)
+
+    def test_empty_validation_data(self):
+        df_train = self.session.createDataFrame(
+            [
+                (Vectors.dense(10.1, 11.2, 11.3), 0, False),
+                (Vectors.dense(1, 1.2, 1.3), 1, False),
+                (Vectors.dense(14.0, 15.0, 16.0), 0, False),
+                (Vectors.dense(1.1, 1.2, 1.3), 1, True),
+            ],
+            ["features", "label", "val_col"],
+        )
+        classifier = SparkXGBClassifier(
+            num_workers=2,
+            min_child_weight=0.0,
+            reg_alpha=0,
+            reg_lambda=0,
+            validation_indicator_col="val_col",
+        )
+        model = classifier.fit(df_train)
+        pred_result = model.transform(df_train).collect()
+        for row in pred_result:
+            self.assertEqual(row.prediction, row.label)
+
+    def test_empty_train_data(self):
+        df_train = self.session.createDataFrame(
+            [
+                (Vectors.dense(10.1, 11.2, 11.3), 0, True),
+                (Vectors.dense(1, 1.2, 1.3), 1, True),
+                (Vectors.dense(14.0, 15.0, 16.0), 0, True),
+                (Vectors.dense(1.1, 1.2, 1.3), 1, False),
+            ],
+            ["features", "label", "val_col"],
+        )
+        classifier = SparkXGBClassifier(
+            num_workers=2,
+            min_child_weight=0.0,
+            reg_alpha=0,
+            reg_lambda=0,
+            validation_indicator_col="val_col",
+        )
+        model = classifier.fit(df_train)
+        pred_result = model.transform(df_train).collect()
+        for row in pred_result:
+            self.assertEqual(row.prediction, 1.0)
+
+    def test_empty_partition(self):
+        # raw_df.repartition(4) will result in severe data skew; in fact,
+        # there is no data in reducer partition 1 or reducer partition 2.
+        # See https://github.com/dmlc/xgboost/issues/8221
+        raw_df = self.session.range(0, 100, 1, 50).withColumn(
+            "label", spark_sql_func.when(spark_sql_func.rand(1) > 0.5, 1).otherwise(0)
+        )
+        vector_assembler = (
+            VectorAssembler().setInputCols(["id"]).setOutputCol("features")
+        )
+        data_trans = vector_assembler.setHandleInvalid("keep").transform(raw_df)
+        data_trans.show(100)
+
+        classifier = SparkXGBClassifier(
+            num_workers=4,
+        )
+        classifier.fit(data_trans)
--- a/tests/python/test_spark/utils.py
+++ b/tests/python/test_spark/utils.py
@@ -102,7 +102,7 @@ class SparkTestCase(TestSparkContext, TestTempDir, unittest.TestCase):
    def setUpClass(cls):
        cls.setup_env(
            {
-                "spark.master": "local[2]",
+                "spark.master": "local[4]",
                "spark.python.worker.reuse": "false",
                "spark.driver.host": "127.0.0.1",
                "spark.task.maxFailures": "1",