[pyspark] Use quantile dmatrix. (#8284)

2022-10-12 20:38:53 +08:00
parent ce0382dcb0
commit 97a5b088a5
9 changed files with 225 additions and 120 deletions
--- a/tests/python/test_spark/test_data.py
+++ b/tests/python/test_spark/test_data.py
@@ -37,7 +37,7 @@ def test_stack() -> None:
    assert b.shape == (2, 1)


-def run_dmatrix_ctor(is_dqm: bool) -> None:
+def run_dmatrix_ctor(is_dqm: bool, on_gpu: bool) -> None:
    rng = np.random.default_rng(0)
    dfs: List[pd.DataFrame] = []
    n_features = 16
@@ -57,7 +57,7 @@ def run_dmatrix_ctor(is_dqm: bool) -> None:
        df = pd.DataFrame(
            {alias.label: y, alias.margin: m, alias.weight: w, alias.valid: valid}
        )
-        if is_dqm:
+        if on_gpu:
            for j in range(X.shape[1]):
                df[f"feat-{j}"] = pd.Series(X[:, j])
        else:
@@ -65,14 +65,18 @@ def run_dmatrix_ctor(is_dqm: bool) -> None:
        dfs.append(df)

    kwargs = {"feature_types": feature_types}
-    if is_dqm:
+    if on_gpu:
        cols = [f"feat-{i}" for i in range(n_features)]
        train_Xy, valid_Xy = create_dmatrix_from_partitions(
-            iter(dfs), cols, 0, kwargs, False, True
+            iter(dfs), cols, 0, is_dqm, kwargs, False, True
+        )
+    elif is_dqm:
+        train_Xy, valid_Xy = create_dmatrix_from_partitions(
+            iter(dfs), None, None, True, kwargs, False, True
        )
    else:
        train_Xy, valid_Xy = create_dmatrix_from_partitions(
-            iter(dfs), None, None, kwargs, False, True
+            iter(dfs), None, None, False, kwargs, False, True
        )

    assert valid_Xy is not None
@@ -106,7 +110,8 @@ def run_dmatrix_ctor(is_dqm: bool) -> None:


 def test_dmatrix_ctor() -> None:
-    run_dmatrix_ctor(False)
+    run_dmatrix_ctor(is_dqm=False, on_gpu=False)
+    run_dmatrix_ctor(is_dqm=True, on_gpu=False)


 def test_read_csr_matrix_from_unwrapped_spark_vec() -> None:
--- a/tests/python/test_spark/test_spark_local.py
+++ b/tests/python/test_spark/test_spark_local.py
@@ -1047,67 +1047,79 @@ class XgboostLocalTest(SparkTestCase):
        for row in pred_result:
            assert np.isclose(row.prediction, row.expected_prediction, rtol=1e-3)

-    def test_empty_validation_data(self):
-        df_train = self.session.createDataFrame(
-            [
-                (Vectors.dense(10.1, 11.2, 11.3), 0, False),
-                (Vectors.dense(1, 1.2, 1.3), 1, False),
-                (Vectors.dense(14.0, 15.0, 16.0), 0, False),
-                (Vectors.dense(1.1, 1.2, 1.3), 1, True),
-            ],
-            ["features", "label", "val_col"],
-        )
-        classifier = SparkXGBClassifier(
-            num_workers=2,
-            min_child_weight=0.0,
-            reg_alpha=0,
-            reg_lambda=0,
-            validation_indicator_col="val_col",
-        )
-        model = classifier.fit(df_train)
-        pred_result = model.transform(df_train).collect()
-        for row in pred_result:
-            self.assertEqual(row.prediction, row.label)
+    def test_empty_validation_data(self) -> None:
+        for tree_method in [
+            "hist",
+            "approx",
+        ]:  # pytest.mark conflicts with Python unittest, so loop explicitly
+            df_train = self.session.createDataFrame(
+                [
+                    (Vectors.dense(10.1, 11.2, 11.3), 0, False),
+                    (Vectors.dense(1, 1.2, 1.3), 1, False),
+                    (Vectors.dense(14.0, 15.0, 16.0), 0, False),
+                    (Vectors.dense(1.1, 1.2, 1.3), 1, True),
+                ],
+                ["features", "label", "val_col"],
+            )
+            classifier = SparkXGBClassifier(
+                num_workers=2,
+                tree_method=tree_method,
+                min_child_weight=0.0,
+                reg_alpha=0,
+                reg_lambda=0,
+                validation_indicator_col="val_col",
+            )
+            model = classifier.fit(df_train)
+            pred_result = model.transform(df_train).collect()
+            for row in pred_result:
+                self.assertEqual(row.prediction, row.label)

-    def test_empty_train_data(self):
-        df_train = self.session.createDataFrame(
-            [
-                (Vectors.dense(10.1, 11.2, 11.3), 0, True),
-                (Vectors.dense(1, 1.2, 1.3), 1, True),
-                (Vectors.dense(14.0, 15.0, 16.0), 0, True),
-                (Vectors.dense(1.1, 1.2, 1.3), 1, False),
-            ],
-            ["features", "label", "val_col"],
-        )
-        classifier = SparkXGBClassifier(
-            num_workers=2,
-            min_child_weight=0.0,
-            reg_alpha=0,
-            reg_lambda=0,
-            validation_indicator_col="val_col",
-        )
-        model = classifier.fit(df_train)
-        pred_result = model.transform(df_train).collect()
-        for row in pred_result:
-            self.assertEqual(row.prediction, 1.0)
+    def test_empty_train_data(self) -> None:
+        for tree_method in [
+            "hist",
+            "approx",
+        ]:  # pytest.mark conflicts with Python unittest, so loop explicitly
+            df_train = self.session.createDataFrame(
+                [
+                    (Vectors.dense(10.1, 11.2, 11.3), 0, True),
+                    (Vectors.dense(1, 1.2, 1.3), 1, True),
+                    (Vectors.dense(14.0, 15.0, 16.0), 0, True),
+                    (Vectors.dense(1.1, 1.2, 1.3), 1, False),
+                ],
+                ["features", "label", "val_col"],
+            )
+            classifier = SparkXGBClassifier(
+                num_workers=2,
+                min_child_weight=0.0,
+                reg_alpha=0,
+                reg_lambda=0,
+                tree_method=tree_method,
+                validation_indicator_col="val_col",
+            )
+            model = classifier.fit(df_train)
+            pred_result = model.transform(df_train).collect()
+            for row in pred_result:
+                assert row.prediction == 1.0

    def test_empty_partition(self):
        # raw_df.repartition(4) will result in severe data skew; in fact,
        # there is no data at all in reducer partition 1 or reducer partition 2,
        # see https://github.com/dmlc/xgboost/issues/8221
-        raw_df = self.session.range(0, 100, 1, 50).withColumn(
-            "label", spark_sql_func.when(spark_sql_func.rand(1) > 0.5, 1).otherwise(0)
-        )
-        vector_assembler = (
-            VectorAssembler().setInputCols(["id"]).setOutputCol("features")
-        )
-        data_trans = vector_assembler.setHandleInvalid("keep").transform(raw_df)
-        data_trans.show(100)
+        for tree_method in [
+            "hist",
+            "approx",
+        ]:  # pytest.mark conflicts with Python unittest, so loop explicitly
+            raw_df = self.session.range(0, 100, 1, 50).withColumn(
+                "label",
+                spark_sql_func.when(spark_sql_func.rand(1) > 0.5, 1).otherwise(0),
+            )
+            vector_assembler = (
+                VectorAssembler().setInputCols(["id"]).setOutputCol("features")
+            )
+            data_trans = vector_assembler.setHandleInvalid("keep").transform(raw_df)

-        classifier = SparkXGBClassifier(
-            num_workers=4,
-        )
-        classifier.fit(data_trans)
+            classifier = SparkXGBClassifier(num_workers=4, tree_method=tree_method)
+            classifier.fit(data_trans)

    def test_early_stop_param_validation(self):
        classifier = SparkXGBClassifier(early_stopping_rounds=1)