[pyspark] Cleanup data processing. (#8088)

- Use numpy stack for handling list of arrays.
- Reuse concat function from dask.
- Prepare for `QuantileDMatrix`.
- Remove unused code.
- Use iterator for prediction to avoid initializing xgboost model
This commit is contained in:
Jiaming Yuan
2022-07-26 15:00:52 +08:00
committed by GitHub
parent 3970e4e6bb
commit 546de5efd2
9 changed files with 416 additions and 472 deletions

View File

@@ -1,11 +1,9 @@
import sys
import tempfile
import shutil
from typing import List
import pytest
import numpy as np
import pandas as pd
import pytest
import testing as tm
if tm.no_spark()["condition"]:
@@ -13,156 +11,90 @@ if tm.no_spark()["condition"]:
if sys.platform.startswith("win") or sys.platform.startswith("darwin"):
pytest.skip("Skipping PySpark tests on Windows", allow_module_level=True)
from xgboost.spark.data import (
_row_tuple_list_to_feature_matrix_y_w,
_convert_partition_data_to_dmatrix,
)
from xgboost import DMatrix, XGBClassifier
from xgboost.training import train as worker_train
from .utils import SparkTestCase
import logging
logging.getLogger("py4j").setLevel(logging.INFO)
from xgboost.spark.data import alias, create_dmatrix_from_partitions, stack_series
class DataTest(SparkTestCase):
def test_sparse_dense_vector(self):
def row_tup_iter(data):
pdf = pd.DataFrame(data)
yield pdf
def test_stack() -> None:
    """stack_series must turn a series of row-sequences into a 2-d matrix."""
    for rows, width in (([[1, 2], [3, 4]], 2), ([[1], [3]], 1)):
        # rows stored as plain Python lists
        frame = pd.DataFrame({"a": rows})
        stacked = stack_series(frame["a"])
        assert stacked.shape == (2, width)

        # rows stored as numpy arrays
        frame = pd.DataFrame({"a": [np.array(r) for r in rows]})
        stacked = stack_series(frame["a"])
        assert stacked.shape == (2, width)
def run_dmatrix_ctor(is_dqm: bool) -> None:
    """Build partitions of synthetic data and verify create_dmatrix_from_partitions.

    Generates ``n_batches`` partitions, each carrying label, base margin, weight
    and a boolean validation indicator, then checks that the train/valid
    matrices returned by ``create_dmatrix_from_partitions`` have the expected
    rows, columns and meta info (margin, weight, label, feature types).

    Parameters
    ----------
    is_dqm :
        When True, features are laid out as one dataframe column per feature;
        otherwise every row's features are packed into a single array column.
    """
    rng = np.random.default_rng(0)
    dfs: List[pd.DataFrame] = []
    n_features = 16
    n_samples_per_batch = 16
    n_batches = 10
    feature_types = ["float"] * n_features
    for _ in range(n_batches):
        # was: size=256 — derive it from the dimensions so they stay consistent
        X = rng.normal(loc=0, size=n_samples_per_batch * n_features).reshape(
            n_samples_per_batch, n_features
        )
        y = rng.normal(loc=0, size=n_samples_per_batch)
        m = rng.normal(loc=0, size=n_samples_per_batch)
        # shift the weights so they are non-negative
        w = rng.normal(loc=0.5, scale=0.5, size=n_samples_per_batch)
        w -= w.min()
        # was: size=16 — tie the validation mask length to the batch size
        valid = rng.binomial(n=1, p=0.5, size=n_samples_per_batch).astype(np.bool_)
        df = pd.DataFrame(
            {alias.label: y, alias.margin: m, alias.weight: w, alias.valid: valid}
        )
        if is_dqm:
            # one column per feature
            for j in range(X.shape[1]):
                df[f"feat-{j}"] = pd.Series(X[:, j])
        else:
            # a single array column holding each sample's features
            df[alias.data] = pd.Series(list(X))
        dfs.append(df)

    kwargs = {"feature_types": feature_types}
    if is_dqm:
        cols = [f"feat-{i}" for i in range(n_features)]
        train_Xy, valid_Xy = create_dmatrix_from_partitions(iter(dfs), cols, kwargs)
    else:
        train_Xy, valid_Xy = create_dmatrix_from_partitions(iter(dfs), None, kwargs)

    assert valid_Xy is not None
    assert valid_Xy.num_row() + train_Xy.num_row() == n_samples_per_batch * n_batches
    assert train_Xy.num_col() == n_features
    assert valid_Xy.num_col() == n_features

    # Recompute the train/valid split with pandas and compare row counts.
    df = pd.concat(dfs, axis=0)
    df_train = df.loc[~df[alias.valid], :]
    df_valid = df.loc[df[alias.valid], :]

    assert df_train.shape[0] == train_Xy.num_row()
    assert df_valid.shape[0] == valid_Xy.num_row()

    # margin
    np.testing.assert_allclose(
        df_train[alias.margin].to_numpy(), train_Xy.get_base_margin()
    )
    np.testing.assert_allclose(
        df_valid[alias.margin].to_numpy(), valid_Xy.get_base_margin()
    )
    # weight
    np.testing.assert_allclose(df_train[alias.weight].to_numpy(), train_Xy.get_weight())
    np.testing.assert_allclose(df_valid[alias.weight].to_numpy(), valid_Xy.get_weight())
    # label
    np.testing.assert_allclose(df_train[alias.label].to_numpy(), train_Xy.get_label())
    np.testing.assert_allclose(df_valid[alias.label].to_numpy(), valid_Xy.get_label())

    np.testing.assert_equal(train_Xy.feature_types, feature_types)
    np.testing.assert_equal(valid_Xy.feature_types, feature_types)
# DMatrix creation with weights
expected_weight = np.array([0.2, 0.8] * 100)
expected_dmatrix = DMatrix(
data=expected_features, label=expected_labels, weight=expected_weight
)
data["weight"] = [0.2, 0.8] * 100
output_dmatrix = _convert_partition_data_to_dmatrix(
[pd.DataFrame(data)],
has_weight=True,
has_validation=False,
has_base_margin=False,
)
model.fit(expected_features, expected_labels, sample_weight=expected_weight)
expected_preds = model.get_booster().predict(expected_dmatrix)
output_preds = model.get_booster().predict(output_dmatrix)
self.assertTrue(np.allclose(expected_preds, output_preds, atol=1e-3))
def test_external_storage(self):
    """Train a booster from a partition-built DMatrix and check its
    predictions match a booster trained on an equivalent in-memory DMatrix,
    with and without sample weights."""
    # Instantiating base data (features, labels)
    features = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100)
    labels = np.array([1, 0] * 100)
    normal_dmatrix = DMatrix(features, labels)
    # Prediction input shared by both boosters below.
    test_dmatrix = DMatrix(features)

    data = {
        "values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100,
        "label": [1, 0] * 100,
    }

    # Creating the dmatrix based on storage
    # NOTE(review): temporary_path is created and removed but never passed to
    # _convert_partition_data_to_dmatrix — possibly vestigial; confirm.
    temporary_path = tempfile.mkdtemp()
    storage_dmatrix = _convert_partition_data_to_dmatrix(
        [pd.DataFrame(data)],
        has_weight=False,
        has_validation=False,
        has_base_margin=False,
    )

    # Testing without weights: both matrices hold the same rows, so the two
    # boosters should predict (almost) identically.
    normal_booster = worker_train({}, normal_dmatrix)
    storage_booster = worker_train({}, storage_dmatrix)
    normal_preds = normal_booster.predict(test_dmatrix)
    storage_preds = storage_booster.predict(test_dmatrix)
    self.assertTrue(np.allclose(normal_preds, storage_preds, atol=1e-3))
    shutil.rmtree(temporary_path)

    # Testing weights: repeat the comparison with per-sample weights attached
    # to both the in-memory and the partition-built matrix.
    weights = np.array([0.2, 0.8] * 100)
    normal_dmatrix = DMatrix(data=features, label=labels, weight=weights)
    data["weight"] = [0.2, 0.8] * 100

    temporary_path = tempfile.mkdtemp()
    storage_dmatrix = _convert_partition_data_to_dmatrix(
        [pd.DataFrame(data)],
        has_weight=True,
        has_validation=False,
        has_base_margin=False,
    )
    normal_booster = worker_train({}, normal_dmatrix)
    storage_booster = worker_train({}, storage_dmatrix)
    normal_preds = normal_booster.predict(test_dmatrix)
    storage_preds = storage_booster.predict(test_dmatrix)
    self.assertTrue(np.allclose(normal_preds, storage_preds, atol=1e-3))
    shutil.rmtree(temporary_path)
def test_dmatrix_ctor() -> None:
    """Run the DMatrix-construction checks on the single-array-column layout
    (is_dqm=False path of run_dmatrix_ctor)."""
    run_dmatrix_ctor(False)