PySpark XGBoost integration (#8020)

Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu> Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
2022-07-13 13:11:18 +08:00
parent 8959622836
commit 176fec8789
25 changed files with 3650 additions and 12 deletions
--- a/tests/python/test_spark/test_data.py
+++ b/tests/python/test_spark/test_data.py
@@ -0,0 +1,168 @@
+import sys
+import tempfile
+import shutil
+
+import pytest
+import numpy as np
+import pandas as pd
+
+import testing as tm
+
+if tm.no_spark()["condition"]:
+    pytest.skip(msg=tm.no_spark()["reason"], allow_module_level=True)
+if sys.platform.startswith("win") or sys.platform.startswith("darwin"):
+    pytest.skip("Skipping PySpark tests on Windows", allow_module_level=True)
+
+from xgboost.spark.data import (
+    _row_tuple_list_to_feature_matrix_y_w,
+    _convert_partition_data_to_dmatrix,
+)
+
+from xgboost import DMatrix, XGBClassifier
+from xgboost.training import train as worker_train
+from .utils import SparkTestCase
+import logging
+
+logging.getLogger("py4j").setLevel(logging.INFO)
+
+
+class DataTest(SparkTestCase):
+    def test_sparse_dense_vector(self):
+        def row_tup_iter(data):
+            pdf = pd.DataFrame(data)
+            yield pdf
+
+        expected_ndarray = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]])
+        data = {"values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]]}
+        feature_matrix, y, w, _ = _row_tuple_list_to_feature_matrix_y_w(
+            list(row_tup_iter(data)),
+            train=False,
+            has_weight=False,
+            has_fit_base_margin=False,
+            has_predict_base_margin=False,
+        )
+        self.assertIsNone(y)
+        self.assertIsNone(w)
+        self.assertTrue(np.allclose(feature_matrix, expected_ndarray))
+
+        data["label"] = [1, 0]
+        feature_matrix, y, w, _ = _row_tuple_list_to_feature_matrix_y_w(
+            row_tup_iter(data),
+            train=True,
+            has_weight=False,
+            has_fit_base_margin=False,
+            has_predict_base_margin=False,
+        )
+        self.assertIsNone(w)
+        self.assertTrue(np.allclose(feature_matrix, expected_ndarray))
+        self.assertTrue(np.array_equal(y, np.array(data["label"])))
+
+        data["weight"] = [0.2, 0.8]
+        feature_matrix, y, w, _ = _row_tuple_list_to_feature_matrix_y_w(
+            list(row_tup_iter(data)),
+            train=True,
+            has_weight=True,
+            has_fit_base_margin=False,
+            has_predict_base_margin=False,
+        )
+        self.assertTrue(np.allclose(feature_matrix, expected_ndarray))
+        self.assertTrue(np.array_equal(y, np.array(data["label"])))
+        self.assertTrue(np.array_equal(w, np.array(data["weight"])))
+
+    def test_dmatrix_creator(self):
+
+        # This function acts as a pseudo-itertools.chain()
+        def row_tup_iter(data):
+            pdf = pd.DataFrame(data)
+            yield pdf
+
+        # Standard testing DMatrix creation
+        expected_features = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100)
+        expected_labels = np.array([1, 0] * 100)
+        expected_dmatrix = DMatrix(data=expected_features, label=expected_labels)
+
+        data = {
+            "values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100,
+            "label": [1, 0] * 100,
+        }
+        output_dmatrix = _convert_partition_data_to_dmatrix(
+            [pd.DataFrame(data)],
+            has_weight=False,
+            has_validation=False,
+            has_base_margin=False,
+        )
+        # You can't compare DMatrix outputs, so the only way is to predict on the two seperate DMatrices using
+        # the same classifier and making sure the outputs are equal
+        model = XGBClassifier()
+        model.fit(expected_features, expected_labels)
+        expected_preds = model.get_booster().predict(expected_dmatrix)
+        output_preds = model.get_booster().predict(output_dmatrix)
+        self.assertTrue(np.allclose(expected_preds, output_preds, atol=1e-3))
+
+        # DMatrix creation with weights
+        expected_weight = np.array([0.2, 0.8] * 100)
+        expected_dmatrix = DMatrix(
+            data=expected_features, label=expected_labels, weight=expected_weight
+        )
+
+        data["weight"] = [0.2, 0.8] * 100
+        output_dmatrix = _convert_partition_data_to_dmatrix(
+            [pd.DataFrame(data)],
+            has_weight=True,
+            has_validation=False,
+            has_base_margin=False,
+        )
+
+        model.fit(expected_features, expected_labels, sample_weight=expected_weight)
+        expected_preds = model.get_booster().predict(expected_dmatrix)
+        output_preds = model.get_booster().predict(output_dmatrix)
+        self.assertTrue(np.allclose(expected_preds, output_preds, atol=1e-3))
+
+    def test_external_storage(self):
+        # Instantiating base data (features, labels)
+        features = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100)
+        labels = np.array([1, 0] * 100)
+        normal_dmatrix = DMatrix(features, labels)
+        test_dmatrix = DMatrix(features)
+
+        data = {
+            "values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100,
+            "label": [1, 0] * 100,
+        }
+
+        # Creating the dmatrix based on storage
+        temporary_path = tempfile.mkdtemp()
+        storage_dmatrix = _convert_partition_data_to_dmatrix(
+            [pd.DataFrame(data)],
+            has_weight=False,
+            has_validation=False,
+            has_base_margin=False,
+        )
+
+        # Testing without weights
+        normal_booster = worker_train({}, normal_dmatrix)
+        storage_booster = worker_train({}, storage_dmatrix)
+        normal_preds = normal_booster.predict(test_dmatrix)
+        storage_preds = storage_booster.predict(test_dmatrix)
+        self.assertTrue(np.allclose(normal_preds, storage_preds, atol=1e-3))
+        shutil.rmtree(temporary_path)
+
+        # Testing weights
+        weights = np.array([0.2, 0.8] * 100)
+        normal_dmatrix = DMatrix(data=features, label=labels, weight=weights)
+        data["weight"] = [0.2, 0.8] * 100
+
+        temporary_path = tempfile.mkdtemp()
+        storage_dmatrix = _convert_partition_data_to_dmatrix(
+            [pd.DataFrame(data)],
+            has_weight=True,
+            has_validation=False,
+            has_base_margin=False,
+        )
+
+        normal_booster = worker_train({}, normal_dmatrix)
+        storage_booster = worker_train({}, storage_dmatrix)
+        normal_preds = normal_booster.predict(test_dmatrix)
+        storage_preds = storage_booster.predict(test_dmatrix)
+        self.assertTrue(np.allclose(normal_preds, storage_preds, atol=1e-3))
+        shutil.rmtree(temporary_path)