[pyspark] Cleanup data processing. (#8088)

- Use numpy stack for handling list of arrays.
- Reuse concat function from dask.
- Prepare for `QuantileDMatrix`.
- Remove unused code.
- Use iterator for prediction to avoid initializing xgboost model
This commit is contained in:
Jiaming Yuan
2022-07-26 15:00:52 +08:00
committed by GitHub
parent 3970e4e6bb
commit 546de5efd2
9 changed files with 416 additions and 472 deletions

View File

@@ -0,0 +1,23 @@
import sys
from typing import List
import numpy as np
import pandas as pd
import pytest
# Make the shared xgboost test helpers importable before importing them.
sys.path.append("tests/python")
import testing as tm  # noqa: E402

# Skip the entire module when PySpark is not installed.
# Call tm.no_spark() once and reuse the result for both condition and reason.
_no_spark = tm.no_spark()
if _no_spark["condition"]:
    # Pass the reason positionally: the `msg=` keyword is deprecated/removed
    # in modern pytest, while the positional form works on all versions.
    pytest.skip(_no_spark["reason"], allow_module_level=True)

# PySpark tests are not supported on either Windows or macOS.
if sys.platform.startswith("win") or sys.platform.startswith("darwin"):
    pytest.skip(
        "Skipping PySpark tests on Windows and macOS", allow_module_level=True
    )

from test_spark.test_data import run_dmatrix_ctor  # noqa: E402
@pytest.mark.skipif(**tm.no_cudf())
def test_qdm_ctor() -> None:
    """Exercise DMatrix construction with the QuantileDMatrix path enabled."""
    use_qdm = True
    run_dmatrix_ctor(use_qdm)