[pyspark] Cleanup data processing. (#8088)

- Use numpy stack for handling list of arrays.
- Reuse concat function from dask.
- Prepare for `QuantileDMatrix`.
- Remove unused code.
- Use iterator for prediction to avoid initializing xgboost model
This commit is contained in:
Jiaming Yuan
2022-07-26 15:00:52 +08:00
committed by GitHub
parent 3970e4e6bb
commit 546de5efd2
9 changed files with 416 additions and 472 deletions

View File

@@ -0,0 +1,23 @@
import sys
from typing import List
import numpy as np
import pandas as pd
import pytest
# Make the shared xgboost test helpers importable before importing them.
sys.path.append("tests/python")
import testing as tm  # noqa: E402

# Skip the entire module when PySpark is not installed.
# Call tm.no_spark() once and reuse the result for both condition and reason.
_no_spark = tm.no_spark()
if _no_spark["condition"]:
    # Pass the reason positionally: the `msg=` keyword is deprecated/removed
    # in modern pytest, while the positional form works on all versions.
    pytest.skip(_no_spark["reason"], allow_module_level=True)

# PySpark tests are not supported on either Windows or macOS.
if sys.platform.startswith("win") or sys.platform.startswith("darwin"):
    pytest.skip(
        "Skipping PySpark tests on Windows and macOS", allow_module_level=True
    )

from test_spark.test_data import run_dmatrix_ctor  # noqa: E402
@pytest.mark.skipif(**tm.no_cudf())
def test_qdm_ctor() -> None:
    """Exercise DMatrix construction with the QuantileDMatrix path enabled."""
    use_qdm = True
    run_dmatrix_ctor(use_qdm)