[pyspark] disable repartition_random_shuffle by default (#8283)

Bobby Wang 2022-09-29 10:50:51 +08:00 committed by GitHub
parent 6925b222e0
commit c91fed083d
2 changed files with 9 additions and 1 deletion


@@ -88,6 +88,7 @@ _pyspark_specific_params = [
     "features_cols",
     "enable_sparse_data_optim",
     "qid_col",
+    "repartition_random_shuffle",
 ]
 _non_booster_params = ["missing", "n_estimators", "feature_types", "feature_weights"]
@@ -477,7 +478,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
         num_workers=1,
         use_gpu=False,
         force_repartition=False,
-        repartition_random_shuffle=True,
+        repartition_random_shuffle=False,
         feature_names=None,
         feature_types=None,
         arbitrary_params_dict={},
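
After this change, a user who relied on the old default must request the shuffle explicitly. A minimal sketch, assuming xgboost>=1.7's pyspark module and a prepared Spark DataFrame train_df with the default "features"/"label" columns:

    from xgboost.spark import SparkXGBClassifier

    # Opt back in to the pre-change behavior: randomly shuffle rows
    # when the data is repartitioned across workers.
    classifier = SparkXGBClassifier(
        num_workers=4,
        repartition_random_shuffle=True,
    )
    model = classifier.fit(train_df)  # train_df is assumed to exist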


@@ -9,6 +9,8 @@ from xgboost.compat import concat
 from xgboost import DataIter, DeviceQuantileDMatrix, DMatrix
 
+from .utils import get_logger  # type: ignore
+
 
 def stack_series(series: pd.Series) -> np.ndarray:
     """Stack a series of arrays."""
@@ -246,6 +248,11 @@ def create_dmatrix_from_partitions(  # pylint: disable=too-many-arguments
         else:
             append_fn = append_m
         cache_partitions(iterator, append_fn)
+        if len(train_data) == 0:
+            get_logger("XGBoostPySpark").warning(
+                "Detected an empty partition in the training data. "
+                "Consider enabling repartition_random_shuffle"
+            )
         dtrain = make(train_data, kwargs)
     else:
         cache_partitions(iterator, append_dqm)
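
For context on when the new warning fires: an empty partition typically appears when a DataFrame has fewer rows than partitions, or when rows are skewed across partitions. A minimal sketch, assuming an active SparkSession named spark (illustration only, not part of this change):

    # 8 partitions but only 3 rows: at least 5 partitions end up empty.
    df = spark.range(3).repartition(8)
    sizes = df.rdd.glom().map(len).collect()  # per-partition row counts
    print(sizes)  # several zeros, which would trigger the warning during fit()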