diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index ffeeae8a7..03d431dd4 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -88,6 +88,7 @@ _pyspark_specific_params = [ "features_cols", "enable_sparse_data_optim", "qid_col", + "repartition_random_shuffle", ] _non_booster_params = ["missing", "n_estimators", "feature_types", "feature_weights"] @@ -477,7 +478,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): num_workers=1, use_gpu=False, force_repartition=False, - repartition_random_shuffle=True, + repartition_random_shuffle=False, feature_names=None, feature_types=None, arbitrary_params_dict={}, diff --git a/python-package/xgboost/spark/data.py b/python-package/xgboost/spark/data.py index a5b3b1e81..11dc02340 100644 --- a/python-package/xgboost/spark/data.py +++ b/python-package/xgboost/spark/data.py @@ -9,6 +9,8 @@ from xgboost.compat import concat from xgboost import DataIter, DeviceQuantileDMatrix, DMatrix +from .utils import get_logger # type: ignore + def stack_series(series: pd.Series) -> np.ndarray: """Stack a series of arrays.""" @@ -246,6 +248,11 @@ def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments else: append_fn = append_m cache_partitions(iterator, append_fn) + if len(train_data) == 0: + get_logger("XGBoostPySpark").warning( + "Detected an empty partition in the training data. " + "Consider enabling repartition_random_shuffle" + ) dtrain = make(train_data, kwargs) else: cache_partitions(iterator, append_dqm)