[pyspark] disable repartition_random_shuffle by default (#8283)
This commit is contained in:
parent
6925b222e0
commit
c91fed083d
@ -88,6 +88,7 @@ _pyspark_specific_params = [
|
|||||||
"features_cols",
|
"features_cols",
|
||||||
"enable_sparse_data_optim",
|
"enable_sparse_data_optim",
|
||||||
"qid_col",
|
"qid_col",
|
||||||
|
"repartition_random_shuffle",
|
||||||
]
|
]
|
||||||
|
|
||||||
_non_booster_params = ["missing", "n_estimators", "feature_types", "feature_weights"]
|
_non_booster_params = ["missing", "n_estimators", "feature_types", "feature_weights"]
|
||||||
@ -477,7 +478,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
|
|||||||
num_workers=1,
|
num_workers=1,
|
||||||
use_gpu=False,
|
use_gpu=False,
|
||||||
force_repartition=False,
|
force_repartition=False,
|
||||||
repartition_random_shuffle=True,
|
repartition_random_shuffle=False,
|
||||||
feature_names=None,
|
feature_names=None,
|
||||||
feature_types=None,
|
feature_types=None,
|
||||||
arbitrary_params_dict={},
|
arbitrary_params_dict={},
|
||||||
|
|||||||
@ -9,6 +9,8 @@ from xgboost.compat import concat
|
|||||||
|
|
||||||
from xgboost import DataIter, DeviceQuantileDMatrix, DMatrix
|
from xgboost import DataIter, DeviceQuantileDMatrix, DMatrix
|
||||||
|
|
||||||
|
from .utils import get_logger # type: ignore
|
||||||
|
|
||||||
|
|
||||||
def stack_series(series: pd.Series) -> np.ndarray:
|
def stack_series(series: pd.Series) -> np.ndarray:
|
||||||
"""Stack a series of arrays."""
|
"""Stack a series of arrays."""
|
||||||
@ -246,6 +248,11 @@ def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments
|
|||||||
else:
|
else:
|
||||||
append_fn = append_m
|
append_fn = append_m
|
||||||
cache_partitions(iterator, append_fn)
|
cache_partitions(iterator, append_fn)
|
||||||
|
if len(train_data) == 0:
|
||||||
|
get_logger("XGBoostPySpark").warning(
|
||||||
|
"Detected an empty partition in the training data. "
|
||||||
|
"Consider to enable repartition_random_shuffle"
|
||||||
|
)
|
||||||
dtrain = make(train_data, kwargs)
|
dtrain = make(train_data, kwargs)
|
||||||
else:
|
else:
|
||||||
cache_partitions(iterator, append_dqm)
|
cache_partitions(iterator, append_dqm)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user