[pyspark] fix empty data issue when constructing DMatrix (#8245)
Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
This commit is contained in:
@@ -658,12 +658,17 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
|
||||
col(self.getOrDefault(self.weightCol)).alias(alias.weight)
|
||||
)
|
||||
|
||||
has_validation_col = False
|
||||
if self.isDefined(self.validationIndicatorCol) and self.getOrDefault(
|
||||
self.validationIndicatorCol
|
||||
):
|
||||
select_cols.append(
|
||||
col(self.getOrDefault(self.validationIndicatorCol)).alias(alias.valid)
|
||||
)
|
||||
# In some cases, see https://issues.apache.org/jira/browse/SPARK-40407,
|
||||
# the df.repartition can result in some reducer partitions without data,
|
||||
# which will cause an exception or a hang when creating the DMatrix.
|
||||
has_validation_col = True
|
||||
|
||||
if self.isDefined(self.base_margin_col) and self.getOrDefault(
|
||||
self.base_margin_col
|
||||
@@ -765,6 +770,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
|
||||
gpu_id,
|
||||
dmatrix_kwargs,
|
||||
enable_sparse_data_optim=enable_sparse_data_optim,
|
||||
has_validation_col=has_validation_col,
|
||||
)
|
||||
if dvalid is not None:
|
||||
dval = [(dtrain, "training"), (dvalid, "validation")]
|
||||
|
||||
@@ -147,12 +147,13 @@ def _read_csr_matrix_from_unwrapped_spark_vec(part: pd.DataFrame) -> csr_matrix:
|
||||
)
|
||||
|
||||
|
||||
def create_dmatrix_from_partitions(
|
||||
def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments
|
||||
iterator: Iterator[pd.DataFrame],
|
||||
feature_cols: Optional[Sequence[str]],
|
||||
gpu_id: Optional[int],
|
||||
kwargs: Dict[str, Any], # use dict to make sure this parameter is passed.
|
||||
enable_sparse_data_optim: bool,
|
||||
has_validation_col: bool,
|
||||
) -> Tuple[DMatrix, Optional[DMatrix]]:
|
||||
"""Create DMatrix from spark data partitions. This is not particularly efficient as
|
||||
we need to convert the pandas series format to numpy then concatenate all the data.
|
||||
@@ -173,7 +174,7 @@ def create_dmatrix_from_partitions(
|
||||
|
||||
def append_m(part: pd.DataFrame, name: str, is_valid: bool) -> None:
|
||||
nonlocal n_features
|
||||
if name in part.columns:
|
||||
if name in part.columns and part[name].shape[0] > 0:
|
||||
array = part[name]
|
||||
if name == alias.data:
|
||||
array = stack_series(array)
|
||||
@@ -224,6 +225,10 @@ def create_dmatrix_from_partitions(
|
||||
train_data[name].append(array)
|
||||
|
||||
def make(values: Dict[str, List[np.ndarray]], kwargs: Dict[str, Any]) -> DMatrix:
|
||||
if len(values) == 0:
|
||||
# We must construct an empty DMatrix to bypass the AllReduce
|
||||
return DMatrix(data=np.empty((0, 0)), **kwargs)
|
||||
|
||||
data = concat_or_none(values[alias.data])
|
||||
label = concat_or_none(values.get(alias.label, None))
|
||||
weight = concat_or_none(values.get(alias.weight, None))
|
||||
@@ -247,9 +252,14 @@ def create_dmatrix_from_partitions(
|
||||
it = PartIter(train_data, gpu_id)
|
||||
dtrain = DeviceQuantileDMatrix(it, **kwargs)
|
||||
|
||||
dvalid = make(valid_data, kwargs) if len(valid_data) != 0 else None
|
||||
# Using has_validation_col here to indicate if there is validation col
|
||||
# instead of getting it from the iterator, since the iterator may be empty
|
||||
# in some special cases. That is to say, we must ensure every worker
|
||||
# constructs a DMatrix even if there is no data, since we need to ensure every
|
||||
# worker does the AllReduce when constructing the DMatrix, or else it may hang
|
||||
# forever.
|
||||
dvalid = make(valid_data, kwargs) if has_validation_col else None
|
||||
|
||||
assert dtrain.num_col() == n_features
|
||||
if dvalid is not None:
|
||||
assert dvalid.num_col() == dtrain.num_col()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user