[pyspark] Cleanup data processing. (#8344)
* Enable additional combinations of constructor parameters.
* Unify procedures for QuantileDMatrix and DMatrix.
This commit is contained in:
@@ -747,7 +747,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
|
||||
k: v for k, v in train_call_kwargs_params.items() if v is not None
|
||||
}
|
||||
dmatrix_kwargs = {k: v for k, v in dmatrix_kwargs.items() if v is not None}
|
||||
use_qdm = booster_params.get("tree_method") in ("hist", "gpu_hist")
|
||||
use_qdm = booster_params.get("tree_method", None) in ("hist", "gpu_hist")
|
||||
|
||||
def _train_booster(pandas_df_iter):
|
||||
"""Takes in an RDD partition and outputs a booster for that partition after
|
||||
|
||||
@@ -208,14 +208,27 @@ def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments
|
||||
|
||||
def append_m(part: pd.DataFrame, name: str, is_valid: bool) -> None:
|
||||
nonlocal n_features
|
||||
if name in part.columns and part[name].shape[0] > 0:
|
||||
array = part[name]
|
||||
if name == alias.data:
|
||||
if name == alias.data or name in part.columns:
|
||||
if (
|
||||
name == alias.data
|
||||
and feature_cols is not None
|
||||
and part[feature_cols].shape[0] > 0 # guard against empty partition
|
||||
):
|
||||
array: Optional[np.ndarray] = part[feature_cols]
|
||||
elif part[name].shape[0] > 0:
|
||||
array = part[name]
|
||||
array = stack_series(array)
|
||||
else:
|
||||
array = None
|
||||
|
||||
if name == alias.data and array is not None:
|
||||
if n_features == 0:
|
||||
n_features = array.shape[1]
|
||||
assert n_features == array.shape[1]
|
||||
|
||||
if array is None:
|
||||
return
|
||||
|
||||
if is_valid:
|
||||
valid_data[name].append(array)
|
||||
else:
|
||||
@@ -238,26 +251,6 @@ def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments
|
||||
else:
|
||||
train_data[name].append(array)
|
||||
|
||||
def append_qdm(part: pd.DataFrame, name: str, is_valid: bool) -> None:
|
||||
"""Preprocessing for QuantileDMatrix."""
|
||||
nonlocal n_features
|
||||
if name == alias.data or name in part.columns:
|
||||
if name == alias.data and feature_cols is not None:
|
||||
array = part[feature_cols]
|
||||
else:
|
||||
array = part[name]
|
||||
array = stack_series(array)
|
||||
|
||||
if name == alias.data:
|
||||
if n_features == 0:
|
||||
n_features = array.shape[1]
|
||||
assert n_features == array.shape[1]
|
||||
|
||||
if is_valid:
|
||||
valid_data[name].append(array)
|
||||
else:
|
||||
train_data[name].append(array)
|
||||
|
||||
def make(values: Dict[str, List[np.ndarray]], kwargs: Dict[str, Any]) -> DMatrix:
|
||||
if len(values) == 0:
|
||||
get_logger("XGBoostPySpark").warning(
|
||||
@@ -305,13 +298,14 @@ def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments
|
||||
|
||||
meta, params = split_params()
|
||||
|
||||
if feature_cols is not None: # rapidsai plugin
|
||||
assert gpu_id is not None
|
||||
assert use_qdm is True
|
||||
cache_partitions(iterator, append_qdm)
|
||||
if feature_cols is not None and use_qdm:
|
||||
cache_partitions(iterator, append_fn)
|
||||
dtrain: DMatrix = make_qdm(train_data, gpu_id, meta, None, params)
|
||||
elif use_qdm:
|
||||
cache_partitions(iterator, append_qdm)
|
||||
elif feature_cols is not None and not use_qdm:
|
||||
cache_partitions(iterator, append_fn)
|
||||
dtrain = make(train_data, kwargs)
|
||||
elif feature_cols is None and use_qdm:
|
||||
cache_partitions(iterator, append_fn)
|
||||
dtrain = make_qdm(train_data, gpu_id, meta, None, params)
|
||||
else:
|
||||
cache_partitions(iterator, append_fn)
|
||||
|
||||
Reference in New Issue
Block a user