[pyspark] Cleanup data processing. (#8344)

* Enable additional combinations of ctor parameters.
* Unify procedures for QuantileDMatrix and DMatrix.
This commit is contained in:
Jiaming Yuan
2022-10-18 14:56:23 +08:00
committed by GitHub
parent 521086d56b
commit 3901f5d9db
5 changed files with 68 additions and 55 deletions

View File

@@ -747,7 +747,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
k: v for k, v in train_call_kwargs_params.items() if v is not None
}
dmatrix_kwargs = {k: v for k, v in dmatrix_kwargs.items() if v is not None}
use_qdm = booster_params.get("tree_method") in ("hist", "gpu_hist")
use_qdm = booster_params.get("tree_method", None) in ("hist", "gpu_hist")
def _train_booster(pandas_df_iter):
"""Takes in an RDD partition and outputs a booster for that partition after

View File

@@ -208,14 +208,27 @@ def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments
def append_m(part: pd.DataFrame, name: str, is_valid: bool) -> None:
nonlocal n_features
if name in part.columns and part[name].shape[0] > 0:
array = part[name]
if name == alias.data:
if name == alias.data or name in part.columns:
if (
name == alias.data
and feature_cols is not None
and part[feature_cols].shape[0] > 0 # guard against empty partition
):
array: Optional[np.ndarray] = part[feature_cols]
elif part[name].shape[0] > 0:
array = part[name]
array = stack_series(array)
else:
array = None
if name == alias.data and array is not None:
if n_features == 0:
n_features = array.shape[1]
assert n_features == array.shape[1]
if array is None:
return
if is_valid:
valid_data[name].append(array)
else:
@@ -238,26 +251,6 @@ def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments
else:
train_data[name].append(array)
def append_qdm(part: pd.DataFrame, name: str, is_valid: bool) -> None:
    """Preprocessing for QuantileDMatrix.

    Accumulate the column(s) called ``name`` from one partition chunk into
    the ``train_data`` / ``valid_data`` accumulators held by the enclosing
    function (this is a nested closure; those dicts, ``alias``,
    ``feature_cols``, ``stack_series`` and ``n_features`` all come from the
    outer scope).
    """
    # n_features records the feature-matrix width seen so far, so that every
    # partition can be checked for a consistent number of columns.
    nonlocal n_features
    if name == alias.data or name in part.columns:
        if name == alias.data and feature_cols is not None:
            # Feature data with explicit feature columns: keep the DataFrame
            # slice as-is — presumably consumed directly by the downstream
            # QuantileDMatrix iterator (TODO confirm against caller).
            array = part[feature_cols]
        else:
            # Single column (stacked feature blob or a meta column such as
            # label/weight): a Series whose rows are per-row arrays; stack it
            # into one contiguous 2-D ndarray.
            array = part[name]
            array = stack_series(array)
        if name == alias.data:
            # The first non-empty partition fixes the expected feature count;
            # all later partitions must match it exactly.
            if n_features == 0:
                n_features = array.shape[1]
            assert n_features == array.shape[1]
        # Route the chunk to the validation or training accumulator.
        if is_valid:
            valid_data[name].append(array)
        else:
            train_data[name].append(array)
def make(values: Dict[str, List[np.ndarray]], kwargs: Dict[str, Any]) -> DMatrix:
if len(values) == 0:
get_logger("XGBoostPySpark").warning(
@@ -305,13 +298,14 @@ def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments
meta, params = split_params()
if feature_cols is not None: # rapidsai plugin
assert gpu_id is not None
assert use_qdm is True
cache_partitions(iterator, append_qdm)
if feature_cols is not None and use_qdm:
cache_partitions(iterator, append_fn)
dtrain: DMatrix = make_qdm(train_data, gpu_id, meta, None, params)
elif use_qdm:
cache_partitions(iterator, append_qdm)
elif feature_cols is not None and not use_qdm:
cache_partitions(iterator, append_fn)
dtrain = make(train_data, kwargs)
elif feature_cols is None and use_qdm:
cache_partitions(iterator, append_fn)
dtrain = make_qdm(train_data, gpu_id, meta, None, params)
else:
cache_partitions(iterator, append_fn)