Define the new device parameter. (#9362)

This commit is contained in:
Jiaming Yuan
2023-07-13 19:30:25 +08:00
committed by GitHub
parent 2d0cd2817e
commit 04aff3af8e
63 changed files with 827 additions and 477 deletions

View File

@@ -1393,13 +1393,13 @@ class _ProxyDMatrix(DMatrix):
class QuantileDMatrix(DMatrix):
"""A DMatrix variant that generates quantilized data directly from input for
``hist`` and ``gpu_hist`` tree methods. This DMatrix is primarily designed to save
memory in training by avoiding intermediate storage. Set ``max_bin`` to control the
number of bins during quantisation, which should be consistent with the training
parameter ``max_bin``. When ``QuantileDMatrix`` is used for validation/test dataset,
``ref`` should be another ``QuantileDMatrix``(or ``DMatrix``, but not recommended as
it defeats the purpose of saving memory) constructed from training dataset. See
"""A DMatrix variant that generates quantilized data directly from input for the
``hist`` tree method. This DMatrix is primarily designed to save memory in training
by avoiding intermediate storage. Set ``max_bin`` to control the number of bins
during quantisation, which should be consistent with the training parameter
``max_bin``. When ``QuantileDMatrix`` is used for validation/test dataset, ``ref``
should be another ``QuantileDMatrix`` (or ``DMatrix``, but not recommended as it
defeats the purpose of saving memory) constructed from training dataset. See
:py:obj:`xgboost.DMatrix` for documents on meta info.
.. note::
@@ -2277,10 +2277,10 @@ class Booster:
.. code-block:: python
booster.set_param({"gpu_id": "0", "tree_method": "gpu_hist"})
booster.set_param({"device": "cuda:0"})
booster.inplace_predict(cupy_array)
booster.set_param({"gpu_id": "-1", "tree_method": "hist"})
booster.set_param({"device": "cpu"})
booster.inplace_predict(numpy_array)
.. versionadded:: 1.1.0
@@ -2311,8 +2311,8 @@ class Booster:
Returns
-------
prediction : numpy.ndarray/cupy.ndarray
The prediction result. When input data is on GPU, prediction
result is stored in a cupy array.
The prediction result. When input data is on GPU, prediction result is
stored in a cupy array.
"""
preds = ctypes.POINTER(ctypes.c_float)()

View File

@@ -273,7 +273,7 @@ __model_doc = f"""
* For linear model, only "weight" is defined and it's the normalized coefficients
without bias.
gpu_id : Optional[int]
device : Optional[str]
Device to run on, e.g. ``cpu`` or ``cuda:0``.
validate_parameters : Optional[bool]
Give warnings for unknown parameter.
@@ -647,7 +647,7 @@ class XGBModel(XGBModelBase):
monotone_constraints: Optional[Union[Dict[str, int], str]] = None,
interaction_constraints: Optional[Union[str, Sequence[Sequence[str]]]] = None,
importance_type: Optional[str] = None,
gpu_id: Optional[int] = None,
device: Optional[str] = None,
validate_parameters: Optional[bool] = None,
enable_categorical: bool = False,
feature_types: Optional[FeatureTypes] = None,
@@ -693,7 +693,7 @@ class XGBModel(XGBModelBase):
self.monotone_constraints = monotone_constraints
self.interaction_constraints = interaction_constraints
self.importance_type = importance_type
self.gpu_id = gpu_id
self.device = device
self.validate_parameters = validate_parameters
self.enable_categorical = enable_categorical
self.feature_types = feature_types

View File

@@ -1,4 +1,4 @@
"""Xgboost pyspark integration submodule for core code."""
"""XGBoost pyspark integration submodule for core code."""
import base64
# pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name
@@ -133,6 +133,7 @@ _inverse_pyspark_param_alias_map = {v: k for k, v in _pyspark_param_alias_map.it
_unsupported_xgb_params = [
"gpu_id", # we have "use_gpu" pyspark param instead.
"device", # we have "use_gpu" pyspark param instead.
"enable_categorical", # Use feature_types param to specify categorical feature instead
"use_label_encoder",
"n_jobs", # Do not allow user to set it, will use `spark.task.cpus` value instead.
@@ -899,12 +900,14 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
context = BarrierTaskContext.get()
gpu_id = None
dev_ordinal = None
use_hist = booster_params.get("tree_method", None) in ("hist", "gpu_hist")
if use_gpu:
gpu_id = context.partitionId() if is_local else _get_gpu_id(context)
booster_params["gpu_id"] = gpu_id
dev_ordinal = (
context.partitionId() if is_local else _get_gpu_id(context)
)
booster_params["device"] = "cuda:" + str(dev_ordinal)
# If cuDF is not installed, use DMatrix instead of QDM,
# because without cuDF, DMatrix performs better than QDM.
# Note: Checking `is_cudf_available` in spark worker side because
@@ -945,7 +948,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
dtrain, dvalid = create_dmatrix_from_partitions(
pandas_df_iter,
feature_prop.features_cols_names,
gpu_id,
dev_ordinal,
use_qdm,
dmatrix_kwargs,
enable_sparse_data_optim=feature_prop.enable_sparse_data_optim,

View File

@@ -157,7 +157,7 @@ def _read_csr_matrix_from_unwrapped_spark_vec(part: pd.DataFrame) -> csr_matrix:
def make_qdm(
data: Dict[str, List[np.ndarray]],
gpu_id: Optional[int],
dev_ordinal: Optional[int],
meta: Dict[str, Any],
ref: Optional[DMatrix],
params: Dict[str, Any],
@@ -165,7 +165,7 @@ def make_qdm(
"""Handle empty partition for QuantileDMatrix."""
if not data:
return QuantileDMatrix(np.empty((0, 0)), ref=ref)
it = PartIter(data, gpu_id, **meta)
it = PartIter(data, dev_ordinal, **meta)
m = QuantileDMatrix(it, **params, ref=ref)
return m
@@ -173,7 +173,7 @@ def make_qdm(
def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments
iterator: Iterator[pd.DataFrame],
feature_cols: Optional[Sequence[str]],
gpu_id: Optional[int],
dev_ordinal: Optional[int],
use_qdm: bool,
kwargs: Dict[str, Any], # use dict to make sure this parameter is passed.
enable_sparse_data_optim: bool,
@@ -187,7 +187,7 @@ def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments
Pyspark partition iterator.
feature_cols:
A sequence of feature names, used only when rapids plugin is enabled.
gpu_id:
dev_ordinal:
Device ordinal, used when GPU is enabled.
use_qdm :
Whether QuantileDMatrix should be used instead of DMatrix.
@@ -304,13 +304,13 @@ def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments
if feature_cols is not None and use_qdm:
cache_partitions(iterator, append_fn)
dtrain: DMatrix = make_qdm(train_data, gpu_id, meta, None, params)
dtrain: DMatrix = make_qdm(train_data, dev_ordinal, meta, None, params)
elif feature_cols is not None and not use_qdm:
cache_partitions(iterator, append_fn)
dtrain = make(train_data, kwargs)
elif feature_cols is None and use_qdm:
cache_partitions(iterator, append_fn)
dtrain = make_qdm(train_data, gpu_id, meta, None, params)
dtrain = make_qdm(train_data, dev_ordinal, meta, None, params)
else:
cache_partitions(iterator, append_fn)
dtrain = make(train_data, kwargs)
@@ -324,7 +324,7 @@ def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments
if has_validation_col:
if use_qdm:
dvalid: Optional[DMatrix] = make_qdm(
valid_data, gpu_id, meta, dtrain, params
valid_data, dev_ordinal, meta, dtrain, params
)
else:
dvalid = make(valid_data, kwargs) if has_validation_col else None

View File

@@ -78,8 +78,7 @@ def _set_pyspark_xgb_cls_param_attrs(
class SparkXGBRegressor(_SparkXGBEstimator):
"""
SparkXGBRegressor is a PySpark ML estimator. It implements the XGBoost regression
"""SparkXGBRegressor is a PySpark ML estimator. It implements the XGBoost regression
algorithm based on XGBoost python library, and it can be used in PySpark Pipeline
and PySpark ML meta algorithms like :py:class:`~pyspark.ml.tuning.CrossValidator`/
:py:class:`~pyspark.ml.tuning.TrainValidationSplit`/
@@ -89,8 +88,8 @@ class SparkXGBRegressor(_SparkXGBEstimator):
:py:class:`xgboost.XGBRegressor` constructor and most of the parameters used in
:py:meth:`xgboost.XGBRegressor.fit` and :py:meth:`xgboost.XGBRegressor.predict` method.
SparkXGBRegressor doesn't support setting `gpu_id` but support another param `use_gpu`,
see doc below for more details.
SparkXGBRegressor doesn't support setting `device` but supports another param
`use_gpu`, see doc below for more details.
SparkXGBRegressor doesn't support setting `base_margin` explicitly either, but supports
another param called `base_margin_col`. See doc below for more details.
@@ -247,8 +246,8 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
:py:class:`xgboost.XGBClassifier` constructor and most of the parameters used in
:py:meth:`xgboost.XGBClassifier.fit` and :py:meth:`xgboost.XGBClassifier.predict` method.
SparkXGBClassifier doesn't support setting `gpu_id` but support another param `use_gpu`,
see doc below for more details.
SparkXGBClassifier doesn't support setting `device` but supports another param
`use_gpu`, see doc below for more details.
SparkXGBClassifier doesn't support setting `base_margin` explicitly either, but supports
another param called `base_margin_col`. See doc below for more details.
@@ -423,7 +422,7 @@ class SparkXGBRanker(_SparkXGBEstimator):
:py:class:`xgboost.XGBRanker` constructor and most of the parameters used in
:py:meth:`xgboost.XGBRanker.fit` and :py:meth:`xgboost.XGBRanker.predict` method.
SparkXGBRanker doesn't support setting `gpu_id` but support another param `use_gpu`,
SparkXGBRanker doesn't support setting `device` but supports another param `use_gpu`,
see doc below for more details.
SparkXGBRanker doesn't support setting `base_margin` explicitly either, but supports

View File

@@ -723,24 +723,6 @@ def predictor_equal(lhs: xgb.DMatrix, rhs: xgb.DMatrix) -> bool:
M = TypeVar("M", xgb.Booster, xgb.XGBModel)
def set_ordinal(ordinal: int, booster: M) -> M:
    """Temporary helper for choosing the device ordinal while `gpu_id` is still
    in use.

    Parameters
    ----------
    ordinal :
        When negative, ``gpu_id=-1`` and ``tree_method="hist"`` are applied.
        Otherwise ``gpu_id=ordinal`` and ``tree_method="gpu_hist"`` are applied.
    booster :
        An ``xgb.Booster`` (configured through ``set_param``) or an
        ``xgb.XGBModel`` (configured through ``set_params``). Objects of any
        other type are left untouched.

    Returns
    -------
    The same ``booster`` object that was passed in.

    """
    negative = ordinal < 0
    config = {
        "gpu_id": -1 if negative else ordinal,
        "tree_method": "hist" if negative else "gpu_hist",
    }

    # The native booster takes a mapping, the sklearn wrapper takes keywords.
    if isinstance(booster, xgb.Booster):
        booster.set_param(config)
    elif isinstance(booster, xgb.XGBModel):
        booster.set_params(**config)

    return booster
def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, np.float64]:
"""Evaluation metric for xgb.train"""
label = dtrain.get_label()