[pyspark] Handle the device parameter in pyspark. (#9390)

- Handle the new `device` parameter in PySpark.
- Deprecate the old `use_gpu` parameter.
This commit is contained in:
Jiaming Yuan
2023-07-18 08:47:03 +08:00
committed by GitHub
parent 2a0ff209ff
commit 6e18d3a290
10 changed files with 244 additions and 169 deletions

View File

@@ -276,6 +276,27 @@ def _check_call(ret: int) -> None:
raise XGBoostError(py_str(_LIB.XGBGetLastError()))
def _check_distributed_params(kwargs: Dict[str, Any]) -> None:
"""Validate parameters in distributed environments."""
device = kwargs.get("device", None)
if device and not isinstance(device, str):
msg = "Invalid type for the `device` parameter"
msg += _expect((str,), type(device))
raise TypeError(msg)
if device and device.find(":") != -1:
raise ValueError(
"Distributed training doesn't support selecting device ordinal as GPUs are"
" managed by the distributed framework. use `device=cuda` or `device=gpu`"
" instead."
)
if kwargs.get("booster", None) == "gblinear":
raise NotImplementedError(
f"booster `{kwargs['booster']}` is not supported for distributed training."
)
def build_info() -> dict:
"""Build information of XGBoost. The returned value format is not stable. Also,
please note that build time dependency is not the same as runtime dependency. For

View File

@@ -70,6 +70,7 @@ from .core import (
Metric,
Objective,
QuantileDMatrix,
_check_distributed_params,
_deprecate_positional_args,
_expect,
)
@@ -924,17 +925,7 @@ async def _train_async(
) -> Optional[TrainReturnT]:
workers = _get_workers_from_data(dtrain, evals)
_rabit_args = await _get_rabit_args(len(workers), dconfig, client)
if params.get("booster", None) == "gblinear":
raise NotImplementedError(
f"booster `{params['booster']}` is not yet supported for dask."
)
device = params.get("device", None)
if device and device.find(":") != -1:
raise ValueError(
"The dask interface for XGBoost doesn't support selecting specific device"
" ordinal. Use `device=cpu` or `device=cuda` instead."
)
_check_distributed_params(params)
def dispatched_train(
parameters: Dict,

View File

@@ -1004,13 +1004,17 @@ class XGBModel(XGBModelBase):
Validation metrics will help us track the performance of the model.
eval_metric : str, list of str, or callable, optional
.. deprecated:: 1.6.0
Use `eval_metric` in :py:meth:`__init__` or :py:meth:`set_params` instead.
Use `eval_metric` in :py:meth:`__init__` or :py:meth:`set_params` instead.
early_stopping_rounds : int
.. deprecated:: 1.6.0
Use `early_stopping_rounds` in :py:meth:`__init__` or
:py:meth:`set_params` instead.
Use `early_stopping_rounds` in :py:meth:`__init__` or :py:meth:`set_params`
instead.
verbose :
If `verbose` is True and an evaluation set is used, the evaluation metric
measured on the validation set is printed to stdout at each boosting stage.

View File

@@ -60,7 +60,7 @@ from scipy.special import expit, softmax # pylint: disable=no-name-in-module
import xgboost
from xgboost import XGBClassifier
from xgboost.compat import is_cudf_available
from xgboost.core import Booster
from xgboost.core import Booster, _check_distributed_params
from xgboost.sklearn import DEFAULT_N_ESTIMATORS, XGBModel, _can_use_qdm
from xgboost.training import train as worker_train
@@ -92,6 +92,7 @@ from .utils import (
get_class_name,
get_logger,
serialize_booster,
use_cuda,
)
# Put pyspark specific params here, they won't be passed to XGBoost.
@@ -108,7 +109,6 @@ _pyspark_specific_params = [
"arbitrary_params_dict",
"force_repartition",
"num_workers",
"use_gpu",
"feature_names",
"features_cols",
"enable_sparse_data_optim",
@@ -132,8 +132,7 @@ _pyspark_param_alias_map = {
_inverse_pyspark_param_alias_map = {v: k for k, v in _pyspark_param_alias_map.items()}
_unsupported_xgb_params = [
"gpu_id", # we have "use_gpu" pyspark param instead.
"device", # we have "use_gpu" pyspark param instead.
"gpu_id", # we have "device" pyspark param instead.
"enable_categorical", # Use feature_types param to specify categorical feature instead
"use_label_encoder",
"n_jobs", # Do not allow user to set it, will use `spark.task.cpus` value instead.
@@ -198,11 +197,24 @@ class _SparkXGBParams(
"The number of XGBoost workers. Each XGBoost worker corresponds to one spark task.",
TypeConverters.toInt,
)
device = Param(
Params._dummy(),
"device",
(
"The device type for XGBoost executors. Available options are `cpu`,`cuda`"
" and `gpu`. Set `device` to `cuda` or `gpu` if the executors are running "
"on GPU instances. Currently, only one GPU per task is supported."
),
TypeConverters.toString,
)
use_gpu = Param(
Params._dummy(),
"use_gpu",
"A boolean variable. Set use_gpu=true if the executors "
+ "are running on GPU instances. Currently, only one GPU per task is supported.",
(
"Deprecated, use `device` instead. A boolean variable. Set use_gpu=true "
"if the executors are running on GPU instances. Currently, only one GPU per"
" task is supported."
),
TypeConverters.toBoolean,
)
force_repartition = Param(
@@ -336,10 +348,20 @@ class _SparkXGBParams(
f"It cannot be less than 1 [Default is 1]"
)
tree_method = self.getOrDefault(self.getParam("tree_method"))
if (
self.getOrDefault(self.use_gpu) or use_cuda(self.getOrDefault(self.device))
) and not _can_use_qdm(tree_method):
raise ValueError(
f"The `{tree_method}` tree method is not supported on GPU."
)
if self.getOrDefault(self.features_cols):
if not self.getOrDefault(self.use_gpu):
if not use_cuda(self.getOrDefault(self.device)) and not self.getOrDefault(
self.use_gpu
):
raise ValueError(
"features_col param with list value requires enabling use_gpu."
"features_col param with list value requires `device=cuda`."
)
if self.getOrDefault("objective") is not None:
@@ -392,17 +414,7 @@ class _SparkXGBParams(
"`pyspark.ml.linalg.Vector` type."
)
if self.getOrDefault(self.use_gpu):
tree_method = self.getParam("tree_method")
if (
self.getOrDefault(tree_method) is not None
and self.getOrDefault(tree_method) != "gpu_hist"
):
raise ValueError(
f"tree_method should be 'gpu_hist' or None when use_gpu is True,"
f"found {self.getOrDefault(tree_method)}."
)
if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu):
gpu_per_task = (
_get_spark_session()
.sparkContext.getConf()
@@ -424,8 +436,8 @@ class _SparkXGBParams(
# so it's okay for printing the below warning instead of checking the real
# gpu numbers and raising the exception.
get_logger(self.__class__.__name__).warning(
"You enabled use_gpu in spark local mode. Please make sure your local node "
"has at least %d GPUs",
"You enabled GPU in spark local mode. Please make sure your local "
"node has at least %d GPUs",
self.getOrDefault(self.num_workers),
)
else:
@@ -558,6 +570,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
# they are added in `setParams`.
self._setDefault(
num_workers=1,
device="cpu",
use_gpu=False,
force_repartition=False,
repartition_random_shuffle=False,
@@ -566,9 +579,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
arbitrary_params_dict={},
)
def setParams(
self, **kwargs: Dict[str, Any]
) -> None: # pylint: disable=invalid-name
def setParams(self, **kwargs: Any) -> None: # pylint: disable=invalid-name
"""
Set params for the estimator.
"""
@@ -613,6 +624,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
)
raise ValueError(err_msg)
_extra_params[k] = v
_check_distributed_params(kwargs)
_existing_extra_params = self.getOrDefault(self.arbitrary_params_dict)
self._set(arbitrary_params_dict={**_existing_extra_params, **_extra_params})
@@ -709,9 +722,6 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
# TODO: support "num_parallel_tree" for random forest
params["num_boost_round"] = self.getOrDefault("n_estimators")
if self.getOrDefault(self.use_gpu):
params["tree_method"] = "gpu_hist"
return params
@classmethod
@@ -883,8 +893,9 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
dmatrix_kwargs,
) = self._get_xgb_parameters(dataset)
use_gpu = self.getOrDefault(self.use_gpu)
run_on_gpu = use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(
self.use_gpu
)
is_local = _is_local(_get_spark_session().sparkContext)
num_workers = self.getOrDefault(self.num_workers)
@@ -903,7 +914,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
dev_ordinal = None
use_qdm = _can_use_qdm(booster_params.get("tree_method", None))
if use_gpu:
if run_on_gpu:
dev_ordinal = (
context.partitionId() if is_local else _get_gpu_id(context)
)

View File

@@ -3,8 +3,8 @@
# pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name
# pylint: disable=unused-argument, too-many-locals
from typing import Any, Dict, List, Optional, Type, Union
import warnings
from typing import Any, List, Optional, Type, Union
import numpy as np
from pyspark import keyword_only
@@ -77,27 +77,35 @@ def _set_pyspark_xgb_cls_param_attrs(
set_param_attrs(name, param_obj)
def _deprecated_use_gpu() -> None:
warnings.warn(
"`use_gpu` is deprecated since 2.0.0, use `device` instead", FutureWarning
)
class SparkXGBRegressor(_SparkXGBEstimator):
"""SparkXGBRegressor is a PySpark ML estimator. It implements the XGBoost regression
algorithm based on XGBoost python library, and it can be used in PySpark Pipeline
and PySpark ML meta algorithms like :py:class:`~pyspark.ml.tuning.CrossValidator`/
:py:class:`~pyspark.ml.tuning.TrainValidationSplit`/
:py:class:`~pyspark.ml.classification.OneVsRest`
and PySpark ML meta algorithms like
- :py:class:`~pyspark.ml.tuning.CrossValidator`/
- :py:class:`~pyspark.ml.tuning.TrainValidationSplit`/
- :py:class:`~pyspark.ml.classification.OneVsRest`
SparkXGBRegressor automatically supports most of the parameters in
:py:class:`xgboost.XGBRegressor` constructor and most of the parameters used in
:py:meth:`xgboost.XGBRegressor.fit` and :py:meth:`xgboost.XGBRegressor.predict` method.
:py:meth:`xgboost.XGBRegressor.fit` and :py:meth:`xgboost.XGBRegressor.predict`
method.
SparkXGBRegressor doesn't support setting `device` but supports another param
`use_gpu`, see doc below for more details.
To enable GPU support, set `device` to `cuda` or `gpu`.
SparkXGBRegressor doesn't support setting `base_margin` explicitly as well, but support
another param called `base_margin_col`. see doc below for more details.
SparkXGBRegressor doesn't support setting `base_margin` explicitly as well, but
support another param called `base_margin_col`. see doc below for more details.
SparkXGBRegressor doesn't support `validate_features` and `output_margin` param.
SparkXGBRegressor doesn't support setting `nthread` xgboost param, instead, the `nthread`
param for each xgboost worker will be set equal to `spark.task.cpus` config value.
SparkXGBRegressor doesn't support setting `nthread` xgboost param, instead, the
`nthread` param for each xgboost worker will be set equal to `spark.task.cpus`
config value.
Parameters
@@ -133,8 +141,11 @@ class SparkXGBRegressor(_SparkXGBEstimator):
How many XGBoost workers to be used to train.
Each XGBoost worker corresponds to one spark task.
use_gpu:
Boolean value to specify whether the executors are running on GPU
instances.
.. deprecated:: 2.0.0
Use `device` instead.
device:
Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.
force_repartition:
Boolean value to specify if forcing the input dataset to be repartitioned
before XGBoost training.
@@ -193,14 +204,17 @@ class SparkXGBRegressor(_SparkXGBEstimator):
weight_col: Optional[str] = None,
base_margin_col: Optional[str] = None,
num_workers: int = 1,
use_gpu: bool = False,
use_gpu: Optional[bool] = None,
device: Optional[str] = None,
force_repartition: bool = False,
repartition_random_shuffle: bool = False,
enable_sparse_data_optim: bool = False,
**kwargs: Dict[str, Any],
**kwargs: Any,
) -> None:
super().__init__()
input_kwargs = self._input_kwargs
if use_gpu:
_deprecated_use_gpu()
self.setParams(**input_kwargs)
@classmethod
@@ -238,27 +252,29 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
"""SparkXGBClassifier is a PySpark ML estimator. It implements the XGBoost
classification algorithm based on XGBoost python library, and it can be used in
PySpark Pipeline and PySpark ML meta algorithms like
:py:class:`~pyspark.ml.tuning.CrossValidator`/
:py:class:`~pyspark.ml.tuning.TrainValidationSplit`/
:py:class:`~pyspark.ml.classification.OneVsRest`
- :py:class:`~pyspark.ml.tuning.CrossValidator`/
- :py:class:`~pyspark.ml.tuning.TrainValidationSplit`/
- :py:class:`~pyspark.ml.classification.OneVsRest`
SparkXGBClassifier automatically supports most of the parameters in
:py:class:`xgboost.XGBClassifier` constructor and most of the parameters used in
:py:meth:`xgboost.XGBClassifier.fit` and :py:meth:`xgboost.XGBClassifier.predict` method.
:py:meth:`xgboost.XGBClassifier.fit` and :py:meth:`xgboost.XGBClassifier.predict`
method.
SparkXGBClassifier doesn't support setting `device` but support another param
`use_gpu`, see doc below for more details.
To enable GPU support, set `device` to `cuda` or `gpu`.
SparkXGBClassifier doesn't support setting `base_margin` explicitly as well, but support
another param called `base_margin_col`. see doc below for more details.
SparkXGBClassifier doesn't support setting `base_margin` explicitly as well, but
support another param called `base_margin_col`. see doc below for more details.
SparkXGBClassifier doesn't support setting `output_margin`, but we can get output margin
from the raw prediction column. See `raw_prediction_col` param doc below for more details.
SparkXGBClassifier doesn't support setting `output_margin`, but we can get output
margin from the raw prediction column. See `raw_prediction_col` param doc below for
more details.
SparkXGBClassifier doesn't support `validate_features` and `output_margin` param.
SparkXGBClassifier doesn't support setting `nthread` xgboost param, instead, the `nthread`
param for each xgboost worker will be set equal to `spark.task.cpus` config value.
SparkXGBClassifier doesn't support setting `nthread` xgboost param, instead, the
`nthread` param for each xgboost worker will be set equal to `spark.task.cpus`
config value.
Parameters
@@ -300,8 +316,11 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
How many XGBoost workers to be used to train.
Each XGBoost worker corresponds to one spark task.
use_gpu:
Boolean value to specify whether the executors are running on GPU
instances.
.. deprecated:: 2.0.0
Use `device` instead.
device:
Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.
force_repartition:
Boolean value to specify if forcing the input dataset to be repartitioned
before XGBoost training.
@@ -360,11 +379,12 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
weight_col: Optional[str] = None,
base_margin_col: Optional[str] = None,
num_workers: int = 1,
use_gpu: bool = False,
use_gpu: Optional[bool] = None,
device: Optional[str] = None,
force_repartition: bool = False,
repartition_random_shuffle: bool = False,
enable_sparse_data_optim: bool = False,
**kwargs: Dict[str, Any],
**kwargs: Any,
) -> None:
super().__init__()
# The default 'objective' param value comes from sklearn `XGBClassifier` ctor,
@@ -372,6 +392,8 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
# binary or multinomial input dataset, and we need to remove the fixed default
# param value as well to avoid causing ambiguity.
input_kwargs = self._input_kwargs
if use_gpu:
_deprecated_use_gpu()
self.setParams(**input_kwargs)
self._setDefault(objective=None)
@@ -422,19 +444,20 @@ class SparkXGBRanker(_SparkXGBEstimator):
:py:class:`xgboost.XGBRanker` constructor and most of the parameters used in
:py:meth:`xgboost.XGBRanker.fit` and :py:meth:`xgboost.XGBRanker.predict` method.
SparkXGBRanker doesn't support setting `device` but support another param `use_gpu`,
see doc below for more details.
To enable GPU support, set `device` to `cuda` or `gpu`.
SparkXGBRanker doesn't support setting `base_margin` explicitly as well, but support
another param called `base_margin_col`. see doc below for more details.
SparkXGBRanker doesn't support setting `output_margin`, but we can get output margin
from the raw prediction column. See `raw_prediction_col` param doc below for more details.
from the raw prediction column. See `raw_prediction_col` param doc below for more
details.
SparkXGBRanker doesn't support `validate_features` and `output_margin` param.
SparkXGBRanker doesn't support setting `nthread` xgboost param, instead, the `nthread`
param for each xgboost worker will be set equal to `spark.task.cpus` config value.
SparkXGBRanker doesn't support setting `nthread` xgboost param, instead, the
`nthread` param for each xgboost worker will be set equal to `spark.task.cpus`
config value.
Parameters
@@ -467,13 +490,15 @@ class SparkXGBRanker(_SparkXGBEstimator):
:py:class:`xgboost.XGBRanker` fit method.
qid_col:
Query id column name.
num_workers:
How many XGBoost workers to be used to train.
Each XGBoost worker corresponds to one spark task.
use_gpu:
Boolean value to specify whether the executors are running on GPU
instances.
.. deprecated:: 2.0.0
Use `device` instead.
device:
Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`.
force_repartition:
Boolean value to specify if forcing the input dataset to be repartitioned
before XGBoost training.
@@ -538,14 +563,17 @@ class SparkXGBRanker(_SparkXGBEstimator):
base_margin_col: Optional[str] = None,
qid_col: Optional[str] = None,
num_workers: int = 1,
use_gpu: bool = False,
use_gpu: Optional[bool] = None,
device: Optional[str] = None,
force_repartition: bool = False,
repartition_random_shuffle: bool = False,
enable_sparse_data_optim: bool = False,
**kwargs: Dict[str, Any],
**kwargs: Any,
) -> None:
super().__init__()
input_kwargs = self._input_kwargs
if use_gpu:
_deprecated_use_gpu()
self.setParams(**input_kwargs)
@classmethod

View File

@@ -7,7 +7,7 @@ import os
import sys
import uuid
from threading import Thread
from typing import Any, Callable, Dict, Set, Type
from typing import Any, Callable, Dict, Optional, Set, Type
import pyspark
from pyspark import BarrierTaskContext, SparkContext, SparkFiles
@@ -186,3 +186,8 @@ def deserialize_booster(model: str) -> Booster:
f.write(model)
booster.load_model(tmp_file_name)
return booster
def use_cuda(device: Optional[str]) -> bool:
    """Return True when ``device`` selects GPU execution ("cuda" or "gpu")."""
    # None and any other string (including "cpu") mean CPU workers.
    return device in {"cuda", "gpu"}