[pyspark] hotfix for GPU setup validation (#9495)

* [pyspark] Fix a bug in validating the GPU configuration

---------

Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
This commit is contained in:
Bobby Wang 2023-08-17 16:01:39 +08:00 committed by GitHub
parent 5188e27513
commit 68be454cfa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -424,35 +424,41 @@ class _SparkXGBParams(
if is_local: if is_local:
# checking spark local mode. # checking spark local mode.
if gpu_per_task: if gpu_per_task is not None:
raise RuntimeError( raise RuntimeError(
"The spark cluster does not support gpu configuration for local mode. " "The spark local mode does not support gpu configuration."
"Please delete spark.executor.resource.gpu.amount and " "Please remove spark.executor.resource.gpu.amount and "
"spark.task.resource.gpu.amount" "spark.task.resource.gpu.amount"
) )
# Support GPU training in Spark local mode is just for debugging purposes, # Support GPU training in Spark local mode is just for debugging
# so it's okay for printing the below warning instead of checking the real # purposes, so it's okay for printing the below warning instead of
# gpu numbers and raising the exception. # checking the real gpu numbers and raising the exception.
get_logger(self.__class__.__name__).warning( get_logger(self.__class__.__name__).warning(
"You enabled GPU in spark local mode. Please make sure your local " "You have enabled GPU in spark local mode. Please make sure your"
"node has at least %d GPUs", " local node has at least %d GPUs",
self.getOrDefault(self.num_workers), self.getOrDefault(self.num_workers),
) )
else: else:
# checking spark non-local mode. # checking spark non-local mode.
if not gpu_per_task or int(gpu_per_task) < 1: if gpu_per_task is not None:
raise RuntimeError( if float(gpu_per_task) < 1.0:
"The spark cluster does not have the necessary GPU" raise ValueError(
+ "configuration for the spark task. Therefore, we cannot" "XGBoost doesn't support GPU fractional configurations. "
+ "run xgboost training using GPU." "Please set `spark.task.resource.gpu.amount=spark.executor"
) ".resource.gpu.amount`"
)
if int(gpu_per_task) > 1: if float(gpu_per_task) > 1.0:
get_logger(self.__class__.__name__).warning( get_logger(self.__class__.__name__).warning(
"You configured %s GPU cores for each spark task, but in " "%s GPUs for each Spark task is configured, but each "
"XGBoost training, every Spark task will only use one GPU core.", "XGBoost training task uses only 1 GPU.",
gpu_per_task, gpu_per_task,
)
else:
raise ValueError(
"The `spark.task.resource.gpu.amount` is required for training"
" on GPU."
) )