From 748d516c50f0eadf3d9328612f9d98ca61378dd1 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Thu, 13 Oct 2022 21:03:45 +0800
Subject: [PATCH] [pyspark] Enable running GPU tests on variable number of GPUs. (#8335)

---
Reviewer note (this area is ignored by git apply/am): this copy was rebuilt
from a whitespace-collapsed paste. Indentation and blank lines were inferred
from the hunk line counts and should be confirmed against the commit. The
author e-mail was absent from the paste and is still missing, so `git am` may
reject the header; `git apply` does not read it.

 .../python-gpu/test_gpu_spark/discover_gpu.sh | 15 ++++++++++++++-
 .../test_gpu_spark/test_gpu_spark.py          | 18 ++++++++++++++++--
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/tests/python-gpu/test_gpu_spark/discover_gpu.sh b/tests/python-gpu/test_gpu_spark/discover_gpu.sh
index 42dd05517..fc2c71741 100755
--- a/tests/python-gpu/test_gpu_spark/discover_gpu.sh
+++ b/tests/python-gpu/test_gpu_spark/discover_gpu.sh
@@ -1,3 +1,16 @@
 #!/bin/bash
 
-echo "{\"name\":\"gpu\",\"addresses\":[\"0\",\"1\",\"2\",\"3\"]}"
+# This script is only made for running XGBoost tests on official CI where we have access
+# to a 4-GPU cluster, the discovery command is for running tests on a local machine where
+# the driver and the GPU worker might be the same machine for the ease of development.
+
+if ! command -v nvidia-smi &> /dev/null
+then
+    # default to 4 GPUs
+    echo "{\"name\":\"gpu\",\"addresses\":[\"0\",\"1\",\"2\",\"3\"]}"
+    exit
+else
+    # https://github.com/apache/spark/blob/master/examples/src/main/scripts/getGpusResources.sh
+    ADDRS=`nvidia-smi --query-gpu=index --format=csv,noheader | sed -e ':a' -e 'N' -e'$!ba' -e 's/\n/","/g'`
+    echo {\"name\": \"gpu\", \"addresses\":[\"$ADDRS\"]}
+fi
diff --git a/tests/python-gpu/test_gpu_spark/test_gpu_spark.py b/tests/python-gpu/test_gpu_spark/test_gpu_spark.py
index 6836718fe..bcae96dc5 100644
--- a/tests/python-gpu/test_gpu_spark/test_gpu_spark.py
+++ b/tests/python-gpu/test_gpu_spark/test_gpu_spark.py
@@ -1,4 +1,6 @@
+import json
 import logging
+import subprocess
 import sys
 
 import pytest
@@ -18,8 +20,20 @@ from pyspark.sql import SparkSession
 from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
 
 gpu_discovery_script_path = "tests/python-gpu/test_gpu_spark/discover_gpu.sh"
-executor_gpu_amount = 4
-executor_cores = 4
+
+
+def get_devices():
+    """This works only if driver is the same machine of worker."""
+    completed = subprocess.run(gpu_discovery_script_path, stdout=subprocess.PIPE)
+    assert completed.returncode == 0, "Failed to execute discovery script."
+    msg = completed.stdout.decode("utf-8")
+    result = json.loads(msg)
+    addresses = result["addresses"]
+    return addresses
+
+
+executor_gpu_amount = len(get_devices())
+executor_cores = executor_gpu_amount
 num_workers = executor_gpu_amount
 
 