From 748d516c50f0eadf3d9328612f9d98ca61378dd1 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 13 Oct 2022 21:03:45 +0800
Subject: [PATCH] [pyspark] Enable running GPU tests on variable number of
 GPUs. (#8335)

---
 .../python-gpu/test_gpu_spark/discover_gpu.sh  | 15 ++++++++++++++-
 .../test_gpu_spark/test_gpu_spark.py           | 18 ++++++++++++++++--
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/tests/python-gpu/test_gpu_spark/discover_gpu.sh b/tests/python-gpu/test_gpu_spark/discover_gpu.sh
index 42dd05517..fc2c71741 100755
--- a/tests/python-gpu/test_gpu_spark/discover_gpu.sh
+++ b/tests/python-gpu/test_gpu_spark/discover_gpu.sh
@@ -1,3 +1,16 @@
 #!/bin/bash
 
-echo "{\"name\":\"gpu\",\"addresses\":[\"0\",\"1\",\"2\",\"3\"]}"
+# This script is only made for running XGBoost tests on official CI, where we have access
+# to a 4-GPU cluster. The discovery command is for running tests on a local machine, where
+# the driver and the GPU worker might be the same machine for ease of development.
+
+if ! command -v nvidia-smi &> /dev/null
+then
+    # default to 4 GPUs
+    echo "{\"name\":\"gpu\",\"addresses\":[\"0\",\"1\",\"2\",\"3\"]}"
+    exit
+else
+    # https://github.com/apache/spark/blob/master/examples/src/main/scripts/getGpusResources.sh
+    ADDRS=`nvidia-smi --query-gpu=index --format=csv,noheader | sed -e ':a' -e 'N' -e'$!ba' -e 's/\n/","/g'`
+    echo {\"name\": \"gpu\", \"addresses\":[\"$ADDRS\"]}
+fi
diff --git a/tests/python-gpu/test_gpu_spark/test_gpu_spark.py b/tests/python-gpu/test_gpu_spark/test_gpu_spark.py
index 6836718fe..bcae96dc5 100644
--- a/tests/python-gpu/test_gpu_spark/test_gpu_spark.py
+++ b/tests/python-gpu/test_gpu_spark/test_gpu_spark.py
@@ -1,4 +1,6 @@
+import json
 import logging
+import subprocess
 import sys
 
 import pytest
@@ -18,8 +20,20 @@ from pyspark.sql import SparkSession
 from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
 
 gpu_discovery_script_path = "tests/python-gpu/test_gpu_spark/discover_gpu.sh"
-executor_gpu_amount = 4
-executor_cores = 4
+
+
+def get_devices():
+    """This works only if the driver is the same machine as the worker."""
+    completed = subprocess.run(gpu_discovery_script_path, stdout=subprocess.PIPE)
+    assert completed.returncode == 0, "Failed to execute discovery script."
+    msg = completed.stdout.decode("utf-8")
+    result = json.loads(msg)
+    addresses = result["addresses"]
+    return addresses
+
+
+executor_gpu_amount = len(get_devices())
+executor_cores = executor_gpu_amount
 num_workers = executor_gpu_amount