Clean up MGPU C++ tests (#9430)

Rong Ou authored 2023-08-01 23:31:18 -07:00 (committed by GitHub)
parent a9da2e244a
commit c2b85ab68a
28 changed files with 200 additions and 194 deletions


@@ -34,24 +34,12 @@
 #define DeclareUnifiedTest(name) name
 #endif

-#if defined(__CUDACC__)
-#define GPUIDX 0
-#else
-#define GPUIDX -1
-#endif
-
 #if defined(__CUDACC__)
 #define DeclareUnifiedDistributedTest(name) MGPU ## name
 #else
 #define DeclareUnifiedDistributedTest(name) name
 #endif

-#if defined(__CUDACC__)
-#define WORLD_SIZE_FOR_TEST (xgboost::common::AllVisibleGPUs())
-#else
-#define WORLD_SIZE_FOR_TEST (3)
-#endif
-
 namespace xgboost {
 class ObjFunction;
 class Metric;
@@ -522,11 +510,15 @@ inline LearnerModelParam MakeMP(bst_feature_t n_features, float base_score, uint
 inline std::int32_t AllThreadsForTest() { return Context{}.Threads(); }
-template <typename Function, typename... Args>
+template <bool use_nccl = false, typename Function, typename... Args>
 void RunWithInMemoryCommunicator(int32_t world_size, Function&& function, Args&&... args) {
   auto run = [&](auto rank) {
     Json config{JsonObject()};
-    config["xgboost_communicator"] = String("in-memory");
+    if constexpr (use_nccl) {
+      config["xgboost_communicator"] = String("in-memory-nccl");
+    } else {
+      config["xgboost_communicator"] = String("in-memory");
+    }
     config["in_memory_world_size"] = world_size;
     config["in_memory_rank"] = rank;
     xgboost::collective::Init(config);
@@ -548,15 +540,44 @@ void RunWithInMemoryCommunicator(int32_t world_size, Function&& function, Args&&
 #endif
 }

-class DeclareUnifiedDistributedTest(MetricTest) : public ::testing::Test {
+inline int GetGPUId() {
+#if defined(__CUDACC__)
+  auto const n_gpus = common::AllVisibleGPUs();
+  return n_gpus == 1 ? 0 : collective::GetRank();
+#else
+  return -1;
+#endif
+}
+
+class BaseMGPUTest : public ::testing::Test {
  protected:
   int world_size_;
+  bool use_nccl_{false};

   void SetUp() override {
-    world_size_ = WORLD_SIZE_FOR_TEST;
-    if (world_size_ <= 1) {
-      GTEST_SKIP() << "Skipping MGPU test with # GPUs = " << world_size_;
+    auto const n_gpus = common::AllVisibleGPUs();
+    if (n_gpus <= 1) {
+      // Use a single GPU to simulate distributed environment.
+      world_size_ = 3;
+      // NCCL doesn't like sharing a single GPU, so we use the adapter instead.
+      use_nccl_ = false;
+    } else {
+      // Use multiple GPUs for real.
+      world_size_ = n_gpus;
+      use_nccl_ = true;
     }
   }
+
+  template <typename Function, typename... Args>
+  void DoTest(Function&& function, Args&&... args) {
+    if (use_nccl_) {
+      RunWithInMemoryCommunicator<true>(world_size_, function, args...);
+    } else {
+      RunWithInMemoryCommunicator<false>(world_size_, function, args...);
+    }
+  }
 };
+
+class DeclareUnifiedDistributedTest(MetricTest) : public BaseMGPUTest {};
 } // namespace xgboost
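
For orientation, a minimal usage sketch (not part of this diff) of the new fixture. It assumes gtest and the test helpers above are in scope and that RunWithInMemoryCommunicator invokes the callable with the forwarded arguments; the test name and the lambda body are hypothetical.

// Hypothetical usage sketch. DeclareUnifiedDistributedTest(MetricTest) expands to
// MGPUMetricTest under __CUDACC__, so the test joins the MGPU suite there.
// DoTest() forwards the callable to RunWithInMemoryCommunicator, selecting the
// "in-memory-nccl" communicator when several GPUs are visible and the plain
// "in-memory" adapter when a single GPU has to simulate the cluster.
TEST_F(DeclareUnifiedDistributedTest(MetricTest), ExampleAllVisible) {
  DoTest([] {
    // Runs once per rank with the collective communicator initialized;
    // GetGPUId() yields 0 when one GPU is shared by every rank, or the
    // rank's own device index when each rank gets a real GPU.
    auto device = GetGPUId();
    (void)device;
  });
}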