Fix quantile tests running on multi-gpus (#8775)

* Fix quantile tests running on multi-gpus

* Run some gtests with multiple GPUs

* Fix mgpu test naming

* Instruct NCCL to print extra logs

* Allocate extra space in /dev/shm to enable NCCL

* Use gtest_skip to skip mgpu tests

---------

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
Rong Ou, 2023-02-12 17:00:26 -08:00 (committed by GitHub)
parent 225b3158f6
commit ed91e775ec
3 changed files with 50 additions and 32 deletions
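For context on the "Use gtest_skip" bullet above: GoogleTest's GTEST_SKIP() macro marks a test as skipped at runtime, so on single-GPU machines the multi-GPU tests report SKIPPED instead of silently passing via an early return (the pattern previously used, visible in the removed lines of the test_quantile.cu hunks below). The sketch below is a minimal self-contained illustration of that guard plus the MGPU naming convention the new CI step filters on; VisibleGpus() is a hypothetical stand-in for xgboost's device-count query, not code from this commit.

#include <gtest/gtest.h>

// Hypothetical stand-in for xgboost's AllVisibleGPUs(); a real build would
// query the device count (e.g. via cudaGetDeviceCount) instead.
int VisibleGpus() { return 1; }

// Multi-GPU tests carry an "MGPU" prefix so a CI step can select them with
// --gtest_filter=*MGPU*, and they skip themselves cleanly on single-GPU boxes.
TEST(Example, MGPUSomething) {
  int const n_gpus = VisibleGpus();
  if (n_gpus <= 1) {
    GTEST_SKIP() << "Requires more than one GPU, found " << n_gpus;
  }
  // ... multi-GPU assertions would go here ...
}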

tests/buildkite/pipeline-mgpu.yml

@@ -36,6 +36,11 @@ steps:
       queue: linux-amd64-mgpu
   - wait
   #### -------- TEST --------
+  - label: ":console: Run Google Tests"
+    command: "tests/buildkite/test-cpp-mgpu.sh"
+    key: test-cpp-mgpu
+    agents:
+      queue: linux-amd64-mgpu
   - label: ":console: Test Python package, 4 GPUs"
     command: "tests/buildkite/test-python-gpu.sh mgpu"
     key: test-python-mgpu

tests/buildkite/test-cpp-mgpu.sh (new file)

@@ -0,0 +1,16 @@
+#!/bin/bash
+
+set -euo pipefail
+
+source tests/buildkite/conftest.sh
+
+# Allocate extra space in /dev/shm to enable NCCL
+export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g'
+
+echo "--- Run Google Tests with CUDA, using multiple GPUs"
+buildkite-agent artifact download "build/testxgboost" . --step build-cuda
+chmod +x build/testxgboost
+tests/ci_build/ci_build.sh gpu nvidia-docker \
+  --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
+  --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
+  build/testxgboost --gtest_filter=*MGPU*

tests/cpp/common/test_quantile.cu

@@ -341,30 +341,26 @@ TEST(GPUQuantile, MultiMerge) {
 namespace {
 void TestAllReduceBasic(int32_t n_gpus) {
   auto const world = collective::GetWorldSize();
-  if (world != 1) {
-    ASSERT_EQ(world, n_gpus);
-  } else {
-    return;
-  }
 
   constexpr size_t kRows = 1000, kCols = 100;
   RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
+    auto const device = collective::GetRank();
     // Set up single node version;
-    HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, 0);
+    HostDeviceVector<FeatureType> ft({}, device);
+    SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, device);
     size_t intermediate_num_cuts = std::min(
         kRows * world, static_cast<size_t>(n_bins * WQSketch::kFactor));
     std::vector<SketchContainer> containers;
     for (auto rank = 0; rank < world; ++rank) {
-      HostDeviceVector<float> storage;
+      HostDeviceVector<float> storage({}, device);
       std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
-                                      .Device(0)
+                                      .Device(device)
                                       .Seed(rank + seed)
                                       .GenerateArrayInterface(&storage);
       data::CupyAdapter adapter(interface_str);
-      HostDeviceVector<FeatureType> ft;
-      containers.emplace_back(ft, n_bins, kCols, kRows, 0);
+      HostDeviceVector<FeatureType> ft({}, device);
+      containers.emplace_back(ft, n_bins, kCols, kRows, device);
       AdapterDeviceSketch(adapter.Value(), n_bins, info,
                           std::numeric_limits<float>::quiet_NaN(),
                           &containers.back());
@@ -375,16 +371,16 @@ void TestAllReduceBasic(int32_t n_gpus) {
       sketch_on_single_node.FixError();
     }
     sketch_on_single_node.Unique();
-    TestQuantileElemRank(0, sketch_on_single_node.Data(),
+    TestQuantileElemRank(device, sketch_on_single_node.Data(),
                          sketch_on_single_node.ColumnsPtr(), true);
 
     // Set up distributed version.  We rely on using rank as seed to generate
     // the exact same copy of data.
     auto rank = collective::GetRank();
-    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0);
-    HostDeviceVector<float> storage;
+    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device);
+    HostDeviceVector<float> storage({}, device);
     std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
-                                    .Device(0)
+                                    .Device(device)
                                     .Seed(rank + seed)
                                     .GenerateArrayInterface(&storage);
     data::CupyAdapter adapter(interface_str);
@@ -399,7 +395,7 @@ void TestAllReduceBasic(int32_t n_gpus) {
     ASSERT_EQ(sketch_distributed.Data().size(),
               sketch_on_single_node.Data().size());
-    TestQuantileElemRank(0, sketch_distributed.Data(),
+    TestQuantileElemRank(device, sketch_distributed.Data(),
                          sketch_distributed.ColumnsPtr(), true);
 
     std::vector<SketchEntry> single_node_data(
@@ -420,29 +416,27 @@ void TestAllReduceBasic(int32_t n_gpus) {
   }
 }
 }  // anonymous namespace
 
-TEST(GPUQuantile, AllReduceBasic) {
+TEST(GPUQuantile, MGPUAllReduceBasic) {
   auto const n_gpus = AllVisibleGPUs();
+  if (n_gpus <= 1) {
+    GTEST_SKIP() << "Skipping MGPUAllReduceBasic test with # GPUs = " << n_gpus;
+  }
   RunWithInMemoryCommunicator(n_gpus, TestAllReduceBasic, n_gpus);
 }
 
 namespace {
 void TestSameOnAllWorkers(int32_t n_gpus) {
   auto world = collective::GetWorldSize();
-  if (world != 1) {
-    ASSERT_EQ(world, n_gpus);
-  } else {
-    return;
-  }
   constexpr size_t kRows = 1000, kCols = 100;
   RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
                                  MetaInfo const &info) {
-    auto rank = collective::GetRank();
-    HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0);
-    HostDeviceVector<float> storage;
+    auto const rank = collective::GetRank();
+    auto const device = rank;
+    HostDeviceVector<FeatureType> ft({}, device);
+    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device);
+    HostDeviceVector<float> storage({}, device);
     std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
-                                    .Device(0)
+                                    .Device(device)
                                     .Seed(rank + seed)
                                     .GenerateArrayInterface(&storage);
     data::CupyAdapter adapter(interface_str);
@@ -451,7 +445,7 @@ void TestSameOnAllWorkers(int32_t n_gpus) {
                         &sketch_distributed);
     sketch_distributed.AllReduce();
     sketch_distributed.Unique();
-    TestQuantileElemRank(0, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true);
+    TestQuantileElemRank(device, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true);
 
     // Test for all workers having the same sketch.
     size_t n_data = sketch_distributed.Data().size();
@@ -468,7 +462,7 @@ void TestSameOnAllWorkers(int32_t n_gpus) {
     thrust::copy(thrust::device, local_data.data(),
                  local_data.data() + local_data.size(),
                  all_workers.begin() + local_data.size() * rank);
-    collective::DeviceCommunicator* communicator = collective::Communicator::GetDevice(0);
+    collective::DeviceCommunicator* communicator = collective::Communicator::GetDevice(device);
 
     communicator->AllReduceSum(all_workers.data().get(), all_workers.size());
     communicator->Synchronize();
@@ -492,8 +486,11 @@ void TestSameOnAllWorkers(int32_t n_gpus) {
   }
 }
 }  // anonymous namespace
 
-TEST(GPUQuantile, SameOnAllWorkers) {
+TEST(GPUQuantile, MGPUSameOnAllWorkers) {
   auto const n_gpus = AllVisibleGPUs();
+  if (n_gpus <= 1) {
+    GTEST_SKIP() << "Skipping MGPUSameOnAllWorkers test with # GPUs = " << n_gpus;
+  }
   RunWithInMemoryCommunicator(n_gpus, TestSameOnAllWorkers, n_gpus);
 }
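The recurring change in the test_quantile.cu hunks is replacing the hard-coded device 0 with device = rank (from collective::GetRank()), so that each in-memory worker drives its own GPU rather than all workers piling onto GPU 0. Below is a minimal standalone sketch of that rank-to-device mapping using plain CUDA and std::thread instead of xgboost's RunWithInMemoryCommunicator helper; the Worker function and buffer size are illustrative assumptions, not code from this commit.

#include <cuda_runtime.h>

#include <iostream>
#include <thread>
#include <vector>

// Each "worker" binds to the GPU matching its rank, mirroring the
// device = rank mapping adopted in the diff above. A real collective
// communicator would also be set up per rank; that part is omitted here.
void Worker(int rank) {
  cudaSetDevice(rank);        // subsequent allocations/kernels target GPU `rank`
  void* buf = nullptr;
  cudaMalloc(&buf, 1 << 20);  // per-rank scratch memory on its own device
  // ... per-rank sketching / all-reduce would happen here ...
  cudaFree(buf);
}

int main() {
  int n_gpus = 0;
  cudaGetDeviceCount(&n_gpus);
  std::vector<std::thread> workers;
  for (int rank = 0; rank < n_gpus; ++rank) {
    workers.emplace_back(Worker, rank);
  }
  for (auto& t : workers) {
    t.join();
  }
  std::cout << "ran " << n_gpus << " single-GPU workers\n";
  return 0;
}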