Reduce time for some multi-gpu tests (#8288)

* Faster dask tests
* Reuse AllReducer objects in tests.
* Faster boost from prediction tests.
* Use rmm dask fixture.
* Speed up dask demo.
* mypy
* Format with black.
* mypy
* Clang-tidy

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
2022-10-04 12:49:33 +02:00
parent ca0547bb65
commit d686bf52a6
8 changed files with 337 additions and 336 deletions
--- a/tests/cpp/common/test_quantile.cu
+++ b/tests/cpp/common/test_quantile.cu
@@ -349,6 +349,9 @@ TEST(GPUQuantile, AllReduceBasic) {
    return;
  }

+  auto reducer = std::make_shared<dh::AllReducer>();
+  reducer->Init(0);
+
  constexpr size_t kRows = 1000, kCols = 100;
  RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
    // Set up single node version;
@@ -378,12 +381,12 @@ TEST(GPUQuantile, AllReduceBasic) {
    }
    sketch_on_single_node.Unique();
    TestQuantileElemRank(0, sketch_on_single_node.Data(),
-                         sketch_on_single_node.ColumnsPtr());
+                         sketch_on_single_node.ColumnsPtr(), true);

    // Set up distributed version.  We rely on using rank as seed to generate
    // the exact same copy of data.
    auto rank = rabit::GetRank();
-    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0);
+    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0, reducer);
    HostDeviceVector<float> storage;
    std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
                                    .Device(0)
@@ -402,7 +405,7 @@ TEST(GPUQuantile, AllReduceBasic) {
              sketch_on_single_node.Data().size());

    TestQuantileElemRank(0, sketch_distributed.Data(),
-                         sketch_distributed.ColumnsPtr());
+                         sketch_distributed.ColumnsPtr(), true);

    std::vector<SketchEntry> single_node_data(
        sketch_on_single_node.Data().size());
@@ -432,13 +435,15 @@ TEST(GPUQuantile, SameOnAllWorkers) {
  } else {
    return;
  }
+  auto reducer = std::make_shared<dh::AllReducer>();
+  reducer->Init(0);

  constexpr size_t kRows = 1000, kCols = 100;
  RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
                                 MetaInfo const &info) {
    auto rank = rabit::GetRank();
    HostDeviceVector<FeatureType> ft;
-    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0);
+    SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0, reducer);
    HostDeviceVector<float> storage;
    std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
                                    .Device(0)
@@ -450,7 +455,7 @@ TEST(GPUQuantile, SameOnAllWorkers) {
                        &sketch_distributed);
    sketch_distributed.AllReduce();
    sketch_distributed.Unique();
-    TestQuantileElemRank(0, sketch_distributed.Data(), sketch_distributed.ColumnsPtr());
+    TestQuantileElemRank(0, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true);

    // Test for all workers having the same sketch.
    size_t n_data = sketch_distributed.Data().size();
@@ -467,12 +472,9 @@ TEST(GPUQuantile, SameOnAllWorkers) {
    thrust::copy(thrust::device, local_data.data(),
                 local_data.data() + local_data.size(),
                 all_workers.begin() + local_data.size() * rank);
-    dh::AllReducer reducer;
-    reducer.Init(0);
-
-    reducer.AllReduceSum(all_workers.data().get(), all_workers.data().get(),
+    reducer->AllReduceSum(all_workers.data().get(), all_workers.data().get(),
                         all_workers.size());
-    reducer.Synchronize();
+    reducer->Synchronize();

    auto base_line = dh::ToSpan(all_workers).subspan(0, size_as_float);
    std::vector<float> h_base_line(base_line.size());
--- a/tests/cpp/common/test_quantile.h
+++ b/tests/cpp/common/test_quantile.h
@@ -37,12 +37,12 @@ inline void InitRabitContext(std::string msg, int32_t n_workers) {
 }

 template <typename Fn> void RunWithSeedsAndBins(size_t rows, Fn fn) {
-  std::vector<int32_t> seeds(4);
+  std::vector<int32_t> seeds(2);
  SimpleLCG lcg;
  SimpleRealUniformDistribution<float> dist(3, 1000);
  std::generate(seeds.begin(), seeds.end(), [&](){ return dist(&lcg); });

-  std::vector<size_t> bins(8);
+  std::vector<size_t> bins(2);
  for (size_t i = 0; i < bins.size() - 1; ++i) {
    bins[i] = i * 35 + 2;
  }