Reduce time for some multi-gpu tests (#8288)

* Faster dask tests

* Reuse AllReducer objects in tests.

* Faster boost from prediction tests.

* Use rmm dask fixture.

* Speed up dask demo.

* mypy

* Format with black.

* mypy

* Clang-tidy

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
This commit is contained in:
Rory Mitchell
2022-10-04 12:49:33 +02:00
committed by GitHub
parent ca0547bb65
commit d686bf52a6
8 changed files with 337 additions and 336 deletions

View File

@@ -349,6 +349,9 @@ TEST(GPUQuantile, AllReduceBasic) {
return;
}
auto reducer = std::make_shared<dh::AllReducer>();
reducer->Init(0);
constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
// Set up single node version.
@@ -378,12 +381,12 @@ TEST(GPUQuantile, AllReduceBasic) {
}
sketch_on_single_node.Unique();
TestQuantileElemRank(0, sketch_on_single_node.Data(),
sketch_on_single_node.ColumnsPtr());
sketch_on_single_node.ColumnsPtr(), true);
// Set up distributed version. We rely on using rank as seed to generate
// the exact same copy of data.
auto rank = rabit::GetRank();
SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0);
SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0, reducer);
HostDeviceVector<float> storage;
std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
.Device(0)
@@ -402,7 +405,7 @@ TEST(GPUQuantile, AllReduceBasic) {
sketch_on_single_node.Data().size());
TestQuantileElemRank(0, sketch_distributed.Data(),
sketch_distributed.ColumnsPtr());
sketch_distributed.ColumnsPtr(), true);
std::vector<SketchEntry> single_node_data(
sketch_on_single_node.Data().size());
@@ -432,13 +435,15 @@ TEST(GPUQuantile, SameOnAllWorkers) {
} else {
return;
}
auto reducer = std::make_shared<dh::AllReducer>();
reducer->Init(0);
constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
MetaInfo const &info) {
auto rank = rabit::GetRank();
HostDeviceVector<FeatureType> ft;
SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0);
SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0, reducer);
HostDeviceVector<float> storage;
std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
.Device(0)
@@ -450,7 +455,7 @@ TEST(GPUQuantile, SameOnAllWorkers) {
&sketch_distributed);
sketch_distributed.AllReduce();
sketch_distributed.Unique();
TestQuantileElemRank(0, sketch_distributed.Data(), sketch_distributed.ColumnsPtr());
TestQuantileElemRank(0, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true);
// Test for all workers having the same sketch.
size_t n_data = sketch_distributed.Data().size();
@@ -467,12 +472,9 @@ TEST(GPUQuantile, SameOnAllWorkers) {
thrust::copy(thrust::device, local_data.data(),
local_data.data() + local_data.size(),
all_workers.begin() + local_data.size() * rank);
dh::AllReducer reducer;
reducer.Init(0);
reducer.AllReduceSum(all_workers.data().get(), all_workers.data().get(),
reducer->AllReduceSum(all_workers.data().get(), all_workers.data().get(),
all_workers.size());
reducer.Synchronize();
reducer->Synchronize();
auto base_line = dh::ToSpan(all_workers).subspan(0, size_as_float);
std::vector<float> h_base_line(base_line.size());

View File

@@ -37,12 +37,12 @@ inline void InitRabitContext(std::string msg, int32_t n_workers) {
}
template <typename Fn> void RunWithSeedsAndBins(size_t rows, Fn fn) {
std::vector<int32_t> seeds(4);
std::vector<int32_t> seeds(2);
SimpleLCG lcg;
SimpleRealUniformDistribution<float> dist(3, 1000);
std::generate(seeds.begin(), seeds.end(), [&](){ return dist(&lcg); });
std::vector<size_t> bins(8);
std::vector<size_t> bins(2);
for (size_t i = 0; i < bins.size() - 1; ++i) {
bins[i] = i * 35 + 2;
}