Enable distributed GPU training over Rabit (#7930)

This commit is contained in:
Rong Ou
2022-05-30 13:09:45 -07:00
committed by GitHub
parent 6275cdc486
commit 80339c3427
9 changed files with 458 additions and 129 deletions

View File

@@ -339,7 +339,6 @@ TEST(GPUQuantile, MultiMerge) {
TEST(GPUQuantile, AllReduceBasic) {
// This test is supposed to be run by a Python test that sets up the environment.
std::string msg {"Skipping AllReduce test"};
#if defined(__linux__) && defined(XGBOOST_USE_NCCL)
auto n_gpus = AllVisibleGPUs();
InitRabitContext(msg, n_gpus);
auto world = rabit::GetWorldSize();
@@ -420,15 +419,10 @@ TEST(GPUQuantile, AllReduceBasic) {
}
});
rabit::Finalize();
#else
LOG(WARNING) << msg;
return;
#endif // defined(__linux__) && defined(XGBOOST_USE_NCCL)
}
TEST(GPUQuantile, SameOnAllWorkers) {
std::string msg {"Skipping SameOnAllWorkers test"};
#if defined(__linux__) && defined(XGBOOST_USE_NCCL)
auto n_gpus = AllVisibleGPUs();
InitRabitContext(msg, n_gpus);
auto world = rabit::GetWorldSize();
@@ -495,10 +489,6 @@ TEST(GPUQuantile, SameOnAllWorkers) {
offset += size_as_float;
}
});
#else
LOG(WARNING) << msg;
return;
#endif // defined(__linux__) && defined(XGBOOST_USE_NCCL)
}
TEST(GPUQuantile, Push) {

View File

@@ -4,7 +4,6 @@
*/
#include "test_transform_range.cc"
#if defined(XGBOOST_USE_NCCL)
namespace xgboost {
namespace common {
@@ -15,7 +14,7 @@ TEST(Transform, MGPU_SpecifiedGpuId) { // NOLINT
}
// Use 1 GPU, Numbering of GPU starts from 1
auto device = 1;
const size_t size {256};
auto const size {256};
std::vector<bst_float> h_in(size);
std::vector<bst_float> h_out(size);
std::iota(h_in.begin(), h_in.end(), 0);
@@ -34,4 +33,3 @@ TEST(Transform, MGPU_SpecifiedGpuId) { // NOLINT
} // namespace common
} // namespace xgboost
#endif