Clean up MGPU C++ tests (#9430)

Rong Ou authored 2023-08-01 23:31:18 -07:00 (committed by GitHub)
parent a9da2e244a
commit c2b85ab68a
28 changed files with 200 additions and 194 deletions


@@ -34,24 +34,12 @@
 #define DeclareUnifiedTest(name) name
 #endif

-#if defined(__CUDACC__)
-#define GPUIDX 0
-#else
-#define GPUIDX -1
-#endif
-
 #if defined(__CUDACC__)
 #define DeclareUnifiedDistributedTest(name) MGPU ## name
 #else
 #define DeclareUnifiedDistributedTest(name) name
 #endif

-#if defined(__CUDACC__)
-#define WORLD_SIZE_FOR_TEST (xgboost::common::AllVisibleGPUs())
-#else
-#define WORLD_SIZE_FOR_TEST (3)
-#endif
-
 namespace xgboost {
 class ObjFunction;
 class Metric;
@@ -522,11 +510,15 @@ inline LearnerModelParam MakeMP(bst_feature_t n_features, float base_score, uint
 inline std::int32_t AllThreadsForTest() { return Context{}.Threads(); }
-template <typename Function, typename... Args>
+template <bool use_nccl = false, typename Function, typename... Args>
 void RunWithInMemoryCommunicator(int32_t world_size, Function&& function, Args&&... args) {
   auto run = [&](auto rank) {
     Json config{JsonObject()};
-    config["xgboost_communicator"] = String("in-memory");
+    if constexpr (use_nccl) {
+      config["xgboost_communicator"] = String("in-memory-nccl");
+    } else {
+      config["xgboost_communicator"] = String("in-memory");
+    }
     config["in_memory_world_size"] = world_size;
     config["in_memory_rank"] = rank;
     xgboost::collective::Init(config);
@@ -548,15 +540,44 @@ void RunWithInMemoryCommunicator(int32_t world_size, Function&& function, Args&&
 #endif
 }

-class DeclareUnifiedDistributedTest(MetricTest) : public ::testing::Test {
+inline int GetGPUId() {
+#if defined(__CUDACC__)
+  auto const n_gpus = common::AllVisibleGPUs();
+  return n_gpus == 1 ? 0 : collective::GetRank();
+#else
+  return -1;
+#endif
+}
+
+class BaseMGPUTest : public ::testing::Test {
  protected:
   int world_size_;
+  bool use_nccl_{false};

   void SetUp() override {
-    world_size_ = WORLD_SIZE_FOR_TEST;
-    if (world_size_ <= 1) {
-      GTEST_SKIP() << "Skipping MGPU test with # GPUs = " << world_size_;
+    auto const n_gpus = common::AllVisibleGPUs();
+    if (n_gpus <= 1) {
+      // Use a single GPU to simulate distributed environment.
+      world_size_ = 3;
+      // NCCL doesn't like sharing a single GPU, so we use the adapter instead.
+      use_nccl_ = false;
+    } else {
+      // Use multiple GPUs for real.
+      world_size_ = n_gpus;
+      use_nccl_ = true;
     }
   }
+
+  template <typename Function, typename... Args>
+  void DoTest(Function&& function, Args&&... args) {
+    if (use_nccl_) {
+      RunWithInMemoryCommunicator<true>(world_size_, function, args...);
+    } else {
+      RunWithInMemoryCommunicator<false>(world_size_, function, args...);
+    }
+  }
 };
+
+class DeclareUnifiedDistributedTest(MetricTest) : public BaseMGPUTest {};
 } // namespace xgboost
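
For orientation, a minimal usage sketch (not part of this diff) of the new fixture. It assumes gtest and the test helpers above are in scope and that RunWithInMemoryCommunicator invokes the callable with the forwarded arguments; the test name and the lambda body are hypothetical.

// Hypothetical usage sketch. DeclareUnifiedDistributedTest(MetricTest) expands to
// MGPUMetricTest under __CUDACC__, so the test joins the MGPU suite there.
// DoTest() forwards the callable to RunWithInMemoryCommunicator, selecting the
// "in-memory-nccl" communicator when several GPUs are visible and the plain
// "in-memory" adapter when a single GPU has to simulate the cluster.
TEST_F(DeclareUnifiedDistributedTest(MetricTest), ExampleAllVisible) {
  DoTest([] {
    // Runs once per rank with the collective communicator initialized;
    // GetGPUId() yields 0 when one GPU is shared by every rank, or the
    // rank's own device index when each rank gets a real GPU.
    auto device = GetGPUId();
    (void)device;
  });
}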