Clean up MGPU C++ tests (#9430)
This commit is contained in:
@@ -34,24 +34,12 @@
|
||||
#define DeclareUnifiedTest(name) name
|
||||
#endif
|
||||
|
||||
#if defined(__CUDACC__)
|
||||
#define GPUIDX 0
|
||||
#else
|
||||
#define GPUIDX -1
|
||||
#endif
|
||||
|
||||
#if defined(__CUDACC__)
|
||||
#define DeclareUnifiedDistributedTest(name) MGPU ## name
|
||||
#else
|
||||
#define DeclareUnifiedDistributedTest(name) name
|
||||
#endif
|
||||
|
||||
#if defined(__CUDACC__)
|
||||
#define WORLD_SIZE_FOR_TEST (xgboost::common::AllVisibleGPUs())
|
||||
#else
|
||||
#define WORLD_SIZE_FOR_TEST (3)
|
||||
#endif
|
||||
|
||||
namespace xgboost {
|
||||
class ObjFunction;
|
||||
class Metric;
|
||||
@@ -522,11 +510,15 @@ inline LearnerModelParam MakeMP(bst_feature_t n_features, float base_score, uint
|
||||
|
||||
inline std::int32_t AllThreadsForTest() { return Context{}.Threads(); }
|
||||
|
||||
template <typename Function, typename... Args>
|
||||
template <bool use_nccl = false, typename Function, typename... Args>
|
||||
void RunWithInMemoryCommunicator(int32_t world_size, Function&& function, Args&&... args) {
|
||||
auto run = [&](auto rank) {
|
||||
Json config{JsonObject()};
|
||||
config["xgboost_communicator"] = String("in-memory");
|
||||
if constexpr (use_nccl) {
|
||||
config["xgboost_communicator"] = String("in-memory-nccl");
|
||||
} else {
|
||||
config["xgboost_communicator"] = String("in-memory");
|
||||
}
|
||||
config["in_memory_world_size"] = world_size;
|
||||
config["in_memory_rank"] = rank;
|
||||
xgboost::collective::Init(config);
|
||||
@@ -548,15 +540,44 @@ void RunWithInMemoryCommunicator(int32_t world_size, Function&& function, Args&&
|
||||
#endif
|
||||
}
|
||||
|
||||
class DeclareUnifiedDistributedTest(MetricTest) : public ::testing::Test {
|
||||
inline int GetGPUId() {
|
||||
#if defined(__CUDACC__)
|
||||
auto const n_gpus = common::AllVisibleGPUs();
|
||||
return n_gpus == 1 ? 0 : collective::GetRank();
|
||||
#else
|
||||
return -1;
|
||||
#endif
|
||||
}
|
||||
|
||||
class BaseMGPUTest : public ::testing::Test {
|
||||
protected:
|
||||
int world_size_;
|
||||
bool use_nccl_{false};
|
||||
|
||||
void SetUp() override {
|
||||
world_size_ = WORLD_SIZE_FOR_TEST;
|
||||
if (world_size_ <= 1) {
|
||||
GTEST_SKIP() << "Skipping MGPU test with # GPUs = " << world_size_;
|
||||
auto const n_gpus = common::AllVisibleGPUs();
|
||||
if (n_gpus <= 1) {
|
||||
// Use a single GPU to simulate distributed environment.
|
||||
world_size_ = 3;
|
||||
// NCCL doesn't like sharing a single GPU, so we use the adapter instead.
|
||||
use_nccl_ = false;
|
||||
} else {
|
||||
// Use multiple GPUs for real.
|
||||
world_size_ = n_gpus;
|
||||
use_nccl_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Function, typename... Args>
|
||||
void DoTest(Function&& function, Args&&... args) {
|
||||
if (use_nccl_) {
|
||||
RunWithInMemoryCommunicator<true>(world_size_, function, args...);
|
||||
} else {
|
||||
RunWithInMemoryCommunicator<false>(world_size_, function, args...);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class DeclareUnifiedDistributedTest(MetricTest) : public BaseMGPUTest{};
|
||||
|
||||
} // namespace xgboost
|
||||
|
||||
Reference in New Issue
Block a user