Common interface for collective communication (#8057)

* implement broadcast for federated communicator

* implement allreduce

* add communicator factory

* add device adapter

* add device communicator to factory

* add rabit communicator

* add rabit communicator to the factory

* add nccl device communicator

* add synchronize to device communicator

* add back print and getprocessorname

* add python wrapper and c api

* clean up types

* fix non-gpu build

* try to fix ci

* fix std::size_t

* portable string compare ignore case

* c style size_t

* fix lint errors

* cross platform setenv

* fix memory leak

* fix lint errors

* address review feedback

* add python test for rabit communicator

* fix failing gtest

* use json to configure communicators

* fix lint error

* get rid of factories

* fix cpu build

* fix include

* fix python import

* don't export collective.py yet

* skip collective communicator pytest on windows

* add review feedback

* update documentation

* remove mpi communicator type

* fix tests

* shutdown the communicator separately

Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu>
This commit is contained in:
Rong Ou
2022-09-12 15:21:12 -07:00
committed by GitHub
parent bc818316f2
commit a2686543a9
25 changed files with 1771 additions and 95 deletions

View File

@@ -0,0 +1,54 @@
/*!
* Copyright 2022 XGBoost contributors
*/
#include <dmlc/parameter.h>
#include <gtest/gtest.h>
#include "../../../src/collective/communicator.h"
namespace xgboost {
namespace collective {
// Checks how Communicator::GetTypeFromEnv() reacts to the value of the
// XGBOOST_COMMUNICATOR environment variable.
TEST(CommunicatorFactory, TypeFromEnv) {
  // Overwrites the environment variable under test with the given value.
  auto const set_communicator = [](char const* value) {
    dmlc::SetEnv<std::string>("XGBOOST_COMMUNICATOR", value);
  };

  // Before this test touches the variable the type is expected to be unknown.
  // NOTE(review): presumably relies on the variable being unset in the process
  // environment when the test starts -- confirm for the CI setup.
  EXPECT_EQ(CommunicatorType::kUnknown, Communicator::GetTypeFromEnv());

  set_communicator("rabit");
  EXPECT_EQ(CommunicatorType::kRabit, Communicator::GetTypeFromEnv());

  // A capitalised value is expected to resolve as well.
  set_communicator("Federated");
  EXPECT_EQ(CommunicatorType::kFederated, Communicator::GetTypeFromEnv());

  // An unrecognised value is expected to raise dmlc::Error.
  set_communicator("foo");
  EXPECT_THROW(Communicator::GetTypeFromEnv(), dmlc::Error);
  // NOTE(review): the variable is still "foo" when this test returns; nothing
  // in this test resets it.
}
// Checks Communicator::GetTypeFromConfig() when the communicator is named under
// the lower-case "xgboost_communicator" key of a JSON config.
TEST(CommunicatorFactory, TypeFromArgs) {
  char const* const key = "xgboost_communicator";
  Json config{JsonObject()};

  // An empty object carries no communicator entry, so the type is unknown.
  EXPECT_EQ(CommunicatorType::kUnknown, Communicator::GetTypeFromConfig(config));

  config[key] = String("rabit");
  EXPECT_EQ(CommunicatorType::kRabit, Communicator::GetTypeFromConfig(config));

  config[key] = String("federated");
  EXPECT_EQ(CommunicatorType::kFederated, Communicator::GetTypeFromConfig(config));

  // An unrecognised name is expected to raise dmlc::Error.
  config[key] = String("foo");
  EXPECT_THROW(Communicator::GetTypeFromConfig(config), dmlc::Error);
}
// Checks Communicator::GetTypeFromConfig() when the communicator is named under
// the upper-case "XGBOOST_COMMUNICATOR" key of a JSON config.
TEST(CommunicatorFactory, TypeFromArgsUpperCase) {
  char const* const key = "XGBOOST_COMMUNICATOR";
  Json config{JsonObject()};

  // An empty object carries no communicator entry, so the type is unknown.
  EXPECT_EQ(CommunicatorType::kUnknown, Communicator::GetTypeFromConfig(config));

  config[key] = String("rabit");
  EXPECT_EQ(CommunicatorType::kRabit, Communicator::GetTypeFromConfig(config));

  config[key] = String("federated");
  EXPECT_EQ(CommunicatorType::kFederated, Communicator::GetTypeFromConfig(config));

  // An unrecognised name is expected to raise dmlc::Error.
  config[key] = String("foo");
  EXPECT_THROW(Communicator::GetTypeFromConfig(config), dmlc::Error);
}
} // namespace collective
} // namespace xgboost

View File

@@ -0,0 +1,26 @@
/*!
* Copyright 2022 XGBoost contributors
*/
#ifdef XGBOOST_USE_NCCL
#include <gtest/gtest.h>
#include "../../../src/collective/nccl_device_communicator.cuh"
namespace xgboost {
namespace collective {
// Constructing an NcclDeviceCommunicator with -1 as its first argument is
// expected to raise dmlc::Error.
// NOTE(review): going by the test name, the first argument is presumably the
// device ordinal -- confirm against the constructor declaration.
TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidDeviceOrdinal) {
  // Wrapped in a lambda so the braced initialiser (which contains a comma) does
  // not have to be spelled inside the EXPECT_THROW macro argument list.
  auto const build_with_negative_ordinal = [] {
    NcclDeviceCommunicator comm{-1, nullptr};
  };
  EXPECT_THROW(build_with_negative_ordinal(), dmlc::Error);
}
// Constructing an NcclDeviceCommunicator with 0 and a null second argument is
// expected to raise dmlc::Error.
// NOTE(review): going by the test name, the second argument is presumably the
// underlying communicator pointer -- confirm against the constructor declaration.
TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidCommunicator) {
  // Wrapped in a lambda so the braced initialiser (which contains a comma) does
  // not have to be spelled inside the EXPECT_THROW macro argument list.
  auto const build_with_null_communicator = [] {
    NcclDeviceCommunicator comm{0, nullptr};
  };
  EXPECT_THROW(build_with_null_communicator(), dmlc::Error);
}
} // namespace collective
} // namespace xgboost
#endif

View File

@@ -0,0 +1,39 @@
/*!
* Copyright 2022 XGBoost contributors
*/
#include <gtest/gtest.h>
#include "../../../src/collective/rabit_communicator.h"
namespace xgboost {
namespace collective {
// Constructing a RabitCommunicator from {0, 0} is expected to raise dmlc::Error.
// NOTE(review): going by the test name, the first argument is presumably the
// world size -- confirm against the constructor declaration.
TEST(RabitCommunicatorSimpleTest, ThrowOnWorldSizeTooSmall) {
  // Wrapped in a lambda so the braced initialiser (which contains a comma) does
  // not have to be spelled inside the EXPECT_THROW macro argument list.
  auto const build_with_zero_world_size = [] {
    RabitCommunicator comm{0, 0};
  };
  EXPECT_THROW(build_with_zero_world_size(), dmlc::Error);
}
// Constructing a RabitCommunicator from {1, -1} is expected to raise dmlc::Error.
// NOTE(review): going by the test name, the second argument is presumably the
// rank -- confirm against the constructor declaration.
TEST(RabitCommunicatorSimpleTest, ThrowOnRankTooSmall) {
  // Wrapped in a lambda so the braced initialiser (which contains a comma) does
  // not have to be spelled inside the EXPECT_THROW macro argument list.
  auto const build_with_negative_rank = [] {
    RabitCommunicator comm{1, -1};
  };
  EXPECT_THROW(build_with_negative_rank(), dmlc::Error);
}
// Constructing a RabitCommunicator from {1, 1} is expected to raise dmlc::Error.
// NOTE(review): going by the test name, this is presumably a rank that is not
// below the world size -- confirm against the constructor declaration.
TEST(RabitCommunicatorSimpleTest, ThrowOnRankTooBig) {
  // Wrapped in a lambda so the braced initialiser (which contains a comma) does
  // not have to be spelled inside the EXPECT_THROW macro argument list.
  auto const build_with_rank_out_of_range = [] {
    RabitCommunicator comm{1, 1};
  };
  EXPECT_THROW(build_with_rank_out_of_range(), dmlc::Error);
}
// A RabitCommunicator built from {6, 3} is expected to report a world size of 6
// and a rank of 3 through its getters.
TEST(RabitCommunicatorSimpleTest, GetWorldSizeAndRank) {
  constexpr int kWorldSize = 6;
  constexpr int kRank = 3;

  RabitCommunicator comm{kWorldSize, kRank};

  EXPECT_EQ(comm.GetWorldSize(), kWorldSize);
  EXPECT_EQ(comm.GetRank(), kRank);
}
// A RabitCommunicator built directly from {2, 1} is expected to report that it
// is not distributed.
TEST(RabitCommunicatorSimpleTest, IsNotDistributed) {
  RabitCommunicator comm{2, 1};
  // Rabit is only distributed with a tracker.
  auto const distributed = comm.IsDistributed();
  EXPECT_FALSE(distributed);
}
} // namespace collective
} // namespace xgboost