Common interface for collective communication (#8057)
* implement broadcast for federated communicator
* implement allreduce
* add communicator factory
* add device adapter
* add device communicator to factory
* add rabit communicator
* add rabit communicator to the factory
* add nccl device communicator
* add synchronize to device communicator
* add back print and getprocessorname
* add python wrapper and c api
* clean up types
* fix non-gpu build
* try to fix ci
* fix std::size_t
* portable string compare ignore case
* c style size_t
* fix lint errors
* cross platform setenv
* fix memory leak
* fix lint errors
* address review feedback
* add python test for rabit communicator
* fix failing gtest
* use json to configure communicators
* fix lint error
* get rid of factories
* fix cpu build
* fix include
* fix python import
* don't export collective.py yet
* skip collective communicator pytest on windows
* add review feedback
* update documentation
* remove mpi communicator type
* fix tests
* shutdown the communicator separately

Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu>
This commit is contained in:
@@ -22,7 +22,7 @@ if (PLUGIN_FEDERATED)
|
||||
target_include_directories(testxgboost PRIVATE ${xgboost_SOURCE_DIR}/plugin/federated)
|
||||
target_link_libraries(testxgboost PRIVATE federated_client)
|
||||
else (PLUGIN_FEDERATED)
|
||||
file(GLOB_RECURSE FEDERATED_TEST_SOURCES "plugin/*_federated_*.cc")
|
||||
file(GLOB_RECURSE FEDERATED_TEST_SOURCES "plugin/*_federated_*.*")
|
||||
list(REMOVE_ITEM TEST_SOURCES ${FEDERATED_TEST_SOURCES})
|
||||
endif (PLUGIN_FEDERATED)
|
||||
|
||||
|
||||
54
tests/cpp/collective/test_communicator.cc
Normal file
54
tests/cpp/collective/test_communicator.cc
Normal file
@@ -0,0 +1,54 @@
|
||||
/*!
|
||||
* Copyright 2022 XGBoost contributors
|
||||
*/
|
||||
#include <dmlc/parameter.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "../../../src/collective/communicator.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace collective {
|
||||
|
||||
// Checks that the communicator type is parsed from the XGBOOST_COMMUNICATOR
// environment variable: unset -> kUnknown, known names are matched regardless of
// case ("Federated"), and unrecognized names raise dmlc::Error.
TEST(CommunicatorFactory, TypeFromEnv) {
  // NOTE(review): this first check relies on the variable not being set by the
  // environment that launches the test binary.
  EXPECT_EQ(CommunicatorType::kUnknown, Communicator::GetTypeFromEnv());

  // The invalid value is exercised first so that the process environment is never
  // left holding a value that makes GetTypeFromEnv() throw once this test finishes;
  // the environment is process-wide and outlives the test.
  dmlc::SetEnv<std::string>("XGBOOST_COMMUNICATOR", "foo");
  EXPECT_THROW(Communicator::GetTypeFromEnv(), dmlc::Error);

  dmlc::SetEnv<std::string>("XGBOOST_COMMUNICATOR", "Federated");
  EXPECT_EQ(CommunicatorType::kFederated, Communicator::GetTypeFromEnv());

  // Finish on a valid value (see the note above).
  dmlc::SetEnv<std::string>("XGBOOST_COMMUNICATOR", "rabit");
  EXPECT_EQ(CommunicatorType::kRabit, Communicator::GetTypeFromEnv());
}
|
||||
|
||||
// Checks that the communicator type is read from the lower-case
// "xgboost_communicator" entry of a JSON config.
TEST(CommunicatorFactory, TypeFromArgs) {
  Json cfg{JsonObject()};
  std::string const key{"xgboost_communicator"};

  // No entry at all yields the unknown type.
  EXPECT_EQ(CommunicatorType::kUnknown, Communicator::GetTypeFromConfig(cfg));

  // Known names map to their enum values.
  cfg[key] = String("rabit");
  EXPECT_EQ(CommunicatorType::kRabit, Communicator::GetTypeFromConfig(cfg));

  cfg[key] = String("federated");
  EXPECT_EQ(CommunicatorType::kFederated, Communicator::GetTypeFromConfig(cfg));

  // An unrecognized name is an error.
  cfg[key] = String("foo");
  EXPECT_THROW(Communicator::GetTypeFromConfig(cfg), dmlc::Error);
}
|
||||
|
||||
// Same checks as TypeFromArgs, but with the upper-case "XGBOOST_COMMUNICATOR"
// key, which the config lookup is expected to accept as well.
TEST(CommunicatorFactory, TypeFromArgsUpperCase) {
  Json cfg{JsonObject()};
  std::string const key{"XGBOOST_COMMUNICATOR"};

  // No entry at all yields the unknown type.
  EXPECT_EQ(CommunicatorType::kUnknown, Communicator::GetTypeFromConfig(cfg));

  // Known names map to their enum values.
  cfg[key] = String("rabit");
  EXPECT_EQ(CommunicatorType::kRabit, Communicator::GetTypeFromConfig(cfg));

  cfg[key] = String("federated");
  EXPECT_EQ(CommunicatorType::kFederated, Communicator::GetTypeFromConfig(cfg));

  // An unrecognized name is an error.
  cfg[key] = String("foo");
  EXPECT_THROW(Communicator::GetTypeFromConfig(cfg), dmlc::Error);
}
|
||||
|
||||
} // namespace collective
|
||||
} // namespace xgboost
|
||||
26
tests/cpp/collective/test_nccl_device_communicator.cu
Normal file
26
tests/cpp/collective/test_nccl_device_communicator.cu
Normal file
@@ -0,0 +1,26 @@
|
||||
/*!
|
||||
* Copyright 2022 XGBoost contributors
|
||||
*/
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "../../../src/collective/nccl_device_communicator.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace collective {
|
||||
|
||||
// Constructing the NCCL communicator with a negative device ordinal must fail.
TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidDeviceOrdinal) {
  // The construction is wrapped in a lambda so that the commas in the brace
  // initializer are not parsed as macro argument separators.
  auto const make_with_bad_ordinal = [] { NcclDeviceCommunicator comm{-1, nullptr}; };
  EXPECT_THROW(make_with_bad_ordinal(), dmlc::Error);
}
|
||||
|
||||
// A valid device ordinal with a null host communicator must also be rejected.
TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidCommunicator) {
  // The construction is wrapped in a lambda so that the commas in the brace
  // initializer are not parsed as macro argument separators.
  auto const make_with_null_comm = [] { NcclDeviceCommunicator comm{0, nullptr}; };
  EXPECT_THROW(make_with_null_comm(), dmlc::Error);
}
|
||||
|
||||
} // namespace collective
|
||||
} // namespace xgboost
|
||||
|
||||
#endif
|
||||
39
tests/cpp/collective/test_rabit_communicator.cc
Normal file
39
tests/cpp/collective/test_rabit_communicator.cc
Normal file
@@ -0,0 +1,39 @@
|
||||
/*!
|
||||
* Copyright 2022 XGBoost contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "../../../src/collective/rabit_communicator.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace collective {
|
||||
|
||||
// A world size of zero is rejected at construction time.
TEST(RabitCommunicatorSimpleTest, ThrowOnWorldSizeTooSmall) {
  // Arguments are {world_size, rank}, as exercised by GetWorldSizeAndRank.
  auto const make_empty_world = [] { RabitCommunicator comm{0, 0}; };
  EXPECT_THROW(make_empty_world(), dmlc::Error);
}
|
||||
|
||||
// A negative rank is rejected at construction time.
TEST(RabitCommunicatorSimpleTest, ThrowOnRankTooSmall) {
  auto const make_negative_rank = [] { RabitCommunicator comm{1, -1}; };
  EXPECT_THROW(make_negative_rank(), dmlc::Error);
}
|
||||
|
||||
// A rank equal to the world size is out of range and rejected.
TEST(RabitCommunicatorSimpleTest, ThrowOnRankTooBig) {
  auto const make_rank_out_of_range = [] { RabitCommunicator comm{1, 1}; };
  EXPECT_THROW(make_rank_out_of_range(), dmlc::Error);
}
|
||||
|
||||
// The accessors report back the world size and rank given to the constructor.
TEST(RabitCommunicatorSimpleTest, GetWorldSizeAndRank) {
  RabitCommunicator communicator{6, 3};
  auto const world_size = communicator.GetWorldSize();
  EXPECT_EQ(world_size, 6);
  auto const rank = communicator.GetRank();
  EXPECT_EQ(rank, 3);
}
|
||||
|
||||
// Without a tracker Rabit does not run distributed, so a communicator created
// directly in the test reports false even with a world size above one.
TEST(RabitCommunicatorSimpleTest, IsNotDistributed) {
  RabitCommunicator communicator{2, 1};
  bool const distributed = communicator.IsDistributed();
  EXPECT_FALSE(distributed);
}
|
||||
|
||||
} // namespace collective
|
||||
} // namespace xgboost
|
||||
105
tests/cpp/plugin/test_federated_adapter.cu
Normal file
105
tests/cpp/plugin/test_federated_adapter.cu
Normal file
@@ -0,0 +1,105 @@
|
||||
/*!
|
||||
* Copyright 2022 XGBoost contributors
|
||||
*/
|
||||
#include <grpcpp/server_builder.h>
|
||||
#include <gtest/gtest.h>
|
||||
#include <thrust/host_vector.h>
|
||||
|
||||
#include <thread>
|
||||
|
||||
#include "../../../plugin/federated/federated_communicator.h"
|
||||
#include "../../../plugin/federated/federated_server.h"
|
||||
#include "../../../src/collective/device_communicator_adapter.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace collective {
|
||||
|
||||
// Address the in-process federated test server listens on and the test
// communicators connect to.
std::string const kServerAddress{"localhost:56789"}; // NOLINT(cert-err58-cpp)
|
||||
|
||||
class FederatedAdapterTest : public ::testing::Test {
|
||||
protected:
|
||||
void SetUp() override {
|
||||
server_thread_.reset(new std::thread([this] {
|
||||
grpc::ServerBuilder builder;
|
||||
federated::FederatedService service{kWorldSize};
|
||||
builder.AddListeningPort(kServerAddress, grpc::InsecureServerCredentials());
|
||||
builder.RegisterService(&service);
|
||||
server_ = builder.BuildAndStart();
|
||||
server_->Wait();
|
||||
}));
|
||||
}
|
||||
|
||||
void TearDown() override {
|
||||
server_->Shutdown();
|
||||
server_thread_->join();
|
||||
}
|
||||
|
||||
static int const kWorldSize{2};
|
||||
std::unique_ptr<std::thread> server_thread_;
|
||||
std::unique_ptr<grpc::Server> server_;
|
||||
};
|
||||
|
||||
// Constructing the adapter with a negative device ordinal must fail.
TEST(FederatedAdapterSimpleTest, ThrowOnInvalidDeviceOrdinal) {
  // The construction is wrapped in a lambda so that the commas in the brace
  // initializer are not parsed as macro argument separators.
  auto const make_with_bad_ordinal = [] { DeviceCommunicatorAdapter adapter{-1, nullptr}; };
  EXPECT_THROW(make_with_bad_ordinal(), dmlc::Error);
}
|
||||
|
||||
// A valid device ordinal with a null host communicator must also be rejected.
TEST(FederatedAdapterSimpleTest, ThrowOnInvalidCommunicator) {
  // The construction is wrapped in a lambda so that the commas in the brace
  // initializer are not parsed as macro argument separators.
  auto const make_with_null_comm = [] { DeviceCommunicatorAdapter adapter{0, nullptr}; };
  EXPECT_THROW(make_with_null_comm(), dmlc::Error);
}
|
||||
|
||||
// Runs one worker thread per rank; each worker sum-reduces the device buffer
// [0, 1, 2] through the adapter and checks that the result is the element-wise
// sum over all workers.
TEST_F(FederatedAdapterTest, DeviceAllReduceSum) {
  std::vector<std::thread> threads;
  for (auto rank = 0; rank < kWorldSize; rank++) {
    threads.emplace_back([rank] {
      // Copy by value so that the static member is not odr-used below.
      int const world_size{kWorldSize};
      FederatedCommunicator comm{kWorldSize, rank, kServerAddress};
      // Every worker uses device 0 so that the test also runs on a single-GPU
      // machine; using `rank` as the ordinal would need one GPU per worker.
      DeviceCommunicatorAdapter adapter{0, &comm};
      int const count = 3;
      thrust::device_vector<double> buffer(count, 0);
      thrust::sequence(buffer.begin(), buffer.end());
      adapter.AllReduceSum(buffer.data().get(), count);
      thrust::host_vector<double> host_buffer = buffer;
      EXPECT_EQ(host_buffer.size(), static_cast<std::size_t>(count));
      for (auto i = 0; i < count; i++) {
        // Each worker contributed the value i at position i.
        EXPECT_EQ(host_buffer[i], i * world_size);
      }
    });
  }
  for (auto& thread : threads) {
    thread.join();
  }
}
|
||||
|
||||
// Runs one worker thread per rank; rank r contributes the r + 2 bytes
// [0, 1, ..., r + 1]. After the variable-length all-gather every worker must see
// the segment sizes {2, 3} and the concatenation [0, 1, 0, 1, 2].
TEST_F(FederatedAdapterTest, DeviceAllGatherV) {
  std::vector<std::thread> threads;
  for (auto rank = 0; rank < kWorldSize; rank++) {
    threads.emplace_back([rank] {
      FederatedCommunicator comm{kWorldSize, rank, kServerAddress};
      // Every worker uses device 0 so that the test also runs on a single-GPU
      // machine; using `rank` as the ordinal would need one GPU per worker.
      DeviceCommunicatorAdapter adapter{0, &comm};

      int const count = rank + 2;
      thrust::device_vector<char> buffer(count, 0);
      thrust::sequence(buffer.begin(), buffer.end());
      std::vector<std::size_t> segments(kWorldSize);
      dh::caching_device_vector<char> receive_buffer{};

      adapter.AllGatherV(buffer.data().get(), count, &segments, &receive_buffer);

      // The expectations below are written for a world size of 2.
      EXPECT_EQ(segments[0], 2u);
      EXPECT_EQ(segments[1], 3u);
      thrust::host_vector<char> host_buffer = receive_buffer;
      EXPECT_EQ(host_buffer.size(), 5u);
      int const expected[] = {0, 1, 0, 1, 2};
      for (auto i = 0; i < 5; i++) {
        EXPECT_EQ(host_buffer[i], expected[i]);
      }
    });
  }
  for (auto& thread : threads) {
    thread.join();
  }
}
|
||||
|
||||
} // namespace collective
|
||||
} // namespace xgboost
|
||||
119
tests/cpp/plugin/test_federated_communicator.cc
Normal file
119
tests/cpp/plugin/test_federated_communicator.cc
Normal file
@@ -0,0 +1,119 @@
|
||||
/*!
|
||||
* Copyright 2022 XGBoost contributors
|
||||
*/
|
||||
#include <dmlc/parameter.h>
|
||||
#include <grpcpp/server_builder.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <thread>
|
||||
|
||||
#include "../../../plugin/federated/federated_communicator.h"
|
||||
#include "../../../plugin/federated/federated_server.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace collective {
|
||||
|
||||
// Address the in-process federated test server listens on and the test
// communicators connect to.
std::string const kServerAddress{"localhost:56789"}; // NOLINT(cert-err58-cpp)
|
||||
|
||||
class FederatedCommunicatorTest : public ::testing::Test {
|
||||
public:
|
||||
static void VerifyAllreduce(int rank) {
|
||||
FederatedCommunicator comm{kWorldSize, rank, kServerAddress};
|
||||
CheckAllreduce(comm);
|
||||
}
|
||||
|
||||
static void VerifyBroadcast(int rank) {
|
||||
FederatedCommunicator comm{kWorldSize, rank, kServerAddress};
|
||||
CheckBroadcast(comm, rank);
|
||||
}
|
||||
|
||||
protected:
|
||||
void SetUp() override {
|
||||
server_thread_.reset(new std::thread([this] {
|
||||
grpc::ServerBuilder builder;
|
||||
federated::FederatedService service{kWorldSize};
|
||||
builder.AddListeningPort(kServerAddress, grpc::InsecureServerCredentials());
|
||||
builder.RegisterService(&service);
|
||||
server_ = builder.BuildAndStart();
|
||||
server_->Wait();
|
||||
}));
|
||||
}
|
||||
|
||||
void TearDown() override {
|
||||
server_->Shutdown();
|
||||
server_thread_->join();
|
||||
}
|
||||
|
||||
static void CheckAllreduce(FederatedCommunicator &comm) {
|
||||
int buffer[] = {1, 2, 3, 4, 5};
|
||||
comm.AllReduce(buffer, sizeof(buffer) / sizeof(buffer[0]), DataType::kInt32, Operation::kSum);
|
||||
int expected[] = {3, 6, 9, 12, 15};
|
||||
for (auto i = 0; i < 5; i++) {
|
||||
EXPECT_EQ(buffer[i], expected[i]);
|
||||
}
|
||||
}
|
||||
|
||||
static void CheckBroadcast(FederatedCommunicator &comm, int rank) {
|
||||
if (rank == 0) {
|
||||
std::string buffer{"hello"};
|
||||
comm.Broadcast(&buffer[0], buffer.size(), 0);
|
||||
EXPECT_EQ(buffer, "hello");
|
||||
} else {
|
||||
std::string buffer{" "};
|
||||
comm.Broadcast(&buffer[0], buffer.size(), 0);
|
||||
EXPECT_EQ(buffer, "hello");
|
||||
}
|
||||
}
|
||||
|
||||
static int const kWorldSize{3};
|
||||
std::unique_ptr<std::thread> server_thread_;
|
||||
std::unique_ptr<grpc::Server> server_;
|
||||
};
|
||||
|
||||
// A world size of zero is rejected at construction time.
TEST(FederatedCommunicatorSimpleTest, ThrowOnWorldSizeTooSmall) {
  // The leading arguments are {world_size, rank, server address}; the three
  // trailing strings are passed empty.
  auto const make_empty_world = [] {
    FederatedCommunicator comm{0, 0, kServerAddress, "", "", ""};
  };
  EXPECT_THROW(make_empty_world(), dmlc::Error);
}
|
||||
|
||||
// A negative rank is rejected at construction time.
TEST(FederatedCommunicatorSimpleTest, ThrowOnRankTooSmall) {
  auto const make_negative_rank = [] {
    FederatedCommunicator comm{1, -1, kServerAddress, "", "", ""};
  };
  EXPECT_THROW(make_negative_rank(), dmlc::Error);
}
|
||||
|
||||
// A rank equal to the world size is out of range and rejected.
TEST(FederatedCommunicatorSimpleTest, ThrowOnRankTooBig) {
  auto const make_rank_out_of_range = [] {
    FederatedCommunicator comm{1, 1, kServerAddress, "", "", ""};
  };
  EXPECT_THROW(make_rank_out_of_range(), dmlc::Error);
}
|
||||
|
||||
// The accessors report back the world size and rank given to the constructor.
TEST(FederatedCommunicatorSimpleTest, GetWorldSizeAndRank) {
  FederatedCommunicator communicator{6, 3, kServerAddress};
  auto const world_size = communicator.GetWorldSize();
  EXPECT_EQ(world_size, 6);
  auto const rank = communicator.GetRank();
  EXPECT_EQ(rank, 3);
}
|
||||
|
||||
// A federated communicator reports itself as distributed.
TEST(FederatedCommunicatorSimpleTest, IsDistributed) {
  FederatedCommunicator communicator{2, 1, kServerAddress};
  bool const distributed = communicator.IsDistributed();
  EXPECT_TRUE(distributed);
}
|
||||
|
||||
// Starts one worker thread per rank, each running VerifyAllreduce against the
// fixture's server, then waits for all of them to finish.
TEST_F(FederatedCommunicatorTest, Allreduce) {
  std::vector<std::thread> workers;
  for (int rank = 0; rank < kWorldSize; ++rank) {
    workers.emplace_back(&FederatedCommunicatorTest::VerifyAllreduce, rank);
  }
  for (auto &worker : workers) {
    worker.join();
  }
}
|
||||
|
||||
// Starts one worker thread per rank, each running VerifyBroadcast against the
// fixture's server, then waits for all of them to finish.
TEST_F(FederatedCommunicatorTest, Broadcast) {
  std::vector<std::thread> workers;
  for (int rank = 0; rank < kWorldSize; ++rank) {
    workers.emplace_back(&FederatedCommunicatorTest::VerifyBroadcast, rank);
  }
  for (auto &worker : workers) {
    worker.join();
  }
}
|
||||
} // namespace collective
|
||||
} // namespace xgboost
|
||||
@@ -62,7 +62,7 @@ class FederatedServerTest : public ::testing::Test {
|
||||
static void CheckAllreduce(federated::FederatedClient& client) {
|
||||
int data[] = {1, 2, 3, 4, 5};
|
||||
std::string send_buffer(reinterpret_cast<char const*>(data), sizeof(data));
|
||||
auto reply = client.Allreduce(send_buffer, federated::INT, federated::SUM);
|
||||
auto reply = client.Allreduce(send_buffer, federated::INT32, federated::SUM);
|
||||
auto const* result = reinterpret_cast<int const*>(reply.data());
|
||||
int expected[] = {3, 6, 9, 12, 15};
|
||||
for (auto i = 0; i < 5; i++) {
|
||||
|
||||
Reference in New Issue
Block a user