Update collective implementation. (#10152)

* Update collective implementation.

- Clean up resources during `Finalize` to avoid handling threads in the destructor.
- Calculate the size for allgather automatically.
- Use a simple allgather for small (smaller than the number of workers) allreduce.
This commit is contained in:
Jiaming Yuan
2024-03-30 18:57:31 +08:00
committed by GitHub
parent 230010d9a0
commit 8bad677c2f
31 changed files with 233 additions and 127 deletions

View File

@@ -60,8 +60,7 @@ TEST_F(FederatedCollTest, Allgather) {
std::vector<std::int32_t> buffer(n_workers, 0);
buffer[comm->Rank()] = comm->Rank();
auto rc = coll.Allgather(*comm, common::EraseType(common::Span{buffer.data(), buffer.size()}),
sizeof(int));
auto rc = coll.Allgather(*comm, common::EraseType(common::Span{buffer.data(), buffer.size()}));
ASSERT_TRUE(rc.OK());
for (auto i = 0; i < n_workers; i++) {
ASSERT_EQ(buffer[i], i);

View File

@@ -5,13 +5,13 @@
#include <gtest/gtest.h>
#include <xgboost/collective/result.h> // for Result
#include "../../../../src/collective/allreduce.h"
#include "../../../../src/common/common.h" // for AllVisibleGPUs
#include "../../../../src/common/device_helpers.cuh" // for device_vector
#include "../../../../src/common/type.h" // for EraseType
#include "../../collective/test_worker.h" // for SocketTest
#include "../../helpers.h" // for MakeCUDACtx
#include "federated_coll.cuh"
#include "federated_comm.cuh"
#include "test_worker.h" // for TestFederated
namespace xgboost::collective {
@@ -71,7 +71,7 @@ void TestAllgather(std::shared_ptr<FederatedComm> comm, std::int32_t rank, std::
dh::device_vector<std::int32_t> buffer(n_workers, 0);
buffer[comm->Rank()] = comm->Rank();
auto rc = w.coll->Allgather(*w.nccl_comm, common::EraseType(dh::ToSpan(buffer)), sizeof(int));
auto rc = w.coll->Allgather(*w.nccl_comm, common::EraseType(dh::ToSpan(buffer)));
ASSERT_TRUE(rc.OK());
for (auto i = 0; i < n_workers; i++) {
ASSERT_EQ(buffer[i], i);