Support bitwise allreduce in NCCL communicator (#9300)

This commit is contained in:
Rong Ou
2023-06-16 10:56:50 -07:00
committed by GitHub
parent 2718ff530c
commit d8beb517ed
3 changed files with 302 additions and 164 deletions

View File

@@ -5,10 +5,12 @@
#include <gtest/gtest.h>
#include <bitset>
#include <string> // for string
#include "../../../src/collective/nccl_device_communicator.cuh"
#include "../../../src/collective/communicator-inl.cuh"
#include "../../../src/collective/nccl_device_communicator.cuh"
#include "../helpers.h"
namespace xgboost {
namespace collective {
@@ -31,6 +33,69 @@ TEST(NcclDeviceCommunicatorSimpleTest, SystemError) {
ASSERT_TRUE(str.find("environment variables") != std::string::npos);
}
}
namespace {
void VerifyAllReduceBitwiseAND() {
auto const rank = collective::GetRank();
std::bitset<64> original{};
original[rank] = true;
HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank);
collective::AllReduce<collective::Operation::kBitwiseAND>(rank, buffer.DevicePointer(), 1);
collective::Synchronize(rank);
EXPECT_EQ(buffer.HostVector()[0], 0ULL);
}
} // anonymous namespace
TEST(NcclDeviceCommunicator, MGPUAllReduceBitwiseAND) {
auto const n_gpus = common::AllVisibleGPUs();
if (n_gpus <= 1) {
GTEST_SKIP() << "Skipping MGPUAllReduceBitwiseAND test with # GPUs = " << n_gpus;
}
RunWithInMemoryCommunicator(n_gpus, VerifyAllReduceBitwiseAND);
}
namespace {
void VerifyAllReduceBitwiseOR() {
auto const world_size = collective::GetWorldSize();
auto const rank = collective::GetRank();
std::bitset<64> original{};
original[rank] = true;
HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank);
collective::AllReduce<collective::Operation::kBitwiseOR>(rank, buffer.DevicePointer(), 1);
collective::Synchronize(rank);
EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1);
}
} // anonymous namespace
TEST(NcclDeviceCommunicator, MGPUAllReduceBitwiseOR) {
auto const n_gpus = common::AllVisibleGPUs();
if (n_gpus <= 1) {
GTEST_SKIP() << "Skipping MGPUAllReduceBitwiseOR test with # GPUs = " << n_gpus;
}
RunWithInMemoryCommunicator(n_gpus, VerifyAllReduceBitwiseOR);
}
namespace {
void VerifyAllReduceBitwiseXOR() {
auto const world_size = collective::GetWorldSize();
auto const rank = collective::GetRank();
std::bitset<64> original{~0ULL};
original[rank] = false;
HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank);
collective::AllReduce<collective::Operation::kBitwiseXOR>(rank, buffer.DevicePointer(), 1);
collective::Synchronize(rank);
EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1);
}
} // anonymous namespace
TEST(NcclDeviceCommunicator, MGPUAllReduceBitwiseXOR) {
auto const n_gpus = common::AllVisibleGPUs();
if (n_gpus <= 1) {
GTEST_SKIP() << "Skipping MGPUAllReduceBitwiseXOR test with # GPUs = " << n_gpus;
}
RunWithInMemoryCommunicator(n_gpus, VerifyAllReduceBitwiseXOR);
}
} // namespace collective
} // namespace xgboost