Support bitwise allreduce operations in the communicator (#8623)
This commit is contained in:
parent
c7e82b5914
commit
77b069c25d
@ -25,6 +25,9 @@ enum ReduceOperation {
|
|||||||
MAX = 0;
|
MAX = 0;
|
||||||
MIN = 1;
|
MIN = 1;
|
||||||
SUM = 2;
|
SUM = 2;
|
||||||
|
BITWISE_AND = 3;
|
||||||
|
BITWISE_OR = 4;
|
||||||
|
BITWISE_XOR = 5;
|
||||||
}
|
}
|
||||||
|
|
||||||
message AllreduceRequest {
|
message AllreduceRequest {
|
||||||
|
|||||||
@ -191,6 +191,9 @@ class Op(IntEnum):
|
|||||||
MAX = 0
|
MAX = 0
|
||||||
MIN = 1
|
MIN = 1
|
||||||
SUM = 2
|
SUM = 2
|
||||||
|
BITWISE_AND = 3
|
||||||
|
BITWISE_OR = 4
|
||||||
|
BITWISE_XOR = 5
|
||||||
|
|
||||||
|
|
||||||
def allreduce( # pylint:disable=invalid-name
|
def allreduce( # pylint:disable=invalid-name
|
||||||
|
|||||||
@ -133,7 +133,9 @@ enum OpType {
|
|||||||
kMax = 0,
|
kMax = 0,
|
||||||
kMin = 1,
|
kMin = 1,
|
||||||
kSum = 2,
|
kSum = 2,
|
||||||
kBitwiseOR = 3
|
kBitwiseAND = 3,
|
||||||
|
kBitwiseOR = 4,
|
||||||
|
kBitwiseXOR = 5,
|
||||||
};
|
};
|
||||||
/*!\brief enum of supported data types */
|
/*!\brief enum of supported data types */
|
||||||
enum DataType {
|
enum DataType {
|
||||||
|
|||||||
@ -85,6 +85,13 @@ struct Sum {
|
|||||||
dst += src;
|
dst += src;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
struct BitAND {
|
||||||
|
static const engine::mpi::OpType kType = engine::mpi::kBitwiseAND;
|
||||||
|
template<typename DType>
|
||||||
|
inline static void Reduce(DType &dst, const DType &src) { // NOLINT(*)
|
||||||
|
dst &= src;
|
||||||
|
}
|
||||||
|
};
|
||||||
struct BitOR {
|
struct BitOR {
|
||||||
static const engine::mpi::OpType kType = engine::mpi::kBitwiseOR;
|
static const engine::mpi::OpType kType = engine::mpi::kBitwiseOR;
|
||||||
template<typename DType>
|
template<typename DType>
|
||||||
@ -92,6 +99,13 @@ struct BitOR {
|
|||||||
dst |= src;
|
dst |= src;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
struct BitXOR {
|
||||||
|
static const engine::mpi::OpType kType = engine::mpi::kBitwiseXOR;
|
||||||
|
template<typename DType>
|
||||||
|
inline static void Reduce(DType &dst, const DType &src) { // NOLINT(*)
|
||||||
|
dst ^= src;
|
||||||
|
}
|
||||||
|
};
|
||||||
template <typename OP, typename DType>
|
template <typename OP, typename DType>
|
||||||
inline void Reducer(const void *src_, void *dst_, int len, const MPI::Datatype &) {
|
inline void Reducer(const void *src_, void *dst_, int len, const MPI::Datatype &) {
|
||||||
const DType *src = static_cast<const DType *>(src_);
|
const DType *src = static_cast<const DType *>(src_);
|
||||||
|
|||||||
@ -50,11 +50,21 @@ struct Min;
|
|||||||
* \brief sum reduction operator
|
* \brief sum reduction operator
|
||||||
*/
|
*/
|
||||||
struct Sum;
|
struct Sum;
|
||||||
|
/*!
|
||||||
|
* \class rabit::op::BitAND
|
||||||
|
* \brief bitwise AND reduction operator
|
||||||
|
*/
|
||||||
|
struct BitAND;
|
||||||
/*!
|
/*!
|
||||||
* \class rabit::op::BitOR
|
* \class rabit::op::BitOR
|
||||||
* \brief bitwise OR reduction operator
|
* \brief bitwise OR reduction operator
|
||||||
*/
|
*/
|
||||||
struct BitOR;
|
struct BitOR;
|
||||||
|
/*!
|
||||||
|
* \class rabit::op::BitXOR
|
||||||
|
* \brief bitwise XOR reduction operator
|
||||||
|
*/
|
||||||
|
struct BitXOR;
|
||||||
} // namespace op
|
} // namespace op
|
||||||
/*!
|
/*!
|
||||||
* \brief initializes rabit, call this once at the beginning of your program
|
* \brief initializes rabit, call this once at the beginning of your program
|
||||||
|
|||||||
@ -23,6 +23,17 @@ struct FHelper {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<typename DType>
|
||||||
|
struct FHelper<op::BitAND, DType> {
|
||||||
|
static void
|
||||||
|
Allreduce(DType *,
|
||||||
|
size_t ,
|
||||||
|
void (*)(void *arg),
|
||||||
|
void *) {
|
||||||
|
utils::Error("DataType does not support bitwise AND operation");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
template<typename DType>
|
template<typename DType>
|
||||||
struct FHelper<op::BitOR, DType> {
|
struct FHelper<op::BitOR, DType> {
|
||||||
static void
|
static void
|
||||||
@ -30,7 +41,18 @@ struct FHelper<op::BitOR, DType> {
|
|||||||
size_t ,
|
size_t ,
|
||||||
void (*)(void *arg),
|
void (*)(void *arg),
|
||||||
void *) {
|
void *) {
|
||||||
utils::Error("DataType does not support bitwise or operation");
|
utils::Error("DataType does not support bitwise OR operation");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename DType>
|
||||||
|
struct FHelper<op::BitXOR, DType> {
|
||||||
|
static void
|
||||||
|
Allreduce(DType *,
|
||||||
|
size_t ,
|
||||||
|
void (*)(void *arg),
|
||||||
|
void *) {
|
||||||
|
utils::Error("DataType does not support bitwise XOR operation");
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -111,12 +133,24 @@ void Allreduce(void *sendrecvbuf,
|
|||||||
count, enum_dtype,
|
count, enum_dtype,
|
||||||
prepare_fun, prepare_arg);
|
prepare_fun, prepare_arg);
|
||||||
return;
|
return;
|
||||||
|
case kBitwiseAND:
|
||||||
|
Allreduce<op::BitAND>
|
||||||
|
(sendrecvbuf,
|
||||||
|
count, enum_dtype,
|
||||||
|
prepare_fun, prepare_arg);
|
||||||
|
return;
|
||||||
case kBitwiseOR:
|
case kBitwiseOR:
|
||||||
Allreduce<op::BitOR>
|
Allreduce<op::BitOR>
|
||||||
(sendrecvbuf,
|
(sendrecvbuf,
|
||||||
count, enum_dtype,
|
count, enum_dtype,
|
||||||
prepare_fun, prepare_arg);
|
prepare_fun, prepare_arg);
|
||||||
return;
|
return;
|
||||||
|
case kBitwiseXOR:
|
||||||
|
Allreduce<op::BitXOR>
|
||||||
|
(sendrecvbuf,
|
||||||
|
count, enum_dtype,
|
||||||
|
prepare_fun, prepare_arg);
|
||||||
|
return;
|
||||||
default: utils::Error("unknown enum_op");
|
default: utils::Error("unknown enum_op");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -58,7 +58,14 @@ inline std::size_t GetTypeSize(DataType data_type) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** @brief Defines the reduction operation. */
|
/** @brief Defines the reduction operation. */
|
||||||
enum class Operation { kMax = 0, kMin = 1, kSum = 2 };
|
enum class Operation {
|
||||||
|
kMax = 0,
|
||||||
|
kMin = 1,
|
||||||
|
kSum = 2,
|
||||||
|
kBitwiseAND = 3,
|
||||||
|
kBitwiseOR = 4,
|
||||||
|
kBitwiseXOR = 5
|
||||||
|
};
|
||||||
|
|
||||||
class DeviceCommunicator;
|
class DeviceCommunicator;
|
||||||
|
|
||||||
|
|||||||
@ -30,6 +30,29 @@ class AllreduceFunctor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
template <class T, std::enable_if_t<std::is_integral<T>::value>* = nullptr>
|
||||||
|
void AccumulateBitwise(T* buffer, T const* input, std::size_t size,
|
||||||
|
Operation reduce_operation) const {
|
||||||
|
switch (reduce_operation) {
|
||||||
|
case Operation::kBitwiseAND:
|
||||||
|
std::transform(buffer, buffer + size, input, buffer, std::bit_and<T>());
|
||||||
|
break;
|
||||||
|
case Operation::kBitwiseOR:
|
||||||
|
std::transform(buffer, buffer + size, input, buffer, std::bit_or<T>());
|
||||||
|
break;
|
||||||
|
case Operation::kBitwiseXOR:
|
||||||
|
std::transform(buffer, buffer + size, input, buffer, std::bit_xor<T>());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
throw std::invalid_argument("Invalid reduce operation");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class T, std::enable_if_t<std::is_floating_point<T>::value>* = nullptr>
|
||||||
|
void AccumulateBitwise(T*, T const*, std::size_t, Operation) const {
|
||||||
|
LOG(FATAL) << "Floating point types do not support bitwise operations.";
|
||||||
|
}
|
||||||
|
|
||||||
template <class T>
|
template <class T>
|
||||||
void Accumulate(T* buffer, T const* input, std::size_t size, Operation reduce_operation) const {
|
void Accumulate(T* buffer, T const* input, std::size_t size, Operation reduce_operation) const {
|
||||||
switch (reduce_operation) {
|
switch (reduce_operation) {
|
||||||
@ -44,6 +67,11 @@ class AllreduceFunctor {
|
|||||||
case Operation::kSum:
|
case Operation::kSum:
|
||||||
std::transform(buffer, buffer + size, input, buffer, std::plus<T>());
|
std::transform(buffer, buffer + size, input, buffer, std::plus<T>());
|
||||||
break;
|
break;
|
||||||
|
case Operation::kBitwiseAND:
|
||||||
|
case Operation::kBitwiseOR:
|
||||||
|
case Operation::kBitwiseXOR:
|
||||||
|
AccumulateBitwise(buffer, input, size, reduce_operation);
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
throw std::invalid_argument("Invalid reduce operation");
|
throw std::invalid_argument("Invalid reduce operation");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -96,11 +96,33 @@ class RabitCommunicator : public Communicator {
|
|||||||
void Print(const std::string &message) override { rabit::TrackerPrint(message); }
|
void Print(const std::string &message) override { rabit::TrackerPrint(message); }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void Shutdown() override {
|
void Shutdown() override { rabit::Finalize(); }
|
||||||
rabit::Finalize();
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
template <typename DType, std::enable_if_t<std::is_integral<DType>::value> * = nullptr>
|
||||||
|
void DoBitwiseAllReduce(void *send_receive_buffer, std::size_t count, Operation op) {
|
||||||
|
switch (op) {
|
||||||
|
case Operation::kBitwiseAND:
|
||||||
|
rabit::Allreduce<rabit::op::BitAND, DType>(static_cast<DType *>(send_receive_buffer),
|
||||||
|
count);
|
||||||
|
break;
|
||||||
|
case Operation::kBitwiseOR:
|
||||||
|
rabit::Allreduce<rabit::op::BitOR, DType>(static_cast<DType *>(send_receive_buffer), count);
|
||||||
|
break;
|
||||||
|
case Operation::kBitwiseXOR:
|
||||||
|
rabit::Allreduce<rabit::op::BitXOR, DType>(static_cast<DType *>(send_receive_buffer),
|
||||||
|
count);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
LOG(FATAL) << "Unknown allreduce operation";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename DType, std::enable_if_t<std::is_floating_point<DType>::value> * = nullptr>
|
||||||
|
void DoBitwiseAllReduce(void *send_receive_buffer, std::size_t count, Operation op) {
|
||||||
|
LOG(FATAL) << "Floating point types do not support bitwise operations.";
|
||||||
|
}
|
||||||
|
|
||||||
template <typename DType>
|
template <typename DType>
|
||||||
void DoAllReduce(void *send_receive_buffer, std::size_t count, Operation op) {
|
void DoAllReduce(void *send_receive_buffer, std::size_t count, Operation op) {
|
||||||
switch (op) {
|
switch (op) {
|
||||||
@ -113,6 +135,11 @@ class RabitCommunicator : public Communicator {
|
|||||||
case Operation::kSum:
|
case Operation::kSum:
|
||||||
rabit::Allreduce<rabit::op::Sum, DType>(static_cast<DType *>(send_receive_buffer), count);
|
rabit::Allreduce<rabit::op::Sum, DType>(static_cast<DType *>(send_receive_buffer), count);
|
||||||
break;
|
break;
|
||||||
|
case Operation::kBitwiseAND:
|
||||||
|
case Operation::kBitwiseOR:
|
||||||
|
case Operation::kBitwiseXOR:
|
||||||
|
DoBitwiseAllReduce<DType>(send_receive_buffer, count, op);
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
LOG(FATAL) << "Unknown allreduce operation";
|
LOG(FATAL) << "Unknown allreduce operation";
|
||||||
}
|
}
|
||||||
|
|||||||
@ -4,6 +4,7 @@
|
|||||||
#include <dmlc/parameter.h>
|
#include <dmlc/parameter.h>
|
||||||
#include <gtest/gtest.h>
|
#include <gtest/gtest.h>
|
||||||
|
|
||||||
|
#include <bitset>
|
||||||
#include <thread>
|
#include <thread>
|
||||||
|
|
||||||
#include "../../../src/collective/in_memory_communicator.h"
|
#include "../../../src/collective/in_memory_communicator.h"
|
||||||
@ -13,7 +14,37 @@ namespace collective {
|
|||||||
|
|
||||||
class InMemoryCommunicatorTest : public ::testing::Test {
|
class InMemoryCommunicatorTest : public ::testing::Test {
|
||||||
public:
|
public:
|
||||||
static void VerifyAllreduce(int rank) {
|
static void Verify(void (*function)(int)) {
|
||||||
|
std::vector<std::thread> threads;
|
||||||
|
for (auto rank = 0; rank < kWorldSize; rank++) {
|
||||||
|
threads.emplace_back(function, rank);
|
||||||
|
}
|
||||||
|
for (auto &thread : threads) {
|
||||||
|
thread.join();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void AllreduceMax(int rank) {
|
||||||
|
InMemoryCommunicator comm{kWorldSize, rank};
|
||||||
|
int buffer[] = {1 + rank, 2 + rank, 3 + rank, 4 + rank, 5 + rank};
|
||||||
|
comm.AllReduce(buffer, sizeof(buffer) / sizeof(buffer[0]), DataType::kInt32, Operation::kMax);
|
||||||
|
int expected[] = {3, 4, 5, 6, 7};
|
||||||
|
for (auto i = 0; i < 5; i++) {
|
||||||
|
EXPECT_EQ(buffer[i], expected[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void AllreduceMin(int rank) {
|
||||||
|
InMemoryCommunicator comm{kWorldSize, rank};
|
||||||
|
int buffer[] = {1 + rank, 2 + rank, 3 + rank, 4 + rank, 5 + rank};
|
||||||
|
comm.AllReduce(buffer, sizeof(buffer) / sizeof(buffer[0]), DataType::kInt32, Operation::kMin);
|
||||||
|
int expected[] = {1, 2, 3, 4, 5};
|
||||||
|
for (auto i = 0; i < 5; i++) {
|
||||||
|
EXPECT_EQ(buffer[i], expected[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void AllreduceSum(int rank) {
|
||||||
InMemoryCommunicator comm{kWorldSize, rank};
|
InMemoryCommunicator comm{kWorldSize, rank};
|
||||||
int buffer[] = {1, 2, 3, 4, 5};
|
int buffer[] = {1, 2, 3, 4, 5};
|
||||||
comm.AllReduce(buffer, sizeof(buffer) / sizeof(buffer[0]), DataType::kInt32, Operation::kSum);
|
comm.AllReduce(buffer, sizeof(buffer) / sizeof(buffer[0]), DataType::kInt32, Operation::kSum);
|
||||||
@ -23,7 +54,35 @@ class InMemoryCommunicatorTest : public ::testing::Test {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void VerifyBroadcast(int rank) {
|
static void AllreduceBitwiseAND(int rank) {
|
||||||
|
InMemoryCommunicator comm{kWorldSize, rank};
|
||||||
|
std::bitset<2> original(rank);
|
||||||
|
auto buffer = original.to_ulong();
|
||||||
|
comm.AllReduce(&buffer, 1, DataType::kUInt32, Operation::kBitwiseAND);
|
||||||
|
EXPECT_EQ(buffer, 0UL);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void AllreduceBitwiseOR(int rank) {
|
||||||
|
InMemoryCommunicator comm{kWorldSize, rank};
|
||||||
|
std::bitset<2> original(rank);
|
||||||
|
auto buffer = original.to_ulong();
|
||||||
|
comm.AllReduce(&buffer, 1, DataType::kUInt32, Operation::kBitwiseOR);
|
||||||
|
std::bitset<2> actual(buffer);
|
||||||
|
std::bitset<2> expected{0b11};
|
||||||
|
EXPECT_EQ(actual, expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void AllreduceBitwiseXOR(int rank) {
|
||||||
|
InMemoryCommunicator comm{kWorldSize, rank};
|
||||||
|
std::bitset<3> original(rank * 2);
|
||||||
|
auto buffer = original.to_ulong();
|
||||||
|
comm.AllReduce(&buffer, 1, DataType::kUInt32, Operation::kBitwiseXOR);
|
||||||
|
std::bitset<3> actual(buffer);
|
||||||
|
std::bitset<3> expected{0b110};
|
||||||
|
EXPECT_EQ(actual, expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void Broadcast(int rank) {
|
||||||
InMemoryCommunicator comm{kWorldSize, rank};
|
InMemoryCommunicator comm{kWorldSize, rank};
|
||||||
if (rank == 0) {
|
if (rank == 0) {
|
||||||
std::string buffer{"hello"};
|
std::string buffer{"hello"};
|
||||||
@ -88,25 +147,19 @@ TEST(InMemoryCommunicatorSimpleTest, IsDistributed) {
|
|||||||
EXPECT_TRUE(comm.IsDistributed());
|
EXPECT_TRUE(comm.IsDistributed());
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(InMemoryCommunicatorTest, Allreduce) {
|
TEST_F(InMemoryCommunicatorTest, AllreduceMax) { Verify(&AllreduceMax); }
|
||||||
std::vector<std::thread> threads;
|
|
||||||
for (auto rank = 0; rank < kWorldSize; rank++) {
|
|
||||||
threads.emplace_back(std::thread(&InMemoryCommunicatorTest::VerifyAllreduce, rank));
|
|
||||||
}
|
|
||||||
for (auto &thread : threads) {
|
|
||||||
thread.join();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST_F(InMemoryCommunicatorTest, Broadcast) {
|
TEST_F(InMemoryCommunicatorTest, AllreduceMin) { Verify(&AllreduceMin); }
|
||||||
std::vector<std::thread> threads;
|
|
||||||
for (auto rank = 0; rank < kWorldSize; rank++) {
|
TEST_F(InMemoryCommunicatorTest, AllreduceSum) { Verify(&AllreduceSum); }
|
||||||
threads.emplace_back(std::thread(&InMemoryCommunicatorTest::VerifyBroadcast, rank));
|
|
||||||
}
|
TEST_F(InMemoryCommunicatorTest, AllreduceBitwiseAND) { Verify(&AllreduceBitwiseAND); }
|
||||||
for (auto &thread : threads) {
|
|
||||||
thread.join();
|
TEST_F(InMemoryCommunicatorTest, AllreduceBitwiseOR) { Verify(&AllreduceBitwiseOR); }
|
||||||
}
|
|
||||||
}
|
TEST_F(InMemoryCommunicatorTest, AllreduceBitwiseXOR) { Verify(&AllreduceBitwiseXOR); }
|
||||||
|
|
||||||
|
TEST_F(InMemoryCommunicatorTest, Broadcast) { Verify(&Broadcast); }
|
||||||
|
|
||||||
} // namespace collective
|
} // namespace collective
|
||||||
} // namespace xgboost
|
} // namespace xgboost
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user