More explict sharding methods for device memory (#4396)
* Rename the Reshard method to Shard * Add a new Reshard method for sharding a vector that's already sharded
This commit is contained in:
@@ -23,7 +23,7 @@ void InitHostDeviceVector(size_t n, const GPUDistribution& distribution,
|
||||
HostDeviceVector<int> *v) {
|
||||
// create the vector
|
||||
GPUSet devices = distribution.Devices();
|
||||
v->Reshard(distribution);
|
||||
v->Shard(distribution);
|
||||
v->Resize(n);
|
||||
|
||||
ASSERT_EQ(v->Size(), n);
|
||||
@@ -178,6 +178,27 @@ TEST(HostDeviceVector, TestCopy) {
|
||||
SetCudaSetDeviceHandler(nullptr);
|
||||
}
|
||||
|
||||
TEST(HostDeviceVector, Shard) {
|
||||
std::vector<int> h_vec (2345);
|
||||
for (size_t i = 0; i < h_vec.size(); ++i) {
|
||||
h_vec[i] = i;
|
||||
}
|
||||
HostDeviceVector<int> vec (h_vec);
|
||||
auto devices = GPUSet::Range(0, 1);
|
||||
|
||||
vec.Shard(devices);
|
||||
ASSERT_EQ(vec.DeviceSize(0), h_vec.size());
|
||||
ASSERT_EQ(vec.Size(), h_vec.size());
|
||||
auto span = vec.DeviceSpan(0); // sync to device
|
||||
|
||||
vec.Reshard(GPUDistribution::Empty()); // pull back to cpu, empty devices.
|
||||
ASSERT_EQ(vec.Size(), h_vec.size());
|
||||
ASSERT_TRUE(vec.Devices().IsEmpty());
|
||||
|
||||
auto h_vec_1 = vec.HostVector();
|
||||
ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin()));
|
||||
}
|
||||
|
||||
TEST(HostDeviceVector, Reshard) {
|
||||
std::vector<int> h_vec (2345);
|
||||
for (size_t i = 0; i < h_vec.size(); ++i) {
|
||||
@@ -186,22 +207,24 @@ TEST(HostDeviceVector, Reshard) {
|
||||
HostDeviceVector<int> vec (h_vec);
|
||||
auto devices = GPUSet::Range(0, 1);
|
||||
|
||||
vec.Reshard(devices);
|
||||
vec.Shard(devices);
|
||||
ASSERT_EQ(vec.DeviceSize(0), h_vec.size());
|
||||
ASSERT_EQ(vec.Size(), h_vec.size());
|
||||
auto span = vec.DeviceSpan(0); // sync to device
|
||||
PlusOne(&vec);
|
||||
|
||||
vec.Reshard(GPUSet::Empty()); // pull back to cpu, empty devices.
|
||||
vec.Reshard(GPUDistribution::Empty());
|
||||
ASSERT_EQ(vec.Size(), h_vec.size());
|
||||
ASSERT_TRUE(vec.Devices().IsEmpty());
|
||||
|
||||
auto h_vec_1 = vec.HostVector();
|
||||
ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin()));
|
||||
for (size_t i = 0; i < h_vec_1.size(); ++i) {
|
||||
ASSERT_EQ(h_vec_1.at(i), i + 1);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(HostDeviceVector, Span) {
|
||||
HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f};
|
||||
vec.Reshard(GPUSet{0, 1});
|
||||
vec.Shard(GPUSet{0, 1});
|
||||
auto span = vec.DeviceSpan(0);
|
||||
ASSERT_EQ(vec.DeviceSize(0), span.size());
|
||||
ASSERT_EQ(vec.DevicePointer(0), span.data());
|
||||
@@ -212,7 +235,7 @@ TEST(HostDeviceVector, Span) {
|
||||
|
||||
// Multi-GPUs' test
|
||||
#if defined(XGBOOST_USE_NCCL)
|
||||
TEST(HostDeviceVector, MGPU_Reshard) {
|
||||
TEST(HostDeviceVector, MGPU_Shard) {
|
||||
auto devices = GPUSet::AllVisible();
|
||||
if (devices.Size() < 2) {
|
||||
LOG(WARNING) << "Not testing in multi-gpu environment.";
|
||||
@@ -229,7 +252,7 @@ TEST(HostDeviceVector, MGPU_Reshard) {
|
||||
std::vector<size_t> devices_size (devices.Size());
|
||||
|
||||
// From CPU to GPUs.
|
||||
vec.Reshard(devices);
|
||||
vec.Shard(devices);
|
||||
size_t total_size = 0;
|
||||
for (size_t i = 0; i < devices.Size(); ++i) {
|
||||
total_size += vec.DeviceSize(i);
|
||||
@@ -238,16 +261,16 @@ TEST(HostDeviceVector, MGPU_Reshard) {
|
||||
ASSERT_EQ(total_size, h_vec.size());
|
||||
ASSERT_EQ(total_size, vec.Size());
|
||||
|
||||
// Reshard from devices to devices with different distribution.
|
||||
// Shard from devices to devices with different distribution.
|
||||
EXPECT_ANY_THROW(
|
||||
vec.Reshard(GPUDistribution::Granular(devices, 12)));
|
||||
vec.Shard(GPUDistribution::Granular(devices, 12)));
|
||||
|
||||
// All data is drawn back to CPU
|
||||
vec.Reshard(GPUSet::Empty());
|
||||
vec.Reshard(GPUDistribution::Empty());
|
||||
ASSERT_TRUE(vec.Devices().IsEmpty());
|
||||
ASSERT_EQ(vec.Size(), h_vec.size());
|
||||
|
||||
vec.Reshard(GPUDistribution::Granular(devices, 12));
|
||||
vec.Shard(GPUDistribution::Granular(devices, 12));
|
||||
total_size = 0;
|
||||
for (size_t i = 0; i < devices.Size(); ++i) {
|
||||
total_size += vec.DeviceSize(i);
|
||||
@@ -256,6 +279,41 @@ TEST(HostDeviceVector, MGPU_Reshard) {
|
||||
ASSERT_EQ(total_size, h_vec.size());
|
||||
ASSERT_EQ(total_size, vec.Size());
|
||||
}
|
||||
|
||||
TEST(HostDeviceVector, MGPU_Reshard) {
|
||||
auto devices = GPUSet::AllVisible();
|
||||
if (devices.Size() < 2) {
|
||||
LOG(WARNING) << "Not testing in multi-gpu environment.";
|
||||
return;
|
||||
}
|
||||
|
||||
size_t n = 1001;
|
||||
int n_devices = 2;
|
||||
auto distribution = GPUDistribution::Block(GPUSet::Range(0, n_devices));
|
||||
std::vector<size_t> starts{0, 501};
|
||||
std::vector<size_t> sizes{501, 500};
|
||||
|
||||
HostDeviceVector<int> v;
|
||||
InitHostDeviceVector(n, distribution, &v);
|
||||
CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead);
|
||||
PlusOne(&v);
|
||||
CheckDevice(&v, starts, sizes, 1, GPUAccess::kWrite);
|
||||
CheckHost(&v, GPUAccess::kRead);
|
||||
CheckHost(&v, GPUAccess::kWrite);
|
||||
|
||||
auto distribution1 = GPUDistribution::Overlap(GPUSet::Range(0, n_devices), 1);
|
||||
v.Reshard(distribution1);
|
||||
|
||||
for (size_t i = 0; i < n_devices; ++i) {
|
||||
auto span = v.DeviceSpan(i); // sync to device
|
||||
}
|
||||
|
||||
std::vector<size_t> starts1{0, 500};
|
||||
std::vector<size_t> sizes1{501, 501};
|
||||
CheckDevice(&v, starts1, sizes1, 1, GPUAccess::kWrite);
|
||||
CheckHost(&v, GPUAccess::kRead);
|
||||
CheckHost(&v, GPUAccess::kWrite);
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace common
|
||||
|
||||
@@ -22,10 +22,10 @@ TEST(Transform, MGPU_Basic) {
|
||||
GPUDistribution::Block(GPUSet::Empty())};
|
||||
out_vec.Fill(0);
|
||||
|
||||
in_vec.Reshard(GPUDistribution::Granular(devices, 8));
|
||||
out_vec.Reshard(GPUDistribution::Block(devices));
|
||||
in_vec.Shard(GPUDistribution::Granular(devices, 8));
|
||||
out_vec.Shard(GPUDistribution::Block(devices));
|
||||
|
||||
// Granularity is different, resharding will throw.
|
||||
// Granularity is different, sharding will throw.
|
||||
EXPECT_ANY_THROW(
|
||||
Transform<>::Init(TestTransformRange<bst_float>{}, Range{0, size}, devices)
|
||||
.Eval(&out_vec, &in_vec));
|
||||
|
||||
Reference in New Issue
Block a user