More explict sharding methods for device memory (#4396)

* Rename the Reshard method to Shard * Add a new Reshard method for sharding a vector that's already sharded
2019-04-30 16:47:23 -07:00
parent 797ba8e72d
commit eaab364a63
12 changed files with 154 additions and 77 deletions
--- a/tests/cpp/common/test_host_device_vector.cu
+++ b/tests/cpp/common/test_host_device_vector.cu
@@ -23,7 +23,7 @@ void InitHostDeviceVector(size_t n, const GPUDistribution& distribution,
                     HostDeviceVector<int> *v) {
  // create the vector
  GPUSet devices = distribution.Devices();
-  v->Reshard(distribution);
+  v->Shard(distribution);
  v->Resize(n);

  ASSERT_EQ(v->Size(), n);
@@ -178,6 +178,27 @@ TEST(HostDeviceVector, TestCopy) {
  SetCudaSetDeviceHandler(nullptr);
 }

+TEST(HostDeviceVector, Shard) {
+  std::vector<int> h_vec (2345);
+  for (size_t i = 0; i < h_vec.size(); ++i) {
+    h_vec[i] = i;
+  }
+  HostDeviceVector<int> vec (h_vec);
+  auto devices = GPUSet::Range(0, 1);
+
+  vec.Shard(devices);
+  ASSERT_EQ(vec.DeviceSize(0), h_vec.size());
+  ASSERT_EQ(vec.Size(), h_vec.size());
+  auto span = vec.DeviceSpan(0);  // sync to device
+
+  vec.Reshard(GPUDistribution::Empty());  // pull back to cpu, empty devices.
+  ASSERT_EQ(vec.Size(), h_vec.size());
+  ASSERT_TRUE(vec.Devices().IsEmpty());
+
+  auto h_vec_1 = vec.HostVector();
+  ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin()));
+}
+
 TEST(HostDeviceVector, Reshard) {
  std::vector<int> h_vec (2345);
  for (size_t i = 0; i < h_vec.size(); ++i) {
@@ -186,22 +207,24 @@ TEST(HostDeviceVector, Reshard) {
  HostDeviceVector<int> vec (h_vec);
  auto devices = GPUSet::Range(0, 1);

-  vec.Reshard(devices);
+  vec.Shard(devices);
  ASSERT_EQ(vec.DeviceSize(0), h_vec.size());
  ASSERT_EQ(vec.Size(), h_vec.size());
-  auto span = vec.DeviceSpan(0);  // sync to device
+  PlusOne(&vec);

-  vec.Reshard(GPUSet::Empty());  // pull back to cpu, empty devices.
+  vec.Reshard(GPUDistribution::Empty());
  ASSERT_EQ(vec.Size(), h_vec.size());
  ASSERT_TRUE(vec.Devices().IsEmpty());

  auto h_vec_1 = vec.HostVector();
-  ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin()));
+  for (size_t i = 0; i < h_vec_1.size(); ++i) {
+    ASSERT_EQ(h_vec_1.at(i), i + 1);
+  }
 }

 TEST(HostDeviceVector, Span) {
  HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f};
-  vec.Reshard(GPUSet{0, 1});
+  vec.Shard(GPUSet{0, 1});
  auto span = vec.DeviceSpan(0);
  ASSERT_EQ(vec.DeviceSize(0), span.size());
  ASSERT_EQ(vec.DevicePointer(0), span.data());
@@ -212,7 +235,7 @@ TEST(HostDeviceVector, Span) {

 // Multi-GPUs' test
 #if defined(XGBOOST_USE_NCCL)
-TEST(HostDeviceVector, MGPU_Reshard) {
+TEST(HostDeviceVector, MGPU_Shard) {
  auto devices = GPUSet::AllVisible();
  if (devices.Size() < 2) {
    LOG(WARNING) << "Not testing in multi-gpu environment.";
@@ -229,7 +252,7 @@ TEST(HostDeviceVector, MGPU_Reshard) {
  std::vector<size_t> devices_size (devices.Size());

  // From CPU to GPUs.
-  vec.Reshard(devices);
+  vec.Shard(devices);
  size_t total_size = 0;
  for (size_t i = 0; i < devices.Size(); ++i) {
    total_size += vec.DeviceSize(i);
@@ -238,16 +261,16 @@ TEST(HostDeviceVector, MGPU_Reshard) {
  ASSERT_EQ(total_size, h_vec.size());
  ASSERT_EQ(total_size, vec.Size());

-  // Reshard from devices to devices with different distribution.
+  // Shard from devices to devices with different distribution.
  EXPECT_ANY_THROW(
-      vec.Reshard(GPUDistribution::Granular(devices, 12)));
+      vec.Shard(GPUDistribution::Granular(devices, 12)));

  // All data is drawn back to CPU
-  vec.Reshard(GPUSet::Empty());
+  vec.Reshard(GPUDistribution::Empty());
  ASSERT_TRUE(vec.Devices().IsEmpty());
  ASSERT_EQ(vec.Size(), h_vec.size());

-  vec.Reshard(GPUDistribution::Granular(devices, 12));
+  vec.Shard(GPUDistribution::Granular(devices, 12));
  total_size = 0;
  for (size_t i = 0; i < devices.Size(); ++i) {
    total_size += vec.DeviceSize(i);
@@ -256,6 +279,41 @@ TEST(HostDeviceVector, MGPU_Reshard) {
  ASSERT_EQ(total_size, h_vec.size());
  ASSERT_EQ(total_size, vec.Size());
 }
+
+TEST(HostDeviceVector, MGPU_Reshard) {
+  auto devices = GPUSet::AllVisible();
+  if (devices.Size() < 2) {
+    LOG(WARNING) << "Not testing in multi-gpu environment.";
+    return;
+  }
+
+  size_t n = 1001;
+  int n_devices = 2;
+  auto distribution = GPUDistribution::Block(GPUSet::Range(0, n_devices));
+  std::vector<size_t> starts{0, 501};
+  std::vector<size_t> sizes{501, 500};
+
+  HostDeviceVector<int> v;
+  InitHostDeviceVector(n, distribution, &v);
+  CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead);
+  PlusOne(&v);
+  CheckDevice(&v, starts, sizes, 1, GPUAccess::kWrite);
+  CheckHost(&v, GPUAccess::kRead);
+  CheckHost(&v, GPUAccess::kWrite);
+
+  auto distribution1 = GPUDistribution::Overlap(GPUSet::Range(0, n_devices), 1);
+  v.Reshard(distribution1);
+
+  for (size_t i = 0; i < n_devices; ++i) {
+    auto span = v.DeviceSpan(i);  // sync to device
+  }
+
+  std::vector<size_t> starts1{0, 500};
+  std::vector<size_t> sizes1{501, 501};
+  CheckDevice(&v, starts1, sizes1, 1, GPUAccess::kWrite);
+  CheckHost(&v, GPUAccess::kRead);
+  CheckHost(&v, GPUAccess::kWrite);
+}
 #endif

 }  // namespace common
--- a/tests/cpp/common/test_transform_range.cu
+++ b/tests/cpp/common/test_transform_range.cu
@@ -22,10 +22,10 @@ TEST(Transform, MGPU_Basic) {
        GPUDistribution::Block(GPUSet::Empty())};
  out_vec.Fill(0);

-  in_vec.Reshard(GPUDistribution::Granular(devices, 8));
-  out_vec.Reshard(GPUDistribution::Block(devices));
+  in_vec.Shard(GPUDistribution::Granular(devices, 8));
+  out_vec.Shard(GPUDistribution::Block(devices));

-  // Granularity is different, resharding will throw.
+  // Granularity is different, sharding will throw.
  EXPECT_ANY_THROW(
      Transform<>::Init(TestTransformRange<bst_float>{}, Range{0, size}, devices)
      .Eval(&out_vec, &in_vec));