Implement device-to-device reshard. (#3721)

* Force clearing device memory before Reshard.
* Remove calculating row_segments for gpu_hist and gpu_sketch.
* Guard against changing device.
Authored by trivialfis on 2018-09-28 17:40:23 +12:00; committed by Rory Mitchell.
parent 0b7fd74138
commit 5a7f7e7d49
11 changed files with 179 additions and 96 deletions

View File

@@ -8,6 +8,17 @@
#include "../../../src/common/timer.h"
#include "gtest/gtest.h"
// Minimal per-device shard stub used by the DeviceHelpers test below;
// holds only an integer id so dh::ReduceShards has something to reduce.
struct Shard { int id; };
// Verify dh::ReduceShards sums the value extracted from each shard:
// ids 0..3 reduce to 0 + 1 + 2 + 3 == 6.
TEST(DeviceHelpers, Basic) {
  constexpr int kNumShards = 4;
  std::vector<Shard> shards(kNumShards);
  for (int idx = 0; idx < kNumShards; ++idx) {
    shards[idx].id = idx;
  }
  auto extract_id = [](Shard& shard) { return shard.id; };
  int total = dh::ReduceShards<int>(&shards, extract_id);
  ASSERT_EQ(total, 6);
}
void CreateTestData(xgboost::bst_uint num_rows, int max_row_size,
thrust::host_vector<int> *row_ptr,
thrust::host_vector<xgboost::bst_uint> *rows) {

View File

@@ -28,7 +28,7 @@ TEST(gpu_hist_util, TestDeviceSketch) {
tree::TrainParam p;
p.max_bin = 20;
p.gpu_id = 0;
p.n_gpus = 1;
p.n_gpus = GPUSet::AllVisible().Size();
// ensure that the exact quantiles are found
p.gpu_batch_nrows = nrows * 10;

View File

@@ -162,7 +162,7 @@ TEST(HostDeviceVector, TestCopy) {
std::vector<size_t> starts{0, 501};
std::vector<size_t> sizes{501, 500};
SetCudaSetDeviceHandler(SetDevice);
HostDeviceVector<int> v;
{
// a separate scope to ensure that v1 is gone before further checks
@@ -178,6 +178,52 @@ TEST(HostDeviceVector, TestCopy) {
SetCudaSetDeviceHandler(nullptr);
}
// The test is not really useful if n_gpus < 2
// Exercises HostDeviceVector::Reshard through three transitions —
// host -> all visible GPUs, shrink to one GPU, grow back to all GPUs —
// checking after each move that the total size is preserved and the
// data round-trips back to the host unchanged.
TEST(HostDeviceVector, Reshard) {
  std::vector<int> source(2345);
  for (size_t idx = 0; idx < source.size(); ++idx) {
    source[idx] = idx;
  }
  HostDeviceVector<int> vec(source);
  auto devices = GPUSet::AllVisible();
  std::vector<size_t> per_device(devices.Size());

  // From CPU to GPUs.
  // Assuming we have > 1 devices.
  vec.Reshard(devices);
  size_t accumulated = 0;
  for (size_t d = 0; d < devices.Size(); ++d) {
    per_device[d] = vec.DeviceSize(d);
    accumulated += per_device[d];
  }
  ASSERT_EQ(accumulated, source.size());
  ASSERT_EQ(accumulated, vec.Size());
  auto round_trip = vec.HostVector();
  ASSERT_TRUE(std::equal(round_trip.cbegin(), round_trip.cend(), source.cbegin()));

  vec.Reshard(GPUSet::Empty());  // clear out devices memory
  // Shrink down the number of devices.
  vec.Reshard(GPUSet::Range(0, 1));
  ASSERT_EQ(vec.Size(), source.size());
  ASSERT_EQ(vec.DeviceSize(0), source.size());
  round_trip = vec.HostVector();
  ASSERT_TRUE(std::equal(round_trip.cbegin(), round_trip.cend(), source.cbegin()));

  vec.Reshard(GPUSet::Empty());  // clear out devices memory
  // Grow the number of devices.
  vec.Reshard(devices);
  accumulated = 0;
  for (size_t d = 0; d < devices.Size(); ++d) {
    accumulated += vec.DeviceSize(d);
    // Per-device partition sizes should match the first sharding exactly.
    ASSERT_EQ(per_device[d], vec.DeviceSize(d));
  }
  ASSERT_EQ(accumulated, source.size());
  ASSERT_EQ(accumulated, vec.Size());
  round_trip = vec.HostVector();
  ASSERT_TRUE(std::equal(round_trip.cbegin(), round_trip.cend(), source.cbegin()));
}
TEST(HostDeviceVector, Span) {
HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f};
vec.Reshard(GPUSet{0, 1});