Implement transform to reduce CPU/GPU code duplication. (#3643)

* Implement Transform class. * Add tests for softmax. * Use Transform in regression, softmax and hinge objectives, except for Cox. * Mark old gpu objective functions deprecated. * static_assert for softmax. * Split up multi-gpu tests.
2018-10-02 15:06:21 +13:00
parent 87aca8c244
commit d594b11f35
31 changed files with 1514 additions and 997 deletions
--- a/tests/cpp/common/test_gpu_compressed_iterator.cu
+++ b/tests/cpp/common/test_gpu_compressed_iterator.cu
@@ -14,7 +14,7 @@ struct WriteSymbolFunction {
  WriteSymbolFunction(CompressedBufferWriter cbw, unsigned char* buffer_data_d,
                      int* input_data_d)
    : cbw(cbw), buffer_data_d(buffer_data_d), input_data_d(input_data_d) {}
-                                           
+
  __device__ void operator()(size_t i) {
    cbw.AtomicWriteSymbol(buffer_data_d, input_data_d[i], i);
  }
@@ -28,7 +28,7 @@ struct ReadSymbolFunction {

  __device__ void operator()(size_t i) {
    output_data_d[i] = ci[i];
-  }                                           
+  }
 };

 TEST(CompressedIterator, TestGPU) {
--- a/tests/cpp/common/test_gpu_hist_util.cu
+++ b/tests/cpp/common/test_gpu_hist_util.cu
@@ -10,7 +10,7 @@
 namespace xgboost {
 namespace common {

-TEST(gpu_hist_util, TestDeviceSketch) {
+void TestDeviceSketch(const GPUSet& devices) {
  // create the data
  int nrows = 10001;
  std::vector<float> test_data(nrows);
@@ -28,7 +28,7 @@ TEST(gpu_hist_util, TestDeviceSketch) {
  tree::TrainParam p;
  p.max_bin = 20;
  p.gpu_id = 0;
-  p.n_gpus = GPUSet::AllVisible().Size();
+  p.n_gpus = devices.Size();
  // ensure that the exact quantiles are found
  p.gpu_batch_nrows = nrows * 10;

@@ -54,5 +54,17 @@ TEST(gpu_hist_util, TestDeviceSketch) {
  delete dmat;
 }

+TEST(gpu_hist_util, DeviceSketch) {
+  TestDeviceSketch(GPUSet::Range(0, 1));
+}
+
+#if defined(XGBOOST_USE_NCCL)
+TEST(gpu_hist_util, MGPU_DeviceSketch) {
+  auto devices = GPUSet::AllVisible();
+  CHECK_GT(devices.Size(), 1);
+  TestDeviceSketch(devices);
+}
+#endif
+
 }  // namespace common
 }  // namespace xgboost
--- a/tests/cpp/common/test_host_device_vector.cu
+++ b/tests/cpp/common/test_host_device_vector.cu
@@ -178,18 +178,57 @@ TEST(HostDeviceVector, TestCopy) {
  SetCudaSetDeviceHandler(nullptr);
 }

-// The test is not really useful if n_gpus < 2
 TEST(HostDeviceVector, Reshard) {
  std::vector<int> h_vec (2345);
  for (size_t i = 0; i < h_vec.size(); ++i) {
    h_vec[i] = i;
  }
  HostDeviceVector<int> vec (h_vec);
+  auto devices = GPUSet::Range(0, 1);
+
+  vec.Reshard(devices);
+  ASSERT_EQ(vec.DeviceSize(0), h_vec.size());
+  ASSERT_EQ(vec.Size(), h_vec.size());
+  auto span = vec.DeviceSpan(0);  // sync to device
+
+  vec.Reshard(GPUSet::Empty());  // pull back to cpu, empty devices.
+  ASSERT_EQ(vec.Size(), h_vec.size());
+  ASSERT_TRUE(vec.Devices().IsEmpty());
+
+  auto h_vec_1 = vec.HostVector();
+  ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin()));
+}
+
+TEST(HostDeviceVector, Span) {  // device spans must agree with DeviceSize/DevicePointer
+  HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f};
+  vec.Reshard(GPUSet{0, 1});
+  auto span = vec.DeviceSpan(0);
+  ASSERT_EQ(vec.DeviceSize(0), span.size());
+  ASSERT_EQ(vec.DevicePointer(0), span.data());
+  auto const_span = vec.ConstDeviceSpan(0);  // checked itself below, not via `span`
+  ASSERT_EQ(vec.DeviceSize(0), const_span.size());
+  ASSERT_EQ(vec.ConstDevicePointer(0), const_span.data());
+}
+
+// Multi-GPU tests (compiled only when XGBOOST_USE_NCCL is defined).
+#if defined(XGBOOST_USE_NCCL)
+TEST(HostDeviceVector, MGPU_Reshard) {
  auto devices = GPUSet::AllVisible();
+  if (devices.Size() < 2) {
+    LOG(WARNING) << "Not testing in multi-gpu environment.";
+    return;
+  }
+
+  std::vector<int> h_vec (2345);
+  for (size_t i = 0; i < h_vec.size(); ++i) {
+    h_vec[i] = i;
+  }
+  HostDeviceVector<int> vec (h_vec);
+
+  // Data size for each device.
  std::vector<size_t> devices_size (devices.Size());

  // From CPU to GPUs.
-  // Assuming we have > 1 devices.
  vec.Reshard(devices);
  size_t total_size = 0;
  for (size_t i = 0; i < devices.Size(); ++i) {
@@ -198,42 +237,26 @@ TEST(HostDeviceVector, Reshard) {
  }
  ASSERT_EQ(total_size, h_vec.size());
  ASSERT_EQ(total_size, vec.Size());
-  auto h_vec_1 = vec.HostVector();

-  ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin()));
-  vec.Reshard(GPUSet::Empty()); // clear out devices memory
+  // Reshard from devices to devices with different distribution.
+  EXPECT_ANY_THROW(
+      vec.Reshard(GPUDistribution::Granular(devices, 12)));

-  // Shrink down the number of devices.
-  vec.Reshard(GPUSet::Range(0, 1));
+  // All data is drawn back to CPU
+  vec.Reshard(GPUSet::Empty());
+  ASSERT_TRUE(vec.Devices().IsEmpty());
  ASSERT_EQ(vec.Size(), h_vec.size());
-  ASSERT_EQ(vec.DeviceSize(0), h_vec.size());
-  h_vec_1 = vec.HostVector();
-  ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin()));
-  vec.Reshard(GPUSet::Empty()); // clear out devices memory

-  // Grow the number of devices.
-  vec.Reshard(devices);
+  vec.Reshard(GPUDistribution::Granular(devices, 12));
  total_size = 0;
  for (size_t i = 0; i < devices.Size(); ++i) {
    total_size += vec.DeviceSize(i);
-    ASSERT_EQ(devices_size[i], vec.DeviceSize(i));
+    devices_size[i] = vec.DeviceSize(i);
  }
  ASSERT_EQ(total_size, h_vec.size());
  ASSERT_EQ(total_size, vec.Size());
-  h_vec_1 = vec.HostVector();
-  ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin()));
-}
-
-TEST(HostDeviceVector, Span) {
-  HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f};
-  vec.Reshard(GPUSet{0, 1});
-  auto span = vec.DeviceSpan(0);
-  ASSERT_EQ(vec.Size(), span.size());
-  ASSERT_EQ(vec.DevicePointer(0), span.data());
-  auto const_span = vec.ConstDeviceSpan(0);
-  ASSERT_EQ(vec.Size(), span.size());
-  ASSERT_EQ(vec.ConstDevicePointer(0), span.data());
 }
+#endif

 }  // namespace common
 }  // namespace xgboost
--- a/tests/cpp/common/test_span.h
+++ b/tests/cpp/common/test_span.h
@@ -7,6 +7,14 @@
 #include "../../include/xgboost/base.h"
 #include "../../../src/common/span.h"

+template <typename Iter>
+XGBOOST_DEVICE void InitializeRange(Iter _begin, Iter _end) {
+  float j = 0;
+  for (Iter i = _begin; i != _end; ++i, ++j) {
+    *i = j;
+  }
+}
+
 namespace xgboost {
 namespace common {

@@ -20,14 +28,6 @@ namespace common {
    *(status) = -1;                             \
  }

-template <typename Iter>
-XGBOOST_DEVICE void InitializeRange(Iter _begin, Iter _end) {
-  float j = 0;
-  for (Iter i = _begin; i != _end; ++i, ++j) {
-    *i = j;
-  }
-}
-
 struct TestTestStatus {
  int * status_;

--- a/tests/cpp/common/test_transform_range.cc
+++ b/tests/cpp/common/test_transform_range.cc
@@ -0,0 +1,61 @@
+#include <xgboost/base.h>
+#include <gtest/gtest.h>
+#include <vector>
+
+#include "../../../src/common/host_device_vector.h"
+#include "../../../src/common/transform.h"
+#include "../../../src/common/span.h"
+#include "../helpers.h"
+
+#if defined(__CUDACC__)
+
+#define TRANSFORM_GPU_RANGE GPUSet::Range(0, 1)
+#define TRANSFORM_GPU_DIST GPUDistribution::Block(GPUSet::Range(0, 1))
+
+#else
+
+#define TRANSFORM_GPU_RANGE GPUSet::Empty()
+#define TRANSFORM_GPU_DIST GPUDistribution::Block(GPUSet::Empty())
+
+#endif
+
+template <typename Iter>
+void InitializeRange(Iter _begin, Iter _end) {
+  float j = 0;
+  for (Iter i = _begin; i != _end; ++i, ++j) {
+    *i = j;
+  }
+}
+
+namespace xgboost {
+namespace common {
+
+template <typename T>
+struct TestTransformRange {
+  void XGBOOST_DEVICE operator()(size_t _idx,
+                                 Span<bst_float> _out, Span<const bst_float> _in) {
+    _out[_idx] = _in[_idx];
+  }
+};
+
+TEST(Transform, DeclareUnifiedTest(Basic)) {
+  const size_t size {256};
+  std::vector<bst_float> h_in(size);
+  std::vector<bst_float> h_out(size);
+  InitializeRange(h_in.begin(), h_in.end());
+  std::vector<bst_float> h_sol(size);
+  InitializeRange(h_sol.begin(), h_sol.end());
+
+  const HostDeviceVector<bst_float> in_vec{h_in, TRANSFORM_GPU_DIST};
+  HostDeviceVector<bst_float> out_vec{h_out, TRANSFORM_GPU_DIST};
+  out_vec.Fill(0);
+
+  Transform<>::Init(TestTransformRange<bst_float>{}, Range{0, size}, TRANSFORM_GPU_RANGE)
+      .Eval(&out_vec, &in_vec);
+  std::vector<bst_float> res = out_vec.HostVector();
+
+  ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin()));
+}
+
+} // namespace common
+} // namespace xgboost
--- a/tests/cpp/common/test_transform_range.cu
+++ b/tests/cpp/common/test_transform_range.cu
@@ -0,0 +1,43 @@
+// Including the .cc file here compiles all of its tests as CUDA, so they use the GPU code path.
+#include "test_transform_range.cc"
+
+#if defined(XGBOOST_USE_NCCL)
+namespace xgboost {
+namespace common {
+
+// This test is multi-GPU specific: it requires more than one visible device.
+TEST(Transform, MGPU_Basic) {
+  auto devices = GPUSet::AllVisible();
+  CHECK_GT(devices.Size(), 1);
+  const size_t size {256};
+  std::vector<bst_float> h_in(size);
+  std::vector<bst_float> h_out(size);
+  InitializeRange(h_in.begin(), h_in.end());
+  std::vector<bst_float> h_sol(size);
+  InitializeRange(h_sol.begin(), h_sol.end());
+
+  const HostDeviceVector<bst_float> in_vec {h_in,
+        GPUDistribution::Block(GPUSet::Empty())};
+  HostDeviceVector<bst_float> out_vec {h_out,
+        GPUDistribution::Block(GPUSet::Empty())};
+  out_vec.Fill(0);
+
+  in_vec.Reshard(GPUDistribution::Granular(devices, 8));
+  out_vec.Reshard(GPUDistribution::Block(devices));
+
+  // Granularity differs between in_vec (Granular, 8) and out_vec (Block), so resharding will throw.
+  EXPECT_ANY_THROW(
+      Transform<>::Init(TestTransformRange<bst_float>{}, Range{0, size}, devices)
+      .Eval(&out_vec, &in_vec));
+
+
+  Transform<>::Init(TestTransformRange<bst_float>{}, Range{0, size},
+                    devices, false).Eval(&out_vec, &in_vec);
+  std::vector<bst_float> res = out_vec.HostVector();
+
+  ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin()));
+}
+
+}  // namespace common
+}  // namespace xgboost
+#endif