Implement transform to reduce CPU/GPU code duplication. (#3643)

* Implement Transform class.
* Add tests for softmax.
* Use Transform in regression, softmax and hinge objectives, except for Cox.
* Mark old gpu objective functions deprecated.
* static_assert for softmax.
* Split up multi-gpu tests.
Author: trivialfis
Date: 2018-10-02 15:06:21 +13:00
Committed by: Rory Mitchell
Parent: 87aca8c244
Commit: d594b11f35
31 changed files with 1514 additions and 997 deletions

src/common/common.cc

@@ -1,9 +1,11 @@
 /*!
- * Copyright 2015 by Contributors
+ * Copyright 2015-2018 by Contributors
  * \file common.cc
  * \brief Enable all kinds of global variables in common.
  */
 #include <dmlc/thread_local.h>
 #include "common.h"
 #include "./random.h"
 namespace xgboost {

src/common/common.cu

@@ -11,7 +11,7 @@ int AllVisibleImpl::AllVisible() {
     // When compiled with CUDA but running on CPU only device,
     // cudaGetDeviceCount will fail.
     dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
-  } catch(const std::exception& e) {
+  } catch(const thrust::system::system_error& err) {
     return 0;
   }
   return n_visgpus;

src/common/common.h

@@ -1,5 +1,5 @@
 /*!
- * Copyright 2015 by Contributors
+ * Copyright 2015-2018 by Contributors
  * \file common.h
  * \brief Common utilities
  */
@@ -19,6 +19,13 @@
 #if defined(__CUDACC__)
 #include <thrust/system/cuda/error.h>
 #include <thrust/system_error.h>
+
+#define WITH_CUDA() true
+
+#else
+
+#define WITH_CUDA() false
+
 #endif

 namespace dh {
@@ -29,11 +36,11 @@ namespace dh {
 #define safe_cuda(ans) ThrowOnCudaError((ans), __FILE__, __LINE__)

 inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file,
-                                    int line) {
+                                 int line) {
   if (code != cudaSuccess) {
-    throw thrust::system_error(code, thrust::cuda_category(),
-                               std::string{file} + "(" +  // NOLINT
-                               std::to_string(line) + ")");
+    LOG(FATAL) << thrust::system_error(code, thrust::cuda_category(),
+                                       std::string{file} + ": " +  // NOLINT
+                                       std::to_string(line)).what();
   }
   return code;
 }
@@ -70,13 +77,13 @@ inline std::string ToString(const T& data) {
  */
 class Range {
  public:
+  using DifferenceType = int64_t;
+
   class Iterator {
     friend class Range;

    public:
-    using DifferenceType = int64_t;
-    XGBOOST_DEVICE int64_t operator*() const { return i_; }
+    XGBOOST_DEVICE DifferenceType operator*() const { return i_; }
     XGBOOST_DEVICE const Iterator &operator++() {
       i_ += step_;
       return *this;
@@ -97,8 +104,8 @@ class Range {
     XGBOOST_DEVICE void Step(DifferenceType s) { step_ = s; }

    protected:
-    XGBOOST_DEVICE explicit Iterator(int64_t start) : i_(start) {}
-    XGBOOST_DEVICE explicit Iterator(int64_t start, int step) :
+    XGBOOST_DEVICE explicit Iterator(DifferenceType start) : i_(start) {}
+    XGBOOST_DEVICE explicit Iterator(DifferenceType start, DifferenceType step) :
         i_{start}, step_{step} {}

    public:
@@ -109,9 +116,10 @@ class Range {
   XGBOOST_DEVICE Iterator begin() const { return begin_; }  // NOLINT
   XGBOOST_DEVICE Iterator end() const { return end_; }      // NOLINT

-  XGBOOST_DEVICE Range(int64_t begin, int64_t end)
+  XGBOOST_DEVICE Range(DifferenceType begin, DifferenceType end)
       : begin_(begin), end_(end) {}
-  XGBOOST_DEVICE Range(int64_t begin, int64_t end, Iterator::DifferenceType step)
+  XGBOOST_DEVICE Range(DifferenceType begin, DifferenceType end,
+                       DifferenceType step)
       : begin_(begin, step), end_(end) {}

   XGBOOST_DEVICE bool operator==(const Range& other) const {
@@ -121,9 +129,7 @@ class Range {
     return !(*this == other);
   }

-  XGBOOST_DEVICE void Step(Iterator::DifferenceType s) { begin_.Step(s); }
-
-  XGBOOST_DEVICE Iterator::DifferenceType GetStep() const { return begin_.step_; }
+  XGBOOST_DEVICE void Step(DifferenceType s) { begin_.Step(s); }

  private:
   Iterator begin_;
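With GetStep() removed and DifferenceType hoisted onto Range itself, the refactored interface reads as follows. A minimal sketch (the concrete numbers are illustrative, and it assumes iterator equality compares the current index, so the range length should divide evenly by the step):

    xgboost::common::Range r {0, 8};
    r.Step(2);
    for (auto i : r) {
      // i has type Range::DifferenceType (int64_t); visits 0, 2, 4, 6
    }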

src/common/device_helpers.cuh

@@ -9,6 +9,7 @@
 #include <xgboost/logging.h>

 #include "common.h"
+#include "span.h"

 #include <algorithm>
 #include <chrono>
@@ -955,7 +956,7 @@ class SaveCudaContext {
     // cudaGetDevice will fail.
     try {
       safe_cuda(cudaGetDevice(&saved_device_));
-    } catch (thrust::system::system_error & err) {
+    } catch (const thrust::system::system_error & err) {
       saved_device_ = -1;
     }
     func();
@@ -1035,4 +1036,22 @@ ReduceT ReduceShards(std::vector<ShardT> *shards, FunctionT f) {
   };
   return std::accumulate(sums.begin(), sums.end(), ReduceT());
 }
+
+template <typename T,
+          typename IndexT = typename xgboost::common::Span<T>::index_type>
+xgboost::common::Span<T> ToSpan(
+    thrust::device_vector<T>& vec,
+    IndexT offset = 0,
+    IndexT size = -1) {
+  size = size == -1 ? vec.size() : size;
+  CHECK_LE(offset + size, vec.size());
+  return {vec.data().get() + offset, static_cast<IndexT>(size)};
+}
+
+template <typename T>
+xgboost::common::Span<T> ToSpan(thrust::device_vector<T>& vec,
+                                size_t offset, size_t size) {
+  using IndexT = typename xgboost::common::Span<T>::index_type;
+  return ToSpan(vec, static_cast<IndexT>(offset), static_cast<IndexT>(size));
+}
 }  // namespace dh
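The new ToSpan helpers give a non-owning xgboost::common::Span view over a thrust::device_vector, so kernels can take Span arguments regardless of how the memory is held. A minimal usage sketch (vector name and sizes are illustrative, not from the commit):

    thrust::device_vector<float> data(16);
    auto whole = dh::ToSpan(data);        // view of all 16 elements
    auto tail  = dh::ToSpan(data, 8, 8);  // view of elements [8, 16)

The size_t overload simply casts offset and size to Span's index_type and forwards to the primary template.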

src/common/host_device_vector.cu

@@ -116,6 +116,7 @@ struct HostDeviceVectorImpl {
     int ndevices = vec_->distribution_.devices_.Size();
     start_ = vec_->distribution_.ShardStart(new_size, index_);
     proper_size_ = vec_->distribution_.ShardProperSize(new_size, index_);
+    // The size on this device.
     size_t size_d = vec_->distribution_.ShardSize(new_size, index_);
     SetDevice();
     data_.resize(size_d);
@@ -230,7 +231,7 @@ struct HostDeviceVectorImpl {
     CHECK(devices.Contains(device));
     LazySyncDevice(device, GPUAccess::kWrite);
     return {shards_[devices.Index(device)].data_.data().get(),
-        static_cast<typename common::Span<T>::index_type>(DeviceSize(device))};
+            static_cast<typename common::Span<T>::index_type>(DeviceSize(device))};
   }

   common::Span<const T> ConstDeviceSpan(int device) {
@@ -238,7 +239,7 @@ struct HostDeviceVectorImpl {
     CHECK(devices.Contains(device));
     LazySyncDevice(device, GPUAccess::kRead);
     return {shards_[devices.Index(device)].data_.data().get(),
-        static_cast<typename common::Span<const T>::index_type>(DeviceSize(device))};
+            static_cast<typename common::Span<const T>::index_type>(DeviceSize(device))};
   }

   size_t DeviceSize(int device) {
@@ -289,7 +290,6 @@ struct HostDeviceVectorImpl {
                               data_h_.size() * sizeof(T),
                               cudaMemcpyHostToDevice));
     } else {
-      //
       dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.GatherTo(begin); });
     }
   }
@@ -304,14 +304,20 @@ struct HostDeviceVectorImpl {
   void Copy(HostDeviceVectorImpl<T>* other) {
     CHECK_EQ(Size(), other->Size());
+    // Data is on host.
     if (perm_h_.CanWrite() && other->perm_h_.CanWrite()) {
       std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin());
-    } else {
-      CHECK(distribution_ == other->distribution_);
-      dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
-          shard.Copy(&other->shards_[i]);
-        });
+      return;
     }
+    // Data is on device.
+    if (distribution_ != other->distribution_) {
+      distribution_ = GPUDistribution();
+      Reshard(other->Distribution());
+      size_d_ = other->size_d_;
+    }
+    dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
+        shard.Copy(&other->shards_[i]);
+      });
   }

   void Copy(const std::vector<T>& other) {
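The practical effect of the rewrite: copying between vectors whose GPU distributions differ no longer trips the old CHECK; the destination is resharded to match the source first. A hedged sketch, assuming the public HostDeviceVector::Copy overload forwards here and the distribution-taking constructor of this era's API:

    // `devices` stands in for a non-empty GPUSet obtained elsewhere.
    xgboost::HostDeviceVector<float> src(8, 1.0f,
                                         xgboost::GPUDistribution::Block(devices));
    xgboost::HostDeviceVector<float> dst(8);  // host-resident, default distribution
    dst.Copy(src);  // dst is resharded to src's distribution, then each
                    // shard copies on its own device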

src/common/host_device_vector.h

@@ -111,8 +111,11 @@ class GPUDistribution {
   }

   friend bool operator==(const GPUDistribution& a, const GPUDistribution& b) {
-    return a.devices_ == b.devices_ && a.granularity_ == b.granularity_ &&
-      a.overlap_ == b.overlap_ && a.offsets_ == b.offsets_;
+    bool const res = a.devices_ == b.devices_ &&
+                     a.granularity_ == b.granularity_ &&
+                     a.overlap_ == b.overlap_ &&
+                     a.offsets_ == b.offsets_;
+    return res;
   }

   friend bool operator!=(const GPUDistribution& a, const GPUDistribution& b) {

src/common/math.h

@@ -11,6 +11,7 @@
 #include <vector>
 #include <cmath>
 #include <algorithm>
+#include <utility>

 #include "avx_helpers.h"

 namespace xgboost {
@@ -29,22 +30,31 @@ inline avx::Float8 Sigmoid(avx::Float8 x) {
 }

 /*!
- * \brief do inplace softmax transformaton on p_rec
- * \param p_rec the input/output vector of the values.
+ * \brief Do inplace softmax transformation from start to end
+ *
+ * \tparam Iterator Input iterator type
+ *
+ * \param start Start iterator of input
+ * \param end   End iterator of input
  */
-inline void Softmax(std::vector<float>* p_rec) {
-  std::vector<float> &rec = *p_rec;
-  float wmax = rec[0];
-  for (size_t i = 1; i < rec.size(); ++i) {
-    wmax = std::max(rec[i], wmax);
+template <typename Iterator>
+XGBOOST_DEVICE inline void Softmax(Iterator start, Iterator end) {
+  static_assert(std::is_same<bst_float,
+                typename std::remove_reference<
+                  decltype(std::declval<Iterator>().operator*())>::type
+                >::value,
+                "Values should be of type bst_float");
+  bst_float wmax = *start;
+  for (Iterator i = start+1; i != end; ++i) {
+    wmax = fmaxf(*i, wmax);
   }
   double wsum = 0.0f;
-  for (float & elem : rec) {
-    elem = std::exp(elem - wmax);
-    wsum += elem;
+  for (Iterator i = start; i != end; ++i) {
+    *i = expf(*i - wmax);
+    wsum += *i;
   }
-  for (float & elem : rec) {
-    elem /= static_cast<float>(wsum);
+  for (Iterator i = start; i != end; ++i) {
+    *i /= static_cast<float>(wsum);
   }
 }
@@ -56,7 +66,7 @@ inline void Softmax(std::vector<float>* p_rec) {
  * \tparam Iterator The type of the iterator.
  */
 template<typename Iterator>
-inline Iterator FindMaxIndex(Iterator begin, Iterator end) {
+XGBOOST_DEVICE inline Iterator FindMaxIndex(Iterator begin, Iterator end) {
   Iterator maxit = begin;
   for (Iterator it = begin; it != end; ++it) {
     if (*it > *maxit) maxit = it;
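With Softmax now an XGBOOST_DEVICE template over iterators, one body serves both the CPU objectives and the CUDA kernels. A host-side usage sketch (values illustrative; note the static_assert calls a member operator*, so it requires class-type iterators over bst_float, not raw pointers):

    std::vector<xgboost::bst_float> rec {1.0f, 2.0f, 3.0f};
    xgboost::common::Softmax(rec.begin(), rec.end());
    // rec now holds expf(x - wmax) / wsum: probabilities summing to 1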

src/common/span.h

@@ -49,7 +49,7 @@
  *
  * https://github.com/Microsoft/GSL/pull/664
  *
- * FIXME: Group these MSVC workarounds into a manageable place.
+ * TODO(trivialfis): Group these MSVC workarounds into a manageable place.
  */
 #if defined(_MSC_VER) && _MSC_VER < 1910
@@ -68,7 +68,7 @@ namespace xgboost {
 namespace common {

 // Usual logging facility is not available inside device code.
-// FIXME: Make dmlc check more generic.
+// TODO(trivialfis): Make dmlc check more generic.
 #define KERNEL_CHECK(cond) \
   do {                     \
     if (!(cond)) {         \
@@ -104,11 +104,11 @@ constexpr detail::ptrdiff_t dynamic_extent = -1;  // NOLINT
 enum class byte : unsigned char {};  // NOLINT

 namespace detail {

-template <class ElementType, detail::ptrdiff_t Extent = dynamic_extent>
+template <class ElementType, detail::ptrdiff_t Extent>
 class Span;

 namespace detail {

 template <typename SpanType, bool IsConst>
 class SpanIterator {
   using ElementType = typename SpanType::element_type;
src/common/transform.h (new file, 204 lines)

@@ -0,0 +1,204 @@
/*!
* Copyright 2018 XGBoost contributors
*/
#ifndef XGBOOST_COMMON_TRANSFORM_H_
#define XGBOOST_COMMON_TRANSFORM_H_
#include <dmlc/omp.h>
#include <xgboost/data.h>
#include <vector>
#include <type_traits> // enable_if
#include "host_device_vector.h"
#include "common.h"
#include "span.h"
#if defined (__CUDACC__)
#include "device_helpers.cuh"
#endif
namespace xgboost {
namespace common {
constexpr size_t kBlockThreads = 256;
namespace detail {
#if defined(__CUDACC__)
template <typename Functor, typename... SpanType>
__global__ void LaunchCUDAKernel(Functor _func, Range _range,
SpanType... _spans) {
for (auto i : dh::GridStrideRange(*_range.begin(), *_range.end())) {
_func(i, _spans...);
}
}
#endif
} // namespace detail
/*! \brief Do Transformation on HostDeviceVectors.
*
* \tparam CompiledWithCuda A bool parameter used to distinguish compilation
* trajectories, users do not need to use it.
*
* Note: Using Transform is a VERY tricky thing to do. Transform uses template
* argument to duplicate itself into two different types, one for CPU,
* another for CUDA. The trick is not without its flaw:
*
* If you use it in a function that can be compiled by both nvcc and host
* compiler, the behaviour is un-defined! Because your function is NOT
* duplicated by `CompiledWithCuda`. At link time, cuda compiler resolution
* will merge functions with same signature.
*/
template <bool CompiledWithCuda = WITH_CUDA()>
class Transform {
private:
template <typename Functor>
struct Evaluator {
public:
Evaluator(Functor func, Range range, GPUSet devices, bool reshard) :
func_(func), range_{std::move(range)},
distribution_{std::move(GPUDistribution::Block(devices))},
reshard_{reshard} {}
Evaluator(Functor func, Range range, GPUDistribution dist,
bool reshard) :
func_(func), range_{std::move(range)}, distribution_{std::move(dist)},
reshard_{reshard} {}
/*!
* \brief Evaluate the functor with input pointers to HostDeviceVector.
*
* \tparam HDV... HostDeviceVectors type.
* \param vectors Pointers to HostDeviceVector.
*/
template <typename... HDV>
void Eval(HDV... vectors) const {
bool on_device = !distribution_.IsEmpty();
if (on_device) {
LaunchCUDA(func_, vectors...);
} else {
LaunchCPU(func_, vectors...);
}
}
private:
// CUDA UnpackHDV
template <typename T>
Span<T> UnpackHDV(HostDeviceVector<T>* _vec, int _device) const {
return _vec->DeviceSpan(_device);
}
template <typename T>
Span<T const> UnpackHDV(const HostDeviceVector<T>* _vec, int _device) const {
return _vec->ConstDeviceSpan(_device);
}
// CPU UnpackHDV
template <typename T>
Span<T> UnpackHDV(HostDeviceVector<T>* _vec) const {
return Span<T> {_vec->HostPointer(),
static_cast<typename Span<T>::index_type>(_vec->Size())};
}
template <typename T>
Span<T const> UnpackHDV(const HostDeviceVector<T>* _vec) const {
return Span<T const> {_vec->ConstHostPointer(),
static_cast<typename Span<T>::index_type>(_vec->Size())};
}
// Recursive unpack for Reshard.
template <typename T>
void UnpackReshard(GPUDistribution dist, const HostDeviceVector<T>* vector) const {
vector->Reshard(dist);
}
template <typename Head, typename... Rest>
void UnpackReshard(GPUDistribution dist,
const HostDeviceVector<Head>* _vector,
const HostDeviceVector<Rest>*... _vectors) const {
_vector->Reshard(dist);
UnpackReshard(dist, _vectors...);
}
#if defined(__CUDACC__)
template <typename std::enable_if<CompiledWithCuda>::type* = nullptr,
typename... HDV>
void LaunchCUDA(Functor _func, HDV*... _vectors) const {
if (reshard_)
UnpackReshard(distribution_, _vectors...);
GPUSet devices = distribution_.Devices();
size_t range_size = *range_.end() - *range_.begin();
#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1)
for (omp_ulong i = 0; i < devices.Size(); ++i) {
int d = devices.Index(i);
// Ignore other attributes of GPUDistribution for spliting index.
size_t shard_size =
GPUDistribution::Block(devices).ShardSize(range_size, d);
Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
dh::safe_cuda(cudaSetDevice(d));
const int GRID_SIZE =
static_cast<int>(dh::DivRoundUp(*(range_.end()), kBlockThreads));
detail::LaunchCUDAKernel<<<GRID_SIZE, kBlockThreads>>>(
_func, shard_range, UnpackHDV(_vectors, d)...);
dh::safe_cuda(cudaGetLastError());
dh::safe_cuda(cudaDeviceSynchronize());
}
}
#else
/*! \brief Dummy funtion defined when compiling for CPU. */
template <typename std::enable_if<!CompiledWithCuda>::type* = nullptr,
typename... HDV>
void LaunchCUDA(Functor _func, HDV*... _vectors) const {
LOG(FATAL) << "Not part of device code. WITH_CUDA: " << WITH_CUDA();
}
#endif
template <typename... HDV>
void LaunchCPU(Functor func, HDV*... vectors) const {
auto end = *(range_.end());
#pragma omp parallel for schedule(static)
for (omp_ulong idx = 0; idx < end; ++idx) {
func(idx, UnpackHDV(vectors)...);
}
}
private:
/*! \brief Callable object. */
Functor func_;
/*! \brief Range object specifying parallel threads index range. */
Range range_;
/*! \brief Whether resharding for vectors is required. */
bool reshard_;
GPUDistribution distribution_;
};
public:
/*!
* \brief Initialize a Transform object.
*
* \tparam Functor A callable object type.
* \return A Evaluator having one method Eval.
*
* \param func A callable object, accepting a size_t thread index,
* followed by a set of Span classes.
* \param range Range object specifying parallel threads index range.
* \param devices GPUSet specifying GPUs to use, when compiling for CPU,
* this should be GPUSet::Empty().
* \param reshard Whether Reshard for HostDeviceVector is needed.
*/
template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range,
GPUSet const devices,
bool const reshard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(devices), reshard};
}
template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range,
GPUDistribution const dist,
bool const reshard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(dist), reshard};
}
};
} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_TRANSFORM_H_
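For orientation, a hedged sketch of how an objective might drive this class; the functor, names, and sizes here are illustrative, not lifted from the commit:

    // Eval unpacks each HostDeviceVector into a Span, so the functor takes
    // a thread index followed by one Span per vector passed to Eval.
    struct Shift {
      XGBOOST_DEVICE void operator()(size_t _idx,
                                     xgboost::common::Span<xgboost::bst_float> _preds) const {
        _preds[_idx] += 0.5f;
      }
    };

    xgboost::HostDeviceVector<xgboost::bst_float> preds(32, 0.0f);
    xgboost::common::Transform<>::Init(
        Shift{},
        xgboost::common::Range{
            0, static_cast<xgboost::common::Range::DifferenceType>(preds.Size())},
        xgboost::GPUSet::Empty())  // empty set selects the CPU branch
        .Eval(&preds);

With an empty GPUSet the block distribution is empty, so Eval takes the LaunchCPU branch and runs the functor under OpenMP; with devices present the same functor is launched through LaunchCUDAKernel on each shard.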