Make HostDeviceVector single gpu only (#4773)
* Make HostDeviceVector single gpu only
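In summary: the Transform evaluator drops its multi-GPU plumbing (GPUSet, GPUDistribution, and the per-device OpenMP launch loop) in favour of a single integer device ID, with a negative ID selecting the CPU path.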
@@ -57,14 +57,10 @@ class Transform {
   template <typename Functor>
   struct Evaluator {
    public:
-    Evaluator(Functor func, Range range, GPUSet devices, bool shard) :
+    Evaluator(Functor func, Range range, int device, bool shard) :
         func_(func), range_{std::move(range)},
         shard_{shard},
-        distribution_{GPUDistribution::Block(devices)} {}
-    Evaluator(Functor func, Range range, GPUDistribution dist,
-              bool shard) :
-        func_(func), range_{std::move(range)}, shard_{shard},
-        distribution_{std::move(dist)} {}
+        device_{device} {}

     /*!
      * \brief Evaluate the functor with input pointers to HostDeviceVector.
@@ -74,7 +70,7 @@ class Transform {
      */
     template <typename... HDV>
     void Eval(HDV... vectors) const {
-      bool on_device = !distribution_.IsEmpty();
+      bool on_device = device_ >= 0;

       if (on_device) {
         LaunchCUDA(func_, vectors...);
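With a single device ordinal, the host/device dispatch in Eval() reduces to a sign check. Below is a minimal sketch of that convention, assuming (as the diff implies) that a negative ID selects the CPU path; LaunchCPU and LaunchCUDA here are stand-ins, not the real member functions:

#include <iostream>

// Illustrative stand-ins for the real host and device code paths.
void LaunchCPU()  { std::cout << "CPU path\n"; }
void LaunchCUDA() { std::cout << "GPU path\n"; }

// Mirrors the patched Eval(): one signed device ordinal replaces the old
// GPUSet/GPUDistribution query, with a negative ordinal meaning "CPU".
void Eval(int device) {
  bool on_device = device >= 0;
  if (on_device) {
    LaunchCUDA();
  } else {
    LaunchCPU();
  }
}

int main() {
  Eval(-1);  // runs the CPU path
  Eval(0);   // runs on GPU 0
  return 0;
}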
@@ -86,13 +82,13 @@ class Transform {
    private:
     // CUDA UnpackHDV
     template <typename T>
-    Span<T> UnpackHDV(HostDeviceVector<T>* _vec, int _device) const {
-      auto span = _vec->DeviceSpan(_device);
+    Span<T> UnpackHDVOnDevice(HostDeviceVector<T>* _vec) const {
+      auto span = _vec->DeviceSpan();
       return span;
     }
     template <typename T>
-    Span<T const> UnpackHDV(const HostDeviceVector<T>* _vec, int _device) const {
-      auto span = _vec->ConstDeviceSpan(_device);
+    Span<T const> UnpackHDVOnDevice(const HostDeviceVector<T>* _vec) const {
+      auto span = _vec->ConstDeviceSpan();
       return span;
     }
     // CPU UnpackHDV
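UnpackHDVOnDevice keeps the pair-of-overloads pattern: the constness of the HostDeviceVector pointer decides whether the caller gets a writable Span<T> or a read-only Span<T const>. A self-contained sketch of the pattern, with std::vector standing in for HostDeviceVector and UnpackOnDevice as an illustrative name:

#include <cstddef>
#include <vector>

// Minimal stand-in for xgboost's Span: a non-owning view over a buffer.
template <typename T>
struct Span {
  T* data;
  std::size_t size;
};

// Mutable overload: a non-const vector yields a writable span.
template <typename T>
Span<T> UnpackOnDevice(std::vector<T>* vec) {
  return Span<T>{vec->data(), vec->size()};
}

// Const overload: a const vector yields a read-only span, so a kernel
// cannot accidentally write into an input-only buffer.
template <typename T>
Span<T const> UnpackOnDevice(std::vector<T> const* vec) {
  return Span<T const>{vec->data(), vec->size()};
}

int main() {
  std::vector<float> out(4);
  std::vector<float> const in(4, 1.0f);
  Span<float> w = UnpackOnDevice(&out);       // writable view
  Span<float const> r = UnpackOnDevice(&in);  // read-only view
  (void)w; (void)r;
  return 0;
}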
@@ -108,15 +104,15 @@ class Transform {
     }
     // Recursive unpack for Shard.
     template <typename T>
-    void UnpackShard(GPUDistribution dist, const HostDeviceVector<T> *vector) const {
-      vector->Shard(dist);
+    void UnpackShard(int device, const HostDeviceVector<T> *vector) const {
+      vector->SetDevice(device);
     }
     template <typename Head, typename... Rest>
-    void UnpackShard(GPUDistribution dist,
+    void UnpackShard(int device,
                      const HostDeviceVector<Head> *_vector,
                      const HostDeviceVector<Rest> *... _vectors) const {
-      _vector->Shard(dist);
-      UnpackShard(dist, _vectors...);
+      _vector->SetDevice(device);
+      UnpackShard(device, _vectors...);
     }

 #if defined(__CUDACC__)
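UnpackShard applies SetDevice to every vector in a parameter pack via the classic two-overload recursion: a single-argument base case plus a head/rest recursive case. A runnable sketch of the same shape, with std::vector standing in for HostDeviceVector and SetDeviceAll as a hypothetical name:

#include <iostream>
#include <vector>

// Base case: a single container is assigned to the device.
template <typename T>
void SetDeviceAll(int device, std::vector<T> const* vec) {
  std::cout << "vector of " << vec->size() << " elements -> device "
            << device << '\n';
}

// Recursive case: peel the head off the parameter pack, handle it, then
// recurse on the rest -- the same shape as the patched UnpackShard().
template <typename Head, typename... Rest>
void SetDeviceAll(int device, std::vector<Head> const* head,
                  std::vector<Rest> const*... rest) {
  SetDeviceAll(device, head);
  SetDeviceAll(device, rest...);
}

int main() {
  std::vector<float> predictions(8);
  std::vector<int> labels(8);
  SetDeviceAll(0, &predictions, &labels);  // visits each vector in turn
  return 0;
}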
@@ -124,28 +120,20 @@ class Transform {
               typename... HDV>
     void LaunchCUDA(Functor _func, HDV*... _vectors) const {
       if (shard_)
-        UnpackShard(distribution_, _vectors...);
+        UnpackShard(device_, _vectors...);

-      GPUSet devices = distribution_.Devices();
       size_t range_size = *range_.end() - *range_.begin();

       // Extract index to deal with possible old OpenMP.
-      size_t device_beg = *(devices.begin());
-      size_t device_end = *(devices.end());
-#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1)
-      for (omp_ulong device = device_beg; device < device_end; ++device) {  // NOLINT
-        // Ignore other attributes of GPUDistribution for spliting index.
-        // This deals with situation like multi-class setting where
-        // granularity is used in data vector.
-        size_t shard_size = GPUDistribution::Block(devices).ShardSize(
-            range_size, devices.Index(device));
-        Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
-        dh::safe_cuda(cudaSetDevice(device));
-        const int GRID_SIZE =
-            static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
-        detail::LaunchCUDAKernel<<<GRID_SIZE, kBlockThreads>>>(
-            _func, shard_range, UnpackHDV(_vectors, device)...);
-      }
+      // This deals with situation like multi-class setting where
+      // granularity is used in data vector.
+      size_t shard_size = range_size;
+      Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
+      dh::safe_cuda(cudaSetDevice(device_));
+      const int GRID_SIZE =
+          static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
+      detail::LaunchCUDAKernel<<<GRID_SIZE, kBlockThreads>>>(
+          _func, shard_range, UnpackHDVOnDevice(_vectors)...);
     }
 #else
     /*! \brief Dummy funtion defined when compiling for CPU. */
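The grid-size computation survives the rewrite unchanged: DivRoundUp is, presumably, a ceiling division, so GRID_SIZE * kBlockThreads always covers the whole range. A sketch under that assumption (kBlockThreads = 256 is illustrative; the real constant lives in the class):

#include <cstddef>
#include <iostream>

// Ceiling division, matching how DivRoundUp is used above: launch enough
// fixed-size blocks to cover every index in the range.
std::size_t DivRoundUp(std::size_t n, std::size_t divisor) {
  return (n + divisor - 1) / divisor;
}

int main() {
  constexpr std::size_t kBlockThreads = 256;  // threads per block
  std::size_t range_end = 1000;               // one past the last index

  // 1000 indices / 256 threads -> 4 blocks (1024 threads total), so the
  // kernel body must still bounds-check each thread against the range.
  int grid_size = static_cast<int>(DivRoundUp(range_end, kBlockThreads));
  std::cout << "grid size: " << grid_size << '\n';
  return 0;
}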
@@ -172,7 +160,7 @@ class Transform {
     Range range_;
     /*! \brief Whether sharding for vectors is required. */
     bool shard_;
-    GPUDistribution distribution_;
+    int device_;
   };

  public:
@@ -191,15 +179,9 @@ class Transform {
    */
   template <typename Functor>
   static Evaluator<Functor> Init(Functor func, Range const range,
-                                 GPUSet const devices,
+                                 int device,
                                  bool const shard = true) {
-    return Evaluator<Functor> {func, std::move(range), std::move(devices), shard};
-  }
-  template <typename Functor>
-  static Evaluator<Functor> Init(Functor func, Range const range,
-                                 GPUDistribution const dist,
-                                 bool const shard = true) {
-    return Evaluator<Functor> {func, std::move(range), std::move(dist), shard};
+    return Evaluator<Functor> {func, std::move(range), device, shard};
   }
 };
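For callers, the two Init overloads collapse into one that takes a plain device ordinal. A pared-down, compilable mirror of the new entry point, with Range and Evaluator stubbed and the functor purely illustrative (this is a sketch of the caller-visible shape, not the real class):

#include <utility>

// Stubbed stand-ins for the real Range and Evaluator types.
struct Range {
  long begin_;
  long end_;
};

template <typename Functor>
struct Evaluator {
  Functor func_;
  Range range_;
  int device_;
  bool shard_;
};

// Mirrors the patched Transform::Init(): a single device ordinal replaces
// the old GPUSet/GPUDistribution overloads.
template <typename Functor>
Evaluator<Functor> Init(Functor func, Range const range,
                        int device, bool const shard = true) {
  return Evaluator<Functor>{func, std::move(range), device, shard};
}

int main() {
  auto square = [] (float x) { return x * x; };
  // Old call sites passed a GPUSet or GPUDistribution; now a single
  // ordinal selects the device, with -1 meaning CPU per Eval()'s check.
  auto eval = Init(square, Range{0, 100}, /*device=*/0);
  (void)eval;
  return 0;
}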