More explicit sharding methods for device memory (#4396)

* Rename the Reshard method to Shard

* Add a new Reshard method for sharding a vector that's already sharded
This commit is contained in:
Rong Ou
2019-04-30 16:47:23 -07:00
committed by Rory Mitchell
parent 797ba8e72d
commit eaab364a63
12 changed files with 154 additions and 77 deletions

View File

@@ -154,10 +154,13 @@ bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
}
template <typename T>
void HostDeviceVector<T>::Reshard(const GPUDistribution& distribution) const { }
void HostDeviceVector<T>::Shard(const GPUDistribution& distribution) const { }
template <typename T>
void HostDeviceVector<T>::Reshard(GPUSet devices) const { }
void HostDeviceVector<T>::Shard(GPUSet devices) const { }
template <typename T>
void Reshard(const GPUDistribution &distribution) { }
// explicit instantiations are required, as HostDeviceVector isn't header-only
template class HostDeviceVector<bst_float>;

View File

@@ -318,7 +318,7 @@ struct HostDeviceVectorImpl {
// Data is on device;
if (distribution_ != other->distribution_) {
distribution_ = GPUDistribution();
Reshard(other->Distribution());
Shard(other->Distribution());
size_d_ = other->size_d_;
}
dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
@@ -358,19 +358,24 @@ struct HostDeviceVectorImpl {
return data_h_;
}
void Reshard(const GPUDistribution& distribution) {
void Shard(const GPUDistribution& distribution) {
if (distribution_ == distribution) { return; }
CHECK(distribution_.IsEmpty() || distribution.IsEmpty());
if (distribution.IsEmpty()) {
LazySyncHost(GPUAccess::kWrite);
}
CHECK(distribution_.IsEmpty());
distribution_ = distribution;
InitShards();
}
void Reshard(GPUSet new_devices) {
void Shard(GPUSet new_devices) {
if (distribution_.Devices() == new_devices) { return; }
Reshard(GPUDistribution::Block(new_devices));
Shard(GPUDistribution::Block(new_devices));
}
void Reshard(const GPUDistribution &distribution) {
if (distribution_ == distribution) { return; }
LazySyncHost(GPUAccess::kWrite);
distribution_ = distribution;
shards_.clear();
InitShards();
}
void Resize(size_t new_size, T v) {
@@ -586,12 +591,17 @@ bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
}
template <typename T>
void HostDeviceVector<T>::Reshard(GPUSet new_devices) const {
impl_->Reshard(new_devices);
void HostDeviceVector<T>::Shard(GPUSet new_devices) const {
impl_->Shard(new_devices);
}
template <typename T>
void HostDeviceVector<T>::Reshard(const GPUDistribution& distribution) const {
void HostDeviceVector<T>::Shard(const GPUDistribution &distribution) const {
impl_->Shard(distribution);
}
template <typename T>
void HostDeviceVector<T>::Reshard(const GPUDistribution &distribution) {
impl_->Reshard(distribution);
}

View File

@@ -14,7 +14,7 @@
* Initialization/Allocation:<br/>
* One can choose to initialize the vector on CPU or GPU during constructor.
* (use the 'devices' argument) Or, can choose to use the 'Resize' method to
* allocate/resize memory explicitly, and use the 'Reshard' method
* allocate/resize memory explicitly, and use the 'Shard' method
* to specify the devices.
*
* Accessing underlying data:<br/>
@@ -98,6 +98,8 @@ class GPUDistribution {
offsets_(std::move(offsets)) {}
public:
static GPUDistribution Empty() { return GPUDistribution(); }
static GPUDistribution Block(GPUSet devices) { return GPUDistribution(devices); }
static GPUDistribution Overlap(GPUSet devices, int overlap) {
@@ -250,11 +252,15 @@ class HostDeviceVector {
/*!
* \brief Specify memory distribution.
*
* If GPUSet::Empty() is used, all data will be drawn back to CPU.
*/
void Reshard(const GPUDistribution& distribution) const;
void Reshard(GPUSet devices) const;
void Shard(const GPUDistribution &distribution) const;
void Shard(GPUSet devices) const;
/*!
* \brief Change memory distribution.
*/
void Reshard(const GPUDistribution &distribution);
void Resize(size_t new_size, T v = T());
private:

View File

@@ -57,13 +57,13 @@ class Transform {
template <typename Functor>
struct Evaluator {
public:
Evaluator(Functor func, Range range, GPUSet devices, bool reshard) :
Evaluator(Functor func, Range range, GPUSet devices, bool shard) :
func_(func), range_{std::move(range)},
reshard_{reshard},
shard_{shard},
distribution_{std::move(GPUDistribution::Block(devices))} {}
Evaluator(Functor func, Range range, GPUDistribution dist,
bool reshard) :
func_(func), range_{std::move(range)}, reshard_{reshard},
bool shard) :
func_(func), range_{std::move(range)}, shard_{shard},
distribution_{std::move(dist)} {}
/*!
@@ -106,25 +106,25 @@ class Transform {
return Span<T const> {_vec->ConstHostPointer(),
static_cast<typename Span<T>::index_type>(_vec->Size())};
}
// Recursive unpack for Reshard.
// Recursive unpack for Shard.
template <typename T>
void UnpackReshard(GPUDistribution dist, const HostDeviceVector<T>* vector) const {
vector->Reshard(dist);
void UnpackShard(GPUDistribution dist, const HostDeviceVector<T> *vector) const {
vector->Shard(dist);
}
template <typename Head, typename... Rest>
void UnpackReshard(GPUDistribution dist,
const HostDeviceVector<Head>* _vector,
const HostDeviceVector<Rest>*... _vectors) const {
_vector->Reshard(dist);
UnpackReshard(dist, _vectors...);
void UnpackShard(GPUDistribution dist,
const HostDeviceVector<Head> *_vector,
const HostDeviceVector<Rest> *... _vectors) const {
_vector->Shard(dist);
UnpackShard(dist, _vectors...);
}
#if defined(__CUDACC__)
template <typename std::enable_if<CompiledWithCuda>::type* = nullptr,
typename... HDV>
void LaunchCUDA(Functor _func, HDV*... _vectors) const {
if (reshard_)
UnpackReshard(distribution_, _vectors...);
if (shard_)
UnpackShard(distribution_, _vectors...);
GPUSet devices = distribution_.Devices();
size_t range_size = *range_.end() - *range_.begin();
@@ -170,8 +170,8 @@ class Transform {
Functor func_;
/*! \brief Range object specifying parallel threads index range. */
Range range_;
/*! \brief Whether resharding for vectors is required. */
bool reshard_;
/*! \brief Whether sharding for vectors is required. */
bool shard_;
GPUDistribution distribution_;
};
@@ -187,19 +187,19 @@ class Transform {
* \param range Range object specifying parallel threads index range.
* \param devices GPUSet specifying GPUs to use, when compiling for CPU,
* this should be GPUSet::Empty().
* \param reshard Whether Reshard for HostDeviceVector is needed.
* \param shard Whether Shard for HostDeviceVector is needed.
*/
template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range,
GPUSet const devices,
bool const reshard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(devices), reshard};
bool const shard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(devices), shard};
}
template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range,
GPUDistribution const dist,
bool const reshard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(dist), reshard};
bool const shard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(dist), shard};
}
};