Optimized BuildHist function (#5156)

2020-01-30 10:32:57 +03:00
parent 4240daed4e
commit c67163250e
8 changed files with 610 additions and 184 deletions
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -659,13 +659,59 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
  }
 }

+/*!
+ * \brief fill a histogram by zeroes
+ */
+void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end) {
+  memset(hist.data() + begin, '\0', (end-begin)*sizeof(tree::GradStats));
+}
+
+/*!
+ * \brief Increment hist as dst += add in range [begin, end)
+ */
+void IncrementHist(GHistRow dst, const GHistRow add, size_t begin, size_t end) {
+  using FPType = decltype(tree::GradStats::sum_grad);
+  FPType* pdst = reinterpret_cast<FPType*>(dst.data());
+  const FPType* padd = reinterpret_cast<const FPType*>(add.data());
+
+  for (size_t i = 2 * begin; i < 2 * end; ++i) {
+    pdst[i] += padd[i];
+  }
+}
+
+/*!
+ * \brief Copy hist from src to dst in range [begin, end)
+ */
+void CopyHist(GHistRow dst, const GHistRow src, size_t begin, size_t end) {
+  using FPType = decltype(tree::GradStats::sum_grad);
+  FPType* pdst = reinterpret_cast<FPType*>(dst.data());
+  const FPType* psrc = reinterpret_cast<const FPType*>(src.data());
+
+  for (size_t i = 2 * begin; i < 2 * end; ++i) {
+    pdst[i] = psrc[i];
+  }
+}
+
+/*!
+ * \brief Compute Subtraction: dst = src1 - src2 in range [begin, end)
+ */
+void SubtractionHist(GHistRow dst, const GHistRow src1, const GHistRow src2,
+                     size_t begin, size_t end) {
+  using FPType = decltype(tree::GradStats::sum_grad);
+  FPType* pdst = reinterpret_cast<FPType*>(dst.data());
+  const FPType* psrc1 = reinterpret_cast<const FPType*>(src1.data());
+  const FPType* psrc2 = reinterpret_cast<const FPType*>(src2.data());
+
+  for (size_t i = 2 * begin; i < 2 * end; ++i) {
+    pdst[i] = psrc1[i] - psrc2[i];
+  }
+}
+
+
 void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
                             const RowSetCollection::Elem row_indices,
                             const GHistIndexMatrix& gmat,
                             GHistRow hist) {
-  const size_t nthread = static_cast<size_t>(this->nthread_);
-  data_.resize(nbins_ * nthread_);
-
  const size_t* rid =  row_indices.begin;
  const size_t nrows = row_indices.Size();
  const uint32_t* index = gmat.index.data();
@@ -673,79 +719,27 @@ void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
  const float* pgh = reinterpret_cast<const float*>(gpair.data());

  double* hist_data = reinterpret_cast<double*>(hist.data());
-  double* data = reinterpret_cast<double*>(data_.data());
-
-  const size_t block_size = 512;
-  size_t n_blocks = nrows/block_size;
-  n_blocks += !!(nrows - n_blocks*block_size);
-
-  const size_t nthread_to_process = std::min(nthread,  n_blocks);
-  memset(thread_init_.data(), '\0', nthread_to_process*sizeof(size_t));

  const size_t cache_line_size = 64;
  const size_t prefetch_offset = 10;
  size_t no_prefetch_size = prefetch_offset + cache_line_size/sizeof(*rid);
  no_prefetch_size = no_prefetch_size > nrows ? nrows : no_prefetch_size;

-#pragma omp parallel for num_threads(nthread_to_process) schedule(guided)
-  for (bst_omp_uint iblock = 0; iblock < n_blocks; iblock++) {
-    dmlc::omp_uint tid = omp_get_thread_num();
-    double* data_local_hist = ((nthread_to_process == 1) ? hist_data :
-                               reinterpret_cast<double*>(data_.data() + tid * nbins_));
+  for (size_t i = 0; i < nrows; ++i) {
+    const size_t icol_start = row_ptr[rid[i]];
+    const size_t icol_end = row_ptr[rid[i]+1];

-    if (!thread_init_[tid]) {
-      memset(data_local_hist, '\0', 2*nbins_*sizeof(double));
-      thread_init_[tid] = true;
+    if (i < nrows - no_prefetch_size) {
+      PREFETCH_READ_T0(row_ptr + rid[i + prefetch_offset]);
+      PREFETCH_READ_T0(pgh + 2*rid[i + prefetch_offset]);
    }

-    const size_t istart = iblock*block_size;
-    const size_t iend = (((iblock+1)*block_size > nrows) ? nrows : istart + block_size);
-    for (size_t i = istart; i < iend; ++i) {
-      const size_t icol_start = row_ptr[rid[i]];
-      const size_t icol_end = row_ptr[rid[i]+1];
+    for (size_t j = icol_start; j < icol_end; ++j) {
+      const uint32_t idx_bin = 2*index[j];
+      const size_t idx_gh = 2*rid[i];

-      if (i < nrows - no_prefetch_size) {
-        PREFETCH_READ_T0(row_ptr + rid[i + prefetch_offset]);
-        PREFETCH_READ_T0(pgh + 2*rid[i + prefetch_offset]);
-      }
-
-      for (size_t j = icol_start; j < icol_end; ++j) {
-        const uint32_t idx_bin = 2*index[j];
-        const size_t idx_gh = 2*rid[i];
-
-        data_local_hist[idx_bin] += pgh[idx_gh];
-        data_local_hist[idx_bin+1] += pgh[idx_gh+1];
-      }
-    }
-  }
-
-  if (nthread_to_process > 1) {
-    const size_t size = (2*nbins_);
-    const size_t block_size = 1024;
-    size_t n_blocks = size/block_size;
-    n_blocks += !!(size - n_blocks*block_size);
-
-    size_t n_worked_bins = 0;
-    for (size_t i = 0; i < nthread_to_process; ++i) {
-      if (thread_init_[i]) {
-        thread_init_[n_worked_bins++] = i;
-      }
-    }
-
-#pragma omp parallel for num_threads(std::min(nthread, n_blocks)) schedule(guided)
-    for (bst_omp_uint iblock = 0; iblock < n_blocks; iblock++) {
-      const size_t istart = iblock * block_size;
-      const size_t iend = (((iblock + 1) * block_size > size) ? size : istart + block_size);
-
-      const size_t bin = 2 * thread_init_[0] * nbins_;
-      memcpy(hist_data + istart, (data + bin + istart), sizeof(double) * (iend - istart));
-
-      for (size_t i_bin_part = 1; i_bin_part < n_worked_bins; ++i_bin_part) {
-        const size_t bin = 2 * thread_init_[i_bin_part] * nbins_;
-        for (size_t i = istart; i < iend; i++) {
-          hist_data[i] += data[bin + i];
-        }
-      }
+      hist_data[idx_bin] += pgh[idx_gh];
+      hist_data[idx_bin+1] += pgh[idx_gh+1];
    }
  }
 }
@@ -801,10 +795,6 @@ void GHistBuilder::BuildBlockHist(const std::vector<GradientPair>& gpair,
 }

 void GHistBuilder::SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
-  tree::GradStats* p_self = self.data();
-  tree::GradStats* p_sibling = sibling.data();
-  tree::GradStats* p_parent = parent.data();
-
  const size_t size = self.size();
  CHECK_EQ(sibling.size(), size);
  CHECK_EQ(parent.size(), size);
@@ -816,9 +806,7 @@ void GHistBuilder::SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow pa
  for (omp_ulong iblock = 0; iblock < n_blocks; ++iblock) {
    const size_t ibegin = iblock*block_size;
    const size_t iend = (((iblock+1)*block_size > size) ? size : ibegin + block_size);
-    for (bst_omp_uint bin_id = ibegin; bin_id < iend; bin_id++) {
-      p_self[bin_id].SetSubstract(p_parent[bin_id], p_sibling[bin_id]);
-    }
+    SubtractionHist(self, parent, sibling, ibegin, iend);
  }
 }

--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -14,8 +14,10 @@
 #include <algorithm>
 #include <memory>
 #include <utility>
+#include <map>

 #include "row_set.h"
+#include "threading_utils.h"
 #include "../tree/param.h"
 #include "./quantile.h"
 #include "./timer.h"
@@ -254,7 +256,7 @@ class DenseCuts  : public CutsBuilder {

 // FIXME(trivialfis): Merge this into generic cut builder.
 /*! \brief Builds the cut matrix on the GPU.
- *  
+ *
 *  \return The row stride across the entire dataset.
 */
 size_t DeviceSketch(int device,
@@ -343,13 +345,34 @@ class GHistIndexBlockMatrix {
 };

 /*!
- * \brief histogram of graident statistics for a single node.
- *  Consists of multiple GradStats, each entry showing total graident statistics
+ * \brief histogram of gradient statistics for a single node.
+ *  Consists of multiple GradStats, each entry showing total gradient statistics
 *     for that particular bin
 *  Uses global bin id so as to represent all features simultaneously
 */
 using GHistRow = Span<tree::GradStats>;

+/*!
+ * \brief fill a histogram by zeros
+ */
+void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end);
+
+/*!
+ * \brief Increment hist as dst += add in range [begin, end)
+ */
+void IncrementHist(GHistRow dst, const GHistRow add, size_t begin, size_t end);
+
+/*!
+ * \brief Copy hist from src to dst in range [begin, end)
+ */
+void CopyHist(GHistRow dst, const GHistRow src, size_t begin, size_t end);
+
+/*!
+ * \brief Compute Subtraction: dst = src1 - src2 in range [begin, end)
+ */
+void SubtractionHist(GHistRow dst, const GHistRow src1, const GHistRow src2,
+                     size_t begin, size_t end);
+
 /*!
 * \brief histogram of gradient statistics for multiple nodes
 */
@@ -372,9 +395,13 @@ class HistCollection {

  // initialize histogram collection
  void Init(uint32_t nbins) {
-    nbins_ = nbins;
+    if (nbins_ != nbins) {
+      nbins_ = nbins;
+      // quite expensive operation, so let's do this only once
+      data_.clear();
+    }
    row_ptr_.clear();
-    data_.clear();
+    n_nodes_added_ = 0;
  }

  // create an empty histogram for i-th node
@@ -385,20 +412,201 @@ class HistCollection {
    }
    CHECK_EQ(row_ptr_[nid], kMax);

-    row_ptr_[nid] = data_.size();
-    data_.resize(data_.size() + nbins_);
+    if (data_.size() < nbins_ * (nid + 1)) {
+      data_.resize(nbins_ * (nid + 1));
+    }
+
+    row_ptr_[nid] = nbins_ * n_nodes_added_;
+    n_nodes_added_++;
  }

 private:
  /*! \brief number of all bins over all features */
-  uint32_t nbins_;
+  uint32_t nbins_ = 0;
+  /*! \brief amount of active nodes in hist collection */
+  uint32_t n_nodes_added_ = 0;

  std::vector<tree::GradStats> data_;

-  /*! \brief row_ptr_[nid] locates bin for historgram of node nid */
+  /*! \brief row_ptr_[nid] locates bin for histogram of node nid */
  std::vector<size_t> row_ptr_;
 };

+/*!
+ * \brief Stores temporary histograms to compute them in parallel
+ * Supports processing multiple tree-nodes for nested parallelism
+ * Able to reduce histograms across threads in efficient way
+ */
+class ParallelGHistBuilder {
+ public:
+  void Init(size_t nbins) {
+    if (nbins != nbins_) {
+      hist_buffer_.Init(nbins);
+      nbins_ = nbins;
+    }
+  }
+
+  // Add new elements if needed, mark all hists as unused
+  // targeted_hists - already allocated hists which should contain final results after Reduce() call
+  void Reset(size_t nthreads, size_t nodes, const BlockedSpace2d& space,
+             const std::vector<GHistRow>& targeted_hists) {
+    hist_buffer_.Init(nbins_);
+    tid_nid_to_hist_.clear();
+    hist_memory_.clear();
+    threads_to_nids_map_.clear();
+
+    targeted_hists_ = targeted_hists;
+
+    CHECK_EQ(nodes, targeted_hists.size());
+
+    nodes_    = nodes;
+    nthreads_ = nthreads;
+
+    MatchThreadsToNodes(space);
+    AllocateAdditionalHistograms();
+    MatchNodeNidPairToHist();
+
+    hist_was_used_.resize(nthreads * nodes_);
+    std::fill(hist_was_used_.begin(), hist_was_used_.end(), static_cast<int>(false));
+  }
+
+  // Get specified hist, initialize hist by zeros if it wasn't used before
+  GHistRow GetInitializedHist(size_t tid, size_t nid) {
+    CHECK_LT(nid, nodes_);
+    CHECK_LT(tid, nthreads_);
+
+    size_t idx = tid_nid_to_hist_.at({tid, nid});
+    GHistRow hist = hist_memory_[idx];
+
+    if (!hist_was_used_[tid * nodes_ + nid]) {
+      InitilizeHistByZeroes(hist, 0, hist.size());
+      hist_was_used_[tid * nodes_ + nid] = static_cast<int>(true);
+    }
+
+    return hist;
+  }
+
+  // Reduce following bins (begin, end] for nid-node in dst across threads
+  void ReduceHist(size_t nid, size_t begin, size_t end) {
+    CHECK_GT(end, begin);
+    CHECK_LT(nid, nodes_);
+
+    GHistRow dst = targeted_hists_[nid];
+
+    bool is_updated = false;
+    for (size_t tid = 0; tid < nthreads_; ++tid) {
+      if (hist_was_used_[tid * nodes_ + nid]) {
+        is_updated = true;
+        const size_t idx = tid_nid_to_hist_.at({tid, nid});
+        GHistRow src = hist_memory_[idx];
+
+        if (dst.data() != src.data()) {
+          IncrementHist(dst, src, begin, end);
+        }
+      }
+    }
+    if (!is_updated) {
+      // In distributed mode - some tree nodes can be empty on local machines,
+      // So we need just set local hist by zeros in this case
+      InitilizeHistByZeroes(dst, begin, end);
+    }
+  }
+
+ protected:
+  void MatchThreadsToNodes(const BlockedSpace2d& space) {
+    const size_t space_size = space.Size();
+    const size_t chunck_size = space_size / nthreads_ + !!(space_size % nthreads_);
+
+    threads_to_nids_map_.resize(nthreads_ * nodes_, false);
+
+    for (size_t tid = 0; tid < nthreads_; ++tid) {
+      size_t begin = chunck_size * tid;
+      size_t end   = std::min(begin + chunck_size, space_size);
+
+      if (begin < space_size) {
+        size_t nid_begin = space.GetFirstDimension(begin);
+        size_t nid_end   = space.GetFirstDimension(end-1);
+
+        for (size_t nid = nid_begin; nid <= nid_end; ++nid) {
+          // true - means thread 'tid' will work to compute partial hist for node 'nid'
+          threads_to_nids_map_[tid * nodes_ + nid] = true;
+        }
+      }
+    }
+  }
+
+  void AllocateAdditionalHistograms() {
+    size_t hist_allocated_additionally = 0;
+
+    for (size_t nid = 0; nid < nodes_; ++nid) {
+      int nthreads_for_nid = 0;
+
+      for (size_t tid = 0; tid < nthreads_; ++tid) {
+        if (threads_to_nids_map_[tid * nodes_ + nid]) {
+          nthreads_for_nid++;
+        }
+      }
+
+      // In distributed mode - some tree nodes can be empty on local machines,
+      // set nthreads_for_nid to 0 in this case.
+      // In another case - allocate additional (nthreads_for_nid - 1) histograms,
+      // because one is already allocated externally (will store final result for the node).
+      hist_allocated_additionally += std::max<int>(0, nthreads_for_nid - 1);
+    }
+
+    for (size_t i = 0; i < hist_allocated_additionally; ++i) {
+      hist_buffer_.AddHistRow(i);
+    }
+  }
+
+  void MatchNodeNidPairToHist() {
+    size_t hist_total = 0;
+    size_t hist_allocated_additionally = 0;
+
+    for (size_t nid = 0; nid < nodes_; ++nid) {
+      bool first_hist = true;
+      for (size_t tid = 0; tid < nthreads_; ++tid) {
+        if (threads_to_nids_map_[tid * nodes_ + nid]) {
+          if (first_hist) {
+            hist_memory_.push_back(targeted_hists_[nid]);
+            first_hist = false;
+          } else {
+            hist_memory_.push_back(hist_buffer_[hist_allocated_additionally]);
+            hist_allocated_additionally++;
+          }
+          // map pair {tid, nid} to index of allocated histogram from hist_memory_
+          tid_nid_to_hist_[{tid, nid}] = hist_total++;
+          CHECK_EQ(hist_total, hist_memory_.size());
+        }
+      }
+    }
+  }
+
+  /*! \brief number of bins in each histogram */
+  size_t nbins_ = 0;
+  /*! \brief number of threads for parallel computation */
+  size_t nthreads_ = 0;
+  /*! \brief number of nodes which will be processed in parallel  */
+  size_t nodes_ = 0;
+  /*! \brief Buffer for additional histograms for Parallel processing  */
+  HistCollection hist_buffer_;
+  /*!
+   * \brief Marks which hists were used, it means that they should be merged.
+   * Contains only {true or false} values
+   * but 'int' is used instead of 'bool', because std::vector<bool> isn't thread safe
+   */
+  std::vector<int> hist_was_used_;
+
+  /*! \brief Buffer for additional histograms for Parallel processing  */
+  std::vector<bool> threads_to_nids_map_;
+  /*! \brief Contains histograms for final results  */
+  std::vector<GHistRow> targeted_hists_;
+  /*! \brief Allocated memory for histograms used for construction  */
+  std::vector<GHistRow> hist_memory_;
+  /*! \brief map pair {tid, nid} to index of allocated histogram from hist_memory_  */
+  std::map<std::pair<size_t, size_t>, size_t> tid_nid_to_hist_;
+};
+
 /*!
 * \brief builder for histograms of gradient statistics
 */
@@ -408,7 +616,6 @@ class GHistBuilder {
  inline void Init(size_t nthread, uint32_t nbins) {
    nthread_ = nthread;
    nbins_ = nbins;
-    thread_init_.resize(nthread_);
  }

  // construct a histogram via histogram aggregation
@@ -433,8 +640,6 @@ class GHistBuilder {
  size_t nthread_;
  /*! \brief number of all bins over all features */
  uint32_t nbins_;
-  std::vector<size_t> thread_init_;
-  std::vector<tree::GradStats> data_;
 };


--- a/src/common/threading_utils.h
+++ b/src/common/threading_utils.h
@@ -108,12 +108,19 @@ class BlockedSpace2d {

 // Wrapper to implement nested parallelism with simple omp parallel for
 template<typename Func>
-void ParallelFor2d(const BlockedSpace2d& space, Func func) {
-  const int num_blocks_in_space = static_cast<int>(space.Size());
+void ParallelFor2d(const BlockedSpace2d& space, const int nthreads, Func func) {
+  const size_t num_blocks_in_space = space.Size();

-  #pragma omp parallel for
-  for (auto i = 0; i < num_blocks_in_space; i++) {
-    func(space.GetFirstDimension(i), space.GetRange(i));
+  #pragma omp parallel num_threads(nthreads)
+  {
+    size_t tid = omp_get_thread_num();
+    size_t chunck_size = num_blocks_in_space / nthreads + !!(num_blocks_in_space % nthreads);
+
+    size_t begin = chunck_size * tid;
+    size_t end   = std::min(begin + chunck_size, num_blocks_in_space);
+    for (auto i = begin; i < end; i++) {
+      func(space.GetFirstDimension(i), space.GetRange(i));
+    }
  }
 }