From 55e645c5f56559def985c0450ca747c00a538239 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Sun, 26 May 2019 08:57:41 +0800
Subject: [PATCH] Revert hist init optimization. (#4502)

---
 src/common/hist_util.cc | 103 ++++++++++------------------------------
 1 file changed, 26 insertions(+), 77 deletions(-)

diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc
index a473f9240..1ac058645 100644
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -50,7 +50,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
   constexpr int kFactor = 8;
   std::vector<WXQSketch> sketchs;
 
-  const size_t nthread = omp_get_max_threads();
+  const int nthread = omp_get_max_threads();
 
   unsigned const nstep =
       static_cast<unsigned>((info.num_col_ + nthread - 1) / nthread);
@@ -68,85 +68,34 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
 
   // Use group index for weights?
   bool const use_group_ind = num_groups != 0 && weights.size() != info.num_row_;
-  if (use_group_ind) {
-    for (const auto &batch : p_fmat->GetRowBatches()) {
-      size_t group_ind = this->SearchGroupIndFromBaseRow(group_ptr, batch.base_rowid);
-      #pragma omp parallel num_threads(nthread) firstprivate(group_ind, use_group_ind)
-      {
-        CHECK_EQ(nthread, omp_get_num_threads());
-        auto tid = static_cast<unsigned>(omp_get_thread_num());
-        unsigned begin = std::min(nstep * tid, ncol);
-        unsigned end = std::min(nstep * (tid + 1), ncol);
-
-        // do not iterate if no columns are assigned to the thread
-        if (begin < end && end <= ncol) {
-          for (size_t i = 0; i < batch.Size(); ++i) { // NOLINT(*)
-            size_t const ridx = batch.base_rowid + i;
-            SparsePage::Inst const inst = batch[i];
-            if (group_ptr[group_ind] == ridx &&
-                // maximum equals to weights.size() - 1
-                group_ind < num_groups - 1) {
-              // move to next group
-              group_ind++;
-            }
-            for (auto const& entry : inst) {
-              if (entry.index >= begin && entry.index < end) {
-                size_t w_idx = group_ind;
-                sketchs[entry.index].Push(entry.fvalue, info.GetWeight(w_idx));
-              }
-            }
-          }
-        }
-      }
+  for (const auto &batch : p_fmat->GetRowBatches()) {
+    size_t group_ind = 0;
+    if (use_group_ind) {
+      group_ind = this->SearchGroupIndFromBaseRow(group_ptr, batch.base_rowid);
     }
-  } else {
-    for (const auto &batch : p_fmat->GetRowBatches()) {
-      const size_t size = batch.Size();
-      const size_t block_size = 512;
-      const size_t block_size_iter = block_size * nthread;
-      const size_t n_blocks = size / block_size_iter + !!(size % block_size_iter);
+#pragma omp parallel num_threads(nthread) firstprivate(group_ind, use_group_ind)
+    {
+      CHECK_EQ(nthread, omp_get_num_threads());
+      auto tid = static_cast<unsigned>(omp_get_thread_num());
+      unsigned begin = std::min(nstep * tid, ncol);
+      unsigned end = std::min(nstep * (tid + 1), ncol);
 
-      std::vector<std::vector<std::pair<bst_float, bst_float>>> buff(nthread);
-      for (size_t tid = 0; tid < nthread; ++tid) {
-        buff[tid].resize(block_size * ncol);
-      }
-
-      std::vector<size_t> sizes(nthread * ncol, 0);
-
-      for (size_t iblock = 0; iblock < n_blocks; ++iblock) {
-        #pragma omp parallel num_threads(nthread)
-        {
-          int tid = omp_get_thread_num();
-
-          const size_t ibegin = iblock * block_size_iter + tid * block_size;
-          const size_t iend = std::min(ibegin + block_size, size);
-
-          auto* p_sizes = sizes.data() + ncol * tid;
-          auto* p_buff = buff[tid].data();
-
-          for (size_t i = ibegin; i < iend; ++i) {
-            size_t const ridx = batch.base_rowid + i;
-            bst_float w = info.GetWeight(ridx);
-            SparsePage::Inst const inst = batch[i];
-
-            for (auto const& entry : inst) {
-              const size_t idx = entry.index;
-              p_buff[idx * block_size + p_sizes[idx]] = { entry.fvalue, w };
-              p_sizes[idx]++;
-            }
+      // do not iterate if no columns are assigned to the thread
+      if (begin < end && end <= ncol) {
+        for (size_t i = 0; i < batch.Size(); ++i) { // NOLINT(*)
+          size_t const ridx = batch.base_rowid + i;
+          SparsePage::Inst const inst = batch[i];
+          if (use_group_ind &&
+              group_ptr[group_ind] == ridx &&
+              // maximum equals to weights.size() - 1
+              group_ind < num_groups - 1) {
+            // move to next group
+            group_ind++;
           }
-          #pragma omp barrier
-          #pragma omp for schedule(static)
-          for (int32_t icol = 0; icol < static_cast<int32_t>(ncol); ++icol) {
-            for (size_t tid = 0; tid < nthread; ++tid) {
-              auto* p_sizes = sizes.data() + ncol * tid;
-              auto* p_buff = buff[tid].data() + icol * block_size;
-
-              for (size_t i = 0; i < p_sizes[icol]; ++i) {
-                sketchs[icol].Push(p_buff[i].first, p_buff[i].second);
-              }
-
-              p_sizes[icol] = 0;
+          for (auto const& entry : inst) {
+            if (entry.index >= begin && entry.index < end) {
+              size_t w_idx = use_group_ind ? group_ind : ridx;
+              sketchs[entry.index].Push(entry.fvalue, info.GetWeight(w_idx));
            }
           }
         }
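
Note on the retained code path: it statically partitions columns across OpenMP
threads. Thread tid owns the contiguous column range [nstep * tid,
nstep * (tid + 1)), scans every row of the batch, and pushes only the entries
whose column falls in its range, so each quantile sketch is written by exactly
one thread and needs no locking. The standalone C++ sketch below illustrates
that scheme outside of XGBoost; it is an illustration only, and its types and
names (Entry, Row, ColumnSums) are hypothetical, not part of this patch.

    // Minimal sketch of the column-partitioned OpenMP scheme: each thread
    // owns a contiguous block of columns, so per-column state is lock-free.
    #include <omp.h>

    #include <algorithm>
    #include <vector>

    struct Entry {
      unsigned index;  // column index
      float fvalue;    // feature value
    };
    using Row = std::vector<Entry>;

    std::vector<double> ColumnSums(const std::vector<Row>& rows, unsigned ncol) {
      std::vector<double> sums(ncol, 0.0);
      const int nthread = omp_get_max_threads();
      // Same partitioning as the patch: ceil(ncol / nthread) columns per thread.
      const unsigned nstep = static_cast<unsigned>((ncol + nthread - 1) / nthread);
    #pragma omp parallel num_threads(nthread)
      {
        const auto tid = static_cast<unsigned>(omp_get_thread_num());
        const unsigned begin = std::min(nstep * tid, ncol);
        const unsigned end = std::min(nstep * (tid + 1), ncol);
        // Each thread reads every row but only writes columns it owns,
        // so no two threads ever touch the same accumulator.
        if (begin < end) {
          for (const Row& row : rows) {
            for (const Entry& e : row) {
              if (e.index >= begin && e.index < end) {
                sums[e.index] += e.fvalue;
              }
            }
          }
        }
      }
      return sums;
    }

The trade-off is that every thread re-reads the full batch. The reverted path
avoided that redundant scanning by buffering (fvalue, weight) pairs per thread
and flushing them per column, at the cost of the extra buffers and the barrier
visible in the removed lines; this patch backs that optimization out.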