Improve OpenMP exception handling (#6680)

2021-02-25 06:56:16 +01:00
parent c375173dca
commit 9b530e5697
26 changed files with 610 additions and 475 deletions
--- a/src/common/column_matrix.h
+++ b/src/common/column_matrix.h
@@ -230,8 +230,7 @@ class ColumnMatrix {
    /* missing values make sense only for column with type kDenseColumn,
       and if no missing values were observed it could be handled much faster. */
    if (noMissingValues) {
-#pragma omp parallel for num_threads(omp_get_max_threads())
-      for (omp_ulong rid = 0; rid < nrow; ++rid) {
+      ParallelFor(omp_ulong(nrow), [&](omp_ulong rid) {
        const size_t ibegin = rid*nfeature;
        const size_t iend = (rid+1)*nfeature;
        size_t j = 0;
@@ -239,7 +238,7 @@ class ColumnMatrix {
            const size_t idx = feature_offsets_[j];
            local_index[idx + rid] = index[i];
        }
-      }
+      });
    } else {
      /* to handle rows in all batches, sum of all batch sizes equal to gmat.row_ptr.size() - 1 */
      size_t rbegin = 0;
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -84,38 +84,46 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_bins) {

    size_t block_size =  batch.Size() / batch_threads;

+    dmlc::OMPException exc;
    #pragma omp parallel num_threads(batch_threads)
    {
      #pragma omp for
      for (omp_ulong tid = 0; tid < batch_threads; ++tid) {
-        size_t ibegin = block_size * tid;
-        size_t iend = (tid == (batch_threads-1) ? batch.Size() : (block_size * (tid+1)));
+        exc.Run([&]() {
+          size_t ibegin = block_size * tid;
+          size_t iend = (tid == (batch_threads-1) ? batch.Size() : (block_size * (tid+1)));

-        size_t sum = 0;
-        for (size_t i = ibegin; i < iend; ++i) {
-          sum += page[i].size();
-          row_ptr[rbegin + 1 + i] = sum;
-        }
+          size_t sum = 0;
+          for (size_t i = ibegin; i < iend; ++i) {
+            sum += page[i].size();
+            row_ptr[rbegin + 1 + i] = sum;
+          }
+        });
      }

      #pragma omp single
      {
-        p_part[0] = prev_sum;
-        for (size_t i = 1; i < batch_threads; ++i) {
-          p_part[i] = p_part[i - 1] + row_ptr[rbegin + i*block_size];
-        }
+        exc.Run([&]() {
+          p_part[0] = prev_sum;
+          for (size_t i = 1; i < batch_threads; ++i) {
+            p_part[i] = p_part[i - 1] + row_ptr[rbegin + i*block_size];
+          }
+        });
      }

      #pragma omp for
      for (omp_ulong tid = 0; tid < batch_threads; ++tid) {
-        size_t ibegin = block_size * tid;
-        size_t iend = (tid == (batch_threads-1) ? batch.Size() : (block_size * (tid+1)));
+        exc.Run([&]() {
+          size_t ibegin = block_size * tid;
+          size_t iend = (tid == (batch_threads-1) ? batch.Size() : (block_size * (tid+1)));

-        for (size_t i = ibegin; i < iend; ++i) {
-          row_ptr[rbegin + 1 + i] += p_part[tid];
-        }
+          for (size_t i = ibegin; i < iend; ++i) {
+            row_ptr[rbegin + 1 + i] += p_part[tid];
+          }
+        });
      }
    }
+    exc.Rethrow();

    const size_t n_offsets = cut.Ptrs().size() - 1;
    const size_t n_index = row_ptr[rbegin + batch.Size()];
@@ -167,13 +175,12 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_bins) {
                   [](auto idx, auto) { return idx; });
    }

-    #pragma omp parallel for num_threads(nthread) schedule(static)
-    for (bst_omp_uint idx = 0; idx < bst_omp_uint(nbins); ++idx) {
+    ParallelFor(bst_omp_uint(nbins), nthread, [&](bst_omp_uint idx) {
      for (int32_t tid = 0; tid < nthread; ++tid) {
        hit_count[idx] += hit_count_tloc_[tid * nbins + idx];
        hit_count_tloc_[tid * nbins + idx] = 0;  // reset for next batch
      }
-    }
+    });

    prev_sum = row_ptr[rbegin + batch.Size()];
    rbegin += batch.Size();
@@ -701,7 +708,7 @@ void GHistBuilder<GradientSumT>::BuildBlockHist(const std::vector<GradientPair>&
                                  const RowSetCollection::Elem row_indices,
                                  const GHistIndexBlockMatrix& gmatb,
                                  GHistRowT hist) {
-  constexpr int kUnroll = 8;  // loop unrolling factor
+  static constexpr int kUnroll = 8;  // loop unrolling factor
  const size_t nblock = gmatb.GetNumBlock();
  const size_t nrows = row_indices.end - row_indices.begin;
  const size_t rest = nrows % kUnroll;
@@ -710,40 +717,44 @@ void GHistBuilder<GradientSumT>::BuildBlockHist(const std::vector<GradientPair>&
 #endif  // defined(_OPENMP)
  xgboost::detail::GradientPairInternal<GradientSumT>* p_hist = hist.data();

+  dmlc::OMPException exc;
 #pragma omp parallel for num_threads(nthread) schedule(guided)
  for (bst_omp_uint bid = 0; bid < nblock; ++bid) {
-    auto gmat = gmatb[bid];
+    exc.Run([&]() {
+      auto gmat = gmatb[bid];

-    for (size_t i = 0; i < nrows - rest; i += kUnroll) {
-      size_t rid[kUnroll];
-      size_t ibegin[kUnroll];
-      size_t iend[kUnroll];
-      GradientPair stat[kUnroll];
+      for (size_t i = 0; i < nrows - rest; i += kUnroll) {
+        size_t rid[kUnroll];
+        size_t ibegin[kUnroll];
+        size_t iend[kUnroll];
+        GradientPair stat[kUnroll];

-      for (int k = 0; k < kUnroll; ++k) {
-        rid[k] = row_indices.begin[i + k];
-        ibegin[k] = gmat.row_ptr[rid[k]];
-        iend[k] = gmat.row_ptr[rid[k] + 1];
-        stat[k] = gpair[rid[k]];
-      }
-      for (int k = 0; k < kUnroll; ++k) {
-        for (size_t j = ibegin[k]; j < iend[k]; ++j) {
-          const uint32_t bin = gmat.index[j];
-          p_hist[bin].Add(stat[k].GetGrad(), stat[k].GetHess());
+        for (int k = 0; k < kUnroll; ++k) {
+          rid[k] = row_indices.begin[i + k];
+          ibegin[k] = gmat.row_ptr[rid[k]];
+          iend[k] = gmat.row_ptr[rid[k] + 1];
+          stat[k] = gpair[rid[k]];
+        }
+        for (int k = 0; k < kUnroll; ++k) {
+          for (size_t j = ibegin[k]; j < iend[k]; ++j) {
+            const uint32_t bin = gmat.index[j];
+            p_hist[bin].Add(stat[k].GetGrad(), stat[k].GetHess());
+          }
        }
      }
-    }
-    for (size_t i = nrows - rest; i < nrows; ++i) {
-      const size_t rid = row_indices.begin[i];
-      const size_t ibegin = gmat.row_ptr[rid];
-      const size_t iend = gmat.row_ptr[rid + 1];
-      const GradientPair stat = gpair[rid];
-      for (size_t j = ibegin; j < iend; ++j) {
-        const uint32_t bin = gmat.index[j];
-        p_hist[bin].Add(stat.GetGrad(), stat.GetHess());
+      for (size_t i = nrows - rest; i < nrows; ++i) {
+        const size_t rid = row_indices.begin[i];
+        const size_t ibegin = gmat.row_ptr[rid];
+        const size_t iend = gmat.row_ptr[rid + 1];
+        const GradientPair stat = gpair[rid];
+        for (size_t j = ibegin; j < iend; ++j) {
+          const uint32_t bin = gmat.index[j];
+          p_hist[bin].Add(stat.GetGrad(), stat.GetHess());
+        }
      }
-    }
+    });
  }
+  exc.Rethrow();
 }
 template
 void GHistBuilder<float>::BuildBlockHist(const std::vector<GradientPair>& gpair,
@@ -768,12 +779,11 @@ void GHistBuilder<GradientSumT>::SubtractionTrick(GHistRowT self,
  const size_t block_size = 1024;  // aproximatly 1024 values per block
  size_t n_blocks = size/block_size + !!(size%block_size);

-#pragma omp parallel for
-  for (omp_ulong iblock = 0; iblock < n_blocks; ++iblock) {
+  ParallelFor(omp_ulong(n_blocks), [&](omp_ulong iblock) {
    const size_t ibegin = iblock*block_size;
    const size_t iend = (((iblock+1)*block_size > size) ? size : ibegin + block_size);
    SubtractionHist(self, parent, sibling, ibegin, iend);
-  }
+  });
 }
 template
 void GHistBuilder<float>::SubtractionTrick(GHistRow<float> self,
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -257,8 +257,7 @@ struct GHistIndexMatrix {
    const size_t batch_size = batch.Size();
    CHECK_LT(batch_size, offset_vec.size());
    BinIdxType* index_data = index_data_span.data();
-#pragma omp parallel for num_threads(batch_threads) schedule(static)
-    for (omp_ulong i = 0; i < batch_size; ++i) {
+    ParallelFor(omp_ulong(batch_size), batch_threads, [&](omp_ulong i) {
      const int tid = omp_get_thread_num();
      size_t ibegin = row_ptr[rbegin + i];
      size_t iend = row_ptr[rbegin + i + 1];
@@ -270,7 +269,7 @@ struct GHistIndexMatrix {
        index_data[ibegin + j] = get_offset(idx, j);
        ++hit_count_tloc_[tid * nbins + idx];
      }
-    }
+    });
  }

  void ResizeIndex(const size_t n_index,
--- a/src/common/quantile.cc
+++ b/src/common/quantile.cc
@@ -35,7 +35,7 @@ HostSketchContainer::CalcColumnSize(SparsePage const &batch,
    column.resize(n_columns, 0);
  }

-  ParallelFor(page.Size(), nthreads, [&](size_t i) {
+  ParallelFor(omp_ulong(page.Size()), nthreads, [&](omp_ulong i) {
    auto &local_column_sizes = column_sizes.at(omp_get_thread_num());
    auto row = page[i];
    auto const *p_row = row.data();
@@ -44,7 +44,7 @@ HostSketchContainer::CalcColumnSize(SparsePage const &batch,
    }
  });
  std::vector<bst_row_t> entries_per_columns(n_columns, 0);
-  ParallelFor(n_columns, nthreads, [&](size_t i) {
+  ParallelFor(bst_omp_uint(n_columns), nthreads, [&](bst_omp_uint i) {
    for (auto const &thread : column_sizes) {
      entries_per_columns[i] += thread[i];
    }
@@ -99,15 +99,15 @@ void HostSketchContainer::PushRowPage(SparsePage const &page,
  std::vector<bst_uint> const &group_ptr = info.group_ptr_;
  // Use group index for weights?
  auto batch = page.GetView();
-  dmlc::OMPException exec;
  // Parallel over columns.  Each thread owns a set of consecutive columns.
  auto const ncol = static_cast<uint32_t>(info.num_col_);
  auto const is_dense = info.num_nonzero_ == info.num_col_ * info.num_row_;
  auto thread_columns_ptr = LoadBalance(page, info.num_col_, nthread);

+  dmlc::OMPException exc;
 #pragma omp parallel num_threads(nthread)
  {
-    exec.Run([&]() {
+    exc.Run([&]() {
      auto tid = static_cast<uint32_t>(omp_get_thread_num());
      auto const begin = thread_columns_ptr[tid];
      auto const end = thread_columns_ptr[tid + 1];
@@ -140,7 +140,7 @@ void HostSketchContainer::PushRowPage(SparsePage const &page,
      }
    });
  }
-  exec.Rethrow();
+  exc.Rethrow();
  monitor_.Stop(__func__);
 }

@@ -242,7 +242,7 @@ size_t nbytes = 0;
                         &global_sketches);

  std::vector<WQSketch::SummaryContainer> final_sketches(n_columns);
-  ParallelFor(n_columns, omp_get_max_threads(), [&](size_t fidx) {
+  ParallelFor(omp_ulong(n_columns), [&](omp_ulong fidx) {
    int32_t intermediate_num_cuts = num_cuts[fidx];
    auto nbytes =
        WQSketch::SummaryContainer::CalcMemCost(intermediate_num_cuts);
--- a/src/common/threading_utils.h
+++ b/src/common/threading_utils.h
@@ -115,11 +115,10 @@ void ParallelFor2d(const BlockedSpace2d& space, int nthreads, Func func) {
  nthreads = std::min(nthreads, omp_get_max_threads());
  nthreads = std::max(nthreads, 1);

-  dmlc::OMPException omp_exc;
+  dmlc::OMPException exc;
 #pragma omp parallel num_threads(nthreads)
  {
-    omp_exc.Run(
-        [](size_t num_blocks_in_space, const BlockedSpace2d& space, int nthreads, Func func) {
+    exc.Run([&]() {
      size_t tid = omp_get_thread_num();
      size_t chunck_size =
          num_blocks_in_space / nthreads + !!(num_blocks_in_space % nthreads);
@@ -129,19 +128,35 @@ void ParallelFor2d(const BlockedSpace2d& space, int nthreads, Func func) {
      for (auto i = begin; i < end; i++) {
        func(space.GetFirstDimension(i), space.GetRange(i));
      }
-    }, num_blocks_in_space, space, nthreads, func);
+    });
  }
-  omp_exc.Rethrow();
+  exc.Rethrow();
 }

-template <typename Func>
-void ParallelFor(size_t size, size_t nthreads, Func fn) {
-  dmlc::OMPException omp_exc;
-#pragma omp parallel for num_threads(nthreads)
-  for (omp_ulong i = 0; i < size; ++i) {
-    omp_exc.Run(fn, i);
+/* \brief Call fn(i) for every i in [0, size) from an OpenMP parallel for (static schedule).
+ *
+ * Every call is wrapped in dmlc::OMPException::Run and Rethrow() is issued after the
+ * loop, so a failure inside fn is reported to the caller once the loop has ended.
+ *
+ * \param size     Exclusive upper bound of the index range.
+ * \param nthreads Forwarded as-is to the num_threads clause (not clamped, unlike
+ *                 ParallelFor2d).
+ * \param fn       Callable invoked as fn(i), where i has type Index.
+ */
+template <typename Index, typename Func>
+void ParallelFor(Index size, size_t nthreads, Func fn) {
+  dmlc::OMPException exc;
+#pragma omp parallel for num_threads(nthreads) schedule(static)
+  for (Index i = 0; i < size; ++i) {
+    exc.Run(fn, i);
  }
-  omp_exc.Rethrow();
+  exc.Rethrow();
+}
+
+/* \brief Overload of ParallelFor that uses omp_get_max_threads() as the thread count. */
+template <typename Index, typename Func>
+void ParallelFor(Index size, Func fn) {
+  ParallelFor(size, omp_get_max_threads(), fn);
 }

 /* \brief Configure parallel threads.
--- a/src/common/transform.h
+++ b/src/common/transform.h
@@ -16,6 +16,7 @@
 #include "xgboost/span.h"

 #include "common.h"
+#include "threading_utils.h"

 #if defined (__CUDACC__)
 #include "device_helpers.cuh"
@@ -168,13 +169,11 @@ class Transform {
    template <typename... HDV>
    void LaunchCPU(Functor func, HDV*... vectors) const {
      omp_ulong end = static_cast<omp_ulong>(*(range_.end()));
-      dmlc::OMPException omp_exc;
      SyncHost(vectors...);
-#pragma omp parallel for schedule(static)
-      for (omp_ulong idx = 0; idx < end; ++idx) {
-        omp_exc.Run(func, idx, UnpackHDV(vectors)...);
-      }
-      omp_exc.Rethrow();
+      // ParallelFor (threading_utils.h) now owns the dmlc::OMPException Run/Rethrow handling.
+      ParallelFor(end, [&](omp_ulong idx) {
+        func(idx, UnpackHDV(vectors)...);
+      });
    }

   private: