Improve OpenMP exception handling (#6680)

2021-02-25 06:56:16 +01:00
parent c375173dca
commit 9b530e5697
26 changed files with 610 additions and 475 deletions
--- a/src/tree/updater_basemaker-inl.h
+++ b/src/tree/updater_basemaker-inl.h
@@ -25,6 +25,7 @@
 #include "../common/io.h"
 #include "../common/random.h"
 #include "../common/quantile.h"
+#include "../common/threading_utils.h"

 namespace xgboost {
 namespace tree {
@@ -221,8 +222,7 @@ class BaseMaker: public TreeUpdater {
    // so that they are ignored in future statistics collection
    const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);

-#pragma omp parallel for schedule(static)
-    for (bst_omp_uint ridx = 0; ridx < ndata; ++ridx) {
+    common::ParallelFor(ndata, [&](bst_omp_uint ridx) {
      const int nid = this->DecodePosition(ridx);
      if (tree[nid].IsLeaf()) {
        // mark finish when it is not a fresh leaf
@@ -237,7 +237,7 @@ class BaseMaker: public TreeUpdater {
          this->SetEncodePosition(ridx, tree[nid].RightChild());
        }
      }
-    }
+    });
  }
  /*!
   * \brief this is helper function uses column based data structure,
@@ -257,8 +257,7 @@ class BaseMaker: public TreeUpdater {

      if (it != sorted_split_set.end() && *it == fid) {
        const auto ndata = static_cast<bst_omp_uint>(col.size());
-        #pragma omp parallel for schedule(static)
-        for (bst_omp_uint j = 0; j < ndata; ++j) {
+        common::ParallelFor(ndata, [&](bst_omp_uint j) {
          const bst_uint ridx = col[j].index;
          const bst_float fvalue = col[j].fvalue;
          const int nid = this->DecodePosition(ridx);
@@ -273,7 +272,7 @@ class BaseMaker: public TreeUpdater {
              this->SetEncodePosition(ridx, tree[pid].RightChild());
            }
          }
-        }
+        });
      }
    }
  }
@@ -314,8 +313,7 @@ class BaseMaker: public TreeUpdater {
      for (auto fid : fsplits) {
        auto col = page[fid];
        const auto ndata = static_cast<bst_omp_uint>(col.size());
-#pragma omp parallel for schedule(static)
-        for (bst_omp_uint j = 0; j < ndata; ++j) {
+        common::ParallelFor(ndata, [&](bst_omp_uint j) {
          const bst_uint ridx = col[j].index;
          const bst_float fvalue = col[j].fvalue;
          const int nid = this->DecodePosition(ridx);
@@ -327,7 +325,7 @@ class BaseMaker: public TreeUpdater {
              this->SetEncodePosition(ridx, tree[nid].RightChild());
            }
          }
-        }
+        });
      }
    }
  }
@@ -341,24 +339,27 @@ class BaseMaker: public TreeUpdater {
    std::vector< std::vector<TStats> > &thread_temp = *p_thread_temp;
    thread_temp.resize(omp_get_max_threads());
    p_node_stats->resize(tree.param.num_nodes);
+    dmlc::OMPException exc;
 #pragma omp parallel
    {
-      const int tid = omp_get_thread_num();
-      thread_temp[tid].resize(tree.param.num_nodes, TStats());
-      for (unsigned int nid : qexpand_) {
-        thread_temp[tid][nid] = TStats();
-      }
+      exc.Run([&]() {
+        const int tid = omp_get_thread_num();
+        thread_temp[tid].resize(tree.param.num_nodes, TStats());
+        for (unsigned int nid : qexpand_) {
+          thread_temp[tid][nid] = TStats();
+        }
+      });
    }
+    exc.Rethrow();
    // setup position
    const auto ndata = static_cast<bst_omp_uint>(fmat.Info().num_row_);
-#pragma omp parallel for schedule(static)
-    for (bst_omp_uint ridx = 0; ridx < ndata; ++ridx) {
+    common::ParallelFor(ndata, [&](bst_omp_uint ridx) {
      const int nid = position_[ridx];
      const int tid = omp_get_thread_num();
      if (nid >= 0) {
        thread_temp[tid][nid].Add(gpair[ridx]);
      }
-    }
+    });
    // sum the per thread statistics together
    for (int nid : qexpand_) {
      TStats &s = (*p_node_stats)[nid];
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -264,12 +264,16 @@ class ColMaker: public TreeUpdater {
      const MetaInfo& info = fmat.Info();
      // setup position
      const auto ndata = static_cast<bst_omp_uint>(info.num_row_);
+      dmlc::OMPException exc;
      #pragma omp parallel for schedule(static)
      for (bst_omp_uint ridx = 0; ridx < ndata; ++ridx) {
-        const int tid = omp_get_thread_num();
-        if (position_[ridx] < 0) continue;
-        stemp_[tid][position_[ridx]].stats.Add(gpair[ridx]);
+        exc.Run([&]() {
+          const int tid = omp_get_thread_num();
+          if (position_[ridx] < 0) return;
+          stemp_[tid][position_[ridx]].stats.Add(gpair[ridx]);
+        });
      }
+      exc.Rethrow();
      // sum the per thread statistics together
      for (int nid : qexpand) {
        GradStats stats;
@@ -447,11 +451,11 @@ class ColMaker: public TreeUpdater {
          std::max(static_cast<int>(num_features / this->nthread_ / 32), 1);
 #endif  // defined(_OPENMP)
      {
-        dmlc::OMPException omp_handler;
        auto page = batch.GetView();
+        dmlc::OMPException exc;
 #pragma omp parallel for schedule(dynamic, batch_size)
        for (bst_omp_uint i = 0; i < num_features; ++i) {
-          omp_handler.Run([&]() {
+          exc.Run([&]() {
            auto evaluator = tree_evaluator_.GetEvaluator();
            bst_feature_t const fid = feat_set[i];
            int32_t const tid = omp_get_thread_num();
@@ -461,16 +465,16 @@ class ColMaker: public TreeUpdater {
            if (colmaker_train_param_.NeedForwardSearch(
                    param_.default_direction, column_densities_[fid], ind)) {
              this->EnumerateSplit(c.data(), c.data() + c.size(), +1, fid,
-                                   gpair, stemp_[tid], evaluator);
+                                  gpair, stemp_[tid], evaluator);
            }
            if (colmaker_train_param_.NeedBackwardSearch(
                    param_.default_direction)) {
              this->EnumerateSplit(c.data() + c.size() - 1, c.data() - 1, -1,
-                                   fid, gpair, stemp_[tid], evaluator);
+                                  fid, gpair, stemp_[tid], evaluator);
            }
          });
        }
-        omp_handler.Rethrow();
+        exc.Rethrow();
      }
    }
    // find splits at current level, do split per level
@@ -521,8 +525,7 @@ class ColMaker: public TreeUpdater {
      // so that they are ignored in future statistics collection
      const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);

-#pragma omp parallel for schedule(static)
-      for (bst_omp_uint ridx = 0; ridx < ndata; ++ridx) {
+      common::ParallelFor(ndata, [&](bst_omp_uint ridx) {
        CHECK_LT(ridx, position_.size())
            << "ridx exceed bound " << "ridx="<<  ridx << " pos=" << position_.size();
        const int nid = this->DecodePosition(ridx);
@@ -539,7 +542,7 @@ class ColMaker: public TreeUpdater {
            this->SetEncodePosition(ridx, tree[nid].RightChild());
          }
        }
-      }
+      });
    }
    // customization part
    // synchronize the best solution of each node
@@ -568,8 +571,7 @@ class ColMaker: public TreeUpdater {
        for (auto fid : fsplits) {
          auto col = page[fid];
          const auto ndata = static_cast<bst_omp_uint>(col.size());
-#pragma omp parallel for schedule(static)
-          for (bst_omp_uint j = 0; j < ndata; ++j) {
+          common::ParallelFor(ndata, [&](bst_omp_uint j) {
            const bst_uint ridx = col[j].index;
            const int nid = this->DecodePosition(ridx);
            const bst_float fvalue = col[j].fvalue;
@@ -581,7 +583,7 @@ class ColMaker: public TreeUpdater {
                this->SetEncodePosition(ridx, tree[nid].RightChild());
              }
            }
-          }
+          });
        }
      }
    }
--- a/src/tree/updater_histmaker.cc
+++ b/src/tree/updater_histmaker.cc
@@ -202,22 +202,26 @@ class HistMaker: public BaseMaker {
    std::vector<SplitEntry> sol(qexpand_.size());
    std::vector<GradStats> left_sum(qexpand_.size());
    auto nexpand = static_cast<bst_omp_uint>(qexpand_.size());
+    dmlc::OMPException exc;
 #pragma omp parallel for schedule(dynamic, 1)
    for (bst_omp_uint wid = 0; wid < nexpand; ++wid) {
-      const int nid = qexpand_[wid];
-      CHECK_EQ(node2workindex_[nid], static_cast<int>(wid));
-      SplitEntry &best = sol[wid];
-      GradStats &node_sum = wspace_.hset[0][num_feature + wid * (num_feature + 1)].data[0];
-      for (size_t i = 0; i < feature_set.size(); ++i) {
-        // Query is thread safe as it's a const function.
-        if (!this->interaction_constraints_.Query(nid, feature_set[i])) {
-          continue;
-        }
+      exc.Run([&]() {
+        const int nid = qexpand_[wid];
+        CHECK_EQ(node2workindex_[nid], static_cast<int>(wid));
+        SplitEntry &best = sol[wid];
+        GradStats &node_sum = wspace_.hset[0][num_feature + wid * (num_feature + 1)].data[0];
+        for (size_t i = 0; i < feature_set.size(); ++i) {
+          // Query is thread safe as it's a const function.
+          if (!this->interaction_constraints_.Query(nid, feature_set[i])) {
+            continue;
+          }

-        EnumerateSplit(this->wspace_.hset[0][i + wid * (num_feature+1)],
-                       node_sum, feature_set[i], &best, &left_sum[wid]);
-      }
+          EnumerateSplit(this->wspace_.hset[0][i + wid * (num_feature+1)],
+                        node_sum, feature_set[i], &best, &left_sum[wid]);
+        }
+      });
    }
+    exc.Rethrow();
    // get the best result, we can synchronize the solution
    for (bst_omp_uint wid = 0; wid < nexpand; ++wid) {
      const bst_node_t nid = qexpand_[wid];
@@ -341,16 +345,20 @@ class CQHistMaker: public HistMaker {
        auto page = batch.GetView();
        // start enumeration
        const auto nsize = static_cast<bst_omp_uint>(fset.size());
+        dmlc::OMPException exc;
 #pragma omp parallel for schedule(dynamic, 1)
        for (bst_omp_uint i = 0; i < nsize; ++i) {
-          int fid = fset[i];
-          int offset = feat2workindex_[fid];
-          if (offset >= 0) {
-            this->UpdateHistCol(gpair, page[fid], info, tree,
-                                fset, offset,
-                                &thread_hist_[omp_get_thread_num()]);
-          }
+          exc.Run([&]() {
+            int fid = fset[i];
+            int offset = feat2workindex_[fid];
+            if (offset >= 0) {
+              this->UpdateHistCol(gpair, page[fid], info, tree,
+                                  fset, offset,
+                                  &thread_hist_[omp_get_thread_num()]);
+            }
+          });
        }
+        exc.Rethrow();
      }
      // update node statistics.
      this->GetNodeStats(gpair, *p_fmat, tree,
@@ -417,16 +425,20 @@ class CQHistMaker: public HistMaker {
        auto page = batch.GetView();
        // start enumeration
        const auto nsize = static_cast<bst_omp_uint>(work_set_.size());
+        dmlc::OMPException exc;
 #pragma omp parallel for schedule(dynamic, 1)
        for (bst_omp_uint i = 0; i < nsize; ++i) {
-          int fid = work_set_[i];
-          int offset = feat2workindex_[fid];
-          if (offset >= 0) {
-            this->UpdateSketchCol(gpair, page[fid], tree,
-                                  work_set_size, offset,
-                                  &thread_sketch_[omp_get_thread_num()]);
-          }
+          exc.Run([&]() {
+            int fid = work_set_[i];
+            int offset = feat2workindex_[fid];
+            if (offset >= 0) {
+              this->UpdateSketchCol(gpair, page[fid], tree,
+                                    work_set_size, offset,
+                                    &thread_sketch_[omp_get_thread_num()]);
+            }
+          });
        }
+        exc.Rethrow();
      }
      for (size_t i = 0; i < sketchs_.size(); ++i) {
        common::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
@@ -701,16 +713,20 @@ class GlobalProposalHistMaker: public CQHistMaker {

        // start enumeration
        const auto nsize = static_cast<bst_omp_uint>(this->work_set_.size());
+        dmlc::OMPException exc;
 #pragma omp parallel for schedule(dynamic, 1)
        for (bst_omp_uint i = 0; i < nsize; ++i) {
-          int fid = this->work_set_[i];
-          int offset = this->feat2workindex_[fid];
-          if (offset >= 0) {
-            this->UpdateHistCol(gpair, page[fid], info, tree,
-                                fset, offset,
-                                &this->thread_hist_[omp_get_thread_num()]);
-          }
+          exc.Run([&]() {
+            int fid = this->work_set_[i];
+            int offset = this->feat2workindex_[fid];
+            if (offset >= 0) {
+              this->UpdateHistCol(gpair, page[fid], info, tree,
+                                  fset, offset,
+                                  &this->thread_hist_[omp_get_thread_num()]);
+            }
+          });
        }
+        exc.Rethrow();
      }

      // update node statistics.
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -713,20 +713,24 @@ void QuantileHistMaker::Builder<GradientSumT>::InitSampling(const std::vector<Gr
  const size_t discard_size = info.num_row_ / nthread;
  auto upper_border = static_cast<float>(std::numeric_limits<uint32_t>::max());
  uint32_t coin_flip_border = static_cast<uint32_t>(upper_border * param_.subsample);
+  dmlc::OMPException exc;
  #pragma omp parallel num_threads(nthread)
  {
-    const size_t tid = omp_get_thread_num();
-    const size_t ibegin = tid * discard_size;
-    const size_t iend = (tid == (nthread - 1)) ?
-                        info.num_row_ : ibegin + discard_size;
+    exc.Run([&]() {
+      const size_t tid = omp_get_thread_num();
+      const size_t ibegin = tid * discard_size;
+      const size_t iend = (tid == (nthread - 1)) ?
+                          info.num_row_ : ibegin + discard_size;

-    rnds[tid].discard(discard_size * tid);
-    for (size_t i = ibegin; i < iend; ++i) {
-      if (gpair[i].GetHess() >= 0.0f && rnds[tid]() < coin_flip_border) {
-        p_row_indices[ibegin + row_offsets[tid]++] = i;
+      rnds[tid].discard(discard_size * tid);
+      for (size_t i = ibegin; i < iend; ++i) {
+        if (gpair[i].GetHess() >= 0.0f && rnds[tid]() < coin_flip_border) {
+          p_row_indices[ibegin + row_offsets[tid]++] = i;
+        }
      }
-    }
+    });
  }
+  exc.Rethrow();
  /* discard global engine */
  rnd = rnds[nthread - 1];
  size_t prefix_sum = row_offsets[0];
@@ -769,10 +773,14 @@ void QuantileHistMaker::Builder<GradientSumT>::InitData(const GHistIndexMatrix&
    hist_buffer_.Init(nbins);

    // initialize histogram builder
+    dmlc::OMPException exc;
 #pragma omp parallel
    {
-      this->nthread_ = omp_get_num_threads();
+      exc.Run([&]() {
+        this->nthread_ = omp_get_num_threads();
+      });
    }
+    exc.Rethrow();
    hist_builder_ = GHistBuilder<GradientSumT>(this->nthread_, nbins);

    std::vector<size_t>& row_indices = *row_set_collection_.Data();
@@ -794,18 +802,21 @@ void QuantileHistMaker::Builder<GradientSumT>::InitData(const GHistIndexMatrix&

      #pragma omp parallel num_threads(this->nthread_)
      {
-        const size_t tid = omp_get_thread_num();
-        const size_t ibegin = tid * block_size;
-        const size_t iend = std::min(static_cast<size_t>(ibegin + block_size),
-            static_cast<size_t>(info.num_row_));
+        exc.Run([&]() {
+          const size_t tid = omp_get_thread_num();
+          const size_t ibegin = tid * block_size;
+          const size_t iend = std::min(static_cast<size_t>(ibegin + block_size),
+              static_cast<size_t>(info.num_row_));

-        for (size_t i = ibegin; i < iend; ++i) {
-          if (gpair[i].GetHess() < 0.0f) {
-            p_buff[tid] = true;
-            break;
+          for (size_t i = ibegin; i < iend; ++i) {
+            if (gpair[i].GetHess() < 0.0f) {
+              p_buff[tid] = true;
+              break;
+            }
          }
-        }
+        });
      }
+      exc.Rethrow();

      bool has_neg_hess = false;
      for (int32_t tid = 0; tid < this->nthread_; ++tid) {
@@ -825,14 +836,17 @@ void QuantileHistMaker::Builder<GradientSumT>::InitData(const GHistIndexMatrix&
      } else {
        #pragma omp parallel num_threads(this->nthread_)
        {
-          const size_t tid = omp_get_thread_num();
-          const size_t ibegin = tid * block_size;
-          const size_t iend = std::min(static_cast<size_t>(ibegin + block_size),
-              static_cast<size_t>(info.num_row_));
-          for (size_t i = ibegin; i < iend; ++i) {
-           p_row_indices[i] = i;
-          }
+          exc.Run([&]() {
+            const size_t tid = omp_get_thread_num();
+            const size_t ibegin = tid * block_size;
+            const size_t iend = std::min(static_cast<size_t>(ibegin + block_size),
+                static_cast<size_t>(info.num_row_));
+            for (size_t i = ibegin; i < iend; ++i) {
+              p_row_indices[i] = i;
+            }
+          });
        }
+        exc.Rethrow();
      }
    }
  }
--- a/src/tree/updater_refresh.cc
+++ b/src/tree/updater_refresh.cc
@@ -13,6 +13,7 @@
 #include "xgboost/json.h"
 #include "./param.h"
 #include "../common/io.h"
+#include "../common/threading_utils.h"

 namespace xgboost {
 namespace tree {
@@ -52,17 +53,21 @@ class TreeRefresher: public TreeUpdater {
    const int nthread = omp_get_max_threads();
    fvec_temp.resize(nthread, RegTree::FVec());
    stemp.resize(nthread, std::vector<GradStats>());
+    dmlc::OMPException exc;
    #pragma omp parallel
    {
-      int tid = omp_get_thread_num();
-      int num_nodes = 0;
-      for (auto tree : trees) {
-        num_nodes += tree->param.num_nodes;
-      }
-      stemp[tid].resize(num_nodes, GradStats());
-      std::fill(stemp[tid].begin(), stemp[tid].end(), GradStats());
-      fvec_temp[tid].Init(trees[0]->param.num_feature);
+      exc.Run([&]() {
+        int tid = omp_get_thread_num();
+        int num_nodes = 0;
+        for (auto tree : trees) {
+          num_nodes += tree->param.num_nodes;
+        }
+        stemp[tid].resize(num_nodes, GradStats());
+        std::fill(stemp[tid].begin(), stemp[tid].end(), GradStats());
+        fvec_temp[tid].Init(trees[0]->param.num_feature);
+      });
    }
+    exc.Rethrow();
    // if it is C++11, use lazy evaluation for Allreduce,
    // to gain speedup in recovery
    auto lazy_get_stats = [&]() {
@@ -72,8 +77,7 @@ class TreeRefresher: public TreeUpdater {
        auto page = batch.GetView();
        CHECK_LT(batch.Size(), std::numeric_limits<unsigned>::max());
        const auto nbatch = static_cast<bst_omp_uint>(batch.Size());
-#pragma omp parallel for schedule(static)
-        for (bst_omp_uint i = 0; i < nbatch; ++i) {
+        common::ParallelFor(nbatch, [&](bst_omp_uint i) {
          SparsePage::Inst inst = page[i];
          const int tid = omp_get_thread_num();
          const auto ridx = static_cast<bst_uint>(batch.base_rowid + i);
@@ -86,16 +90,15 @@ class TreeRefresher: public TreeUpdater {
            offset += tree->param.num_nodes;
          }
          feats.Drop(inst);
-        }
+        });
      }
      // aggregate the statistics
      auto num_nodes = static_cast<int>(stemp[0].size());
-      #pragma omp parallel for schedule(static)
-      for (int nid = 0; nid < num_nodes; ++nid) {
+      common::ParallelFor(num_nodes, [&](int nid) {
        for (int tid = 1; tid < nthread; ++tid) {
          stemp[0][nid].Add(stemp[tid][nid]);
        }
-      }
+      });
    };
    reducer_.Allreduce(dmlc::BeginPtr(stemp[0]), stemp[0].size(), lazy_get_stats);
    // rescale learning rate according to size of trees