Improve OpenMP exception handling (#6680)

2021-02-25 06:56:16 +01:00
parent c375173dca
commit 9b530e5697
26 changed files with 610 additions and 475 deletions
--- a/src/linear/coordinate_common.h
+++ b/src/linear/coordinate_common.h
@@ -14,6 +14,7 @@
 #include "./param.h"
 #include "../gbm/gblinear_model.h"
 #include "../common/random.h"
+#include "../common/threading_utils.h"

 namespace xgboost {
 namespace linear {
@@ -115,14 +116,18 @@ inline std::pair<double, double> GetGradientParallel(int group_idx, int num_grou
    auto page = batch.GetView();
    auto col = page[fidx];
    const auto ndata = static_cast<bst_omp_uint>(col.size());
+    dmlc::OMPException exc;
 #pragma omp parallel for schedule(static) reduction(+ : sum_grad, sum_hess)
    for (bst_omp_uint j = 0; j < ndata; ++j) {
-      const bst_float v = col[j].fvalue;
-      auto &p = gpair[col[j].index * num_group + group_idx];
-      if (p.GetHess() < 0.0f) continue;
-      sum_grad += p.GetGrad() * v;
-      sum_hess += p.GetHess() * v * v;
+      exc.Run([&]() {
+        const bst_float v = col[j].fvalue;
+        auto &p = gpair[col[j].index * num_group + group_idx];
+        if (p.GetHess() < 0.0f) return;
+        sum_grad += p.GetGrad() * v;
+        sum_hess += p.GetHess() * v * v;
+      });
    }
+    exc.Rethrow();
  }
  return std::make_pair(sum_grad, sum_hess);
 }
@@ -142,14 +147,18 @@ inline std::pair<double, double> GetBiasGradientParallel(int group_idx, int num_
                                                         DMatrix *p_fmat) {
  double sum_grad = 0.0, sum_hess = 0.0;
  const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);
+  dmlc::OMPException exc;
 #pragma omp parallel for schedule(static) reduction(+ : sum_grad, sum_hess)
  for (bst_omp_uint i = 0; i < ndata; ++i) {
-    auto &p = gpair[i * num_group + group_idx];
-    if (p.GetHess() >= 0.0f) {
-      sum_grad += p.GetGrad();
-      sum_hess += p.GetHess();
-    }
+    exc.Run([&]() {
+      auto &p = gpair[i * num_group + group_idx];
+      if (p.GetHess() >= 0.0f) {
+        sum_grad += p.GetGrad();
+        sum_hess += p.GetHess();
+      }
+    });
  }
+  exc.Rethrow();
  return std::make_pair(sum_grad, sum_hess);
 }

@@ -172,12 +181,16 @@ inline void UpdateResidualParallel(int fidx, int group_idx, int num_group,
    auto col = page[fidx];
    // update grad value
    const auto num_row = static_cast<bst_omp_uint>(col.size());
+    dmlc::OMPException exc;
 #pragma omp parallel for schedule(static)
    for (bst_omp_uint j = 0; j < num_row; ++j) {
-      GradientPair &p = (*in_gpair)[col[j].index * num_group + group_idx];
-      if (p.GetHess() < 0.0f) continue;
-      p += GradientPair(p.GetHess() * col[j].fvalue * dw, 0);
+      exc.Run([&]() {
+        GradientPair &p = (*in_gpair)[col[j].index * num_group + group_idx];
+        if (p.GetHess() < 0.0f) return;
+        p += GradientPair(p.GetHess() * col[j].fvalue * dw, 0);
+      });
    }
+    exc.Rethrow();
  }
 }

@@ -195,12 +208,16 @@ inline void UpdateBiasResidualParallel(int group_idx, int num_group, float dbias
                                       DMatrix *p_fmat) {
  if (dbias == 0.0f) return;
  const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);
+  dmlc::OMPException exc;
 #pragma omp parallel for schedule(static)
  for (bst_omp_uint i = 0; i < ndata; ++i) {
-    GradientPair &g = (*in_gpair)[i * num_group + group_idx];
-    if (g.GetHess() < 0.0f) continue;
-    g += GradientPair(g.GetHess() * dbias, 0);
+    exc.Run([&]() {
+      GradientPair &g = (*in_gpair)[i * num_group + group_idx];
+      if (g.GetHess() < 0.0f) return;
+      g += GradientPair(g.GetHess() * dbias, 0);
+    });
  }
+  exc.Rethrow();
 }

 /**
@@ -336,10 +353,9 @@ class GreedyFeatureSelector : public FeatureSelector {
    const bst_omp_uint nfeat = model.learner_model_param->num_feature;
    // Calculate univariate gradient sums
    std::fill(gpair_sums_.begin(), gpair_sums_.end(), std::make_pair(0., 0.));
-  for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
-    auto page = batch.GetView();
-#pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < nfeat; ++i) {
+    for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
+      auto page = batch.GetView();
+      common::ParallelFor(nfeat, [&](bst_omp_uint i) {
        const auto col = page[i];
        const bst_uint ndata = col.size();
        auto &sums = gpair_sums_[group_idx * nfeat + i];
@@ -350,7 +366,7 @@ class GreedyFeatureSelector : public FeatureSelector {
          sums.first += p.GetGrad() * v;
          sums.second += p.GetHess() * v * v;
        }
-      }
+      });
    }
    // Find a feature with the largest magnitude of weight change
    int best_fidx = 0;
@@ -405,8 +421,7 @@ class ThriftyFeatureSelector : public FeatureSelector {
    for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
      auto page = batch.GetView();
      // column-parallel is usually fastaer than row-parallel
-#pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < nfeat; ++i) {
+      common::ParallelFor(nfeat, [&](bst_omp_uint i) {
        const auto col = page[i];
        const bst_uint ndata = col.size();
        for (bst_uint gid = 0u; gid < ngroup; ++gid) {
@@ -419,7 +434,7 @@ class ThriftyFeatureSelector : public FeatureSelector {
            sums.second += p.GetHess() * v * v;
          }
        }
-      }
+      });
    }
    // rank by descending weight magnitude within the groups
    std::fill(deltaw_.begin(), deltaw_.end(), 0.f);