Improve OpenMP exception handling (#6680)

This commit is contained in:
Louis Desreumaux
2021-02-25 06:56:16 +01:00
committed by GitHub
parent c375173dca
commit 9b530e5697
26 changed files with 610 additions and 475 deletions

View File

@@ -25,6 +25,7 @@
#include "../common/io.h"
#include "../common/random.h"
#include "../common/quantile.h"
#include "../common/threading_utils.h"
namespace xgboost {
namespace tree {
@@ -221,8 +222,7 @@ class BaseMaker: public TreeUpdater {
// so that they are ignored in future statistics collection
const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);
#pragma omp parallel for schedule(static)
for (bst_omp_uint ridx = 0; ridx < ndata; ++ridx) {
common::ParallelFor(ndata, [&](bst_omp_uint ridx) {
const int nid = this->DecodePosition(ridx);
if (tree[nid].IsLeaf()) {
// mark finish when it is not a fresh leaf
@@ -237,7 +237,7 @@ class BaseMaker: public TreeUpdater {
this->SetEncodePosition(ridx, tree[nid].RightChild());
}
}
}
});
}
/*!
* \brief this is a helper function that uses a column-based data structure,
@@ -257,8 +257,7 @@ class BaseMaker: public TreeUpdater {
if (it != sorted_split_set.end() && *it == fid) {
const auto ndata = static_cast<bst_omp_uint>(col.size());
#pragma omp parallel for schedule(static)
for (bst_omp_uint j = 0; j < ndata; ++j) {
common::ParallelFor(ndata, [&](bst_omp_uint j) {
const bst_uint ridx = col[j].index;
const bst_float fvalue = col[j].fvalue;
const int nid = this->DecodePosition(ridx);
@@ -273,7 +272,7 @@ class BaseMaker: public TreeUpdater {
this->SetEncodePosition(ridx, tree[pid].RightChild());
}
}
}
});
}
}
}
@@ -314,8 +313,7 @@ class BaseMaker: public TreeUpdater {
for (auto fid : fsplits) {
auto col = page[fid];
const auto ndata = static_cast<bst_omp_uint>(col.size());
#pragma omp parallel for schedule(static)
for (bst_omp_uint j = 0; j < ndata; ++j) {
common::ParallelFor(ndata, [&](bst_omp_uint j) {
const bst_uint ridx = col[j].index;
const bst_float fvalue = col[j].fvalue;
const int nid = this->DecodePosition(ridx);
@@ -327,7 +325,7 @@ class BaseMaker: public TreeUpdater {
this->SetEncodePosition(ridx, tree[nid].RightChild());
}
}
}
});
}
}
}
@@ -341,24 +339,27 @@ class BaseMaker: public TreeUpdater {
std::vector< std::vector<TStats> > &thread_temp = *p_thread_temp;
thread_temp.resize(omp_get_max_threads());
p_node_stats->resize(tree.param.num_nodes);
dmlc::OMPException exc;
#pragma omp parallel
{
const int tid = omp_get_thread_num();
thread_temp[tid].resize(tree.param.num_nodes, TStats());
for (unsigned int nid : qexpand_) {
thread_temp[tid][nid] = TStats();
}
exc.Run([&]() {
const int tid = omp_get_thread_num();
thread_temp[tid].resize(tree.param.num_nodes, TStats());
for (unsigned int nid : qexpand_) {
thread_temp[tid][nid] = TStats();
}
});
}
exc.Rethrow();
// setup position
const auto ndata = static_cast<bst_omp_uint>(fmat.Info().num_row_);
#pragma omp parallel for schedule(static)
for (bst_omp_uint ridx = 0; ridx < ndata; ++ridx) {
common::ParallelFor(ndata, [&](bst_omp_uint ridx) {
const int nid = position_[ridx];
const int tid = omp_get_thread_num();
if (nid >= 0) {
thread_temp[tid][nid].Add(gpair[ridx]);
}
}
});
// sum the per thread statistics together
for (int nid : qexpand_) {
TStats &s = (*p_node_stats)[nid];

View File

@@ -264,12 +264,16 @@ class ColMaker: public TreeUpdater {
const MetaInfo& info = fmat.Info();
// setup position
const auto ndata = static_cast<bst_omp_uint>(info.num_row_);
dmlc::OMPException exc;
#pragma omp parallel for schedule(static)
for (bst_omp_uint ridx = 0; ridx < ndata; ++ridx) {
const int tid = omp_get_thread_num();
if (position_[ridx] < 0) continue;
stemp_[tid][position_[ridx]].stats.Add(gpair[ridx]);
exc.Run([&]() {
const int tid = omp_get_thread_num();
if (position_[ridx] < 0) return;
stemp_[tid][position_[ridx]].stats.Add(gpair[ridx]);
});
}
exc.Rethrow();
// sum the per thread statistics together
for (int nid : qexpand) {
GradStats stats;
@@ -447,11 +451,11 @@ class ColMaker: public TreeUpdater {
std::max(static_cast<int>(num_features / this->nthread_ / 32), 1);
#endif // defined(_OPENMP)
{
dmlc::OMPException omp_handler;
auto page = batch.GetView();
dmlc::OMPException exc;
#pragma omp parallel for schedule(dynamic, batch_size)
for (bst_omp_uint i = 0; i < num_features; ++i) {
omp_handler.Run([&]() {
exc.Run([&]() {
auto evaluator = tree_evaluator_.GetEvaluator();
bst_feature_t const fid = feat_set[i];
int32_t const tid = omp_get_thread_num();
@@ -461,16 +465,16 @@ class ColMaker: public TreeUpdater {
if (colmaker_train_param_.NeedForwardSearch(
param_.default_direction, column_densities_[fid], ind)) {
this->EnumerateSplit(c.data(), c.data() + c.size(), +1, fid,
gpair, stemp_[tid], evaluator);
gpair, stemp_[tid], evaluator);
}
if (colmaker_train_param_.NeedBackwardSearch(
param_.default_direction)) {
this->EnumerateSplit(c.data() + c.size() - 1, c.data() - 1, -1,
fid, gpair, stemp_[tid], evaluator);
fid, gpair, stemp_[tid], evaluator);
}
});
}
omp_handler.Rethrow();
exc.Rethrow();
}
}
// find splits at current level, do split per level
@@ -521,8 +525,7 @@ class ColMaker: public TreeUpdater {
// so that they are ignored in future statistics collection
const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);
#pragma omp parallel for schedule(static)
for (bst_omp_uint ridx = 0; ridx < ndata; ++ridx) {
common::ParallelFor(ndata, [&](bst_omp_uint ridx) {
CHECK_LT(ridx, position_.size())
<< "ridx exceed bound " << "ridx="<< ridx << " pos=" << position_.size();
const int nid = this->DecodePosition(ridx);
@@ -539,7 +542,7 @@ class ColMaker: public TreeUpdater {
this->SetEncodePosition(ridx, tree[nid].RightChild());
}
}
}
});
}
// customization part
// synchronize the best solution of each node
@@ -568,8 +571,7 @@ class ColMaker: public TreeUpdater {
for (auto fid : fsplits) {
auto col = page[fid];
const auto ndata = static_cast<bst_omp_uint>(col.size());
#pragma omp parallel for schedule(static)
for (bst_omp_uint j = 0; j < ndata; ++j) {
common::ParallelFor(ndata, [&](bst_omp_uint j) {
const bst_uint ridx = col[j].index;
const int nid = this->DecodePosition(ridx);
const bst_float fvalue = col[j].fvalue;
@@ -581,7 +583,7 @@ class ColMaker: public TreeUpdater {
this->SetEncodePosition(ridx, tree[nid].RightChild());
}
}
}
});
}
}
}

View File

@@ -202,22 +202,26 @@ class HistMaker: public BaseMaker {
std::vector<SplitEntry> sol(qexpand_.size());
std::vector<GradStats> left_sum(qexpand_.size());
auto nexpand = static_cast<bst_omp_uint>(qexpand_.size());
dmlc::OMPException exc;
#pragma omp parallel for schedule(dynamic, 1)
for (bst_omp_uint wid = 0; wid < nexpand; ++wid) {
const int nid = qexpand_[wid];
CHECK_EQ(node2workindex_[nid], static_cast<int>(wid));
SplitEntry &best = sol[wid];
GradStats &node_sum = wspace_.hset[0][num_feature + wid * (num_feature + 1)].data[0];
for (size_t i = 0; i < feature_set.size(); ++i) {
// Query is thread safe as it's a const function.
if (!this->interaction_constraints_.Query(nid, feature_set[i])) {
continue;
}
exc.Run([&]() {
const int nid = qexpand_[wid];
CHECK_EQ(node2workindex_[nid], static_cast<int>(wid));
SplitEntry &best = sol[wid];
GradStats &node_sum = wspace_.hset[0][num_feature + wid * (num_feature + 1)].data[0];
for (size_t i = 0; i < feature_set.size(); ++i) {
// Query is thread safe as it's a const function.
if (!this->interaction_constraints_.Query(nid, feature_set[i])) {
continue;
}
EnumerateSplit(this->wspace_.hset[0][i + wid * (num_feature+1)],
node_sum, feature_set[i], &best, &left_sum[wid]);
}
EnumerateSplit(this->wspace_.hset[0][i + wid * (num_feature+1)],
node_sum, feature_set[i], &best, &left_sum[wid]);
}
});
}
exc.Rethrow();
// get the best result, we can synchronize the solution
for (bst_omp_uint wid = 0; wid < nexpand; ++wid) {
const bst_node_t nid = qexpand_[wid];
@@ -341,16 +345,20 @@ class CQHistMaker: public HistMaker {
auto page = batch.GetView();
// start enumeration
const auto nsize = static_cast<bst_omp_uint>(fset.size());
dmlc::OMPException exc;
#pragma omp parallel for schedule(dynamic, 1)
for (bst_omp_uint i = 0; i < nsize; ++i) {
int fid = fset[i];
int offset = feat2workindex_[fid];
if (offset >= 0) {
this->UpdateHistCol(gpair, page[fid], info, tree,
fset, offset,
&thread_hist_[omp_get_thread_num()]);
}
exc.Run([&]() {
int fid = fset[i];
int offset = feat2workindex_[fid];
if (offset >= 0) {
this->UpdateHistCol(gpair, page[fid], info, tree,
fset, offset,
&thread_hist_[omp_get_thread_num()]);
}
});
}
exc.Rethrow();
}
// update node statistics.
this->GetNodeStats(gpair, *p_fmat, tree,
@@ -417,16 +425,20 @@ class CQHistMaker: public HistMaker {
auto page = batch.GetView();
// start enumeration
const auto nsize = static_cast<bst_omp_uint>(work_set_.size());
dmlc::OMPException exc;
#pragma omp parallel for schedule(dynamic, 1)
for (bst_omp_uint i = 0; i < nsize; ++i) {
int fid = work_set_[i];
int offset = feat2workindex_[fid];
if (offset >= 0) {
this->UpdateSketchCol(gpair, page[fid], tree,
work_set_size, offset,
&thread_sketch_[omp_get_thread_num()]);
}
exc.Run([&]() {
int fid = work_set_[i];
int offset = feat2workindex_[fid];
if (offset >= 0) {
this->UpdateSketchCol(gpair, page[fid], tree,
work_set_size, offset,
&thread_sketch_[omp_get_thread_num()]);
}
});
}
exc.Rethrow();
}
for (size_t i = 0; i < sketchs_.size(); ++i) {
common::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
@@ -701,16 +713,20 @@ class GlobalProposalHistMaker: public CQHistMaker {
// start enumeration
const auto nsize = static_cast<bst_omp_uint>(this->work_set_.size());
dmlc::OMPException exc;
#pragma omp parallel for schedule(dynamic, 1)
for (bst_omp_uint i = 0; i < nsize; ++i) {
int fid = this->work_set_[i];
int offset = this->feat2workindex_[fid];
if (offset >= 0) {
this->UpdateHistCol(gpair, page[fid], info, tree,
fset, offset,
&this->thread_hist_[omp_get_thread_num()]);
}
exc.Run([&]() {
int fid = this->work_set_[i];
int offset = this->feat2workindex_[fid];
if (offset >= 0) {
this->UpdateHistCol(gpair, page[fid], info, tree,
fset, offset,
&this->thread_hist_[omp_get_thread_num()]);
}
});
}
exc.Rethrow();
}
// update node statistics.

View File

@@ -713,20 +713,24 @@ void QuantileHistMaker::Builder<GradientSumT>::InitSampling(const std::vector<Gr
const size_t discard_size = info.num_row_ / nthread;
auto upper_border = static_cast<float>(std::numeric_limits<uint32_t>::max());
uint32_t coin_flip_border = static_cast<uint32_t>(upper_border * param_.subsample);
dmlc::OMPException exc;
#pragma omp parallel num_threads(nthread)
{
const size_t tid = omp_get_thread_num();
const size_t ibegin = tid * discard_size;
const size_t iend = (tid == (nthread - 1)) ?
info.num_row_ : ibegin + discard_size;
exc.Run([&]() {
const size_t tid = omp_get_thread_num();
const size_t ibegin = tid * discard_size;
const size_t iend = (tid == (nthread - 1)) ?
info.num_row_ : ibegin + discard_size;
rnds[tid].discard(discard_size * tid);
for (size_t i = ibegin; i < iend; ++i) {
if (gpair[i].GetHess() >= 0.0f && rnds[tid]() < coin_flip_border) {
p_row_indices[ibegin + row_offsets[tid]++] = i;
rnds[tid].discard(discard_size * tid);
for (size_t i = ibegin; i < iend; ++i) {
if (gpair[i].GetHess() >= 0.0f && rnds[tid]() < coin_flip_border) {
p_row_indices[ibegin + row_offsets[tid]++] = i;
}
}
}
});
}
exc.Rethrow();
/* discard global engine */
rnd = rnds[nthread - 1];
size_t prefix_sum = row_offsets[0];
@@ -769,10 +773,14 @@ void QuantileHistMaker::Builder<GradientSumT>::InitData(const GHistIndexMatrix&
hist_buffer_.Init(nbins);
// initialize histogram builder
dmlc::OMPException exc;
#pragma omp parallel
{
this->nthread_ = omp_get_num_threads();
exc.Run([&]() {
this->nthread_ = omp_get_num_threads();
});
}
exc.Rethrow();
hist_builder_ = GHistBuilder<GradientSumT>(this->nthread_, nbins);
std::vector<size_t>& row_indices = *row_set_collection_.Data();
@@ -794,18 +802,21 @@ void QuantileHistMaker::Builder<GradientSumT>::InitData(const GHistIndexMatrix&
#pragma omp parallel num_threads(this->nthread_)
{
const size_t tid = omp_get_thread_num();
const size_t ibegin = tid * block_size;
const size_t iend = std::min(static_cast<size_t>(ibegin + block_size),
static_cast<size_t>(info.num_row_));
exc.Run([&]() {
const size_t tid = omp_get_thread_num();
const size_t ibegin = tid * block_size;
const size_t iend = std::min(static_cast<size_t>(ibegin + block_size),
static_cast<size_t>(info.num_row_));
for (size_t i = ibegin; i < iend; ++i) {
if (gpair[i].GetHess() < 0.0f) {
p_buff[tid] = true;
break;
for (size_t i = ibegin; i < iend; ++i) {
if (gpair[i].GetHess() < 0.0f) {
p_buff[tid] = true;
break;
}
}
}
});
}
exc.Rethrow();
bool has_neg_hess = false;
for (int32_t tid = 0; tid < this->nthread_; ++tid) {
@@ -825,14 +836,17 @@ void QuantileHistMaker::Builder<GradientSumT>::InitData(const GHistIndexMatrix&
} else {
#pragma omp parallel num_threads(this->nthread_)
{
const size_t tid = omp_get_thread_num();
const size_t ibegin = tid * block_size;
const size_t iend = std::min(static_cast<size_t>(ibegin + block_size),
static_cast<size_t>(info.num_row_));
for (size_t i = ibegin; i < iend; ++i) {
p_row_indices[i] = i;
}
exc.Run([&]() {
const size_t tid = omp_get_thread_num();
const size_t ibegin = tid * block_size;
const size_t iend = std::min(static_cast<size_t>(ibegin + block_size),
static_cast<size_t>(info.num_row_));
for (size_t i = ibegin; i < iend; ++i) {
p_row_indices[i] = i;
}
});
}
exc.Rethrow();
}
}
}

View File

@@ -13,6 +13,7 @@
#include "xgboost/json.h"
#include "./param.h"
#include "../common/io.h"
#include "../common/threading_utils.h"
namespace xgboost {
namespace tree {
@@ -52,17 +53,21 @@ class TreeRefresher: public TreeUpdater {
const int nthread = omp_get_max_threads();
fvec_temp.resize(nthread, RegTree::FVec());
stemp.resize(nthread, std::vector<GradStats>());
dmlc::OMPException exc;
#pragma omp parallel
{
int tid = omp_get_thread_num();
int num_nodes = 0;
for (auto tree : trees) {
num_nodes += tree->param.num_nodes;
}
stemp[tid].resize(num_nodes, GradStats());
std::fill(stemp[tid].begin(), stemp[tid].end(), GradStats());
fvec_temp[tid].Init(trees[0]->param.num_feature);
exc.Run([&]() {
int tid = omp_get_thread_num();
int num_nodes = 0;
for (auto tree : trees) {
num_nodes += tree->param.num_nodes;
}
stemp[tid].resize(num_nodes, GradStats());
std::fill(stemp[tid].begin(), stemp[tid].end(), GradStats());
fvec_temp[tid].Init(trees[0]->param.num_feature);
});
}
exc.Rethrow();
// if it is C++11, use lazy evaluation for Allreduce,
// to gain speedup in recovery
auto lazy_get_stats = [&]() {
@@ -72,8 +77,7 @@ class TreeRefresher: public TreeUpdater {
auto page = batch.GetView();
CHECK_LT(batch.Size(), std::numeric_limits<unsigned>::max());
const auto nbatch = static_cast<bst_omp_uint>(batch.Size());
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nbatch; ++i) {
common::ParallelFor(nbatch, [&](bst_omp_uint i) {
SparsePage::Inst inst = page[i];
const int tid = omp_get_thread_num();
const auto ridx = static_cast<bst_uint>(batch.base_rowid + i);
@@ -86,16 +90,15 @@ class TreeRefresher: public TreeUpdater {
offset += tree->param.num_nodes;
}
feats.Drop(inst);
}
});
}
// aggregate the statistics
auto num_nodes = static_cast<int>(stemp[0].size());
#pragma omp parallel for schedule(static)
for (int nid = 0; nid < num_nodes; ++nid) {
common::ParallelFor(num_nodes, [&](int nid) {
for (int tid = 1; tid < nthread; ++tid) {
stemp[0][nid].Add(stemp[tid][nid]);
}
}
});
};
reducer_.Allreduce(dmlc::BeginPtr(stemp[0]), stemp[0].size(), lazy_get_stats);
// rescale learning rate according to size of trees