Improve OpenMP exception handling (#6680)

This commit is contained in:
Louis Desreumaux
2021-02-25 06:56:16 +01:00
committed by GitHub
parent c375173dca
commit 9b530e5697
26 changed files with 610 additions and 475 deletions

View File

@@ -230,8 +230,7 @@ class ColumnMatrix {
/* missing values make sense only for column with type kDenseColumn,
and if no missing values were observed it could be handled much faster. */
if (noMissingValues) {
#pragma omp parallel for num_threads(omp_get_max_threads())
for (omp_ulong rid = 0; rid < nrow; ++rid) {
ParallelFor(omp_ulong(nrow), [&](omp_ulong rid) {
const size_t ibegin = rid*nfeature;
const size_t iend = (rid+1)*nfeature;
size_t j = 0;
@@ -239,7 +238,7 @@ class ColumnMatrix {
const size_t idx = feature_offsets_[j];
local_index[idx + rid] = index[i];
}
}
});
} else {
/* to handle rows in all batches, sum of all batch sizes equal to gmat.row_ptr.size() - 1 */
size_t rbegin = 0;

View File

@@ -84,38 +84,46 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_bins) {
size_t block_size = batch.Size() / batch_threads;
dmlc::OMPException exc;
#pragma omp parallel num_threads(batch_threads)
{
#pragma omp for
for (omp_ulong tid = 0; tid < batch_threads; ++tid) {
size_t ibegin = block_size * tid;
size_t iend = (tid == (batch_threads-1) ? batch.Size() : (block_size * (tid+1)));
exc.Run([&]() {
size_t ibegin = block_size * tid;
size_t iend = (tid == (batch_threads-1) ? batch.Size() : (block_size * (tid+1)));
size_t sum = 0;
for (size_t i = ibegin; i < iend; ++i) {
sum += page[i].size();
row_ptr[rbegin + 1 + i] = sum;
}
size_t sum = 0;
for (size_t i = ibegin; i < iend; ++i) {
sum += page[i].size();
row_ptr[rbegin + 1 + i] = sum;
}
});
}
#pragma omp single
{
p_part[0] = prev_sum;
for (size_t i = 1; i < batch_threads; ++i) {
p_part[i] = p_part[i - 1] + row_ptr[rbegin + i*block_size];
}
exc.Run([&]() {
p_part[0] = prev_sum;
for (size_t i = 1; i < batch_threads; ++i) {
p_part[i] = p_part[i - 1] + row_ptr[rbegin + i*block_size];
}
});
}
#pragma omp for
for (omp_ulong tid = 0; tid < batch_threads; ++tid) {
size_t ibegin = block_size * tid;
size_t iend = (tid == (batch_threads-1) ? batch.Size() : (block_size * (tid+1)));
exc.Run([&]() {
size_t ibegin = block_size * tid;
size_t iend = (tid == (batch_threads-1) ? batch.Size() : (block_size * (tid+1)));
for (size_t i = ibegin; i < iend; ++i) {
row_ptr[rbegin + 1 + i] += p_part[tid];
}
for (size_t i = ibegin; i < iend; ++i) {
row_ptr[rbegin + 1 + i] += p_part[tid];
}
});
}
}
exc.Rethrow();
const size_t n_offsets = cut.Ptrs().size() - 1;
const size_t n_index = row_ptr[rbegin + batch.Size()];
@@ -167,13 +175,12 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_bins) {
[](auto idx, auto) { return idx; });
}
#pragma omp parallel for num_threads(nthread) schedule(static)
for (bst_omp_uint idx = 0; idx < bst_omp_uint(nbins); ++idx) {
ParallelFor(bst_omp_uint(nbins), nthread, [&](bst_omp_uint idx) {
for (int32_t tid = 0; tid < nthread; ++tid) {
hit_count[idx] += hit_count_tloc_[tid * nbins + idx];
hit_count_tloc_[tid * nbins + idx] = 0; // reset for next batch
}
}
});
prev_sum = row_ptr[rbegin + batch.Size()];
rbegin += batch.Size();
@@ -701,7 +708,7 @@ void GHistBuilder<GradientSumT>::BuildBlockHist(const std::vector<GradientPair>&
const RowSetCollection::Elem row_indices,
const GHistIndexBlockMatrix& gmatb,
GHistRowT hist) {
constexpr int kUnroll = 8; // loop unrolling factor
static constexpr int kUnroll = 8; // loop unrolling factor
const size_t nblock = gmatb.GetNumBlock();
const size_t nrows = row_indices.end - row_indices.begin;
const size_t rest = nrows % kUnroll;
@@ -710,40 +717,44 @@ void GHistBuilder<GradientSumT>::BuildBlockHist(const std::vector<GradientPair>&
#endif // defined(_OPENMP)
xgboost::detail::GradientPairInternal<GradientSumT>* p_hist = hist.data();
dmlc::OMPException exc;
#pragma omp parallel for num_threads(nthread) schedule(guided)
for (bst_omp_uint bid = 0; bid < nblock; ++bid) {
auto gmat = gmatb[bid];
exc.Run([&]() {
auto gmat = gmatb[bid];
for (size_t i = 0; i < nrows - rest; i += kUnroll) {
size_t rid[kUnroll];
size_t ibegin[kUnroll];
size_t iend[kUnroll];
GradientPair stat[kUnroll];
for (size_t i = 0; i < nrows - rest; i += kUnroll) {
size_t rid[kUnroll];
size_t ibegin[kUnroll];
size_t iend[kUnroll];
GradientPair stat[kUnroll];
for (int k = 0; k < kUnroll; ++k) {
rid[k] = row_indices.begin[i + k];
ibegin[k] = gmat.row_ptr[rid[k]];
iend[k] = gmat.row_ptr[rid[k] + 1];
stat[k] = gpair[rid[k]];
}
for (int k = 0; k < kUnroll; ++k) {
for (size_t j = ibegin[k]; j < iend[k]; ++j) {
const uint32_t bin = gmat.index[j];
p_hist[bin].Add(stat[k].GetGrad(), stat[k].GetHess());
for (int k = 0; k < kUnroll; ++k) {
rid[k] = row_indices.begin[i + k];
ibegin[k] = gmat.row_ptr[rid[k]];
iend[k] = gmat.row_ptr[rid[k] + 1];
stat[k] = gpair[rid[k]];
}
for (int k = 0; k < kUnroll; ++k) {
for (size_t j = ibegin[k]; j < iend[k]; ++j) {
const uint32_t bin = gmat.index[j];
p_hist[bin].Add(stat[k].GetGrad(), stat[k].GetHess());
}
}
}
}
for (size_t i = nrows - rest; i < nrows; ++i) {
const size_t rid = row_indices.begin[i];
const size_t ibegin = gmat.row_ptr[rid];
const size_t iend = gmat.row_ptr[rid + 1];
const GradientPair stat = gpair[rid];
for (size_t j = ibegin; j < iend; ++j) {
const uint32_t bin = gmat.index[j];
p_hist[bin].Add(stat.GetGrad(), stat.GetHess());
for (size_t i = nrows - rest; i < nrows; ++i) {
const size_t rid = row_indices.begin[i];
const size_t ibegin = gmat.row_ptr[rid];
const size_t iend = gmat.row_ptr[rid + 1];
const GradientPair stat = gpair[rid];
for (size_t j = ibegin; j < iend; ++j) {
const uint32_t bin = gmat.index[j];
p_hist[bin].Add(stat.GetGrad(), stat.GetHess());
}
}
}
});
}
exc.Rethrow();
}
template
void GHistBuilder<float>::BuildBlockHist(const std::vector<GradientPair>& gpair,
@@ -768,12 +779,11 @@ void GHistBuilder<GradientSumT>::SubtractionTrick(GHistRowT self,
const size_t block_size = 1024; // aproximatly 1024 values per block
size_t n_blocks = size/block_size + !!(size%block_size);
#pragma omp parallel for
for (omp_ulong iblock = 0; iblock < n_blocks; ++iblock) {
ParallelFor(omp_ulong(n_blocks), [&](omp_ulong iblock) {
const size_t ibegin = iblock*block_size;
const size_t iend = (((iblock+1)*block_size > size) ? size : ibegin + block_size);
SubtractionHist(self, parent, sibling, ibegin, iend);
}
});
}
template
void GHistBuilder<float>::SubtractionTrick(GHistRow<float> self,

View File

@@ -257,8 +257,7 @@ struct GHistIndexMatrix {
const size_t batch_size = batch.Size();
CHECK_LT(batch_size, offset_vec.size());
BinIdxType* index_data = index_data_span.data();
#pragma omp parallel for num_threads(batch_threads) schedule(static)
for (omp_ulong i = 0; i < batch_size; ++i) {
ParallelFor(omp_ulong(batch_size), batch_threads, [&](omp_ulong i) {
const int tid = omp_get_thread_num();
size_t ibegin = row_ptr[rbegin + i];
size_t iend = row_ptr[rbegin + i + 1];
@@ -270,7 +269,7 @@ struct GHistIndexMatrix {
index_data[ibegin + j] = get_offset(idx, j);
++hit_count_tloc_[tid * nbins + idx];
}
}
});
}
void ResizeIndex(const size_t n_index,

View File

@@ -35,7 +35,7 @@ HostSketchContainer::CalcColumnSize(SparsePage const &batch,
column.resize(n_columns, 0);
}
ParallelFor(page.Size(), nthreads, [&](size_t i) {
ParallelFor(omp_ulong(page.Size()), nthreads, [&](omp_ulong i) {
auto &local_column_sizes = column_sizes.at(omp_get_thread_num());
auto row = page[i];
auto const *p_row = row.data();
@@ -44,7 +44,7 @@ HostSketchContainer::CalcColumnSize(SparsePage const &batch,
}
});
std::vector<bst_row_t> entries_per_columns(n_columns, 0);
ParallelFor(n_columns, nthreads, [&](size_t i) {
ParallelFor(bst_omp_uint(n_columns), nthreads, [&](bst_omp_uint i) {
for (auto const &thread : column_sizes) {
entries_per_columns[i] += thread[i];
}
@@ -99,15 +99,15 @@ void HostSketchContainer::PushRowPage(SparsePage const &page,
std::vector<bst_uint> const &group_ptr = info.group_ptr_;
// Use group index for weights?
auto batch = page.GetView();
dmlc::OMPException exec;
// Parallel over columns. Each thread owns a set of consecutive columns.
auto const ncol = static_cast<uint32_t>(info.num_col_);
auto const is_dense = info.num_nonzero_ == info.num_col_ * info.num_row_;
auto thread_columns_ptr = LoadBalance(page, info.num_col_, nthread);
dmlc::OMPException exc;
#pragma omp parallel num_threads(nthread)
{
exec.Run([&]() {
exc.Run([&]() {
auto tid = static_cast<uint32_t>(omp_get_thread_num());
auto const begin = thread_columns_ptr[tid];
auto const end = thread_columns_ptr[tid + 1];
@@ -140,7 +140,7 @@ void HostSketchContainer::PushRowPage(SparsePage const &page,
}
});
}
exec.Rethrow();
exc.Rethrow();
monitor_.Stop(__func__);
}
@@ -242,7 +242,7 @@ size_t nbytes = 0;
&global_sketches);
std::vector<WQSketch::SummaryContainer> final_sketches(n_columns);
ParallelFor(n_columns, omp_get_max_threads(), [&](size_t fidx) {
ParallelFor(omp_ulong(n_columns), [&](omp_ulong fidx) {
int32_t intermediate_num_cuts = num_cuts[fidx];
auto nbytes =
WQSketch::SummaryContainer::CalcMemCost(intermediate_num_cuts);

View File

@@ -115,11 +115,10 @@ void ParallelFor2d(const BlockedSpace2d& space, int nthreads, Func func) {
nthreads = std::min(nthreads, omp_get_max_threads());
nthreads = std::max(nthreads, 1);
dmlc::OMPException omp_exc;
dmlc::OMPException exc;
#pragma omp parallel num_threads(nthreads)
{
omp_exc.Run(
[](size_t num_blocks_in_space, const BlockedSpace2d& space, int nthreads, Func func) {
exc.Run([&]() {
size_t tid = omp_get_thread_num();
size_t chunck_size =
num_blocks_in_space / nthreads + !!(num_blocks_in_space % nthreads);
@@ -129,19 +128,24 @@ void ParallelFor2d(const BlockedSpace2d& space, int nthreads, Func func) {
for (auto i = begin; i < end; i++) {
func(space.GetFirstDimension(i), space.GetRange(i));
}
}, num_blocks_in_space, space, nthreads, func);
});
}
omp_exc.Rethrow();
exc.Rethrow();
}
template <typename Func>
void ParallelFor(size_t size, size_t nthreads, Func fn) {
dmlc::OMPException omp_exc;
#pragma omp parallel for num_threads(nthreads)
for (omp_ulong i = 0; i < size; ++i) {
omp_exc.Run(fn, i);
template <typename Index, typename Func>
void ParallelFor(Index size, size_t nthreads, Func fn) {
dmlc::OMPException exc;
#pragma omp parallel for num_threads(nthreads) schedule(static)
for (Index i = 0; i < size; ++i) {
exc.Run(fn, i);
}
omp_exc.Rethrow();
exc.Rethrow();
}
template <typename Index, typename Func>
void ParallelFor(Index size, Func fn) {
ParallelFor(size, omp_get_max_threads(), fn);
}
/* \brief Configure parallel threads.

View File

@@ -16,6 +16,7 @@
#include "xgboost/span.h"
#include "common.h"
#include "threading_utils.h"
#if defined (__CUDACC__)
#include "device_helpers.cuh"
@@ -168,13 +169,10 @@ class Transform {
template <typename... HDV>
void LaunchCPU(Functor func, HDV*... vectors) const {
omp_ulong end = static_cast<omp_ulong>(*(range_.end()));
dmlc::OMPException omp_exc;
SyncHost(vectors...);
#pragma omp parallel for schedule(static)
for (omp_ulong idx = 0; idx < end; ++idx) {
omp_exc.Run(func, idx, UnpackHDV(vectors)...);
}
omp_exc.Rethrow();
ParallelFor(end, [&](omp_ulong idx) {
func(idx, UnpackHDV(vectors)...);
});
}
private: