Improve OpenMP exception handling (#6680)
This commit is contained in:
@@ -230,8 +230,7 @@ class ColumnMatrix {
|
||||
/* missing values make sense only for column with type kDenseColumn,
|
||||
and if no missing values were observed it could be handled much faster. */
|
||||
if (noMissingValues) {
|
||||
#pragma omp parallel for num_threads(omp_get_max_threads())
|
||||
for (omp_ulong rid = 0; rid < nrow; ++rid) {
|
||||
ParallelFor(omp_ulong(nrow), [&](omp_ulong rid) {
|
||||
const size_t ibegin = rid*nfeature;
|
||||
const size_t iend = (rid+1)*nfeature;
|
||||
size_t j = 0;
|
||||
@@ -239,7 +238,7 @@ class ColumnMatrix {
|
||||
const size_t idx = feature_offsets_[j];
|
||||
local_index[idx + rid] = index[i];
|
||||
}
|
||||
}
|
||||
});
|
||||
} else {
|
||||
/* to handle rows in all batches, sum of all batch sizes equal to gmat.row_ptr.size() - 1 */
|
||||
size_t rbegin = 0;
|
||||
|
||||
@@ -84,38 +84,46 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_bins) {
|
||||
|
||||
size_t block_size = batch.Size() / batch_threads;
|
||||
|
||||
dmlc::OMPException exc;
|
||||
#pragma omp parallel num_threads(batch_threads)
|
||||
{
|
||||
#pragma omp for
|
||||
for (omp_ulong tid = 0; tid < batch_threads; ++tid) {
|
||||
size_t ibegin = block_size * tid;
|
||||
size_t iend = (tid == (batch_threads-1) ? batch.Size() : (block_size * (tid+1)));
|
||||
exc.Run([&]() {
|
||||
size_t ibegin = block_size * tid;
|
||||
size_t iend = (tid == (batch_threads-1) ? batch.Size() : (block_size * (tid+1)));
|
||||
|
||||
size_t sum = 0;
|
||||
for (size_t i = ibegin; i < iend; ++i) {
|
||||
sum += page[i].size();
|
||||
row_ptr[rbegin + 1 + i] = sum;
|
||||
}
|
||||
size_t sum = 0;
|
||||
for (size_t i = ibegin; i < iend; ++i) {
|
||||
sum += page[i].size();
|
||||
row_ptr[rbegin + 1 + i] = sum;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
#pragma omp single
|
||||
{
|
||||
p_part[0] = prev_sum;
|
||||
for (size_t i = 1; i < batch_threads; ++i) {
|
||||
p_part[i] = p_part[i - 1] + row_ptr[rbegin + i*block_size];
|
||||
}
|
||||
exc.Run([&]() {
|
||||
p_part[0] = prev_sum;
|
||||
for (size_t i = 1; i < batch_threads; ++i) {
|
||||
p_part[i] = p_part[i - 1] + row_ptr[rbegin + i*block_size];
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
#pragma omp for
|
||||
for (omp_ulong tid = 0; tid < batch_threads; ++tid) {
|
||||
size_t ibegin = block_size * tid;
|
||||
size_t iend = (tid == (batch_threads-1) ? batch.Size() : (block_size * (tid+1)));
|
||||
exc.Run([&]() {
|
||||
size_t ibegin = block_size * tid;
|
||||
size_t iend = (tid == (batch_threads-1) ? batch.Size() : (block_size * (tid+1)));
|
||||
|
||||
for (size_t i = ibegin; i < iend; ++i) {
|
||||
row_ptr[rbegin + 1 + i] += p_part[tid];
|
||||
}
|
||||
for (size_t i = ibegin; i < iend; ++i) {
|
||||
row_ptr[rbegin + 1 + i] += p_part[tid];
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
exc.Rethrow();
|
||||
|
||||
const size_t n_offsets = cut.Ptrs().size() - 1;
|
||||
const size_t n_index = row_ptr[rbegin + batch.Size()];
|
||||
@@ -167,13 +175,12 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_bins) {
|
||||
[](auto idx, auto) { return idx; });
|
||||
}
|
||||
|
||||
#pragma omp parallel for num_threads(nthread) schedule(static)
|
||||
for (bst_omp_uint idx = 0; idx < bst_omp_uint(nbins); ++idx) {
|
||||
ParallelFor(bst_omp_uint(nbins), nthread, [&](bst_omp_uint idx) {
|
||||
for (int32_t tid = 0; tid < nthread; ++tid) {
|
||||
hit_count[idx] += hit_count_tloc_[tid * nbins + idx];
|
||||
hit_count_tloc_[tid * nbins + idx] = 0; // reset for next batch
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
prev_sum = row_ptr[rbegin + batch.Size()];
|
||||
rbegin += batch.Size();
|
||||
@@ -701,7 +708,7 @@ void GHistBuilder<GradientSumT>::BuildBlockHist(const std::vector<GradientPair>&
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexBlockMatrix& gmatb,
|
||||
GHistRowT hist) {
|
||||
constexpr int kUnroll = 8; // loop unrolling factor
|
||||
static constexpr int kUnroll = 8; // loop unrolling factor
|
||||
const size_t nblock = gmatb.GetNumBlock();
|
||||
const size_t nrows = row_indices.end - row_indices.begin;
|
||||
const size_t rest = nrows % kUnroll;
|
||||
@@ -710,40 +717,44 @@ void GHistBuilder<GradientSumT>::BuildBlockHist(const std::vector<GradientPair>&
|
||||
#endif // defined(_OPENMP)
|
||||
xgboost::detail::GradientPairInternal<GradientSumT>* p_hist = hist.data();
|
||||
|
||||
dmlc::OMPException exc;
|
||||
#pragma omp parallel for num_threads(nthread) schedule(guided)
|
||||
for (bst_omp_uint bid = 0; bid < nblock; ++bid) {
|
||||
auto gmat = gmatb[bid];
|
||||
exc.Run([&]() {
|
||||
auto gmat = gmatb[bid];
|
||||
|
||||
for (size_t i = 0; i < nrows - rest; i += kUnroll) {
|
||||
size_t rid[kUnroll];
|
||||
size_t ibegin[kUnroll];
|
||||
size_t iend[kUnroll];
|
||||
GradientPair stat[kUnroll];
|
||||
for (size_t i = 0; i < nrows - rest; i += kUnroll) {
|
||||
size_t rid[kUnroll];
|
||||
size_t ibegin[kUnroll];
|
||||
size_t iend[kUnroll];
|
||||
GradientPair stat[kUnroll];
|
||||
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
rid[k] = row_indices.begin[i + k];
|
||||
ibegin[k] = gmat.row_ptr[rid[k]];
|
||||
iend[k] = gmat.row_ptr[rid[k] + 1];
|
||||
stat[k] = gpair[rid[k]];
|
||||
}
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
for (size_t j = ibegin[k]; j < iend[k]; ++j) {
|
||||
const uint32_t bin = gmat.index[j];
|
||||
p_hist[bin].Add(stat[k].GetGrad(), stat[k].GetHess());
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
rid[k] = row_indices.begin[i + k];
|
||||
ibegin[k] = gmat.row_ptr[rid[k]];
|
||||
iend[k] = gmat.row_ptr[rid[k] + 1];
|
||||
stat[k] = gpair[rid[k]];
|
||||
}
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
for (size_t j = ibegin[k]; j < iend[k]; ++j) {
|
||||
const uint32_t bin = gmat.index[j];
|
||||
p_hist[bin].Add(stat[k].GetGrad(), stat[k].GetHess());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (size_t i = nrows - rest; i < nrows; ++i) {
|
||||
const size_t rid = row_indices.begin[i];
|
||||
const size_t ibegin = gmat.row_ptr[rid];
|
||||
const size_t iend = gmat.row_ptr[rid + 1];
|
||||
const GradientPair stat = gpair[rid];
|
||||
for (size_t j = ibegin; j < iend; ++j) {
|
||||
const uint32_t bin = gmat.index[j];
|
||||
p_hist[bin].Add(stat.GetGrad(), stat.GetHess());
|
||||
for (size_t i = nrows - rest; i < nrows; ++i) {
|
||||
const size_t rid = row_indices.begin[i];
|
||||
const size_t ibegin = gmat.row_ptr[rid];
|
||||
const size_t iend = gmat.row_ptr[rid + 1];
|
||||
const GradientPair stat = gpair[rid];
|
||||
for (size_t j = ibegin; j < iend; ++j) {
|
||||
const uint32_t bin = gmat.index[j];
|
||||
p_hist[bin].Add(stat.GetGrad(), stat.GetHess());
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
exc.Rethrow();
|
||||
}
|
||||
template
|
||||
void GHistBuilder<float>::BuildBlockHist(const std::vector<GradientPair>& gpair,
|
||||
@@ -768,12 +779,11 @@ void GHistBuilder<GradientSumT>::SubtractionTrick(GHistRowT self,
|
||||
const size_t block_size = 1024; // approximately 1024 values per block
|
||||
size_t n_blocks = size/block_size + !!(size%block_size);
|
||||
|
||||
#pragma omp parallel for
|
||||
for (omp_ulong iblock = 0; iblock < n_blocks; ++iblock) {
|
||||
ParallelFor(omp_ulong(n_blocks), [&](omp_ulong iblock) {
|
||||
const size_t ibegin = iblock*block_size;
|
||||
const size_t iend = (((iblock+1)*block_size > size) ? size : ibegin + block_size);
|
||||
SubtractionHist(self, parent, sibling, ibegin, iend);
|
||||
}
|
||||
});
|
||||
}
|
||||
template
|
||||
void GHistBuilder<float>::SubtractionTrick(GHistRow<float> self,
|
||||
|
||||
@@ -257,8 +257,7 @@ struct GHistIndexMatrix {
|
||||
const size_t batch_size = batch.Size();
|
||||
CHECK_LT(batch_size, offset_vec.size());
|
||||
BinIdxType* index_data = index_data_span.data();
|
||||
#pragma omp parallel for num_threads(batch_threads) schedule(static)
|
||||
for (omp_ulong i = 0; i < batch_size; ++i) {
|
||||
ParallelFor(omp_ulong(batch_size), batch_threads, [&](omp_ulong i) {
|
||||
const int tid = omp_get_thread_num();
|
||||
size_t ibegin = row_ptr[rbegin + i];
|
||||
size_t iend = row_ptr[rbegin + i + 1];
|
||||
@@ -270,7 +269,7 @@ struct GHistIndexMatrix {
|
||||
index_data[ibegin + j] = get_offset(idx, j);
|
||||
++hit_count_tloc_[tid * nbins + idx];
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
void ResizeIndex(const size_t n_index,
|
||||
|
||||
@@ -35,7 +35,7 @@ HostSketchContainer::CalcColumnSize(SparsePage const &batch,
|
||||
column.resize(n_columns, 0);
|
||||
}
|
||||
|
||||
ParallelFor(page.Size(), nthreads, [&](size_t i) {
|
||||
ParallelFor(omp_ulong(page.Size()), nthreads, [&](omp_ulong i) {
|
||||
auto &local_column_sizes = column_sizes.at(omp_get_thread_num());
|
||||
auto row = page[i];
|
||||
auto const *p_row = row.data();
|
||||
@@ -44,7 +44,7 @@ HostSketchContainer::CalcColumnSize(SparsePage const &batch,
|
||||
}
|
||||
});
|
||||
std::vector<bst_row_t> entries_per_columns(n_columns, 0);
|
||||
ParallelFor(n_columns, nthreads, [&](size_t i) {
|
||||
ParallelFor(bst_omp_uint(n_columns), nthreads, [&](bst_omp_uint i) {
|
||||
for (auto const &thread : column_sizes) {
|
||||
entries_per_columns[i] += thread[i];
|
||||
}
|
||||
@@ -99,15 +99,15 @@ void HostSketchContainer::PushRowPage(SparsePage const &page,
|
||||
std::vector<bst_uint> const &group_ptr = info.group_ptr_;
|
||||
// Use group index for weights?
|
||||
auto batch = page.GetView();
|
||||
dmlc::OMPException exec;
|
||||
// Parallel over columns. Each thread owns a set of consecutive columns.
|
||||
auto const ncol = static_cast<uint32_t>(info.num_col_);
|
||||
auto const is_dense = info.num_nonzero_ == info.num_col_ * info.num_row_;
|
||||
auto thread_columns_ptr = LoadBalance(page, info.num_col_, nthread);
|
||||
|
||||
dmlc::OMPException exc;
|
||||
#pragma omp parallel num_threads(nthread)
|
||||
{
|
||||
exec.Run([&]() {
|
||||
exc.Run([&]() {
|
||||
auto tid = static_cast<uint32_t>(omp_get_thread_num());
|
||||
auto const begin = thread_columns_ptr[tid];
|
||||
auto const end = thread_columns_ptr[tid + 1];
|
||||
@@ -140,7 +140,7 @@ void HostSketchContainer::PushRowPage(SparsePage const &page,
|
||||
}
|
||||
});
|
||||
}
|
||||
exec.Rethrow();
|
||||
exc.Rethrow();
|
||||
monitor_.Stop(__func__);
|
||||
}
|
||||
|
||||
@@ -242,7 +242,7 @@ size_t nbytes = 0;
|
||||
&global_sketches);
|
||||
|
||||
std::vector<WQSketch::SummaryContainer> final_sketches(n_columns);
|
||||
ParallelFor(n_columns, omp_get_max_threads(), [&](size_t fidx) {
|
||||
ParallelFor(omp_ulong(n_columns), [&](omp_ulong fidx) {
|
||||
int32_t intermediate_num_cuts = num_cuts[fidx];
|
||||
auto nbytes =
|
||||
WQSketch::SummaryContainer::CalcMemCost(intermediate_num_cuts);
|
||||
|
||||
@@ -115,11 +115,10 @@ void ParallelFor2d(const BlockedSpace2d& space, int nthreads, Func func) {
|
||||
nthreads = std::min(nthreads, omp_get_max_threads());
|
||||
nthreads = std::max(nthreads, 1);
|
||||
|
||||
dmlc::OMPException omp_exc;
|
||||
dmlc::OMPException exc;
|
||||
#pragma omp parallel num_threads(nthreads)
|
||||
{
|
||||
omp_exc.Run(
|
||||
[](size_t num_blocks_in_space, const BlockedSpace2d& space, int nthreads, Func func) {
|
||||
exc.Run([&]() {
|
||||
size_t tid = omp_get_thread_num();
|
||||
size_t chunck_size =
|
||||
num_blocks_in_space / nthreads + !!(num_blocks_in_space % nthreads);
|
||||
@@ -129,19 +128,24 @@ void ParallelFor2d(const BlockedSpace2d& space, int nthreads, Func func) {
|
||||
for (auto i = begin; i < end; i++) {
|
||||
func(space.GetFirstDimension(i), space.GetRange(i));
|
||||
}
|
||||
}, num_blocks_in_space, space, nthreads, func);
|
||||
});
|
||||
}
|
||||
omp_exc.Rethrow();
|
||||
exc.Rethrow();
|
||||
}
|
||||
|
||||
template <typename Func>
|
||||
void ParallelFor(size_t size, size_t nthreads, Func fn) {
|
||||
dmlc::OMPException omp_exc;
|
||||
#pragma omp parallel for num_threads(nthreads)
|
||||
for (omp_ulong i = 0; i < size; ++i) {
|
||||
omp_exc.Run(fn, i);
|
||||
template <typename Index, typename Func>
|
||||
void ParallelFor(Index size, size_t nthreads, Func fn) {
|
||||
dmlc::OMPException exc;
|
||||
#pragma omp parallel for num_threads(nthreads) schedule(static)
|
||||
for (Index i = 0; i < size; ++i) {
|
||||
exc.Run(fn, i);
|
||||
}
|
||||
omp_exc.Rethrow();
|
||||
exc.Rethrow();
|
||||
}
|
||||
|
||||
/*!
 * \brief Convenience overload of ParallelFor that does not take a thread count.
 *
 * Forwards to ParallelFor(size, nthreads, fn), passing omp_get_max_threads()
 * as the number of threads.
 *
 * \param size Iteration bound, forwarded unchanged.
 * \param fn   Callable forwarded unchanged to the three-argument overload.
 */
template <typename Index, typename Func>
|
||||
void ParallelFor(Index size, Func fn) {
|
||||
ParallelFor(size, omp_get_max_threads(), fn);
|
||||
}
|
||||
|
||||
/* \brief Configure parallel threads.
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "xgboost/span.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "threading_utils.h"
|
||||
|
||||
#if defined (__CUDACC__)
|
||||
#include "device_helpers.cuh"
|
||||
@@ -168,13 +169,10 @@ class Transform {
|
||||
template <typename... HDV>
|
||||
void LaunchCPU(Functor func, HDV*... vectors) const {
|
||||
omp_ulong end = static_cast<omp_ulong>(*(range_.end()));
|
||||
dmlc::OMPException omp_exc;
|
||||
SyncHost(vectors...);
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (omp_ulong idx = 0; idx < end; ++idx) {
|
||||
omp_exc.Run(func, idx, UnpackHDV(vectors)...);
|
||||
}
|
||||
omp_exc.Rethrow();
|
||||
ParallelFor(end, [&](omp_ulong idx) {
|
||||
func(idx, UnpackHDV(vectors)...);
|
||||
});
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
Reference in New Issue
Block a user