Improve OpenMP exception handling (#6680)
This commit is contained in:
parent
c375173dca
commit
9b530e5697
@ -1,6 +1,7 @@
|
|||||||
// Copyright (c) 2014 by Contributors
|
// Copyright (c) 2014 by Contributors
|
||||||
#include <dmlc/logging.h>
|
#include <dmlc/logging.h>
|
||||||
#include <dmlc/omp.h>
|
#include <dmlc/omp.h>
|
||||||
|
#include <dmlc/common.h>
|
||||||
#include <xgboost/c_api.h>
|
#include <xgboost/c_api.h>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
@ -92,12 +93,16 @@ SEXP XGDMatrixCreateFromMat_R(SEXP mat,
|
|||||||
din = REAL(mat);
|
din = REAL(mat);
|
||||||
}
|
}
|
||||||
std::vector<float> data(nrow * ncol);
|
std::vector<float> data(nrow * ncol);
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel for schedule(static)
|
#pragma omp parallel for schedule(static)
|
||||||
for (omp_ulong i = 0; i < nrow; ++i) {
|
for (omp_ulong i = 0; i < nrow; ++i) {
|
||||||
|
exc.Run([&]() {
|
||||||
for (size_t j = 0; j < ncol; ++j) {
|
for (size_t j = 0; j < ncol; ++j) {
|
||||||
data[i * ncol +j] = is_int ? static_cast<float>(iin[i + nrow * j]) : din[i + nrow * j];
|
data[i * ncol +j] = is_int ? static_cast<float>(iin[i + nrow * j]) : din[i + nrow * j];
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
DMatrixHandle handle;
|
DMatrixHandle handle;
|
||||||
CHECK_CALL(XGDMatrixCreateFromMat(BeginPtr(data), nrow, ncol, asReal(missing), &handle));
|
CHECK_CALL(XGDMatrixCreateFromMat(BeginPtr(data), nrow, ncol, asReal(missing), &handle));
|
||||||
ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
|
ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
|
||||||
@ -126,11 +131,15 @@ SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
|
|||||||
for (size_t i = 0; i < nindptr; ++i) {
|
for (size_t i = 0; i < nindptr; ++i) {
|
||||||
col_ptr_[i] = static_cast<size_t>(p_indptr[i]);
|
col_ptr_[i] = static_cast<size_t>(p_indptr[i]);
|
||||||
}
|
}
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel for schedule(static)
|
#pragma omp parallel for schedule(static)
|
||||||
for (int64_t i = 0; i < static_cast<int64_t>(ndata); ++i) {
|
for (int64_t i = 0; i < static_cast<int64_t>(ndata); ++i) {
|
||||||
|
exc.Run([&]() {
|
||||||
indices_[i] = static_cast<unsigned>(p_indices[i]);
|
indices_[i] = static_cast<unsigned>(p_indices[i]);
|
||||||
data_[i] = static_cast<float>(p_data[i]);
|
data_[i] = static_cast<float>(p_data[i]);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
DMatrixHandle handle;
|
DMatrixHandle handle;
|
||||||
CHECK_CALL(XGDMatrixCreateFromCSCEx(BeginPtr(col_ptr_), BeginPtr(indices_),
|
CHECK_CALL(XGDMatrixCreateFromCSCEx(BeginPtr(col_ptr_), BeginPtr(indices_),
|
||||||
BeginPtr(data_), nindptr, ndata,
|
BeginPtr(data_), nindptr, ndata,
|
||||||
@ -175,12 +184,16 @@ SEXP XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
|
|||||||
R_API_BEGIN();
|
R_API_BEGIN();
|
||||||
int len = length(array);
|
int len = length(array);
|
||||||
const char *name = CHAR(asChar(field));
|
const char *name = CHAR(asChar(field));
|
||||||
|
dmlc::OMPException exc;
|
||||||
if (!strcmp("group", name)) {
|
if (!strcmp("group", name)) {
|
||||||
std::vector<unsigned> vec(len);
|
std::vector<unsigned> vec(len);
|
||||||
#pragma omp parallel for schedule(static)
|
#pragma omp parallel for schedule(static)
|
||||||
for (int i = 0; i < len; ++i) {
|
for (int i = 0; i < len; ++i) {
|
||||||
|
exc.Run([&]() {
|
||||||
vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
|
vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
CHECK_CALL(XGDMatrixSetUIntInfo(R_ExternalPtrAddr(handle),
|
CHECK_CALL(XGDMatrixSetUIntInfo(R_ExternalPtrAddr(handle),
|
||||||
CHAR(asChar(field)),
|
CHAR(asChar(field)),
|
||||||
BeginPtr(vec), len));
|
BeginPtr(vec), len));
|
||||||
@ -188,8 +201,11 @@ SEXP XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
|
|||||||
std::vector<float> vec(len);
|
std::vector<float> vec(len);
|
||||||
#pragma omp parallel for schedule(static)
|
#pragma omp parallel for schedule(static)
|
||||||
for (int i = 0; i < len; ++i) {
|
for (int i = 0; i < len; ++i) {
|
||||||
|
exc.Run([&]() {
|
||||||
vec[i] = REAL(array)[i];
|
vec[i] = REAL(array)[i];
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
CHECK_CALL(XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle),
|
CHECK_CALL(XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle),
|
||||||
CHAR(asChar(field)),
|
CHAR(asChar(field)),
|
||||||
BeginPtr(vec), len));
|
BeginPtr(vec), len));
|
||||||
@ -280,11 +296,15 @@ SEXP XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess) {
|
|||||||
<< "gradient and hess must have same length";
|
<< "gradient and hess must have same length";
|
||||||
int len = length(grad);
|
int len = length(grad);
|
||||||
std::vector<float> tgrad(len), thess(len);
|
std::vector<float> tgrad(len), thess(len);
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel for schedule(static)
|
#pragma omp parallel for schedule(static)
|
||||||
for (int j = 0; j < len; ++j) {
|
for (int j = 0; j < len; ++j) {
|
||||||
|
exc.Run([&]() {
|
||||||
tgrad[j] = REAL(grad)[j];
|
tgrad[j] = REAL(grad)[j];
|
||||||
thess[j] = REAL(hess)[j];
|
thess[j] = REAL(hess)[j];
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
CHECK_CALL(XGBoosterBoostOneIter(R_ExternalPtrAddr(handle),
|
CHECK_CALL(XGBoosterBoostOneIter(R_ExternalPtrAddr(handle),
|
||||||
R_ExternalPtrAddr(dtrain),
|
R_ExternalPtrAddr(dtrain),
|
||||||
BeginPtr(tgrad), BeginPtr(thess),
|
BeginPtr(tgrad), BeginPtr(thess),
|
||||||
|
|||||||
@ -290,15 +290,19 @@ class SparsePage {
|
|||||||
|
|
||||||
void SortRows() {
|
void SortRows() {
|
||||||
auto ncol = static_cast<bst_omp_uint>(this->Size());
|
auto ncol = static_cast<bst_omp_uint>(this->Size());
|
||||||
#pragma omp parallel for default(none) shared(ncol) schedule(dynamic, 1)
|
dmlc::OMPException exc;
|
||||||
|
#pragma omp parallel for schedule(dynamic, 1)
|
||||||
for (bst_omp_uint i = 0; i < ncol; ++i) {
|
for (bst_omp_uint i = 0; i < ncol; ++i) {
|
||||||
|
exc.Run([&]() {
|
||||||
if (this->offset.HostVector()[i] < this->offset.HostVector()[i + 1]) {
|
if (this->offset.HostVector()[i] < this->offset.HostVector()[i + 1]) {
|
||||||
std::sort(
|
std::sort(
|
||||||
this->data.HostVector().begin() + this->offset.HostVector()[i],
|
this->data.HostVector().begin() + this->offset.HostVector()[i],
|
||||||
this->data.HostVector().begin() + this->offset.HostVector()[i + 1],
|
this->data.HostVector().begin() + this->offset.HostVector()[i + 1],
|
||||||
Entry::CmpValue);
|
Entry::CmpValue);
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@ -250,14 +250,18 @@ class SparsePageLZ4Format : public SparsePageFormat<SparsePage> {
|
|||||||
int nindex = index_.num_chunk();
|
int nindex = index_.num_chunk();
|
||||||
int nvalue = value_.num_chunk();
|
int nvalue = value_.num_chunk();
|
||||||
int ntotal = nindex + nvalue;
|
int ntotal = nindex + nvalue;
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread_write_)
|
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread_write_)
|
||||||
for (int i = 0; i < ntotal; ++i) {
|
for (int i = 0; i < ntotal; ++i) {
|
||||||
|
exc.Run([&]() {
|
||||||
if (i < nindex) {
|
if (i < nindex) {
|
||||||
index_.Compress(i, use_lz4_hc_);
|
index_.Compress(i, use_lz4_hc_);
|
||||||
} else {
|
} else {
|
||||||
value_.Compress(i - nindex, use_lz4_hc_);
|
value_.Compress(i - nindex, use_lz4_hc_);
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
index_.Write(fo);
|
index_.Write(fo);
|
||||||
value_.Write(fo);
|
value_.Write(fo);
|
||||||
// statistics
|
// statistics
|
||||||
@ -276,14 +280,18 @@ class SparsePageLZ4Format : public SparsePageFormat<SparsePage> {
|
|||||||
int nindex = index_.num_chunk();
|
int nindex = index_.num_chunk();
|
||||||
int nvalue = value_.num_chunk();
|
int nvalue = value_.num_chunk();
|
||||||
int ntotal = nindex + nvalue;
|
int ntotal = nindex + nvalue;
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread_)
|
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread_)
|
||||||
for (int i = 0; i < ntotal; ++i) {
|
for (int i = 0; i < ntotal; ++i) {
|
||||||
|
exc.Run([&]() {
|
||||||
if (i < nindex) {
|
if (i < nindex) {
|
||||||
index_.Decompress(i);
|
index_.Decompress(i);
|
||||||
} else {
|
} else {
|
||||||
value_.Decompress(i - nindex);
|
value_.Decompress(i - nindex);
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|||||||
@ -230,8 +230,7 @@ class ColumnMatrix {
|
|||||||
/* missing values make sense only for column with type kDenseColumn,
|
/* missing values make sense only for column with type kDenseColumn,
|
||||||
and if no missing values were observed it could be handled much faster. */
|
and if no missing values were observed it could be handled much faster. */
|
||||||
if (noMissingValues) {
|
if (noMissingValues) {
|
||||||
#pragma omp parallel for num_threads(omp_get_max_threads())
|
ParallelFor(omp_ulong(nrow), [&](omp_ulong rid) {
|
||||||
for (omp_ulong rid = 0; rid < nrow; ++rid) {
|
|
||||||
const size_t ibegin = rid*nfeature;
|
const size_t ibegin = rid*nfeature;
|
||||||
const size_t iend = (rid+1)*nfeature;
|
const size_t iend = (rid+1)*nfeature;
|
||||||
size_t j = 0;
|
size_t j = 0;
|
||||||
@ -239,7 +238,7 @@ class ColumnMatrix {
|
|||||||
const size_t idx = feature_offsets_[j];
|
const size_t idx = feature_offsets_[j];
|
||||||
local_index[idx + rid] = index[i];
|
local_index[idx + rid] = index[i];
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
} else {
|
} else {
|
||||||
/* to handle rows in all batches, sum of all batch sizes equal to gmat.row_ptr.size() - 1 */
|
/* to handle rows in all batches, sum of all batch sizes equal to gmat.row_ptr.size() - 1 */
|
||||||
size_t rbegin = 0;
|
size_t rbegin = 0;
|
||||||
|
|||||||
@ -84,10 +84,12 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_bins) {
|
|||||||
|
|
||||||
size_t block_size = batch.Size() / batch_threads;
|
size_t block_size = batch.Size() / batch_threads;
|
||||||
|
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel num_threads(batch_threads)
|
#pragma omp parallel num_threads(batch_threads)
|
||||||
{
|
{
|
||||||
#pragma omp for
|
#pragma omp for
|
||||||
for (omp_ulong tid = 0; tid < batch_threads; ++tid) {
|
for (omp_ulong tid = 0; tid < batch_threads; ++tid) {
|
||||||
|
exc.Run([&]() {
|
||||||
size_t ibegin = block_size * tid;
|
size_t ibegin = block_size * tid;
|
||||||
size_t iend = (tid == (batch_threads-1) ? batch.Size() : (block_size * (tid+1)));
|
size_t iend = (tid == (batch_threads-1) ? batch.Size() : (block_size * (tid+1)));
|
||||||
|
|
||||||
@ -96,26 +98,32 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_bins) {
|
|||||||
sum += page[i].size();
|
sum += page[i].size();
|
||||||
row_ptr[rbegin + 1 + i] = sum;
|
row_ptr[rbegin + 1 + i] = sum;
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma omp single
|
#pragma omp single
|
||||||
{
|
{
|
||||||
|
exc.Run([&]() {
|
||||||
p_part[0] = prev_sum;
|
p_part[0] = prev_sum;
|
||||||
for (size_t i = 1; i < batch_threads; ++i) {
|
for (size_t i = 1; i < batch_threads; ++i) {
|
||||||
p_part[i] = p_part[i - 1] + row_ptr[rbegin + i*block_size];
|
p_part[i] = p_part[i - 1] + row_ptr[rbegin + i*block_size];
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma omp for
|
#pragma omp for
|
||||||
for (omp_ulong tid = 0; tid < batch_threads; ++tid) {
|
for (omp_ulong tid = 0; tid < batch_threads; ++tid) {
|
||||||
|
exc.Run([&]() {
|
||||||
size_t ibegin = block_size * tid;
|
size_t ibegin = block_size * tid;
|
||||||
size_t iend = (tid == (batch_threads-1) ? batch.Size() : (block_size * (tid+1)));
|
size_t iend = (tid == (batch_threads-1) ? batch.Size() : (block_size * (tid+1)));
|
||||||
|
|
||||||
for (size_t i = ibegin; i < iend; ++i) {
|
for (size_t i = ibegin; i < iend; ++i) {
|
||||||
row_ptr[rbegin + 1 + i] += p_part[tid];
|
row_ptr[rbegin + 1 + i] += p_part[tid];
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
|
|
||||||
const size_t n_offsets = cut.Ptrs().size() - 1;
|
const size_t n_offsets = cut.Ptrs().size() - 1;
|
||||||
const size_t n_index = row_ptr[rbegin + batch.Size()];
|
const size_t n_index = row_ptr[rbegin + batch.Size()];
|
||||||
@ -167,13 +175,12 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_bins) {
|
|||||||
[](auto idx, auto) { return idx; });
|
[](auto idx, auto) { return idx; });
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma omp parallel for num_threads(nthread) schedule(static)
|
ParallelFor(bst_omp_uint(nbins), nthread, [&](bst_omp_uint idx) {
|
||||||
for (bst_omp_uint idx = 0; idx < bst_omp_uint(nbins); ++idx) {
|
|
||||||
for (int32_t tid = 0; tid < nthread; ++tid) {
|
for (int32_t tid = 0; tid < nthread; ++tid) {
|
||||||
hit_count[idx] += hit_count_tloc_[tid * nbins + idx];
|
hit_count[idx] += hit_count_tloc_[tid * nbins + idx];
|
||||||
hit_count_tloc_[tid * nbins + idx] = 0; // reset for next batch
|
hit_count_tloc_[tid * nbins + idx] = 0; // reset for next batch
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
|
|
||||||
prev_sum = row_ptr[rbegin + batch.Size()];
|
prev_sum = row_ptr[rbegin + batch.Size()];
|
||||||
rbegin += batch.Size();
|
rbegin += batch.Size();
|
||||||
@ -701,7 +708,7 @@ void GHistBuilder<GradientSumT>::BuildBlockHist(const std::vector<GradientPair>&
|
|||||||
const RowSetCollection::Elem row_indices,
|
const RowSetCollection::Elem row_indices,
|
||||||
const GHistIndexBlockMatrix& gmatb,
|
const GHistIndexBlockMatrix& gmatb,
|
||||||
GHistRowT hist) {
|
GHistRowT hist) {
|
||||||
constexpr int kUnroll = 8; // loop unrolling factor
|
static constexpr int kUnroll = 8; // loop unrolling factor
|
||||||
const size_t nblock = gmatb.GetNumBlock();
|
const size_t nblock = gmatb.GetNumBlock();
|
||||||
const size_t nrows = row_indices.end - row_indices.begin;
|
const size_t nrows = row_indices.end - row_indices.begin;
|
||||||
const size_t rest = nrows % kUnroll;
|
const size_t rest = nrows % kUnroll;
|
||||||
@ -710,8 +717,10 @@ void GHistBuilder<GradientSumT>::BuildBlockHist(const std::vector<GradientPair>&
|
|||||||
#endif // defined(_OPENMP)
|
#endif // defined(_OPENMP)
|
||||||
xgboost::detail::GradientPairInternal<GradientSumT>* p_hist = hist.data();
|
xgboost::detail::GradientPairInternal<GradientSumT>* p_hist = hist.data();
|
||||||
|
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel for num_threads(nthread) schedule(guided)
|
#pragma omp parallel for num_threads(nthread) schedule(guided)
|
||||||
for (bst_omp_uint bid = 0; bid < nblock; ++bid) {
|
for (bst_omp_uint bid = 0; bid < nblock; ++bid) {
|
||||||
|
exc.Run([&]() {
|
||||||
auto gmat = gmatb[bid];
|
auto gmat = gmatb[bid];
|
||||||
|
|
||||||
for (size_t i = 0; i < nrows - rest; i += kUnroll) {
|
for (size_t i = 0; i < nrows - rest; i += kUnroll) {
|
||||||
@ -743,7 +752,9 @@ void GHistBuilder<GradientSumT>::BuildBlockHist(const std::vector<GradientPair>&
|
|||||||
p_hist[bin].Add(stat.GetGrad(), stat.GetHess());
|
p_hist[bin].Add(stat.GetGrad(), stat.GetHess());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
}
|
}
|
||||||
template
|
template
|
||||||
void GHistBuilder<float>::BuildBlockHist(const std::vector<GradientPair>& gpair,
|
void GHistBuilder<float>::BuildBlockHist(const std::vector<GradientPair>& gpair,
|
||||||
@ -768,12 +779,11 @@ void GHistBuilder<GradientSumT>::SubtractionTrick(GHistRowT self,
|
|||||||
const size_t block_size = 1024; // aproximatly 1024 values per block
|
const size_t block_size = 1024; // aproximatly 1024 values per block
|
||||||
size_t n_blocks = size/block_size + !!(size%block_size);
|
size_t n_blocks = size/block_size + !!(size%block_size);
|
||||||
|
|
||||||
#pragma omp parallel for
|
ParallelFor(omp_ulong(n_blocks), [&](omp_ulong iblock) {
|
||||||
for (omp_ulong iblock = 0; iblock < n_blocks; ++iblock) {
|
|
||||||
const size_t ibegin = iblock*block_size;
|
const size_t ibegin = iblock*block_size;
|
||||||
const size_t iend = (((iblock+1)*block_size > size) ? size : ibegin + block_size);
|
const size_t iend = (((iblock+1)*block_size > size) ? size : ibegin + block_size);
|
||||||
SubtractionHist(self, parent, sibling, ibegin, iend);
|
SubtractionHist(self, parent, sibling, ibegin, iend);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
template
|
template
|
||||||
void GHistBuilder<float>::SubtractionTrick(GHistRow<float> self,
|
void GHistBuilder<float>::SubtractionTrick(GHistRow<float> self,
|
||||||
|
|||||||
@ -257,8 +257,7 @@ struct GHistIndexMatrix {
|
|||||||
const size_t batch_size = batch.Size();
|
const size_t batch_size = batch.Size();
|
||||||
CHECK_LT(batch_size, offset_vec.size());
|
CHECK_LT(batch_size, offset_vec.size());
|
||||||
BinIdxType* index_data = index_data_span.data();
|
BinIdxType* index_data = index_data_span.data();
|
||||||
#pragma omp parallel for num_threads(batch_threads) schedule(static)
|
ParallelFor(omp_ulong(batch_size), batch_threads, [&](omp_ulong i) {
|
||||||
for (omp_ulong i = 0; i < batch_size; ++i) {
|
|
||||||
const int tid = omp_get_thread_num();
|
const int tid = omp_get_thread_num();
|
||||||
size_t ibegin = row_ptr[rbegin + i];
|
size_t ibegin = row_ptr[rbegin + i];
|
||||||
size_t iend = row_ptr[rbegin + i + 1];
|
size_t iend = row_ptr[rbegin + i + 1];
|
||||||
@ -270,7 +269,7 @@ struct GHistIndexMatrix {
|
|||||||
index_data[ibegin + j] = get_offset(idx, j);
|
index_data[ibegin + j] = get_offset(idx, j);
|
||||||
++hit_count_tloc_[tid * nbins + idx];
|
++hit_count_tloc_[tid * nbins + idx];
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
void ResizeIndex(const size_t n_index,
|
void ResizeIndex(const size_t n_index,
|
||||||
|
|||||||
@ -35,7 +35,7 @@ HostSketchContainer::CalcColumnSize(SparsePage const &batch,
|
|||||||
column.resize(n_columns, 0);
|
column.resize(n_columns, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
ParallelFor(page.Size(), nthreads, [&](size_t i) {
|
ParallelFor(omp_ulong(page.Size()), nthreads, [&](omp_ulong i) {
|
||||||
auto &local_column_sizes = column_sizes.at(omp_get_thread_num());
|
auto &local_column_sizes = column_sizes.at(omp_get_thread_num());
|
||||||
auto row = page[i];
|
auto row = page[i];
|
||||||
auto const *p_row = row.data();
|
auto const *p_row = row.data();
|
||||||
@ -44,7 +44,7 @@ HostSketchContainer::CalcColumnSize(SparsePage const &batch,
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
std::vector<bst_row_t> entries_per_columns(n_columns, 0);
|
std::vector<bst_row_t> entries_per_columns(n_columns, 0);
|
||||||
ParallelFor(n_columns, nthreads, [&](size_t i) {
|
ParallelFor(bst_omp_uint(n_columns), nthreads, [&](bst_omp_uint i) {
|
||||||
for (auto const &thread : column_sizes) {
|
for (auto const &thread : column_sizes) {
|
||||||
entries_per_columns[i] += thread[i];
|
entries_per_columns[i] += thread[i];
|
||||||
}
|
}
|
||||||
@ -99,15 +99,15 @@ void HostSketchContainer::PushRowPage(SparsePage const &page,
|
|||||||
std::vector<bst_uint> const &group_ptr = info.group_ptr_;
|
std::vector<bst_uint> const &group_ptr = info.group_ptr_;
|
||||||
// Use group index for weights?
|
// Use group index for weights?
|
||||||
auto batch = page.GetView();
|
auto batch = page.GetView();
|
||||||
dmlc::OMPException exec;
|
|
||||||
// Parallel over columns. Each thread owns a set of consecutive columns.
|
// Parallel over columns. Each thread owns a set of consecutive columns.
|
||||||
auto const ncol = static_cast<uint32_t>(info.num_col_);
|
auto const ncol = static_cast<uint32_t>(info.num_col_);
|
||||||
auto const is_dense = info.num_nonzero_ == info.num_col_ * info.num_row_;
|
auto const is_dense = info.num_nonzero_ == info.num_col_ * info.num_row_;
|
||||||
auto thread_columns_ptr = LoadBalance(page, info.num_col_, nthread);
|
auto thread_columns_ptr = LoadBalance(page, info.num_col_, nthread);
|
||||||
|
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel num_threads(nthread)
|
#pragma omp parallel num_threads(nthread)
|
||||||
{
|
{
|
||||||
exec.Run([&]() {
|
exc.Run([&]() {
|
||||||
auto tid = static_cast<uint32_t>(omp_get_thread_num());
|
auto tid = static_cast<uint32_t>(omp_get_thread_num());
|
||||||
auto const begin = thread_columns_ptr[tid];
|
auto const begin = thread_columns_ptr[tid];
|
||||||
auto const end = thread_columns_ptr[tid + 1];
|
auto const end = thread_columns_ptr[tid + 1];
|
||||||
@ -140,7 +140,7 @@ void HostSketchContainer::PushRowPage(SparsePage const &page,
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
exec.Rethrow();
|
exc.Rethrow();
|
||||||
monitor_.Stop(__func__);
|
monitor_.Stop(__func__);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -242,7 +242,7 @@ size_t nbytes = 0;
|
|||||||
&global_sketches);
|
&global_sketches);
|
||||||
|
|
||||||
std::vector<WQSketch::SummaryContainer> final_sketches(n_columns);
|
std::vector<WQSketch::SummaryContainer> final_sketches(n_columns);
|
||||||
ParallelFor(n_columns, omp_get_max_threads(), [&](size_t fidx) {
|
ParallelFor(omp_ulong(n_columns), [&](omp_ulong fidx) {
|
||||||
int32_t intermediate_num_cuts = num_cuts[fidx];
|
int32_t intermediate_num_cuts = num_cuts[fidx];
|
||||||
auto nbytes =
|
auto nbytes =
|
||||||
WQSketch::SummaryContainer::CalcMemCost(intermediate_num_cuts);
|
WQSketch::SummaryContainer::CalcMemCost(intermediate_num_cuts);
|
||||||
|
|||||||
@ -115,11 +115,10 @@ void ParallelFor2d(const BlockedSpace2d& space, int nthreads, Func func) {
|
|||||||
nthreads = std::min(nthreads, omp_get_max_threads());
|
nthreads = std::min(nthreads, omp_get_max_threads());
|
||||||
nthreads = std::max(nthreads, 1);
|
nthreads = std::max(nthreads, 1);
|
||||||
|
|
||||||
dmlc::OMPException omp_exc;
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel num_threads(nthreads)
|
#pragma omp parallel num_threads(nthreads)
|
||||||
{
|
{
|
||||||
omp_exc.Run(
|
exc.Run([&]() {
|
||||||
[](size_t num_blocks_in_space, const BlockedSpace2d& space, int nthreads, Func func) {
|
|
||||||
size_t tid = omp_get_thread_num();
|
size_t tid = omp_get_thread_num();
|
||||||
size_t chunck_size =
|
size_t chunck_size =
|
||||||
num_blocks_in_space / nthreads + !!(num_blocks_in_space % nthreads);
|
num_blocks_in_space / nthreads + !!(num_blocks_in_space % nthreads);
|
||||||
@ -129,19 +128,24 @@ void ParallelFor2d(const BlockedSpace2d& space, int nthreads, Func func) {
|
|||||||
for (auto i = begin; i < end; i++) {
|
for (auto i = begin; i < end; i++) {
|
||||||
func(space.GetFirstDimension(i), space.GetRange(i));
|
func(space.GetFirstDimension(i), space.GetRange(i));
|
||||||
}
|
}
|
||||||
}, num_blocks_in_space, space, nthreads, func);
|
});
|
||||||
}
|
}
|
||||||
omp_exc.Rethrow();
|
exc.Rethrow();
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename Func>
|
template <typename Index, typename Func>
|
||||||
void ParallelFor(size_t size, size_t nthreads, Func fn) {
|
void ParallelFor(Index size, size_t nthreads, Func fn) {
|
||||||
dmlc::OMPException omp_exc;
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel for num_threads(nthreads)
|
#pragma omp parallel for num_threads(nthreads) schedule(static)
|
||||||
for (omp_ulong i = 0; i < size; ++i) {
|
for (Index i = 0; i < size; ++i) {
|
||||||
omp_exc.Run(fn, i);
|
exc.Run(fn, i);
|
||||||
}
|
}
|
||||||
omp_exc.Rethrow();
|
exc.Rethrow();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Index, typename Func>
|
||||||
|
void ParallelFor(Index size, Func fn) {
|
||||||
|
ParallelFor(size, omp_get_max_threads(), fn);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* \brief Configure parallel threads.
|
/* \brief Configure parallel threads.
|
||||||
|
|||||||
@ -16,6 +16,7 @@
|
|||||||
#include "xgboost/span.h"
|
#include "xgboost/span.h"
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "threading_utils.h"
|
||||||
|
|
||||||
#if defined (__CUDACC__)
|
#if defined (__CUDACC__)
|
||||||
#include "device_helpers.cuh"
|
#include "device_helpers.cuh"
|
||||||
@ -168,13 +169,10 @@ class Transform {
|
|||||||
template <typename... HDV>
|
template <typename... HDV>
|
||||||
void LaunchCPU(Functor func, HDV*... vectors) const {
|
void LaunchCPU(Functor func, HDV*... vectors) const {
|
||||||
omp_ulong end = static_cast<omp_ulong>(*(range_.end()));
|
omp_ulong end = static_cast<omp_ulong>(*(range_.end()));
|
||||||
dmlc::OMPException omp_exc;
|
|
||||||
SyncHost(vectors...);
|
SyncHost(vectors...);
|
||||||
#pragma omp parallel for schedule(static)
|
ParallelFor(end, [&](omp_ulong idx) {
|
||||||
for (omp_ulong idx = 0; idx < end; ++idx) {
|
func(idx, UnpackHDV(vectors)...);
|
||||||
omp_exc.Run(func, idx, UnpackHDV(vectors)...);
|
});
|
||||||
}
|
|
||||||
omp_exc.Rethrow();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|||||||
@ -830,17 +830,15 @@ SparsePage SparsePage::GetTranspose(int num_columns) const {
|
|||||||
builder.InitBudget(num_columns, nthread);
|
builder.InitBudget(num_columns, nthread);
|
||||||
long batch_size = static_cast<long>(this->Size()); // NOLINT(*)
|
long batch_size = static_cast<long>(this->Size()); // NOLINT(*)
|
||||||
auto page = this->GetView();
|
auto page = this->GetView();
|
||||||
#pragma omp parallel for default(none) shared(batch_size, builder, page) schedule(static)
|
common::ParallelFor(batch_size, [&](long i) { // NOLINT(*)
|
||||||
for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
|
|
||||||
int tid = omp_get_thread_num();
|
int tid = omp_get_thread_num();
|
||||||
auto inst = page[i];
|
auto inst = page[i];
|
||||||
for (const auto& entry : inst) {
|
for (const auto& entry : inst) {
|
||||||
builder.AddBudget(entry.index, tid);
|
builder.AddBudget(entry.index, tid);
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
builder.InitStorage();
|
builder.InitStorage();
|
||||||
#pragma omp parallel for default(none) shared(batch_size, builder, page) schedule(static)
|
common::ParallelFor(batch_size, [&](long i) { // NOLINT(*)
|
||||||
for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
|
|
||||||
int tid = omp_get_thread_num();
|
int tid = omp_get_thread_num();
|
||||||
auto inst = page[i];
|
auto inst = page[i];
|
||||||
for (const auto& entry : inst) {
|
for (const auto& entry : inst) {
|
||||||
@ -849,7 +847,7 @@ SparsePage SparsePage::GetTranspose(int num_columns) const {
|
|||||||
Entry(static_cast<bst_uint>(this->base_rowid + i), entry.fvalue),
|
Entry(static_cast<bst_uint>(this->base_rowid + i), entry.fvalue),
|
||||||
tid);
|
tid);
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
return transpose;
|
return transpose;
|
||||||
}
|
}
|
||||||
void SparsePage::Push(const SparsePage &batch) {
|
void SparsePage::Push(const SparsePage &batch) {
|
||||||
@ -900,11 +898,11 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread
|
|||||||
return max_columns;
|
return max_columns;
|
||||||
}
|
}
|
||||||
std::vector<std::vector<uint64_t>> max_columns_vector(nthread);
|
std::vector<std::vector<uint64_t>> max_columns_vector(nthread);
|
||||||
dmlc::OMPException exec;
|
dmlc::OMPException exc;
|
||||||
// First-pass over the batch counting valid elements
|
// First-pass over the batch counting valid elements
|
||||||
#pragma omp parallel num_threads(nthread)
|
#pragma omp parallel num_threads(nthread)
|
||||||
{
|
{
|
||||||
exec.Run([&]() {
|
exc.Run([&]() {
|
||||||
int tid = omp_get_thread_num();
|
int tid = omp_get_thread_num();
|
||||||
size_t begin = tid*thread_size;
|
size_t begin = tid*thread_size;
|
||||||
size_t end = tid != (nthread-1) ? (tid+1)*thread_size : batch_size;
|
size_t end = tid != (nthread-1) ? (tid+1)*thread_size : batch_size;
|
||||||
@ -929,7 +927,7 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
exec.Rethrow();
|
exc.Rethrow();
|
||||||
for (const auto & max : max_columns_vector) {
|
for (const auto & max : max_columns_vector) {
|
||||||
max_columns = std::max(max_columns, max[0]);
|
max_columns = std::max(max_columns, max[0]);
|
||||||
}
|
}
|
||||||
@ -940,7 +938,7 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread
|
|||||||
|
|
||||||
#pragma omp parallel num_threads(nthread)
|
#pragma omp parallel num_threads(nthread)
|
||||||
{
|
{
|
||||||
exec.Run([&]() {
|
exc.Run([&]() {
|
||||||
int tid = omp_get_thread_num();
|
int tid = omp_get_thread_num();
|
||||||
size_t begin = tid*thread_size;
|
size_t begin = tid*thread_size;
|
||||||
size_t end = tid != (nthread-1) ? (tid+1)*thread_size : batch_size;
|
size_t end = tid != (nthread-1) ? (tid+1)*thread_size : batch_size;
|
||||||
@ -956,7 +954,7 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
exec.Rethrow();
|
exc.Rethrow();
|
||||||
omp_set_num_threads(nthread_original);
|
omp_set_num_threads(nthread_original);
|
||||||
|
|
||||||
return max_columns;
|
return max_columns;
|
||||||
|
|||||||
@ -23,6 +23,7 @@
|
|||||||
#include "gblinear_model.h"
|
#include "gblinear_model.h"
|
||||||
#include "../common/timer.h"
|
#include "../common/timer.h"
|
||||||
#include "../common/common.h"
|
#include "../common/common.h"
|
||||||
|
#include "../common/threading_utils.h"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace gbm {
|
namespace gbm {
|
||||||
@ -178,8 +179,7 @@ class GBLinear : public GradientBooster {
|
|||||||
// parallel over local batch
|
// parallel over local batch
|
||||||
const auto nsize = static_cast<bst_omp_uint>(batch.Size());
|
const auto nsize = static_cast<bst_omp_uint>(batch.Size());
|
||||||
auto page = batch.GetView();
|
auto page = batch.GetView();
|
||||||
#pragma omp parallel for schedule(static)
|
common::ParallelFor(nsize, [&](bst_omp_uint i) {
|
||||||
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
|
||||||
auto inst = page[i];
|
auto inst = page[i];
|
||||||
auto row_idx = static_cast<size_t>(batch.base_rowid + i);
|
auto row_idx = static_cast<size_t>(batch.base_rowid + i);
|
||||||
// loop over output groups
|
// loop over output groups
|
||||||
@ -195,7 +195,7 @@ class GBLinear : public GradientBooster {
|
|||||||
((base_margin.size() != 0) ? base_margin[row_idx * ngroup + gid] :
|
((base_margin.size() != 0) ? base_margin[row_idx * ngroup + gid] :
|
||||||
learner_model_param_->base_score);
|
learner_model_param_->base_score);
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -246,8 +246,7 @@ class GBLinear : public GradientBooster {
|
|||||||
if (base_margin.size() != 0) {
|
if (base_margin.size() != 0) {
|
||||||
CHECK_EQ(base_margin.size(), nsize * ngroup);
|
CHECK_EQ(base_margin.size(), nsize * ngroup);
|
||||||
}
|
}
|
||||||
#pragma omp parallel for schedule(static)
|
common::ParallelFor(nsize, [&](omp_ulong i) {
|
||||||
for (omp_ulong i = 0; i < nsize; ++i) {
|
|
||||||
const size_t ridx = page.base_rowid + i;
|
const size_t ridx = page.base_rowid + i;
|
||||||
// loop over output groups
|
// loop over output groups
|
||||||
for (int gid = 0; gid < ngroup; ++gid) {
|
for (int gid = 0; gid < ngroup; ++gid) {
|
||||||
@ -256,7 +255,7 @@ class GBLinear : public GradientBooster {
|
|||||||
base_margin[ridx * ngroup + gid] : learner_model_param_->base_score;
|
base_margin[ridx * ngroup + gid] : learner_model_param_->base_score;
|
||||||
this->Pred(batch[i], &preds[ridx * ngroup], gid, margin);
|
this->Pred(batch[i], &preds[ridx * ngroup], gid, margin);
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
monitor_.Stop("PredictBatchInternal");
|
monitor_.Stop("PredictBatchInternal");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -27,6 +27,7 @@
|
|||||||
#include "../common/common.h"
|
#include "../common/common.h"
|
||||||
#include "../common/random.h"
|
#include "../common/random.h"
|
||||||
#include "../common/timer.h"
|
#include "../common/timer.h"
|
||||||
|
#include "../common/threading_utils.h"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace gbm {
|
namespace gbm {
|
||||||
@ -219,10 +220,9 @@ void GBTree::DoBoost(DMatrix* p_fmat,
|
|||||||
bool update_predict = true;
|
bool update_predict = true;
|
||||||
for (int gid = 0; gid < ngroup; ++gid) {
|
for (int gid = 0; gid < ngroup; ++gid) {
|
||||||
std::vector<GradientPair>& tmp_h = tmp.HostVector();
|
std::vector<GradientPair>& tmp_h = tmp.HostVector();
|
||||||
#pragma omp parallel for schedule(static)
|
common::ParallelFor(nsize, [&](bst_omp_uint i) {
|
||||||
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
|
||||||
tmp_h[i] = gpair_h[i * ngroup + gid];
|
tmp_h[i] = gpair_h[i * ngroup + gid];
|
||||||
}
|
});
|
||||||
std::vector<std::unique_ptr<RegTree> > ret;
|
std::vector<std::unique_ptr<RegTree> > ret;
|
||||||
BoostNewTrees(&tmp, p_fmat, gid, &ret);
|
BoostNewTrees(&tmp, p_fmat, gid, &ret);
|
||||||
const size_t num_new_trees = ret.size();
|
const size_t num_new_trees = ret.size();
|
||||||
|
|||||||
@ -14,6 +14,7 @@
|
|||||||
#include "./param.h"
|
#include "./param.h"
|
||||||
#include "../gbm/gblinear_model.h"
|
#include "../gbm/gblinear_model.h"
|
||||||
#include "../common/random.h"
|
#include "../common/random.h"
|
||||||
|
#include "../common/threading_utils.h"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace linear {
|
namespace linear {
|
||||||
@ -115,14 +116,18 @@ inline std::pair<double, double> GetGradientParallel(int group_idx, int num_grou
|
|||||||
auto page = batch.GetView();
|
auto page = batch.GetView();
|
||||||
auto col = page[fidx];
|
auto col = page[fidx];
|
||||||
const auto ndata = static_cast<bst_omp_uint>(col.size());
|
const auto ndata = static_cast<bst_omp_uint>(col.size());
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel for schedule(static) reduction(+ : sum_grad, sum_hess)
|
#pragma omp parallel for schedule(static) reduction(+ : sum_grad, sum_hess)
|
||||||
for (bst_omp_uint j = 0; j < ndata; ++j) {
|
for (bst_omp_uint j = 0; j < ndata; ++j) {
|
||||||
|
exc.Run([&]() {
|
||||||
const bst_float v = col[j].fvalue;
|
const bst_float v = col[j].fvalue;
|
||||||
auto &p = gpair[col[j].index * num_group + group_idx];
|
auto &p = gpair[col[j].index * num_group + group_idx];
|
||||||
if (p.GetHess() < 0.0f) continue;
|
if (p.GetHess() < 0.0f) return;
|
||||||
sum_grad += p.GetGrad() * v;
|
sum_grad += p.GetGrad() * v;
|
||||||
sum_hess += p.GetHess() * v * v;
|
sum_hess += p.GetHess() * v * v;
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
}
|
}
|
||||||
return std::make_pair(sum_grad, sum_hess);
|
return std::make_pair(sum_grad, sum_hess);
|
||||||
}
|
}
|
||||||
@ -142,14 +147,18 @@ inline std::pair<double, double> GetBiasGradientParallel(int group_idx, int num_
|
|||||||
DMatrix *p_fmat) {
|
DMatrix *p_fmat) {
|
||||||
double sum_grad = 0.0, sum_hess = 0.0;
|
double sum_grad = 0.0, sum_hess = 0.0;
|
||||||
const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);
|
const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel for schedule(static) reduction(+ : sum_grad, sum_hess)
|
#pragma omp parallel for schedule(static) reduction(+ : sum_grad, sum_hess)
|
||||||
for (bst_omp_uint i = 0; i < ndata; ++i) {
|
for (bst_omp_uint i = 0; i < ndata; ++i) {
|
||||||
|
exc.Run([&]() {
|
||||||
auto &p = gpair[i * num_group + group_idx];
|
auto &p = gpair[i * num_group + group_idx];
|
||||||
if (p.GetHess() >= 0.0f) {
|
if (p.GetHess() >= 0.0f) {
|
||||||
sum_grad += p.GetGrad();
|
sum_grad += p.GetGrad();
|
||||||
sum_hess += p.GetHess();
|
sum_hess += p.GetHess();
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
return std::make_pair(sum_grad, sum_hess);
|
return std::make_pair(sum_grad, sum_hess);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -172,12 +181,16 @@ inline void UpdateResidualParallel(int fidx, int group_idx, int num_group,
|
|||||||
auto col = page[fidx];
|
auto col = page[fidx];
|
||||||
// update grad value
|
// update grad value
|
||||||
const auto num_row = static_cast<bst_omp_uint>(col.size());
|
const auto num_row = static_cast<bst_omp_uint>(col.size());
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel for schedule(static)
|
#pragma omp parallel for schedule(static)
|
||||||
for (bst_omp_uint j = 0; j < num_row; ++j) {
|
for (bst_omp_uint j = 0; j < num_row; ++j) {
|
||||||
|
exc.Run([&]() {
|
||||||
GradientPair &p = (*in_gpair)[col[j].index * num_group + group_idx];
|
GradientPair &p = (*in_gpair)[col[j].index * num_group + group_idx];
|
||||||
if (p.GetHess() < 0.0f) continue;
|
if (p.GetHess() < 0.0f) return;
|
||||||
p += GradientPair(p.GetHess() * col[j].fvalue * dw, 0);
|
p += GradientPair(p.GetHess() * col[j].fvalue * dw, 0);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -195,12 +208,16 @@ inline void UpdateBiasResidualParallel(int group_idx, int num_group, float dbias
|
|||||||
DMatrix *p_fmat) {
|
DMatrix *p_fmat) {
|
||||||
if (dbias == 0.0f) return;
|
if (dbias == 0.0f) return;
|
||||||
const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);
|
const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel for schedule(static)
|
#pragma omp parallel for schedule(static)
|
||||||
for (bst_omp_uint i = 0; i < ndata; ++i) {
|
for (bst_omp_uint i = 0; i < ndata; ++i) {
|
||||||
|
exc.Run([&]() {
|
||||||
GradientPair &g = (*in_gpair)[i * num_group + group_idx];
|
GradientPair &g = (*in_gpair)[i * num_group + group_idx];
|
||||||
if (g.GetHess() < 0.0f) continue;
|
if (g.GetHess() < 0.0f) return;
|
||||||
g += GradientPair(g.GetHess() * dbias, 0);
|
g += GradientPair(g.GetHess() * dbias, 0);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -338,8 +355,7 @@ class GreedyFeatureSelector : public FeatureSelector {
|
|||||||
std::fill(gpair_sums_.begin(), gpair_sums_.end(), std::make_pair(0., 0.));
|
std::fill(gpair_sums_.begin(), gpair_sums_.end(), std::make_pair(0., 0.));
|
||||||
for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
|
for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
|
||||||
auto page = batch.GetView();
|
auto page = batch.GetView();
|
||||||
#pragma omp parallel for schedule(static)
|
common::ParallelFor(nfeat, [&](bst_omp_uint i) {
|
||||||
for (bst_omp_uint i = 0; i < nfeat; ++i) {
|
|
||||||
const auto col = page[i];
|
const auto col = page[i];
|
||||||
const bst_uint ndata = col.size();
|
const bst_uint ndata = col.size();
|
||||||
auto &sums = gpair_sums_[group_idx * nfeat + i];
|
auto &sums = gpair_sums_[group_idx * nfeat + i];
|
||||||
@ -350,7 +366,7 @@ class GreedyFeatureSelector : public FeatureSelector {
|
|||||||
sums.first += p.GetGrad() * v;
|
sums.first += p.GetGrad() * v;
|
||||||
sums.second += p.GetHess() * v * v;
|
sums.second += p.GetHess() * v * v;
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
// Find a feature with the largest magnitude of weight change
|
// Find a feature with the largest magnitude of weight change
|
||||||
int best_fidx = 0;
|
int best_fidx = 0;
|
||||||
@ -405,8 +421,7 @@ class ThriftyFeatureSelector : public FeatureSelector {
|
|||||||
for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
|
for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
|
||||||
auto page = batch.GetView();
|
auto page = batch.GetView();
|
||||||
// column-parallel is usually fastaer than row-parallel
|
// column-parallel is usually fastaer than row-parallel
|
||||||
#pragma omp parallel for schedule(static)
|
common::ParallelFor(nfeat, [&](bst_omp_uint i) {
|
||||||
for (bst_omp_uint i = 0; i < nfeat; ++i) {
|
|
||||||
const auto col = page[i];
|
const auto col = page[i];
|
||||||
const bst_uint ndata = col.size();
|
const bst_uint ndata = col.size();
|
||||||
for (bst_uint gid = 0u; gid < ngroup; ++gid) {
|
for (bst_uint gid = 0u; gid < ngroup; ++gid) {
|
||||||
@ -419,7 +434,7 @@ class ThriftyFeatureSelector : public FeatureSelector {
|
|||||||
sums.second += p.GetHess() * v * v;
|
sums.second += p.GetHess() * v * v;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
// rank by descending weight magnitude within the groups
|
// rank by descending weight magnitude within the groups
|
||||||
std::fill(deltaw_.begin(), deltaw_.end(), 0.f);
|
std::fill(deltaw_.begin(), deltaw_.end(), 0.f);
|
||||||
|
|||||||
@ -54,12 +54,14 @@ class ShotgunUpdater : public LinearUpdater {
|
|||||||
for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
|
for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
|
||||||
auto page = batch.GetView();
|
auto page = batch.GetView();
|
||||||
const auto nfeat = static_cast<bst_omp_uint>(batch.Size());
|
const auto nfeat = static_cast<bst_omp_uint>(batch.Size());
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel for schedule(static)
|
#pragma omp parallel for schedule(static)
|
||||||
for (bst_omp_uint i = 0; i < nfeat; ++i) {
|
for (bst_omp_uint i = 0; i < nfeat; ++i) {
|
||||||
|
exc.Run([&]() {
|
||||||
int ii = selector_->NextFeature
|
int ii = selector_->NextFeature
|
||||||
(i, *model, 0, in_gpair->ConstHostVector(), p_fmat, param_.reg_alpha_denorm,
|
(i, *model, 0, in_gpair->ConstHostVector(), p_fmat, param_.reg_alpha_denorm,
|
||||||
param_.reg_lambda_denorm);
|
param_.reg_lambda_denorm);
|
||||||
if (ii < 0) continue;
|
if (ii < 0) return;
|
||||||
const bst_uint fid = ii;
|
const bst_uint fid = ii;
|
||||||
auto col = page[ii];
|
auto col = page[ii];
|
||||||
for (int gid = 0; gid < ngroup; ++gid) {
|
for (int gid = 0; gid < ngroup; ++gid) {
|
||||||
@ -85,7 +87,9 @@ class ShotgunUpdater : public LinearUpdater {
|
|||||||
p += GradientPair(p.GetHess() * c.fvalue * dw, 0);
|
p += GradientPair(p.GetHess() * c.fvalue * dw, 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -47,12 +47,16 @@ class ElementWiseMetricsReduction {
|
|||||||
bst_float residue_sum = 0;
|
bst_float residue_sum = 0;
|
||||||
bst_float weights_sum = 0;
|
bst_float weights_sum = 0;
|
||||||
|
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel for reduction(+: residue_sum, weights_sum) schedule(static)
|
#pragma omp parallel for reduction(+: residue_sum, weights_sum) schedule(static)
|
||||||
for (omp_ulong i = 0; i < ndata; ++i) {
|
for (omp_ulong i = 0; i < ndata; ++i) {
|
||||||
|
exc.Run([&]() {
|
||||||
const bst_float wt = h_weights.size() > 0 ? h_weights[i] : 1.0f;
|
const bst_float wt = h_weights.size() > 0 ? h_weights[i] : 1.0f;
|
||||||
residue_sum += policy_.EvalRow(h_labels[i], h_preds[i]) * wt;
|
residue_sum += policy_.EvalRow(h_labels[i], h_preds[i]) * wt;
|
||||||
weights_sum += wt;
|
weights_sum += wt;
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
PackedReduceResult res { residue_sum, weights_sum };
|
PackedReduceResult res { residue_sum, weights_sum };
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -53,8 +53,10 @@ class MultiClassMetricsReduction {
|
|||||||
int label_error = 0;
|
int label_error = 0;
|
||||||
bool const is_null_weight = weights.Size() == 0;
|
bool const is_null_weight = weights.Size() == 0;
|
||||||
|
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel for reduction(+: residue_sum, weights_sum) schedule(static)
|
#pragma omp parallel for reduction(+: residue_sum, weights_sum) schedule(static)
|
||||||
for (omp_ulong idx = 0; idx < ndata; ++idx) {
|
for (omp_ulong idx = 0; idx < ndata; ++idx) {
|
||||||
|
exc.Run([&]() {
|
||||||
bst_float weight = is_null_weight ? 1.0f : h_weights[idx];
|
bst_float weight = is_null_weight ? 1.0f : h_weights[idx];
|
||||||
auto label = static_cast<int>(h_labels[idx]);
|
auto label = static_cast<int>(h_labels[idx]);
|
||||||
if (label >= 0 && label < static_cast<int>(n_class)) {
|
if (label >= 0 && label < static_cast<int>(n_class)) {
|
||||||
@ -64,7 +66,10 @@ class MultiClassMetricsReduction {
|
|||||||
} else {
|
} else {
|
||||||
label_error = label;
|
label_error = label;
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
|
|
||||||
CheckLabelError(label_error, n_class);
|
CheckLabelError(label_error, n_class);
|
||||||
PackedReduceResult res { residue_sum, weights_sum };
|
PackedReduceResult res { residue_sum, weights_sum };
|
||||||
|
|
||||||
|
|||||||
@ -29,6 +29,7 @@
|
|||||||
|
|
||||||
#include "xgboost/host_device_vector.h"
|
#include "xgboost/host_device_vector.h"
|
||||||
#include "../common/math.h"
|
#include "../common/math.h"
|
||||||
|
#include "../common/threading_utils.h"
|
||||||
#include "metric_common.h"
|
#include "metric_common.h"
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
@ -111,10 +112,9 @@ struct EvalAMS : public Metric {
|
|||||||
PredIndPairContainer rec(ndata);
|
PredIndPairContainer rec(ndata);
|
||||||
|
|
||||||
const auto &h_preds = preds.ConstHostVector();
|
const auto &h_preds = preds.ConstHostVector();
|
||||||
#pragma omp parallel for schedule(static)
|
common::ParallelFor(ndata, [&](bst_omp_uint i) {
|
||||||
for (bst_omp_uint i = 0; i < ndata; ++i) {
|
|
||||||
rec[i] = std::make_pair(h_preds[i], i);
|
rec[i] = std::make_pair(h_preds[i], i);
|
||||||
}
|
});
|
||||||
XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst);
|
XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst);
|
||||||
auto ntop = static_cast<unsigned>(ratio_ * ndata);
|
auto ntop = static_cast<unsigned>(ratio_ * ndata);
|
||||||
if (ntop == 0) ntop = ndata;
|
if (ntop == 0) ntop = ndata;
|
||||||
@ -175,18 +175,23 @@ struct EvalAuc : public Metric {
|
|||||||
const auto& labels = info.labels_.ConstHostVector();
|
const auto& labels = info.labels_.ConstHostVector();
|
||||||
const auto &h_preds = preds.ConstHostVector();
|
const auto &h_preds = preds.ConstHostVector();
|
||||||
|
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel reduction(+:sum_auc, auc_error) if (ngroups > 1)
|
#pragma omp parallel reduction(+:sum_auc, auc_error) if (ngroups > 1)
|
||||||
{
|
{
|
||||||
|
exc.Run([&]() {
|
||||||
// Each thread works on a distinct group and sorts the predictions in that group
|
// Each thread works on a distinct group and sorts the predictions in that group
|
||||||
PredIndPairContainer rec;
|
PredIndPairContainer rec;
|
||||||
#pragma omp for schedule(static)
|
#pragma omp for schedule(static)
|
||||||
for (bst_omp_uint group_id = 0; group_id < ngroups; ++group_id) {
|
for (bst_omp_uint group_id = 0; group_id < ngroups; ++group_id) {
|
||||||
|
exc.Run([&]() {
|
||||||
// Same thread can work on multiple groups one after another; hence, resize
|
// Same thread can work on multiple groups one after another; hence, resize
|
||||||
// the predictions array based on the current group
|
// the predictions array based on the current group
|
||||||
rec.resize(gptr[group_id + 1] - gptr[group_id]);
|
rec.resize(gptr[group_id + 1] - gptr[group_id]);
|
||||||
#pragma omp parallel for schedule(static) if (!omp_in_parallel())
|
#pragma omp parallel for schedule(static) if (!omp_in_parallel())
|
||||||
for (bst_omp_uint j = gptr[group_id]; j < gptr[group_id + 1]; ++j) {
|
for (bst_omp_uint j = gptr[group_id]; j < gptr[group_id + 1]; ++j) {
|
||||||
|
exc.Run([&]() {
|
||||||
rec[j - gptr[group_id]] = {h_preds[j], j};
|
rec[j - gptr[group_id]] = {h_preds[j], j};
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst);
|
XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst);
|
||||||
@ -216,8 +221,11 @@ struct EvalAuc : public Metric {
|
|||||||
// this is the AUC
|
// this is the AUC
|
||||||
sum_auc += sum_pospair / (sum_npos * sum_nneg);
|
sum_auc += sum_pospair / (sum_npos * sum_nneg);
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
|
|
||||||
// Report average AUC across all groups
|
// Report average AUC across all groups
|
||||||
// In distributed mode, workers which only contains pos or neg samples
|
// In distributed mode, workers which only contains pos or neg samples
|
||||||
@ -316,19 +324,25 @@ struct EvalRank : public Metric, public EvalRankConfig {
|
|||||||
const auto &labels = info.labels_.ConstHostVector();
|
const auto &labels = info.labels_.ConstHostVector();
|
||||||
const auto &h_preds = preds.ConstHostVector();
|
const auto &h_preds = preds.ConstHostVector();
|
||||||
|
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel reduction(+:sum_metric)
|
#pragma omp parallel reduction(+:sum_metric)
|
||||||
{
|
{
|
||||||
|
exc.Run([&]() {
|
||||||
// each thread takes a local rec
|
// each thread takes a local rec
|
||||||
PredIndPairContainer rec;
|
PredIndPairContainer rec;
|
||||||
#pragma omp for schedule(static)
|
#pragma omp for schedule(static)
|
||||||
for (bst_omp_uint k = 0; k < ngroups; ++k) {
|
for (bst_omp_uint k = 0; k < ngroups; ++k) {
|
||||||
|
exc.Run([&]() {
|
||||||
rec.clear();
|
rec.clear();
|
||||||
for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
|
for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
|
||||||
rec.emplace_back(h_preds[j], static_cast<int>(labels[j]));
|
rec.emplace_back(h_preds[j], static_cast<int>(labels[j]));
|
||||||
}
|
}
|
||||||
sum_metric += this->EvalGroup(&rec);
|
sum_metric += this->EvalGroup(&rec);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (distributed) {
|
if (distributed) {
|
||||||
@ -526,12 +540,15 @@ struct EvalAucPR : public Metric {
|
|||||||
const auto &h_labels = info.labels_.ConstHostVector();
|
const auto &h_labels = info.labels_.ConstHostVector();
|
||||||
const auto &h_preds = preds.ConstHostVector();
|
const auto &h_preds = preds.ConstHostVector();
|
||||||
|
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel reduction(+:sum_auc, auc_error) if (ngroups > 1)
|
#pragma omp parallel reduction(+:sum_auc, auc_error) if (ngroups > 1)
|
||||||
{
|
{
|
||||||
|
exc.Run([&]() {
|
||||||
// Each thread works on a distinct group and sorts the predictions in that group
|
// Each thread works on a distinct group and sorts the predictions in that group
|
||||||
PredIndPairContainer rec;
|
PredIndPairContainer rec;
|
||||||
#pragma omp for schedule(static)
|
#pragma omp for schedule(static)
|
||||||
for (bst_omp_uint group_id = 0; group_id < ngroups; ++group_id) {
|
for (bst_omp_uint group_id = 0; group_id < ngroups; ++group_id) {
|
||||||
|
exc.Run([&]() {
|
||||||
double total_pos = 0.0;
|
double total_pos = 0.0;
|
||||||
double total_neg = 0.0;
|
double total_neg = 0.0;
|
||||||
// Same thread can work on multiple groups one after another; hence, resize
|
// Same thread can work on multiple groups one after another; hence, resize
|
||||||
@ -540,16 +557,18 @@ struct EvalAucPR : public Metric {
|
|||||||
#pragma omp parallel for schedule(static) reduction(+:total_pos, total_neg) \
|
#pragma omp parallel for schedule(static) reduction(+:total_pos, total_neg) \
|
||||||
if (!omp_in_parallel()) // NOLINT
|
if (!omp_in_parallel()) // NOLINT
|
||||||
for (bst_omp_uint j = gptr[group_id]; j < gptr[group_id + 1]; ++j) {
|
for (bst_omp_uint j = gptr[group_id]; j < gptr[group_id + 1]; ++j) {
|
||||||
|
exc.Run([&]() {
|
||||||
const bst_float wt = WeightPolicy::GetWeightOfInstance(info, j, group_id);
|
const bst_float wt = WeightPolicy::GetWeightOfInstance(info, j, group_id);
|
||||||
total_pos += wt * h_labels[j];
|
total_pos += wt * h_labels[j];
|
||||||
total_neg += wt * (1.0f - h_labels[j]);
|
total_neg += wt * (1.0f - h_labels[j]);
|
||||||
rec[j - gptr[group_id]] = {h_preds[j], j};
|
rec[j - gptr[group_id]] = {h_preds[j], j};
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// we need pos > 0 && neg > 0
|
// we need pos > 0 && neg > 0
|
||||||
if (total_pos <= 0.0 || total_neg <= 0.0) {
|
if (total_pos <= 0.0 || total_neg <= 0.0) {
|
||||||
auc_error += 1;
|
auc_error += 1;
|
||||||
continue;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst);
|
XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst);
|
||||||
@ -560,7 +579,8 @@ struct EvalAucPR : public Metric {
|
|||||||
const bst_float wt = WeightPolicy::GetWeightOfSortedRecord(info, rec, j, group_id);
|
const bst_float wt = WeightPolicy::GetWeightOfSortedRecord(info, rec, j, group_id);
|
||||||
tp += wt * h_labels[rec[j].second];
|
tp += wt * h_labels[rec[j].second];
|
||||||
fp += wt * (1.0f - h_labels[rec[j].second]);
|
fp += wt * (1.0f - h_labels[rec[j].second]);
|
||||||
if ((j < rec.size() - 1 && rec[j].first != rec[j + 1].first) || j == rec.size() - 1) {
|
if ((j < rec.size() - 1 && rec[j].first != rec[j + 1].first) ||
|
||||||
|
j == rec.size() - 1) {
|
||||||
if (tp == prevtp) {
|
if (tp == prevtp) {
|
||||||
a = 1.0;
|
a = 1.0;
|
||||||
b = 0.0;
|
b = 0.0;
|
||||||
@ -584,8 +604,11 @@ struct EvalAucPR : public Metric {
|
|||||||
if (tp < 0 || prevtp < 0 || fp < 0 || prevfp < 0) {
|
if (tp < 0 || prevtp < 0 || fp < 0 || prevfp < 0) {
|
||||||
CHECK(!auc_error) << "AUC-PR: error in calculation";
|
CHECK(!auc_error) << "AUC-PR: error in calculation";
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
|
|
||||||
// Report average AUC-PR across all groups
|
// Report average AUC-PR across all groups
|
||||||
// In distributed mode, workers which only contains pos or neg samples
|
// In distributed mode, workers which only contains pos or neg samples
|
||||||
|
|||||||
@ -58,15 +58,19 @@ class ElementWiseSurvivalMetricsReduction {
|
|||||||
double residue_sum = 0;
|
double residue_sum = 0;
|
||||||
double weights_sum = 0;
|
double weights_sum = 0;
|
||||||
|
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel for reduction(+: residue_sum, weights_sum) schedule(static)
|
#pragma omp parallel for reduction(+: residue_sum, weights_sum) schedule(static)
|
||||||
for (omp_ulong i = 0; i < ndata; ++i) {
|
for (omp_ulong i = 0; i < ndata; ++i) {
|
||||||
|
exc.Run([&]() {
|
||||||
const double wt = h_weights.empty() ? 1.0 : static_cast<double>(h_weights[i]);
|
const double wt = h_weights.empty() ? 1.0 : static_cast<double>(h_weights[i]);
|
||||||
residue_sum += policy_.EvalRow(
|
residue_sum += policy_.EvalRow(
|
||||||
static_cast<double>(h_labels_lower_bound[i]),
|
static_cast<double>(h_labels_lower_bound[i]),
|
||||||
static_cast<double>(h_labels_upper_bound[i]),
|
static_cast<double>(h_labels_upper_bound[i]),
|
||||||
static_cast<double>(h_preds[i])) * wt;
|
static_cast<double>(h_preds[i])) * wt;
|
||||||
weights_sum += wt;
|
weights_sum += wt;
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
PackedReduceResult res{residue_sum, weights_sum};
|
PackedReduceResult res{residue_sum, weights_sum};
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -823,8 +823,10 @@ class LambdaRankObj : public ObjFunction {
|
|||||||
const auto ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
|
const auto ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
|
||||||
out_gpair->Resize(preds.Size());
|
out_gpair->Resize(preds.Size());
|
||||||
|
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel
|
#pragma omp parallel
|
||||||
{
|
{
|
||||||
|
exc.Run([&]() {
|
||||||
// parallel construct, declare random number generator here, so that each
|
// parallel construct, declare random number generator here, so that each
|
||||||
// thread use its own random number generator, seed by thread id and current iteration
|
// thread use its own random number generator, seed by thread id and current iteration
|
||||||
std::minstd_rand rnd((iter + 1) * 1111);
|
std::minstd_rand rnd((iter + 1) * 1111);
|
||||||
@ -834,6 +836,7 @@ class LambdaRankObj : public ObjFunction {
|
|||||||
|
|
||||||
#pragma omp for schedule(static)
|
#pragma omp for schedule(static)
|
||||||
for (bst_omp_uint k = 0; k < ngroup; ++k) {
|
for (bst_omp_uint k = 0; k < ngroup; ++k) {
|
||||||
|
exc.Run([&]() {
|
||||||
lst.clear(); pairs.clear();
|
lst.clear(); pairs.clear();
|
||||||
for (unsigned j = gptr[k]; j < gptr[k+1]; ++j) {
|
for (unsigned j = gptr[k]; j < gptr[k+1]; ++j) {
|
||||||
lst.emplace_back(preds_h[j], labels[j], j);
|
lst.emplace_back(preds_h[j], labels[j], j);
|
||||||
@ -845,7 +848,8 @@ class LambdaRankObj : public ObjFunction {
|
|||||||
rec[i] = std::make_pair(lst[i].label, i);
|
rec[i] = std::make_pair(lst[i].label, i);
|
||||||
}
|
}
|
||||||
std::stable_sort(rec.begin(), rec.end(), common::CmpFirst);
|
std::stable_sort(rec.begin(), rec.end(), common::CmpFirst);
|
||||||
// enumerate buckets with same label, for each item in the lst, grab another sample randomly
|
// enumerate buckets with same label
|
||||||
|
// for each item in the lst, grab another sample randomly
|
||||||
for (unsigned i = 0; i < rec.size(); ) {
|
for (unsigned i = 0; i < rec.size(); ) {
|
||||||
unsigned j = i + 1;
|
unsigned j = i + 1;
|
||||||
while (j < rec.size() && rec[j].first == rec[i].first) ++j;
|
while (j < rec.size() && rec[j].first == rec[i].first) ++j;
|
||||||
@ -855,7 +859,8 @@ class LambdaRankObj : public ObjFunction {
|
|||||||
int nsample = param_.num_pairsample;
|
int nsample = param_.num_pairsample;
|
||||||
while (nsample --) {
|
while (nsample --) {
|
||||||
for (unsigned pid = i; pid < j; ++pid) {
|
for (unsigned pid = i; pid < j; ++pid) {
|
||||||
unsigned ridx = std::uniform_int_distribution<unsigned>(0, nleft + nright - 1)(rnd);
|
unsigned ridx =
|
||||||
|
std::uniform_int_distribution<unsigned>(0, nleft + nright - 1)(rnd);
|
||||||
if (ridx < nleft) {
|
if (ridx < nleft) {
|
||||||
pairs.emplace_back(rec[ridx].second, rec[pid].second,
|
pairs.emplace_back(rec[ridx].second, rec[pid].second,
|
||||||
info.GetWeight(k) * weight_normalization_factor);
|
info.GetWeight(k) * weight_normalization_factor);
|
||||||
@ -887,8 +892,11 @@ class LambdaRankObj : public ObjFunction {
|
|||||||
gpair[pos.rindex] += GradientPair(g * w, 2.0f*w*h);
|
gpair[pos.rindex] += GradientPair(g * w, 2.0f*w*h);
|
||||||
gpair[neg.rindex] += GradientPair(-g * w, 2.0f*w*h);
|
gpair[neg.rindex] += GradientPair(-g * w, 2.0f*w*h);
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(__CUDACC__)
|
#if defined(__CUDACC__)
|
||||||
|
|||||||
@ -19,6 +19,7 @@
|
|||||||
|
|
||||||
#include "../common/transform.h"
|
#include "../common/transform.h"
|
||||||
#include "../common/common.h"
|
#include "../common/common.h"
|
||||||
|
#include "../common/threading_utils.h"
|
||||||
#include "./regression_loss.h"
|
#include "./regression_loss.h"
|
||||||
|
|
||||||
|
|
||||||
@ -345,10 +346,9 @@ class CoxRegression : public ObjFunction {
|
|||||||
void PredTransform(HostDeviceVector<bst_float> *io_preds) override {
|
void PredTransform(HostDeviceVector<bst_float> *io_preds) override {
|
||||||
std::vector<bst_float> &preds = io_preds->HostVector();
|
std::vector<bst_float> &preds = io_preds->HostVector();
|
||||||
const long ndata = static_cast<long>(preds.size()); // NOLINT(*)
|
const long ndata = static_cast<long>(preds.size()); // NOLINT(*)
|
||||||
#pragma omp parallel for schedule(static)
|
common::ParallelFor(ndata, [&](long j) { // NOLINT(*)
|
||||||
for (long j = 0; j < ndata; ++j) { // NOLINT(*)
|
|
||||||
preds[j] = std::exp(preds[j]);
|
preds[j] = std::exp(preds[j]);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
|
void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
|
||||||
PredTransform(io_preds);
|
PredTransform(io_preds);
|
||||||
|
|||||||
@ -18,6 +18,7 @@
|
|||||||
|
|
||||||
#include "../data/adapter.h"
|
#include "../data/adapter.h"
|
||||||
#include "../common/math.h"
|
#include "../common/math.h"
|
||||||
|
#include "../common/threading_utils.h"
|
||||||
#include "../gbm/gbtree_model.h"
|
#include "../gbm/gbtree_model.h"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
@ -157,8 +158,7 @@ void PredictBatchByBlockOfRowsKernel(DataView batch, std::vector<bst_float> *out
|
|||||||
const auto nsize = static_cast<bst_omp_uint>(batch.Size());
|
const auto nsize = static_cast<bst_omp_uint>(batch.Size());
|
||||||
const int num_feature = model.learner_model_param->num_feature;
|
const int num_feature = model.learner_model_param->num_feature;
|
||||||
const bst_omp_uint n_row_blocks = (nsize) / block_of_rows_size + !!((nsize) % block_of_rows_size);
|
const bst_omp_uint n_row_blocks = (nsize) / block_of_rows_size + !!((nsize) % block_of_rows_size);
|
||||||
#pragma omp parallel for schedule(static)
|
common::ParallelFor(n_row_blocks, [&](bst_omp_uint block_id) {
|
||||||
for (bst_omp_uint block_id = 0; block_id < n_row_blocks; ++block_id) {
|
|
||||||
const size_t batch_offset = block_id * block_of_rows_size;
|
const size_t batch_offset = block_id * block_of_rows_size;
|
||||||
const size_t block_size = std::min(nsize - batch_offset, block_of_rows_size);
|
const size_t block_size = std::min(nsize - batch_offset, block_of_rows_size);
|
||||||
const size_t fvec_offset = omp_get_thread_num() * block_of_rows_size;
|
const size_t fvec_offset = omp_get_thread_num() * block_of_rows_size;
|
||||||
@ -168,7 +168,7 @@ void PredictBatchByBlockOfRowsKernel(DataView batch, std::vector<bst_float> *out
|
|||||||
PredictByAllTrees(model, tree_begin, tree_end, out_preds, batch_offset + batch.base_rowid,
|
PredictByAllTrees(model, tree_begin, tree_end, out_preds, batch_offset + batch.base_rowid,
|
||||||
num_group, thread_temp, fvec_offset, block_size);
|
num_group, thread_temp, fvec_offset, block_size);
|
||||||
FVecDrop(block_size, batch_offset, &batch, fvec_offset, p_thread_temp);
|
FVecDrop(block_size, batch_offset, &batch, fvec_offset, p_thread_temp);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
class CPUPredictor : public Predictor {
|
class CPUPredictor : public Predictor {
|
||||||
@ -335,8 +335,7 @@ class CPUPredictor : public Predictor {
|
|||||||
// parallel over local batch
|
// parallel over local batch
|
||||||
auto page = batch.GetView();
|
auto page = batch.GetView();
|
||||||
const auto nsize = static_cast<bst_omp_uint>(batch.Size());
|
const auto nsize = static_cast<bst_omp_uint>(batch.Size());
|
||||||
#pragma omp parallel for schedule(static)
|
common::ParallelFor(nsize, [&](bst_omp_uint i) {
|
||||||
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
|
||||||
const int tid = omp_get_thread_num();
|
const int tid = omp_get_thread_num();
|
||||||
auto ridx = static_cast<size_t>(batch.base_rowid + i);
|
auto ridx = static_cast<size_t>(batch.base_rowid + i);
|
||||||
RegTree::FVec &feats = feat_vecs[tid];
|
RegTree::FVec &feats = feat_vecs[tid];
|
||||||
@ -349,7 +348,7 @@ class CPUPredictor : public Predictor {
|
|||||||
preds[ridx * ntree_limit + j] = static_cast<bst_float>(tid);
|
preds[ridx * ntree_limit + j] = static_cast<bst_float>(tid);
|
||||||
}
|
}
|
||||||
feats.Drop(page[i]);
|
feats.Drop(page[i]);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -378,18 +377,16 @@ class CPUPredictor : public Predictor {
|
|||||||
// allocated one
|
// allocated one
|
||||||
std::fill(contribs.begin(), contribs.end(), 0);
|
std::fill(contribs.begin(), contribs.end(), 0);
|
||||||
// initialize tree node mean values
|
// initialize tree node mean values
|
||||||
#pragma omp parallel for schedule(static)
|
common::ParallelFor(bst_omp_uint(ntree_limit), [&](bst_omp_uint i) {
|
||||||
for (bst_omp_uint i = 0; i < ntree_limit; ++i) {
|
|
||||||
model.trees[i]->FillNodeMeanValues();
|
model.trees[i]->FillNodeMeanValues();
|
||||||
}
|
});
|
||||||
const std::vector<bst_float>& base_margin = info.base_margin_.HostVector();
|
const std::vector<bst_float>& base_margin = info.base_margin_.HostVector();
|
||||||
// start collecting the contributions
|
// start collecting the contributions
|
||||||
for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
|
for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
|
||||||
auto page = batch.GetView();
|
auto page = batch.GetView();
|
||||||
// parallel over local batch
|
// parallel over local batch
|
||||||
const auto nsize = static_cast<bst_omp_uint>(batch.Size());
|
const auto nsize = static_cast<bst_omp_uint>(batch.Size());
|
||||||
#pragma omp parallel for schedule(static)
|
common::ParallelFor(nsize, [&](bst_omp_uint i) {
|
||||||
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
|
||||||
auto row_idx = static_cast<size_t>(batch.base_rowid + i);
|
auto row_idx = static_cast<size_t>(batch.base_rowid + i);
|
||||||
RegTree::FVec &feats = feat_vecs[omp_get_thread_num()];
|
RegTree::FVec &feats = feat_vecs[omp_get_thread_num()];
|
||||||
if (feats.Size() == 0) {
|
if (feats.Size() == 0) {
|
||||||
@ -425,7 +422,7 @@ class CPUPredictor : public Predictor {
|
|||||||
p_contribs[ncolumns - 1] += model.learner_model_param->base_score;
|
p_contribs[ncolumns - 1] += model.learner_model_param->base_score;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -25,6 +25,7 @@
|
|||||||
#include "../common/io.h"
|
#include "../common/io.h"
|
||||||
#include "../common/random.h"
|
#include "../common/random.h"
|
||||||
#include "../common/quantile.h"
|
#include "../common/quantile.h"
|
||||||
|
#include "../common/threading_utils.h"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace tree {
|
namespace tree {
|
||||||
@ -221,8 +222,7 @@ class BaseMaker: public TreeUpdater {
|
|||||||
// so that they are ignored in future statistics collection
|
// so that they are ignored in future statistics collection
|
||||||
const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);
|
const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);
|
||||||
|
|
||||||
#pragma omp parallel for schedule(static)
|
common::ParallelFor(ndata, [&](bst_omp_uint ridx) {
|
||||||
for (bst_omp_uint ridx = 0; ridx < ndata; ++ridx) {
|
|
||||||
const int nid = this->DecodePosition(ridx);
|
const int nid = this->DecodePosition(ridx);
|
||||||
if (tree[nid].IsLeaf()) {
|
if (tree[nid].IsLeaf()) {
|
||||||
// mark finish when it is not a fresh leaf
|
// mark finish when it is not a fresh leaf
|
||||||
@ -237,7 +237,7 @@ class BaseMaker: public TreeUpdater {
|
|||||||
this->SetEncodePosition(ridx, tree[nid].RightChild());
|
this->SetEncodePosition(ridx, tree[nid].RightChild());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
* \brief this is helper function uses column based data structure,
|
* \brief this is helper function uses column based data structure,
|
||||||
@ -257,8 +257,7 @@ class BaseMaker: public TreeUpdater {
|
|||||||
|
|
||||||
if (it != sorted_split_set.end() && *it == fid) {
|
if (it != sorted_split_set.end() && *it == fid) {
|
||||||
const auto ndata = static_cast<bst_omp_uint>(col.size());
|
const auto ndata = static_cast<bst_omp_uint>(col.size());
|
||||||
#pragma omp parallel for schedule(static)
|
common::ParallelFor(ndata, [&](bst_omp_uint j) {
|
||||||
for (bst_omp_uint j = 0; j < ndata; ++j) {
|
|
||||||
const bst_uint ridx = col[j].index;
|
const bst_uint ridx = col[j].index;
|
||||||
const bst_float fvalue = col[j].fvalue;
|
const bst_float fvalue = col[j].fvalue;
|
||||||
const int nid = this->DecodePosition(ridx);
|
const int nid = this->DecodePosition(ridx);
|
||||||
@ -273,7 +272,7 @@ class BaseMaker: public TreeUpdater {
|
|||||||
this->SetEncodePosition(ridx, tree[pid].RightChild());
|
this->SetEncodePosition(ridx, tree[pid].RightChild());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -314,8 +313,7 @@ class BaseMaker: public TreeUpdater {
|
|||||||
for (auto fid : fsplits) {
|
for (auto fid : fsplits) {
|
||||||
auto col = page[fid];
|
auto col = page[fid];
|
||||||
const auto ndata = static_cast<bst_omp_uint>(col.size());
|
const auto ndata = static_cast<bst_omp_uint>(col.size());
|
||||||
#pragma omp parallel for schedule(static)
|
common::ParallelFor(ndata, [&](bst_omp_uint j) {
|
||||||
for (bst_omp_uint j = 0; j < ndata; ++j) {
|
|
||||||
const bst_uint ridx = col[j].index;
|
const bst_uint ridx = col[j].index;
|
||||||
const bst_float fvalue = col[j].fvalue;
|
const bst_float fvalue = col[j].fvalue;
|
||||||
const int nid = this->DecodePosition(ridx);
|
const int nid = this->DecodePosition(ridx);
|
||||||
@ -327,7 +325,7 @@ class BaseMaker: public TreeUpdater {
|
|||||||
this->SetEncodePosition(ridx, tree[nid].RightChild());
|
this->SetEncodePosition(ridx, tree[nid].RightChild());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -341,24 +339,27 @@ class BaseMaker: public TreeUpdater {
|
|||||||
std::vector< std::vector<TStats> > &thread_temp = *p_thread_temp;
|
std::vector< std::vector<TStats> > &thread_temp = *p_thread_temp;
|
||||||
thread_temp.resize(omp_get_max_threads());
|
thread_temp.resize(omp_get_max_threads());
|
||||||
p_node_stats->resize(tree.param.num_nodes);
|
p_node_stats->resize(tree.param.num_nodes);
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel
|
#pragma omp parallel
|
||||||
{
|
{
|
||||||
|
exc.Run([&]() {
|
||||||
const int tid = omp_get_thread_num();
|
const int tid = omp_get_thread_num();
|
||||||
thread_temp[tid].resize(tree.param.num_nodes, TStats());
|
thread_temp[tid].resize(tree.param.num_nodes, TStats());
|
||||||
for (unsigned int nid : qexpand_) {
|
for (unsigned int nid : qexpand_) {
|
||||||
thread_temp[tid][nid] = TStats();
|
thread_temp[tid][nid] = TStats();
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
// setup position
|
// setup position
|
||||||
const auto ndata = static_cast<bst_omp_uint>(fmat.Info().num_row_);
|
const auto ndata = static_cast<bst_omp_uint>(fmat.Info().num_row_);
|
||||||
#pragma omp parallel for schedule(static)
|
common::ParallelFor(ndata, [&](bst_omp_uint ridx) {
|
||||||
for (bst_omp_uint ridx = 0; ridx < ndata; ++ridx) {
|
|
||||||
const int nid = position_[ridx];
|
const int nid = position_[ridx];
|
||||||
const int tid = omp_get_thread_num();
|
const int tid = omp_get_thread_num();
|
||||||
if (nid >= 0) {
|
if (nid >= 0) {
|
||||||
thread_temp[tid][nid].Add(gpair[ridx]);
|
thread_temp[tid][nid].Add(gpair[ridx]);
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
// sum the per thread statistics together
|
// sum the per thread statistics together
|
||||||
for (int nid : qexpand_) {
|
for (int nid : qexpand_) {
|
||||||
TStats &s = (*p_node_stats)[nid];
|
TStats &s = (*p_node_stats)[nid];
|
||||||
|
|||||||
@ -264,12 +264,16 @@ class ColMaker: public TreeUpdater {
|
|||||||
const MetaInfo& info = fmat.Info();
|
const MetaInfo& info = fmat.Info();
|
||||||
// setup position
|
// setup position
|
||||||
const auto ndata = static_cast<bst_omp_uint>(info.num_row_);
|
const auto ndata = static_cast<bst_omp_uint>(info.num_row_);
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel for schedule(static)
|
#pragma omp parallel for schedule(static)
|
||||||
for (bst_omp_uint ridx = 0; ridx < ndata; ++ridx) {
|
for (bst_omp_uint ridx = 0; ridx < ndata; ++ridx) {
|
||||||
|
exc.Run([&]() {
|
||||||
const int tid = omp_get_thread_num();
|
const int tid = omp_get_thread_num();
|
||||||
if (position_[ridx] < 0) continue;
|
if (position_[ridx] < 0) return;
|
||||||
stemp_[tid][position_[ridx]].stats.Add(gpair[ridx]);
|
stemp_[tid][position_[ridx]].stats.Add(gpair[ridx]);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
// sum the per thread statistics together
|
// sum the per thread statistics together
|
||||||
for (int nid : qexpand) {
|
for (int nid : qexpand) {
|
||||||
GradStats stats;
|
GradStats stats;
|
||||||
@ -447,11 +451,11 @@ class ColMaker: public TreeUpdater {
|
|||||||
std::max(static_cast<int>(num_features / this->nthread_ / 32), 1);
|
std::max(static_cast<int>(num_features / this->nthread_ / 32), 1);
|
||||||
#endif // defined(_OPENMP)
|
#endif // defined(_OPENMP)
|
||||||
{
|
{
|
||||||
dmlc::OMPException omp_handler;
|
|
||||||
auto page = batch.GetView();
|
auto page = batch.GetView();
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel for schedule(dynamic, batch_size)
|
#pragma omp parallel for schedule(dynamic, batch_size)
|
||||||
for (bst_omp_uint i = 0; i < num_features; ++i) {
|
for (bst_omp_uint i = 0; i < num_features; ++i) {
|
||||||
omp_handler.Run([&]() {
|
exc.Run([&]() {
|
||||||
auto evaluator = tree_evaluator_.GetEvaluator();
|
auto evaluator = tree_evaluator_.GetEvaluator();
|
||||||
bst_feature_t const fid = feat_set[i];
|
bst_feature_t const fid = feat_set[i];
|
||||||
int32_t const tid = omp_get_thread_num();
|
int32_t const tid = omp_get_thread_num();
|
||||||
@ -470,7 +474,7 @@ class ColMaker: public TreeUpdater {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
omp_handler.Rethrow();
|
exc.Rethrow();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// find splits at current level, do split per level
|
// find splits at current level, do split per level
|
||||||
@ -521,8 +525,7 @@ class ColMaker: public TreeUpdater {
|
|||||||
// so that they are ignored in future statistics collection
|
// so that they are ignored in future statistics collection
|
||||||
const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);
|
const auto ndata = static_cast<bst_omp_uint>(p_fmat->Info().num_row_);
|
||||||
|
|
||||||
#pragma omp parallel for schedule(static)
|
common::ParallelFor(ndata, [&](bst_omp_uint ridx) {
|
||||||
for (bst_omp_uint ridx = 0; ridx < ndata; ++ridx) {
|
|
||||||
CHECK_LT(ridx, position_.size())
|
CHECK_LT(ridx, position_.size())
|
||||||
<< "ridx exceed bound " << "ridx="<< ridx << " pos=" << position_.size();
|
<< "ridx exceed bound " << "ridx="<< ridx << " pos=" << position_.size();
|
||||||
const int nid = this->DecodePosition(ridx);
|
const int nid = this->DecodePosition(ridx);
|
||||||
@ -539,7 +542,7 @@ class ColMaker: public TreeUpdater {
|
|||||||
this->SetEncodePosition(ridx, tree[nid].RightChild());
|
this->SetEncodePosition(ridx, tree[nid].RightChild());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
// customization part
|
// customization part
|
||||||
// synchronize the best solution of each node
|
// synchronize the best solution of each node
|
||||||
@ -568,8 +571,7 @@ class ColMaker: public TreeUpdater {
|
|||||||
for (auto fid : fsplits) {
|
for (auto fid : fsplits) {
|
||||||
auto col = page[fid];
|
auto col = page[fid];
|
||||||
const auto ndata = static_cast<bst_omp_uint>(col.size());
|
const auto ndata = static_cast<bst_omp_uint>(col.size());
|
||||||
#pragma omp parallel for schedule(static)
|
common::ParallelFor(ndata, [&](bst_omp_uint j) {
|
||||||
for (bst_omp_uint j = 0; j < ndata; ++j) {
|
|
||||||
const bst_uint ridx = col[j].index;
|
const bst_uint ridx = col[j].index;
|
||||||
const int nid = this->DecodePosition(ridx);
|
const int nid = this->DecodePosition(ridx);
|
||||||
const bst_float fvalue = col[j].fvalue;
|
const bst_float fvalue = col[j].fvalue;
|
||||||
@ -581,7 +583,7 @@ class ColMaker: public TreeUpdater {
|
|||||||
this->SetEncodePosition(ridx, tree[nid].RightChild());
|
this->SetEncodePosition(ridx, tree[nid].RightChild());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -202,8 +202,10 @@ class HistMaker: public BaseMaker {
|
|||||||
std::vector<SplitEntry> sol(qexpand_.size());
|
std::vector<SplitEntry> sol(qexpand_.size());
|
||||||
std::vector<GradStats> left_sum(qexpand_.size());
|
std::vector<GradStats> left_sum(qexpand_.size());
|
||||||
auto nexpand = static_cast<bst_omp_uint>(qexpand_.size());
|
auto nexpand = static_cast<bst_omp_uint>(qexpand_.size());
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel for schedule(dynamic, 1)
|
#pragma omp parallel for schedule(dynamic, 1)
|
||||||
for (bst_omp_uint wid = 0; wid < nexpand; ++wid) {
|
for (bst_omp_uint wid = 0; wid < nexpand; ++wid) {
|
||||||
|
exc.Run([&]() {
|
||||||
const int nid = qexpand_[wid];
|
const int nid = qexpand_[wid];
|
||||||
CHECK_EQ(node2workindex_[nid], static_cast<int>(wid));
|
CHECK_EQ(node2workindex_[nid], static_cast<int>(wid));
|
||||||
SplitEntry &best = sol[wid];
|
SplitEntry &best = sol[wid];
|
||||||
@ -217,7 +219,9 @@ class HistMaker: public BaseMaker {
|
|||||||
EnumerateSplit(this->wspace_.hset[0][i + wid * (num_feature+1)],
|
EnumerateSplit(this->wspace_.hset[0][i + wid * (num_feature+1)],
|
||||||
node_sum, feature_set[i], &best, &left_sum[wid]);
|
node_sum, feature_set[i], &best, &left_sum[wid]);
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
// get the best result, we can synchronize the solution
|
// get the best result, we can synchronize the solution
|
||||||
for (bst_omp_uint wid = 0; wid < nexpand; ++wid) {
|
for (bst_omp_uint wid = 0; wid < nexpand; ++wid) {
|
||||||
const bst_node_t nid = qexpand_[wid];
|
const bst_node_t nid = qexpand_[wid];
|
||||||
@ -341,8 +345,10 @@ class CQHistMaker: public HistMaker {
|
|||||||
auto page = batch.GetView();
|
auto page = batch.GetView();
|
||||||
// start enumeration
|
// start enumeration
|
||||||
const auto nsize = static_cast<bst_omp_uint>(fset.size());
|
const auto nsize = static_cast<bst_omp_uint>(fset.size());
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel for schedule(dynamic, 1)
|
#pragma omp parallel for schedule(dynamic, 1)
|
||||||
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
||||||
|
exc.Run([&]() {
|
||||||
int fid = fset[i];
|
int fid = fset[i];
|
||||||
int offset = feat2workindex_[fid];
|
int offset = feat2workindex_[fid];
|
||||||
if (offset >= 0) {
|
if (offset >= 0) {
|
||||||
@ -350,7 +356,9 @@ class CQHistMaker: public HistMaker {
|
|||||||
fset, offset,
|
fset, offset,
|
||||||
&thread_hist_[omp_get_thread_num()]);
|
&thread_hist_[omp_get_thread_num()]);
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
}
|
}
|
||||||
// update node statistics.
|
// update node statistics.
|
||||||
this->GetNodeStats(gpair, *p_fmat, tree,
|
this->GetNodeStats(gpair, *p_fmat, tree,
|
||||||
@ -417,8 +425,10 @@ class CQHistMaker: public HistMaker {
|
|||||||
auto page = batch.GetView();
|
auto page = batch.GetView();
|
||||||
// start enumeration
|
// start enumeration
|
||||||
const auto nsize = static_cast<bst_omp_uint>(work_set_.size());
|
const auto nsize = static_cast<bst_omp_uint>(work_set_.size());
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel for schedule(dynamic, 1)
|
#pragma omp parallel for schedule(dynamic, 1)
|
||||||
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
||||||
|
exc.Run([&]() {
|
||||||
int fid = work_set_[i];
|
int fid = work_set_[i];
|
||||||
int offset = feat2workindex_[fid];
|
int offset = feat2workindex_[fid];
|
||||||
if (offset >= 0) {
|
if (offset >= 0) {
|
||||||
@ -426,7 +436,9 @@ class CQHistMaker: public HistMaker {
|
|||||||
work_set_size, offset,
|
work_set_size, offset,
|
||||||
&thread_sketch_[omp_get_thread_num()]);
|
&thread_sketch_[omp_get_thread_num()]);
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
}
|
}
|
||||||
for (size_t i = 0; i < sketchs_.size(); ++i) {
|
for (size_t i = 0; i < sketchs_.size(); ++i) {
|
||||||
common::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
|
common::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
|
||||||
@ -701,8 +713,10 @@ class GlobalProposalHistMaker: public CQHistMaker {
|
|||||||
|
|
||||||
// start enumeration
|
// start enumeration
|
||||||
const auto nsize = static_cast<bst_omp_uint>(this->work_set_.size());
|
const auto nsize = static_cast<bst_omp_uint>(this->work_set_.size());
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel for schedule(dynamic, 1)
|
#pragma omp parallel for schedule(dynamic, 1)
|
||||||
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
||||||
|
exc.Run([&]() {
|
||||||
int fid = this->work_set_[i];
|
int fid = this->work_set_[i];
|
||||||
int offset = this->feat2workindex_[fid];
|
int offset = this->feat2workindex_[fid];
|
||||||
if (offset >= 0) {
|
if (offset >= 0) {
|
||||||
@ -710,7 +724,9 @@ class GlobalProposalHistMaker: public CQHistMaker {
|
|||||||
fset, offset,
|
fset, offset,
|
||||||
&this->thread_hist_[omp_get_thread_num()]);
|
&this->thread_hist_[omp_get_thread_num()]);
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
}
|
}
|
||||||
|
|
||||||
// update node statistics.
|
// update node statistics.
|
||||||
|
|||||||
@ -713,8 +713,10 @@ void QuantileHistMaker::Builder<GradientSumT>::InitSampling(const std::vector<Gr
|
|||||||
const size_t discard_size = info.num_row_ / nthread;
|
const size_t discard_size = info.num_row_ / nthread;
|
||||||
auto upper_border = static_cast<float>(std::numeric_limits<uint32_t>::max());
|
auto upper_border = static_cast<float>(std::numeric_limits<uint32_t>::max());
|
||||||
uint32_t coin_flip_border = static_cast<uint32_t>(upper_border * param_.subsample);
|
uint32_t coin_flip_border = static_cast<uint32_t>(upper_border * param_.subsample);
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel num_threads(nthread)
|
#pragma omp parallel num_threads(nthread)
|
||||||
{
|
{
|
||||||
|
exc.Run([&]() {
|
||||||
const size_t tid = omp_get_thread_num();
|
const size_t tid = omp_get_thread_num();
|
||||||
const size_t ibegin = tid * discard_size;
|
const size_t ibegin = tid * discard_size;
|
||||||
const size_t iend = (tid == (nthread - 1)) ?
|
const size_t iend = (tid == (nthread - 1)) ?
|
||||||
@ -726,7 +728,9 @@ void QuantileHistMaker::Builder<GradientSumT>::InitSampling(const std::vector<Gr
|
|||||||
p_row_indices[ibegin + row_offsets[tid]++] = i;
|
p_row_indices[ibegin + row_offsets[tid]++] = i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
/* discard global engine */
|
/* discard global engine */
|
||||||
rnd = rnds[nthread - 1];
|
rnd = rnds[nthread - 1];
|
||||||
size_t prefix_sum = row_offsets[0];
|
size_t prefix_sum = row_offsets[0];
|
||||||
@ -769,10 +773,14 @@ void QuantileHistMaker::Builder<GradientSumT>::InitData(const GHistIndexMatrix&
|
|||||||
hist_buffer_.Init(nbins);
|
hist_buffer_.Init(nbins);
|
||||||
|
|
||||||
// initialize histogram builder
|
// initialize histogram builder
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel
|
#pragma omp parallel
|
||||||
{
|
{
|
||||||
|
exc.Run([&]() {
|
||||||
this->nthread_ = omp_get_num_threads();
|
this->nthread_ = omp_get_num_threads();
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
hist_builder_ = GHistBuilder<GradientSumT>(this->nthread_, nbins);
|
hist_builder_ = GHistBuilder<GradientSumT>(this->nthread_, nbins);
|
||||||
|
|
||||||
std::vector<size_t>& row_indices = *row_set_collection_.Data();
|
std::vector<size_t>& row_indices = *row_set_collection_.Data();
|
||||||
@ -794,6 +802,7 @@ void QuantileHistMaker::Builder<GradientSumT>::InitData(const GHistIndexMatrix&
|
|||||||
|
|
||||||
#pragma omp parallel num_threads(this->nthread_)
|
#pragma omp parallel num_threads(this->nthread_)
|
||||||
{
|
{
|
||||||
|
exc.Run([&]() {
|
||||||
const size_t tid = omp_get_thread_num();
|
const size_t tid = omp_get_thread_num();
|
||||||
const size_t ibegin = tid * block_size;
|
const size_t ibegin = tid * block_size;
|
||||||
const size_t iend = std::min(static_cast<size_t>(ibegin + block_size),
|
const size_t iend = std::min(static_cast<size_t>(ibegin + block_size),
|
||||||
@ -805,7 +814,9 @@ void QuantileHistMaker::Builder<GradientSumT>::InitData(const GHistIndexMatrix&
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
|
|
||||||
bool has_neg_hess = false;
|
bool has_neg_hess = false;
|
||||||
for (int32_t tid = 0; tid < this->nthread_; ++tid) {
|
for (int32_t tid = 0; tid < this->nthread_; ++tid) {
|
||||||
@ -825,6 +836,7 @@ void QuantileHistMaker::Builder<GradientSumT>::InitData(const GHistIndexMatrix&
|
|||||||
} else {
|
} else {
|
||||||
#pragma omp parallel num_threads(this->nthread_)
|
#pragma omp parallel num_threads(this->nthread_)
|
||||||
{
|
{
|
||||||
|
exc.Run([&]() {
|
||||||
const size_t tid = omp_get_thread_num();
|
const size_t tid = omp_get_thread_num();
|
||||||
const size_t ibegin = tid * block_size;
|
const size_t ibegin = tid * block_size;
|
||||||
const size_t iend = std::min(static_cast<size_t>(ibegin + block_size),
|
const size_t iend = std::min(static_cast<size_t>(ibegin + block_size),
|
||||||
@ -832,7 +844,9 @@ void QuantileHistMaker::Builder<GradientSumT>::InitData(const GHistIndexMatrix&
|
|||||||
for (size_t i = ibegin; i < iend; ++i) {
|
for (size_t i = ibegin; i < iend; ++i) {
|
||||||
p_row_indices[i] = i;
|
p_row_indices[i] = i;
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -13,6 +13,7 @@
|
|||||||
#include "xgboost/json.h"
|
#include "xgboost/json.h"
|
||||||
#include "./param.h"
|
#include "./param.h"
|
||||||
#include "../common/io.h"
|
#include "../common/io.h"
|
||||||
|
#include "../common/threading_utils.h"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace tree {
|
namespace tree {
|
||||||
@ -52,8 +53,10 @@ class TreeRefresher: public TreeUpdater {
|
|||||||
const int nthread = omp_get_max_threads();
|
const int nthread = omp_get_max_threads();
|
||||||
fvec_temp.resize(nthread, RegTree::FVec());
|
fvec_temp.resize(nthread, RegTree::FVec());
|
||||||
stemp.resize(nthread, std::vector<GradStats>());
|
stemp.resize(nthread, std::vector<GradStats>());
|
||||||
|
dmlc::OMPException exc;
|
||||||
#pragma omp parallel
|
#pragma omp parallel
|
||||||
{
|
{
|
||||||
|
exc.Run([&]() {
|
||||||
int tid = omp_get_thread_num();
|
int tid = omp_get_thread_num();
|
||||||
int num_nodes = 0;
|
int num_nodes = 0;
|
||||||
for (auto tree : trees) {
|
for (auto tree : trees) {
|
||||||
@ -62,7 +65,9 @@ class TreeRefresher: public TreeUpdater {
|
|||||||
stemp[tid].resize(num_nodes, GradStats());
|
stemp[tid].resize(num_nodes, GradStats());
|
||||||
std::fill(stemp[tid].begin(), stemp[tid].end(), GradStats());
|
std::fill(stemp[tid].begin(), stemp[tid].end(), GradStats());
|
||||||
fvec_temp[tid].Init(trees[0]->param.num_feature);
|
fvec_temp[tid].Init(trees[0]->param.num_feature);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
exc.Rethrow();
|
||||||
// if it is C++11, use lazy evaluation for Allreduce,
|
// if it is C++11, use lazy evaluation for Allreduce,
|
||||||
// to gain speedup in recovery
|
// to gain speedup in recovery
|
||||||
auto lazy_get_stats = [&]() {
|
auto lazy_get_stats = [&]() {
|
||||||
@ -72,8 +77,7 @@ class TreeRefresher: public TreeUpdater {
|
|||||||
auto page = batch.GetView();
|
auto page = batch.GetView();
|
||||||
CHECK_LT(batch.Size(), std::numeric_limits<unsigned>::max());
|
CHECK_LT(batch.Size(), std::numeric_limits<unsigned>::max());
|
||||||
const auto nbatch = static_cast<bst_omp_uint>(batch.Size());
|
const auto nbatch = static_cast<bst_omp_uint>(batch.Size());
|
||||||
#pragma omp parallel for schedule(static)
|
common::ParallelFor(nbatch, [&](bst_omp_uint i) {
|
||||||
for (bst_omp_uint i = 0; i < nbatch; ++i) {
|
|
||||||
SparsePage::Inst inst = page[i];
|
SparsePage::Inst inst = page[i];
|
||||||
const int tid = omp_get_thread_num();
|
const int tid = omp_get_thread_num();
|
||||||
const auto ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
const auto ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||||
@ -86,16 +90,15 @@ class TreeRefresher: public TreeUpdater {
|
|||||||
offset += tree->param.num_nodes;
|
offset += tree->param.num_nodes;
|
||||||
}
|
}
|
||||||
feats.Drop(inst);
|
feats.Drop(inst);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
// aggregate the statistics
|
// aggregate the statistics
|
||||||
auto num_nodes = static_cast<int>(stemp[0].size());
|
auto num_nodes = static_cast<int>(stemp[0].size());
|
||||||
#pragma omp parallel for schedule(static)
|
common::ParallelFor(num_nodes, [&](int nid) {
|
||||||
for (int nid = 0; nid < num_nodes; ++nid) {
|
|
||||||
for (int tid = 1; tid < nthread; ++tid) {
|
for (int tid = 1; tid < nthread; ++tid) {
|
||||||
stemp[0][nid].Add(stemp[tid][nid]);
|
stemp[0][nid].Add(stemp[tid][nid]);
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
};
|
};
|
||||||
reducer_.Allreduce(dmlc::BeginPtr(stemp[0]), stemp[0].size(), lazy_get_stats);
|
reducer_.Allreduce(dmlc::BeginPtr(stemp[0]), stemp[0].size(), lazy_get_stats);
|
||||||
// rescale learning rate according to size of trees
|
// rescale learning rate according to size of trees
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user