[sycl] optimise hist building (#10311)
Co-authored-by: Dmitry Razdoburdin <>
commit 0058301e6f (parent 9def441e9a)
@@ -64,6 +64,30 @@ template ::sycl::event SubtractionHist(::sycl::queue qu,
                                        const GHistRow<double, MemoryType::on_device>& src2,
                                        size_t size, ::sycl::event event_priv);
 
+inline auto GetBlocksParameters(const ::sycl::queue& qu, size_t size, size_t max_nblocks) {
+  struct _ {
+    size_t block_size, nblocks;
+  };
+
+  const size_t min_block_size = 32;
+  const size_t max_compute_units =
+      qu.get_device().get_info<::sycl::info::device::max_compute_units>();
+
+  size_t nblocks = max_compute_units;
+
+  size_t block_size = size / nblocks + !!(size % nblocks);
+  if (block_size > (1u << 12)) {
+    nblocks = max_nblocks;
+    block_size = size / nblocks + !!(size % nblocks);
+  }
+  if (block_size < min_block_size) {
+    block_size = min_block_size;
+    nblocks = size / block_size + !!(size % block_size);
+  }
+
+  return _{block_size, nblocks};
+}
+
 // Kernel with buffer using
 template<typename FPType, typename BinIdxType, bool isDense>
 ::sycl::event BuildHistKernel(::sycl::queue qu,
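Note: the new helper sizes blocks from the device's compute-unit count rather than a fixed minimum, falling back to the buffer-limited maximum when blocks grow past 4096 rows; size / nblocks + !!(size % nblocks) is ceiling division. A standalone host-side sketch of the same sizing logic (the compute-unit count below is a made-up example, not queried from a real device):

#include <cstddef>
#include <cstdio>

// Mirrors the patch's GetBlocksParameters logic, for illustration only.
struct BlockParams { size_t block_size, nblocks; };

BlockParams PickBlocks(size_t size, size_t max_nblocks, size_t compute_units) {
  const size_t min_block_size = 32;
  size_t nblocks = compute_units;                           // one block per compute unit
  size_t block_size = size / nblocks + !!(size % nblocks);  // ceil(size / nblocks)
  if (block_size > (1u << 12)) {      // blocks too coarse: spread over the whole buffer
    nblocks = max_nblocks;
    block_size = size / nblocks + !!(size % nblocks);
  }
  if (block_size < min_block_size) {  // blocks too fine: cap the block count instead
    block_size = min_block_size;
    nblocks = size / block_size + !!(size % block_size);
  }
  return {block_size, nblocks};
}

int main() {
  // e.g. 1,000,000 rows, buffer room for 2048 blocks, 112 compute units (hypothetical)
  const BlockParams p = PickBlocks(1000000, 2048, 112);
  std::printf("block_size=%zu nblocks=%zu\n", p.block_size, p.nblocks);
  return 0;
}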
@@ -73,27 +97,26 @@ template<typename FPType, typename BinIdxType, bool isDense>
                               GHistRow<FPType, MemoryType::on_device>* hist,
                               GHistRow<FPType, MemoryType::on_device>* hist_buffer,
                               ::sycl::event event_priv) {
+  using GradientPairT = xgboost::detail::GradientPairInternal<FPType>;
   const size_t size = row_indices.Size();
   const size_t* rid = row_indices.begin;
   const size_t n_columns = isDense ? gmat.nfeatures : gmat.row_stride;
-  const GradientPair::ValueT* pgh =
-      reinterpret_cast<const GradientPair::ValueT*>(gpair_device.DataConst());
+  const auto* pgh = gpair_device.DataConst();
   const BinIdxType* gradient_index = gmat.index.data<BinIdxType>();
   const uint32_t* offsets = gmat.index.Offset();
-  FPType* hist_data = reinterpret_cast<FPType*>(hist->Data());
   const size_t nbins = gmat.nbins;
 
   const size_t max_work_group_size =
       qu.get_device().get_info<::sycl::info::device::max_work_group_size>();
   const size_t work_group_size = n_columns < max_work_group_size ? n_columns : max_work_group_size;
 
-  const size_t max_nblocks = hist_buffer->Size() / (nbins * 2);
-  const size_t min_block_size = 128;
-  size_t nblocks = std::min(max_nblocks, size / min_block_size + !!(size % min_block_size));
-  const size_t block_size = size / nblocks + !!(size % nblocks);
-  FPType* hist_buffer_data = reinterpret_cast<FPType*>(hist_buffer->Data());
+  // Captured structured bindings are a C++20 extension
+  const auto block_params = GetBlocksParameters(qu, size, hist_buffer->Size() / (nbins * 2));
+  const size_t block_size = block_params.block_size;
+  const size_t nblocks = block_params.nblocks;
 
-  auto event_fill = qu.fill(hist_buffer_data, FPType(0), nblocks * nbins * 2, event_priv);
+  GradientPairT* hist_buffer_data = hist_buffer->Data();
+  auto event_fill = qu.fill(hist_buffer_data, GradientPairT(0, 0), nblocks * nbins * 2, event_priv);
   auto event_main = qu.submit([&](::sycl::handler& cgh) {
     cgh.depends_on(event_fill);
     cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(nblocks, work_group_size),
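Note: typing the scratch buffer as gradient pairs instead of interleaved scalars removes the 2 * idx_bin / 2 * idx_gh arithmetic on every access. A minimal sketch of the subset of xgboost::detail::GradientPairInternal the kernels rely on (illustrative; the real class is defined in xgboost's base headers):

template <typename T>
class GradientPairInternal {
  T grad_{0};
  T hess_{0};

 public:
  GradientPairInternal() = default;
  GradientPairInternal(T grad, T hess) : grad_(grad), hess_(hess) {}
  T GetGrad() const { return grad_; }
  T GetHess() const { return hess_; }
  // Element-wise accumulation; this is what hist_local[idx_bin] += pgh_row uses.
  GradientPairInternal& operator+=(const GradientPairInternal& rhs) {
    grad_ += rhs.grad_;
    hess_ += rhs.hess_;
    return *this;
  }
};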
@@ -102,13 +125,14 @@ template<typename FPType, typename BinIdxType, bool isDense>
       size_t block = pid.get_global_id(0);
       size_t feat = pid.get_global_id(1);
 
-      FPType* hist_local = hist_buffer_data + block * nbins * 2;
+      GradientPairT* hist_local = hist_buffer_data + block * nbins;
       for (size_t idx = 0; idx < block_size; ++idx) {
         size_t i = block * block_size + idx;
         if (i < size) {
           const size_t icol_start = n_columns * rid[i];
           const size_t idx_gh = rid[i];
 
+          const GradientPairT pgh_row = {pgh[idx_gh].GetGrad(), pgh[idx_gh].GetHess()};
           pid.barrier(::sycl::access::fence_space::local_space);
           const BinIdxType* gr_index_local = gradient_index + icol_start;
 
@@ -118,8 +142,7 @@ template<typename FPType, typename BinIdxType, bool isDense>
             idx_bin += offsets[j];
           }
           if (idx_bin < nbins) {
-            hist_local[2 * idx_bin] += pgh[2 * idx_gh];
-            hist_local[2 * idx_bin+1] += pgh[2 * idx_gh+1];
+            hist_local[idx_bin] += pgh_row;
           }
         }
       }
@@ -127,21 +150,19 @@ template<typename FPType, typename BinIdxType, bool isDense>
     });
   });
 
+  GradientPairT* hist_data = hist->Data();
   auto event_save = qu.submit([&](::sycl::handler& cgh) {
     cgh.depends_on(event_main);
     cgh.parallel_for<>(::sycl::range<1>(nbins), [=](::sycl::item<1> pid) {
       size_t idx_bin = pid.get_id(0);
 
-      FPType gsum = 0.0f;
-      FPType hsum = 0.0f;
+      GradientPairT gpair = {0, 0};
 
       for (size_t j = 0; j < nblocks; ++j) {
-        gsum += hist_buffer_data[j * nbins * 2 + 2 * idx_bin];
-        hsum += hist_buffer_data[j * nbins * 2 + 2 * idx_bin + 1];
+        gpair += hist_buffer_data[j * nbins + idx_bin];
       }
 
-      hist_data[2 * idx_bin] = gsum;
-      hist_data[2 * idx_bin + 1] = hsum;
+      hist_data[idx_bin] = gpair;
     });
   });
   return event_save;
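Note: with the pair-typed buffer, partial histogram j occupies the nbins slots starting at j * nbins, so the save kernel sums one strided column per bin. The same reduction written as plain host code, assuming the GradientPairT sketch above:

#include <cstddef>

// Reduce nblocks partial histograms into the final histogram.
// Buffer layout: [block 0: nbins pairs][block 1: nbins pairs]...
template <typename GradientPairT>
void ReduceHist(const GradientPairT* buffer, GradientPairT* hist,
                std::size_t nbins, std::size_t nblocks) {
  for (std::size_t idx_bin = 0; idx_bin < nbins; ++idx_bin) {
    GradientPairT sum{0, 0};
    for (std::size_t j = 0; j < nblocks; ++j) {
      sum += buffer[j * nbins + idx_bin];  // one pair per block for this bin
    }
    hist[idx_bin] = sum;
  }
}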
@@ -165,24 +186,27 @@ template<typename FPType, typename BinIdxType, bool isDense>
   FPType* hist_data = reinterpret_cast<FPType*>(hist->Data());
   const size_t nbins = gmat.nbins;
 
-  const size_t max_work_group_size =
-      qu.get_device().get_info<::sycl::info::device::max_work_group_size>();
-  const size_t feat_local = n_columns < max_work_group_size ? n_columns : max_work_group_size;
+  constexpr size_t work_group_size = 32;
+  const size_t n_work_groups = n_columns / work_group_size + (n_columns % work_group_size > 0);
 
   auto event_fill = qu.fill(hist_data, FPType(0), nbins * 2, event_priv);
   auto event_main = qu.submit([&](::sycl::handler& cgh) {
     cgh.depends_on(event_fill);
-    cgh.parallel_for<>(::sycl::range<2>(size, feat_local),
-                       [=](::sycl::item<2> pid) {
-      size_t i = pid.get_id(0);
-      size_t feat = pid.get_id(1);
-
+    cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(size, n_work_groups * work_group_size),
+                                           ::sycl::range<2>(1, work_group_size)),
+                       [=](::sycl::nd_item<2> pid) {
+      const int i = pid.get_global_id(0);
+      auto group = pid.get_group();
 
       const size_t icol_start = n_columns * rid[i];
       const size_t idx_gh = rid[i];
+      const FPType pgh_row[2] = {pgh[2 * idx_gh], pgh[2 * idx_gh + 1]};
       const BinIdxType* gr_index_local = gradient_index + icol_start;
 
-      for (size_t j = feat; j < n_columns; j += feat_local) {
+      const size_t group_id = group.get_group_id()[1];
+      const size_t local_id = group.get_local_id()[1];
+      const size_t j = group_id * work_group_size + local_id;
+      if (j < n_columns) {
         uint32_t idx_bin = static_cast<uint32_t>(gr_index_local[j]);
         if constexpr (isDense) {
           idx_bin += offsets[j];
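Note: the atomic kernel now launches a genuine nd_range: dimension 0 spans rows, and dimension 1 is padded up to a whole number of 32-wide work-groups, which is why the new j < n_columns guard is required. The padding arithmetic as a self-contained check:

#include <cstddef>

// Work-groups needed to cover n_columns with 32-wide groups; the launched
// global range is n_work_groups * 32 >= n_columns, so trailing work-items
// must test j < n_columns before touching memory.
constexpr std::size_t WorkGroupsFor(std::size_t n_columns,
                                    std::size_t work_group_size = 32) {
  return n_columns / work_group_size + (n_columns % work_group_size > 0);
}
static_assert(WorkGroupsFor(32) == 1, "exact multiple needs no padding");
static_assert(WorkGroupsFor(33) == 2, "one extra column adds a padded group");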
@@ -190,8 +214,8 @@ template<typename FPType, typename BinIdxType, bool isDense>
         if (idx_bin < nbins) {
           AtomicRef<FPType> gsum(hist_data[2 * idx_bin]);
           AtomicRef<FPType> hsum(hist_data[2 * idx_bin + 1]);
-          gsum.fetch_add(pgh[2 * idx_gh]);
-          hsum.fetch_add(pgh[2 * idx_gh + 1]);
+          gsum += pgh_row[0];
+          hsum += pgh_row[1];
         }
       }
     });
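Note: the += on AtomicRef performs the same atomic read-modify-write as the removed fetch_add calls; SYCL 2020 defines atomic_ref::operator+= in terms of fetch_add. A plausible shape for the alias, assuming relaxed device-scope atomics on global memory (an assumption about the plugin's definition, not part of this patch):

#include <sycl/sycl.hpp>

// Hypothetical alias: how AtomicRef could map onto SYCL 2020's atomic_ref.
// operator+= on a floating-point atomic_ref compiles to an atomic fetch_add.
template <typename T>
using AtomicRef = ::sycl::atomic_ref<T,
                                     ::sycl::memory_order::relaxed,
                                     ::sycl::memory_scope::device,
                                     ::sycl::access::address_space::global_space>;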
@@ -214,10 +238,6 @@ template<typename FPType, typename BinIdxType>
   const size_t n_columns = isDense ? gmat.nfeatures : gmat.row_stride;
   const size_t nbins = gmat.nbins;
 
-  // max cycle size, while atomics are still effective
-  const size_t max_cycle_size_atomics = nbins;
-  const size_t cycle_size = size;
-
   // TODO(razdoburdin): replace the add-hock dispatching criteria by more sutable one
   bool use_atomic = (size < nbins) || (gmat.max_num_bins == gmat.nbins / n_columns);
 
@@ -136,10 +136,7 @@ void HistUpdater<GradientSumT>::InitData(
 
     hist_buffer_.Init(qu_, nbins);
     size_t buffer_size = kBufferSize;
-    if (buffer_size > info.num_row_ / kMinBlockSize + 1) {
-      buffer_size = info.num_row_ / kMinBlockSize + 1;
-    }
-    hist_buffer_.Reset(buffer_size);
+    hist_buffer_.Reset(kBufferSize);
 
     // initialize histogram builder
     hist_builder_ = common::GHistBuilder<GradientSumT>(qu_, nbins);
@@ -130,7 +130,6 @@ class HistUpdater {
   DataLayout data_layout_;
 
   constexpr static size_t kBufferSize = 2048;
-  constexpr static size_t kMinBlockSize = 128;
   common::GHistBuilder<GradientSumT> hist_builder_;
   common::ParallelGHistBuilder<GradientSumT> hist_buffer_;
   /*! \brief culmulative histogram of gradients. */