From 6a7c6a8ae6ff7e35ce6fdedae3331dd2ef324485 Mon Sep 17 00:00:00 2001
From: Dmitry Razdoburdin
Date: Sat, 23 Mar 2024 05:55:25 +0100
Subject: [PATCH] add sycl realisation of ghist builder (#10138)

Co-authored-by: Dmitry Razdoburdin <>
---
 plugin/sycl/common/hist_util.cc             | 334 ++++++++++++++++++++
 plugin/sycl/common/hist_util.h              |  89 ++++++
 tests/cpp/plugin/sycl_helpers.h             |   9 +-
 tests/cpp/plugin/test_sycl_ghist_builder.cc | 157 +++++++++
 4 files changed, 585 insertions(+), 4 deletions(-)
 create mode 100644 plugin/sycl/common/hist_util.cc
 create mode 100644 plugin/sycl/common/hist_util.h
 create mode 100644 tests/cpp/plugin/test_sycl_ghist_builder.cc

diff --git a/plugin/sycl/common/hist_util.cc b/plugin/sycl/common/hist_util.cc
new file mode 100644
index 000000000..fd813a92c
--- /dev/null
+++ b/plugin/sycl/common/hist_util.cc
@@ -0,0 +1,334 @@
+/*!
+ * Copyright 2017-2023 by Contributors
+ * \file hist_util.cc
+ */
+#include
+#include
+#include
+
+#include "../data/gradient_index.h"
+#include "hist_util.h"
+
+#include
+
+namespace xgboost {
+namespace sycl {
+namespace common {
+
+/*!
+ * \brief Fill histogram with zeroes
+ */
+template
+void InitHist(::sycl::queue qu, GHistRow* hist,
+              size_t size, ::sycl::event* event) {
+  *event = qu.fill(hist->Begin(),
+                   xgboost::detail::GradientPairInternal(), size, *event);
+}
+template void InitHist(::sycl::queue qu,
+                       GHistRow* hist,
+                       size_t size, ::sycl::event* event);
+template void InitHist(::sycl::queue qu,
+                       GHistRow* hist,
+                       size_t size, ::sycl::event* event);
+
+/*!
+ * \brief Compute Subtraction: dst = src1 - src2
+ */
+template
+::sycl::event SubtractionHist(::sycl::queue qu,
+                              GHistRow* dst,
+                              const GHistRow& src1,
+                              const GHistRow& src2,
+                              size_t size, ::sycl::event event_priv) {
+  GradientSumT* pdst = reinterpret_cast(dst->Data());
+  const GradientSumT* psrc1 = reinterpret_cast(src1.DataConst());
+  const GradientSumT* psrc2 = reinterpret_cast(src2.DataConst());
+
+  auto event_final = qu.submit([&](::sycl::handler& cgh) {
+    cgh.depends_on(event_priv);
+    cgh.parallel_for<>(::sycl::range<1>(2 * size), [pdst, psrc1, psrc2](::sycl::item<1> pid) {
+      const size_t i = pid.get_id(0);
+      pdst[i] = psrc1[i] - psrc2[i];
+    });
+  });
+  return event_final;
+}
+template ::sycl::event SubtractionHist(::sycl::queue qu,
+                                       GHistRow* dst,
+                                       const GHistRow& src1,
+                                       const GHistRow& src2,
+                                       size_t size, ::sycl::event event_priv);
+template ::sycl::event SubtractionHist(::sycl::queue qu,
+                                       GHistRow* dst,
+                                       const GHistRow& src1,
+                                       const GHistRow& src2,
+                                       size_t size, ::sycl::event event_priv);
+
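For context (this note and the sketch below are not part of the patch): each GHistRow element is a gradient pair, so the flat floating-point view used throughout this file holds two values per bin — the gradient sum at index 2 * bin and the hessian sum at 2 * bin + 1. That is why SubtractionHist above launches 2 * size work items, and why histograms elsewhere in the patch are allocated with 2 * nbins entries. A minimal host-side C++ illustration of the same layout and subtraction (plain C++, no SYCL; the function name is illustrative only):

#include <cstddef>
#include <vector>

// Flat histogram layout assumed by the kernels: [g_0, h_0, g_1, h_1, ...].
// dst = src1 - src2 over nbins bins, i.e. 2 * nbins scalar values.
void SubtractionHistHost(std::vector<float>* dst,
                         const std::vector<float>& src1,
                         const std::vector<float>& src2,
                         std::size_t nbins) {
  dst->resize(2 * nbins);
  for (std::size_t i = 0; i < 2 * nbins; ++i) {
    (*dst)[i] = src1[i] - src2[i];  // gradient and hessian entries are handled uniformly
  }
}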
+// Kernel using a buffer of per-block partial histograms
+template
+::sycl::event BuildHistKernel(::sycl::queue qu,
+                              const USMVector& gpair_device,
+                              const RowSetCollection::Elem& row_indices,
+                              const GHistIndexMatrix& gmat,
+                              GHistRow* hist,
+                              GHistRow* hist_buffer,
+                              ::sycl::event event_priv) {
+  const size_t size = row_indices.Size();
+  const size_t* rid = row_indices.begin;
+  const size_t n_columns = isDense ? gmat.nfeatures : gmat.row_stride;
+  const GradientPair::ValueT* pgh =
+      reinterpret_cast(gpair_device.DataConst());
+  const BinIdxType* gradient_index = gmat.index.data();
+  const uint32_t* offsets = gmat.index.Offset();
+  FPType* hist_data = reinterpret_cast(hist->Data());
+  const size_t nbins = gmat.nbins;
+
+  const size_t max_work_group_size =
+      qu.get_device().get_info<::sycl::info::device::max_work_group_size>();
+  const size_t work_group_size = n_columns < max_work_group_size ?
+                                 n_columns : max_work_group_size;
+
+  const size_t max_nblocks = hist_buffer->Size() / (nbins * 2);
+  const size_t min_block_size = 128;
+  size_t nblocks = std::min(max_nblocks, size / min_block_size + !!(size % min_block_size));
+  const size_t block_size = size / nblocks + !!(size % nblocks);
+  FPType* hist_buffer_data = reinterpret_cast(hist_buffer->Data());
+
+  auto event_fill = qu.fill(hist_buffer_data, FPType(0), nblocks * nbins * 2, event_priv);
+  auto event_main = qu.submit([&](::sycl::handler& cgh) {
+    cgh.depends_on(event_fill);
+    cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(nblocks, work_group_size),
+                                           ::sycl::range<2>(1, work_group_size)),
+                       [=](::sycl::nd_item<2> pid) {
+      size_t block = pid.get_global_id(0);
+      size_t feat = pid.get_global_id(1);
+
+      FPType* hist_local = hist_buffer_data + block * nbins * 2;
+      for (size_t idx = 0; idx < block_size; ++idx) {
+        size_t i = block * block_size + idx;
+        if (i < size) {
+          const size_t icol_start = n_columns * rid[i];
+          const size_t idx_gh = rid[i];
+
+          pid.barrier(::sycl::access::fence_space::local_space);
+          const BinIdxType* gr_index_local = gradient_index + icol_start;
+
+          for (size_t j = feat; j < n_columns; j += work_group_size) {
+            uint32_t idx_bin = static_cast(gr_index_local[j]);
+            if constexpr (isDense) {
+              idx_bin += offsets[j];
+            }
+            if (idx_bin < nbins) {
+              hist_local[2 * idx_bin] += pgh[2 * idx_gh];
+              hist_local[2 * idx_bin+1] += pgh[2 * idx_gh+1];
+            }
+          }
+        }
+      }
+    });
+  });
+
+  auto event_save = qu.submit([&](::sycl::handler& cgh) {
+    cgh.depends_on(event_main);
+    cgh.parallel_for<>(::sycl::range<1>(nbins), [=](::sycl::item<1> pid) {
+      size_t idx_bin = pid.get_id(0);
+
+      FPType gsum = 0.0f;
+      FPType hsum = 0.0f;
+
+      for (size_t j = 0; j < nblocks; ++j) {
+        gsum += hist_buffer_data[j * nbins * 2 + 2 * idx_bin];
+        hsum += hist_buffer_data[j * nbins * 2 + 2 * idx_bin + 1];
+      }
+
+      hist_data[2 * idx_bin] = gsum;
+      hist_data[2 * idx_bin + 1] = hsum;
+    });
+  });
+  return event_save;
+}
+
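For context (not part of the patch): the buffered kernel above avoids atomics by giving each of the nblocks work-groups its own slice of hist_buffer (2 * nbins values). Rows are split into contiguous chunks of block_size = ceil(size / nblocks), work-items within a group stride over the feature dimension, and the event_save kernel then reduces the nblocks partial histograms bin by bin. A rough host-side C++ sketch of the partition arithmetic and the final reduction (plain C++, illustrative names, not the SYCL code itself):

#include <cstddef>
#include <vector>

// Ceiling division written the same way as in the kernel: a / b + !!(a % b).
std::size_t CeilDiv(std::size_t a, std::size_t b) { return a / b + !!(a % b); }

// Reduce nblocks partial histograms (each 2 * nbins values) into one histogram,
// mirroring what the event_save kernel does on the device.
void ReducePartials(const std::vector<float>& buffer, std::size_t nblocks,
                    std::size_t nbins, std::vector<float>* hist) {
  hist->assign(2 * nbins, 0.0f);
  for (std::size_t block = 0; block < nblocks; ++block) {
    for (std::size_t k = 0; k < 2 * nbins; ++k) {
      (*hist)[k] += buffer[block * 2 * nbins + k];
    }
  }
}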
+// Kernel using atomics
+template
+::sycl::event BuildHistKernel(::sycl::queue qu,
+                              const USMVector& gpair_device,
+                              const RowSetCollection::Elem& row_indices,
+                              const GHistIndexMatrix& gmat,
+                              GHistRow* hist,
+                              ::sycl::event event_priv) {
+  const size_t size = row_indices.Size();
+  const size_t* rid = row_indices.begin;
+  const size_t n_columns = isDense ? gmat.nfeatures : gmat.row_stride;
+  const GradientPair::ValueT* pgh =
+      reinterpret_cast(gpair_device.DataConst());
+  const BinIdxType* gradient_index = gmat.index.data();
+  const uint32_t* offsets = gmat.index.Offset();
+  FPType* hist_data = reinterpret_cast(hist->Data());
+  const size_t nbins = gmat.nbins;
+
+  const size_t max_work_group_size =
+      qu.get_device().get_info<::sycl::info::device::max_work_group_size>();
+  const size_t feat_local = n_columns < max_work_group_size ?
+                            n_columns : max_work_group_size;
+
+  auto event_fill = qu.fill(hist_data, FPType(0), nbins * 2, event_priv);
+  auto event_main = qu.submit([&](::sycl::handler& cgh) {
+    cgh.depends_on(event_fill);
+    cgh.parallel_for<>(::sycl::range<2>(size, feat_local),
+                       [=](::sycl::item<2> pid) {
+      size_t i = pid.get_id(0);
+      size_t feat = pid.get_id(1);
+
+      const size_t icol_start = n_columns * rid[i];
+      const size_t idx_gh = rid[i];
+
+      const BinIdxType* gr_index_local = gradient_index + icol_start;
+
+      for (size_t j = feat; j < n_columns; j += feat_local) {
+        uint32_t idx_bin = static_cast(gr_index_local[j]);
+        if constexpr (isDense) {
+          idx_bin += offsets[j];
+        }
+        if (idx_bin < nbins) {
+          AtomicRef gsum(hist_data[2 * idx_bin]);
+          AtomicRef hsum(hist_data[2 * idx_bin + 1]);
+          gsum.fetch_add(pgh[2 * idx_gh]);
+          hsum.fetch_add(pgh[2 * idx_gh + 1]);
+        }
+      }
+    });
+  });
+  return event_main;
+}
+
+template
+::sycl::event BuildHistDispatchKernel(
+    ::sycl::queue qu,
+    const USMVector& gpair_device,
+    const RowSetCollection::Elem& row_indices,
+    const GHistIndexMatrix& gmat,
+    GHistRow* hist,
+    bool isDense,
+    GHistRow* hist_buffer,
+    ::sycl::event events_priv,
+    bool force_atomic_use) {
+  const size_t size = row_indices.Size();
+  const size_t n_columns = isDense ? gmat.nfeatures : gmat.row_stride;
+  const size_t nbins = gmat.nbins;
+
+  // max cycle size for which atomics are still effective
+  const size_t max_cycle_size_atomics = nbins;
+  const size_t cycle_size = size;
+
+  // TODO(razdoburdin): replace the ad-hoc dispatching criteria with a more suitable one
+  bool use_atomic = (size < nbins) || (gmat.max_num_bins == gmat.nbins / n_columns);
+
+  // force_atomic_use flag is used only for testing
+  use_atomic = use_atomic || force_atomic_use;
+  if (!use_atomic) {
+    if (isDense) {
+      return BuildHistKernel(qu, gpair_device, row_indices,
+                             gmat, hist, hist_buffer,
+                             events_priv);
+    } else {
+      return BuildHistKernel(qu, gpair_device, row_indices,
+                             gmat, hist, hist_buffer,
+                             events_priv);
+    }
+  } else {
+    if (isDense) {
+      return BuildHistKernel(qu, gpair_device, row_indices,
+                             gmat, hist, events_priv);
+    } else {
+      return BuildHistKernel(qu, gpair_device, row_indices,
+                             gmat, hist, events_priv);
+    }
+  }
+}
+
+template
+::sycl::event BuildHistKernel(::sycl::queue qu,
+                              const USMVector& gpair_device,
+                              const RowSetCollection::Elem& row_indices,
+                              const GHistIndexMatrix& gmat, const bool isDense,
+                              GHistRow* hist,
+                              GHistRow* hist_buffer,
+                              ::sycl::event event_priv,
+                              bool force_atomic_use) {
+  const bool is_dense = isDense;
+  switch (gmat.index.GetBinTypeSize()) {
+    case BinTypeSize::kUint8BinsTypeSize:
+      return BuildHistDispatchKernel(qu, gpair_device, row_indices,
+                                     gmat, hist, is_dense, hist_buffer,
+                                     event_priv, force_atomic_use);
+      break;
+    case BinTypeSize::kUint16BinsTypeSize:
+      return BuildHistDispatchKernel(qu, gpair_device, row_indices,
+                                     gmat, hist, is_dense, hist_buffer,
+                                     event_priv, force_atomic_use);
+      break;
+    case BinTypeSize::kUint32BinsTypeSize:
+      return BuildHistDispatchKernel(qu, gpair_device, row_indices,
+                                     gmat, hist, is_dense, hist_buffer,
+                                     event_priv, force_atomic_use);
+      break;
+    default:
+      CHECK(false);  // no default behavior
+  }
+}
+
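For context (not part of the patch): BuildHistDispatchKernel picks the atomic kernel when the row batch is small relative to the number of bins (per-block partial buffers would be mostly zeros) or, roughly, when gmat.nbins / n_columns equals max_num_bins, i.e. every feature uses the full bin budget; otherwise it uses the buffered kernel. A compact restatement of that decision as a standalone helper (hypothetical name, plain C++, mirroring the heuristic above):

#include <cstddef>

// Returns true when the atomic kernel is preferred, per the criteria used in
// BuildHistDispatchKernel above; force_atomic is the testing-only override.
bool PreferAtomicKernel(std::size_t n_rows, std::size_t n_bins,
                        std::size_t max_num_bins, std::size_t n_columns,
                        bool force_atomic) {
  const bool few_rows = n_rows < n_bins;
  const bool full_bin_budget = (max_num_bins == n_bins / n_columns);
  return few_rows || full_bin_budget || force_atomic;
}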
+template
+::sycl::event GHistBuilder::BuildHist(
+    const USMVector& gpair_device,
+    const RowSetCollection::Elem& row_indices,
+    const GHistIndexMatrix &gmat,
+    GHistRowT* hist,
+    bool isDense,
+    GHistRowT* hist_buffer,
+    ::sycl::event event_priv,
+    bool force_atomic_use) {
+  return BuildHistKernel(qu_, gpair_device, row_indices, gmat,
+                         isDense, hist, hist_buffer, event_priv,
+                         force_atomic_use);
+}
+
+template
+::sycl::event GHistBuilder::BuildHist(
+    const USMVector& gpair_device,
+    const RowSetCollection::Elem& row_indices,
+    const GHistIndexMatrix& gmat,
+    GHistRow* hist,
+    bool isDense,
+    GHistRow* hist_buffer,
+    ::sycl::event event_priv,
+    bool force_atomic_use);
+template
+::sycl::event GHistBuilder::BuildHist(
+    const USMVector& gpair_device,
+    const RowSetCollection::Elem& row_indices,
+    const GHistIndexMatrix& gmat,
+    GHistRow* hist,
+    bool isDense,
+    GHistRow* hist_buffer,
+    ::sycl::event event_priv,
+    bool force_atomic_use);
+
+template
+void GHistBuilder::SubtractionTrick(GHistRowT* self,
+                                    const GHistRowT& sibling,
+                                    const GHistRowT& parent) {
+  const size_t size = self->Size();
+  CHECK_EQ(sibling.Size(), size);
+  CHECK_EQ(parent.Size(), size);
+
+  SubtractionHist(qu_, self, parent, sibling, size, ::sycl::event());
+}
+template
+void GHistBuilder::SubtractionTrick(GHistRow* self,
+                                    const GHistRow& sibling,
+                                    const GHistRow& parent);
+template
+void GHistBuilder::SubtractionTrick(GHistRow* self,
+                                    const GHistRow& sibling,
+                                    const GHistRow& parent);
+} // namespace common
+} // namespace sycl
+} // namespace xgboost
diff --git a/plugin/sycl/common/hist_util.h b/plugin/sycl/common/hist_util.h
new file mode 100644
index 000000000..7c7af71ae
--- /dev/null
+++ b/plugin/sycl/common/hist_util.h
@@ -0,0 +1,89 @@
+/*!
+ * Copyright 2017-2023 by Contributors
+ * \file hist_util.h
+ */
+#ifndef PLUGIN_SYCL_COMMON_HIST_UTIL_H_
+#define PLUGIN_SYCL_COMMON_HIST_UTIL_H_
+
+#include
+#include
+#include
+
+#include "../data.h"
+#include "row_set.h"
+
+#include "../../src/common/hist_util.h"
+#include "../data/gradient_index.h"
+
+#include
+
+namespace xgboost {
+namespace sycl {
+namespace common {
+
+template
+using GHistRow = USMVector, memory_type>;
+
+using BinTypeSize = ::xgboost::common::BinTypeSize;
+
+class ColumnMatrix;
+
+/*!
+ * \brief Fill histogram with zeroes
+ */
+template
+void InitHist(::sycl::queue qu,
+              GHistRow* hist,
+              size_t size, ::sycl::event* event);
+
+/*!
+ * \brief Compute subtraction: dst = src1 - src2
+ */
+template
+::sycl::event SubtractionHist(::sycl::queue qu,
+                              GHistRow* dst,
+                              const GHistRow& src1,
+                              const GHistRow& src2,
+                              size_t size, ::sycl::event event_priv);
+
+/*!
+ * \brief Builder for histograms of gradient statistics
+ */
+template
+class GHistBuilder {
+ public:
+  template
+  using GHistRowT = GHistRow;
+
+  GHistBuilder() = default;
+  GHistBuilder(::sycl::queue qu, uint32_t nbins) : qu_{qu}, nbins_{nbins} {}
+
+  // Construct a histogram via histogram aggregation
+  ::sycl::event BuildHist(const USMVector& gpair_device,
+                          const RowSetCollection::Elem& row_indices,
+                          const GHistIndexMatrix& gmat,
+                          GHistRowT* HistCollection,
+                          bool isDense,
+                          GHistRowT* hist_buffer,
+                          ::sycl::event event,
+                          bool force_atomic_use = false);
+
+  // Construct a histogram via subtraction trick
+  void SubtractionTrick(GHistRowT* self,
+                        const GHistRowT& sibling,
+                        const GHistRowT& parent);
+
+  uint32_t GetNumBins() const {
+    return nbins_;
+  }
+
+ private:
+  /*! \brief Number of all bins over all features */
+  uint32_t nbins_ { 0 };
+
+  ::sycl::queue qu_;
+};
+} // namespace common
+} // namespace sycl
+} // namespace xgboost
+#endif // PLUGIN_SYCL_COMMON_HIST_UTIL_H_
diff --git a/tests/cpp/plugin/sycl_helpers.h b/tests/cpp/plugin/sycl_helpers.h
index c5cdd3ea5..afc403d86 100644
--- a/tests/cpp/plugin/sycl_helpers.h
+++ b/tests/cpp/plugin/sycl_helpers.h
@@ -8,22 +8,23 @@ namespace xgboost::sycl {
 template
 void VerifySyclVector(const USMVector& sycl_vector,
-                      const Container& host_vector) {
+                      const Container& host_vector, T eps = T()) {
   ASSERT_EQ(sycl_vector.Size(), host_vector.size());
   size_t size = sycl_vector.Size();
   for (size_t i = 0; i < size; ++i) {
-    ASSERT_EQ(sycl_vector[i], host_vector[i]);
+    EXPECT_NEAR(sycl_vector[i], host_vector[i], eps);
   }
 }
 template
-void VerifySyclVector(const std::vector& sycl_vector, const Container& host_vector) {
+void VerifySyclVector(const std::vector& sycl_vector,
+                      const Container& host_vector, T eps = T()) {
   ASSERT_EQ(sycl_vector.size(), host_vector.size());
   size_t size = sycl_vector.size();
   for (size_t i = 0; i < size; ++i) {
-    ASSERT_EQ(sycl_vector[i], host_vector[i]);
+    EXPECT_NEAR(sycl_vector[i], host_vector[i], eps);
   }
 }
diff --git a/tests/cpp/plugin/test_sycl_ghist_builder.cc b/tests/cpp/plugin/test_sycl_ghist_builder.cc
new file mode 100644
index 000000000..dacbc75fc
--- /dev/null
+++ b/tests/cpp/plugin/test_sycl_ghist_builder.cc
@@ -0,0 +1,157 @@
+/**
+ * Copyright 2020-2024 by XGBoost contributors
+ */
+#include
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
+#pragma GCC diagnostic ignored "-W#pragma-messages"
+#include "../../../src/data/gradient_index.h"  // for GHistIndexMatrix
+#pragma GCC diagnostic pop
+
+#include "../../../plugin/sycl/common/hist_util.h"
+#include "../../../plugin/sycl/device_manager.h"
+#include "sycl_helpers.h"
+#include "../helpers.h"
+
+namespace xgboost::sycl::common {
+
+template
+void GHistBuilderTest(float sparsity, bool force_atomic_use) {
+  const size_t num_rows = 8;
+  const size_t num_columns = 1;
+  const int n_bins = 2;
+  const GradientSumT eps = 1e-6;
+
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});
+
+  DeviceManager device_manager;
+  auto qu = device_manager.GetQueue(ctx.Device());
+
+  auto p_fmat = RandomDataGenerator{num_rows, num_columns, sparsity}.GenerateDMatrix();
+  sycl::DeviceMatrix dmat;
+  dmat.Init(qu, p_fmat.get());
+
+  GHistIndexMatrix gmat_sycl;
+  gmat_sycl.Init(qu, &ctx, dmat, n_bins);
+
+  xgboost::GHistIndexMatrix gmat{&ctx, p_fmat.get(), n_bins, 0.3, false};
+
+  RowSetCollection row_set_collection;
+  auto& row_indices = row_set_collection.Data();
+  row_indices.Resize(&qu, num_rows);
+  size_t* p_row_indices = row_indices.Data();
+
+  qu.submit([&](::sycl::handler& cgh) {
+    cgh.parallel_for<>(::sycl::range<1>(num_rows),
+                       [p_row_indices](::sycl::item<1> pid) {
+      const size_t idx = pid.get_id(0);
+      p_row_indices[idx] = idx;
+    });
+  }).wait_and_throw();
+  row_set_collection.Init();
+
+  auto builder = GHistBuilder(qu, n_bins);
+
+  std::vector gpair = {
+      {0.1f, 0.2f}, {0.3f, 0.4f}, {0.5f, 0.6f}, {0.7f, 0.8f},
+      {0.9f, 0.1f}, {0.2f, 0.3f}, {0.4f, 0.5f}, {0.6f, 0.7f}};
+  CHECK_EQ(gpair.size(), num_rows);
+  USMVector gpair_device(&qu, gpair);
+
+  std::vector hist_host(2*n_bins);
+  GHistRow hist(&qu, 2 * n_bins);
+  ::sycl::event event;
+
+  const size_t nblocks = 2;
+  GHistRow hist_buffer(&qu, 2 * nblocks * n_bins);
+
+  InitHist(qu, &hist, hist.Size(), &event);
+  InitHist(qu, &hist_buffer,
+           hist_buffer.Size(), &event);
+
+  event = builder.BuildHist(gpair_device, row_set_collection[0], gmat_sycl, &hist,
+                            sparsity < eps, &hist_buffer, event, force_atomic_use);
+  qu.memcpy(hist_host.data(), hist.Data(),
+            2 * n_bins * sizeof(GradientSumT), event);
+  qu.wait_and_throw();
+
+  // Build hist on host to compare
+  std::vector hist_desired(2*n_bins);
+  for (size_t rid = 0; rid < num_rows; ++rid) {
+    const size_t ibegin = gmat.row_ptr[rid];
+    const size_t iend = gmat.row_ptr[rid + 1];
+    for (size_t i = ibegin; i < iend; ++i) {
+      const size_t bin_idx = gmat.index[i];
+      hist_desired[2*bin_idx] += gpair[rid].GetGrad();
+      hist_desired[2*bin_idx+1] += gpair[rid].GetHess();
+    }
+  }
+
+  VerifySyclVector(hist_host, hist_desired, eps);
+}
+
+template
+void GHistSubtractionTest() {
+  const size_t n_bins = 4;
+  using GHistType = GHistRow;
+
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});
+
+  DeviceManager device_manager;
+  auto qu = device_manager.GetQueue(ctx.Device());
+
+  ::sycl::event event;
+  std::vector hist1_host = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8};
+  GHistType hist1(&qu, 2 * n_bins);
+  event = qu.memcpy(hist1.Data(), hist1_host.data(),
+                    2 * n_bins * sizeof(GradientSumT), event);
+
+  std::vector hist2_host = {0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1};
+  GHistType hist2(&qu, 2 * n_bins);
+  event = qu.memcpy(hist2.Data(), hist2_host.data(),
+                    2 * n_bins * sizeof(GradientSumT), event);
+
+  std::vector hist3_host(2 * n_bins);
+  GHistType hist3(&qu, 2 * n_bins);
+  event = SubtractionHist(qu, &hist3, hist1, hist2, n_bins, event);
+  qu.memcpy(hist3_host.data(), hist3.Data(),
+            2 * n_bins * sizeof(GradientSumT), event);
+  qu.wait_and_throw();
+
+  std::vector hist3_desired(2 * n_bins);
+  for (size_t idx = 0; idx < 2 * n_bins; ++idx) {
+    hist3_desired[idx] = hist1_host[idx] - hist2_host[idx];
+  }
+
+  const GradientSumT eps = 1e-6;
+  VerifySyclVector(hist3_host, hist3_desired, eps);
+}
+
+TEST(SyclGHistBuilder, ByBlockDenseCase) {
+  GHistBuilderTest(0.0, false);
+  GHistBuilderTest(0.0, false);
+}
+
+TEST(SyclGHistBuilder, ByBlockSparseCase) {
+  GHistBuilderTest(0.3, false);
+  GHistBuilderTest(0.3, false);
+}
+
+TEST(SyclGHistBuilder, ByAtomicDenseCase) {
+  GHistBuilderTest(0.0, true);
+  GHistBuilderTest(0.0, true);
+}
+
+TEST(SyclGHistBuilder, ByAtomicSparseCase) {
+  GHistBuilderTest(0.3, true);
+  GHistBuilderTest(0.3, true);
+}
+
+TEST(SyclGHistBuilder, Subtraction) {
+  GHistSubtractionTest();
+  GHistSubtractionTest();
+}
+
+} // namespace xgboost::sycl::common
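For context (not part of the patch): the Subtraction test exercises the identity behind GHistBuilder::SubtractionTrick. Every row of a parent node lands in exactly one child, so the parent histogram is the element-wise sum of the two child histograms, and the second child's histogram can be recovered as parent minus sibling instead of being rebuilt from the data. A minimal host-side sketch of that identity (plain C++, illustrative name):

#include <cstddef>
#include <vector>

// Given the parent histogram and one already-built child, recover the other
// child without another pass over the rows: other = parent - built_child.
std::vector<float> SiblingHist(const std::vector<float>& parent,
                               const std::vector<float>& built_child) {
  std::vector<float> other(parent.size());
  for (std::size_t i = 0; i < parent.size(); ++i) {
    other[i] = parent[i] - built_child[i];
  }
  return other;
}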