From c7e7ce75693f899f5dc9720c5adb58305d2d93b1 Mon Sep 17 00:00:00 2001
From: Dmitry Razdoburdin
Date: Tue, 21 May 2024 17:38:52 +0200
Subject: [PATCH] [SYCL] Add nodes initialisation (#10269)

---------

Co-authored-by: Dmitry Razdoburdin <>
Co-authored-by: Jiaming Yuan
---
 plugin/sycl/device_manager.cc              |   5 -
 plugin/sycl/device_manager.h               |   4 +
 plugin/sycl/objective/multiclass_obj.cc    |   8 +-
 plugin/sycl/tree/hist_updater.cc           |  97 +++++++++++++++++-
 plugin/sycl/tree/hist_updater.h            |  46 +++++++--
 plugin/sycl/tree/param.h                   | 109 +++++++++++++++++++++
 tests/cpp/plugin/test_sycl_hist_updater.cc | 106 +++++++++++++++++---
 7 files changed, 342 insertions(+), 33 deletions(-)

diff --git a/plugin/sycl/device_manager.cc b/plugin/sycl/device_manager.cc
index 072c9fd55..0ddbf1440 100644
--- a/plugin/sycl/device_manager.cc
+++ b/plugin/sycl/device_manager.cc
@@ -2,11 +2,6 @@
  * Copyright 2017-2023 by Contributors
  * \file device_manager.cc
  */
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
-#pragma GCC diagnostic ignored "-W#pragma-messages"
-#pragma GCC diagnostic pop
-
 #include "../sycl/device_manager.h"
 
 #include "../../src/collective/communicator-inl.h"
diff --git a/plugin/sycl/device_manager.h b/plugin/sycl/device_manager.h
index 0ae2ee9fe..84d4b24c0 100644
--- a/plugin/sycl/device_manager.h
+++ b/plugin/sycl/device_manager.h
@@ -12,7 +12,11 @@
 #include
 
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
+#pragma GCC diagnostic ignored "-W#pragma-messages"
 #include "xgboost/context.h"
+#pragma GCC diagnostic pop
 
 namespace xgboost {
 namespace sycl {
diff --git a/plugin/sycl/objective/multiclass_obj.cc b/plugin/sycl/objective/multiclass_obj.cc
index 16efe2a45..5dcc8c3de 100644
--- a/plugin/sycl/objective/multiclass_obj.cc
+++ b/plugin/sycl/objective/multiclass_obj.cc
@@ -3,19 +3,15 @@
  * \file multiclass_obj.cc
  * \brief Definition of multi-class classification objectives.
  */
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
-#pragma GCC diagnostic ignored "-W#pragma-messages"
-#pragma GCC diagnostic pop
-
 #include
 #include
 #include
 #include
 
-#include "xgboost/parameter.h"
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wtautological-constant-compare"
+#pragma GCC diagnostic ignored "-W#pragma-messages"
+#include "xgboost/parameter.h"
 #include "xgboost/data.h"
 #include "../../src/common/math.h"
 #pragma GCC diagnostic pop
diff --git a/plugin/sycl/tree/hist_updater.cc b/plugin/sycl/tree/hist_updater.cc
index 03be41994..76ecdeab8 100644
--- a/plugin/sycl/tree/hist_updater.cc
+++ b/plugin/sycl/tree/hist_updater.cc
@@ -8,6 +8,7 @@
 #include
 
 #include "../common/hist_util.h"
+#include "../../src/collective/allreduce.h"
 
 namespace xgboost {
 namespace sycl {
@@ -111,7 +112,6 @@ void HistUpdater<GradientSumT>::InitSampling(
 
 template <typename GradientSumT>
 void HistUpdater<GradientSumT>::InitData(
-                                Context const * ctx,
                                 const common::GHistIndexMatrix& gmat,
                                 const USMVector<GradientPair, MemoryType::on_device> &gpair,
                                 const DMatrix& fmat,
@@ -215,6 +215,101 @@ void HistUpdater<GradientSumT>::InitData(
       data_layout_ = kSparseData;
     }
   }
+
+  if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) {
+    /* specialized code for dense data:
+       choose the column that has the least positive number of discrete bins.
+       For dense data (with no missing values),
+       the sum of the gradient histogram is equal to snode[nid] */
+    const std::vector<uint32_t>& row_ptr = gmat.cut.Ptrs();
+    const auto nfeature = static_cast<bst_uint>(row_ptr.size() - 1);
+    uint32_t min_nbins_per_feature = 0;
+    for (bst_uint i = 0; i < nfeature; ++i) {
+      const uint32_t nbins = row_ptr[i + 1] - row_ptr[i];
+      if (nbins > 0) {
+        if (min_nbins_per_feature == 0 || min_nbins_per_feature > nbins) {
+          min_nbins_per_feature = nbins;
+          fid_least_bins_ = i;
+        }
+      }
+    }
+    CHECK_GT(min_nbins_per_feature, 0U);
+  }
+
+  std::fill(snode_host_.begin(), snode_host_.end(), NodeEntry<GradientSumT>(param_));
   builder_monitor_.Stop("InitData");
 }
+
+template <typename GradientSumT>
+void HistUpdater<GradientSumT>::InitNewNode(int nid,
+                                            const common::GHistIndexMatrix& gmat,
+                                            const USMVector<GradientPair, MemoryType::on_device> &gpair,
+                                            const DMatrix& fmat,
+                                            const RegTree& tree) {
+  builder_monitor_.Start("InitNewNode");
+
+  snode_host_.resize(tree.NumNodes(), NodeEntry<GradientSumT>(param_));
+  {
+    if (tree[nid].IsRoot()) {
+      GradStats<GradientSumT> grad_stat;
+      if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) {
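+        // With a dense layout every row contributes exactly one entry to each
+        // feature's histogram, so the bins of the single feature with the
+        // fewest bins (fid_least_bins_) already sum to the node's total gradient.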
+        const std::vector<uint32_t>& row_ptr = gmat.cut.Ptrs();
+        const uint32_t ibegin = row_ptr[fid_least_bins_];
+        const uint32_t iend = row_ptr[fid_least_bins_ + 1];
+        const auto* hist = reinterpret_cast<GradStats<GradientSumT>*>(hist_[nid].Data());
+
+        std::vector<GradStats<GradientSumT>> ets(iend - ibegin);
+        qu_.memcpy(ets.data(), hist + ibegin,
+                   (iend - ibegin) * sizeof(GradStats<GradientSumT>)).wait_and_throw();
+        for (const auto& et : ets) {
+          grad_stat += et;
+        }
+      } else {
+        const common::RowSetCollection::Elem e = row_set_collection_[nid];
+        const size_t* row_idxs = e.begin;
+        const size_t size = e.Size();
+        const GradientPair* gpair_ptr = gpair.DataConst();
+
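+        // Sparse case: reduce the gradient pairs of the node's rows directly.
+        // A one-element buffer collects the result of a ::sycl::plus<>
+        // reduction with one work-item per row.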
+        ::sycl::buffer<GradStats<GradientSumT>> buff(&grad_stat, 1);
+        qu_.submit([&](::sycl::handler& cgh) {
+          auto reduction = ::sycl::reduction(buff, cgh, ::sycl::plus<>());
+          cgh.parallel_for<>(::sycl::range<1>(size), reduction,
+                            [=](::sycl::item<1> pid, auto& sum) {
+            size_t i = pid.get_id(0);
+            size_t row_idx = row_idxs[i];
+            if constexpr (std::is_same<GradientPair::ValueT, GradientSumT>::value) {
+              sum += gpair_ptr[row_idx];
+            } else {
+              sum += GradStats<GradientSumT>(gpair_ptr[row_idx].GetGrad(),
+                                             gpair_ptr[row_idx].GetHess());
+            }
+          });
+        }).wait_and_throw();
+      }
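+      // In distributed mode the local sums must be combined: grad and hess
+      // are reduced across workers as two scalars via allreduce.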
+      auto rc = collective::Allreduce(
+          ctx_, linalg::MakeVec(reinterpret_cast<GradientSumT*>(&grad_stat), 2),
+          collective::Op::kSum);
+      SafeColl(rc);
+      snode_host_[nid].stats = grad_stat;
+    } else {
+      int parent_id = tree[nid].Parent();
+      if (tree[nid].IsLeftChild()) {
+        snode_host_[nid].stats = snode_host_[parent_id].best.left_sum;
+      } else {
+        snode_host_[nid].stats = snode_host_[parent_id].best.right_sum;
+      }
+    }
+  }
+
+  // calculating the weights
+  {
+    auto evaluator = tree_evaluator_.GetEvaluator();
+    bst_uint parentid = tree[nid].Parent();
+    snode_host_[nid].weight = evaluator.CalcWeight(parentid, snode_host_[nid].stats);
+    snode_host_[nid].root_gain = evaluator.CalcGain(parentid, snode_host_[nid].stats);
+  }
+  builder_monitor_.Stop("InitNewNode");
+}
 
 template class HistUpdater<float>;
diff --git a/plugin/sycl/tree/hist_updater.h b/plugin/sycl/tree/hist_updater.h
index d60eb7065..544a7c266 100644
--- a/plugin/sycl/tree/hist_updater.h
+++ b/plugin/sycl/tree/hist_updater.h
@@ -26,6 +26,22 @@
 namespace xgboost {
 namespace sycl {
 namespace tree {
 
+// data structure
+template <typename GradType>
+struct NodeEntry {
+  /*! \brief statistics for the node entry */
+  GradStats<GradType> stats;
+  /*! \brief loss of this node, without split */
+  GradType root_gain;
+  /*! \brief weight calculated for the current data */
+  GradType weight;
+  /*! \brief current best solution */
+  SplitEntry<GradType> best;
+  // constructor
+  explicit NodeEntry(const xgboost::tree::TrainParam& param)
+      : root_gain(0.0f), weight(0.0f) {}
+};
+
 template <typename GradientSumT>
 class HistUpdater {
  public:
   template <MemoryType memory_type = MemoryType::shared>
   using GHistRowT = common::GHistRow<GradientSumT, memory_type>;
   using GradientPairT = xgboost::detail::GradientPairInternal<GradientSumT>;
 
-  explicit HistUpdater(::sycl::queue qu,
-                       const xgboost::tree::TrainParam& param,
-                       std::unique_ptr<TreeUpdater> pruner,
-                       FeatureInteractionConstraintHost int_constraints_,
-                       DMatrix const* fmat)
-    : qu_(qu), param_(param),
+  explicit HistUpdater(const Context* ctx,
+                       ::sycl::queue qu,
+                       const xgboost::tree::TrainParam& param,
+                       std::unique_ptr<TreeUpdater> pruner,
+                       FeatureInteractionConstraintHost int_constraints_,
+                       DMatrix const* fmat)
+    : ctx_(ctx), qu_(qu), param_(param),
       tree_evaluator_(qu, param, fmat->Info().num_col_),
       pruner_(std::move(pruner)),
       interaction_constraints_{std::move(int_constraints_)},
@@ -61,8 +78,7 @@ class HistUpdater {
                     USMVector<size_t, MemoryType::on_device>* row_indices);
 
-  void InitData(Context const * ctx,
-                const common::GHistIndexMatrix& gmat,
+  void InitData(const common::GHistIndexMatrix& gmat,
                 const USMVector<GradientPair, MemoryType::on_device> &gpair,
                 const DMatrix& fmat,
                 const RegTree& tree);
 
@@ -78,6 +94,12 @@ class HistUpdater {
                               data_layout_ != kSparseData,
                               hist_buffer,
                               event_priv);
   }
 
+  void InitNewNode(int nid,
+                   const common::GHistIndexMatrix& gmat,
+                   const USMVector<GradientPair, MemoryType::on_device> &gpair,
+                   const DMatrix& fmat,
+                   const RegTree& tree);
+
   void BuildLocalHistograms(const common::GHistIndexMatrix &gmat,
                             RegTree *p_tree,
                             const USMVector<GradientPair, MemoryType::on_device> &gpair);
 
@@ -89,6 +111,7 @@ class HistUpdater {
   // --data fields--
+  const Context* ctx_;
   size_t sub_group_size_;
 
   // the internal row sets
@@ -113,9 +136,16 @@ class HistUpdater {
   /*! \brief culmulative histogram of gradients. */
   common::HistCollection<GradientSumT, MemoryType::on_device> hist_;
 
+  /*! \brief TreeNode Data: statistics for each constructed node */
+  std::vector<NodeEntry<GradientSumT>> snode_host_;
+
   xgboost::common::Monitor builder_monitor_;
   xgboost::common::Monitor kernel_monitor_;
 
+  /*! \brief Feature with the least number of bins. To be used for the dense
+      specialization of InitNewNode(). */
+  uint32_t fid_least_bins_;
+
   uint64_t seed_ = 0;
 
   // key is the node id which should be calculated by Subtraction Trick, value is the node which
diff --git a/plugin/sycl/tree/param.h b/plugin/sycl/tree/param.h
index 1b47d83a4..a83a7ad13 100644
--- a/plugin/sycl/tree/param.h
+++ b/plugin/sycl/tree/param.h
@@ -49,6 +49,115 @@ struct TrainParam {
 template <typename GradType>
 using GradStats = xgboost::detail::GradientPairInternal<GradType>;
 
+/*!
+ * \brief SYCL implementation of SplitEntryContainer for device compilation.
+ *        The original structure cannot be used due to the 'cat_bits' field of
+ *        type std::vector, which is not device-copyable.
+ */
+template <typename GradientT>
+struct SplitEntryContainer {
+  /*! \brief loss change after splitting this node */
+  bst_float loss_chg {0.0f};
+  /*! \brief split index */
+  bst_feature_t sindex{0};
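+  // sindex packs two values: bit 31 keeps the default direction for missing
+  // values, the remaining bits keep the feature index
+  // (see SplitIndex() and DefaultLeft() below).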
+  bst_float split_value{0.0f};
+
+  GradientT left_sum;
+  GradientT right_sum;
+
+  SplitEntryContainer() = default;
+
+  friend std::ostream& operator<<(std::ostream& os, SplitEntryContainer const& s) {
+    os << "loss_chg: " << s.loss_chg << ", "
+       << "split index: " << s.SplitIndex() << ", "
+       << "split value: " << s.split_value << ", "
+       << "left_sum: " << s.left_sum << ", "
+       << "right_sum: " << s.right_sum;
+    return os;
+  }
+  /*! \return feature index to split on */
+  bst_feature_t SplitIndex() const { return sindex & ((1U << 31) - 1U); }
+  /*! \return whether missing value goes to left branch */
+  bool DefaultLeft() const { return (sindex >> 31) != 0; }
+  /*!
+   * \brief Decides whether we can replace the current entry with the given statistics.
+   *
+   *        This function gives better priority to the lower index when
+   *        loss_chg == new_loss_chg. Not the best way, but it helps to give
+   *        consistent results during multi-threaded execution.
+   *
+   * \param new_loss_chg the loss reduction obtained through the split
+   * \param split_index the feature index where the split is on
+   */
+  inline bool NeedReplace(bst_float new_loss_chg, unsigned split_index) const {
+    if (::sycl::isinf(new_loss_chg)) {  // in some cases new_loss_chg can be NaN or Inf,
+                                        // for example when lambda = 0 & min_child_weight = 0
+                                        // skip value in this case
+      return false;
+    } else if (this->SplitIndex() <= split_index) {
+      return new_loss_chg > this->loss_chg;
+    } else {
+      return !(this->loss_chg > new_loss_chg);
+    }
+  }
+  /*!
+   * \brief Update the split entry, replacing it if e is better.
+   * \param e candidate split solution
+   * \return whether the proposed split is better and can replace the current split
+   */
+  inline bool Update(const SplitEntryContainer &e) {
+    if (this->NeedReplace(e.loss_chg, e.SplitIndex())) {
+      this->loss_chg = e.loss_chg;
+      this->sindex = e.sindex;
+      this->split_value = e.split_value;
+      this->left_sum = e.left_sum;
+      this->right_sum = e.right_sum;
+      return true;
+    } else {
+      return false;
+    }
+  }
+  /*!
+   * \brief Update the split entry, replacing it if the new candidate is better.
+   * \param new_loss_chg loss reduction of the new candidate
+   * \param split_index feature index to split on
+   * \param new_split_value the split point
+   * \param default_left whether the missing value goes to the left
+   * \return whether the proposed split is better and can replace the current split
+   */
+  bool Update(bst_float new_loss_chg, unsigned split_index,
+              bst_float new_split_value, bool default_left,
+              const GradientT &left_sum,
+              const GradientT &right_sum) {
+    if (this->NeedReplace(new_loss_chg, split_index)) {
+      this->loss_chg = new_loss_chg;
+      if (default_left) {
+        split_index |= (1U << 31);
+      }
+      this->sindex = split_index;
+      this->split_value = new_split_value;
+      this->left_sum = left_sum;
+      this->right_sum = right_sum;
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  /*! \brief same as Update, used by AllReduce */
+  inline static void Reduce(SplitEntryContainer &dst,          // NOLINT(*)
+                            const SplitEntryContainer &src) {  // NOLINT(*)
+    dst.Update(src);
+  }
+};
+
+template <typename GradType>
+using SplitEntry = SplitEntryContainer<GradStats<GradType>>;
+
 }  // namespace tree
 }  // namespace sycl
 }  // namespace xgboost
diff --git a/tests/cpp/plugin/test_sycl_hist_updater.cc b/tests/cpp/plugin/test_sycl_hist_updater.cc
index 4bf1ab30e..1ef771a0c 100644
--- a/tests/cpp/plugin/test_sycl_hist_updater.cc
+++ b/tests/cpp/plugin/test_sycl_hist_updater.cc
@@ -16,11 +16,13 @@ namespace xgboost::sycl::tree {
 template <typename GradientSumT>
 class TestHistUpdater : public HistUpdater<GradientSumT> {
  public:
-  TestHistUpdater(::sycl::queue qu,
+  TestHistUpdater(const Context* ctx,
+                  ::sycl::queue qu,
                   const xgboost::tree::TrainParam& param,
                   std::unique_ptr<TreeUpdater> pruner,
                   FeatureInteractionConstraintHost int_constraints_,
-                  DMatrix const* fmat) : HistUpdater<GradientSumT>(qu, param, std::move(pruner),
+                  DMatrix const* fmat) : HistUpdater<GradientSumT>(ctx, qu, param,
+                                                                   std::move(pruner),
                                                                    int_constraints_, fmat) {}
 
   void TestInitSampling(const USMVector<GradientPair, MemoryType::on_device> &gpair,
@@ -28,12 +30,11 @@ class TestHistUpdater : public HistUpdater<GradientSumT> {
                         USMVector<size_t, MemoryType::on_device>* row_indices) {
     HistUpdater<GradientSumT>::InitSampling(gpair, row_indices);
   }
 
-  auto* TestInitData(Context const * ctx,
-                     const common::GHistIndexMatrix& gmat,
-                     const USMVector<GradientPair, MemoryType::on_device> &gpair,
-                     const DMatrix& fmat,
-                     const RegTree& tree) {
-    HistUpdater<GradientSumT>::InitData(ctx, gmat, gpair, fmat, tree);
+  auto* TestInitData(const common::GHistIndexMatrix& gmat,
+                     const USMVector<GradientPair, MemoryType::on_device> &gpair,
+                     const DMatrix& fmat,
+                     const RegTree& tree) {
+    HistUpdater<GradientSumT>::InitData(gmat, gpair, fmat, tree);
     return &(HistUpdater<GradientSumT>::row_set_collection_);
   }
 
@@ -44,6 +45,15 @@ class TestHistUpdater : public HistUpdater<GradientSumT> {
     HistUpdater<GradientSumT>::BuildHistogramsLossGuide(entry, gmat, p_tree, gpair);
     return &(HistUpdater<GradientSumT>::hist_);
   }
+
+  auto TestInitNewNode(int nid,
+                       const common::GHistIndexMatrix& gmat,
+                       const USMVector<GradientPair, MemoryType::on_device> &gpair,
+                       const DMatrix& fmat,
+                       const RegTree& tree) {
+    HistUpdater<GradientSumT>::InitNewNode(nid, gmat, gpair, fmat, tree);
+    return HistUpdater<GradientSumT>::snode_host_[nid];
+  }
 };
 
 void GenerateRandomGPairs(::sycl::queue* qu, GradientPair* gpair_ptr, size_t num_rows, bool has_neg_hess) {
@@ -79,7 +89,7 @@ void TestHistUpdaterSampling(const xgboost::tree::TrainParam& param) {
   FeatureInteractionConstraintHost int_constraints;
   std::unique_ptr<TreeUpdater> pruner{TreeUpdater::Create("prune", &ctx, &task)};
 
-  TestHistUpdater<GradientSumT> updater(qu, param, std::move(pruner), int_constraints, p_fmat.get());
+  TestHistUpdater<GradientSumT> updater(&ctx, qu, param, std::move(pruner), int_constraints, p_fmat.get());
 
   USMVector<size_t, MemoryType::on_device> row_indices_0(&qu, num_rows);
   USMVector<size_t, MemoryType::on_device> row_indices_1(&qu, num_rows);
@@ -135,7 +145,7 @@ void TestHistUpdaterInitData(const xgboost::tree::TrainParam& param, bool has_ne
   FeatureInteractionConstraintHost int_constraints;
   std::unique_ptr<TreeUpdater> pruner{TreeUpdater::Create("prune", &ctx, &task)};
 
-  TestHistUpdater<GradientSumT> updater(qu, param, std::move(pruner), int_constraints, p_fmat.get());
+  TestHistUpdater<GradientSumT> updater(&ctx, qu, param, std::move(pruner), int_constraints, p_fmat.get());
 
   USMVector<GradientPair, MemoryType::on_device> gpair(&qu, num_rows);
   GenerateRandomGPairs(&qu, gpair.Data(), num_rows, has_neg_hess);
@@ -146,7 +156,7 @@ void TestHistUpdaterInitData(const xgboost::tree::TrainParam& param, bool has_ne
   gmat.Init(qu, &ctx, dmat, n_bins);
 
   RegTree tree;
-  auto* row_set_collection = updater.TestInitData(&ctx, gmat, gpair, *p_fmat, tree);
+  auto* row_set_collection = updater.TestInitData(gmat, gpair, *p_fmat, tree);
 
   auto& row_indices = row_set_collection->Data();
   std::vector<size_t> row_indices_host(row_indices.Size());
@@ -191,7 +201,7 @@ void TestHistUpdaterBuildHistogramsLossGuide(const xgboost::tree::TrainParam& pa
   FeatureInteractionConstraintHost int_constraints;
   std::unique_ptr<TreeUpdater> pruner{TreeUpdater::Create("prune", &ctx, &task)};
 
-  TestHistUpdater<GradientSumT> updater(qu, param, std::move(pruner), int_constraints, p_fmat.get());
+  TestHistUpdater<GradientSumT> updater(&ctx, qu, param, std::move(pruner), int_constraints, p_fmat.get());
   updater.SetHistSynchronizer(new BatchHistSynchronizer<GradientSumT>());
   updater.SetHistRowsAdder(new BatchHistRowsAdder<GradientSumT>());
 
@@ -213,7 +223,7 @@ void TestHistUpdaterBuildHistogramsLossGuide(const xgboost::tree::TrainParam& pa
   ExpandEntry node1(1, tree.GetDepth(1));
   ExpandEntry node2(2, tree.GetDepth(2));
 
-  auto* row_set_collection = updater.TestInitData(&ctx, gmat, gpair, *p_fmat, tree);
+  auto* row_set_collection = updater.TestInitData(gmat, gpair, *p_fmat, tree);
   row_set_collection->AddSplit(0, 1, 2, 42, num_rows - 42);
 
   updater.TestBuildHistogramsLossGuide(node0, gmat, &tree, gpair);
@@ -237,6 +247,66 @@ void TestHistUpdaterBuildHistogramsLossGuide(const xgboost::tree::TrainParam& pa
   }
 }
 
+template <typename GradientSumT>
+void TestHistUpdaterInitNewNode(const xgboost::tree::TrainParam& param, float sparsity) {
+  const size_t num_rows = 1u << 8;
+  const size_t num_columns = 1;
+  const size_t n_bins = 32;
+
+  Context ctx;
+  ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});
+
+  DeviceManager device_manager;
+  auto qu = device_manager.GetQueue(ctx.Device());
+  ObjInfo task{ObjInfo::kRegression};
+
+  auto p_fmat = RandomDataGenerator{num_rows, num_columns, sparsity}.GenerateDMatrix();
+
+  FeatureInteractionConstraintHost int_constraints;
+  std::unique_ptr<TreeUpdater> pruner{TreeUpdater::Create("prune", &ctx, &task)};
+
+  TestHistUpdater<GradientSumT> updater(&ctx, qu, param, std::move(pruner), int_constraints, p_fmat.get());
+  updater.SetHistSynchronizer(new BatchHistSynchronizer<GradientSumT>());
+  updater.SetHistRowsAdder(new BatchHistRowsAdder<GradientSumT>());
+
+  USMVector<GradientPair, MemoryType::on_device> gpair(&qu, num_rows);
+  auto* gpair_ptr = gpair.Data();
+  GenerateRandomGPairs(&qu, gpair_ptr, num_rows, false);
+
+  DeviceMatrix dmat;
+  dmat.Init(qu, p_fmat.get());
+  common::GHistIndexMatrix gmat;
+  gmat.Init(qu, &ctx, dmat, n_bins);
+
+  RegTree tree;
+  tree.ExpandNode(0, 0, 0, false, 0, 0, 0, 0, 0, 0, 0);
+  ExpandEntry node(ExpandEntry::kRootNid, tree.GetDepth(ExpandEntry::kRootNid));
+
+  auto* row_set_collection = updater.TestInitData(gmat, gpair, *p_fmat, tree);
+  auto& row_idxs = row_set_collection->Data();
+  const size_t* row_idxs_ptr = row_idxs.DataConst();
+  updater.TestBuildHistogramsLossGuide(node, gmat, &tree, gpair);
+  const auto snode = updater.TestInitNewNode(ExpandEntry::kRootNid, gmat, gpair, *p_fmat, tree);
+
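+  // Reference values: accumulate the gradient pairs of all rows sequentially
+  // on the device and compare them with the statistics stored in snode.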
+  GradStats<GradientSumT> grad_stat;
+  {
+    ::sycl::buffer<GradStats<GradientSumT>> buff(&grad_stat, 1);
+    qu.submit([&](::sycl::handler& cgh) {
+      auto buff_acc = buff.template get_access<::sycl::access::mode::read_write>(cgh);
+      cgh.single_task<>([=]() {
+        for (size_t i = 0; i < num_rows; ++i) {
+          size_t row_idx = row_idxs_ptr[i];
+          buff_acc[0] += GradStats<GradientSumT>(gpair_ptr[row_idx].GetGrad(),
+                                                 gpair_ptr[row_idx].GetHess());
+        }
+      });
+    }).wait_and_throw();
+  }
+
+  EXPECT_NEAR(snode.stats.GetGrad(), grad_stat.GetGrad(), 1e-6 * grad_stat.GetGrad());
+  EXPECT_NEAR(snode.stats.GetHess(), grad_stat.GetHess(), 1e-6 * grad_stat.GetHess());
+}
+
 TEST(SyclHistUpdater, Sampling) {
   xgboost::tree::TrainParam param;
   param.UpdateAllowUnknown(Args{{"subsample", "0.7"}});
@@ -266,4 +336,14 @@ TEST(SyclHistUpdater, BuildHistogramsLossGuide) {
   TestHistUpdaterBuildHistogramsLossGuide<double>(param, 0.5);
 }
 
+TEST(SyclHistUpdater, InitNewNode) {
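+  // Cover both gradient types on dense (sparsity 0.0) and sparse (0.5) data.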
+  xgboost::tree::TrainParam param;
+  param.UpdateAllowUnknown(Args{{"max_depth", "3"}});
+
+  TestHistUpdaterInitNewNode<float>(param, 0.0);
+  TestHistUpdaterInitNewNode<float>(param, 0.5);
+  TestHistUpdaterInitNewNode<double>(param, 0.0);
+  TestHistUpdaterInitNewNode<double>(param, 0.5);
+}
+
 }  // namespace xgboost::sycl::tree