[SYCL]. Implementation of HostDeviceVector (#10842)

This commit is contained in:
Dmitry Razdoburdin
2024-09-24 22:45:17 +02:00
committed by GitHub
parent bc69a3e877
commit 2179baa50c
25 changed files with 937 additions and 282 deletions

View File

@@ -48,7 +48,7 @@ class BatchHistSynchronizer: public HistSynchronizer<GradientSumT> {
this_hist, nbins, ::sycl::event());
}
}
builder->qu_.wait_and_throw();
builder->qu_->wait_and_throw();
builder->builder_monitor_.Stop("SyncHistograms");
}
@@ -84,7 +84,7 @@ class DistributedHistSynchronizer: public HistSynchronizer<GradientSumT> {
auto& sibling_hist = builder->hist_[sibling_nid];
common::SubtractionHist(builder->qu_, &sibling_hist, parent_hist,
this_hist, nbins, ::sycl::event());
builder->qu_.wait_and_throw();
builder->qu_->wait_and_throw();
// Store possible parent node
auto& sibling_local = builder->hist_local_worker_[sibling_nid];
common::CopyHist(builder->qu_, &sibling_local, sibling_hist, nbins);
@@ -113,7 +113,7 @@ class DistributedHistSynchronizer: public HistSynchronizer<GradientSumT> {
auto& sibling_hist = builder->hist_[entry.GetSiblingId(p_tree, parent_id)];
common::SubtractionHist(builder->qu_, &this_hist, parent_hist,
sibling_hist, nbins, ::sycl::event());
builder->qu_.wait_and_throw();
builder->qu_->wait_and_throw();
}
}
}

View File

@@ -31,7 +31,7 @@ void HistUpdater<GradientSumT>::ReduceHists(const std::vector<int>& sync_ids,
for (size_t i = 0; i < sync_ids.size(); i++) {
auto& this_hist = hist_[sync_ids[i]];
const GradientPairT* psrc = reinterpret_cast<const GradientPairT*>(this_hist.DataConst());
qu_.memcpy(reduce_buffer_.data() + i * nbins, psrc, nbins*sizeof(GradientPairT)).wait();
qu_->memcpy(reduce_buffer_.data() + i * nbins, psrc, nbins*sizeof(GradientPairT)).wait();
}
auto buffer_vec = linalg::MakeVec(reinterpret_cast<GradientSumT*>(reduce_buffer_.data()),
@@ -42,7 +42,7 @@ void HistUpdater<GradientSumT>::ReduceHists(const std::vector<int>& sync_ids,
for (size_t i = 0; i < sync_ids.size(); i++) {
auto& this_hist = hist_[sync_ids[i]];
GradientPairT* psrc = reinterpret_cast<GradientPairT*>(this_hist.Data());
qu_.memcpy(psrc, reduce_buffer_.data() + i * nbins, nbins*sizeof(GradientPairT)).wait();
qu_->memcpy(psrc, reduce_buffer_.data() + i * nbins, nbins*sizeof(GradientPairT)).wait();
}
}
@@ -75,7 +75,7 @@ void HistUpdater<GradientSumT>::BuildHistogramsLossGuide(
std::vector<int> sync_ids;
hist_rows_adder_->AddHistRows(this, &sync_ids, p_tree);
qu_.wait_and_throw();
qu_->wait_and_throw();
BuildLocalHistograms(gmat, p_tree, gpair_device);
hist_synchronizer_->SyncHistograms(this, sync_ids, p_tree);
}
@@ -99,7 +99,7 @@ void HistUpdater<GradientSumT>::BuildLocalHistograms(
common::InitHist(qu_, &(hist_[nid]), hist_[nid].Size(), &event);
}
}
qu_.wait_and_throw();
qu_->wait_and_throw();
builder_monitor_.Stop("BuildLocalHistograms");
}
@@ -382,9 +382,10 @@ bool HistUpdater<GradientSumT>::UpdatePredictionCache(
::sycl::event event;
if (is_first_group) {
out_preds_buf_.ResizeNoCopy(&qu_, buffer_size);
out_preds_buf_.ResizeNoCopy(qu_, buffer_size);
out_pred_ptr = &out_preds(0);
event = qu_.memcpy(out_preds_buf_.Data(), out_pred_ptr, buffer_size * sizeof(bst_float), event);
event = qu_->memcpy(out_preds_buf_.Data(), out_pred_ptr,
buffer_size * sizeof(bst_float), event);
}
auto* out_preds_buf_ptr = out_preds_buf_.Data();
@@ -406,7 +407,7 @@ bool HistUpdater<GradientSumT>::UpdatePredictionCache(
const size_t* rid = rowset.begin;
const size_t num_rows = rowset.Size();
events[node] = qu_.submit([&](::sycl::handler& cgh) {
events[node] = qu_->submit([&](::sycl::handler& cgh) {
cgh.depends_on(event);
cgh.parallel_for<>(::sycl::range<1>(num_rows), [=](::sycl::item<1> pid) {
out_preds_buf_ptr[rid[pid.get_id(0)]*stride + gid] += leaf_value;
@@ -415,10 +416,10 @@ bool HistUpdater<GradientSumT>::UpdatePredictionCache(
}
}
if (is_last_group) {
qu_.memcpy(out_pred_ptr, out_preds_buf_ptr, buffer_size * sizeof(bst_float), events);
qu_->memcpy(out_pred_ptr, out_preds_buf_ptr, buffer_size * sizeof(bst_float), events);
out_pred_ptr = nullptr;
}
qu_.wait();
qu_->wait();
builder_monitor_.Stop("UpdatePredictionCache");
return true;
@@ -447,7 +448,7 @@ void HistUpdater<GradientSumT>::InitSampling(
*/
if (has_fp64_support_) {
// Use oneDPL bernoulli_distribution for better perf
event = qu_.submit([&](::sycl::handler& cgh) {
event = qu_->submit([&](::sycl::handler& cgh) {
auto flag_buf_acc = flag_buf.get_access<::sycl::access::mode::read_write>(cgh);
cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(num_rows)),
[=](::sycl::item<1> pid) {
@@ -465,7 +466,7 @@ void HistUpdater<GradientSumT>::InitSampling(
});
} else {
// Use oneDPL uniform, as far as bernoulli_distribution uses fp64
event = qu_.submit([&](::sycl::handler& cgh) {
event = qu_->submit([&](::sycl::handler& cgh) {
auto flag_buf_acc = flag_buf.get_access<::sycl::access::mode::read_write>(cgh);
cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(num_rows)),
[=](::sycl::item<1> pid) {
@@ -485,8 +486,8 @@ void HistUpdater<GradientSumT>::InitSampling(
/* After calling a destructor for flag_buf, content will be copied to num_samples */
}
row_indices->Resize(&qu_, num_samples, 0, &event);
qu_.wait();
row_indices->Resize(qu_, num_samples, 0, &event);
qu_->wait();
}
template<typename GradientSumT>
@@ -526,7 +527,7 @@ void HistUpdater<GradientSumT>::InitData(
hist_builder_ = common::GHistBuilder<GradientSumT>(qu_, nbins);
USMVector<size_t, MemoryType::on_device>* row_indices = &(row_set_collection_.Data());
row_indices->Resize(&qu_, info.num_row_);
row_indices->Resize(qu_, info.num_row_);
size_t* p_row_indices = row_indices->Data();
// mark subsample and build list of member rows
if (param_.subsample < 1.0f) {
@@ -540,7 +541,7 @@ void HistUpdater<GradientSumT>::InitData(
::sycl::event event;
{
::sycl::buffer<int, 1> flag_buf(&has_neg_hess, 1);
event = qu_.submit([&](::sycl::handler& cgh) {
event = qu_->submit([&](::sycl::handler& cgh) {
auto flag_buf_acc = flag_buf.get_access<::sycl::access::mode::read_write>(cgh);
cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(info.num_row_)),
[=](::sycl::item<1> pid) {
@@ -558,7 +559,7 @@ void HistUpdater<GradientSumT>::InitData(
size_t max_idx = 0;
{
::sycl::buffer<size_t, 1> flag_buf(&max_idx, 1);
event = qu_.submit([&](::sycl::handler& cgh) {
event = qu_->submit([&](::sycl::handler& cgh) {
cgh.depends_on(event);
auto flag_buf_acc = flag_buf.get_access<::sycl::access::mode::read_write>(cgh);
cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(info.num_row_)),
@@ -571,9 +572,9 @@ void HistUpdater<GradientSumT>::InitData(
});
});
}
row_indices->Resize(&qu_, max_idx, 0, &event);
row_indices->Resize(qu_, max_idx, 0, &event);
}
qu_.wait_and_throw();
qu_->wait_and_throw();
}
}
row_set_collection_.Init();
@@ -661,7 +662,7 @@ void HistUpdater<GradientSumT>::ApplySplit(
std::vector<int32_t> split_conditions(n_nodes);
CommonRowPartitioner::FindSplitConditions(nodes, *p_tree, gmat, &split_conditions);
partition_builder_.Init(&qu_, n_nodes, [&](size_t node_in_set) {
partition_builder_.Init(qu_, n_nodes, [&](size_t node_in_set) {
const int32_t nid = nodes[node_in_set].nid;
return row_set_collection_[nid].Size();
});
@@ -669,14 +670,14 @@ void HistUpdater<GradientSumT>::ApplySplit(
::sycl::event event;
partition_builder_.Partition(gmat, nodes, row_set_collection_,
split_conditions, p_tree, &event);
qu_.wait_and_throw();
qu_->wait_and_throw();
for (size_t node_in_set = 0; node_in_set < n_nodes; node_in_set++) {
const int32_t nid = nodes[node_in_set].nid;
size_t* data_result = const_cast<size_t*>(row_set_collection_[nid].begin);
partition_builder_.MergeToArray(node_in_set, data_result, &event);
}
qu_.wait_and_throw();
qu_->wait_and_throw();
AddSplitsToRowSet(nodes, p_tree);
@@ -702,7 +703,7 @@ void HistUpdater<GradientSumT>::InitNewNode(int nid,
const auto* hist = reinterpret_cast<GradStats<GradientSumT>*>(hist_[nid].Data());
std::vector<GradStats<GradientSumT>> ets(iend - ibegin);
qu_.memcpy(ets.data(), hist + ibegin,
qu_->memcpy(ets.data(), hist + ibegin,
(iend - ibegin) * sizeof(GradStats<GradientSumT>)).wait_and_throw();
for (const auto& et : ets) {
grad_stat += et;
@@ -714,7 +715,7 @@ void HistUpdater<GradientSumT>::InitNewNode(int nid,
const GradientPair* gpair_ptr = gpair.DataConst();
::sycl::buffer<GradStats<GradientSumT>> buff(&grad_stat, 1);
qu_.submit([&](::sycl::handler& cgh) {
qu_->submit([&](::sycl::handler& cgh) {
auto reduction = ::sycl::reduction(buff, cgh, ::sycl::plus<>());
cgh.parallel_for<>(::sycl::range<1>(size), reduction,
[=](::sycl::item<1> pid, auto& sum) {
@@ -786,8 +787,8 @@ void HistUpdater<GradientSumT>::EvaluateSplits(
}
const size_t total_features = pos;
split_queries_device_.Resize(&qu_, total_features);
auto event = qu_.memcpy(split_queries_device_.Data(), split_queries_host_.data(),
split_queries_device_.Resize(qu_, total_features);
auto event = qu_->memcpy(split_queries_device_.Data(), split_queries_host_.data(),
total_features * sizeof(SplitQuery));
auto evaluator = tree_evaluator_.GetEvaluator();
@@ -796,18 +797,18 @@ void HistUpdater<GradientSumT>::EvaluateSplits(
const bst_float* cut_val = gmat.cut_device.Values().DataConst();
const bst_float* cut_minval = gmat.cut_device.MinValues().DataConst();
snode_device_.ResizeNoCopy(&qu_, snode_host_.size());
event = qu_.memcpy(snode_device_.Data(), snode_host_.data(),
snode_device_.ResizeNoCopy(qu_, snode_host_.size());
event = qu_->memcpy(snode_device_.Data(), snode_host_.data(),
snode_host_.size() * sizeof(NodeEntry<GradientSumT>), event);
const NodeEntry<GradientSumT>* snode = snode_device_.Data();
const float min_child_weight = param_.min_child_weight;
best_splits_device_.ResizeNoCopy(&qu_, total_features);
best_splits_device_.ResizeNoCopy(qu_, total_features);
if (best_splits_host_.size() < total_features) best_splits_host_.resize(total_features);
SplitEntry<GradientSumT>* best_splits = best_splits_device_.Data();
event = qu_.submit([&](::sycl::handler& cgh) {
event = qu_->submit([&](::sycl::handler& cgh) {
cgh.depends_on(event);
cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(total_features, sub_group_size_),
::sycl::range<2>(1, sub_group_size_)),
@@ -823,10 +824,10 @@ void HistUpdater<GradientSumT>::EvaluateSplits(
&(best_splits[i]), fid, nid, evaluator, min_child_weight);
});
});
event = qu_.memcpy(best_splits_host_.data(), best_splits,
event = qu_->memcpy(best_splits_host_.data(), best_splits,
total_features * sizeof(SplitEntry<GradientSumT>), event);
qu_.wait();
qu_->wait();
for (size_t i = 0; i < total_features; i++) {
int nid = split_queries_host_[i].nid;
snode_host_[nid].best.Update(best_splits_host_[i]);

View File

@@ -52,7 +52,7 @@ class HistUpdater {
using GradientPairT = xgboost::detail::GradientPairInternal<GradientSumT>;
explicit HistUpdater(const Context* ctx,
::sycl::queue qu,
::sycl::queue* qu,
const xgboost::tree::TrainParam& param,
FeatureInteractionConstraintHost int_constraints_,
DMatrix const* fmat)
@@ -63,11 +63,11 @@ class HistUpdater {
builder_monitor_.Init("SYCL::Quantile::HistUpdater");
kernel_monitor_.Init("SYCL::Quantile::HistUpdater");
if (param.max_depth > 0) {
snode_device_.Resize(&qu, 1u << (param.max_depth + 1));
snode_device_.Resize(qu, 1u << (param.max_depth + 1));
}
has_fp64_support_ = qu_.get_device().has(::sycl::aspect::fp64);
has_fp64_support_ = qu_->get_device().has(::sycl::aspect::fp64);
const auto sub_group_sizes =
qu_.get_device().get_info<::sycl::info::device::sub_group_sizes>();
qu_->get_device().get_info<::sycl::info::device::sub_group_sizes>();
sub_group_size_ = sub_group_sizes.back();
}
@@ -266,8 +266,7 @@ class HistUpdater {
bst_float* out_pred_ptr = nullptr;
std::vector<GradientPairT> reduce_buffer_;
::sycl::queue qu_;
::sycl::queue* qu_;
};
} // namespace tree

View File

@@ -42,11 +42,11 @@ class TreeEvaluator {
USMVector<GradType> upper_bounds_;
USMVector<int> monotone_;
TrainParam param_;
::sycl::queue qu_;
::sycl::queue* qu_;
bool has_constraint_;
public:
void Reset(::sycl::queue qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) {
void Reset(::sycl::queue* qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) {
qu_ = qu;
has_constraint_ = false;
@@ -58,13 +58,13 @@ class TreeEvaluator {
}
if (has_constraint_) {
monotone_.Resize(&qu_, n_features, 0);
qu_.memcpy(monotone_.Data(), p.monotone_constraints.data(),
monotone_.Resize(qu_, n_features, 0);
qu_->memcpy(monotone_.Data(), p.monotone_constraints.data(),
sizeof(int) * p.monotone_constraints.size());
qu_.wait();
qu_->wait();
lower_bounds_.Resize(&qu_, p.MaxNodes(), std::numeric_limits<GradType>::lowest());
upper_bounds_.Resize(&qu_, p.MaxNodes(), std::numeric_limits<GradType>::max());
lower_bounds_.Resize(qu_, p.MaxNodes(), std::numeric_limits<GradType>::lowest());
upper_bounds_.Resize(qu_, p.MaxNodes(), std::numeric_limits<GradType>::max());
}
param_ = TrainParam(p);
}
@@ -73,7 +73,7 @@ class TreeEvaluator {
return has_constraint_;
}
TreeEvaluator(::sycl::queue qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) {
TreeEvaluator(::sycl::queue* qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) {
Reset(qu, p, n_features);
}

View File

@@ -31,7 +31,7 @@ void QuantileHistMaker::Configure(const Args& args) {
param_.UpdateAllowUnknown(args);
hist_maker_param_.UpdateAllowUnknown(args);
bool has_fp64_support = qu_.get_device().has(::sycl::aspect::fp64);
bool has_fp64_support = qu_->get_device().has(::sycl::aspect::fp64);
if (hist_maker_param_.single_precision_histogram || !has_fp64_support) {
if (!hist_maker_param_.single_precision_histogram) {
LOG(WARNING) << "Target device doesn't support fp64, using single_precision_histogram=True";
@@ -68,9 +68,9 @@ void QuantileHistMaker::CallUpdate(
xgboost::common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree *> &trees) {
const auto* gpair_h = gpair->Data();
gpair_device_.Resize(&qu_, gpair_h->Size());
qu_.memcpy(gpair_device_.Data(), gpair_h->HostPointer(), gpair_h->Size() * sizeof(GradientPair));
qu_.wait();
gpair_device_.Resize(qu_, gpair_h->Size());
qu_->memcpy(gpair_device_.Data(), gpair_h->HostPointer(), gpair_h->Size() * sizeof(GradientPair));
qu_->wait();
for (auto tree : trees) {
pimpl->Update(param, gmat_, gpair_device_, dmat, out_position, tree);

View File

@@ -105,7 +105,7 @@ class QuantileHistMaker: public TreeUpdater {
FeatureInteractionConstraintHost int_constraint_;
::sycl::queue qu_;
::sycl::queue* qu_;
DeviceManager device_manager;
ObjInfo const *task_{nullptr};