[SYCL]. Implementation of HostDeviceVector (#10842)
This commit is contained in:
committed by
GitHub
parent
bc69a3e877
commit
2179baa50c
@@ -48,7 +48,7 @@ class BatchHistSynchronizer: public HistSynchronizer<GradientSumT> {
|
||||
this_hist, nbins, ::sycl::event());
|
||||
}
|
||||
}
|
||||
builder->qu_.wait_and_throw();
|
||||
builder->qu_->wait_and_throw();
|
||||
|
||||
builder->builder_monitor_.Stop("SyncHistograms");
|
||||
}
|
||||
@@ -84,7 +84,7 @@ class DistributedHistSynchronizer: public HistSynchronizer<GradientSumT> {
|
||||
auto& sibling_hist = builder->hist_[sibling_nid];
|
||||
common::SubtractionHist(builder->qu_, &sibling_hist, parent_hist,
|
||||
this_hist, nbins, ::sycl::event());
|
||||
builder->qu_.wait_and_throw();
|
||||
builder->qu_->wait_and_throw();
|
||||
// Store possible parent node
|
||||
auto& sibling_local = builder->hist_local_worker_[sibling_nid];
|
||||
common::CopyHist(builder->qu_, &sibling_local, sibling_hist, nbins);
|
||||
@@ -113,7 +113,7 @@ class DistributedHistSynchronizer: public HistSynchronizer<GradientSumT> {
|
||||
auto& sibling_hist = builder->hist_[entry.GetSiblingId(p_tree, parent_id)];
|
||||
common::SubtractionHist(builder->qu_, &this_hist, parent_hist,
|
||||
sibling_hist, nbins, ::sycl::event());
|
||||
builder->qu_.wait_and_throw();
|
||||
builder->qu_->wait_and_throw();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,7 +31,7 @@ void HistUpdater<GradientSumT>::ReduceHists(const std::vector<int>& sync_ids,
|
||||
for (size_t i = 0; i < sync_ids.size(); i++) {
|
||||
auto& this_hist = hist_[sync_ids[i]];
|
||||
const GradientPairT* psrc = reinterpret_cast<const GradientPairT*>(this_hist.DataConst());
|
||||
qu_.memcpy(reduce_buffer_.data() + i * nbins, psrc, nbins*sizeof(GradientPairT)).wait();
|
||||
qu_->memcpy(reduce_buffer_.data() + i * nbins, psrc, nbins*sizeof(GradientPairT)).wait();
|
||||
}
|
||||
|
||||
auto buffer_vec = linalg::MakeVec(reinterpret_cast<GradientSumT*>(reduce_buffer_.data()),
|
||||
@@ -42,7 +42,7 @@ void HistUpdater<GradientSumT>::ReduceHists(const std::vector<int>& sync_ids,
|
||||
for (size_t i = 0; i < sync_ids.size(); i++) {
|
||||
auto& this_hist = hist_[sync_ids[i]];
|
||||
GradientPairT* psrc = reinterpret_cast<GradientPairT*>(this_hist.Data());
|
||||
qu_.memcpy(psrc, reduce_buffer_.data() + i * nbins, nbins*sizeof(GradientPairT)).wait();
|
||||
qu_->memcpy(psrc, reduce_buffer_.data() + i * nbins, nbins*sizeof(GradientPairT)).wait();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -75,7 +75,7 @@ void HistUpdater<GradientSumT>::BuildHistogramsLossGuide(
|
||||
|
||||
std::vector<int> sync_ids;
|
||||
hist_rows_adder_->AddHistRows(this, &sync_ids, p_tree);
|
||||
qu_.wait_and_throw();
|
||||
qu_->wait_and_throw();
|
||||
BuildLocalHistograms(gmat, p_tree, gpair_device);
|
||||
hist_synchronizer_->SyncHistograms(this, sync_ids, p_tree);
|
||||
}
|
||||
@@ -99,7 +99,7 @@ void HistUpdater<GradientSumT>::BuildLocalHistograms(
|
||||
common::InitHist(qu_, &(hist_[nid]), hist_[nid].Size(), &event);
|
||||
}
|
||||
}
|
||||
qu_.wait_and_throw();
|
||||
qu_->wait_and_throw();
|
||||
builder_monitor_.Stop("BuildLocalHistograms");
|
||||
}
|
||||
|
||||
@@ -382,9 +382,10 @@ bool HistUpdater<GradientSumT>::UpdatePredictionCache(
|
||||
|
||||
::sycl::event event;
|
||||
if (is_first_group) {
|
||||
out_preds_buf_.ResizeNoCopy(&qu_, buffer_size);
|
||||
out_preds_buf_.ResizeNoCopy(qu_, buffer_size);
|
||||
out_pred_ptr = &out_preds(0);
|
||||
event = qu_.memcpy(out_preds_buf_.Data(), out_pred_ptr, buffer_size * sizeof(bst_float), event);
|
||||
event = qu_->memcpy(out_preds_buf_.Data(), out_pred_ptr,
|
||||
buffer_size * sizeof(bst_float), event);
|
||||
}
|
||||
auto* out_preds_buf_ptr = out_preds_buf_.Data();
|
||||
|
||||
@@ -406,7 +407,7 @@ bool HistUpdater<GradientSumT>::UpdatePredictionCache(
|
||||
const size_t* rid = rowset.begin;
|
||||
const size_t num_rows = rowset.Size();
|
||||
|
||||
events[node] = qu_.submit([&](::sycl::handler& cgh) {
|
||||
events[node] = qu_->submit([&](::sycl::handler& cgh) {
|
||||
cgh.depends_on(event);
|
||||
cgh.parallel_for<>(::sycl::range<1>(num_rows), [=](::sycl::item<1> pid) {
|
||||
out_preds_buf_ptr[rid[pid.get_id(0)]*stride + gid] += leaf_value;
|
||||
@@ -415,10 +416,10 @@ bool HistUpdater<GradientSumT>::UpdatePredictionCache(
|
||||
}
|
||||
}
|
||||
if (is_last_group) {
|
||||
qu_.memcpy(out_pred_ptr, out_preds_buf_ptr, buffer_size * sizeof(bst_float), events);
|
||||
qu_->memcpy(out_pred_ptr, out_preds_buf_ptr, buffer_size * sizeof(bst_float), events);
|
||||
out_pred_ptr = nullptr;
|
||||
}
|
||||
qu_.wait();
|
||||
qu_->wait();
|
||||
|
||||
builder_monitor_.Stop("UpdatePredictionCache");
|
||||
return true;
|
||||
@@ -447,7 +448,7 @@ void HistUpdater<GradientSumT>::InitSampling(
|
||||
*/
|
||||
if (has_fp64_support_) {
|
||||
// Use oneDPL bernoulli_distribution for better perf
|
||||
event = qu_.submit([&](::sycl::handler& cgh) {
|
||||
event = qu_->submit([&](::sycl::handler& cgh) {
|
||||
auto flag_buf_acc = flag_buf.get_access<::sycl::access::mode::read_write>(cgh);
|
||||
cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(num_rows)),
|
||||
[=](::sycl::item<1> pid) {
|
||||
@@ -465,7 +466,7 @@ void HistUpdater<GradientSumT>::InitSampling(
|
||||
});
|
||||
} else {
|
||||
// Use oneDPL uniform, because bernoulli_distribution uses fp64
|
||||
event = qu_.submit([&](::sycl::handler& cgh) {
|
||||
event = qu_->submit([&](::sycl::handler& cgh) {
|
||||
auto flag_buf_acc = flag_buf.get_access<::sycl::access::mode::read_write>(cgh);
|
||||
cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(num_rows)),
|
||||
[=](::sycl::item<1> pid) {
|
||||
@@ -485,8 +486,8 @@ void HistUpdater<GradientSumT>::InitSampling(
|
||||
/* After the destructor of flag_buf is called, its content will be copied to num_samples */
|
||||
}
|
||||
|
||||
row_indices->Resize(&qu_, num_samples, 0, &event);
|
||||
qu_.wait();
|
||||
row_indices->Resize(qu_, num_samples, 0, &event);
|
||||
qu_->wait();
|
||||
}
|
||||
|
||||
template<typename GradientSumT>
|
||||
@@ -526,7 +527,7 @@ void HistUpdater<GradientSumT>::InitData(
|
||||
hist_builder_ = common::GHistBuilder<GradientSumT>(qu_, nbins);
|
||||
|
||||
USMVector<size_t, MemoryType::on_device>* row_indices = &(row_set_collection_.Data());
|
||||
row_indices->Resize(&qu_, info.num_row_);
|
||||
row_indices->Resize(qu_, info.num_row_);
|
||||
size_t* p_row_indices = row_indices->Data();
|
||||
// mark subsample and build list of member rows
|
||||
if (param_.subsample < 1.0f) {
|
||||
@@ -540,7 +541,7 @@ void HistUpdater<GradientSumT>::InitData(
|
||||
::sycl::event event;
|
||||
{
|
||||
::sycl::buffer<int, 1> flag_buf(&has_neg_hess, 1);
|
||||
event = qu_.submit([&](::sycl::handler& cgh) {
|
||||
event = qu_->submit([&](::sycl::handler& cgh) {
|
||||
auto flag_buf_acc = flag_buf.get_access<::sycl::access::mode::read_write>(cgh);
|
||||
cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(info.num_row_)),
|
||||
[=](::sycl::item<1> pid) {
|
||||
@@ -558,7 +559,7 @@ void HistUpdater<GradientSumT>::InitData(
|
||||
size_t max_idx = 0;
|
||||
{
|
||||
::sycl::buffer<size_t, 1> flag_buf(&max_idx, 1);
|
||||
event = qu_.submit([&](::sycl::handler& cgh) {
|
||||
event = qu_->submit([&](::sycl::handler& cgh) {
|
||||
cgh.depends_on(event);
|
||||
auto flag_buf_acc = flag_buf.get_access<::sycl::access::mode::read_write>(cgh);
|
||||
cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(info.num_row_)),
|
||||
@@ -571,9 +572,9 @@ void HistUpdater<GradientSumT>::InitData(
|
||||
});
|
||||
});
|
||||
}
|
||||
row_indices->Resize(&qu_, max_idx, 0, &event);
|
||||
row_indices->Resize(qu_, max_idx, 0, &event);
|
||||
}
|
||||
qu_.wait_and_throw();
|
||||
qu_->wait_and_throw();
|
||||
}
|
||||
}
|
||||
row_set_collection_.Init();
|
||||
@@ -661,7 +662,7 @@ void HistUpdater<GradientSumT>::ApplySplit(
|
||||
std::vector<int32_t> split_conditions(n_nodes);
|
||||
CommonRowPartitioner::FindSplitConditions(nodes, *p_tree, gmat, &split_conditions);
|
||||
|
||||
partition_builder_.Init(&qu_, n_nodes, [&](size_t node_in_set) {
|
||||
partition_builder_.Init(qu_, n_nodes, [&](size_t node_in_set) {
|
||||
const int32_t nid = nodes[node_in_set].nid;
|
||||
return row_set_collection_[nid].Size();
|
||||
});
|
||||
@@ -669,14 +670,14 @@ void HistUpdater<GradientSumT>::ApplySplit(
|
||||
::sycl::event event;
|
||||
partition_builder_.Partition(gmat, nodes, row_set_collection_,
|
||||
split_conditions, p_tree, &event);
|
||||
qu_.wait_and_throw();
|
||||
qu_->wait_and_throw();
|
||||
|
||||
for (size_t node_in_set = 0; node_in_set < n_nodes; node_in_set++) {
|
||||
const int32_t nid = nodes[node_in_set].nid;
|
||||
size_t* data_result = const_cast<size_t*>(row_set_collection_[nid].begin);
|
||||
partition_builder_.MergeToArray(node_in_set, data_result, &event);
|
||||
}
|
||||
qu_.wait_and_throw();
|
||||
qu_->wait_and_throw();
|
||||
|
||||
AddSplitsToRowSet(nodes, p_tree);
|
||||
|
||||
@@ -702,7 +703,7 @@ void HistUpdater<GradientSumT>::InitNewNode(int nid,
|
||||
const auto* hist = reinterpret_cast<GradStats<GradientSumT>*>(hist_[nid].Data());
|
||||
|
||||
std::vector<GradStats<GradientSumT>> ets(iend - ibegin);
|
||||
qu_.memcpy(ets.data(), hist + ibegin,
|
||||
qu_->memcpy(ets.data(), hist + ibegin,
|
||||
(iend - ibegin) * sizeof(GradStats<GradientSumT>)).wait_and_throw();
|
||||
for (const auto& et : ets) {
|
||||
grad_stat += et;
|
||||
@@ -714,7 +715,7 @@ void HistUpdater<GradientSumT>::InitNewNode(int nid,
|
||||
const GradientPair* gpair_ptr = gpair.DataConst();
|
||||
|
||||
::sycl::buffer<GradStats<GradientSumT>> buff(&grad_stat, 1);
|
||||
qu_.submit([&](::sycl::handler& cgh) {
|
||||
qu_->submit([&](::sycl::handler& cgh) {
|
||||
auto reduction = ::sycl::reduction(buff, cgh, ::sycl::plus<>());
|
||||
cgh.parallel_for<>(::sycl::range<1>(size), reduction,
|
||||
[=](::sycl::item<1> pid, auto& sum) {
|
||||
@@ -786,8 +787,8 @@ void HistUpdater<GradientSumT>::EvaluateSplits(
|
||||
}
|
||||
const size_t total_features = pos;
|
||||
|
||||
split_queries_device_.Resize(&qu_, total_features);
|
||||
auto event = qu_.memcpy(split_queries_device_.Data(), split_queries_host_.data(),
|
||||
split_queries_device_.Resize(qu_, total_features);
|
||||
auto event = qu_->memcpy(split_queries_device_.Data(), split_queries_host_.data(),
|
||||
total_features * sizeof(SplitQuery));
|
||||
|
||||
auto evaluator = tree_evaluator_.GetEvaluator();
|
||||
@@ -796,18 +797,18 @@ void HistUpdater<GradientSumT>::EvaluateSplits(
|
||||
const bst_float* cut_val = gmat.cut_device.Values().DataConst();
|
||||
const bst_float* cut_minval = gmat.cut_device.MinValues().DataConst();
|
||||
|
||||
snode_device_.ResizeNoCopy(&qu_, snode_host_.size());
|
||||
event = qu_.memcpy(snode_device_.Data(), snode_host_.data(),
|
||||
snode_device_.ResizeNoCopy(qu_, snode_host_.size());
|
||||
event = qu_->memcpy(snode_device_.Data(), snode_host_.data(),
|
||||
snode_host_.size() * sizeof(NodeEntry<GradientSumT>), event);
|
||||
const NodeEntry<GradientSumT>* snode = snode_device_.Data();
|
||||
|
||||
const float min_child_weight = param_.min_child_weight;
|
||||
|
||||
best_splits_device_.ResizeNoCopy(&qu_, total_features);
|
||||
best_splits_device_.ResizeNoCopy(qu_, total_features);
|
||||
if (best_splits_host_.size() < total_features) best_splits_host_.resize(total_features);
|
||||
SplitEntry<GradientSumT>* best_splits = best_splits_device_.Data();
|
||||
|
||||
event = qu_.submit([&](::sycl::handler& cgh) {
|
||||
event = qu_->submit([&](::sycl::handler& cgh) {
|
||||
cgh.depends_on(event);
|
||||
cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(total_features, sub_group_size_),
|
||||
::sycl::range<2>(1, sub_group_size_)),
|
||||
@@ -823,10 +824,10 @@ void HistUpdater<GradientSumT>::EvaluateSplits(
|
||||
&(best_splits[i]), fid, nid, evaluator, min_child_weight);
|
||||
});
|
||||
});
|
||||
event = qu_.memcpy(best_splits_host_.data(), best_splits,
|
||||
event = qu_->memcpy(best_splits_host_.data(), best_splits,
|
||||
total_features * sizeof(SplitEntry<GradientSumT>), event);
|
||||
|
||||
qu_.wait();
|
||||
qu_->wait();
|
||||
for (size_t i = 0; i < total_features; i++) {
|
||||
int nid = split_queries_host_[i].nid;
|
||||
snode_host_[nid].best.Update(best_splits_host_[i]);
|
||||
|
||||
@@ -52,7 +52,7 @@ class HistUpdater {
|
||||
using GradientPairT = xgboost::detail::GradientPairInternal<GradientSumT>;
|
||||
|
||||
explicit HistUpdater(const Context* ctx,
|
||||
::sycl::queue qu,
|
||||
::sycl::queue* qu,
|
||||
const xgboost::tree::TrainParam& param,
|
||||
FeatureInteractionConstraintHost int_constraints_,
|
||||
DMatrix const* fmat)
|
||||
@@ -63,11 +63,11 @@ class HistUpdater {
|
||||
builder_monitor_.Init("SYCL::Quantile::HistUpdater");
|
||||
kernel_monitor_.Init("SYCL::Quantile::HistUpdater");
|
||||
if (param.max_depth > 0) {
|
||||
snode_device_.Resize(&qu, 1u << (param.max_depth + 1));
|
||||
snode_device_.Resize(qu, 1u << (param.max_depth + 1));
|
||||
}
|
||||
has_fp64_support_ = qu_.get_device().has(::sycl::aspect::fp64);
|
||||
has_fp64_support_ = qu_->get_device().has(::sycl::aspect::fp64);
|
||||
const auto sub_group_sizes =
|
||||
qu_.get_device().get_info<::sycl::info::device::sub_group_sizes>();
|
||||
qu_->get_device().get_info<::sycl::info::device::sub_group_sizes>();
|
||||
sub_group_size_ = sub_group_sizes.back();
|
||||
}
|
||||
|
||||
@@ -266,8 +266,7 @@ class HistUpdater {
|
||||
bst_float* out_pred_ptr = nullptr;
|
||||
|
||||
std::vector<GradientPairT> reduce_buffer_;
|
||||
|
||||
::sycl::queue qu_;
|
||||
::sycl::queue* qu_;
|
||||
};
|
||||
|
||||
} // namespace tree
|
||||
|
||||
@@ -42,11 +42,11 @@ class TreeEvaluator {
|
||||
USMVector<GradType> upper_bounds_;
|
||||
USMVector<int> monotone_;
|
||||
TrainParam param_;
|
||||
::sycl::queue qu_;
|
||||
::sycl::queue* qu_;
|
||||
bool has_constraint_;
|
||||
|
||||
public:
|
||||
void Reset(::sycl::queue qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) {
|
||||
void Reset(::sycl::queue* qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) {
|
||||
qu_ = qu;
|
||||
|
||||
has_constraint_ = false;
|
||||
@@ -58,13 +58,13 @@ class TreeEvaluator {
|
||||
}
|
||||
|
||||
if (has_constraint_) {
|
||||
monotone_.Resize(&qu_, n_features, 0);
|
||||
qu_.memcpy(monotone_.Data(), p.monotone_constraints.data(),
|
||||
monotone_.Resize(qu_, n_features, 0);
|
||||
qu_->memcpy(monotone_.Data(), p.monotone_constraints.data(),
|
||||
sizeof(int) * p.monotone_constraints.size());
|
||||
qu_.wait();
|
||||
qu_->wait();
|
||||
|
||||
lower_bounds_.Resize(&qu_, p.MaxNodes(), std::numeric_limits<GradType>::lowest());
|
||||
upper_bounds_.Resize(&qu_, p.MaxNodes(), std::numeric_limits<GradType>::max());
|
||||
lower_bounds_.Resize(qu_, p.MaxNodes(), std::numeric_limits<GradType>::lowest());
|
||||
upper_bounds_.Resize(qu_, p.MaxNodes(), std::numeric_limits<GradType>::max());
|
||||
}
|
||||
param_ = TrainParam(p);
|
||||
}
|
||||
@@ -73,7 +73,7 @@ class TreeEvaluator {
|
||||
return has_constraint_;
|
||||
}
|
||||
|
||||
TreeEvaluator(::sycl::queue qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) {
|
||||
TreeEvaluator(::sycl::queue* qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) {
|
||||
Reset(qu, p, n_features);
|
||||
}
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ void QuantileHistMaker::Configure(const Args& args) {
|
||||
param_.UpdateAllowUnknown(args);
|
||||
hist_maker_param_.UpdateAllowUnknown(args);
|
||||
|
||||
bool has_fp64_support = qu_.get_device().has(::sycl::aspect::fp64);
|
||||
bool has_fp64_support = qu_->get_device().has(::sycl::aspect::fp64);
|
||||
if (hist_maker_param_.single_precision_histogram || !has_fp64_support) {
|
||||
if (!hist_maker_param_.single_precision_histogram) {
|
||||
LOG(WARNING) << "Target device doesn't support fp64, using single_precision_histogram=True";
|
||||
@@ -68,9 +68,9 @@ void QuantileHistMaker::CallUpdate(
|
||||
xgboost::common::Span<HostDeviceVector<bst_node_t>> out_position,
|
||||
const std::vector<RegTree *> &trees) {
|
||||
const auto* gpair_h = gpair->Data();
|
||||
gpair_device_.Resize(&qu_, gpair_h->Size());
|
||||
qu_.memcpy(gpair_device_.Data(), gpair_h->HostPointer(), gpair_h->Size() * sizeof(GradientPair));
|
||||
qu_.wait();
|
||||
gpair_device_.Resize(qu_, gpair_h->Size());
|
||||
qu_->memcpy(gpair_device_.Data(), gpair_h->HostPointer(), gpair_h->Size() * sizeof(GradientPair));
|
||||
qu_->wait();
|
||||
|
||||
for (auto tree : trees) {
|
||||
pimpl->Update(param, gmat_, gpair_device_, dmat, out_position, tree);
|
||||
|
||||
@@ -105,7 +105,7 @@ class QuantileHistMaker: public TreeUpdater {
|
||||
|
||||
FeatureInteractionConstraintHost int_constraint_;
|
||||
|
||||
::sycl::queue qu_;
|
||||
::sycl::queue* qu_;
|
||||
DeviceManager device_manager;
|
||||
ObjInfo const *task_{nullptr};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user