[SYC]. Implementation of HostDeviceVector (#10842)

commit 2179baa50c
parent bc69a3e877
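The commit does two things: every SYCL helper that previously took a ::sycl::queue by value now takes a ::sycl::queue* owned by a per-device registry inside DeviceManager, and a SYCL-backed implementation of HostDeviceVector is added as a new file. A minimal sketch of the ownership pattern in isolation (the names below are illustrative, not the XGBoost API):

#include <sycl/sycl.hpp>

#include <cstddef>
#include <vector>

// Queues are created once per visible device and handed out as raw pointers,
// so callers never copy ::sycl::queue objects and pointer identity is stable.
class QueueRegistry {
 public:
  QueueRegistry() {
    // Filled once and never grown afterwards, so pointers returned by Get()
    // stay valid for the registry's lifetime.
    for (const auto& dev : ::sycl::device::get_devices()) {
      queues_.emplace_back(dev);
    }
  }
  ::sycl::queue* Get(std::size_t idx) { return &queues_.at(idx); }

 private:
  std::vector<::sycl::queue> queues_;
};

// Kernel launches then go through the pointer, mirroring InitHist() in the
// first hunk below; usm_data must be USM memory reachable from qu's device.
void FillZeros(::sycl::queue* qu, float* usm_data, std::size_t n) {
  qu->fill(usm_data, 0.0f, n).wait();
}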
@@ -19,15 +19,15 @@ namespace common {
  * \brief Fill histogram with zeroes
  */
 template<typename GradientSumT>
-void InitHist(::sycl::queue qu, GHistRow<GradientSumT, MemoryType::on_device>* hist,
+void InitHist(::sycl::queue* qu, GHistRow<GradientSumT, MemoryType::on_device>* hist,
               size_t size, ::sycl::event* event) {
-  *event = qu.fill(hist->Begin(),
+  *event = qu->fill(hist->Begin(),
                    xgboost::detail::GradientPairInternal<GradientSumT>(), size, *event);
 }
-template void InitHist(::sycl::queue qu,
+template void InitHist(::sycl::queue* qu,
                        GHistRow<float, MemoryType::on_device>* hist,
                        size_t size, ::sycl::event* event);
-template void InitHist(::sycl::queue qu,
+template void InitHist(::sycl::queue* qu,
                        GHistRow<double, MemoryType::on_device>* hist,
                        size_t size, ::sycl::event* event);

@@ -35,25 +35,25 @@ template void InitHist(::sycl::queue qu,
  * \brief Copy histogram from src to dst
  */
 template<typename GradientSumT>
-void CopyHist(::sycl::queue qu,
+void CopyHist(::sycl::queue* qu,
               GHistRow<GradientSumT, MemoryType::on_device>* dst,
               const GHistRow<GradientSumT, MemoryType::on_device>& src,
               size_t size) {
   GradientSumT* pdst = reinterpret_cast<GradientSumT*>(dst->Data());
   const GradientSumT* psrc = reinterpret_cast<const GradientSumT*>(src.DataConst());

-  qu.submit([&](::sycl::handler& cgh) {
+  qu->submit([&](::sycl::handler& cgh) {
     cgh.parallel_for<>(::sycl::range<1>(2 * size), [=](::sycl::item<1> pid) {
       const size_t i = pid.get_id(0);
       pdst[i] = psrc[i];
     });
   }).wait();
 }
-template void CopyHist(::sycl::queue qu,
+template void CopyHist(::sycl::queue* qu,
                        GHistRow<float, MemoryType::on_device>* dst,
                        const GHistRow<float, MemoryType::on_device>& src,
                        size_t size);
-template void CopyHist(::sycl::queue qu,
+template void CopyHist(::sycl::queue* qu,
                        GHistRow<double, MemoryType::on_device>* dst,
                        const GHistRow<double, MemoryType::on_device>& src,
                        size_t size);

@@ -62,7 +62,7 @@ template void CopyHist(::sycl::queue qu,
  * \brief Compute Subtraction: dst = src1 - src2
  */
 template<typename GradientSumT>
-::sycl::event SubtractionHist(::sycl::queue qu,
+::sycl::event SubtractionHist(::sycl::queue* qu,
                               GHistRow<GradientSumT, MemoryType::on_device>* dst,
                               const GHistRow<GradientSumT, MemoryType::on_device>& src1,
                               const GHistRow<GradientSumT, MemoryType::on_device>& src2,

@@ -71,7 +71,7 @@ template<typename GradientSumT>
   const GradientSumT* psrc1 = reinterpret_cast<const GradientSumT*>(src1.DataConst());
   const GradientSumT* psrc2 = reinterpret_cast<const GradientSumT*>(src2.DataConst());

-  auto event_final = qu.submit([&](::sycl::handler& cgh) {
+  auto event_final = qu->submit([&](::sycl::handler& cgh) {
     cgh.depends_on(event_priv);
     cgh.parallel_for<>(::sycl::range<1>(2 * size), [pdst, psrc1, psrc2](::sycl::item<1> pid) {
       const size_t i = pid.get_id(0);

@@ -80,25 +80,25 @@ template<typename GradientSumT>
   });
   return event_final;
 }
-template ::sycl::event SubtractionHist(::sycl::queue qu,
+template ::sycl::event SubtractionHist(::sycl::queue* qu,
                                        GHistRow<float, MemoryType::on_device>* dst,
                                        const GHistRow<float, MemoryType::on_device>& src1,
                                        const GHistRow<float, MemoryType::on_device>& src2,
                                        size_t size, ::sycl::event event_priv);
-template ::sycl::event SubtractionHist(::sycl::queue qu,
+template ::sycl::event SubtractionHist(::sycl::queue* qu,
                                        GHistRow<double, MemoryType::on_device>* dst,
                                        const GHistRow<double, MemoryType::on_device>& src1,
                                        const GHistRow<double, MemoryType::on_device>& src2,
                                        size_t size, ::sycl::event event_priv);

-inline auto GetBlocksParameters(const ::sycl::queue& qu, size_t size, size_t max_nblocks) {
+inline auto GetBlocksParameters(::sycl::queue* qu, size_t size, size_t max_nblocks) {
   struct _ {
     size_t block_size, nblocks;
   };

   const size_t min_block_size = 32;
   const size_t max_compute_units =
-    qu.get_device().get_info<::sycl::info::device::max_compute_units>();
+    qu->get_device().get_info<::sycl::info::device::max_compute_units>();

   size_t nblocks = max_compute_units;

@@ -117,7 +117,7 @@ inline auto GetBlocksParameters(const ::sycl::queue& qu, size_t size, size_t max

 // Kernel with buffer using
 template<typename FPType, typename BinIdxType, bool isDense>
-::sycl::event BuildHistKernel(::sycl::queue qu,
+::sycl::event BuildHistKernel(::sycl::queue* qu,
                               const USMVector<GradientPair, MemoryType::on_device>& gpair_device,
                               const RowSetCollection::Elem& row_indices,
                               const GHistIndexMatrix& gmat,

@@ -134,7 +134,7 @@ template<typename FPType, typename BinIdxType, bool isDense>
   const size_t nbins = gmat.nbins;

   const size_t max_work_group_size =
-    qu.get_device().get_info<::sycl::info::device::max_work_group_size>();
+    qu->get_device().get_info<::sycl::info::device::max_work_group_size>();
   const size_t work_group_size = n_columns < max_work_group_size ? n_columns : max_work_group_size;

   // Captured structured bindings are a C++20 extension

@@ -143,8 +143,9 @@ template<typename FPType, typename BinIdxType, bool isDense>
   const size_t nblocks = block_params.nblocks;

   GradientPairT* hist_buffer_data = hist_buffer->Data();
-  auto event_fill = qu.fill(hist_buffer_data, GradientPairT(0, 0), nblocks * nbins * 2, event_priv);
-  auto event_main = qu.submit([&](::sycl::handler& cgh) {
+  auto event_fill = qu->fill(hist_buffer_data, GradientPairT(0, 0),
+                             nblocks * nbins * 2, event_priv);
+  auto event_main = qu->submit([&](::sycl::handler& cgh) {
     cgh.depends_on(event_fill);
     cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(nblocks, work_group_size),
                                            ::sycl::range<2>(1, work_group_size)),

@@ -178,7 +179,7 @@ template<typename FPType, typename BinIdxType, bool isDense>
   });

   GradientPairT* hist_data = hist->Data();
-  auto event_save = qu.submit([&](::sycl::handler& cgh) {
+  auto event_save = qu->submit([&](::sycl::handler& cgh) {
     cgh.depends_on(event_main);
     cgh.parallel_for<>(::sycl::range<1>(nbins), [=](::sycl::item<1> pid) {
       size_t idx_bin = pid.get_id(0);

@@ -197,7 +198,7 @@ template<typename FPType, typename BinIdxType, bool isDense>

 // Kernel with atomic using
 template<typename FPType, typename BinIdxType, bool isDense>
-::sycl::event BuildHistKernel(::sycl::queue qu,
+::sycl::event BuildHistKernel(::sycl::queue* qu,
                               const USMVector<GradientPair, MemoryType::on_device>& gpair_device,
                               const RowSetCollection::Elem& row_indices,
                               const GHistIndexMatrix& gmat,

@@ -216,8 +217,8 @@ template<typename FPType, typename BinIdxType, bool isDense>
   constexpr size_t work_group_size = 32;
   const size_t n_work_groups = n_columns / work_group_size + (n_columns % work_group_size > 0);

-  auto event_fill = qu.fill(hist_data, FPType(0), nbins * 2, event_priv);
-  auto event_main = qu.submit([&](::sycl::handler& cgh) {
+  auto event_fill = qu->fill(hist_data, FPType(0), nbins * 2, event_priv);
+  auto event_main = qu->submit([&](::sycl::handler& cgh) {
     cgh.depends_on(event_fill);
     cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(size, n_work_groups * work_group_size),
                                            ::sycl::range<2>(1, work_group_size)),

@@ -252,7 +253,7 @@ template<typename FPType, typename BinIdxType, bool isDense>

 template<typename FPType, typename BinIdxType>
 ::sycl::event BuildHistDispatchKernel(
-    ::sycl::queue qu,
+    ::sycl::queue* qu,
     const USMVector<GradientPair, MemoryType::on_device>& gpair_device,
     const RowSetCollection::Elem& row_indices,
     const GHistIndexMatrix& gmat,

@@ -292,7 +293,7 @@ template<typename FPType, typename BinIdxType>
 }

 template<typename FPType>
-::sycl::event BuildHistKernel(::sycl::queue qu,
+::sycl::event BuildHistKernel(::sycl::queue* qu,
                               const USMVector<GradientPair, MemoryType::on_device>& gpair_device,
                               const RowSetCollection::Elem& row_indices,
                               const GHistIndexMatrix& gmat, const bool isDense,
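A note on the two BuildHistKernel overloads changed above: the buffered variant zero-fills an nblocks * nbins * 2 scratch area, lets each block accumulate a private partial histogram, and merges the partials into hist in the follow-up event_save kernel, while the atomic variant fills hist directly and, as its name suggests (the body is only partially visible in these hunks), relies on atomic updates. BuildHistDispatchKernel selects between them; both are otherwise unchanged apart from receiving the queue as a pointer.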
@@ -32,7 +32,7 @@ class ColumnMatrix;
  * \brief Fill histogram with zeroes
  */
 template<typename GradientSumT>
-void InitHist(::sycl::queue qu,
+void InitHist(::sycl::queue* qu,
               GHistRow<GradientSumT, MemoryType::on_device>* hist,
               size_t size, ::sycl::event* event);

@@ -40,7 +40,7 @@ void InitHist(::sycl::queue qu,
  * \brief Copy histogram from src to dst
  */
 template<typename GradientSumT>
-void CopyHist(::sycl::queue qu,
+void CopyHist(::sycl::queue* qu,
               GHistRow<GradientSumT, MemoryType::on_device>* dst,
               const GHistRow<GradientSumT, MemoryType::on_device>& src,
               size_t size);

@@ -49,7 +49,7 @@ void CopyHist(::sycl::queue qu,
  * \brief Compute subtraction: dst = src1 - src2
  */
 template<typename GradientSumT>
-::sycl::event SubtractionHist(::sycl::queue qu,
+::sycl::event SubtractionHist(::sycl::queue* qu,
                               GHistRow<GradientSumT, MemoryType::on_device>* dst,
                               const GHistRow<GradientSumT, MemoryType::on_device>& src1,
                               const GHistRow<GradientSumT, MemoryType::on_device>& src2,

@@ -73,7 +73,7 @@ class HistCollection {
   }

   // Initialize histogram collection
-  void Init(::sycl::queue qu, uint32_t nbins) {
+  void Init(::sycl::queue* qu, uint32_t nbins) {
     qu_ = qu;
     if (nbins_ != nbins) {
       nbins_ = nbins;

@@ -86,11 +86,11 @@ class HistCollection {
     ::sycl::event event;
     if (data_.count(nid) == 0) {
       data_[nid] =
-        std::make_shared<GHistRowT>(&qu_, nbins_,
+        std::make_shared<GHistRowT>(qu_, nbins_,
                                     xgboost::detail::GradientPairInternal<GradientSumT>(0, 0),
                                     &event);
     } else {
-      data_[nid]->Resize(&qu_, nbins_,
+      data_[nid]->Resize(qu_, nbins_,
                          xgboost::detail::GradientPairInternal<GradientSumT>(0, 0),
                          &event);
     }

@@ -103,7 +103,7 @@ class HistCollection {

   std::unordered_map<uint32_t, std::shared_ptr<GHistRowT>> data_;

-  ::sycl::queue qu_;
+  ::sycl::queue* qu_;
 };

 /*!

@@ -114,7 +114,7 @@ class ParallelGHistBuilder {
  public:
   using GHistRowT = GHistRow<GradientSumT, MemoryType::on_device>;

-  void Init(::sycl::queue qu, size_t nbins) {
+  void Init(::sycl::queue* qu, size_t nbins) {
     qu_ = qu;
     if (nbins != nbins_) {
       hist_buffer_.Init(qu_, nbins);

@@ -123,7 +123,7 @@ class ParallelGHistBuilder {
   }

   void Reset(size_t nblocks) {
-    hist_device_buffer_.Resize(&qu_, nblocks * nbins_ * 2);
+    hist_device_buffer_.Resize(qu_, nblocks * nbins_ * 2);
   }

   GHistRowT& GetDeviceBuffer() {

@@ -139,7 +139,7 @@ class ParallelGHistBuilder {
   /*! \brief Buffer for additional histograms for Parallel processing */
   GHistRowT hist_device_buffer_;

-  ::sycl::queue qu_;
+  ::sycl::queue* qu_;
 };

 /*!

@@ -152,7 +152,7 @@ class GHistBuilder {
   using GHistRowT = GHistRow<GradientSumT, memory_type>;

   GHistBuilder() = default;
-  GHistBuilder(::sycl::queue qu, uint32_t nbins) : qu_{qu}, nbins_{nbins} {}
+  GHistBuilder(::sycl::queue* qu, uint32_t nbins) : qu_{qu}, nbins_{nbins} {}

   // Construct a histogram via histogram aggregation
   ::sycl::event BuildHist(const USMVector<GradientPair, MemoryType::on_device>& gpair_device,

@@ -177,7 +177,7 @@ class GHistBuilder {
   /*! \brief Number of all bins over all features */
   uint32_t nbins_ { 0 };

-  ::sycl::queue qu_;
+  ::sycl::queue* qu_;
 };
 }  // namespace common
 }  // namespace sycl
plugin/sycl/common/host_device_vector.cc (new file, 410 lines)
@@ -0,0 +1,410 @@
+/**
+ * Copyright 2017-2024 by XGBoost contributors
+ */
+
+#ifdef XGBOOST_USE_SYCL
+
+// implementation of HostDeviceVector with sycl support
+
+#include <memory>
+#include <utility>
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-W#pragma-messages"
+#include "xgboost/host_device_vector.h"
+#pragma GCC diagnostic pop
+
+#include "../device_manager.h"
+#include "../data.h"
+
+namespace xgboost {
+template <typename T>
+class HostDeviceVectorImpl {
+  using DeviceStorage = sycl::USMVector<T, sycl::MemoryType::on_device>;
+
+ public:
+  explicit HostDeviceVectorImpl(size_t size, T v, DeviceOrd device) : device_(device) {
+    if (device.IsSycl()) {
+      device_access_ = GPUAccess::kWrite;
+      SetDevice();
+      data_d_->Resize(qu_, size, v);
+    } else {
+      data_h_.resize(size, v);
+    }
+  }
+
+  template <class Initializer>
+  HostDeviceVectorImpl(const Initializer& init, DeviceOrd device) : device_(device) {
+    if (device.IsSycl()) {
+      device_access_ = GPUAccess::kWrite;
+
+      ResizeDevice(init.size());
+      Copy(init);
+    } else {
+      data_h_ = init;
+    }
+  }
+
+  HostDeviceVectorImpl(HostDeviceVectorImpl<T>&& that) : device_{that.device_},
+                                                         data_h_{std::move(that.data_h_)},
+                                                         data_d_{std::move(that.data_d_)},
+                                                         device_access_{that.device_access_} {}
+
+  std::vector<T>& HostVector() {
+    SyncHost(GPUAccess::kNone);
+    return data_h_;
+  }
+
+  const std::vector<T>& ConstHostVector() {
+    SyncHost(GPUAccess::kRead);
+    return data_h_;
+  }
+
+  void SetDevice(DeviceOrd device) {
+    if (device_ == device) { return; }
+    if (device_.IsSycl()) {
+      SyncHost(GPUAccess::kNone);
+    }
+
+    if (device_.IsSycl() && device.IsSycl()) {
+      CHECK_EQ(device_, device)
+          << "New device is different from previous one.";
+    }
+    device_ = device;
+    if (device_.IsSycl()) {
+      ResizeDevice(data_h_.size());
+    }
+  }
+
+  template <typename... U>
+  void Resize(size_t new_size, U&&... args) {
+    if (new_size == Size()) {
+      return;
+    }
+    if ((Size() == 0 && device_.IsSycl()) || (DeviceCanWrite() && device_.IsSycl())) {
+      // fast on-device resize
+      device_access_ = GPUAccess::kWrite;
+      SetDevice();
+      auto old_size = data_d_->Size();
+      data_d_->Resize(qu_, new_size, std::forward<U>(args)...);
+    } else {
+      // resize on host
+      SyncHost(GPUAccess::kNone);
+      auto old_size = data_h_.size();
+      data_h_.resize(new_size, std::forward<U>(args)...);
+    }
+  }
+
+  void SyncHost(GPUAccess access) {
+    if (HostCanAccess(access)) { return; }
+    if (HostCanRead()) {
+      // data is present, just need to deny access to the device
+      device_access_ = access;
+      return;
+    }
+    device_access_ = access;
+    if (data_h_.size() != data_d_->Size()) { data_h_.resize(data_d_->Size()); }
+    SetDevice();
+    qu_->memcpy(data_h_.data(), data_d_->Data(), data_d_->Size() * sizeof(T)).wait();
+  }
+
+  void SyncDevice(GPUAccess access) {
+    if (DeviceCanAccess(access)) { return; }
+    if (DeviceCanRead()) {
+      device_access_ = access;
+      return;
+    }
+    // data is on the host
+    ResizeDevice(data_h_.size());
+    SetDevice();
+    qu_->memcpy(data_d_->Data(), data_h_.data(), data_d_->Size() * sizeof(T)).wait();
+    device_access_ = access;
+  }
+
+  bool HostCanAccess(GPUAccess access) const { return device_access_ <= access; }
+  bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); }
+  bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); }
+  bool DeviceCanAccess(GPUAccess access) const { return device_access_ >= access; }
+  bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
+  bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
+  GPUAccess Access() const { return device_access_; }
+
+  size_t Size() const {
+    return HostCanRead() ? data_h_.size() : data_d_ ? data_d_->Size() : 0;
+  }
+
+  DeviceOrd Device() const { return device_; }
+
+  T* DevicePointer() {
+    SyncDevice(GPUAccess::kWrite);
+    return data_d_->Data();
+  }
+
+  const T* ConstDevicePointer() {
+    SyncDevice(GPUAccess::kRead);
+    return data_d_->DataConst();
+  }
+
+  common::Span<T> DeviceSpan() {
+    SyncDevice(GPUAccess::kWrite);
+    return {this->DevicePointer(), Size()};
+  }
+
+  common::Span<const T> ConstDeviceSpan() {
+    SyncDevice(GPUAccess::kRead);
+    return {this->ConstDevicePointer(), Size()};
+  }
+
+  void Fill(T v) {
+    if (HostCanWrite()) {
+      std::fill(data_h_.begin(), data_h_.end(), v);
+    } else {
+      device_access_ = GPUAccess::kWrite;
+      SetDevice();
+      qu_->fill(data_d_->Data(), v, data_d_->Size()).wait();
+    }
+  }
+
+  void Copy(HostDeviceVectorImpl<T>* other) {
+    CHECK_EQ(Size(), other->Size());
+    SetDevice(other->device_);
+    // Data is on host.
+    if (HostCanWrite() && other->HostCanWrite()) {
+      std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin());
+      return;
+    }
+    SetDevice();
+    CopyToDevice(other);
+  }
+
+  void Copy(const std::vector<T>& other) {
+    CHECK_EQ(Size(), other.size());
+    if (HostCanWrite()) {
+      std::copy(other.begin(), other.end(), data_h_.begin());
+    } else {
+      CopyToDevice(other.data());
+    }
+  }
+
+  void Copy(std::initializer_list<T> other) {
+    CHECK_EQ(Size(), other.size());
+    if (HostCanWrite()) {
+      std::copy(other.begin(), other.end(), data_h_.begin());
+    } else {
+      CopyToDevice(other.begin());
+    }
+  }
+
+  void Extend(HostDeviceVectorImpl* other) {
+    auto ori_size = this->Size();
+    this->Resize(ori_size + other->Size(), T{});
+    if (HostCanWrite() && other->HostCanRead()) {
+      auto& h_vec = this->HostVector();
+      auto& other_vec = other->HostVector();
+      CHECK_EQ(h_vec.size(), ori_size + other->Size());
+      std::copy(other_vec.cbegin(), other_vec.cend(), h_vec.begin() + ori_size);
+    } else {
+      auto ptr = other->ConstDevicePointer();
+      SetDevice();
+      CHECK_EQ(this->Device(), other->Device());
+      qu_->memcpy(this->DevicePointer() + ori_size, ptr, other->Size() * sizeof(T)).wait();
+    }
+  }
+
+ private:
+  void ResizeDevice(size_t new_size) {
+    if (data_d_ && new_size == data_d_->Size()) { return; }
+    SetDevice();
+    data_d_->Resize(qu_, new_size);
+  }
+
+  void SetDevice() {
+    if (!qu_) {
+      qu_ = device_manager_.GetQueue(device_);
+    }
+    if (!data_d_) {
+      data_d_.reset(new DeviceStorage());
+    }
+  }
+
+  void CopyToDevice(HostDeviceVectorImpl* other) {
+    if (other->HostCanWrite()) {
+      CopyToDevice(other->data_h_.data());
+    } else {
+      ResizeDevice(Size());
+      device_access_ = GPUAccess::kWrite;
+      SetDevice();
+      qu_->memcpy(data_d_->Data(), other->data_d_->Data(), data_d_->Size() * sizeof(T)).wait();
+    }
+  }
+
+  void CopyToDevice(const T* begin) {
+    data_d_->ResizeNoCopy(qu_, Size());
+    qu_->memcpy(data_d_->Data(), begin, data_d_->Size() * sizeof(T)).wait();
+    device_access_ = GPUAccess::kWrite;
+  }
+
+  sycl::DeviceManager device_manager_;
+  ::sycl::queue* qu_ = nullptr;
+  DeviceOrd device_{DeviceOrd::CPU()};
+  std::vector<T> data_h_{};
+  std::unique_ptr<DeviceStorage> data_d_{};
+  GPUAccess device_access_{GPUAccess::kNone};
+};
+
+template <typename T>
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd device)
+    : impl_(nullptr) {
+  impl_ = new HostDeviceVectorImpl<T>(size, v, device);
+}
+
+template <typename T>
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd device)
+    : impl_(nullptr) {
+  impl_ = new HostDeviceVectorImpl<T>(init, device);
+}
+
+template <typename T>
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd device)
+    : impl_(nullptr) {
+  impl_ = new HostDeviceVectorImpl<T>(init, device);
+}
+
+template <typename T>
+HostDeviceVector<T>::HostDeviceVector(HostDeviceVector<T>&& that) {
+  impl_ = new HostDeviceVectorImpl<T>(std::move(*that.impl_));
+}
+
+template <typename T>
+HostDeviceVector<T>& HostDeviceVector<T>::operator=(HostDeviceVector<T>&& that) {
+  if (this == &that) { return *this; }
+
+  std::unique_ptr<HostDeviceVectorImpl<T>> new_impl(
+      new HostDeviceVectorImpl<T>(std::move(*that.impl_)));
+  delete impl_;
+  impl_ = new_impl.release();
+  return *this;
+}
+
+template <typename T>
+HostDeviceVector<T>::~HostDeviceVector() {
+  delete impl_;
+  impl_ = nullptr;
+}
+
+template <typename T>
+size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }
+
+template <typename T>
+DeviceOrd HostDeviceVector<T>::Device() const {
+  return impl_->Device();
+}
+
+template <typename T>
+T* HostDeviceVector<T>::DevicePointer() {
+  return impl_->DevicePointer();
+}
+
+template <typename T>
+const T* HostDeviceVector<T>::ConstDevicePointer() const {
+  return impl_->ConstDevicePointer();
+}
+
+template <typename T>
+common::Span<T> HostDeviceVector<T>::DeviceSpan() {
+  return impl_->DeviceSpan();
+}
+
+template <typename T>
+common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan() const {
+  return impl_->ConstDeviceSpan();
+}
+
+template <typename T>
+std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->HostVector(); }
+
+template <typename T>
+const std::vector<T>& HostDeviceVector<T>::ConstHostVector() const {
+  return impl_->ConstHostVector();
+}
+
+template <typename T>
+void HostDeviceVector<T>::Resize(size_t new_size, T v) {
+  impl_->Resize(new_size, v);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Resize(size_t new_size) {
+  impl_->Resize(new_size);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Fill(T v) {
+  impl_->Fill(v);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Copy(const HostDeviceVector<T>& other) {
+  impl_->Copy(other.impl_);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Copy(const std::vector<T>& other) {
+  impl_->Copy(other);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Copy(std::initializer_list<T> other) {
+  impl_->Copy(other);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Extend(HostDeviceVector const& other) {
+  impl_->Extend(other.impl_);
+}
+
+template <typename T>
+bool HostDeviceVector<T>::HostCanRead() const {
+  return impl_->HostCanRead();
+}
+
+template <typename T>
+bool HostDeviceVector<T>::HostCanWrite() const {
+  return impl_->HostCanWrite();
+}
+
+template <typename T>
+bool HostDeviceVector<T>::DeviceCanRead() const {
+  return impl_->DeviceCanRead();
+}
+
+template <typename T>
+bool HostDeviceVector<T>::DeviceCanWrite() const {
+  return impl_->DeviceCanWrite();
+}
+
+template <typename T>
+GPUAccess HostDeviceVector<T>::DeviceAccess() const {
+  return impl_->Access();
+}
+
+template <typename T>
+void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
+  impl_->SetDevice(device);
+}
+
+// explicit instantiations are required, as HostDeviceVector isn't header-only
+template class HostDeviceVector<bst_float>;
+template class HostDeviceVector<double>;
+template class HostDeviceVector<GradientPair>;
+template class HostDeviceVector<GradientPairPrecise>;
+template class HostDeviceVector<int32_t>;   // bst_node_t
+template class HostDeviceVector<uint8_t>;
+template class HostDeviceVector<int8_t>;
+template class HostDeviceVector<FeatureType>;
+template class HostDeviceVector<Entry>;
+template class HostDeviceVector<bst_idx_t>;
+template class HostDeviceVector<uint32_t>;  // bst_feature_t
+
+}  // namespace xgboost
+
+#endif  // XGBOOST_USE_SYCL
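The new implementation above arbitrates host/device ownership with a single GPUAccess value: kNone means the host owns the data, kRead lets both sides read, and kWrite hands ownership to the device, which is why HostCanAccess() compares with <= while DeviceCanAccess() compares with >=, and why every accessor funnels through SyncHost()/SyncDevice(). A minimal usage sketch, assuming a build with XGBOOST_USE_SYCL, a visible SYCL device, and the DeviceOrd::SyclDefault() factory used elsewhere in the plugin (the sketch itself is not part of the commit):

#include "xgboost/host_device_vector.h"

void Sketch() {
  // Device-resident from the start because a SYCL ordinal is passed.
  xgboost::HostDeviceVector<float> vec(16, 0.0f, xgboost::DeviceOrd::SyclDefault());
  vec.Fill(1.0f);                              // device path: qu_->fill(...).wait()
  auto& h = vec.HostVector();                  // SyncHost: device-to-host memcpy; host owns
  h[0] = 2.0f;
  const float* d = vec.ConstDevicePointer();   // SyncDevice: copies the host edit back
  (void)d;
}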
@@ -37,14 +37,14 @@ enum class MemoryType { shared, on_device};
 template <typename T>
 class USMDeleter {
  public:
-  explicit USMDeleter(::sycl::queue qu) : qu_(qu) {}
+  explicit USMDeleter(::sycl::queue* qu) : qu_(qu) {}

   void operator()(T* data) const {
-    ::sycl::free(data, qu_);
+    ::sycl::free(data, *qu_);
   }

  private:
-  ::sycl::queue qu_;
+  ::sycl::queue* qu_;
 };

 template <typename T, MemoryType memory_type = MemoryType::shared>

@@ -53,9 +53,9 @@ class USMVector {

   std::shared_ptr<T> allocate_memory_(::sycl::queue* qu, size_t size) {
     if constexpr (memory_type == MemoryType::shared) {
-      return std::shared_ptr<T>(::sycl::malloc_shared<T>(size_, *qu), USMDeleter<T>(*qu));
+      return std::shared_ptr<T>(::sycl::malloc_shared<T>(size_, *qu), USMDeleter<T>(qu));
     } else {
-      return std::shared_ptr<T>(::sycl::malloc_device<T>(size_, *qu), USMDeleter<T>(*qu));
+      return std::shared_ptr<T>(::sycl::malloc_device<T>(size_, *qu), USMDeleter<T>(qu));
     }
   }

@@ -227,14 +227,14 @@ class USMVector {
 /* Wrapper for DMatrix which stores all batches in a single USM buffer */
 struct DeviceMatrix {
   DMatrix* p_mat;  // Pointer to the original matrix on the host
-  ::sycl::queue qu_;
+  ::sycl::queue* qu_;
   USMVector<size_t, MemoryType::on_device> row_ptr;
   USMVector<Entry, MemoryType::on_device> data;
   size_t total_offset;

   DeviceMatrix() = default;

-  void Init(::sycl::queue qu, DMatrix* dmat) {
+  void Init(::sycl::queue* qu, DMatrix* dmat) {
     qu_ = qu;
     p_mat = dmat;

@@ -247,9 +247,9 @@ struct DeviceMatrix {
       num_row += batch.Size();
     }

-    row_ptr.Resize(&qu_, num_row + 1);
+    row_ptr.Resize(qu_, num_row + 1);
     size_t* rows = row_ptr.Data();
-    data.Resize(&qu_, num_nonzero);
+    data.Resize(qu_, num_nonzero);

     size_t data_offset = 0;
     ::sycl::event event;

@@ -259,10 +259,10 @@ struct DeviceMatrix {
       size_t batch_size = batch.Size();
       if (batch_size > 0) {
         const auto base_rowid = batch.base_rowid;
-        event = qu.memcpy(row_ptr.Data() + base_rowid, offset_vec.data(),
+        event = qu->memcpy(row_ptr.Data() + base_rowid, offset_vec.data(),
                           sizeof(size_t) * batch_size, event);
         if (base_rowid > 0) {
-          qu.submit([&](::sycl::handler& cgh) {
+          qu->submit([&](::sycl::handler& cgh) {
             cgh.depends_on(event);
             cgh.parallel_for<>(::sycl::range<1>(batch_size), [=](::sycl::id<1> pid) {
               int row_id = pid[0];

@@ -270,19 +270,19 @@ struct DeviceMatrix {
             });
           });
         }
-        event = qu.memcpy(data.Data() + data_offset, data_vec.data(),
+        event = qu->memcpy(data.Data() + data_offset, data_vec.data(),
                           sizeof(Entry) * offset_vec[batch_size], event);
         data_offset += offset_vec[batch_size];
-        qu.wait();
+        qu->wait();
       }
     }
-    qu.submit([&](::sycl::handler& cgh) {
+    qu_->submit([&](::sycl::handler& cgh) {
       cgh.depends_on(event);
       cgh.single_task<>([=] {
         rows[num_row] = data_offset;
       });
     });
-    qu.wait();
+    qu_->wait();
     total_offset = data_offset;
   }
@@ -49,7 +49,7 @@ void mergeSort(BinIdxType* begin, BinIdxType* end, BinIdxType* buf) {
 }

 template <typename BinIdxType>
-void GHistIndexMatrix::SetIndexData(::sycl::queue qu,
+void GHistIndexMatrix::SetIndexData(::sycl::queue* qu,
                                     BinIdxType* index_data,
                                     const DeviceMatrix &dmat,
                                     size_t nbins,

@@ -66,11 +66,11 @@ void GHistIndexMatrix::SetIndexData(::sycl::queue qu,
   // Sparse case only
   if (!offsets) {
     // sort_buff has type uint8_t
-    sort_buff.Resize(&qu, num_rows * row_stride * sizeof(BinIdxType));
+    sort_buff.Resize(qu, num_rows * row_stride * sizeof(BinIdxType));
   }
   BinIdxType* sort_data = reinterpret_cast<BinIdxType*>(sort_buff.Data());

-  auto event = qu.submit([&](::sycl::handler& cgh) {
+  auto event = qu->submit([&](::sycl::handler& cgh) {
     cgh.parallel_for<>(::sycl::range<1>(num_rows), [=](::sycl::item<1> pid) {
       const size_t i = pid.get_id(0);
       const size_t ibegin = offset_vec[i];

@@ -92,8 +92,8 @@ void GHistIndexMatrix::SetIndexData(::sycl::queue qu,
       }
     });
   });
-  qu.memcpy(hit_count.data(), hit_count_ptr, nbins * sizeof(size_t), event);
-  qu.wait();
+  qu->memcpy(hit_count.data(), hit_count_ptr, nbins * sizeof(size_t), event);
+  qu->wait();
 }

 void GHistIndexMatrix::ResizeIndex(size_t n_index, bool isDense) {

@@ -110,7 +110,7 @@ void GHistIndexMatrix::ResizeIndex(size_t n_index, bool isDense) {
   }
 }

-void GHistIndexMatrix::Init(::sycl::queue qu,
+void GHistIndexMatrix::Init(::sycl::queue* qu,
                             Context const * ctx,
                             const DeviceMatrix& p_fmat_device,
                             int max_bins) {

@@ -123,7 +123,7 @@ void GHistIndexMatrix::Init(::sycl::queue qu,
   const uint32_t nbins = cut.Ptrs().back();
   this->nbins = nbins;
   hit_count.resize(nbins, 0);
-  hit_count_buff.Resize(&qu, nbins, 0);
+  hit_count_buff.Resize(qu, nbins, 0);

   this->p_fmat = p_fmat_device.p_mat;
   const bool isDense = p_fmat_device.p_mat->IsDense();

@@ -150,7 +150,7 @@ void GHistIndexMatrix::Init(::sycl::queue qu,
   if (isDense) {
     index.ResizeOffset(n_offsets);
     offsets = index.Offset();
-    qu.memcpy(offsets, cut_device.Ptrs().DataConst(),
+    qu->memcpy(offsets, cut_device.Ptrs().DataConst(),
               sizeof(uint32_t) * n_offsets).wait_and_throw();
   }
@@ -26,16 +26,16 @@ class HistogramCuts {
  public:
   HistogramCuts() {}

-  explicit HistogramCuts(::sycl::queue qu) {}
+  explicit HistogramCuts(::sycl::queue* qu) {}

   ~HistogramCuts() {
   }

-  void Init(::sycl::queue qu, xgboost::common::HistogramCuts const& cuts) {
+  void Init(::sycl::queue* qu, xgboost::common::HistogramCuts const& cuts) {
     qu_ = qu;
-    cut_values_.Init(&qu_, cuts.cut_values_.HostVector());
-    cut_ptrs_.Init(&qu_, cuts.cut_ptrs_.HostVector());
-    min_vals_.Init(&qu_, cuts.min_vals_.HostVector());
+    cut_values_.Init(qu_, cuts.cut_values_.HostVector());
+    cut_ptrs_.Init(qu_, cuts.cut_ptrs_.HostVector());
+    min_vals_.Init(qu_, cuts.min_vals_.HostVector());
   }

   // Getters for USM buffers to pass pointers into device kernels

@@ -47,7 +47,7 @@ class HistogramCuts {
   USMVector<bst_float> cut_values_;
   USMVector<uint32_t> cut_ptrs_;
   USMVector<float> min_vals_;
-  ::sycl::queue qu_;
+  ::sycl::queue* qu_;
 };

 using BinTypeSize = ::xgboost::common::BinTypeSize;

@@ -115,11 +115,11 @@ struct Index {
   }

   void Resize(const size_t nBytesData) {
-    data_.Resize(&qu_, nBytesData);
+    data_.Resize(qu_, nBytesData);
   }

   void ResizeOffset(const size_t nDisps) {
-    offset_.Resize(&qu_, nDisps);
+    offset_.Resize(qu_, nDisps);
     p_ = nDisps;
   }

@@ -131,7 +131,7 @@ struct Index {
     return data_.End();
   }

-  void setQueue(::sycl::queue qu) {
+  void setQueue(::sycl::queue* qu) {
     qu_ = qu;
   }

@@ -155,7 +155,7 @@ struct Index {
   size_t p_ {1};
   Func func_;

-  ::sycl::queue qu_;
+  ::sycl::queue* qu_;
 };

 /*!

@@ -182,11 +182,11 @@ struct GHistIndexMatrix {
   size_t row_stride;

   // Create a global histogram matrix based on a given DMatrix device wrapper
-  void Init(::sycl::queue qu, Context const * ctx,
+  void Init(::sycl::queue* qu, Context const * ctx,
             const sycl::DeviceMatrix& p_fmat_device, int max_num_bins);

   template <typename BinIdxType>
-  void SetIndexData(::sycl::queue qu, BinIdxType* index_data,
+  void SetIndexData(::sycl::queue* qu, BinIdxType* index_data,
                     const sycl::DeviceMatrix &dmat_device,
                     size_t nbins, size_t row_stride, uint32_t* offsets);
@@ -9,85 +9,50 @@
 namespace xgboost {
 namespace sycl {

-::sycl::device DeviceManager::GetDevice(const DeviceOrd& device_spec) const {
+::sycl::queue* DeviceManager::GetQueue(const DeviceOrd& device_spec) const {
   if (!device_spec.IsSycl()) {
     LOG(WARNING) << "Sycl kernel is executed with non-sycl context: "
                  << device_spec.Name() << ". "
                  << "Default sycl device_selector will be used.";
   }

+  size_t queue_idx;
   bool not_use_default_selector = (device_spec.ordinal != kDefaultOrdinal) ||
                                   (collective::IsDistributed());
+  DeviceRegister& device_register = GetDevicesRegister();
   if (not_use_default_selector) {
-    DeviceRegister& device_register = GetDevicesRegister();
-    const int device_idx =
-        collective::IsDistributed() ? collective::GetRank() : device_spec.ordinal;
-    if (device_spec.IsSyclDefault()) {
-      auto& devices = device_register.devices;
-      CHECK_LT(device_idx, devices.size());
-      return devices[device_idx];
-    } else if (device_spec.IsSyclCPU()) {
-      auto& cpu_devices = device_register.cpu_devices;
-      CHECK_LT(device_idx, cpu_devices.size());
-      return cpu_devices[device_idx];
-    } else {
-      auto& gpu_devices = device_register.gpu_devices;
-      CHECK_LT(device_idx, gpu_devices.size());
-      return gpu_devices[device_idx];
-    }
-  } else {
-    if (device_spec.IsSyclCPU()) {
-      return ::sycl::device(::sycl::cpu_selector_v);
-    } else if (device_spec.IsSyclGPU()) {
-      return ::sycl::device(::sycl::gpu_selector_v);
-    } else {
-      return ::sycl::device(::sycl::default_selector_v);
-    }
-  }
-}
-
-::sycl::queue DeviceManager::GetQueue(const DeviceOrd& device_spec) const {
-  if (!device_spec.IsSycl()) {
-    LOG(WARNING) << "Sycl kernel is executed with non-sycl context: "
-                 << device_spec.Name() << ". "
-                 << "Default sycl device_selector will be used.";
-  }
-
-  QueueRegister_t& queue_register = GetQueueRegister();
-  if (queue_register.count(device_spec.Name()) > 0) {
-    return queue_register.at(device_spec.Name());
-  }
-
-  bool not_use_default_selector = (device_spec.ordinal != kDefaultOrdinal) ||
-                                  (collective::IsDistributed());
-  std::lock_guard<std::mutex> guard(queue_registering_mutex);
-  if (not_use_default_selector) {
-    DeviceRegister& device_register = GetDevicesRegister();
     const int device_idx =
         collective::IsDistributed() ? collective::GetRank() : device_spec.ordinal;
     if (device_spec.IsSyclDefault()) {
       auto& devices = device_register.devices;
       CHECK_LT(device_idx, devices.size());
-      queue_register[device_spec.Name()] = ::sycl::queue(devices[device_idx]);
+      queue_idx = device_idx;
     } else if (device_spec.IsSyclCPU()) {
-      auto& cpu_devices = device_register.cpu_devices;
-      CHECK_LT(device_idx, cpu_devices.size());
-      queue_register[device_spec.Name()] = ::sycl::queue(cpu_devices[device_idx]);
+      auto& cpu_devices_idxes = device_register.cpu_devices_idxes;
+      CHECK_LT(device_idx, cpu_devices_idxes.size());
+      queue_idx = cpu_devices_idxes[device_idx];
     } else if (device_spec.IsSyclGPU()) {
-      auto& gpu_devices = device_register.gpu_devices;
-      CHECK_LT(device_idx, gpu_devices.size());
-      queue_register[device_spec.Name()] = ::sycl::queue(gpu_devices[device_idx]);
+      auto& gpu_devices_idxes = device_register.gpu_devices_idxes;
+      CHECK_LT(device_idx, gpu_devices_idxes.size());
+      queue_idx = gpu_devices_idxes[device_idx];
+    } else {
+      LOG(WARNING) << device_spec << " is not sycl, sycl:cpu or sycl:gpu";
+      auto device = ::sycl::queue(::sycl::default_selector_v).get_device();
+      queue_idx = device_register.devices.at(device);
     }
   } else {
     if (device_spec.IsSyclCPU()) {
-      queue_register[device_spec.Name()] = ::sycl::queue(::sycl::cpu_selector_v);
+      auto device = ::sycl::queue(::sycl::cpu_selector_v).get_device();
+      queue_idx = device_register.devices.at(device);
     } else if (device_spec.IsSyclGPU()) {
-      queue_register[device_spec.Name()] = ::sycl::queue(::sycl::gpu_selector_v);
+      auto device = ::sycl::queue(::sycl::gpu_selector_v).get_device();
+      queue_idx = device_register.devices.at(device);
     } else {
-      queue_register[device_spec.Name()] = ::sycl::queue(::sycl::default_selector_v);
+      auto device = ::sycl::queue(::sycl::default_selector_v).get_device();
+      queue_idx = device_register.devices.at(device);
     }
   }
-  return queue_register.at(device_spec.Name());
+  return &(device_register.queues[queue_idx]);
 }

 DeviceManager::DeviceRegister& DeviceManager::GetDevicesRegister() const {

@@ -102,21 +67,17 @@ DeviceManager::DeviceRegister& DeviceManager::GetDevicesRegister() const {
     }

     for (size_t i = 0; i < devices.size(); i++) {
-      device_register.devices.push_back(devices[i]);
+      device_register.devices[devices[i]] = i;
+      device_register.queues.push_back(::sycl::queue(devices[i]));
       if (devices[i].is_cpu()) {
-        device_register.cpu_devices.push_back(devices[i]);
+        device_register.cpu_devices_idxes.push_back(i);
      } else if (devices[i].is_gpu()) {
-        device_register.gpu_devices.push_back(devices[i]);
+        device_register.gpu_devices_idxes.push_back(i);
      }
    }
  }
  return device_register;
 }

-DeviceManager::QueueRegister_t& DeviceManager::GetQueueRegister() const {
-  static QueueRegister_t queue_register;
-  return queue_register;
-}
-
 }  // namespace sycl
 }  // namespace xgboost
|||||||
@ -23,25 +23,20 @@ namespace sycl {
|
|||||||
|
|
||||||
class DeviceManager {
|
class DeviceManager {
|
||||||
public:
|
public:
|
||||||
::sycl::queue GetQueue(const DeviceOrd& device_spec) const;
|
::sycl::queue* GetQueue(const DeviceOrd& device_spec) const;
|
||||||
|
|
||||||
::sycl::device GetDevice(const DeviceOrd& device_spec) const;
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
using QueueRegister_t = std::unordered_map<std::string, ::sycl::queue>;
|
|
||||||
constexpr static int kDefaultOrdinal = -1;
|
constexpr static int kDefaultOrdinal = -1;
|
||||||
|
|
||||||
struct DeviceRegister {
|
struct DeviceRegister {
|
||||||
std::vector<::sycl::device> devices;
|
std::vector<::sycl::queue> queues;
|
||||||
std::vector<::sycl::device> cpu_devices;
|
std::unordered_map<::sycl::device, size_t> devices;
|
||||||
std::vector<::sycl::device> gpu_devices;
|
std::vector<size_t> cpu_devices_idxes;
|
||||||
|
std::vector<size_t> gpu_devices_idxes;
|
||||||
};
|
};
|
||||||
|
|
||||||
QueueRegister_t& GetQueueRegister() const;
|
|
||||||
|
|
||||||
DeviceRegister& GetDevicesRegister() const;
|
DeviceRegister& GetDevicesRegister() const;
|
||||||
|
|
||||||
mutable std::mutex queue_registering_mutex;
|
|
||||||
mutable std::mutex device_registering_mutex;
|
mutable std::mutex device_registering_mutex;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
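The header change above is the heart of the refactor: DeviceRegister now owns one long-lived ::sycl::queue per discovered device, devices becomes a device-to-index map (legal as an unordered_map key because SYCL 2020 guarantees a std::hash specialization for ::sycl::device), and GetQueue() hands out stable pointers into queues instead of building ::sycl::queue objects on demand. The string-keyed QueueRegister_t cache and its queue_registering_mutex become redundant and are dropped.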
@ -39,7 +39,7 @@ class SoftmaxMultiClassObj : public ObjFunction {

   void InitBuffers(const std::vector<int>& sample_rate) const {
     if (!are_buffs_init) {
-      batch_processor_.InitBuffers(&qu_, sample_rate);
+      batch_processor_.InitBuffers(qu_, sample_rate);
       are_buffs_init = true;
     }
   }
@ -88,7 +88,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
                             const bst_float* weights) {
     const size_t wg_size = 32;
     const size_t nwgs = ndata / wg_size + (ndata % wg_size > 0);
-    return linalg::GroupWiseKernel(&qu_, &flag, events, {nwgs, wg_size},
+    return linalg::GroupWiseKernel(qu_, &flag, events, {nwgs, wg_size},
       [=] (size_t idx, auto flag) {
         const bst_float* pred = preds + idx * nclass;

@ -133,7 +133,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
                    *(info.labels.Data()),
                    info.weights_);
     }
-    qu_.wait_and_throw();
+    qu_->wait_and_throw();

     if (flag == 0) {
       LOG(FATAL) << "SYCL::SoftmaxMultiClassObj: label must be in [0, num_class).";
@ -160,7 +160,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
     ::sycl::buffer<bst_float, 1> io_preds_buf(io_preds->HostPointer(), io_preds->Size());

     if (prob) {
-      qu_.submit([&](::sycl::handler& cgh) {
+      qu_->submit([&](::sycl::handler& cgh) {
         auto io_preds_acc = io_preds_buf.get_access<::sycl::access::mode::read_write>(cgh);
         cgh.parallel_for<>(::sycl::range<1>(ndata), [=](::sycl::id<1> pid) {
           int idx = pid[0];
@ -171,7 +171,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
     } else {
       ::sycl::buffer<bst_float, 1> max_preds_buf(max_preds_.HostPointer(), max_preds_.Size());

-      qu_.submit([&](::sycl::handler& cgh) {
+      qu_->submit([&](::sycl::handler& cgh) {
         auto io_preds_acc = io_preds_buf.get_access<::sycl::access::mode::read>(cgh);
         auto max_preds_acc = max_preds_buf.get_access<::sycl::access::mode::read_write>(cgh);
         cgh.parallel_for<>(::sycl::range<1>(ndata), [=](::sycl::id<1> pid) {
@ -215,7 +215,7 @@ class SoftmaxMultiClassObj : public ObjFunction {

   sycl::DeviceManager device_manager;

-  mutable ::sycl::queue qu_;
+  mutable ::sycl::queue* qu_;
   static constexpr size_t kBatchSize = 1u << 22;
   mutable linalg::BatchProcessingHelper<GradientPair, bst_float, kBatchSize, 3> batch_processor_;
 };
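The {nwgs, wg_size} pair passed to GroupWiseKernel is a work-group decomposition: nwgs groups of wg_size work-items, with nwgs computed as a ceiling division so every element is covered. GroupWiseKernel is XGBoost's own wrapper; as a rough sketch, the launch it describes corresponds to the following raw SYCL, where the tail guard handles ndata not being a multiple of wg_size. The wrapper's flag and event plumbing is omitted.

    const size_t wg_size = 32;
    const size_t nwgs = ndata / wg_size + (ndata % wg_size > 0);  // ceil(ndata / wg_size)
    qu->submit([&](::sycl::handler& cgh) {
      cgh.parallel_for(::sycl::nd_range<1>(nwgs * wg_size, wg_size),
                       [=](::sycl::nd_item<1> it) {
        const size_t idx = it.get_global_id(0);
        if (idx < ndata) {
          // per-element work, e.g. the gradient computation from the hunk above
        }
      });
    });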
@ -48,7 +48,7 @@ class RegLossObj : public ObjFunction {

   void InitBuffers() const {
     if (!are_buffs_init) {
-      batch_processor_.InitBuffers(&qu_, {1, 1, 1, 1});
+      batch_processor_.InitBuffers(qu_, {1, 1, 1, 1});
       are_buffs_init = true;
     }
   }
@ -58,13 +58,16 @@ class RegLossObj : public ObjFunction {

   void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
     param_.UpdateAllowUnknown(args);
-    qu_ = device_manager.GetQueue(ctx_->Device());
   }

   void GetGradient(const HostDeviceVector<bst_float>& preds,
                    const MetaInfo &info,
                    int iter,
                    xgboost::linalg::Matrix<GradientPair>* out_gpair) override {
+    if (qu_ == nullptr) {
+      LOG(WARNING) << ctx_->Device();
+      qu_ = device_manager.GetQueue(ctx_->Device());
+    }
     if (info.labels.Size() == 0) return;
     CHECK_EQ(preds.Size(), info.labels.Size())
       << " " << "labels are not correctly provided"
@ -97,7 +100,7 @@ class RegLossObj : public ObjFunction {
                             const bst_float* weights) {
     const size_t wg_size = 32;
     const size_t nwgs = ndata / wg_size + (ndata % wg_size > 0);
-    return linalg::GroupWiseKernel(&qu_, &flag, events, {nwgs, wg_size},
+    return linalg::GroupWiseKernel(qu_, &flag, events, {nwgs, wg_size},
       [=] (size_t idx, auto flag) {
         const bst_float pred = Loss::PredTransform(preds[idx]);
         bst_float weight = is_null_weight ? 1.0f : weights[idx/n_targets];
@ -129,7 +132,7 @@ class RegLossObj : public ObjFunction {
                    *(info.labels.Data()),
                    info.weights_);
     }
-    qu_.wait_and_throw();
+    qu_->wait_and_throw();

     if (flag == 0) {
       LOG(FATAL) << Loss::LabelErrorMsg();
@ -142,6 +145,10 @@ class RegLossObj : public ObjFunction {
   }

   void PredTransform(HostDeviceVector<bst_float> *io_preds) const override {
+    if (qu_ == nullptr) {
+      LOG(WARNING) << ctx_->Device();
+      qu_ = device_manager.GetQueue(ctx_->Device());
+    }
     size_t const ndata = io_preds->Size();
     if (ndata == 0) return;
     InitBuffers();
@ -149,7 +156,7 @@ class RegLossObj : public ObjFunction {
     batch_processor_.Calculate([=] (const std::vector<::sycl::event>& events,
                                     size_t ndata,
                                     bst_float* io_preds) {
-      return qu_.submit([&](::sycl::handler& cgh) {
+      return qu_->submit([&](::sycl::handler& cgh) {
         cgh.depends_on(events);
         cgh.parallel_for<>(::sycl::range<1>(ndata), [=](::sycl::id<1> pid) {
           int idx = pid[0];
@ -157,7 +164,7 @@ class RegLossObj : public ObjFunction {
         });
       });
     }, io_preds);
-    qu_.wait_and_throw();
+    qu_->wait_and_throw();
   }

   float ProbToMargin(float base_score) const override {
@ -187,7 +194,7 @@ class RegLossObj : public ObjFunction {
   xgboost::obj::RegLossParam param_;
   sycl::DeviceManager device_manager;

-  mutable ::sycl::queue qu_;
+  mutable ::sycl::queue* qu_ = nullptr;
   static constexpr size_t kBatchSize = 1u << 22;
   mutable linalg::BatchProcessingHelper<GradientPair, bst_float, kBatchSize, 3> batch_processor_;
 };
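Note the pattern introduced here: Configure no longer resolves the queue eagerly; instead qu_ starts as nullptr and both GetGradient and PredTransform bind it to the device on first use. Since the same guard appears twice, one natural refactoring (a sketch only; the helper name LazyInitQueue is not in the patch) would be:

    // Hypothetical shared helper mirroring the guard added in both methods.
    void LazyInitQueue() const {
      if (qu_ == nullptr) {
        LOG(WARNING) << ctx_->Device();                 // device resolved lazily
        qu_ = device_manager.GetQueue(ctx_->Device());
      }
    }

Keeping the guard in one place would also make it harder for a future entry point to forget the nullptr check before dereferencing qu_.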
@ -277,7 +277,7 @@ class Predictor : public xgboost::Predictor {
   void PredictBatch(DMatrix *dmat, PredictionCacheEntry *predts,
                     const gbm::GBTreeModel &model, uint32_t tree_begin,
                     uint32_t tree_end = 0) const override {
-    ::sycl::queue qu = device_manager.GetQueue(ctx_->Device());
+    ::sycl::queue* qu = device_manager.GetQueue(ctx_->Device());
     // TODO(razdoburdin): remove temporary workaround after cache fix
     sycl::DeviceMatrix device_matrix;
     device_matrix.Init(qu, dmat);
@ -290,9 +290,9 @@ class Predictor : public xgboost::Predictor {
     if (tree_begin < tree_end) {
       const bool any_missing = !(dmat->IsDense());
       if (any_missing) {
-        DevicePredictInternal<true>(&qu, device_matrix, out_preds, model, tree_begin, tree_end);
+        DevicePredictInternal<true>(qu, device_matrix, out_preds, model, tree_begin, tree_end);
       } else {
-        DevicePredictInternal<false>(&qu, device_matrix, out_preds, model, tree_begin, tree_end);
+        DevicePredictInternal<false>(qu, device_matrix, out_preds, model, tree_begin, tree_end);
       }
     }
   }
@ -48,7 +48,7 @@ class BatchHistSynchronizer: public HistSynchronizer<GradientSumT> {
                                 this_hist, nbins, ::sycl::event());
       }
     }
-    builder->qu_.wait_and_throw();
+    builder->qu_->wait_and_throw();

     builder->builder_monitor_.Stop("SyncHistograms");
   }
@ -84,7 +84,7 @@ class DistributedHistSynchronizer: public HistSynchronizer<GradientSumT> {
         auto& sibling_hist = builder->hist_[sibling_nid];
         common::SubtractionHist(builder->qu_, &sibling_hist, parent_hist,
                                 this_hist, nbins, ::sycl::event());
-        builder->qu_.wait_and_throw();
+        builder->qu_->wait_and_throw();
         // Store posible parent node
         auto& sibling_local = builder->hist_local_worker_[sibling_nid];
         common::CopyHist(builder->qu_, &sibling_local, sibling_hist, nbins);
@ -113,7 +113,7 @@ class DistributedHistSynchronizer: public HistSynchronizer<GradientSumT> {
         auto& sibling_hist = builder->hist_[entry.GetSiblingId(p_tree, parent_id)];
         common::SubtractionHist(builder->qu_, &this_hist, parent_hist,
                                 sibling_hist, nbins, ::sycl::event());
-        builder->qu_.wait_and_throw();
+        builder->qu_->wait_and_throw();
       }
     }
   }
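Both synchronizers lean on the standard histogram subtraction trick: once the parent's histogram and one child's histogram are known, the sibling's follows by element-wise subtraction instead of another pass over the data. A scalar sketch of what SubtractionHist computes; the real kernel does this on the device, and the pair type exposes its sums through getters rather than the plain fields used here:

    // sibling = parent - this, bin by bin; `grad`/`hess` are simplified
    // stand-ins for the gradient-pair accessors.
    for (size_t bin = 0; bin < nbins; ++bin) {
      sibling_hist[bin].grad = parent_hist[bin].grad - this_hist[bin].grad;
      sibling_hist[bin].hess = parent_hist[bin].hess - this_hist[bin].hess;
    }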
@ -31,7 +31,7 @@ void HistUpdater<GradientSumT>::ReduceHists(const std::vector<int>& sync_ids,
   for (size_t i = 0; i < sync_ids.size(); i++) {
     auto& this_hist = hist_[sync_ids[i]];
     const GradientPairT* psrc = reinterpret_cast<const GradientPairT*>(this_hist.DataConst());
-    qu_.memcpy(reduce_buffer_.data() + i * nbins, psrc, nbins*sizeof(GradientPairT)).wait();
+    qu_->memcpy(reduce_buffer_.data() + i * nbins, psrc, nbins*sizeof(GradientPairT)).wait();
   }

   auto buffer_vec = linalg::MakeVec(reinterpret_cast<GradientSumT*>(reduce_buffer_.data()),
@ -42,7 +42,7 @@ void HistUpdater<GradientSumT>::ReduceHists(const std::vector<int>& sync_ids,
   for (size_t i = 0; i < sync_ids.size(); i++) {
     auto& this_hist = hist_[sync_ids[i]];
     GradientPairT* psrc = reinterpret_cast<GradientPairT*>(this_hist.Data());
-    qu_.memcpy(psrc, reduce_buffer_.data() + i * nbins, nbins*sizeof(GradientPairT)).wait();
+    qu_->memcpy(psrc, reduce_buffer_.data() + i * nbins, nbins*sizeof(GradientPairT)).wait();
   }
 }

@ -75,7 +75,7 @@ void HistUpdater<GradientSumT>::BuildHistogramsLossGuide(

   std::vector<int> sync_ids;
   hist_rows_adder_->AddHistRows(this, &sync_ids, p_tree);
-  qu_.wait_and_throw();
+  qu_->wait_and_throw();
   BuildLocalHistograms(gmat, p_tree, gpair_device);
   hist_synchronizer_->SyncHistograms(this, sync_ids, p_tree);
 }
@ -99,7 +99,7 @@ void HistUpdater<GradientSumT>::BuildLocalHistograms(
       common::InitHist(qu_, &(hist_[nid]), hist_[nid].Size(), &event);
     }
   }
-  qu_.wait_and_throw();
+  qu_->wait_and_throw();
   builder_monitor_.Stop("BuildLocalHistograms");
 }
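ReduceHists stages each node's device histogram into the host-side reduce_buffer_, runs the cross-worker reduction over that buffer, and copies the reduced bins back to the device. A condensed sketch of the round-trip; AllReduceSum stands in for the collective that the patch actually routes through linalg::MakeVec and XGBoost's communicator:

    // Device -> host staging, collective sum, host -> device scatter.
    for (size_t i = 0; i < sync_ids.size(); ++i) {
      const auto* psrc = hist_[sync_ids[i]].DataConst();
      qu_->memcpy(reduce_buffer_.data() + i * nbins, psrc,
                  nbins * sizeof(GradientPairT)).wait();
    }
    AllReduceSum(reduce_buffer_.data(), sync_ids.size() * nbins);  // stand-in
    for (size_t i = 0; i < sync_ids.size(); ++i) {
      auto* pdst = hist_[sync_ids[i]].Data();
      qu_->memcpy(pdst, reduce_buffer_.data() + i * nbins,
                  nbins * sizeof(GradientPairT)).wait();
    }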
@ -382,9 +382,10 @@ bool HistUpdater<GradientSumT>::UpdatePredictionCache(

   ::sycl::event event;
   if (is_first_group) {
-    out_preds_buf_.ResizeNoCopy(&qu_, buffer_size);
+    out_preds_buf_.ResizeNoCopy(qu_, buffer_size);
     out_pred_ptr = &out_preds(0);
-    event = qu_.memcpy(out_preds_buf_.Data(), out_pred_ptr, buffer_size * sizeof(bst_float), event);
+    event = qu_->memcpy(out_preds_buf_.Data(), out_pred_ptr,
+                        buffer_size * sizeof(bst_float), event);
   }
   auto* out_preds_buf_ptr = out_preds_buf_.Data();

@ -406,7 +407,7 @@ bool HistUpdater<GradientSumT>::UpdatePredictionCache(
     const size_t* rid = rowset.begin;
     const size_t num_rows = rowset.Size();

-    events[node] = qu_.submit([&](::sycl::handler& cgh) {
+    events[node] = qu_->submit([&](::sycl::handler& cgh) {
       cgh.depends_on(event);
       cgh.parallel_for<>(::sycl::range<1>(num_rows), [=](::sycl::item<1> pid) {
         out_preds_buf_ptr[rid[pid.get_id(0)]*stride + gid] += leaf_value;
@ -415,10 +416,10 @@ bool HistUpdater<GradientSumT>::UpdatePredictionCache(
     }
   }
   if (is_last_group) {
-    qu_.memcpy(out_pred_ptr, out_preds_buf_ptr, buffer_size * sizeof(bst_float), events);
+    qu_->memcpy(out_pred_ptr, out_preds_buf_ptr, buffer_size * sizeof(bst_float), events);
     out_pred_ptr = nullptr;
   }
-  qu_.wait();
+  qu_->wait();

   builder_monitor_.Stop("UpdatePredictionCache");
   return true;
@ -447,7 +448,7 @@ void HistUpdater<GradientSumT>::InitSampling(
    */
   if (has_fp64_support_) {
     // Use oneDPL bernoulli_distribution for better perf
-    event = qu_.submit([&](::sycl::handler& cgh) {
+    event = qu_->submit([&](::sycl::handler& cgh) {
       auto flag_buf_acc = flag_buf.get_access<::sycl::access::mode::read_write>(cgh);
       cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(num_rows)),
                          [=](::sycl::item<1> pid) {
@ -465,7 +466,7 @@ void HistUpdater<GradientSumT>::InitSampling(
     });
   } else {
     // Use oneDPL uniform, as far as bernoulli_distribution uses fp64
-    event = qu_.submit([&](::sycl::handler& cgh) {
+    event = qu_->submit([&](::sycl::handler& cgh) {
       auto flag_buf_acc = flag_buf.get_access<::sycl::access::mode::read_write>(cgh);
       cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(num_rows)),
                          [=](::sycl::item<1> pid) {
@ -485,8 +486,8 @@ void HistUpdater<GradientSumT>::InitSampling(
     /* After calling a destructor for flag_buf, content will be copyed to num_samples */
   }

-  row_indices->Resize(&qu_, num_samples, 0, &event);
-  qu_.wait();
+  row_indices->Resize(qu_, num_samples, 0, &event);
+  qu_->wait();
 }

 template<typename GradientSumT>
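The two branches in InitSampling differ only in the random generator: oneDPL's bernoulli_distribution computes in double precision internally, so on devices without the fp64 aspect the kernel falls back to a single-precision uniform draw compared against the subsample ratio. A stripped-down sketch of that fallback path; qu, seed, num_rows, subsample and keep_flags are assumed to be in scope, and the flag-buffer reduction from the hunk is omitted:

    #include <oneapi/dpl/random>

    qu->submit([&](::sycl::handler& cgh) {
      cgh.parallel_for(::sycl::range<1>(num_rows), [=](::sycl::item<1> pid) {
        const size_t idx = pid.get_id(0);
        oneapi::dpl::minstd_rand engine(seed, idx);  // per-row substream
        oneapi::dpl::uniform_real_distribution<float> distr(0.0f, 1.0f);
        keep_flags[idx] = distr(engine) < subsample; // keep row with prob. subsample
      });
    }).wait();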
@ -526,7 +527,7 @@ void HistUpdater<GradientSumT>::InitData(
   hist_builder_ = common::GHistBuilder<GradientSumT>(qu_, nbins);

   USMVector<size_t, MemoryType::on_device>* row_indices = &(row_set_collection_.Data());
-  row_indices->Resize(&qu_, info.num_row_);
+  row_indices->Resize(qu_, info.num_row_);
   size_t* p_row_indices = row_indices->Data();
   // mark subsample and build list of member rows
   if (param_.subsample < 1.0f) {
@ -540,7 +541,7 @@ void HistUpdater<GradientSumT>::InitData(
       ::sycl::event event;
       {
         ::sycl::buffer<int, 1> flag_buf(&has_neg_hess, 1);
-        event = qu_.submit([&](::sycl::handler& cgh) {
+        event = qu_->submit([&](::sycl::handler& cgh) {
           auto flag_buf_acc = flag_buf.get_access<::sycl::access::mode::read_write>(cgh);
           cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(info.num_row_)),
                              [=](::sycl::item<1> pid) {
@ -558,7 +559,7 @@ void HistUpdater<GradientSumT>::InitData(
         size_t max_idx = 0;
         {
           ::sycl::buffer<size_t, 1> flag_buf(&max_idx, 1);
-          event = qu_.submit([&](::sycl::handler& cgh) {
+          event = qu_->submit([&](::sycl::handler& cgh) {
             cgh.depends_on(event);
             auto flag_buf_acc = flag_buf.get_access<::sycl::access::mode::read_write>(cgh);
             cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(info.num_row_)),
@ -571,9 +572,9 @@ void HistUpdater<GradientSumT>::InitData(
             });
           });
         }
-        row_indices->Resize(&qu_, max_idx, 0, &event);
+        row_indices->Resize(qu_, max_idx, 0, &event);
       }
-      qu_.wait_and_throw();
+      qu_->wait_and_throw();
     }
   }
   row_set_collection_.Init();
@ -661,7 +662,7 @@ void HistUpdater<GradientSumT>::ApplySplit(
   std::vector<int32_t> split_conditions(n_nodes);
   CommonRowPartitioner::FindSplitConditions(nodes, *p_tree, gmat, &split_conditions);

-  partition_builder_.Init(&qu_, n_nodes, [&](size_t node_in_set) {
+  partition_builder_.Init(qu_, n_nodes, [&](size_t node_in_set) {
     const int32_t nid = nodes[node_in_set].nid;
     return row_set_collection_[nid].Size();
   });
@ -669,14 +670,14 @@ void HistUpdater<GradientSumT>::ApplySplit(
   ::sycl::event event;
   partition_builder_.Partition(gmat, nodes, row_set_collection_,
                                split_conditions, p_tree, &event);
-  qu_.wait_and_throw();
+  qu_->wait_and_throw();

   for (size_t node_in_set = 0; node_in_set < n_nodes; node_in_set++) {
     const int32_t nid = nodes[node_in_set].nid;
     size_t* data_result = const_cast<size_t*>(row_set_collection_[nid].begin);
     partition_builder_.MergeToArray(node_in_set, data_result, &event);
   }
-  qu_.wait_and_throw();
+  qu_->wait_and_throw();

   AddSplitsToRowSet(nodes, p_tree);
@ -702,7 +703,7 @@ void HistUpdater<GradientSumT>::InitNewNode(int nid,
       const auto* hist = reinterpret_cast<GradStats<GradientSumT>*>(hist_[nid].Data());

       std::vector<GradStats<GradientSumT>> ets(iend - ibegin);
-      qu_.memcpy(ets.data(), hist + ibegin,
+      qu_->memcpy(ets.data(), hist + ibegin,
                  (iend - ibegin) * sizeof(GradStats<GradientSumT>)).wait_and_throw();
       for (const auto& et : ets) {
         grad_stat += et;
@ -714,7 +715,7 @@ void HistUpdater<GradientSumT>::InitNewNode(int nid,
       const GradientPair* gpair_ptr = gpair.DataConst();

       ::sycl::buffer<GradStats<GradientSumT>> buff(&grad_stat, 1);
-      qu_.submit([&](::sycl::handler& cgh) {
+      qu_->submit([&](::sycl::handler& cgh) {
         auto reduction = ::sycl::reduction(buff, cgh, ::sycl::plus<>());
         cgh.parallel_for<>(::sycl::range<1>(size), reduction,
                            [=](::sycl::item<1> pid, auto& sum) {
@ -786,8 +787,8 @@ void HistUpdater<GradientSumT>::EvaluateSplits(
   }
   const size_t total_features = pos;

-  split_queries_device_.Resize(&qu_, total_features);
-  auto event = qu_.memcpy(split_queries_device_.Data(), split_queries_host_.data(),
+  split_queries_device_.Resize(qu_, total_features);
+  auto event = qu_->memcpy(split_queries_device_.Data(), split_queries_host_.data(),
                           total_features * sizeof(SplitQuery));

   auto evaluator = tree_evaluator_.GetEvaluator();
@ -796,18 +797,18 @@ void HistUpdater<GradientSumT>::EvaluateSplits(
   const bst_float* cut_val = gmat.cut_device.Values().DataConst();
   const bst_float* cut_minval = gmat.cut_device.MinValues().DataConst();

-  snode_device_.ResizeNoCopy(&qu_, snode_host_.size());
-  event = qu_.memcpy(snode_device_.Data(), snode_host_.data(),
+  snode_device_.ResizeNoCopy(qu_, snode_host_.size());
+  event = qu_->memcpy(snode_device_.Data(), snode_host_.data(),
                      snode_host_.size() * sizeof(NodeEntry<GradientSumT>), event);
   const NodeEntry<GradientSumT>* snode = snode_device_.Data();

   const float min_child_weight = param_.min_child_weight;

-  best_splits_device_.ResizeNoCopy(&qu_, total_features);
+  best_splits_device_.ResizeNoCopy(qu_, total_features);
   if (best_splits_host_.size() < total_features) best_splits_host_.resize(total_features);
   SplitEntry<GradientSumT>* best_splits = best_splits_device_.Data();

-  event = qu_.submit([&](::sycl::handler& cgh) {
+  event = qu_->submit([&](::sycl::handler& cgh) {
     cgh.depends_on(event);
     cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(total_features, sub_group_size_),
                                            ::sycl::range<2>(1, sub_group_size_)),
@ -823,10 +824,10 @@ void HistUpdater<GradientSumT>::EvaluateSplits(
                    &(best_splits[i]), fid, nid, evaluator, min_child_weight);
     });
   });
-  event = qu_.memcpy(best_splits_host_.data(), best_splits,
+  event = qu_->memcpy(best_splits_host_.data(), best_splits,
                      total_features * sizeof(SplitEntry<GradientSumT>), event);

-  qu_.wait();
+  qu_->wait();
   for (size_t i = 0; i < total_features; i++) {
     int nid = split_queries_host_[i].nid;
     snode_host_[nid].best.Update(best_splits_host_[i]);
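EvaluateSplits shows the event-chaining idiom used throughout the updater: every memcpy and submit takes the previous ::sycl::event as its dependency, so the host blocks only once at the end rather than after every step. A minimal self-contained sketch of the same pattern, assuming USM pointers dst/src:

    // Chain copy -> kernel -> single wait; a default-constructed event
    // carries no dependency, so the first call starts immediately.
    void Pipeline(::sycl::queue* qu, int* dst, const int* src, size_t n) {
      ::sycl::event event;
      event = qu->memcpy(dst, src, n * sizeof(int), event);
      event = qu->submit([&](::sycl::handler& cgh) {
        cgh.depends_on(event);  // run only after the copy finishes
        cgh.parallel_for(::sycl::range<1>(n),
                         [=](::sycl::id<1> i) { dst[i] *= 2; });
      });
      qu->wait();  // one synchronization point for the whole chain
    }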
@ -52,7 +52,7 @@ class HistUpdater {
   using GradientPairT = xgboost::detail::GradientPairInternal<GradientSumT>;

   explicit HistUpdater(const Context* ctx,
-                       ::sycl::queue qu,
+                       ::sycl::queue* qu,
                        const xgboost::tree::TrainParam& param,
                        FeatureInteractionConstraintHost int_constraints_,
                        DMatrix const* fmat)
@ -63,11 +63,11 @@ class HistUpdater {
     builder_monitor_.Init("SYCL::Quantile::HistUpdater");
     kernel_monitor_.Init("SYCL::Quantile::HistUpdater");
     if (param.max_depth > 0) {
-      snode_device_.Resize(&qu, 1u << (param.max_depth + 1));
+      snode_device_.Resize(qu, 1u << (param.max_depth + 1));
     }
-    has_fp64_support_ = qu_.get_device().has(::sycl::aspect::fp64);
+    has_fp64_support_ = qu_->get_device().has(::sycl::aspect::fp64);
     const auto sub_group_sizes =
-      qu_.get_device().get_info<::sycl::info::device::sub_group_sizes>();
+      qu_->get_device().get_info<::sycl::info::device::sub_group_sizes>();
     sub_group_size_ = sub_group_sizes.back();
   }

@ -266,8 +266,7 @@ class HistUpdater {
   bst_float* out_pred_ptr = nullptr;

   std::vector<GradientPairT> reduce_buffer_;
-
-  ::sycl::queue qu_;
+  ::sycl::queue* qu_;
 };

 } // namespace tree
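The constructor now probes the device through the queue pointer: the fp64 aspect decides which histogram precision and sampling path are usable, and the largest advertised sub-group size is cached for the split-evaluation kernel. The same probes in a standalone form:

    #include <sycl/sycl.hpp>
    #include <iostream>

    int main() {
      ::sycl::queue qu;  // default-selected device
      const auto& dev = qu.get_device();
      const bool has_fp64 = dev.has(::sycl::aspect::fp64);
      const auto sub_group_sizes =
          dev.get_info<::sycl::info::device::sub_group_sizes>();
      std::cout << "fp64: " << has_fp64
                << ", largest sub-group: " << sub_group_sizes.back() << "\n";
    }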
@ -42,11 +42,11 @@ class TreeEvaluator {
   USMVector<GradType> upper_bounds_;
   USMVector<int> monotone_;
   TrainParam param_;
-  ::sycl::queue qu_;
+  ::sycl::queue* qu_;
   bool has_constraint_;

  public:
-  void Reset(::sycl::queue qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) {
+  void Reset(::sycl::queue* qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) {
     qu_ = qu;

     has_constraint_ = false;
@ -58,13 +58,13 @@ class TreeEvaluator {
     }

     if (has_constraint_) {
-      monotone_.Resize(&qu_, n_features, 0);
-      qu_.memcpy(monotone_.Data(), p.monotone_constraints.data(),
+      monotone_.Resize(qu_, n_features, 0);
+      qu_->memcpy(monotone_.Data(), p.monotone_constraints.data(),
                  sizeof(int) * p.monotone_constraints.size());
-      qu_.wait();
+      qu_->wait();

-      lower_bounds_.Resize(&qu_, p.MaxNodes(), std::numeric_limits<GradType>::lowest());
-      upper_bounds_.Resize(&qu_, p.MaxNodes(), std::numeric_limits<GradType>::max());
+      lower_bounds_.Resize(qu_, p.MaxNodes(), std::numeric_limits<GradType>::lowest());
+      upper_bounds_.Resize(qu_, p.MaxNodes(), std::numeric_limits<GradType>::max());
     }
     param_ = TrainParam(p);
   }
@ -73,7 +73,7 @@ class TreeEvaluator {
     return has_constraint_;
   }

-  TreeEvaluator(::sycl::queue qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) {
+  TreeEvaluator(::sycl::queue* qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) {
     Reset(qu, p, n_features);
   }
@ -31,7 +31,7 @@ void QuantileHistMaker::Configure(const Args& args) {
   param_.UpdateAllowUnknown(args);
   hist_maker_param_.UpdateAllowUnknown(args);

-  bool has_fp64_support = qu_.get_device().has(::sycl::aspect::fp64);
+  bool has_fp64_support = qu_->get_device().has(::sycl::aspect::fp64);
   if (hist_maker_param_.single_precision_histogram || !has_fp64_support) {
     if (!hist_maker_param_.single_precision_histogram) {
       LOG(WARNING) << "Target device doesn't support fp64, using single_precision_histogram=True";
@ -68,9 +68,9 @@ void QuantileHistMaker::CallUpdate(
       xgboost::common::Span<HostDeviceVector<bst_node_t>> out_position,
       const std::vector<RegTree *> &trees) {
   const auto* gpair_h = gpair->Data();
-  gpair_device_.Resize(&qu_, gpair_h->Size());
-  qu_.memcpy(gpair_device_.Data(), gpair_h->HostPointer(), gpair_h->Size() * sizeof(GradientPair));
-  qu_.wait();
+  gpair_device_.Resize(qu_, gpair_h->Size());
+  qu_->memcpy(gpair_device_.Data(), gpair_h->HostPointer(), gpair_h->Size() * sizeof(GradientPair));
+  qu_->wait();

   for (auto tree : trees) {
     pimpl->Update(param, gmat_, gpair_device_, dmat, out_position, tree);
@ -105,7 +105,7 @@ class QuantileHistMaker: public TreeUpdater {

   FeatureInteractionConstraintHost int_constraint_;

-  ::sycl::queue qu_;
+  ::sycl::queue* qu_;
   DeviceManager device_manager;
   ObjInfo const *task_{nullptr};
@ -1,7 +1,8 @@
 /**
- * Copyright 2017-2023 by XGBoost contributors
+ * Copyright 2017-2024 by XGBoost contributors
  */
 #ifndef XGBOOST_USE_CUDA
+#ifndef XGBOOST_USE_SYCL

 // dummy implementation of HostDeviceVector in case CUDA is not used

@ -202,4 +203,5 @@ template class HostDeviceVector<std::size_t>;

 } // namespace xgboost

+#endif  // XGBOOST_USE_SYCL
 #endif  // XGBOOST_USE_CUDA
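The two added guard lines change which translation unit provides HostDeviceVector: the host-only fallback is now compiled only when neither backend claims it. The resulting structure of host_device_vector.cc, schematically:

    // Only built when neither the CUDA nor the SYCL backend supplies
    // its own HostDeviceVector implementation.
    #ifndef XGBOOST_USE_CUDA
    #ifndef XGBOOST_USE_SYCL

    // ... host-only (dummy) implementation of HostDeviceVector ...

    #endif  // XGBOOST_USE_SYCL
    #endif  // XGBOOST_USE_CUDA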
@ -4,8 +4,36 @@
 #pragma once

 #include "../helpers.h"
+#include "../../plugin/sycl/device_manager.h"
+#include "../../plugin/sycl/data.h"

 namespace xgboost::sycl {

+template<typename T, typename Fn>
+void TransformOnDeviceData(DeviceOrd device, T* device_data, size_t n_data, Fn&& fn) {
+  sycl::DeviceManager device_manager;
+  ::sycl::queue* qu = device_manager.GetQueue(device);
+
+  qu->submit([&](::sycl::handler& cgh) {
+    cgh.parallel_for<>(::sycl::range<1>(n_data), [=](::sycl::item<1> nid) {
+      const size_t i = nid.get_id(0);
+      device_data[i] = fn(device_data[i]);
+    });
+  }).wait();
+}
+
+template<typename T>
+void VerifyOnDeviceData(DeviceOrd device, const T* device_data, const T* host_data, size_t n_data, T eps = T()) {
+  sycl::DeviceManager device_manager;
+  ::sycl::queue* qu = device_manager.GetQueue(device);
+
+  std::vector<T> copy_device_data(n_data);
+  qu->memcpy(copy_device_data.data(), device_data, n_data * sizeof(T)).wait();
+  for (size_t i = 0; i < n_data; ++i) {
+    EXPECT_NEAR(copy_device_data[i], host_data[i], eps);
+  }
+}
+
 template<typename T, typename Container>
 void VerifySyclVector(const USMVector<T, MemoryType::shared>& sycl_vector,
                       const Container& host_vector, T eps = T()) {
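Together these two helpers give the tests a queue-free way to mutate device memory and to compare it against a host-side expectation. A hypothetical use, assuming device is a DeviceOrd naming the SYCL device and device_data points to n ints already resident there:

    std::vector<int> expected(n);
    std::iota(expected.begin(), expected.end(), 1);

    // Square every element in place on the device...
    xgboost::sycl::TransformOnDeviceData(device, device_data, n,
                                         [](int v) { return v * v; });

    // ...then check it element by element against the squared expectation.
    for (auto& v : expected) v *= v;
    xgboost::sycl::VerifyOnDeviceData(device, device_data, expected.data(), n);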
@ -40,10 +40,10 @@ void GHistBuilderTest(float sparsity, bool force_atomic_use) {

   RowSetCollection row_set_collection;
   auto& row_indices = row_set_collection.Data();
-  row_indices.Resize(&qu, num_rows);
+  row_indices.Resize(qu, num_rows);
   size_t* p_row_indices = row_indices.Data();

-  qu.submit([&](::sycl::handler& cgh) {
+  qu->submit([&](::sycl::handler& cgh) {
     cgh.parallel_for<>(::sycl::range<1>(num_rows),
                        [p_row_indices](::sycl::item<1> pid) {
       const size_t idx = pid.get_id(0);
@ -58,23 +58,23 @@ void GHistBuilderTest(float sparsity, bool force_atomic_use) {
       {0.1f, 0.2f}, {0.3f, 0.4f}, {0.5f, 0.6f}, {0.7f, 0.8f},
       {0.9f, 0.1f}, {0.2f, 0.3f}, {0.4f, 0.5f}, {0.6f, 0.7f}};
   CHECK_EQ(gpair.size(), num_rows);
-  USMVector<GradientPair, MemoryType::on_device> gpair_device(&qu, gpair);
+  USMVector<GradientPair, MemoryType::on_device> gpair_device(qu, gpair);

   std::vector<GradientSumT> hist_host(2*n_bins);
-  GHistRow<GradientSumT, MemoryType::on_device> hist(&qu, 2 * n_bins);
+  GHistRow<GradientSumT, MemoryType::on_device> hist(qu, 2 * n_bins);
   ::sycl::event event;

   const size_t nblocks = 2;
-  GHistRow<GradientSumT, MemoryType::on_device> hist_buffer(&qu, 2 * nblocks * n_bins);
+  GHistRow<GradientSumT, MemoryType::on_device> hist_buffer(qu, 2 * nblocks * n_bins);

   InitHist(qu, &hist, hist.Size(), &event);
   InitHist(qu, &hist_buffer, hist_buffer.Size(), &event);

   event = builder.BuildHist(gpair_device, row_set_collection[0], gmat_sycl, &hist,
                             sparsity < eps , &hist_buffer, event, force_atomic_use);
-  qu.memcpy(hist_host.data(), hist.Data(),
+  qu->memcpy(hist_host.data(), hist.Data(),
             2 * n_bins * sizeof(GradientSumT), event);
-  qu.wait_and_throw();
+  qu->wait_and_throw();

   // Build hist on host to compare
   std::vector<GradientSumT> hist_desired(2*n_bins);
@ -104,21 +104,21 @@ void GHistSubtractionTest() {

   ::sycl::event event;
   std::vector<GradientSumT> hist1_host = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8};
-  GHistType hist1(&qu, 2 * n_bins);
-  event = qu.memcpy(hist1.Data(), hist1_host.data(),
+  GHistType hist1(qu, 2 * n_bins);
+  event = qu->memcpy(hist1.Data(), hist1_host.data(),
                     2 * n_bins * sizeof(GradientSumT), event);

   std::vector<GradientSumT> hist2_host = {0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1};
-  GHistType hist2(&qu, 2 * n_bins);
-  event = qu.memcpy(hist2.Data(), hist2_host.data(),
+  GHistType hist2(qu, 2 * n_bins);
+  event = qu->memcpy(hist2.Data(), hist2_host.data(),
                     2 * n_bins * sizeof(GradientSumT), event);

   std::vector<GradientSumT> hist3_host(2 * n_bins);
-  GHistType hist3(&qu, 2 * n_bins);
+  GHistType hist3(qu, 2 * n_bins);
   event = SubtractionHist(qu, &hist3, hist1, hist2, n_bins, event);
-  qu.memcpy(hist3_host.data(), hist3.Data(),
+  qu->memcpy(hist3_host.data(), hist3.Data(),
             2 * n_bins * sizeof(GradientSumT), event);
-  qu.wait_and_throw();
+  qu->wait_and_throw();

   std::vector<GradientSumT> hist3_desired(2 * n_bins);
   for (size_t idx = 0; idx < 2 * n_bins; ++idx) {
@ -19,7 +19,7 @@ template <typename GradientSumT>
 class TestHistUpdater : public HistUpdater<GradientSumT> {
  public:
   TestHistUpdater(const Context* ctx,
-                  ::sycl::queue qu,
+                  ::sycl::queue* qu,
                   const xgboost::tree::TrainParam& param,
                   FeatureInteractionConstraintHost int_constraints_,
                   DMatrix const* fmat) : HistUpdater<GradientSumT>(ctx, qu, param,
@ -115,10 +115,10 @@ void TestHistUpdaterSampling(const xgboost::tree::TrainParam& param) {

   TestHistUpdater<GradientSumT> updater(&ctx, qu, param, int_constraints, p_fmat.get());

-  USMVector<size_t, MemoryType::on_device> row_indices_0(&qu, num_rows);
-  USMVector<size_t, MemoryType::on_device> row_indices_1(&qu, num_rows);
-  USMVector<GradientPair, MemoryType::on_device> gpair(&qu, num_rows);
-  GenerateRandomGPairs(&qu, gpair.Data(), num_rows, true);
+  USMVector<size_t, MemoryType::on_device> row_indices_0(qu, num_rows);
+  USMVector<size_t, MemoryType::on_device> row_indices_1(qu, num_rows);
+  USMVector<GradientPair, MemoryType::on_device> gpair(qu, num_rows);
+  GenerateRandomGPairs(qu, gpair.Data(), num_rows, true);

   updater.TestInitSampling(gpair, &row_indices_0);

@ -132,8 +132,8 @@ void TestHistUpdaterSampling(const xgboost::tree::TrainParam& param) {
   if (row_indices_1.Size() == n_samples) {
     std::vector<size_t> row_indices_0_host(n_samples);
     std::vector<size_t> row_indices_1_host(n_samples);
-    qu.memcpy(row_indices_0_host.data(), row_indices_0.Data(), n_samples * sizeof(size_t)).wait();
-    qu.memcpy(row_indices_1_host.data(), row_indices_1.Data(), n_samples * sizeof(size_t)).wait();
+    qu->memcpy(row_indices_0_host.data(), row_indices_0.Data(), n_samples * sizeof(size_t)).wait();
+    qu->memcpy(row_indices_1_host.data(), row_indices_1.Data(), n_samples * sizeof(size_t)).wait();

     // The order in row_indices_0 and row_indices_1 can be different
     std::set<size_t> rows;
|
|||||||
|
|
||||||
TestHistUpdater<GradientSumT> updater(&ctx, qu, param, int_constraints, p_fmat.get());
|
TestHistUpdater<GradientSumT> updater(&ctx, qu, param, int_constraints, p_fmat.get());
|
||||||
|
|
||||||
USMVector<GradientPair, MemoryType::on_device> gpair(&qu, num_rows);
|
USMVector<GradientPair, MemoryType::on_device> gpair(qu, num_rows);
|
||||||
GenerateRandomGPairs(&qu, gpair.Data(), num_rows, has_neg_hess);
|
GenerateRandomGPairs(qu, gpair.Data(), num_rows, has_neg_hess);
|
||||||
|
|
||||||
DeviceMatrix dmat;
|
DeviceMatrix dmat;
|
||||||
dmat.Init(qu, p_fmat.get());
|
dmat.Init(qu, p_fmat.get());
|
||||||
@ -181,7 +181,7 @@ void TestHistUpdaterInitData(const xgboost::tree::TrainParam& param, bool has_ne
|
|||||||
auto& row_indices = row_set_collection->Data();
|
auto& row_indices = row_set_collection->Data();
|
||||||
|
|
||||||
std::vector<size_t> row_indices_host(row_indices.Size());
|
std::vector<size_t> row_indices_host(row_indices.Size());
|
||||||
qu.memcpy(row_indices_host.data(), row_indices.DataConst(), row_indices.Size()*sizeof(size_t)).wait();
|
qu->memcpy(row_indices_host.data(), row_indices.DataConst(), row_indices.Size()*sizeof(size_t)).wait();
|
||||||
|
|
||||||
if (!has_neg_hess) {
|
if (!has_neg_hess) {
|
||||||
for (size_t i = 0; i < num_rows; ++i) {
|
for (size_t i = 0; i < num_rows; ++i) {
|
||||||
@ -189,7 +189,7 @@ void TestHistUpdaterInitData(const xgboost::tree::TrainParam& param, bool has_ne
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
std::vector<GradientPair> gpair_host(num_rows);
|
std::vector<GradientPair> gpair_host(num_rows);
|
||||||
qu.memcpy(gpair_host.data(), gpair.Data(), num_rows*sizeof(GradientPair)).wait();
|
qu->memcpy(gpair_host.data(), gpair.Data(), num_rows*sizeof(GradientPair)).wait();
|
||||||
|
|
||||||
std::set<size_t> rows;
|
std::set<size_t> rows;
|
||||||
for (size_t i = 0; i < num_rows; ++i) {
|
for (size_t i = 0; i < num_rows; ++i) {
|
||||||
@ -224,9 +224,9 @@ void TestHistUpdaterBuildHistogramsLossGuide(const xgboost::tree::TrainParam& pa
|
|||||||
updater.SetHistSynchronizer(new BatchHistSynchronizer<GradientSumT>());
|
updater.SetHistSynchronizer(new BatchHistSynchronizer<GradientSumT>());
|
||||||
updater.SetHistRowsAdder(new BatchHistRowsAdder<GradientSumT>());
|
updater.SetHistRowsAdder(new BatchHistRowsAdder<GradientSumT>());
|
||||||
|
|
||||||
USMVector<GradientPair, MemoryType::on_device> gpair(&qu, num_rows);
|
USMVector<GradientPair, MemoryType::on_device> gpair(qu, num_rows);
|
||||||
auto* gpair_ptr = gpair.Data();
|
auto* gpair_ptr = gpair.Data();
|
||||||
GenerateRandomGPairs(&qu, gpair_ptr, num_rows, false);
|
GenerateRandomGPairs(qu, gpair_ptr, num_rows, false);
|
||||||
|
|
||||||
DeviceMatrix dmat;
|
DeviceMatrix dmat;
|
||||||
dmat.Init(qu, p_fmat.get());
|
dmat.Init(qu, p_fmat.get());
|
||||||
@ -255,10 +255,10 @@ void TestHistUpdaterBuildHistogramsLossGuide(const xgboost::tree::TrainParam& pa
|
|||||||
std::vector<xgboost::detail::GradientPairInternal<GradientSumT>> hist0_host(n_bins);
|
std::vector<xgboost::detail::GradientPairInternal<GradientSumT>> hist0_host(n_bins);
|
||||||
std::vector<xgboost::detail::GradientPairInternal<GradientSumT>> hist1_host(n_bins);
|
std::vector<xgboost::detail::GradientPairInternal<GradientSumT>> hist1_host(n_bins);
|
||||||
std::vector<xgboost::detail::GradientPairInternal<GradientSumT>> hist2_host(n_bins);
|
std::vector<xgboost::detail::GradientPairInternal<GradientSumT>> hist2_host(n_bins);
|
||||||
qu.memcpy(hist0_host.data(), (*hist)[0].DataConst(), sizeof(xgboost::detail::GradientPairInternal<GradientSumT>) * n_bins);
|
qu->memcpy(hist0_host.data(), (*hist)[0].DataConst(), sizeof(xgboost::detail::GradientPairInternal<GradientSumT>) * n_bins);
|
||||||
qu.memcpy(hist1_host.data(), (*hist)[1].DataConst(), sizeof(xgboost::detail::GradientPairInternal<GradientSumT>) * n_bins);
|
qu->memcpy(hist1_host.data(), (*hist)[1].DataConst(), sizeof(xgboost::detail::GradientPairInternal<GradientSumT>) * n_bins);
|
||||||
qu.memcpy(hist2_host.data(), (*hist)[2].DataConst(), sizeof(xgboost::detail::GradientPairInternal<GradientSumT>) * n_bins);
|
qu->memcpy(hist2_host.data(), (*hist)[2].DataConst(), sizeof(xgboost::detail::GradientPairInternal<GradientSumT>) * n_bins);
|
||||||
qu.wait();
|
qu->wait();
|
||||||
|
|
||||||
for (size_t idx_bin = 0; idx_bin < n_bins; ++idx_bin) {
|
for (size_t idx_bin = 0; idx_bin < n_bins; ++idx_bin) {
|
||||||
EXPECT_NEAR(hist0_host[idx_bin].GetGrad(), hist1_host[idx_bin].GetGrad() + hist2_host[idx_bin].GetGrad(), 1e-6);
|
EXPECT_NEAR(hist0_host[idx_bin].GetGrad(), hist1_host[idx_bin].GetGrad() + hist2_host[idx_bin].GetGrad(), 1e-6);
|
||||||
@ -286,9 +286,9 @@ void TestHistUpdaterInitNewNode(const xgboost::tree::TrainParam& param, float sp
|
|||||||
updater.SetHistSynchronizer(new BatchHistSynchronizer<GradientSumT>());
|
updater.SetHistSynchronizer(new BatchHistSynchronizer<GradientSumT>());
|
||||||
updater.SetHistRowsAdder(new BatchHistRowsAdder<GradientSumT>());
|
updater.SetHistRowsAdder(new BatchHistRowsAdder<GradientSumT>());
|
||||||
|
|
||||||
USMVector<GradientPair, MemoryType::on_device> gpair(&qu, num_rows);
|
USMVector<GradientPair, MemoryType::on_device> gpair(qu, num_rows);
|
||||||
auto* gpair_ptr = gpair.Data();
|
auto* gpair_ptr = gpair.Data();
|
||||||
GenerateRandomGPairs(&qu, gpair_ptr, num_rows, false);
|
GenerateRandomGPairs(qu, gpair_ptr, num_rows, false);
|
||||||
|
|
||||||
DeviceMatrix dmat;
|
DeviceMatrix dmat;
|
||||||
dmat.Init(qu, p_fmat.get());
|
dmat.Init(qu, p_fmat.get());
|
||||||
@ -308,7 +308,7 @@ void TestHistUpdaterInitNewNode(const xgboost::tree::TrainParam& param, float sp
|
|||||||
GradStats<GradientSumT> grad_stat;
|
GradStats<GradientSumT> grad_stat;
|
||||||
{
|
{
|
||||||
::sycl::buffer<GradStats<GradientSumT>> buff(&grad_stat, 1);
|
::sycl::buffer<GradStats<GradientSumT>> buff(&grad_stat, 1);
|
||||||
qu.submit([&](::sycl::handler& cgh) {
|
qu->submit([&](::sycl::handler& cgh) {
|
||||||
auto buff_acc = buff.template get_access<::sycl::access::mode::read_write>(cgh);
|
auto buff_acc = buff.template get_access<::sycl::access::mode::read_write>(cgh);
|
||||||
cgh.single_task<>([=]() {
|
cgh.single_task<>([=]() {
|
||||||
for (size_t i = 0; i < num_rows; ++i) {
|
for (size_t i = 0; i < num_rows; ++i) {
|
||||||
@ -344,9 +344,9 @@ void TestHistUpdaterEvaluateSplits(const xgboost::tree::TrainParam& param) {
|
|||||||
updater.SetHistSynchronizer(new BatchHistSynchronizer<GradientSumT>());
|
updater.SetHistSynchronizer(new BatchHistSynchronizer<GradientSumT>());
|
||||||
updater.SetHistRowsAdder(new BatchHistRowsAdder<GradientSumT>());
|
updater.SetHistRowsAdder(new BatchHistRowsAdder<GradientSumT>());
|
||||||
|
|
||||||
USMVector<GradientPair, MemoryType::on_device> gpair(&qu, num_rows);
|
USMVector<GradientPair, MemoryType::on_device> gpair(qu, num_rows);
|
||||||
auto* gpair_ptr = gpair.Data();
|
auto* gpair_ptr = gpair.Data();
|
||||||
GenerateRandomGPairs(&qu, gpair_ptr, num_rows, false);
|
GenerateRandomGPairs(qu, gpair_ptr, num_rows, false);
|
||||||
|
|
||||||
DeviceMatrix dmat;
|
DeviceMatrix dmat;
|
||||||
dmat.Init(qu, p_fmat.get());
|
dmat.Init(qu, p_fmat.get());
|
||||||
@ -378,7 +378,7 @@ void TestHistUpdaterEvaluateSplits(const xgboost::tree::TrainParam& param) {
|
|||||||
std::vector<bst_float> best_loss_chg_des(1, -1);
|
std::vector<bst_float> best_loss_chg_des(1, -1);
|
||||||
{
|
{
|
||||||
::sycl::buffer<bst_float> best_loss_chg_buff(best_loss_chg_des.data(), 1);
|
::sycl::buffer<bst_float> best_loss_chg_buff(best_loss_chg_des.data(), 1);
|
||||||
qu.submit([&](::sycl::handler& cgh) {
|
qu->submit([&](::sycl::handler& cgh) {
|
||||||
auto best_loss_chg_acc = best_loss_chg_buff.template get_access<::sycl::access::mode::read_write>(cgh);
|
auto best_loss_chg_acc = best_loss_chg_buff.template get_access<::sycl::access::mode::read_write>(cgh);
|
||||||
cgh.single_task<>([=]() {
|
cgh.single_task<>([=]() {
|
||||||
for (size_t i = 1; i < size; ++i) {
|
for (size_t i = 1; i < size; ++i) {
|
||||||
@ -426,15 +426,15 @@ void TestHistUpdaterApplySplit(const xgboost::tree::TrainParam& param, float spa
|
|||||||
|
|
||||||
FeatureInteractionConstraintHost int_constraints;
|
FeatureInteractionConstraintHost int_constraints;
|
||||||
TestHistUpdater<GradientSumT> updater(&ctx, qu, param, int_constraints, p_fmat.get());
|
TestHistUpdater<GradientSumT> updater(&ctx, qu, param, int_constraints, p_fmat.get());
|
||||||
USMVector<GradientPair, MemoryType::on_device> gpair(&qu, num_rows);
|
USMVector<GradientPair, MemoryType::on_device> gpair(qu, num_rows);
|
||||||
GenerateRandomGPairs(&qu, gpair.Data(), num_rows, false);
|
GenerateRandomGPairs(qu, gpair.Data(), num_rows, false);
|
||||||
|
|
||||||
auto* row_set_collection = updater.TestInitData(gmat, gpair, *p_fmat, tree);
|
auto* row_set_collection = updater.TestInitData(gmat, gpair, *p_fmat, tree);
|
||||||
updater.TestApplySplit(nodes, gmat, &tree);
|
updater.TestApplySplit(nodes, gmat, &tree);
|
||||||
|
|
||||||
// Copy indexes to host
|
// Copy indexes to host
|
||||||
std::vector<size_t> row_indices_host(num_rows);
|
std::vector<size_t> row_indices_host(num_rows);
|
||||||
qu.memcpy(row_indices_host.data(), row_set_collection->Data().Data(), sizeof(size_t)*num_rows).wait();
|
qu->memcpy(row_indices_host.data(), row_set_collection->Data().Data(), sizeof(size_t)*num_rows).wait();
|
||||||
|
|
||||||
// Reference Implementation
|
// Reference Implementation
|
||||||
std::vector<size_t> row_indices_desired_host(num_rows);
|
std::vector<size_t> row_indices_desired_host(num_rows);
|
||||||
@ -448,7 +448,7 @@ void TestHistUpdaterApplySplit(const xgboost::tree::TrainParam& param, float spa
|
|||||||
xgboost::tree::CommonRowPartitioner::FindSplitConditions(nodes, tree, gmat, &split_conditions);
|
xgboost::tree::CommonRowPartitioner::FindSplitConditions(nodes, tree, gmat, &split_conditions);
|
||||||
|
|
||||||
common::PartitionBuilder partition_builder;
|
common::PartitionBuilder partition_builder;
|
||||||
partition_builder.Init(&qu, n_nodes, [&](size_t node_in_set) {
|
partition_builder.Init(qu, n_nodes, [&](size_t node_in_set) {
|
||||||
const int32_t nid = nodes[node_in_set].nid;
|
const int32_t nid = nodes[node_in_set].nid;
|
||||||
return (*row_set_collection4verification)[nid].Size();
|
return (*row_set_collection4verification)[nid].Size();
|
||||||
});
|
});
|
||||||
@ -456,14 +456,14 @@ void TestHistUpdaterApplySplit(const xgboost::tree::TrainParam& param, float spa
|
|||||||
::sycl::event event;
|
::sycl::event event;
|
||||||
partition_builder.Partition(gmat, nodes, (*row_set_collection4verification),
|
partition_builder.Partition(gmat, nodes, (*row_set_collection4verification),
|
||||||
split_conditions, &tree, &event);
|
split_conditions, &tree, &event);
|
||||||
qu.wait_and_throw();
|
qu->wait_and_throw();
|
||||||
|
|
||||||
for (size_t node_in_set = 0; node_in_set < n_nodes; node_in_set++) {
|
for (size_t node_in_set = 0; node_in_set < n_nodes; node_in_set++) {
|
||||||
const int32_t nid = nodes[node_in_set].nid;
|
const int32_t nid = nodes[node_in_set].nid;
|
||||||
size_t* data_result = const_cast<size_t*>((*row_set_collection4verification)[nid].begin);
|
size_t* data_result = const_cast<size_t*>((*row_set_collection4verification)[nid].begin);
|
||||||
partition_builder.MergeToArray(node_in_set, data_result, &event);
|
partition_builder.MergeToArray(node_in_set, data_result, &event);
|
||||||
}
|
}
|
||||||
qu.wait_and_throw();
|
qu->wait_and_throw();
|
||||||
|
|
||||||
const int32_t nid = nodes[0].nid;
|
const int32_t nid = nodes[0].nid;
|
||||||
n_left = partition_builder.GetNLeftElems(0);
|
n_left = partition_builder.GetNLeftElems(0);
|
||||||
@ -472,7 +472,7 @@ void TestHistUpdaterApplySplit(const xgboost::tree::TrainParam& param, float spa
|
|||||||
row_set_collection4verification->AddSplit(nid, tree[nid].LeftChild(),
|
row_set_collection4verification->AddSplit(nid, tree[nid].LeftChild(),
|
||||||
tree[nid].RightChild(), n_left, n_right);
|
tree[nid].RightChild(), n_left, n_right);
|
||||||
|
|
||||||
qu.memcpy(row_indices_desired_host.data(), row_set_collection4verification->Data().Data(), sizeof(size_t)*num_rows).wait();
|
qu->memcpy(row_indices_desired_host.data(), row_set_collection4verification->Data().Data(), sizeof(size_t)*num_rows).wait();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::sort(row_indices_desired_host.begin(), row_indices_desired_host.begin() + n_left);
|
std::sort(row_indices_desired_host.begin(), row_indices_desired_host.begin() + n_left);
|
||||||
@ -506,7 +506,7 @@ void TestHistUpdaterExpandWithLossGuide(const xgboost::tree::TrainParam& param)
|
|||||||
gmat.Init(qu, &ctx, dmat, n_bins);
|
gmat.Init(qu, &ctx, dmat, n_bins);
|
||||||
|
|
||||||
std::vector<GradientPair> gpair_host = {{1, 2}, {3, 1}, {1, 1}};
|
std::vector<GradientPair> gpair_host = {{1, 2}, {3, 1}, {1, 1}};
|
||||||
USMVector<GradientPair, MemoryType::on_device> gpair(&qu, gpair_host);
|
USMVector<GradientPair, MemoryType::on_device> gpair(qu, gpair_host);
|
||||||
|
|
||||||
RegTree tree;
|
RegTree tree;
|
||||||
FeatureInteractionConstraintHost int_constraints;
|
FeatureInteractionConstraintHost int_constraints;
|
||||||
@ -554,7 +554,7 @@ void TestHistUpdaterExpandWithDepthWise(const xgboost::tree::TrainParam& param)
|
|||||||
gmat.Init(qu, &ctx, dmat, n_bins);
|
gmat.Init(qu, &ctx, dmat, n_bins);
|
||||||
|
|
||||||
std::vector<GradientPair> gpair_host = {{1, 2}, {3, 1}, {1, 1}};
|
std::vector<GradientPair> gpair_host = {{1, 2}, {3, 1}, {1, 1}};
|
||||||
USMVector<GradientPair, MemoryType::on_device> gpair(&qu, gpair_host);
|
USMVector<GradientPair, MemoryType::on_device> gpair(qu, gpair_host);
|
||||||
|
|
||||||
RegTree tree;
|
RegTree tree;
|
||||||
FeatureInteractionConstraintHost int_constraints;
|
FeatureInteractionConstraintHost int_constraints;
|
||||||
|
|||||||
250
tests/cpp/plugin/test_sycl_host_device_vector.cc
Normal file
250
tests/cpp/plugin/test_sycl_host_device_vector.cc
Normal file
@ -0,0 +1,250 @@
/**
 * Copyright 2018-2024, XGBoost contributors
 */
#include <gtest/gtest.h>
#include <numeric>
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-W#pragma-messages"
#include <xgboost/host_device_vector.h>
#pragma GCC diagnostic pop

#include "sycl_helpers.h"

namespace xgboost::common {
namespace {

void InitHostDeviceVector(size_t n, DeviceOrd device, HostDeviceVector<int> *v) {
  // create the vector
  v->SetDevice(device);
  v->Resize(n);

  ASSERT_EQ(v->Size(), n);
  ASSERT_EQ(v->Device(), device);
  // ensure that the device has read-write access
  ASSERT_TRUE(v->DeviceCanRead());
  ASSERT_TRUE(v->DeviceCanWrite());
  // ensure that the host has no access
  ASSERT_FALSE(v->HostCanRead());
  ASSERT_FALSE(v->HostCanWrite());

  // fill in the data on the host
  std::vector<int>& data_h = v->HostVector();
  // ensure that the host has full access, while the device has none
  ASSERT_TRUE(v->HostCanRead());
  ASSERT_TRUE(v->HostCanWrite());
  ASSERT_FALSE(v->DeviceCanRead());
  ASSERT_FALSE(v->DeviceCanWrite());
  ASSERT_EQ(data_h.size(), n);
  std::iota(data_h.begin(), data_h.end(), 0);
}

void PlusOne(HostDeviceVector<int> *v) {
  auto device = v->Device();
  sycl::TransformOnDeviceData(v->Device(), v->DevicePointer(), v->Size(),
                              [=](size_t a) { return a + 1; });
  ASSERT_TRUE(v->DeviceCanWrite());
}

void CheckDevice(HostDeviceVector<int>* v,
                 size_t size,
                 unsigned int first,
                 GPUAccess access) {
  ASSERT_EQ(v->Size(), size);

  std::vector<int> desired_data(size);
  std::iota(desired_data.begin(), desired_data.end(), first);
  sycl::VerifyOnDeviceData(v->Device(), v->ConstDevicePointer(), desired_data.data(), size);
  ASSERT_TRUE(v->DeviceCanRead());
  // ensure that the device has at most the access specified by access
  ASSERT_EQ(v->DeviceCanWrite(), access == GPUAccess::kWrite);
  ASSERT_EQ(v->HostCanRead(), access == GPUAccess::kRead);
  ASSERT_FALSE(v->HostCanWrite());

  sycl::VerifyOnDeviceData(v->Device(), v->DevicePointer(), desired_data.data(), size);
  ASSERT_TRUE(v->DeviceCanRead());
  ASSERT_TRUE(v->DeviceCanWrite());
  ASSERT_FALSE(v->HostCanRead());
  ASSERT_FALSE(v->HostCanWrite());
}

void CheckHost(HostDeviceVector<int> *v, GPUAccess access) {
  const std::vector<int>& data_h = access == GPUAccess::kNone ?
                                   v->HostVector() : v->ConstHostVector();
  for (size_t i = 0; i < v->Size(); ++i) {
    ASSERT_EQ(data_h.at(i), i + 1);
  }
  ASSERT_TRUE(v->HostCanRead());
  ASSERT_EQ(v->HostCanWrite(), access == GPUAccess::kNone);
  ASSERT_EQ(v->DeviceCanRead(), access == GPUAccess::kRead);
  // the device should have no write access
  ASSERT_FALSE(v->DeviceCanWrite());
}

void TestHostDeviceVector(size_t n, DeviceOrd device) {
  HostDeviceVector<int> v;
  InitHostDeviceVector(n, device, &v);
  CheckDevice(&v, n, 0, GPUAccess::kRead);
  PlusOne(&v);
  CheckDevice(&v, n, 1, GPUAccess::kWrite);
  CheckHost(&v, GPUAccess::kRead);
  CheckHost(&v, GPUAccess::kNone);
}

TEST(SyclHostDeviceVector, Basic) {
  size_t n = 1001;
  DeviceOrd device = DeviceOrd::SyclDefault();
  TestHostDeviceVector(n, device);
}

TEST(SyclHostDeviceVector, Copy) {
  size_t n = 1001;
  auto device = DeviceOrd::SyclDefault();

  HostDeviceVector<int> v;
  {
    // a separate scope to ensure that v1 is gone before further checks
    HostDeviceVector<int> v1;
    InitHostDeviceVector(n, device, &v1);
    v.Resize(v1.Size());
    v.Copy(v1);
  }
  CheckDevice(&v, n, 0, GPUAccess::kRead);
  PlusOne(&v);
  CheckDevice(&v, n, 1, GPUAccess::kWrite);
  CheckHost(&v, GPUAccess::kRead);
  CheckHost(&v, GPUAccess::kNone);
}

TEST(SyclHostDeviceVector, Fill) {
  size_t n = 1001;
  auto device = DeviceOrd::SyclDefault();

  int val = 42;
  HostDeviceVector<int> v;
  v.SetDevice(device);
  v.Resize(n);

  ASSERT_TRUE(v.DeviceCanWrite());
  v.Fill(val);

  ASSERT_FALSE(v.HostCanRead());
  ASSERT_FALSE(v.HostCanWrite());
  ASSERT_TRUE(v.DeviceCanRead());
  ASSERT_TRUE(v.DeviceCanWrite());

  std::vector<int> desired_data(n, val);
  sycl::VerifyOnDeviceData(v.Device(), v.ConstDevicePointer(), desired_data.data(), n);
}

TEST(SyclHostDeviceVector, Extend) {
  size_t n0 = 1001;
  size_t n1 = 17;
  auto device = DeviceOrd::SyclDefault();

  int val = 42;
  HostDeviceVector<int> v0;
  v0.SetDevice(device);
  v0.Resize(n0);
  v0.Fill(val);

  HostDeviceVector<int> v1;
  v1.SetDevice(device);
  v1.Resize(n1);
  v1.Fill(val);

  v0.Extend(v1);
  {
    std::vector<int> desired_data(n0 + n1, val);
    sycl::VerifyOnDeviceData(v0.Device(), v0.ConstDevicePointer(), desired_data.data(), n0 + n1);
  }
  v1.Extend(v0);
  {
    std::vector<int> desired_data(n0 + 2 * n1, val);
    sycl::VerifyOnDeviceData(v1.Device(), v1.ConstDevicePointer(), desired_data.data(), n0 + 2 * n1);
  }
}

TEST(SyclHostDeviceVector, SetDevice) {
  std::vector<int> h_vec (2345);
  for (size_t i = 0; i < h_vec.size(); ++i) {
    h_vec[i] = i;
  }
  HostDeviceVector<int> vec (h_vec);
  auto device = DeviceOrd::SyclDefault();

  vec.SetDevice(device);
  ASSERT_EQ(vec.Size(), h_vec.size());
  auto span = vec.DeviceSpan();  // sync to device

  vec.SetDevice(DeviceOrd::CPU());  // pull back to cpu.
  ASSERT_EQ(vec.Size(), h_vec.size());
  ASSERT_EQ(vec.Device(), DeviceOrd::CPU());

  auto h_vec_1 = vec.HostVector();
  ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin()));
}

TEST(SyclHostDeviceVector, Span) {
  HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f};
  vec.SetDevice(DeviceOrd::SyclDefault());
  auto span = vec.DeviceSpan();
  ASSERT_EQ(vec.Size(), span.size());
  ASSERT_EQ(vec.DevicePointer(), span.data());
  auto const_span = vec.ConstDeviceSpan();
  ASSERT_EQ(vec.Size(), const_span.size());
  ASSERT_EQ(vec.ConstDevicePointer(), const_span.data());

  auto h_span = vec.ConstHostSpan();
  ASSERT_TRUE(vec.HostCanRead());
  ASSERT_FALSE(vec.HostCanWrite());
  ASSERT_EQ(h_span.size(), vec.Size());
  ASSERT_EQ(h_span.data(), vec.ConstHostPointer());

  h_span = vec.HostSpan();
  ASSERT_TRUE(vec.HostCanWrite());
}

TEST(SyclHostDeviceVector, Empty) {
  HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f};
  HostDeviceVector<float> another { std::move(vec) };
  ASSERT_FALSE(another.Empty());
  ASSERT_TRUE(vec.Empty());
}

TEST(SyclHostDeviceVector, Resize) {
  auto check = [&](HostDeviceVector<float> const& vec) {
    auto const& h_vec = vec.ConstHostSpan();
    for (std::size_t i = 0; i < 4; ++i) {
      ASSERT_EQ(h_vec[i], i + 1);
    }
    for (std::size_t i = 4; i < vec.Size(); ++i) {
      ASSERT_EQ(h_vec[i], 3.0);
    }
  };
  {
    HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
    vec.SetDevice(DeviceOrd::SyclDefault());
    vec.ConstDeviceSpan();
    ASSERT_TRUE(vec.DeviceCanRead());
    ASSERT_FALSE(vec.DeviceCanWrite());
    vec.DeviceSpan();
    vec.Resize(7, 3.0f);
    ASSERT_TRUE(vec.DeviceCanWrite());
    check(vec);
  }
  {
    HostDeviceVector<float> vec{{1.0f, 2.0f, 3.0f, 4.0f}, DeviceOrd::SyclDefault()};
    ASSERT_TRUE(vec.DeviceCanWrite());
    vec.Resize(7, 3.0f);
    ASSERT_TRUE(vec.DeviceCanWrite());
    check(vec);
  }
  {
    HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
    ASSERT_TRUE(vec.HostCanWrite());
    vec.Resize(7, 3.0f);
    ASSERT_TRUE(vec.HostCanWrite());
    check(vec);
  }
}
}  // anonymous namespace
} // namespace xgboost::common
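Note: taken together, the new tests pin down HostDeviceVector's lazy-synchronization contract: whichever side last requested mutable access owns the data, read-only requests can be shared, and the other side's access is revoked until the next sync. A compressed walkthrough using only the APIs exercised above (a sketch, not part of the commit):

    #include <xgboost/host_device_vector.h>

    void AccessProtocolDemo() {
      xgboost::HostDeviceVector<int> v;
      v.SetDevice(xgboost::DeviceOrd::SyclDefault());
      v.Resize(8);             // data materializes on the device:
                               //   device can read+write, host can do neither
      v.HostVector();          // mutable host view, syncs device -> host:
                               //   host can read+write, device can do neither
      v.ConstDevicePointer();  // read-only device view, syncs host -> device:
                               //   both sides may read, neither may write
      v.DevicePointer();       // mutable device view: device regains read+write
                               //   and the host loses access (what PlusOne relies on)
    }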
@ -32,10 +32,10 @@ void TestPartitioning(float sparsity, int max_bins) {
  RowSetCollection row_set_collection;
  auto& row_indices = row_set_collection.Data();
-  row_indices.Resize(&qu, num_rows);
+  row_indices.Resize(qu, num_rows);
  size_t* p_row_indices = row_indices.Data();

-  qu.submit([&](::sycl::handler& cgh) {
+  qu->submit([&](::sycl::handler& cgh) {
    cgh.parallel_for<>(::sycl::range<1>(num_rows),
                       [p_row_indices](::sycl::item<1> pid) {
      const size_t idx = pid.get_id(0);
@ -49,7 +49,7 @@ void TestPartitioning(float sparsity, int max_bins) {
  const size_t n_nodes = row_set_collection.Size();
  PartitionBuilder partition_builder;
-  partition_builder.Init(&qu, n_nodes, [&](size_t nid) {
+  partition_builder.Init(qu, n_nodes, [&](size_t nid) {
    return row_set_collection[nid].Size();
  });
@ -60,11 +60,11 @@ void TestPartitioning(float sparsity, int max_bins) {
  std::vector<int32_t> split_conditions = {2};
  partition_builder.Partition(gmat, nodes, row_set_collection,
                              split_conditions, &tree, &event);
-  qu.wait_and_throw();
+  qu->wait_and_throw();

  size_t* data_result = const_cast<size_t*>(row_set_collection[0].begin);
  partition_builder.MergeToArray(0, data_result, &event);
-  qu.wait_and_throw();
+  qu->wait_and_throw();

  bst_float split_pt = gmat.cut.Values()[split_conditions[0]];
@ -99,8 +99,8 @@ void TestPartitioning(float sparsity, int max_bins) {
  auto n_right = std::accumulate(ridx_right.begin(), ridx_right.end(), 0);

  std::vector<size_t> row_indices_host(num_rows);
-  qu.memcpy(row_indices_host.data(), row_indices.Data(), num_rows * sizeof(size_t));
-  qu.wait_and_throw();
+  qu->memcpy(row_indices_host.data(), row_indices.Data(), num_rows * sizeof(size_t));
+  qu->wait_and_throw();

  ASSERT_EQ(n_left, partition_builder.GetNLeftElems(0));
  for (size_t i = 0; i < n_left; ++i) {
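Note: the memcpy/wait pairing here is the load-bearing detail. sycl::queue::memcpy only enqueues an asynchronous copy, so the destination is not safe to read until the queue is drained, and wait_and_throw() additionally rethrows any asynchronous errors raised by earlier kernels. The standalone shape of the pattern (a sketch, assuming a SYCL toolchain; not plugin code):

    #include <sycl/sycl.hpp>
    #include <vector>

    void DrainBeforeRead(::sycl::queue* qu, const size_t* device_src, size_t n) {
      std::vector<size_t> host(n);
      qu->memcpy(host.data(), device_src, n * sizeof(size_t));  // async: only enqueued
      qu->wait_and_throw();  // block until complete and surface async errors
      // host.data() is now safe to read
    }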
@ -123,7 +123,7 @@ TEST(SyclPartitionBuilder, BasicTest) {
  DeviceManager device_manager;
  auto qu = device_manager.GetQueue(DeviceOrd::SyclDefault());
  PartitionBuilder builder;
-  builder.Init(&qu, kNodes, [&](size_t i) {
+  builder.Init(qu, kNodes, [&](size_t i) {
    return rows[i];
  });
@ -142,23 +142,23 @@ TEST(SyclPartitionBuilder, BasicTest) {
  size_t n_left = rows_for_left_node[nid];
  size_t n_right = rows[nid] - n_left;

-  qu.submit([&](::sycl::handler& cgh) {
+  qu->submit([&](::sycl::handler& cgh) {
    cgh.parallel_for<>(::sycl::range<1>(n_left), [=](::sycl::id<1> pid) {
      int row_id = first_row_id + pid[0];
      rid_buff_ptr[pid[0]] = row_id;
    });
  });
-  qu.wait();
+  qu->wait();
  first_row_id += n_left;

  // We are storing indexes for the right side in the tail of the array to save some memory
-  qu.submit([&](::sycl::handler& cgh) {
+  qu->submit([&](::sycl::handler& cgh) {
    cgh.parallel_for<>(::sycl::range<1>(n_right), [=](::sycl::id<1> pid) {
      int row_id = first_row_id + pid[0];
      rid_buff_ptr[rid_buff_size - pid[0] - 1] = row_id;
    });
  });
-  qu.wait();
+  qu->wait();
  first_row_id += n_right;

  builder.SetNLeftElems(nid, n_left);
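Note: the "tail of the array" trick in this hunk packs both partitions into a single allocation: left-side row ids grow from index 0 upward, right-side row ids are written from the last index downward (so they land in reverse order). A host-side sketch of the same layout (illustrative only; names mirror the kernels above):

    #include <cstddef>
    #include <vector>

    // Pack n_left ids from the front and n_right ids from the back of one buffer,
    // as the two device kernels above do.
    void PackBothSides(std::vector<int>* rid_buff, int first_row_id,
                       std::size_t n_left, std::size_t n_right) {
      const std::size_t rid_buff_size = rid_buff->size();
      for (std::size_t i = 0; i < n_left; ++i) {
        (*rid_buff)[i] = first_row_id + static_cast<int>(i);          // front, ascending
      }
      for (std::size_t i = 0; i < n_right; ++i) {
        (*rid_buff)[rid_buff_size - i - 1] =
            first_row_id + static_cast<int>(n_left + i);              // tail, reversed
      }
    }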
@ -170,7 +170,7 @@ TEST(SyclPartitionBuilder, BasicTest) {
  size_t row_id = 0;
  for (size_t nid = 0; nid < kNodes; ++nid) {
    builder.MergeToArray(nid, v.data(), &event);
-    qu.wait();
+    qu->wait();

    // Check that the row_ids for the left side are correct
    for (size_t j = 0; j < rows_for_left_node[nid]; ++j) {
@ -46,14 +46,15 @@ TEST(SyclObjective, LogisticRawGPair) {
}

TEST(SyclObjective, CPUvsSycl) {
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});
+  Context ctx_sycl;
+  ctx_sycl.UpdateAllowUnknown(Args{{"device", "sycl"}});
  ObjFunction * obj_sycl =
-      ObjFunction::Create("reg:squarederror_sycl", &ctx);
+      ObjFunction::Create("reg:squarederror_sycl", &ctx_sycl);

-  ctx = ctx.MakeCPU();
+  Context ctx_cpu;
+  ctx_cpu.UpdateAllowUnknown(Args{{"device", "cpu"}});
  ObjFunction * obj_cpu =
-      ObjFunction::Create("reg:squarederror", &ctx);
+      ObjFunction::Create("reg:squarederror", &ctx_cpu);

  linalg::Matrix<GradientPair> cpu_out_preds;
  linalg::Matrix<GradientPair> sycl_out_preds;
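Note: the two-Context rewrite above is more than a rename. ObjFunction::Create receives a Context pointer, and if the objective retains that pointer (the usual xgboost pattern), reassigning the single ctx via MakeCPU() after creating obj_sycl would retarget the SYCL objective behind its back; two independent contexts remove the aliasing. A toy reproduction of the hazard (stand-in types, not the real xgboost classes):

    #include <iostream>

    struct Context { bool use_sycl = false; };
    struct Objective { const Context* ctx_; };  // keeps a pointer, not a copy

    int main() {
      Context ctx{true};
      Objective obj_sycl{&ctx};   // analogous to ObjFunction::Create(..., &ctx)
      ctx = Context{false};       // analogous to ctx = ctx.MakeCPU()
      std::cout << obj_sycl.ctx_->use_sycl << '\n';  // prints 0: silently retargeted
    }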
@ -21,10 +21,10 @@ TEST(SyclRowSetCollection, AddSplits) {
  RowSetCollection row_set_collection;

  auto& row_indices = row_set_collection.Data();
-  row_indices.Resize(&qu, num_rows);
+  row_indices.Resize(qu, num_rows);
  size_t* p_row_indices = row_indices.Data();

-  qu.submit([&](::sycl::handler& cgh) {
+  qu->submit([&](::sycl::handler& cgh) {
    cgh.parallel_for<>(::sycl::range<1>(num_rows),
                       [p_row_indices](::sycl::item<1> pid) {
      const size_t idx = pid.get_id(0);