merge latest changes
This commit is contained in:
@@ -1,10 +1,7 @@
|
||||
if(PLUGIN_SYCL)
|
||||
set(CMAKE_CXX_COMPILER "icpx")
|
||||
add_library(plugin_sycl OBJECT
|
||||
${xgboost_SOURCE_DIR}/plugin/sycl/objective/regression_obj.cc
|
||||
${xgboost_SOURCE_DIR}/plugin/sycl/objective/multiclass_obj.cc
|
||||
${xgboost_SOURCE_DIR}/plugin/sycl/device_manager.cc
|
||||
${xgboost_SOURCE_DIR}/plugin/sycl/predictor/predictor.cc)
|
||||
file(GLOB_RECURSE SYCL_SOURCES "sycl/*.cc")
|
||||
add_library(plugin_sycl OBJECT ${SYCL_SOURCES})
|
||||
target_include_directories(plugin_sycl
|
||||
PRIVATE
|
||||
${xgboost_SOURCE_DIR}/include
|
||||
|
||||
@@ -21,6 +21,9 @@
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
#include "../data.h"
|
||||
#include "row_set.h"
|
||||
#include "../data/gradient_index.h"
|
||||
#include "../tree/expand_entry.h"
|
||||
|
||||
#include <CL/sycl.hpp>
|
||||
|
||||
@@ -28,6 +31,87 @@ namespace xgboost {
|
||||
namespace sycl {
|
||||
namespace common {
|
||||
|
||||
// split row indexes (rid_span) to 2 parts (both stored in rid_buf) depending
|
||||
// on comparison of indexes values (idx_span) and split point (split_cond)
|
||||
// Handle dense columns
|
||||
template <bool default_left, typename BinIdxType>
|
||||
inline ::sycl::event PartitionDenseKernel(
|
||||
::sycl::queue* qu,
|
||||
const GHistIndexMatrix& gmat,
|
||||
const RowSetCollection::Elem& rid_span,
|
||||
const size_t fid,
|
||||
const int32_t split_cond,
|
||||
xgboost::common::Span<size_t>* rid_buf,
|
||||
size_t* parts_size,
|
||||
::sycl::event event) {
|
||||
const size_t row_stride = gmat.row_stride;
|
||||
const BinIdxType* gradient_index = gmat.index.data<BinIdxType>();
|
||||
const size_t* rid = rid_span.begin;
|
||||
const size_t range_size = rid_span.Size();
|
||||
const size_t offset = gmat.cut.Ptrs()[fid];
|
||||
|
||||
size_t* p_rid_buf = rid_buf->data();
|
||||
|
||||
return qu->submit([&](::sycl::handler& cgh) {
|
||||
cgh.depends_on(event);
|
||||
cgh.parallel_for<>(::sycl::range<1>(range_size), [=](::sycl::item<1> nid) {
|
||||
const size_t id = rid[nid.get_id(0)];
|
||||
const int32_t value = static_cast<int32_t>(gradient_index[id * row_stride + fid] + offset);
|
||||
const bool is_left = value <= split_cond;
|
||||
if (is_left) {
|
||||
AtomicRef<size_t> n_left(parts_size[0]);
|
||||
p_rid_buf[n_left.fetch_add(1)] = id;
|
||||
} else {
|
||||
AtomicRef<size_t> n_right(parts_size[1]);
|
||||
p_rid_buf[range_size - n_right.fetch_add(1) - 1] = id;
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// split row indexes (rid_span) to 2 parts (both stored in rid_buf) depending
|
||||
// on comparison of indexes values (idx_span) and split point (split_cond)
|
||||
// Handle sparce columns
|
||||
template <bool default_left, typename BinIdxType>
|
||||
inline ::sycl::event PartitionSparseKernel(::sycl::queue* qu,
|
||||
const GHistIndexMatrix& gmat,
|
||||
const RowSetCollection::Elem& rid_span,
|
||||
const size_t fid,
|
||||
const int32_t split_cond,
|
||||
xgboost::common::Span<size_t>* rid_buf,
|
||||
size_t* parts_size,
|
||||
::sycl::event event) {
|
||||
const size_t row_stride = gmat.row_stride;
|
||||
const BinIdxType* gradient_index = gmat.index.data<BinIdxType>();
|
||||
const size_t* rid = rid_span.begin;
|
||||
const size_t range_size = rid_span.Size();
|
||||
const uint32_t* cut_ptrs = gmat.cut_device.Ptrs().DataConst();
|
||||
|
||||
size_t* p_rid_buf = rid_buf->data();
|
||||
return qu->submit([&](::sycl::handler& cgh) {
|
||||
cgh.depends_on(event);
|
||||
cgh.parallel_for<>(::sycl::range<1>(range_size), [=](::sycl::item<1> nid) {
|
||||
const size_t id = rid[nid.get_id(0)];
|
||||
|
||||
const BinIdxType* gr_index_local = gradient_index + row_stride * id;
|
||||
const int32_t fid_local = std::lower_bound(gr_index_local,
|
||||
gr_index_local + row_stride,
|
||||
cut_ptrs[fid]) - gr_index_local;
|
||||
const bool is_left = (fid_local >= row_stride ||
|
||||
gr_index_local[fid_local] >= cut_ptrs[fid + 1]) ?
|
||||
default_left :
|
||||
gr_index_local[fid_local] <= split_cond;
|
||||
if (is_left) {
|
||||
AtomicRef<size_t> n_left(parts_size[0]);
|
||||
p_rid_buf[n_left.fetch_add(1)] = id;
|
||||
} else {
|
||||
AtomicRef<size_t> n_right(parts_size[1]);
|
||||
p_rid_buf[range_size - n_right.fetch_add(1) - 1] = id;
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// The builder is required for samples partition to left and rights children for set of nodes
|
||||
class PartitionBuilder {
|
||||
public:
|
||||
@@ -53,7 +137,6 @@ class PartitionBuilder {
|
||||
return result_rows_[2 * nid];
|
||||
}
|
||||
|
||||
|
||||
size_t GetNRightElems(int nid) const {
|
||||
return result_rows_[2 * nid + 1];
|
||||
}
|
||||
@@ -72,19 +155,97 @@ class PartitionBuilder {
|
||||
return { data_.Data() + nodes_offsets_[nid], nodes_offsets_[nid + 1] - nodes_offsets_[nid] };
|
||||
}
|
||||
|
||||
template <typename BinIdxType>
|
||||
::sycl::event Partition(const int32_t split_cond,
|
||||
const GHistIndexMatrix& gmat,
|
||||
const RowSetCollection::Elem& rid_span,
|
||||
const xgboost::RegTree::Node& node,
|
||||
xgboost::common::Span<size_t>* rid_buf,
|
||||
size_t* parts_size,
|
||||
::sycl::event event) {
|
||||
const bst_uint fid = node.SplitIndex();
|
||||
const bool default_left = node.DefaultLeft();
|
||||
|
||||
if (gmat.IsDense()) {
|
||||
if (default_left) {
|
||||
return PartitionDenseKernel<true, BinIdxType>(qu_, gmat, rid_span, fid,
|
||||
split_cond, rid_buf, parts_size, event);
|
||||
} else {
|
||||
return PartitionDenseKernel<false, BinIdxType>(qu_, gmat, rid_span, fid,
|
||||
split_cond, rid_buf, parts_size, event);
|
||||
}
|
||||
} else {
|
||||
if (default_left) {
|
||||
return PartitionSparseKernel<true, BinIdxType>(qu_, gmat, rid_span, fid,
|
||||
split_cond, rid_buf, parts_size, event);
|
||||
} else {
|
||||
return PartitionSparseKernel<false, BinIdxType>(qu_, gmat, rid_span, fid,
|
||||
split_cond, rid_buf, parts_size, event);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Entry point for Partition
|
||||
void Partition(const GHistIndexMatrix& gmat,
|
||||
const std::vector<tree::ExpandEntry> nodes,
|
||||
const RowSetCollection& row_set_collection,
|
||||
const std::vector<int32_t>& split_conditions,
|
||||
RegTree* p_tree,
|
||||
::sycl::event* general_event) {
|
||||
nodes_events_.resize(n_nodes_);
|
||||
|
||||
parts_size_.ResizeAndFill(qu_, 2 * n_nodes_, 0, general_event);
|
||||
|
||||
for (size_t node_in_set = 0; node_in_set < n_nodes_; node_in_set++) {
|
||||
const int32_t nid = nodes[node_in_set].nid;
|
||||
::sycl::event& node_event = nodes_events_[node_in_set];
|
||||
const auto& rid_span = row_set_collection[nid];
|
||||
if (rid_span.Size() > 0) {
|
||||
const RegTree::Node& node = (*p_tree)[nid];
|
||||
xgboost::common::Span<size_t> rid_buf = GetData(node_in_set);
|
||||
size_t* part_size = parts_size_.Data() + 2 * node_in_set;
|
||||
int32_t split_condition = split_conditions[node_in_set];
|
||||
switch (gmat.index.GetBinTypeSize()) {
|
||||
case common::BinTypeSize::kUint8BinsTypeSize:
|
||||
node_event = Partition<uint8_t>(split_condition, gmat, rid_span, node,
|
||||
&rid_buf, part_size, *general_event);
|
||||
break;
|
||||
case common::BinTypeSize::kUint16BinsTypeSize:
|
||||
node_event = Partition<uint16_t>(split_condition, gmat, rid_span, node,
|
||||
&rid_buf, part_size, *general_event);
|
||||
break;
|
||||
case common::BinTypeSize::kUint32BinsTypeSize:
|
||||
node_event = Partition<uint32_t>(split_condition, gmat, rid_span, node,
|
||||
&rid_buf, part_size, *general_event);
|
||||
break;
|
||||
default:
|
||||
CHECK(false); // no default behavior
|
||||
}
|
||||
} else {
|
||||
node_event = ::sycl::event();
|
||||
}
|
||||
}
|
||||
|
||||
*general_event = qu_->memcpy(result_rows_.data(),
|
||||
parts_size_.DataConst(),
|
||||
sizeof(size_t) * 2 * n_nodes_,
|
||||
nodes_events_);
|
||||
}
|
||||
|
||||
void MergeToArray(size_t nid,
|
||||
size_t* data_result,
|
||||
::sycl::event event) {
|
||||
::sycl::event* event) {
|
||||
size_t n_nodes_total = GetNLeftElems(nid) + GetNRightElems(nid);
|
||||
if (n_nodes_total > 0) {
|
||||
const size_t* data = data_.Data() + nodes_offsets_[nid];
|
||||
qu_->memcpy(data_result, data, sizeof(size_t) * n_nodes_total, event);
|
||||
qu_->memcpy(data_result, data, sizeof(size_t) * n_nodes_total, *event);
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
std::vector<size_t> nodes_offsets_;
|
||||
std::vector<size_t> result_rows_;
|
||||
std::vector<::sycl::event> nodes_events_;
|
||||
size_t n_nodes_;
|
||||
|
||||
USMVector<size_t, MemoryType::on_device> parts_size_;
|
||||
|
||||
123
plugin/sycl/common/row_set.h
Normal file
123
plugin/sycl/common/row_set.h
Normal file
@@ -0,0 +1,123 @@
|
||||
/*!
|
||||
* Copyright 2017-2023 XGBoost contributors
|
||||
*/
|
||||
#ifndef PLUGIN_SYCL_COMMON_ROW_SET_H_
|
||||
#define PLUGIN_SYCL_COMMON_ROW_SET_H_
|
||||
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
|
||||
#pragma GCC diagnostic ignored "-W#pragma-messages"
|
||||
#include <xgboost/data.h>
|
||||
#pragma GCC diagnostic pop
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
|
||||
#include "../data.h"
|
||||
|
||||
#include <CL/sycl.hpp>
|
||||
|
||||
namespace xgboost {
|
||||
namespace sycl {
|
||||
namespace common {
|
||||
|
||||
|
||||
/*! \brief Collection of rowsets stored on device in USM memory */
|
||||
class RowSetCollection {
|
||||
public:
|
||||
/*! \brief data structure to store an instance set, a subset of
|
||||
* rows (instances) associated with a particular node in a decision
|
||||
* tree. */
|
||||
struct Elem {
|
||||
const size_t* begin{nullptr};
|
||||
const size_t* end{nullptr};
|
||||
bst_node_t node_id{-1}; // id of node associated with this instance set; -1 means uninitialized
|
||||
Elem()
|
||||
= default;
|
||||
Elem(const size_t* begin,
|
||||
const size_t* end,
|
||||
bst_node_t node_id = -1)
|
||||
: begin(begin), end(end), node_id(node_id) {}
|
||||
|
||||
|
||||
inline size_t Size() const {
|
||||
return end - begin;
|
||||
}
|
||||
};
|
||||
|
||||
inline size_t Size() const {
|
||||
return elem_of_each_node_.size();
|
||||
}
|
||||
|
||||
/*! \brief return corresponding element set given the node_id */
|
||||
inline const Elem& operator[](unsigned node_id) const {
|
||||
const Elem& e = elem_of_each_node_[node_id];
|
||||
CHECK(e.begin != nullptr)
|
||||
<< "access element that is not in the set";
|
||||
return e;
|
||||
}
|
||||
|
||||
/*! \brief return corresponding element set given the node_id */
|
||||
inline Elem& operator[](unsigned node_id) {
|
||||
Elem& e = elem_of_each_node_[node_id];
|
||||
return e;
|
||||
}
|
||||
|
||||
// clear up things
|
||||
inline void Clear() {
|
||||
elem_of_each_node_.clear();
|
||||
}
|
||||
// initialize node id 0->everything
|
||||
inline void Init() {
|
||||
CHECK_EQ(elem_of_each_node_.size(), 0U);
|
||||
|
||||
const size_t* begin = row_indices_.Begin();
|
||||
const size_t* end = row_indices_.End();
|
||||
elem_of_each_node_.emplace_back(Elem(begin, end, 0));
|
||||
}
|
||||
|
||||
auto& Data() { return row_indices_; }
|
||||
|
||||
// split rowset into two
|
||||
inline void AddSplit(unsigned node_id,
|
||||
unsigned left_node_id,
|
||||
unsigned right_node_id,
|
||||
size_t n_left,
|
||||
size_t n_right) {
|
||||
const Elem e = elem_of_each_node_[node_id];
|
||||
CHECK(e.begin != nullptr);
|
||||
size_t* all_begin = row_indices_.Begin();
|
||||
size_t* begin = all_begin + (e.begin - all_begin);
|
||||
|
||||
|
||||
CHECK_EQ(n_left + n_right, e.Size());
|
||||
CHECK_LE(begin + n_left, e.end);
|
||||
CHECK_EQ(begin + n_left + n_right, e.end);
|
||||
|
||||
|
||||
if (left_node_id >= elem_of_each_node_.size()) {
|
||||
elem_of_each_node_.resize(left_node_id + 1, Elem(nullptr, nullptr, -1));
|
||||
}
|
||||
if (right_node_id >= elem_of_each_node_.size()) {
|
||||
elem_of_each_node_.resize(right_node_id + 1, Elem(nullptr, nullptr, -1));
|
||||
}
|
||||
|
||||
|
||||
elem_of_each_node_[left_node_id] = Elem(begin, begin + n_left, left_node_id);
|
||||
elem_of_each_node_[right_node_id] = Elem(begin + n_left, e.end, right_node_id);
|
||||
elem_of_each_node_[node_id] = Elem(nullptr, nullptr, -1);
|
||||
}
|
||||
|
||||
private:
|
||||
// stores the row indexes in the set
|
||||
USMVector<size_t, MemoryType::on_device> row_indices_;
|
||||
// vector: node_id -> elements
|
||||
std::vector<Elem> elem_of_each_node_;
|
||||
};
|
||||
|
||||
} // namespace common
|
||||
} // namespace sycl
|
||||
} // namespace xgboost
|
||||
|
||||
|
||||
#endif // PLUGIN_SYCL_COMMON_ROW_SET_H_
|
||||
@@ -26,8 +26,13 @@
|
||||
|
||||
namespace xgboost {
|
||||
namespace sycl {
|
||||
enum class MemoryType { shared, on_device};
|
||||
template <typename T>
|
||||
using AtomicRef = ::sycl::atomic_ref<T,
|
||||
::sycl::memory_order::relaxed,
|
||||
::sycl::memory_scope::device,
|
||||
::sycl::access::address_space::ext_intel_global_device_space>;
|
||||
|
||||
enum class MemoryType { shared, on_device};
|
||||
|
||||
template <typename T>
|
||||
class USMDeleter {
|
||||
@@ -166,20 +171,20 @@ class USMVector {
|
||||
}
|
||||
}
|
||||
|
||||
::sycl::event ResizeAndFill(::sycl::queue* qu, size_t size_new, int v) {
|
||||
void ResizeAndFill(::sycl::queue* qu, size_t size_new, int v, ::sycl::event* event) {
|
||||
if (size_new <= size_) {
|
||||
size_ = size_new;
|
||||
return qu->memset(data_.get(), v, size_new * sizeof(T));
|
||||
*event = qu->memset(data_.get(), v, size_new * sizeof(T), *event);
|
||||
} else if (size_new <= capacity_) {
|
||||
size_ = size_new;
|
||||
return qu->memset(data_.get(), v, size_new * sizeof(T));
|
||||
*event = qu->memset(data_.get(), v, size_new * sizeof(T), *event);
|
||||
} else {
|
||||
size_t size_old = size_;
|
||||
auto data_old = data_;
|
||||
size_ = size_new;
|
||||
capacity_ = size_new;
|
||||
data_ = allocate_memory_(qu, size_);
|
||||
return qu->memset(data_.get(), v, size_new * sizeof(T));
|
||||
*event = qu->memset(data_.get(), v, size_new * sizeof(T), *event);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -206,11 +211,16 @@ class USMVector {
|
||||
struct DeviceMatrix {
|
||||
DMatrix* p_mat; // Pointer to the original matrix on the host
|
||||
::sycl::queue qu_;
|
||||
USMVector<size_t> row_ptr;
|
||||
USMVector<size_t, MemoryType::on_device> row_ptr;
|
||||
USMVector<Entry, MemoryType::on_device> data;
|
||||
size_t total_offset;
|
||||
|
||||
DeviceMatrix(::sycl::queue qu, DMatrix* dmat) : p_mat(dmat), qu_(qu) {
|
||||
DeviceMatrix() = default;
|
||||
|
||||
void Init(::sycl::queue qu, DMatrix* dmat) {
|
||||
qu_ = qu;
|
||||
p_mat = dmat;
|
||||
|
||||
size_t num_row = 0;
|
||||
size_t num_nonzero = 0;
|
||||
for (auto &batch : dmat->GetBatches<SparsePage>()) {
|
||||
@@ -221,27 +231,41 @@ struct DeviceMatrix {
|
||||
}
|
||||
|
||||
row_ptr.Resize(&qu_, num_row + 1);
|
||||
size_t* rows = row_ptr.Data();
|
||||
data.Resize(&qu_, num_nonzero);
|
||||
|
||||
size_t data_offset = 0;
|
||||
::sycl::event event;
|
||||
for (auto &batch : dmat->GetBatches<SparsePage>()) {
|
||||
const auto& data_vec = batch.data.HostVector();
|
||||
const auto& offset_vec = batch.offset.HostVector();
|
||||
size_t batch_size = batch.Size();
|
||||
if (batch_size > 0) {
|
||||
std::copy(offset_vec.data(), offset_vec.data() + batch_size,
|
||||
row_ptr.Data() + batch.base_rowid);
|
||||
if (batch.base_rowid > 0) {
|
||||
for (size_t i = 0; i < batch_size; i++)
|
||||
row_ptr[i + batch.base_rowid] += batch.base_rowid;
|
||||
const auto base_rowid = batch.base_rowid;
|
||||
event = qu.memcpy(row_ptr.Data() + base_rowid, offset_vec.data(),
|
||||
sizeof(size_t) * batch_size, event);
|
||||
if (base_rowid > 0) {
|
||||
qu.submit([&](::sycl::handler& cgh) {
|
||||
cgh.depends_on(event);
|
||||
cgh.parallel_for<>(::sycl::range<1>(batch_size), [=](::sycl::id<1> pid) {
|
||||
int row_id = pid[0];
|
||||
rows[row_id] += base_rowid;
|
||||
});
|
||||
});
|
||||
}
|
||||
qu.memcpy(data.Data() + data_offset,
|
||||
data_vec.data(),
|
||||
offset_vec[batch_size] * sizeof(Entry)).wait();
|
||||
event = qu.memcpy(data.Data() + data_offset, data_vec.data(),
|
||||
sizeof(Entry) * offset_vec[batch_size], event);
|
||||
data_offset += offset_vec[batch_size];
|
||||
qu.wait();
|
||||
}
|
||||
}
|
||||
row_ptr[num_row] = data_offset;
|
||||
qu.submit([&](::sycl::handler& cgh) {
|
||||
cgh.depends_on(event);
|
||||
cgh.single_task<>([=] {
|
||||
rows[num_row] = data_offset;
|
||||
});
|
||||
});
|
||||
qu.wait();
|
||||
total_offset = data_offset;
|
||||
}
|
||||
|
||||
|
||||
177
plugin/sycl/data/gradient_index.cc
Normal file
177
plugin/sycl/data/gradient_index.cc
Normal file
@@ -0,0 +1,177 @@
|
||||
/*!
|
||||
* Copyright 2017-2024 by Contributors
|
||||
* \file gradient_index.cc
|
||||
*/
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
#include <algorithm>
|
||||
|
||||
#include "gradient_index.h"
|
||||
|
||||
#include <CL/sycl.hpp>
|
||||
|
||||
namespace xgboost {
|
||||
namespace sycl {
|
||||
namespace common {
|
||||
|
||||
uint32_t SearchBin(const bst_float* cut_values, const uint32_t* cut_ptrs, Entry const& e) {
|
||||
auto beg = cut_ptrs[e.index];
|
||||
auto end = cut_ptrs[e.index + 1];
|
||||
auto it = std::upper_bound(cut_values + beg, cut_values + end, e.fvalue);
|
||||
uint32_t idx = it - cut_values;
|
||||
if (idx == end) {
|
||||
idx -= 1;
|
||||
}
|
||||
return idx;
|
||||
}
|
||||
|
||||
template <typename BinIdxType>
|
||||
void mergeSort(BinIdxType* begin, BinIdxType* end, BinIdxType* buf) {
|
||||
const size_t total_len = end - begin;
|
||||
for (size_t block_len = 1; block_len < total_len; block_len <<= 1) {
|
||||
for (size_t cur_block = 0; cur_block + block_len < total_len; cur_block += 2 * block_len) {
|
||||
size_t start = cur_block;
|
||||
size_t mid = start + block_len;
|
||||
size_t finish = mid + block_len < total_len ? mid + block_len : total_len;
|
||||
size_t left_pos = start;
|
||||
size_t right_pos = mid;
|
||||
size_t pos = start;
|
||||
while (left_pos < mid || right_pos < finish) {
|
||||
if (left_pos < mid && (right_pos == finish || begin[left_pos] < begin[right_pos])) {
|
||||
buf[pos++] = begin[left_pos++];
|
||||
} else {
|
||||
buf[pos++] = begin[right_pos++];
|
||||
}
|
||||
}
|
||||
for (size_t i = start; i < finish; i++) begin[i] = buf[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename BinIdxType>
|
||||
void GHistIndexMatrix::SetIndexData(::sycl::queue qu,
|
||||
BinIdxType* index_data,
|
||||
const DeviceMatrix &dmat,
|
||||
size_t nbins,
|
||||
size_t row_stride,
|
||||
uint32_t* offsets) {
|
||||
if (nbins == 0) return;
|
||||
const xgboost::Entry *data_ptr = dmat.data.DataConst();
|
||||
const bst_row_t *offset_vec = dmat.row_ptr.DataConst();
|
||||
const size_t num_rows = dmat.row_ptr.Size() - 1;
|
||||
const bst_float* cut_values = cut_device.Values().DataConst();
|
||||
const uint32_t* cut_ptrs = cut_device.Ptrs().DataConst();
|
||||
size_t* hit_count_ptr = hit_count_buff.Data();
|
||||
|
||||
// Sparse case only
|
||||
if (!offsets) {
|
||||
// sort_buff has type uint8_t
|
||||
sort_buff.Resize(&qu, num_rows * row_stride * sizeof(BinIdxType));
|
||||
}
|
||||
BinIdxType* sort_data = reinterpret_cast<BinIdxType*>(sort_buff.Data());
|
||||
|
||||
auto event = qu.submit([&](::sycl::handler& cgh) {
|
||||
cgh.parallel_for<>(::sycl::range<1>(num_rows), [=](::sycl::item<1> pid) {
|
||||
const size_t i = pid.get_id(0);
|
||||
const size_t ibegin = offset_vec[i];
|
||||
const size_t iend = offset_vec[i + 1];
|
||||
const size_t size = iend - ibegin;
|
||||
const size_t start = i * row_stride;
|
||||
for (bst_uint j = 0; j < size; ++j) {
|
||||
uint32_t idx = SearchBin(cut_values, cut_ptrs, data_ptr[ibegin + j]);
|
||||
index_data[start + j] = offsets ? idx - offsets[j] : idx;
|
||||
AtomicRef<size_t> hit_count_ref(hit_count_ptr[idx]);
|
||||
hit_count_ref.fetch_add(1);
|
||||
}
|
||||
if (!offsets) {
|
||||
// Sparse case only
|
||||
mergeSort<BinIdxType>(index_data + start, index_data + start + size, sort_data + start);
|
||||
for (bst_uint j = size; j < row_stride; ++j) {
|
||||
index_data[start + j] = nbins;
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
qu.memcpy(hit_count.data(), hit_count_ptr, nbins * sizeof(size_t), event);
|
||||
qu.wait();
|
||||
}
|
||||
|
||||
void GHistIndexMatrix::ResizeIndex(size_t n_index, bool isDense) {
|
||||
if ((max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) && isDense) {
|
||||
index.SetBinTypeSize(BinTypeSize::kUint8BinsTypeSize);
|
||||
index.Resize((sizeof(uint8_t)) * n_index);
|
||||
} else if ((max_num_bins - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
|
||||
max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) && isDense) {
|
||||
index.SetBinTypeSize(BinTypeSize::kUint16BinsTypeSize);
|
||||
index.Resize((sizeof(uint16_t)) * n_index);
|
||||
} else {
|
||||
index.SetBinTypeSize(BinTypeSize::kUint32BinsTypeSize);
|
||||
index.Resize((sizeof(uint32_t)) * n_index);
|
||||
}
|
||||
}
|
||||
|
||||
void GHistIndexMatrix::Init(::sycl::queue qu,
|
||||
Context const * ctx,
|
||||
const DeviceMatrix& p_fmat_device,
|
||||
int max_bins) {
|
||||
nfeatures = p_fmat_device.p_mat->Info().num_col_;
|
||||
|
||||
cut = xgboost::common::SketchOnDMatrix(ctx, p_fmat_device.p_mat, max_bins);
|
||||
cut_device.Init(qu, cut);
|
||||
|
||||
max_num_bins = max_bins;
|
||||
const uint32_t nbins = cut.Ptrs().back();
|
||||
this->nbins = nbins;
|
||||
hit_count.resize(nbins, 0);
|
||||
hit_count_buff.Resize(&qu, nbins, 0);
|
||||
|
||||
this->p_fmat = p_fmat_device.p_mat;
|
||||
const bool isDense = p_fmat_device.p_mat->IsDense();
|
||||
this->isDense_ = isDense;
|
||||
|
||||
index.setQueue(qu);
|
||||
|
||||
row_stride = 0;
|
||||
for (const auto& batch : p_fmat_device.p_mat->GetBatches<SparsePage>()) {
|
||||
const auto& row_offset = batch.offset.ConstHostVector();
|
||||
for (auto i = 1ull; i < row_offset.size(); i++) {
|
||||
row_stride = std::max(row_stride, static_cast<size_t>(row_offset[i] - row_offset[i - 1]));
|
||||
}
|
||||
}
|
||||
|
||||
const size_t n_offsets = cut_device.Ptrs().Size() - 1;
|
||||
const size_t n_rows = p_fmat_device.row_ptr.Size() - 1;
|
||||
const size_t n_index = n_rows * row_stride;
|
||||
ResizeIndex(n_index, isDense);
|
||||
|
||||
CHECK_GT(cut_device.Values().Size(), 0U);
|
||||
|
||||
uint32_t* offsets = nullptr;
|
||||
if (isDense) {
|
||||
index.ResizeOffset(n_offsets);
|
||||
offsets = index.Offset();
|
||||
qu.memcpy(offsets, cut_device.Ptrs().DataConst(),
|
||||
sizeof(uint32_t) * n_offsets).wait_and_throw();
|
||||
}
|
||||
|
||||
if (isDense) {
|
||||
BinTypeSize curent_bin_size = index.GetBinTypeSize();
|
||||
if (curent_bin_size == BinTypeSize::kUint8BinsTypeSize) {
|
||||
SetIndexData(qu, index.data<uint8_t>(), p_fmat_device, nbins, row_stride, offsets);
|
||||
|
||||
} else if (curent_bin_size == BinTypeSize::kUint16BinsTypeSize) {
|
||||
SetIndexData(qu, index.data<uint16_t>(), p_fmat_device, nbins, row_stride, offsets);
|
||||
} else {
|
||||
CHECK_EQ(curent_bin_size, BinTypeSize::kUint32BinsTypeSize);
|
||||
SetIndexData(qu, index.data<uint32_t>(), p_fmat_device, nbins, row_stride, offsets);
|
||||
}
|
||||
/* For sparse DMatrix we have to store index of feature for each bin
|
||||
in index field to chose right offset. So offset is nullptr and index is not reduced */
|
||||
} else {
|
||||
SetIndexData(qu, index.data<uint32_t>(), p_fmat_device, nbins, row_stride, offsets);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace common
|
||||
} // namespace sycl
|
||||
} // namespace xgboost
|
||||
216
plugin/sycl/data/gradient_index.h
Normal file
216
plugin/sycl/data/gradient_index.h
Normal file
@@ -0,0 +1,216 @@
|
||||
/*!
|
||||
* Copyright 2017-2024 by Contributors
|
||||
* \file gradient_index.h
|
||||
*/
|
||||
#ifndef PLUGIN_SYCL_DATA_GRADIENT_INDEX_H_
|
||||
#define PLUGIN_SYCL_DATA_GRADIENT_INDEX_H_
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "../data.h"
|
||||
#include "../../src/common/hist_util.h"
|
||||
|
||||
#include <CL/sycl.hpp>
|
||||
|
||||
namespace xgboost {
|
||||
namespace sycl {
|
||||
namespace common {
|
||||
|
||||
/*!
|
||||
* \brief SYCL implementation of HistogramCuts stored in USM buffers to provide access from device kernels
|
||||
*/
|
||||
class HistogramCuts {
|
||||
protected:
|
||||
using BinIdx = uint32_t;
|
||||
|
||||
public:
|
||||
HistogramCuts() {}
|
||||
|
||||
explicit HistogramCuts(::sycl::queue qu) {}
|
||||
|
||||
~HistogramCuts() {
|
||||
}
|
||||
|
||||
void Init(::sycl::queue qu, xgboost::common::HistogramCuts const& cuts) {
|
||||
qu_ = qu;
|
||||
cut_values_.Init(&qu_, cuts.cut_values_.HostVector());
|
||||
cut_ptrs_.Init(&qu_, cuts.cut_ptrs_.HostVector());
|
||||
min_vals_.Init(&qu_, cuts.min_vals_.HostVector());
|
||||
}
|
||||
|
||||
// Getters for USM buffers to pass pointers into device kernels
|
||||
const USMVector<uint32_t>& Ptrs() const { return cut_ptrs_; }
|
||||
const USMVector<float>& Values() const { return cut_values_; }
|
||||
const USMVector<float>& MinValues() const { return min_vals_; }
|
||||
|
||||
private:
|
||||
USMVector<bst_float> cut_values_;
|
||||
USMVector<uint32_t> cut_ptrs_;
|
||||
USMVector<float> min_vals_;
|
||||
::sycl::queue qu_;
|
||||
};
|
||||
|
||||
using BinTypeSize = ::xgboost::common::BinTypeSize;
|
||||
|
||||
/*!
|
||||
* \brief Index data and offsets stored in USM buffers to provide access from device kernels
|
||||
*/
|
||||
struct Index {
|
||||
Index() {
|
||||
SetBinTypeSize(binTypeSize_);
|
||||
}
|
||||
Index(const Index& i) = delete;
|
||||
Index& operator=(Index i) = delete;
|
||||
Index(Index&& i) = delete;
|
||||
Index& operator=(Index&& i) = delete;
|
||||
uint32_t operator[](size_t i) const {
|
||||
if (!offset_.Empty()) {
|
||||
return func_(data_.DataConst(), i) + offset_[i%p_];
|
||||
} else {
|
||||
return func_(data_.DataConst(), i);
|
||||
}
|
||||
}
|
||||
void SetBinTypeSize(BinTypeSize binTypeSize) {
|
||||
binTypeSize_ = binTypeSize;
|
||||
switch (binTypeSize) {
|
||||
case BinTypeSize::kUint8BinsTypeSize:
|
||||
func_ = &GetValueFromUint8;
|
||||
break;
|
||||
case BinTypeSize::kUint16BinsTypeSize:
|
||||
func_ = &GetValueFromUint16;
|
||||
break;
|
||||
case BinTypeSize::kUint32BinsTypeSize:
|
||||
func_ = &GetValueFromUint32;
|
||||
break;
|
||||
default:
|
||||
CHECK(binTypeSize == BinTypeSize::kUint8BinsTypeSize ||
|
||||
binTypeSize == BinTypeSize::kUint16BinsTypeSize ||
|
||||
binTypeSize == BinTypeSize::kUint32BinsTypeSize);
|
||||
}
|
||||
}
|
||||
BinTypeSize GetBinTypeSize() const {
|
||||
return binTypeSize_;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
T* data() {
|
||||
return reinterpret_cast<T*>(data_.Data());
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
const T* data() const {
|
||||
return reinterpret_cast<const T*>(data_.DataConst());
|
||||
}
|
||||
|
||||
uint32_t* Offset() {
|
||||
return offset_.Data();
|
||||
}
|
||||
|
||||
const uint32_t* Offset() const {
|
||||
return offset_.DataConst();
|
||||
}
|
||||
|
||||
size_t Size() const {
|
||||
return data_.Size() / (binTypeSize_);
|
||||
}
|
||||
|
||||
void Resize(const size_t nBytesData) {
|
||||
data_.Resize(&qu_, nBytesData);
|
||||
}
|
||||
|
||||
void ResizeOffset(const size_t nDisps) {
|
||||
offset_.Resize(&qu_, nDisps);
|
||||
p_ = nDisps;
|
||||
}
|
||||
|
||||
uint8_t* begin() const {
|
||||
return data_.Begin();
|
||||
}
|
||||
|
||||
uint8_t* end() const {
|
||||
return data_.End();
|
||||
}
|
||||
|
||||
void setQueue(::sycl::queue qu) {
|
||||
qu_ = qu;
|
||||
}
|
||||
|
||||
private:
|
||||
static uint32_t GetValueFromUint8(const uint8_t* t, size_t i) {
|
||||
return reinterpret_cast<const uint8_t*>(t)[i];
|
||||
}
|
||||
static uint32_t GetValueFromUint16(const uint8_t* t, size_t i) {
|
||||
return reinterpret_cast<const uint16_t*>(t)[i];
|
||||
}
|
||||
static uint32_t GetValueFromUint32(const uint8_t* t, size_t i) {
|
||||
return reinterpret_cast<const uint32_t*>(t)[i];
|
||||
}
|
||||
|
||||
using Func = uint32_t (*)(const uint8_t*, size_t);
|
||||
|
||||
USMVector<uint8_t, MemoryType::on_device> data_;
|
||||
// size of this field is equal to number of features
|
||||
USMVector<uint32_t, MemoryType::on_device> offset_;
|
||||
BinTypeSize binTypeSize_ {BinTypeSize::kUint8BinsTypeSize};
|
||||
size_t p_ {1};
|
||||
Func func_;
|
||||
|
||||
::sycl::queue qu_;
|
||||
};
|
||||
|
||||
/*!
|
||||
* \brief Preprocessed global index matrix, in CSR format, stored in USM buffers
|
||||
*
|
||||
* Transform floating values to integer index in histogram
|
||||
*/
|
||||
struct GHistIndexMatrix {
|
||||
/*! \brief row pointer to rows by element position */
|
||||
/*! \brief The index data */
|
||||
Index index;
|
||||
/*! \brief hit count of each index */
|
||||
std::vector<size_t> hit_count;
|
||||
/*! \brief buffers for calculations */
|
||||
USMVector<size_t, MemoryType::on_device> hit_count_buff;
|
||||
USMVector<uint8_t, MemoryType::on_device> sort_buff;
|
||||
/*! \brief The corresponding cuts */
|
||||
xgboost::common::HistogramCuts cut;
|
||||
HistogramCuts cut_device;
|
||||
DMatrix* p_fmat;
|
||||
size_t max_num_bins;
|
||||
size_t nbins;
|
||||
size_t nfeatures;
|
||||
size_t row_stride;
|
||||
|
||||
// Create a global histogram matrix based on a given DMatrix device wrapper
|
||||
void Init(::sycl::queue qu, Context const * ctx,
|
||||
const sycl::DeviceMatrix& p_fmat_device, int max_num_bins);
|
||||
|
||||
template <typename BinIdxType>
|
||||
void SetIndexData(::sycl::queue qu, BinIdxType* index_data,
|
||||
const sycl::DeviceMatrix &dmat_device,
|
||||
size_t nbins, size_t row_stride, uint32_t* offsets);
|
||||
|
||||
void ResizeIndex(size_t n_index, bool isDense);
|
||||
|
||||
inline void GetFeatureCounts(size_t* counts) const {
|
||||
auto nfeature = cut_device.Ptrs().Size() - 1;
|
||||
for (unsigned fid = 0; fid < nfeature; ++fid) {
|
||||
auto ibegin = cut_device.Ptrs()[fid];
|
||||
auto iend = cut_device.Ptrs()[fid + 1];
|
||||
for (auto i = ibegin; i < iend; ++i) {
|
||||
*(counts + fid) += hit_count[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
inline bool IsDense() const {
|
||||
return isDense_;
|
||||
}
|
||||
|
||||
private:
|
||||
bool isDense_;
|
||||
};
|
||||
|
||||
} // namespace common
|
||||
} // namespace sycl
|
||||
} // namespace xgboost
|
||||
#endif // PLUGIN_SYCL_DATA_GRADIENT_INDEX_H_
|
||||
@@ -280,7 +280,8 @@ class Predictor : public xgboost::Predictor {
|
||||
uint32_t tree_end = 0) const override {
|
||||
::sycl::queue qu = device_manager.GetQueue(ctx_->Device());
|
||||
// TODO(razdoburdin): remove temporary workaround after cache fix
|
||||
sycl::DeviceMatrix device_matrix(qu, dmat);
|
||||
sycl::DeviceMatrix device_matrix;
|
||||
device_matrix.Init(qu, dmat);
|
||||
|
||||
auto* out_preds = &predts->predictions;
|
||||
if (tree_end == 0) {
|
||||
|
||||
50
plugin/sycl/tree/expand_entry.h
Normal file
50
plugin/sycl/tree/expand_entry.h
Normal file
@@ -0,0 +1,50 @@
|
||||
/*!
|
||||
* Copyright 2017-2024 by Contributors
|
||||
* \file expand_entry.h
|
||||
*/
|
||||
#ifndef PLUGIN_SYCL_TREE_EXPAND_ENTRY_H_
|
||||
#define PLUGIN_SYCL_TREE_EXPAND_ENTRY_H_
|
||||
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
|
||||
#include "../../src/tree/constraints.h"
|
||||
#pragma GCC diagnostic pop
|
||||
#include "../../src/tree/hist/expand_entry.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace sycl {
|
||||
namespace tree {
|
||||
/* tree growing policies */
|
||||
struct ExpandEntry : public xgboost::tree::ExpandEntryImpl<ExpandEntry> {
|
||||
static constexpr bst_node_t kRootNid = 0;
|
||||
|
||||
xgboost::tree::SplitEntry split;
|
||||
|
||||
ExpandEntry(int nid, int depth) : ExpandEntryImpl{nid, depth} {}
|
||||
|
||||
inline bst_node_t GetSiblingId(const xgboost::RegTree* p_tree) const {
|
||||
CHECK_EQ((*p_tree)[nid].IsRoot(), false);
|
||||
const size_t parent_id = (*p_tree)[nid].Parent();
|
||||
return GetSiblingId(p_tree, parent_id);
|
||||
}
|
||||
|
||||
inline bst_node_t GetSiblingId(const xgboost::RegTree* p_tree, size_t parent_id) const {
|
||||
return p_tree->IsLeftChild(nid) ? p_tree->RightChild(parent_id)
|
||||
: p_tree->LeftChild(parent_id);
|
||||
}
|
||||
|
||||
bool IsValidImpl(xgboost::tree::TrainParam const ¶m, int32_t num_leaves) const {
|
||||
if (split.loss_chg <= kRtEps) return false;
|
||||
if (split.loss_chg < param.min_split_loss) return false;
|
||||
if (param.max_depth > 0 && depth == param.max_depth) return false;
|
||||
if (param.max_leaves > 0 && num_leaves == param.max_leaves) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace tree
|
||||
} // namespace sycl
|
||||
} // namespace xgboost
|
||||
|
||||
#endif // PLUGIN_SYCL_TREE_EXPAND_ENTRY_H_
|
||||
Reference in New Issue
Block a user