Predict on Ellpack. (#5327)
* Unify GPU prediction node. * Add `PageExists`. * Dispatch prediction on input data for GPU Predictor.
This commit is contained in:
@@ -14,6 +14,7 @@
|
||||
#include "xgboost/host_device_vector.h"
|
||||
|
||||
#include "../gbm/gbtree_model.h"
|
||||
#include "../data/ellpack_page.cuh"
|
||||
#include "../common/common.h"
|
||||
#include "../common/device_helpers.cuh"
|
||||
|
||||
@@ -22,78 +23,32 @@ namespace predictor {
|
||||
|
||||
DMLC_REGISTRY_FILE_TAG(gpu_predictor);
|
||||
|
||||
/**
|
||||
* \struct DevicePredictionNode
|
||||
*
|
||||
* \brief Packed 16 byte representation of a tree node for use in device
|
||||
* prediction
|
||||
*/
|
||||
struct DevicePredictionNode {
|
||||
XGBOOST_DEVICE DevicePredictionNode()
|
||||
: fidx{-1}, left_child_idx{-1}, right_child_idx{-1} {}
|
||||
struct SparsePageView {
|
||||
common::Span<const Entry> d_data;
|
||||
common::Span<const bst_row_t> d_row_ptr;
|
||||
|
||||
union NodeValue {
|
||||
float leaf_weight;
|
||||
float fvalue;
|
||||
};
|
||||
|
||||
int fidx;
|
||||
int left_child_idx;
|
||||
int right_child_idx;
|
||||
NodeValue val{};
|
||||
|
||||
DevicePredictionNode(const RegTree::Node& n) { // NOLINT
|
||||
static_assert(sizeof(DevicePredictionNode) == 16, "Size is not 16 bytes");
|
||||
this->left_child_idx = n.LeftChild();
|
||||
this->right_child_idx = n.RightChild();
|
||||
this->fidx = n.SplitIndex();
|
||||
if (n.DefaultLeft()) {
|
||||
fidx |= (1U << 31);
|
||||
}
|
||||
|
||||
if (n.IsLeaf()) {
|
||||
this->val.leaf_weight = n.LeafValue();
|
||||
} else {
|
||||
this->val.fvalue = n.SplitCond();
|
||||
}
|
||||
}
|
||||
|
||||
XGBOOST_DEVICE bool IsLeaf() const { return left_child_idx == -1; }
|
||||
|
||||
XGBOOST_DEVICE int GetFidx() const { return fidx & ((1U << 31) - 1U); }
|
||||
|
||||
XGBOOST_DEVICE bool MissingLeft() const { return (fidx >> 31) != 0; }
|
||||
|
||||
XGBOOST_DEVICE int MissingIdx() const {
|
||||
if (MissingLeft()) {
|
||||
return this->left_child_idx;
|
||||
} else {
|
||||
return this->right_child_idx;
|
||||
}
|
||||
}
|
||||
|
||||
XGBOOST_DEVICE float GetFvalue() const { return val.fvalue; }
|
||||
|
||||
XGBOOST_DEVICE float GetWeight() const { return val.leaf_weight; }
|
||||
XGBOOST_DEVICE SparsePageView(common::Span<const Entry> data,
|
||||
common::Span<const bst_row_t> row_ptr) :
|
||||
d_data{data}, d_row_ptr{row_ptr} {}
|
||||
};
|
||||
|
||||
struct ElementLoader {
|
||||
struct SparsePageLoader {
|
||||
bool use_shared;
|
||||
common::Span<const bst_row_t> d_row_ptr;
|
||||
common::Span<const Entry> d_data;
|
||||
int num_features;
|
||||
bst_feature_t num_features;
|
||||
float* smem;
|
||||
size_t entry_start;
|
||||
|
||||
__device__ ElementLoader(bool use_shared, common::Span<const bst_row_t> row_ptr,
|
||||
common::Span<const Entry> entry, int num_features,
|
||||
float* smem, int num_rows, size_t entry_start)
|
||||
__device__ SparsePageLoader(SparsePageView data, bool use_shared, bst_feature_t num_features,
|
||||
bst_row_t num_rows, size_t entry_start)
|
||||
: use_shared(use_shared),
|
||||
d_row_ptr(row_ptr),
|
||||
d_data(entry),
|
||||
d_row_ptr(data.d_row_ptr),
|
||||
d_data(data.d_data),
|
||||
num_features(num_features),
|
||||
smem(smem),
|
||||
entry_start(entry_start) {
|
||||
extern __shared__ float _smem[];
|
||||
smem = _smem;
|
||||
// Copy instances
|
||||
if (use_shared) {
|
||||
bst_uint global_idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
@@ -111,7 +66,7 @@ struct ElementLoader {
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
__device__ float GetFvalue(int ridx, int fidx) {
|
||||
__device__ float GetFvalue(int ridx, int fidx) const {
|
||||
if (use_shared) {
|
||||
return smem[threadIdx.x * num_features + fidx];
|
||||
} else {
|
||||
@@ -141,52 +96,69 @@ struct ElementLoader {
|
||||
}
|
||||
};
|
||||
|
||||
__device__ float GetLeafWeight(bst_uint ridx, const DevicePredictionNode* tree,
|
||||
ElementLoader* loader) {
|
||||
DevicePredictionNode n = tree[0];
|
||||
struct EllpackLoader {
|
||||
EllpackMatrix const& matrix;
|
||||
XGBOOST_DEVICE EllpackLoader(EllpackMatrix const& m, bool use_shared, bst_feature_t num_features,
|
||||
bst_row_t num_rows, size_t entry_start) : matrix{m} {}
|
||||
__device__ __forceinline__ float GetFvalue(int ridx, int fidx) const {
|
||||
auto gidx = matrix.GetBinIndex(ridx, fidx);
|
||||
if (gidx == -1) {
|
||||
return nan("");
|
||||
}
|
||||
// The gradient index needs to be shifted by one as min values are not included in the
|
||||
// cuts.
|
||||
if (gidx == matrix.info.feature_segments[fidx]) {
|
||||
return matrix.info.min_fvalue[fidx];
|
||||
}
|
||||
return matrix.info.gidx_fvalue_map[gidx - 1];
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Loader>
|
||||
__device__ float GetLeafWeight(bst_uint ridx, const RegTree::Node* tree,
|
||||
Loader* loader) {
|
||||
RegTree::Node n = tree[0];
|
||||
while (!n.IsLeaf()) {
|
||||
float fvalue = loader->GetFvalue(ridx, n.GetFidx());
|
||||
float fvalue = loader->GetFvalue(ridx, n.SplitIndex());
|
||||
// Missing value
|
||||
if (isnan(fvalue)) {
|
||||
n = tree[n.MissingIdx()];
|
||||
n = tree[n.DefaultChild()];
|
||||
} else {
|
||||
if (fvalue < n.GetFvalue()) {
|
||||
n = tree[n.left_child_idx];
|
||||
if (fvalue < n.SplitCond()) {
|
||||
n = tree[n.LeftChild()];
|
||||
} else {
|
||||
n = tree[n.right_child_idx];
|
||||
n = tree[n.RightChild()];
|
||||
}
|
||||
}
|
||||
}
|
||||
return n.GetWeight();
|
||||
return n.LeafValue();
|
||||
}
|
||||
|
||||
template <int BLOCK_THREADS>
|
||||
__global__ void PredictKernel(common::Span<const DevicePredictionNode> d_nodes,
|
||||
template <typename Loader, typename Data>
|
||||
__global__ void PredictKernel(Data data,
|
||||
common::Span<const RegTree::Node> d_nodes,
|
||||
common::Span<float> d_out_predictions,
|
||||
common::Span<size_t> d_tree_segments,
|
||||
common::Span<int> d_tree_group,
|
||||
common::Span<const bst_row_t> d_row_ptr,
|
||||
common::Span<const Entry> d_data, size_t tree_begin,
|
||||
size_t tree_end, size_t num_features,
|
||||
size_t tree_begin, size_t tree_end, size_t num_features,
|
||||
size_t num_rows, size_t entry_start,
|
||||
bool use_shared, int num_group) {
|
||||
extern __shared__ float smem[];
|
||||
bst_uint global_idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
ElementLoader loader(use_shared, d_row_ptr, d_data, num_features, smem,
|
||||
num_rows, entry_start);
|
||||
Loader loader(data, use_shared, num_features, num_rows, entry_start);
|
||||
if (global_idx >= num_rows) return;
|
||||
if (num_group == 1) {
|
||||
float sum = 0;
|
||||
for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
|
||||
const DevicePredictionNode* d_tree =
|
||||
const RegTree::Node* d_tree =
|
||||
&d_nodes[d_tree_segments[tree_idx - tree_begin]];
|
||||
sum += GetLeafWeight(global_idx, d_tree, &loader);
|
||||
float leaf = GetLeafWeight(global_idx, d_tree, &loader);
|
||||
sum += leaf;
|
||||
}
|
||||
d_out_predictions[global_idx] += sum;
|
||||
} else {
|
||||
for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
|
||||
int tree_group = d_tree_group[tree_idx];
|
||||
const DevicePredictionNode* d_tree =
|
||||
const RegTree::Node* d_tree =
|
||||
&d_nodes[d_tree_segments[tree_idx - tree_begin]];
|
||||
bst_uint out_prediction_idx = global_idx * num_group + tree_group;
|
||||
d_out_predictions[out_prediction_idx] +=
|
||||
@@ -198,13 +170,13 @@ __global__ void PredictKernel(common::Span<const DevicePredictionNode> d_nodes,
|
||||
class GPUPredictor : public xgboost::Predictor {
|
||||
private:
|
||||
void InitModel(const gbm::GBTreeModel& model,
|
||||
const thrust::host_vector<size_t>& h_tree_segments,
|
||||
const thrust::host_vector<DevicePredictionNode>& h_nodes,
|
||||
size_t tree_begin, size_t tree_end) {
|
||||
const thrust::host_vector<size_t>& h_tree_segments,
|
||||
const thrust::host_vector<RegTree::Node>& h_nodes,
|
||||
size_t tree_begin, size_t tree_end) {
|
||||
dh::safe_cuda(cudaSetDevice(generic_param_->gpu_id));
|
||||
nodes_.resize(h_nodes.size());
|
||||
dh::safe_cuda(cudaMemcpyAsync(nodes_.data().get(), h_nodes.data(),
|
||||
sizeof(DevicePredictionNode) * h_nodes.size(),
|
||||
sizeof(RegTree::Node) * h_nodes.size(),
|
||||
cudaMemcpyHostToDevice));
|
||||
tree_segments_.resize(h_tree_segments.size());
|
||||
dh::safe_cuda(cudaMemcpyAsync(tree_segments_.data().get(), h_tree_segments.data(),
|
||||
@@ -219,15 +191,11 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
this->num_group_ = model.learner_model_param_->num_output_group;
|
||||
}
|
||||
|
||||
void PredictInternal(const SparsePage& batch,
|
||||
size_t num_features,
|
||||
void PredictInternal(const SparsePage& batch, size_t num_features,
|
||||
HostDeviceVector<bst_float>* predictions,
|
||||
size_t batch_offset) {
|
||||
dh::safe_cuda(cudaSetDevice(generic_param_->gpu_id));
|
||||
batch.data.SetDevice(generic_param_->gpu_id);
|
||||
batch.offset.SetDevice(generic_param_->gpu_id);
|
||||
predictions->SetDevice(generic_param_->gpu_id);
|
||||
|
||||
batch.data.SetDevice(generic_param_->gpu_id);
|
||||
const uint32_t BLOCK_THREADS = 128;
|
||||
size_t num_rows = batch.Size();
|
||||
auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(num_rows, BLOCK_THREADS));
|
||||
@@ -240,12 +208,29 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
use_shared = false;
|
||||
}
|
||||
size_t entry_start = 0;
|
||||
|
||||
SparsePageView data{batch.data.DeviceSpan(), batch.offset.DeviceSpan()};
|
||||
dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes} (
|
||||
PredictKernel<BLOCK_THREADS>,
|
||||
PredictKernel<SparsePageLoader, SparsePageView>,
|
||||
data,
|
||||
dh::ToSpan(nodes_), predictions->DeviceSpan().subspan(batch_offset),
|
||||
dh::ToSpan(tree_segments_), dh::ToSpan(tree_group_), batch.offset.DeviceSpan(),
|
||||
batch.data.DeviceSpan(), this->tree_begin_, this->tree_end_, num_features, num_rows,
|
||||
dh::ToSpan(tree_segments_), dh::ToSpan(tree_group_),
|
||||
this->tree_begin_, this->tree_end_, num_features, num_rows,
|
||||
entry_start, use_shared, this->num_group_);
|
||||
}
|
||||
void PredictInternal(EllpackMatrix const& batch, HostDeviceVector<bst_float>* out_preds,
|
||||
size_t batch_offset) {
|
||||
const uint32_t BLOCK_THREADS = 256;
|
||||
size_t num_rows = batch.n_rows;
|
||||
auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(num_rows, BLOCK_THREADS));
|
||||
|
||||
bool use_shared = false;
|
||||
size_t entry_start = 0;
|
||||
dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS} (
|
||||
PredictKernel<EllpackLoader, EllpackMatrix>,
|
||||
batch,
|
||||
dh::ToSpan(nodes_), out_preds->DeviceSpan().subspan(batch_offset),
|
||||
dh::ToSpan(tree_segments_), dh::ToSpan(tree_group_),
|
||||
this->tree_begin_, this->tree_end_, batch.info.NumFeatures(), num_rows,
|
||||
entry_start, use_shared, this->num_group_);
|
||||
}
|
||||
|
||||
@@ -261,7 +246,7 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
h_tree_segments.push_back(sum);
|
||||
}
|
||||
|
||||
thrust::host_vector<DevicePredictionNode> h_nodes(h_tree_segments.back());
|
||||
thrust::host_vector<RegTree::Node> h_nodes(h_tree_segments.back());
|
||||
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
|
||||
auto& src_nodes = model.trees.at(tree_idx)->GetNodes();
|
||||
std::copy(src_nodes.begin(), src_nodes.end(),
|
||||
@@ -270,26 +255,31 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
InitModel(model, h_tree_segments, h_nodes, tree_begin, tree_end);
|
||||
}
|
||||
|
||||
void DevicePredictInternal(DMatrix* dmat,
|
||||
HostDeviceVector<bst_float>* out_preds,
|
||||
void DevicePredictInternal(DMatrix* dmat, HostDeviceVector<float>* out_preds,
|
||||
const gbm::GBTreeModel& model, size_t tree_begin,
|
||||
size_t tree_end) {
|
||||
dh::safe_cuda(cudaSetDevice(generic_param_->gpu_id));
|
||||
if (tree_end - tree_begin == 0) {
|
||||
return;
|
||||
}
|
||||
monitor_.StartCuda("DevicePredictInternal");
|
||||
|
||||
InitModel(model, tree_begin, tree_end);
|
||||
out_preds->SetDevice(generic_param_->gpu_id);
|
||||
|
||||
size_t batch_offset = 0;
|
||||
for (auto &batch : dmat->GetBatches<SparsePage>()) {
|
||||
batch.offset.SetDevice(generic_param_->gpu_id);
|
||||
batch.data.SetDevice(generic_param_->gpu_id);
|
||||
PredictInternal(batch, model.learner_model_param_->num_feature,
|
||||
out_preds, batch_offset);
|
||||
batch_offset += batch.Size() * model.learner_model_param_->num_output_group;
|
||||
if (dmat->PageExists<EllpackPage>()) {
|
||||
size_t batch_offset = 0;
|
||||
for (auto const& page : dmat->GetBatches<EllpackPage>()) {
|
||||
this->PredictInternal(page.Impl()->matrix, out_preds, batch_offset);
|
||||
batch_offset += page.Impl()->matrix.n_rows;
|
||||
}
|
||||
} else {
|
||||
size_t batch_offset = 0;
|
||||
for (auto &batch : dmat->GetBatches<SparsePage>()) {
|
||||
this->PredictInternal(batch, model.learner_model_param_->num_feature,
|
||||
out_preds, batch_offset);
|
||||
batch_offset += batch.Size() * model.learner_model_param_->num_output_group;
|
||||
}
|
||||
}
|
||||
|
||||
monitor_.StopCuda("DevicePredictInternal");
|
||||
}
|
||||
|
||||
@@ -418,7 +408,7 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
}
|
||||
|
||||
common::Monitor monitor_;
|
||||
dh::device_vector<DevicePredictionNode> nodes_;
|
||||
dh::device_vector<RegTree::Node> nodes_;
|
||||
dh::device_vector<size_t> tree_segments_;
|
||||
dh::device_vector<int> tree_group_;
|
||||
size_t max_shared_memory_bytes_;
|
||||
|
||||
Reference in New Issue
Block a user