Optimize CPU prediction (#6696)
Co-authored-by: Shvets Kirill <kirill.shvets@intel.com>
parent dc97b5f19f
commit 9f15b9e322
@@ -65,10 +65,13 @@ inline void PredictByAllTrees(gbm::GBTreeModel const &model, const size_t tree_b
   }
 }

 template <typename DataView>
-void FVecFill(const size_t block_size, const size_t batch_offset, DataView* batch,
-              const size_t fvec_offset, std::vector<RegTree::FVec>* p_feats) {
+void FVecFill(const size_t block_size, const size_t batch_offset, const int num_feature,
+              DataView* batch, const size_t fvec_offset, std::vector<RegTree::FVec>* p_feats) {
   for (size_t i = 0; i < block_size; ++i) {
     RegTree::FVec &feats = (*p_feats)[fvec_offset + i];
+    if (feats.Size() == 0) {
+      feats.Init(num_feature);
+    }
     const SparsePage::Inst inst = (*batch)[batch_offset + i];
     feats.Fill(inst);
   }
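The heart of the change is this lazy-initialization guard: `FVecFill` now takes `num_feature` and sizes each scratch `RegTree::FVec` on first touch, rather than relying on every slot being initialized up front. A minimal sketch of the pattern, with a stand-in `FVec` type and a hypothetical `FillBlock` helper (not XGBoost's actual definitions):

#include <cstddef>
#include <limits>
#include <vector>

// Stand-in for RegTree::FVec: a dense feature buffer, sized on first use.
struct FVec {
  std::vector<float> data;
  std::size_t Size() const { return data.size(); }
  void Init(int num_feature) {
    data.assign(num_feature, std::numeric_limits<float>::quiet_NaN());
  }
};

// Mirrors the patched FVecFill: a slot is allocated only when a block
// actually lands on it, so unused thread slots stay empty.
void FillBlock(std::size_t block_size, int num_feature,
               std::size_t fvec_offset, std::vector<FVec>* p_feats) {
  for (std::size_t i = 0; i < block_size; ++i) {
    FVec& feats = (*p_feats)[fvec_offset + i];
    if (feats.Size() == 0) {  // lazy: first touch pays the allocation
      feats.Init(num_feature);
    }
    // ... copy the sparse row into feats here, as feats.Fill(inst) does ...
  }
}

Default-constructed slots report `Size() == 0`, so the guard doubles as the "not yet initialized" flag, and threads that never reach a slot never pay its allocation.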
@@ -152,7 +155,7 @@ void PredictBatchByBlockOfRowsKernel(DataView batch, std::vector<bst_float> *out
       << "size_leaf_vector is enforced to 0 so far";
   // parallel over local batch
   const auto nsize = static_cast<bst_omp_uint>(batch.Size());
-
+  const int num_feature = model.learner_model_param->num_feature;
   const bst_omp_uint n_row_blocks = (nsize) / block_of_rows_size + !!((nsize) % block_of_rows_size);
 #pragma omp parallel for schedule(static)
   for (bst_omp_uint block_id = 0; block_id < n_row_blocks; ++block_id) {
@@ -160,7 +163,7 @@ void PredictBatchByBlockOfRowsKernel(DataView batch, std::vector<bst_float> *out
     const size_t block_size = std::min(nsize - batch_offset, block_of_rows_size);
     const size_t fvec_offset = omp_get_thread_num() * block_of_rows_size;

-    FVecFill(block_size, batch_offset, &batch, fvec_offset, p_thread_temp);
+    FVecFill(block_size, batch_offset, num_feature, &batch, fvec_offset, p_thread_temp);
     // process block of rows through all trees to keep cache locality
     PredictByAllTrees(model, tree_begin, tree_end, out_preds, batch_offset + batch.base_rowid,
                       num_group, thread_temp, fvec_offset, block_size);
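For context, the kernel hands each OpenMP thread whole blocks of rows plus a private slice of the scratch vectors; `(nsize) / block_of_rows_size + !!((nsize) % block_of_rows_size)` is a branch-free ceiling division that counts the partial trailing block. A compilable sketch of that decomposition (row count and block size are made-up values):

#include <omp.h>
#include <algorithm>
#include <cstdio>

int main() {
  const unsigned nsize = 1000;             // rows in the batch (made-up)
  const unsigned block_of_rows_size = 64;  // made-up block size
  // Ceiling division as written in the kernel: !!(n % B) is 1 iff a
  // partial trailing block exists.
  const unsigned n_row_blocks =
      nsize / block_of_rows_size + !!(nsize % block_of_rows_size);

#pragma omp parallel for schedule(static)
  for (unsigned block_id = 0; block_id < n_row_blocks; ++block_id) {
    const unsigned batch_offset = block_id * block_of_rows_size;
    const unsigned block_size =
        std::min(nsize - batch_offset, block_of_rows_size);
    // Each thread indexes its own scratch slice, as the kernel does with
    // fvec_offset = omp_get_thread_num() * block_of_rows_size.
    std::printf("thread %d: rows [%u, %u)\n", omp_get_thread_num(),
                batch_offset, batch_offset + block_size);
  }
  return 0;
}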
@@ -175,9 +178,6 @@ class CPUPredictor : public Predictor {
     int prev_thread_temp_size = out->size();
     if (prev_thread_temp_size < nthread) {
       out->resize(nthread, RegTree::FVec());
-      for (int i = prev_thread_temp_size; i < nthread; ++i) {
-        (*out)[i].Init(num_feature);
-      }
     }
   }

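With per-slot `Init` deferred to `FVecFill`, `InitThreadTemp` is left only growing the scratch vector. A guess at the resulting shape, reusing the stand-in `FVec` from the sketch above (the real signature appears to keep `num_feature`, since the call sites below still pass it):

// Sketch of the slimmed-down helper after this hunk: it only resizes the
// scratch vector; per-slot Init(num_feature) now happens lazily at fill time.
void InitThreadTemp(int nthread, int num_feature, std::vector<FVec>* out) {
  (void)num_feature;  // retained for the callers; no longer used eagerly here
  int prev_thread_temp_size = static_cast<int>(out->size());
  if (prev_thread_temp_size < nthread) {
    out->resize(nthread, FVec());  // default slots: Size() == 0
  }
}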
@@ -321,7 +321,8 @@ class CPUPredictor : public Predictor {
                     const gbm::GBTreeModel& model, unsigned ntree_limit) const override {
     const int nthread = omp_get_max_threads();
     std::vector<RegTree::FVec> feat_vecs;
-    InitThreadTemp(nthread, model.learner_model_param->num_feature, &feat_vecs);
+    const int num_feature = model.learner_model_param->num_feature;
+    InitThreadTemp(nthread, num_feature, &feat_vecs);
     const MetaInfo& info = p_fmat->Info();
     // number of valid trees
     if (ntree_limit == 0 || ntree_limit > model.trees.size()) {
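The companion micro-optimization in this and the following hunks is hoisting `model.learner_model_param->num_feature` into a `const` local once per call, so hot loops read a register-resident value instead of chasing two pointers each iteration, and later hunks can reuse the same local. An illustrative sketch with invented stand-in types:

#include <cstdint>

// Invented minimal stand-ins for the model structures in the diff.
struct LearnerModelParam { std::uint32_t num_feature; };
struct Model { const LearnerModelParam* learner_model_param; };

// Hoist the field into a const local once, then loop on the local: the
// compiler no longer has to prove the pointers can't change mid-loop.
float SumAllFeatures(const Model& model, const float* rows, int n_rows) {
  const int num_feature = static_cast<int>(model.learner_model_param->num_feature);
  float total = 0.0f;
  for (int r = 0; r < n_rows; ++r) {
    for (int f = 0; f < num_feature; ++f) {
      total += rows[r * num_feature + f];
    }
  }
  return total;
}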
@@ -339,6 +340,9 @@ class CPUPredictor : public Predictor {
       const int tid = omp_get_thread_num();
       auto ridx = static_cast<size_t>(batch.base_rowid + i);
       RegTree::FVec &feats = feat_vecs[tid];
+      if (feats.Size() == 0) {
+        feats.Init(num_feature);
+      }
       feats.Fill(page[i]);
       for (unsigned j = 0; j < ntree_limit; ++j) {
         int tid = model.trees[j]->GetLeafIndex(feats);
@@ -355,8 +359,9 @@ class CPUPredictor : public Predictor {
                            bool approximate, int condition,
                            unsigned condition_feature) const override {
     const int nthread = omp_get_max_threads();
+    const int num_feature = model.learner_model_param->num_feature;
     std::vector<RegTree::FVec> feat_vecs;
-    InitThreadTemp(nthread, model.learner_model_param->num_feature, &feat_vecs);
+    InitThreadTemp(nthread, num_feature, &feat_vecs);
     const MetaInfo& info = p_fmat->Info();
     // number of valid trees
     if (ntree_limit == 0 || ntree_limit > model.trees.size()) {
@@ -364,7 +369,7 @@ class CPUPredictor : public Predictor {
     }
     const int ngroup = model.learner_model_param->num_output_group;
     CHECK_NE(ngroup, 0);
-    size_t const ncolumns = model.learner_model_param->num_feature + 1;
+    size_t const ncolumns = num_feature + 1;
     CHECK_NE(ncolumns, 0);
     // allocate space for (number of features + bias) times the number of rows
     std::vector<bst_float>& contribs = out_contribs->HostVector();
@@ -387,6 +392,9 @@ class CPUPredictor : public Predictor {
     for (bst_omp_uint i = 0; i < nsize; ++i) {
       auto row_idx = static_cast<size_t>(batch.base_rowid + i);
       RegTree::FVec &feats = feat_vecs[omp_get_thread_num()];
+      if (feats.Size() == 0) {
+        feats.Init(num_feature);
+      }
       std::vector<bst_float> this_tree_contribs(ncolumns);
       // loop over all classes
       for (int gid = 0; gid < ngroup; ++gid) {
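Tying the sketches together, a hypothetical usage example showing the intended effect: only slots a block actually touches get allocated.

#include <cstddef>
#include <cstdio>
#include <vector>
// Assumes the stand-in FVec, FillBlock, and InitThreadTemp sketches above.

int main() {
  const int nthread = 4;
  const int num_feature = 16;
  const std::size_t block_of_rows_size = 8;
  std::vector<FVec> temp;
  // One slice of block_of_rows_size slots per thread, none allocated yet.
  InitThreadTemp(nthread * static_cast<int>(block_of_rows_size), num_feature, &temp);
  // A single block of three rows lands on the second thread's slice.
  FillBlock(/*block_size=*/3, num_feature, /*fvec_offset=*/block_of_rows_size, &temp);
  int allocated = 0;
  for (const FVec& f : temp) allocated += (f.Size() != 0);
  std::printf("allocated %d of %zu slots\n", allocated, temp.size());  // 3 of 32
  return 0;
}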