Optimize prediction with QuantileDMatrix. (#9096)

- Reduce overhead in `FVecDrop`.
- Reduce overhead caused by `HostVector()` calls.
Author:    Jiaming Yuan
Date:      2023-04-28 00:51:41 +08:00
Committer: GitHub
Commit:    0e470ef606 (parent: fa267ad093)

5 changed files with 49 additions and 35 deletions

@@ -75,7 +75,7 @@ bst_float PredValue(const SparsePage::Inst &inst,
       psum += (*trees[i])[nidx].LeafValue();
     }
   }
-  p_feats->Drop(inst);
+  p_feats->Drop();
   return psum;
 }
@@ -172,13 +172,11 @@ void FVecFill(const size_t block_size, const size_t batch_offset, const int num_
     }
   }
 }

 template <typename DataView>
-void FVecDrop(const size_t block_size, const size_t batch_offset, DataView *batch,
-              const size_t fvec_offset, std::vector<RegTree::FVec> *p_feats) {
+void FVecDrop(std::size_t const block_size, std::size_t const fvec_offset,
+              std::vector<RegTree::FVec> *p_feats) {
   for (size_t i = 0; i < block_size; ++i) {
     RegTree::FVec &feats = (*p_feats)[fvec_offset + i];
-    const SparsePage::Inst inst = (*batch)[batch_offset + i];
-    feats.Drop(inst);
+    feats.Drop();
   }
 }
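Why `Drop()` can lose its `inst` argument: the old `FVecDrop` had to materialize each sparse row a second time (`(*batch)[batch_offset + i]`) just to know which entries to clear. If the feature vector remembers what `Fill()` wrote, the drop pass needs no access to the batch at all. Below is a minimal sketch of that idea; `FVecSketch` is hypothetical and not the actual `RegTree::FVec` implementation, it only illustrates why the parameter becomes unnecessary.

#include <cstddef>
#include <limits>
#include <utility>
#include <vector>

// Hypothetical sketch, not xgboost's RegTree::FVec: Fill() records the
// slots it touched, so Drop() can reset them without re-reading the row.
struct FVecSketch {
  std::vector<float> data;
  std::vector<std::size_t> touched;  // slots written by the last Fill()

  void Init(std::size_t n_features) {
    data.assign(n_features, std::numeric_limits<float>::quiet_NaN());
  }

  void Fill(std::vector<std::pair<std::size_t, float>> const &inst) {
    touched.clear();
    for (auto const &[idx, value] : inst) {
      data[idx] = value;
      touched.push_back(idx);
    }
  }

  // No inst parameter: clear only the recorded slots. The caller no longer
  // pays for a second (*batch)[i] lookup per row in the drop loop.
  void Drop() {
    for (std::size_t idx : touched) {
      data[idx] = std::numeric_limits<float>::quiet_NaN();
    }
    touched.clear();
  }
};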
@@ -196,11 +194,15 @@ struct SparsePageView {

 struct GHistIndexMatrixView {
  private:
   GHistIndexMatrix const &page_;
-  uint64_t n_features_;
+  std::uint64_t const n_features_;
   common::Span<FeatureType const> ft_;
   common::Span<Entry> workspace_;
   std::vector<size_t> current_unroll_;
+  std::vector<std::uint32_t> const &ptrs_;
+  std::vector<float> const &mins_;
+  std::vector<float> const &values_;

  public:
   size_t base_rowid;
@@ -213,6 +215,9 @@ struct GHistIndexMatrixView {
         ft_{ft},
         workspace_{workplace},
         current_unroll_(n_threads > 0 ? n_threads : 1, 0),
+        ptrs_{_page.cut.Ptrs()},
+        mins_{_page.cut.MinValues()},
+        values_{_page.cut.Values()},
         base_rowid{_page.base_rowid} {}

   SparsePage::Inst operator[](size_t r) {
@@ -221,7 +226,7 @@ struct GHistIndexMatrixView {
     size_t non_missing{static_cast<std::size_t>(beg)};

     for (bst_feature_t c = 0; c < n_features_; ++c) {
-      float f = page_.GetFvalue(r, c, common::IsCat(ft_, c));
+      float f = page_.GetFvalue(ptrs_, values_, mins_, r, c, common::IsCat(ft_, c));
       if (!common::CheckNAN(f)) {
         workspace_[non_missing] = Entry{c, f};
         ++non_missing;
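Why the view now caches `ptrs_`/`mins_`/`values_`: `GetFvalue` runs once per (row, feature) pair in the prediction hot loop, and the cut data it needs lives behind accessors that hand back host buffers from a device-aware container; each such `HostVector()` call re-checks where the data currently resides. Hoisting the references into the constructor pays that cost once per view instead of once per element. A sketch of the pattern follows, with the hypothetical `DeviceBackedVector` and `CutsSketch` standing in for xgboost's `HostDeviceVector` and `common::HistogramCuts`.

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for HostDeviceVector: every HostVector() call has
// to validate host residency before handing back the buffer.
template <typename T>
struct DeviceBackedVector {
  std::vector<T> host;
  std::vector<T> const &HostVector() const {
    // (a real implementation would check/sync device state here)
    return host;
  }
};

struct CutsSketch {  // hypothetical stand-in for common::HistogramCuts
  DeviceBackedVector<std::uint32_t> ptrs;
  DeviceBackedVector<float> values;
  DeviceBackedVector<float> mins;
};

struct ViewSketch {
  // References hoisted once, mirroring the new ptrs_/mins_/values_ members.
  std::vector<std::uint32_t> const &ptrs_;
  std::vector<float> const &mins_;
  std::vector<float> const &values_;

  explicit ViewSketch(CutsSketch const &cut)
      : ptrs_{cut.ptrs.HostVector()},        // residency check paid here,
        mins_{cut.mins.HostVector()},        // once per view, rather than on
        values_{cut.values.HostVector()} {}  // every access in the hot loop

  float ValueAt(std::size_t bin) const { return values_[bin]; }
};

The same reasoning explains the widened `GetFvalue` signature in the hunk above: the caller supplies the already-resolved vectors instead of letting the matrix fetch them again on every call.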
@@ -301,7 +306,7 @@ void PredictBatchByBlockOfRowsKernel(DataView batch, gbm::GBTreeModel const &mod
     // process block of rows through all trees to keep cache locality
     PredictByAllTrees(model, tree_begin, tree_end, batch_offset + batch.base_rowid, thread_temp,
                       fvec_offset, block_size, out_predt);
-    FVecDrop(block_size, batch_offset, &batch, fvec_offset, p_thread_temp);
+    FVecDrop(block_size, fvec_offset, p_thread_temp);
   });
 }
@@ -529,7 +534,7 @@ class ColumnSplitHelper {
       FVecFill(block_size, batch_offset, num_feature, &batch, fvec_offset, &feat_vecs_);
       MaskAllTrees(batch_offset, fvec_offset, block_size);
-      FVecDrop(block_size, batch_offset, &batch, fvec_offset, &feat_vecs_);
+      FVecDrop(block_size, fvec_offset, &feat_vecs_);
     });

     AllreduceBitVectors();
@@ -780,7 +785,7 @@ class CPUPredictor : public Predictor {
         }
         preds[ridx * ntree_limit + j] = static_cast<bst_float>(nidx);
       }
-      feats.Drop(page[i]);
+      feats.Drop();
     });
   }
 }
@@ -853,7 +858,7 @@ class CPUPredictor : public Predictor {
               (tree_weights == nullptr ? 1 : (*tree_weights)[j]);
         }
       }
-      feats.Drop(page[i]);
+      feats.Drop();
       // add base margin to BIAS
       if (base_margin.Size() != 0) {
         CHECK_EQ(base_margin.Shape(1), ngroup);