[GPU-Plugin] Add GPU accelerated prediction (#2593)

* [GPU-Plugin] Add GPU accelerated prediction
* Improve allocation message
* Update documentation
* Resolve linker error for predictor
* Add unit tests
2017-08-16 12:31:59 +12:00
parent 71e5e622b1
commit ef23e424f1
25 changed files with 876 additions and 203 deletions
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -45,6 +45,7 @@ struct GBTreeTrainParam : public dmlc::Parameter<GBTreeTrainParam> {
  int process_type;
  // flag to print out detailed breakdown of runtime
  int debug_verbose;
+  std::string predictor;
  // declare parameters
  DMLC_DECLARE_PARAMETER(GBTreeTrainParam) {
    DMLC_DECLARE_FIELD(num_parallel_tree)
@@ -67,6 +68,9 @@ struct GBTreeTrainParam : public dmlc::Parameter<GBTreeTrainParam> {
        .describe("flag to print out detailed breakdown of runtime");
    // add alias
    DMLC_DECLARE_ALIAS(updater_seq, updater);
+    DMLC_DECLARE_FIELD(predictor)
+      .set_default("cpu_predictor")
+      .describe("Predictor algorithm type");
  }
 };

@@ -130,13 +134,10 @@ struct CacheEntry {
 // gradient boosted trees
 class GBTree : public GradientBooster {
 public:
-  explicit GBTree(bst_float base_margin)
-      : model_(base_margin),
-        predictor(
-            std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor"))) {}
+  explicit GBTree(bst_float base_margin) : model_(base_margin) {}

  void InitCache(const std::vector<std::shared_ptr<DMatrix> > &cache) {
-    predictor->InitCache(cache);
+    cache_ = cache;
  }

  void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) override {
@@ -153,6 +154,10 @@ class GBTree : public GradientBooster {
    if (tparam.process_type == kUpdate) {
      model_.InitTreesToUpdate();
    }
+
+    // configure predictor
+    predictor = std::unique_ptr<Predictor>(Predictor::Create(tparam.predictor));
+    predictor->Init(cfg, cache_);
  }

  void Load(dmlc::Stream* fi) override {
@@ -300,7 +305,8 @@ class GBTree : public GradientBooster {
  std::vector<std::pair<std::string, std::string> > cfg;
  // the updaters that can be applied to each of tree
  std::vector<std::unique_ptr<TreeUpdater>> updaters;
-
+  // Cached matrices
+  std::vector<std::shared_ptr<DMatrix>> cache_;
  std::unique_ptr<Predictor> predictor;
 };

--- a/src/learner.cc
+++ b/src/learner.cc
@@ -165,9 +165,19 @@ class LearnerImpl : public Learner {
                   << "grow_fast_histmaker.";
      cfg_["updater"] = "grow_fast_histmaker";
    } else if (tparam.tree_method == 4) {
-      cfg_["updater"] = "grow_gpu,prune";
+      if (cfg_.count("updater") == 0) {
+        cfg_["updater"] = "grow_gpu,prune";
+      }
+      if (cfg_.count("predictor") == 0) {
+        cfg_["predictor"] = "gpu_predictor";
+      }
    } else if (tparam.tree_method == 5) {
-      cfg_["updater"] = "grow_gpu_hist";
+      if (cfg_.count("updater") == 0) {
+        cfg_["updater"] = "grow_gpu_hist";
+      }
+      if (cfg_.count("predictor") == 0) {
+        cfg_["predictor"] = "gpu_predictor";
+      }
    }
  }

--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -9,6 +9,8 @@
 namespace xgboost {
 namespace predictor {

+DMLC_REGISTRY_FILE_TAG(cpu_predictor);
+
 class CPUPredictor : public Predictor {
 protected:
  static bst_float PredValue(const RowBatch::Inst& inst,
@@ -28,19 +30,6 @@ class CPUPredictor : public Predictor {
    return psum;
  }

-  void InitOutPredictions(const MetaInfo& info,
-                          std::vector<bst_float>* out_preds,
-                          const gbm::GBTreeModel& model) const {
-    size_t n = model.param.num_output_group * info.num_row;
-    const std::vector<bst_float>& base_margin = info.base_margin;
-    out_preds->resize(n);
-    if (base_margin.size() != 0) {
-      CHECK_EQ(out_preds->size(), n);
-      std::copy(base_margin.begin(), base_margin.end(), out_preds->begin());
-    } else {
-      std::fill(out_preds->begin(), out_preds->end(), model.base_margin);
-    }
-  }
  // init thread buffers
  inline void InitThreadTemp(int nthread, int num_feature) {
    int prev_thread_temp_size = thread_temp.size();
@@ -106,33 +95,6 @@ class CPUPredictor : public Predictor {
    }
  }

-  /**
-   * \fn  bool PredictFromCache(DMatrix* dmat, std::vector<bst_float>*
-   * out_preds, const gbm::GBTreeModel& model, unsigned ntree_limit = 0)
-   *
-   * \brief Attempt to predict from cache.
-   *
-   * \return  True if it succeeds, false if it fails.
-   */
-  bool PredictFromCache(DMatrix* dmat, std::vector<bst_float>* out_preds,
-                        const gbm::GBTreeModel& model,
-                        unsigned ntree_limit = 0) {
-    if (ntree_limit == 0 ||
-        ntree_limit * model.param.num_output_group >= model.trees.size()) {
-      auto it = cache_.find(dmat);
-      if (it != cache_.end()) {
-        std::vector<bst_float>& y = it->second.predictions;
-        if (y.size() != 0) {
-          out_preds->resize(y.size());
-          std::copy(y.begin(), y.end(), out_preds->begin());
-          return true;
-        }
-      }
-    }
-
-    return false;
-  }
-
  void PredLoopInternal(DMatrix* dmat, std::vector<bst_float>* out_preds,
                        const gbm::GBTreeModel& model, int tree_begin,
                        unsigned ntree_limit) {
--- a/src/predictor/predictor.cc
+++ b/src/predictor/predictor.cc
@@ -8,13 +8,47 @@ namespace dmlc {
 DMLC_REGISTRY_ENABLE(::xgboost::PredictorReg);
 }  // namespace dmlc
 namespace xgboost {
-void Predictor::InitCache(const std::vector<std::shared_ptr<DMatrix> >& cache) {
+void Predictor::Init(
+    const std::vector<std::pair<std::string, std::string>>& cfg,
+    const std::vector<std::shared_ptr<DMatrix>>& cache) {
  for (const std::shared_ptr<DMatrix>& d : cache) {
    PredictionCacheEntry e;
    e.data = d;
    cache_[d.get()] = std::move(e);
  }
 }
+bool Predictor::PredictFromCache(DMatrix* dmat,
+                                 std::vector<bst_float>* out_preds,
+                                 const gbm::GBTreeModel& model,
+                                 unsigned ntree_limit) {  // true iff out_preds was copied from cache_
+  if (ntree_limit == 0 ||  // 0 is handled the same as a limit that covers every tree
+      ntree_limit * model.param.num_output_group >= model.trees.size()) {
+    auto it = cache_.find(dmat);  // cache_ is keyed by the raw DMatrix pointer
+    if (it != cache_.end()) {
+      std::vector<bst_float>& y = it->second.predictions;
+      if (y.size() != 0) {  // empty stored predictions are treated as a miss
+        out_preds->resize(y.size());
+        std::copy(y.begin(), y.end(), out_preds->begin());
+        return true;
+      }
+    }
+  }
+
+  return false;  // partial tree limit, unknown dmat, or nothing stored for it
+}
+void Predictor::InitOutPredictions(const MetaInfo& info,  // fill out_preds with starting margins
+                                   std::vector<bst_float>* out_preds,
+                                   const gbm::GBTreeModel& model) const {
+  size_t n = model.param.num_output_group * info.num_row;  // total number of output slots
+  const std::vector<bst_float>& base_margin = info.base_margin;
+  out_preds->resize(n);
+  if (base_margin.size() != 0) {  // caller supplied margins: they must cover all n slots
+    CHECK_EQ(base_margin.size(), n);  // otherwise the copy below overruns or underfills
+    std::copy(base_margin.begin(), base_margin.end(), out_preds->begin());
+  } else {
+    std::fill(out_preds->begin(), out_preds->end(), model.base_margin);  // model-wide default
+  }
+}
 Predictor* Predictor::Create(std::string name) {
  auto* e = ::dmlc::Registry<PredictorReg>::Get()->Find(name);
  if (e == nullptr) {
@@ -23,3 +57,13 @@ Predictor* Predictor::Create(std::string name) {
  return (e->body)();
 }
 }  // namespace xgboost
+
+namespace xgboost {
+namespace predictor {
+// List of files that will be force linked in static links.
+#ifdef XGBOOST_USE_CUDA
+DMLC_REGISTRY_LINK_TAG(gpu_predictor);  // referenced only when built with XGBOOST_USE_CUDA
+#endif
+DMLC_REGISTRY_LINK_TAG(cpu_predictor);  // pairs with DMLC_REGISTRY_FILE_TAG in cpu_predictor.cc
+}  // namespace predictor
+}  // namespace xgboost