Objective function evaluation on GPU with minimal PCIe transfers (#2935)

* Added GPU objective function and no-copy interface.

  - xgboost::HostDeviceVector<T> syncs automatically between host and device
  - no-copy interfaces have been added
  - default implementations just sync the data to host and call the implementations with std::vector
  - GPU objective function, predictor, and histogram updater process data directly on GPU
2018-01-12 14:03:39 +05:30
parent a187ed6c8f
commit 84ab74f3a5
23 changed files with 1036 additions and 127 deletions
--- a/include/xgboost/gbm.h
+++ b/include/xgboost/gbm.h
@@ -18,6 +18,7 @@
 #include "./data.h"
 #include "./objective.h"
 #include "./feature_map.h"
+#include "../../src/common/host_device_vector.h"

 namespace xgboost {
 /*!
@@ -70,6 +71,10 @@ class GradientBooster {
  virtual void DoBoost(DMatrix* p_fmat,
                       std::vector<bst_gpair>* in_gpair,
                       ObjFunction* obj = nullptr) = 0;
+  virtual void DoBoost(DMatrix* p_fmat,
+                       HostDeviceVector<bst_gpair>* in_gpair,
+                       ObjFunction* obj = nullptr);
+
  /*!
   * \brief generate predictions for given feature matrix
   * \param dmat feature matrix
@@ -80,6 +85,9 @@ class GradientBooster {
  virtual void PredictBatch(DMatrix* dmat,
                       std::vector<bst_float>* out_preds,
                       unsigned ntree_limit = 0) = 0;
+  virtual void PredictBatch(DMatrix* dmat,
+                            HostDeviceVector<bst_float>* out_preds,
+                            unsigned ntree_limit = 0);
  /*!
   * \brief online prediction function, predict score for one instance at a time
   *  NOTE: use the batch prediction interface if possible, batch prediction is usually
--- a/include/xgboost/objective.h
+++ b/include/xgboost/objective.h
@@ -14,8 +14,11 @@
 #include <functional>
 #include "./data.h"
 #include "./base.h"
+#include "../../src/common/host_device_vector.h"
+

 namespace xgboost {
+
 /*! \brief interface of objective function */
 class ObjFunction {
 public:
@@ -45,6 +48,11 @@ class ObjFunction {
                           const MetaInfo& info,
                           int iteration,
                           std::vector<bst_gpair>* out_gpair) = 0;
+  virtual void GetGradient(HostDeviceVector<bst_float>* preds,
+                           const MetaInfo& info,
+                           int iteration,
+                           HostDeviceVector<bst_gpair>* out_gpair);
+
  /*! \return the default evaluation metric for the objective */
  virtual const char* DefaultEvalMetric() const = 0;
  // the following functions are optional, most of time default implementation is good enough
@@ -53,6 +61,8 @@ class ObjFunction {
   * \param io_preds prediction values, saves to this vector as well
   */
  virtual void PredTransform(std::vector<bst_float> *io_preds) {}
+  virtual void PredTransform(HostDeviceVector<bst_float> *io_preds);
+
  /*!
   * \brief transform prediction values, this is only called when Eval is called,
   *  usually it redirect to PredTransform
@@ -61,6 +71,9 @@ class ObjFunction {
  virtual void EvalTransform(std::vector<bst_float> *io_preds) {
    this->PredTransform(io_preds);
  }
+  virtual void EvalTransform(HostDeviceVector<bst_float> *io_preds) {
+    this->PredTransform(io_preds);
+  }
  /*!
   * \brief transform probability value back to margin
   * this is used to transform user-set base_score back to margin
--- a/include/xgboost/predictor.h
+++ b/include/xgboost/predictor.h
@@ -13,6 +13,7 @@
 #include <utility>
 #include <vector>
 #include "../../src/gbm/gbtree_model.h"
+#include "../../src/common/host_device_vector.h"

 // Forward declarations
 namespace xgboost {
@@ -51,10 +52,6 @@ class Predictor {
                    const std::vector<std::shared_ptr<DMatrix>>& cache);

  /**
-   * \fn  virtual void Predictor::PredictBatch( DMatrix* dmat,
-   * std::vector<bst_float>* out_preds, const gbm::GBTreeModel &model, int
-   * tree_begin, unsigned ntree_limit = 0) = 0;
-   *
   * \brief Generate batch predictions for a given feature matrix. May use
   * cached predictions if available instead of calculating from scratch.
   *
@@ -70,6 +67,22 @@ class Predictor {
                            const gbm::GBTreeModel& model, int tree_begin,
                            unsigned ntree_limit = 0) = 0;

+  /**
+   * \brief Generate batch predictions for a given feature matrix. May use
+   * cached predictions if available instead of calculating from scratch.
+   *
+   * \param [in,out]  dmat        Feature matrix.
+   * \param [in,out]  out_preds   The output preds, held in a HostDeviceVector.
+   * \param           model       The model to predict from.
+   * \param           tree_begin  The tree begin index.
+   * \param           ntree_limit (Optional) The ntree limit. 0 means do not
+   * limit trees.
+   */
+
+  virtual void PredictBatch(DMatrix* dmat, HostDeviceVector<bst_float>* out_preds,
+                            const gbm::GBTreeModel& model, int tree_begin,
+                            unsigned ntree_limit = 0) = 0;
+
  /**
   * \fn  virtual void Predictor::UpdatePredictionCache( const gbm::GBTreeModel
   * &model, std::vector<std::unique_ptr<TreeUpdater> >* updaters, int
--- a/include/xgboost/tree_updater.h
+++ b/include/xgboost/tree_updater.h
@@ -16,6 +16,7 @@
 #include "./base.h"
 #include "./data.h"
 #include "./tree_model.h"
+#include "../../src/common/host_device_vector.h"

 namespace xgboost {
 /*!
@@ -42,6 +43,9 @@ class TreeUpdater {
  virtual void Update(const std::vector<bst_gpair>& gpair,
                      DMatrix* data,
                      const std::vector<RegTree*>& trees) = 0;
+  virtual void Update(HostDeviceVector<bst_gpair>* gpair,
+                      DMatrix* data,
+                      const std::vector<RegTree*>& trees);

  /*!
   * \brief determines whether updater has enough knowledge about a given dataset
@@ -57,6 +61,9 @@ class TreeUpdater {
                                     std::vector<bst_float>* out_preds) {
    return false;
  }
+  virtual bool UpdatePredictionCache(const DMatrix* data,
+                                     HostDeviceVector<bst_float>* out_preds);
+
  /*!
   * \brief Create a tree updater given name
   * \param name Name of the tree updater.