Objective function evaluation on GPU with minimal PCIe transfers (#2935)

* Added GPU objective function and no-copy interface.
  - xgboost::HostDeviceVector<T> syncs automatically between host and device
  - no-copy interfaces have been added
  - default implementations just sync the data to host and call the implementations with std::vector
  - the GPU objective function, predictor and histogram updater process data directly on the GPU
2018-01-12 14:03:39 +05:30
parent a187ed6c8f
commit 84ab74f3a5
23 changed files with 1036 additions and 127 deletions
--- a/src/gbm/gbm.cc
+++ b/src/gbm/gbm.cc
@@ -21,6 +21,19 @@ GradientBooster* GradientBooster::Create(
  }
  return (e->body)(cache_mats, base_margin);
 }
+
+void GradientBooster::DoBoost(DMatrix* p_fmat,
+                     HostDeviceVector<bst_gpair>* in_gpair,
+                     ObjFunction* obj) {  // default no-copy fallback
+  DoBoost(p_fmat, &in_gpair->data_h(), obj);  // re-dispatch on data_h()
+}
+
+void GradientBooster::PredictBatch(DMatrix* dmat,
+                                   HostDeviceVector<bst_float>* out_preds,
+                                   unsigned ntree_limit) {  // default fallback
+  PredictBatch(dmat, &out_preds->data_h(), ntree_limit);  // via data_h()
+}
+
 }  // namespace xgboost

 namespace xgboost {
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -18,6 +18,7 @@
 #include <limits>
 #include <algorithm>
 #include "../common/common.h"
+#include "../common/host_device_vector.h"
 #include "../common/random.h"
 #include "gbtree_model.h"
 #include "../common/timer.h"
@@ -182,35 +183,13 @@ class GBTree : public GradientBooster {
  void DoBoost(DMatrix* p_fmat,
               std::vector<bst_gpair>* in_gpair,
               ObjFunction* obj) override {
-    const std::vector<bst_gpair>& gpair = *in_gpair;
-    std::vector<std::vector<std::unique_ptr<RegTree> > > new_trees;
-    const int ngroup = model_.param.num_output_group;
-    monitor.Start("BoostNewTrees");
-    if (ngroup == 1) {
-      std::vector<std::unique_ptr<RegTree> > ret;
-      BoostNewTrees(gpair, p_fmat, 0, &ret);
-      new_trees.push_back(std::move(ret));
-    } else {
-      CHECK_EQ(gpair.size() % ngroup, 0U)
-          << "must have exactly ngroup*nrow gpairs";
-      std::vector<bst_gpair> tmp(gpair.size() / ngroup);
-      for (int gid = 0; gid < ngroup; ++gid) {
-        bst_omp_uint nsize = static_cast<bst_omp_uint>(tmp.size());
-        #pragma omp parallel for schedule(static)
-        for (bst_omp_uint i = 0; i < nsize; ++i) {
-          tmp[i] = gpair[i * ngroup + gid];
-        }
-        std::vector<std::unique_ptr<RegTree> > ret;
-        BoostNewTrees(tmp, p_fmat, gid, &ret);
-        new_trees.push_back(std::move(ret));
-      }
-    }
-    monitor.Stop("BoostNewTrees");
-    monitor.Start("CommitModel");
-    for (int gid = 0; gid < ngroup; ++gid) {
-      this->CommitModel(std::move(new_trees[gid]), gid);
-    }
-    monitor.Stop("CommitModel");
+    DoBoostHelper(p_fmat, in_gpair, obj);  // TVec = std::vector<bst_gpair>
+  }
+
+  void DoBoost(DMatrix* p_fmat,
+               HostDeviceVector<bst_gpair>* in_gpair,
+               ObjFunction* obj) override {  // no-copy overload
+    DoBoostHelper(p_fmat, in_gpair, obj);  // TVec = HostDeviceVector<bst_gpair>
  }

  void PredictBatch(DMatrix* p_fmat,
@@ -219,6 +198,12 @@ class GBTree : public GradientBooster {
    predictor->PredictBatch(p_fmat, out_preds, model_, 0, ntree_limit);
  }

+  void PredictBatch(DMatrix* p_fmat,
+               HostDeviceVector<bst_float>* out_preds,
+               unsigned ntree_limit) override {  // out_preds passed on as-is
+    predictor->PredictBatch(p_fmat, out_preds, model_, 0, ntree_limit);
+  }
+
  void PredictInstance(const SparseBatch::Inst& inst,
               std::vector<bst_float>* out_preds,
               unsigned ntree_limit,
@@ -257,9 +242,48 @@ class GBTree : public GradientBooster {
      updaters.push_back(std::move(up));
    }
  }
+
+  // TVec is either std::vector<bst_gpair> or HostDeviceVector<bst_gpair>
+  template <typename TVec>
+  void DoBoostHelper(DMatrix* p_fmat,  // shared body of both DoBoost overloads
+               TVec* in_gpair,
+               ObjFunction* obj) {  // obj is not referenced below
+    std::vector<std::vector<std::unique_ptr<RegTree> > > new_trees;
+    const int ngroup = model_.param.num_output_group;
+    monitor.Start("BoostNewTrees");
+    if (ngroup == 1) {  // single group: pass in_gpair through untouched
+      std::vector<std::unique_ptr<RegTree> > ret;
+      BoostNewTrees(in_gpair, p_fmat, 0, &ret);
+      new_trees.push_back(std::move(ret));
+    } else {  // several groups: split the interleaved gpairs per group
+      CHECK_EQ(in_gpair->size() % ngroup, 0U)
+          << "must have exactly ngroup*nrow gpairs";
+      std::vector<bst_gpair> tmp(in_gpair->size() / ngroup);  // reused per group
+      auto& gpair_h = HostDeviceVector<bst_gpair>::data_h(in_gpair);
+      for (int gid = 0; gid < ngroup; ++gid) {
+        bst_omp_uint nsize = static_cast<bst_omp_uint>(tmp.size());
+        #pragma omp parallel for schedule(static)
+        for (bst_omp_uint i = 0; i < nsize; ++i) {
+          tmp[i] = gpair_h[i * ngroup + gid];  // gpair of row i for group gid
+        }
+        std::vector<std::unique_ptr<RegTree> > ret;
+        BoostNewTrees(&tmp, p_fmat, gid, &ret);  // always the std::vector path
+        new_trees.push_back(std::move(ret));
+      }
+    }
+    monitor.Stop("BoostNewTrees");
+    monitor.Start("CommitModel");
+    for (int gid = 0; gid < ngroup; ++gid) {  // after every group is built
+      this->CommitModel(std::move(new_trees[gid]), gid);
+    }
+    monitor.Stop("CommitModel");
+  }
+
  // do group specific group
+  // TVec is either std::vector<bst_gpair> or HostDeviceVector<bst_gpair>
+  template <typename TVec>
  inline void
-  BoostNewTrees(const std::vector<bst_gpair> &gpair,
+  BoostNewTrees(TVec* gpair,
                DMatrix *p_fmat,
                int bst_group,
                std::vector<std::unique_ptr<RegTree> >* ret) {
@@ -286,9 +310,24 @@ class GBTree : public GradientBooster {
    }
    // update the trees
    for (auto& up : updaters) {
-      up->Update(gpair, p_fmat, new_trees);
+      UpdateHelper(up.get(), gpair, p_fmat, new_trees);
    }
  }
+
+  void UpdateHelper(TreeUpdater* updater,  // host-vector overload
+               std::vector<bst_gpair>* gpair,
+               DMatrix *p_fmat,
+               const std::vector<RegTree*>& new_trees) {
+    updater->Update(*gpair, p_fmat, new_trees);  // pass the vector itself
+  }
+
+  void UpdateHelper(TreeUpdater* updater,  // HostDeviceVector overload
+               HostDeviceVector<bst_gpair>* gpair,
+               DMatrix *p_fmat,
+               const std::vector<RegTree*>& new_trees) {
+    updater->Update(gpair, p_fmat, new_trees);  // pass the pointer through
+  }
+
  // commit new trees all at once
  virtual void
  CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,