Improve multi-threaded performance (#2104)

* Add UpdatePredictionCache() option to updaters

Some updaters (e.g. fast_hist) have enough information to quickly compute
the prediction cache for the training data. Each updater may override the
UpdatePredictionCache() method to update the prediction cache. Note: this
trick does not apply to validation data.
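
A minimal sketch of the mechanism, assuming a simplified TreeUpdater
interface (the names and signatures here are illustrative, not the exact
xgboost API):

    #include <cstddef>
    #include <vector>

    struct DMatrix {};  // stand-in for the real data matrix type

    class TreeUpdater {
     public:
      virtual ~TreeUpdater() = default;
      // Returns true iff the updater refreshed out_preds itself; false
      // tells the booster to fall back to the normal prediction loop.
      virtual bool UpdatePredictionCache(const DMatrix* /*data*/,
                                         std::vector<float>* /*out_preds*/) {
        return false;
      }
    };

    // An updater that records the leaf value assigned to each training row
    // during Update() can refresh the cache in one pass, with no tree walk.
    class FastHistLikeUpdater : public TreeUpdater {
     public:
      bool UpdatePredictionCache(const DMatrix* data,
                                 std::vector<float>* out_preds) override {
        // The shortcut is only valid for the matrix we just trained on.
        if (data != last_trained_ || out_preds->size() != leaf_value_.size())
          return false;
        for (std::size_t i = 0; i < out_preds->size(); ++i)
          (*out_preds)[i] += leaf_value_[i];  // add the new tree's output
        return true;
      }
      const DMatrix* last_trained_ = nullptr;
      std::vector<float> leaf_value_;  // filled during Update()
    };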

* Respond to code review

* Disable some debug messages by default
* Document UpdatePredictionCache() interface
* Remove base_margin logic from UpdatePredictionCache() implementation
* Do not take a pointer to cfg, as the reference may become stale

* Improve multi-threaded performance

* Use a columnwise accessor to accelerate the ApplySplit() step,
  with support for a compressed representation
* Parallel sort for the evaluation step
* Inline the BuildHist() function
* Cache gradient pairs when building histograms in BuildHist()
  (sketched below)
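
The gradient-pair caching idea, as a hedged sketch (single feature,
simplified; the real BuildHist() in fast_hist works on binned data, but
the buffering trick is the point here):

    #include <cstddef>
    #include <vector>

    struct GradientPair { float grad, hess; };
    struct HistEntry { double sum_grad = 0.0, sum_hess = 0.0; };

    // Gather the gradient pairs of the rows belonging to this node into a
    // dense buffer first, so the accumulation loop streams through
    // contiguous memory instead of chasing scattered row indices twice.
    inline void BuildHist(const std::vector<GradientPair>& gpair,    // per-row gradients
                          const std::vector<std::size_t>& row_set,   // rows in this node
                          const std::vector<unsigned>& bin_of_row,   // precomputed bin ids
                          std::size_t num_bins,
                          std::vector<HistEntry>* hist) {
      hist->assign(num_bins, HistEntry());
      std::vector<GradientPair> cached(row_set.size());
      for (std::size_t i = 0; i < row_set.size(); ++i)
        cached[i] = gpair[row_set[i]];  // one gather, then sequential reads
      for (std::size_t i = 0; i < row_set.size(); ++i) {
        HistEntry& e = (*hist)[bin_of_row[row_set[i]]];
        e.sum_grad += cached[i].grad;
        e.sum_hess += cached[i].hess;
      }
    }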

* Add missing #if macro

* Respond to code review

* Use wrapper to enable parallel sort on Linux
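
Roughly, the wrapper dispatches to gcc's parallel-mode sort where it is
available and falls back to std::sort elsewhere (a sketch; the actual
wrapper in the patch may differ):

    #include <algorithm>
    #if defined(__GNUC__) && !defined(__clang__)
    #include <parallel/algorithm>  // GNU libstdc++ parallel mode
    #endif

    template <typename Iter, typename Compare>
    inline void ParallelSort(Iter begin, Iter end, Compare comp) {
    #if defined(__GNUC__) && !defined(__clang__)
      __gnu_parallel::sort(begin, end, comp);  // multi-threaded with -fopenmp
    #else
      std::sort(begin, end, comp);             // portable fallback
    #endif
    }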

* Fix C++ compatibility issues

* MSVC doesn't support unsigned loop indices in OpenMP for loops
* gcc 4.6 doesn't support the C++11 "using" type-alias syntax
  (both fixes sketched below)
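
Both fixes in miniature (illustrative code, not the actual patched loops):

    #include <cstddef>
    #include <vector>

    // gcc 4.6 predates C++11 alias declarations, so spell aliases as typedefs:
    //   using IndexVec = std::vector<std::size_t>;  // rejected by gcc 4.6
    typedef std::vector<std::size_t> IndexVec;       // portable spelling

    void ScaleInPlace(std::vector<float>* values, float factor) {
      // MSVC's OpenMP 2.0 requires a signed loop index; cast the size once.
      const long n = static_cast<long>(values->size());
      #pragma omp parallel for schedule(static)
      for (long i = 0; i < n; ++i) {
        (*values)[i] *= factor;
      }
    }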

* Fix lint issues

* Respond to code review

* Fix bug in ApplySplitSparseData()

* Attempting to read beyond the end of a sparse column
* Mishandling the case where an entire range of rows has missing values
  (both failure modes illustrated below)
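
Both failure modes, illustrated on a hypothetical sparse-column scan (the
names are made up; the real ApplySplitSparseData partitions rows rather
than counting them, but the bounds logic is the same kind):

    #include <cstddef>
    #include <vector>

    // Sparse column: row_index[k] is the row of the k-th stored entry;
    // rows not listed are missing.
    struct SparseColumn {
      std::vector<std::size_t> row_index;
    };

    // Count stored entries whose row falls in [row_begin, row_end).
    std::size_t CountPresent(const SparseColumn& col,
                             std::size_t row_begin, std::size_t row_end) {
      std::size_t k = 0;
      while (k < col.row_index.size() && col.row_index[k] < row_begin)
        ++k;  // skip entries before the range
      std::size_t count = 0;
      // Check the bound *before* dereferencing: never read past the column,
      // even when row_end lies beyond the last stored entry (bug 1).
      while (k < col.row_index.size() && col.row_index[k] < row_end) {
        ++count;
        ++k;
      }
      return count;  // zero when the whole range is missing (bug 2)
    }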

* Fix training continuation bug

Disable UpdatePredictionCache() in the first iteration. This way, we can
accommodate the scenario where we build on top of an existing (non-empty)
ensemble.
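
The guard in sketch form, reusing the TreeUpdater/DMatrix stand-ins from
the earlier sketch (the flag name is illustrative): the shortcut is
refused until the updater has trained at least once in this session,
because on the first iteration the cache may reflect a previously saved
ensemble whose per-row leaf values the updater never saw.

    class ContinuationSafeUpdater : public TreeUpdater {
     public:
      void Update(/* gradients, data, trees */) {
        // ... grow trees, record per-row leaf values ...
        has_trained_ = true;
      }
      bool UpdatePredictionCache(const DMatrix* data,
                                 std::vector<float>* out_preds) override {
        if (!has_trained_) return false;  // first iteration of a continued run
        // ... fast path as in the earlier sketch ...
        return true;
      }
     private:
      bool has_trained_ = false;
    };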

* Add regression test for fast_hist

* Respond to code review

* Add back old version of ApplySplitSparseData

Author:    Philip Cho
Date:      2017-03-25 10:35:01 -07:00
Committer: Tianqi Chen
Parent:    332aea26a3
Commit:    14fba01b5a
14 changed files with 719 additions and 171 deletions


@@ -44,6 +44,8 @@ struct GBTreeTrainParam : public dmlc::Parameter<GBTreeTrainParam> {
   std::string updater_seq;
   /*! \brief type of boosting process to run */
   int process_type;
+  // flag to print out detailed breakdown of runtime
+  int debug_verbose;
   // declare parameters
   DMLC_DECLARE_PARAMETER(GBTreeTrainParam) {
     DMLC_DECLARE_FIELD(num_parallel_tree)
@@ -60,6 +62,10 @@ struct GBTreeTrainParam : public dmlc::Parameter<GBTreeTrainParam> {
         .add_enum("update", kUpdate)
         .describe("Whether to run the normal boosting process that creates new trees,"\
                   " or to update the trees in an existing model.");
+    DMLC_DECLARE_FIELD(debug_verbose)
+        .set_lower_bound(0)
+        .set_default(0)
+        .describe("flag to print out detailed breakdown of runtime");
     // add alias
     DMLC_DECLARE_ALIAS(updater_seq, updater);
   }
@@ -260,9 +266,13 @@ class GBTree : public GradientBooster {
         new_trees.push_back(std::move(ret));
       }
     }
+    double tstart = dmlc::GetTime();
     for (int gid = 0; gid < mparam.num_output_group; ++gid) {
       this->CommitModel(std::move(new_trees[gid]), gid);
     }
+    if (tparam.debug_verbose > 0) {
+      LOG(INFO) << "CommitModel(): " << dmlc::GetTime() - tstart << " sec";
+    }
   }

   void Predict(DMatrix* p_fmat,
@@ -474,14 +484,20 @@ class GBTree : public GradientBooster {
     // update cache entry
     for (auto &kv : cache_) {
       CacheEntry& e = kv.second;
       if (e.predictions.size() == 0) {
         PredLoopInternal<GBTree>(
             e.data.get(), &(e.predictions),
             0, trees.size(), true);
       } else {
-        PredLoopInternal<GBTree>(
-            e.data.get(), &(e.predictions),
-            old_ntree, trees.size(), false);
+        if (mparam.num_output_group == 1 && updaters.size() > 0
+            && new_trees.size() == 1
+            && updaters.back()->UpdatePredictionCache(e.data.get(), &(e.predictions))) {
+          {}  // do nothing
+        } else {
+          PredLoopInternal<GBTree>(
+              e.data.get(), &(e.predictions),
+              old_ntree, trees.size(), false);
+        }
       }
     }
   }