[core] fix slow predict-caching with many classes (#3109)

* fix prediction caching inefficiency for multiclass

* silence some warnings

* remove a redundant if

* workaround for R v3.4.3 bug; fixes #3081
Vadim Khotilovich, 2018-02-15 18:31:42 -06:00, committed by GitHub
parent cf19caa46a
commit 9ffe8596f2
5 changed files with 31 additions and 32 deletions
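
The first bullet in the commit message is the heart of the change. In the old code (first GBTree hunk below), DoBoost called CommitModel once per output group, and every CommitModel call ended with predictor->UpdatePredictionCache, so a K-class model refreshed the prediction cache K times per boosting round. The new CommitModel receives the trees for all groups at once and refreshes the cache a single time with the total number of new trees. The toy sketch below (simplified counters, not the xgboost classes) illustrates the call-pattern difference:

// Toy illustration of the caching change (not xgboost code): committing
// per group refreshes the cache once per class; committing all groups at
// once refreshes it a single time per boosting round.
#include <cstdio>
#include <vector>

struct ToyBooster {
  int trees_committed = 0;
  int cache_updates = 0;

  // Old pattern: commit one group's trees and refresh the cache immediately,
  // so the caller's per-group loop triggers one refresh per class.
  void CommitPerGroup(const std::vector<std::vector<int>>& new_trees) {
    for (const auto& group : new_trees) {
      trees_committed += static_cast<int>(group.size());
      ++cache_updates;  // cache refreshed for every output group
    }
  }

  // New pattern: commit every group's trees, then refresh the cache once
  // with the total number of newly added trees.
  void CommitAllGroups(const std::vector<std::vector<int>>& new_trees) {
    int num_new_trees = 0;
    for (const auto& group : new_trees) {
      num_new_trees += static_cast<int>(group.size());
      trees_committed += static_cast<int>(group.size());
    }
    ++cache_updates;  // single refresh per boosting round
  }
};

int main() {
  // 10 classes, one new tree per class in this boosting round.
  std::vector<std::vector<int>> new_trees(10, std::vector<int>(1, 0));
  ToyBooster old_style, new_style;
  old_style.CommitPerGroup(new_trees);
  new_style.CommitAllGroups(new_trees);
  std::printf("old: %d cache updates, new: %d\n",
              old_style.cache_updates, new_style.cache_updates);
  return 0;
}

Compiled and run, the sketch reports 10 cache updates for the old pattern versus 1 for the new one when boosting 10 classes.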

R-package/configure.win (new file, 0 lines)

@@ -281,9 +281,7 @@ class GBTree : public GradientBooster {
     }
     monitor.Stop("BoostNewTrees");
     monitor.Start("CommitModel");
-    for (int gid = 0; gid < ngroup; ++gid) {
-      this->CommitModel(std::move(new_trees[gid]), gid);
-    }
+    this->CommitModel(std::move(new_trees));
     monitor.Stop("CommitModel");
   }
@@ -338,11 +336,13 @@ class GBTree : public GradientBooster {
   // commit new trees all at once
   virtual void
-  CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
-              int bst_group) {
-    model_.CommitModel(std::move(new_trees), bst_group);
-    predictor->UpdatePredictionCache(model_, &updaters, new_trees.size());
+  CommitModel(std::vector<std::vector<std::unique_ptr<RegTree>>>&& new_trees) {
+    int num_new_trees = 0;
+    for (int gid = 0; gid < model_.param.num_output_group; ++gid) {
+      num_new_trees += new_trees[gid].size();
+      model_.CommitModel(std::move(new_trees[gid]), gid);
+    }
+    predictor->UpdatePredictionCache(model_, &updaters, num_new_trees);
   }

   // --- data structure ---
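
The total count matters because of how the predictor consumes it: as the CPUPredictor hunk near the end shows, UpdatePredictionCache computes old_ntree = model.trees.size() - num_new_trees and only has to fold the newly committed trees into the cached predictions. A simplified sketch of that bookkeeping (hypothetical types, ignoring the per-group layout of multiclass predictions):

// Simplified sketch of incremental prediction-cache bookkeeping (hypothetical
// types, not the CPUPredictor internals): given the total number of trees
// just committed, only those trees need to be evaluated and added onto the
// cached predictions.
#include <cstddef>
#include <vector>

struct ToyModel {
  // one additive contribution per (tree, row); stands in for real tree output
  std::vector<std::vector<float>> tree_contrib;  // [tree][row]
  std::size_t num_trees() const { return tree_contrib.size(); }
};

void UpdateCache(const ToyModel& model, int num_new_trees,
                 std::vector<float>* cached_preds) {
  const std::size_t old_ntree =
      model.num_trees() - static_cast<std::size_t>(num_new_trees);
  // add only the contributions of the newly committed trees
  for (std::size_t t = old_ntree; t < model.num_trees(); ++t) {
    for (std::size_t row = 0; row < cached_preds->size(); ++row) {
      (*cached_preds)[row] += model.tree_contrib[t][row];
    }
  }
}

With the single batched call, reporting anything less than the total would make old_ntree overshoot and skip trees, which is why num_new_trees sums over all output groups.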
@@ -514,20 +514,22 @@ class Dart : public GBTree {
       }
     }
   }
   // commit new trees all at once
-  void CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
-                   int bst_group) override {
-    for (size_t i = 0; i < new_trees.size(); ++i) {
-      model_.trees.push_back(std::move(new_trees[i]));
-      model_.tree_info.push_back(bst_group);
-    }
-    model_.param.num_trees += static_cast<int>(new_trees.size());
-    size_t num_drop = NormalizeTrees(new_trees.size());
+  void
+  CommitModel(std::vector<std::vector<std::unique_ptr<RegTree>>>&& new_trees) override {
+    int num_new_trees = 0;
+    for (int gid = 0; gid < model_.param.num_output_group; ++gid) {
+      num_new_trees += new_trees[gid].size();
+      model_.CommitModel(std::move(new_trees[gid]), gid);
+    }
+    size_t num_drop = NormalizeTrees(num_new_trees);
     if (dparam.silent != 1) {
       LOG(INFO) << "drop " << num_drop << " trees, "
                 << "weight = " << weight_drop.back();
     }
   }
   // predict the leaf scores without dropped trees
   inline bst_float PredValue(const RowBatch::Inst &inst,
                              int bst_group,
@@ -550,16 +552,17 @@ class Dart : public GBTree {
     return psum;
   }
-  // select dropped trees
+  // select which trees to drop
   inline void DropTrees(unsigned ntree_limit_drop) {
+    idx_drop.clear();
+    if (ntree_limit_drop > 0) return;
     std::uniform_real_distribution<> runif(0.0, 1.0);
     auto& rnd = common::GlobalRandom();
-    // reset
-    idx_drop.clear();
-    // sample dropped trees
     bool skip = false;
     if (dparam.skip_drop > 0.0) skip = (runif(rnd) < dparam.skip_drop);
-    if (ntree_limit_drop == 0 && !skip) {
+    // sample some trees to drop
+    if (!skip) {
       if (dparam.sample_type == 1) {
         bst_float sum_weight = 0.0;
         for (size_t i = 0; i < weight_drop.size(); ++i) {
@@ -594,6 +597,7 @@ class Dart : public GBTree {
       }
     }
   }
   // set normalization factors
   inline size_t NormalizeTrees(size_t size_new_trees) {
     float lr = 1.0 * dparam.learning_rate / size_new_trees;


@@ -56,7 +56,7 @@ class RegLossObj : public ObjFunction {
     int nthread = omp_get_max_threads();
     // Use a maximum of 8 threads
 #pragma omp parallel for schedule(static) num_threads(std::min(8, nthread))
-    for (int i = 0; i < n - remainder; i += 8) {
+    for (omp_ulong i = 0; i < n - remainder; i += 8) {
       avx::Float8 y(&info.labels[i]);
       avx::Float8 p = Loss::PredTransform(avx::Float8(&preds[i]));
       avx::Float8 w = info.weights.empty() ? avx::Float8(1.0f)
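
The omp_ulong change in this loop is the "silence some warnings" bullet: the loop bound presumably derives from a container size and is therefore unsigned, so a plain int index draws a signed/unsigned comparison warning, and switching the index to xgboost's unsigned omp_ulong typedef matches it to the bound. A minimal reproduction of the pattern (assumed standalone example, not xgboost code):

// Minimal reproduction of the warning this change addresses (assumption: the
// bound is unsigned): a signed int index compared against an unsigned bound
// triggers -Wsign-compare; giving the index the bound's type silences it.
#include <cstddef>
#include <vector>

void scale_inplace(std::vector<float>* values, float factor) {
  const std::size_t n = values->size();
  // 'int i' here would warn about comparing integers of different signedness.
  // OpenMP 3.0 and later accept unsigned loop variables, so the index can
  // simply share the bound's type.
#pragma omp parallel for schedule(static)
  for (std::size_t i = 0; i < n; ++i) {
    (*values)[i] *= factor;
  }
}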


@@ -100,13 +100,9 @@ class CPUPredictor : public Predictor {
                         const gbm::GBTreeModel& model, int tree_begin,
                         unsigned ntree_limit) {
     // TODO(Rory): Check if this specialisation actually improves performance
-    if (model.param.num_output_group == 1) {
-      PredLoopSpecalize(dmat, out_preds, model, 1, tree_begin, ntree_limit);
-    } else {
-      PredLoopSpecalize(dmat, out_preds, model, model.param.num_output_group,
-                        tree_begin, ntree_limit);
-    }
+    PredLoopSpecalize(dmat, out_preds, model, model.param.num_output_group,
+                      tree_begin, ntree_limit);
   }

  public:
   void PredictBatch(DMatrix* dmat, HostDeviceVector<bst_float>* out_preds,
@@ -132,8 +128,7 @@ class CPUPredictor : public Predictor {
     this->PredLoopInternal(dmat, out_preds, model, tree_begin, ntree_limit);
   }

-  void UpdatePredictionCache(
-      const gbm::GBTreeModel& model,
+  void UpdatePredictionCache(const gbm::GBTreeModel& model,
       std::vector<std::unique_ptr<TreeUpdater>>* updaters,
       int num_new_trees) override {
     int old_ntree = model.trees.size() - num_new_trees;