Improve multi-threaded performance (#2104)

* Add UpdatePredictionCache() option to updaters. Some updaters (e.g. fast_hist) have enough information to quickly compute the prediction cache for the training data. Each updater may override the UpdatePredictionCache() method to update the prediction cache. Note: this trick does not apply to validation data. * Respond to code review * Disable some debug messages by default * Document UpdatePredictionCache() interface * Remove base_margin logic from UpdatePredictionCache() implementation * Do not take pointer to cfg, as reference may get stale * Improve multi-threaded performance * Use columnwise accessor to accelerate ApplySplit() step, with support for a compressed representation * Parallel sort for evaluation step * Inline BuildHist() function * Cache gradient pairs when building histograms in BuildHist() * Add missing #if macro * Respond to code review * Use wrapper to enable parallel sort on Linux * Fix C++ compatibility issues * MSVC doesn't support unsigned in OpenMP loops * gcc 4.6 doesn't support using keyword * Fix lint issues * Respond to code review * Fix bug in ApplySplitSparseData() * Attempting to read beyond the end of a sparse column * Mishandling the case where an entire range of rows has missing values * Fix training continuation bug. Disable UpdatePredictionCache() in the first iteration. This way, we can accommodate the scenario where we build off of an existing (nonempty) ensemble. * Add regression test for fast_hist * Respond to code review * Add back old version of ApplySplitSparseData
2017-03-25 10:35:01 -07:00
parent 332aea26a3
commit 14fba01b5a
14 changed files with 719 additions and 171 deletions
--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@@ -97,44 +97,40 @@ struct EvalAuc : public Metric {
    // sum statistics
    bst_float sum_auc = 0.0f;
    int auc_error = 0;
-    #pragma omp parallel reduction(+:sum_auc)
-    {
-      // each thread takes a local rec
-      std::vector< std::pair<bst_float, unsigned> > rec;
-      #pragma omp for schedule(static)
-      for (bst_omp_uint k = 0; k < ngroup; ++k) {
-        rec.clear();
-        for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
-          rec.push_back(std::make_pair(preds[j], j));
-        }
-        std::sort(rec.begin(), rec.end(), common::CmpFirst);
-        // calculate AUC
-        double sum_pospair = 0.0;
-        double sum_npos = 0.0, sum_nneg = 0.0, buf_pos = 0.0, buf_neg = 0.0;
-        for (size_t j = 0; j < rec.size(); ++j) {
-          const bst_float wt = info.GetWeight(rec[j].second);
-          const bst_float ctr = info.labels[rec[j].second];
-          // keep bucketing predictions in same bucket
-          if (j != 0 && rec[j].first != rec[j - 1].first) {
-            sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
-            sum_npos += buf_pos;
-            sum_nneg += buf_neg;
-            buf_neg = buf_pos = 0.0f;
-          }
-          buf_pos += ctr * wt;
-          buf_neg += (1.0f - ctr) * wt;
-        }
-        sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
-        sum_npos += buf_pos;
-        sum_nneg += buf_neg;
-        // check weird conditions
-        if (sum_npos <= 0.0 || sum_nneg <= 0.0) {
-          auc_error = 1;
-          continue;
-        }
-        // this is the AUC
-        sum_auc += sum_pospair / (sum_npos*sum_nneg);
+    // each thread takes a local rec
+    std::vector< std::pair<bst_float, unsigned> > rec;
+    for (bst_omp_uint k = 0; k < ngroup; ++k) {
+      rec.clear();
+      for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
+        rec.push_back(std::make_pair(preds[j], j));
      }
+      XGBOOST_PARALLEL_SORT(rec.begin(), rec.end(), common::CmpFirst);
+      // calculate AUC
+      double sum_pospair = 0.0;
+      double sum_npos = 0.0, sum_nneg = 0.0, buf_pos = 0.0, buf_neg = 0.0;
+      for (size_t j = 0; j < rec.size(); ++j) {
+        const bst_float wt = info.GetWeight(rec[j].second);
+        const bst_float ctr = info.labels[rec[j].second];
+        // keep bucketing predictions in same bucket
+        if (j != 0 && rec[j].first != rec[j - 1].first) {
+          sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
+          sum_npos += buf_pos;
+          sum_nneg += buf_neg;
+          buf_neg = buf_pos = 0.0f;
+        }
+        buf_pos += ctr * wt;
+        buf_neg += (1.0f - ctr) * wt;
+      }
+      sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
+      sum_npos += buf_pos;
+      sum_nneg += buf_neg;
+      // check weird conditions
+      if (sum_npos <= 0.0 || sum_nneg <= 0.0) {
+        auc_error = 1;
+        continue;
+      }
+      // this is the AUC
+      sum_auc += sum_pospair / (sum_npos*sum_nneg);
    }
    CHECK(!auc_error)
      << "AUC: the dataset only contains pos or neg samples";
@@ -262,9 +258,9 @@ struct EvalNDCG : public EvalRankList{
    return sumdcg;
  }
  virtual bst_float EvalMetric(std::vector<std::pair<bst_float, unsigned> > &rec) const { // NOLINT(*)
-    std::stable_sort(rec.begin(), rec.end(), common::CmpFirst);
+    XGBOOST_PARALLEL_STABLE_SORT(rec.begin(), rec.end(), common::CmpFirst);
    bst_float dcg = this->CalcDCG(rec);
-    std::stable_sort(rec.begin(), rec.end(), common::CmpSecond);
+    XGBOOST_PARALLEL_STABLE_SORT(rec.begin(), rec.end(), common::CmpSecond);
    bst_float idcg = this->CalcDCG(rec);
    if (idcg == 0.0f) {
      if (minus_) {