Check in some micro-optimizations

tqchen 2015-05-15 23:54:03 -07:00
parent 9c52fc8e22
commit 792cff5abc
4 changed files with 193 additions and 14 deletions

View File

@@ -2,7 +2,7 @@ export CC = gcc
export CXX = g++
export MPICXX = mpicxx
export LDFLAGS= -pthread -lm
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops
ifeq ($(OS), Windows_NT)
export CXX = g++ -m64
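The only build change is appending -funroll-loops to CFLAGS, which lets GCC unroll loops whose trip counts it can reason about, trading a little code size for fewer loop tests in the hot scan loops touched below. Purely as a conceptual illustration (not code from this commit), the effect on a simple reduction is roughly:

// Conceptual sketch of what -funroll-loops does; the compiler performs this
// transformation automatically, no source change is required.
long SumBaseline(const int *a, int n) {
  long s = 0;
  for (int i = 0; i < n; ++i) s += a[i];     // one test + branch per element
  return s;
}
long SumUnrolled(const int *a, int n) {
  long s = 0;
  int i = 0;
  for (; i + 4 <= n; i += 4) {               // four elements per test + branch
    s += a[i] + a[i + 1] + a[i + 2] + a[i + 3];
  }
  for (; i < n; ++i) s += a[i];              // remainder
  return s;
}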

View File

@@ -48,6 +48,8 @@ struct TrainParam{
int size_leaf_vector;
// option for parallelization
int parallel_option;
// option to enable the cacheline optimization
int cache_opt;
// number of threads to be used for tree construction,
// if OpenMP is enabled, if equals 0, use system default
int nthread;
@@ -70,6 +72,7 @@ struct TrainParam{
parallel_option = 2;
sketch_eps = 0.1f;
sketch_ratio = 2.0f;
cache_opt = 0;
}
/*!
* \brief set parameters from outside
@@ -96,6 +99,7 @@ struct TrainParam{
if (!strcmp(name, "sketch_ratio")) sketch_ratio = static_cast<float>(atof(val));
if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val));
if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val);
if (!strcmp(name, "cache_opt")) cache_opt = atoi(val);
if (!strcmp(name, "max_depth")) max_depth = atoi(val);
if (!strcmp(name, "nthread")) nthread = atoi(val);
if (!strcmp(name, "parallel_option")) parallel_option = atoi(val);
@@ -192,6 +196,11 @@ struct GradStats {
double sum_grad;
/*! \brief sum hessian statistics */
double sum_hess;
/*!
* \brief whether these are simple statistics, i.e. only
*   Add(gpair) needs to be called instead of Add(gpair, info, ridx)
*/
static const int kSimpleStats = 1;
/*! \brief constructor, the object must be cleared during construction */
explicit GradStats(const TrainParam &param) {
this->Clear();
@@ -204,7 +213,14 @@ struct GradStats {
inline static void CheckInfo(const BoosterInfo &info) {
}
/*!
* \brief accumulate statistics,
* \brief accumulate statistics
* \param p the gradient pair
*/
inline void Add(bst_gpair p) {
this->Add(p.grad, p.hess);
}
/*!
* \brief accumulate statistics, more complicated version
* \param gpair the vector storing the gradient statistics
* \param info the additional information
* \param ridx instance index of this instance
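The kSimpleStats constant added above acts as a compile-time capability flag: a statistics type that sets it to 1 promises that Add(gpair) alone is enough to accumulate it, which is what the buffered fast paths introduced in this commit rely on. As a hypothetical counter-example (not from this commit), a stats type that still needs per-instance information would declare the flag as 0 and keep using the original code paths:

// Hypothetical stats type: kSimpleStats = 0 means Add(gpair, info, ridx) is
// required, so the cache_opt branches below are skipped for it.
struct WeightedStats {
  static const int kSimpleStats = 0;
  double sum_grad;
  double sum_hess;
};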

View File

@@ -356,7 +356,100 @@ class ColMaker: public IUpdater {
}
}
}
}
}
// update enumeration solution
inline void UpdateEnumeration(int nid, bst_gpair gstats,
float fvalue, int d_step, bst_uint fid,
TStats &c, std::vector<ThreadEntry> &temp) {
// get the statistics of nid
ThreadEntry &e = temp[nid];
// test if first hit, this is fine, because we set 0 during init
if (e.stats.Empty()) {
e.stats.Add(gstats);
e.last_fvalue = fvalue;
} else {
// try to find a split
if (std::abs(fvalue - e.last_fvalue) > rt_2eps && e.stats.sum_hess >= param.min_child_weight) {
c.SetSubstract(snode[nid].stats, e.stats);
if (c.sum_hess >= param.min_child_weight) {
bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, d_step == -1);
}
}
// update the statistics
e.stats.Add(gstats);
e.last_fvalue = fvalue;
}
}
// same as EnumerateSplit, with cacheline prefetch optimization
inline void EnumerateSplitCacheOpt(const ColBatch::Entry *begin,
const ColBatch::Entry *end,
int d_step,
bst_uint fid,
const std::vector<bst_gpair> &gpair,
std::vector<ThreadEntry> &temp) {
const std::vector<int> &qexpand = qexpand_;
// clear all the temp statistics
for (size_t j = 0; j < qexpand.size(); ++j) {
temp[qexpand[j]].stats.Clear();
}
// left statistics
TStats c(param);
// local cache buffer for position and gradient pair
const int kBuffer = 32;
int buf_position[kBuffer];
bst_gpair buf_gpair[kBuffer];
// aligned ending position
const ColBatch::Entry *align_end;
if (d_step > 0) {
align_end = begin + (end - begin) / kBuffer * kBuffer;
} else {
align_end = begin - (begin - end) / kBuffer * kBuffer;
}
int i;
const ColBatch::Entry *it;
const int align_step = d_step * kBuffer;
// internal cached loop
for (it = begin; it != align_end; it += align_step) {
const ColBatch::Entry *p;
for (i = 0, p = it; i < kBuffer; ++i, p += d_step) {
buf_position[i] = position[p->index];
buf_gpair[i] = gpair[p->index];
}
for (i = 0, p = it; i < kBuffer; ++i, p += d_step) {
const int nid = buf_position[i];
if (nid < 0) continue;
this->UpdateEnumeration(nid, buf_gpair[i],
p->fvalue, d_step,
fid, c, temp);
}
}
// finish up the ending piece
for (it = align_end, i = 0; it != end; ++i, it += d_step) {
buf_position[i] = position[it->index];
buf_gpair[i] = gpair[it->index];
}
for (it = align_end, i = 0; it != end; ++i, it += d_step) {
const int nid = buf_position[i];
if (nid < 0) continue;
this->UpdateEnumeration(nid, buf_gpair[i],
it->fvalue, d_step,
fid, c, temp);
}
// finish updating all statistics, check if it is possible to include all sum statistics
for (size_t i = 0; i < qexpand.size(); ++i) {
const int nid = qexpand[i];
ThreadEntry &e = temp[nid];
c.SetSubstract(snode[nid].stats, e.stats);
if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) {
bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
const float gap = std::abs(e.last_fvalue) + rt_eps;
const float delta = d_step == +1 ? gap: -gap;
e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1);
}
}
}
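The core of EnumerateSplitCacheOpt is a two-phase block loop: for each block of kBuffer entries it first gathers the scattered position[] and gpair[] reads into small stack buffers, then runs the split enumeration over the buffered values, so the irregular indirect accesses are grouped together and stay cache-friendly. A standalone sketch of the same pattern (hypothetical names, forward direction only):

#include <vector>

// Hypothetical sketch of the blocked-gather pattern used above:
// phase 1 gathers the indirect reads for a block, phase 2 consumes them.
template <typename Entry, typename Stat, typename Visitor>
inline void BlockedScan(const Entry *begin, const Entry *end,
                        const std::vector<int> &position,
                        const std::vector<Stat> &gpair,
                        Visitor visit) {
  const int kBuffer = 32;                       // same block size as the patch
  int buf_position[kBuffer];
  Stat buf_gpair[kBuffer];
  const Entry *align_end = begin + (end - begin) / kBuffer * kBuffer;
  for (const Entry *it = begin; it != align_end; it += kBuffer) {
    for (int i = 0; i < kBuffer; ++i) {         // phase 1: gather
      buf_position[i] = position[it[i].index];
      buf_gpair[i] = gpair[it[i].index];
    }
    for (int i = 0; i < kBuffer; ++i) {         // phase 2: consume
      if (buf_position[i] < 0) continue;        // instance not in an active node
      visit(buf_position[i], buf_gpair[i], it[i].fvalue);
    }
  }
  for (const Entry *it = align_end; it != end; ++it) {  // remainder, unbuffered
    const int nid = position[it->index];
    if (nid >= 0) visit(nid, gpair[it->index], it->fvalue);
  }
}

In the actual patch the same structure also handles d_step == -1 (reverse scans) and ends with the backward-direction check over qexpand shown above.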
// enumerate the split values of specific feature
inline void EnumerateSplit(const ColBatch::Entry *begin,
const ColBatch::Entry *end,
@@ -365,6 +458,11 @@ class ColMaker: public IUpdater {
const std::vector<bst_gpair> &gpair,
const BoosterInfo &info,
std::vector<ThreadEntry> &temp) {
// use cacheline aware optimization
if (TStats::kSimpleStats != 0 && param.cache_opt != 0) {
EnumerateSplitCacheOpt(begin, end, d_step, fid, gpair, temp);
return;
}
const std::vector<int> &qexpand = qexpand_;
// clear all the temp statistics
for (size_t j = 0; j < qexpand.size(); ++j) {
@@ -411,6 +509,7 @@ class ColMaker: public IUpdater {
}
}
}
// update the solution candidate
virtual void UpdateSolution(const ColBatch &batch,
const std::vector<bst_gpair> &gpair,
@@ -550,8 +649,8 @@ class ColMaker: public IUpdater {
#pragma omp parallel for schedule(static)
for (bst_omp_uint j = 0; j < ndata; ++j) {
const bst_uint ridx = col[j].index;
const float fvalue = col[j].fvalue;
const int nid = this->DecodePosition(ridx);
const float fvalue = col[j].fvalue;
// go back to parent, correct those who are not default
if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
if(fvalue < tree[nid].split_cond()) {

View File

@@ -282,6 +282,16 @@ class CQHistMaker: public HistMaker<TStats> {
utils::Assert(istart != hist.size, "the bound variable must be max");
hist.data[istart].Add(gpair, info, ridx);
}
/*!
* \brief add a gradient pair to the histogram bin containing fv,
* doing a linear scan that starts from istart
*/
inline void Add(bst_float fv,
bst_gpair gstats) {
while (istart < hist.size && !(fv < hist.cut[istart])) ++istart;
utils::Assert(istart != hist.size, "the bound variable must be max");
hist.data[istart].Add(gstats);
}
};
// sketch type used for this
typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
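The new Add(fv, gstats) overload keeps the same forward-only linear scan over the cut points as the existing Add: istart is remembered across calls, which relies on each column's entries being visited in ascending feature-value order, so the bin index never has to move backwards. A small self-contained illustration of that lookup with hypothetical cut points:

#include <cstdio>

// Hypothetical illustration of the forward-only bin lookup: cut[] holds the
// bin boundaries and istart only ever moves to the right.
int main() {
  const float cut[] = {0.5f, 1.5f, 3.0f, 10.0f};           // 4 bins (hypothetical)
  const int size = 4;
  const float fvalues[] = {0.1f, 0.7f, 0.7f, 2.9f, 9.0f};  // ascending order
  int istart = 0;
  for (float fv : fvalues) {
    while (istart < size && !(fv < cut[istart])) ++istart;
    std::printf("fvalue %.1f -> bin %d\n", fv, istart);
  }
  return 0;
}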
@@ -479,11 +489,38 @@ class CQHistMaker: public HistMaker<TStats> {
hbuilder[nid].istart = 0;
hbuilder[nid].hist = this->wspace.hset[0][fid_offset + wid * (fset.size()+1)];
}
for (bst_uint j = 0; j < c.length; ++j) {
const bst_uint ridx = c[j].index;
const int nid = this->position[ridx];
if (nid >= 0) {
hbuilder[nid].Add(c[j].fvalue, gpair, info, ridx);
if (TStats::kSimpleStats != 0 && this->param.cache_opt != 0) {
const bst_uint kBuffer = 32;
bst_uint align_length = c.length / kBuffer * kBuffer;
int buf_position[kBuffer];
bst_gpair buf_gpair[kBuffer];
for (bst_uint j = 0; j < align_length; j += kBuffer) {
for (bst_uint i = 0; i < kBuffer; ++i) {
bst_uint ridx = c[j + i].index;
buf_position[i] = this->position[ridx];
buf_gpair[i] = gpair[ridx];
}
for (bst_uint i = 0; i < kBuffer; ++i) {
const int nid = buf_position[i];
if (nid >= 0) {
hbuilder[nid].Add(c[j + i].fvalue, buf_gpair[i]);
}
}
}
for (bst_uint j = align_length; j < c.length; ++j) {
const bst_uint ridx = c[j].index;
const int nid = this->position[ridx];
if (nid >= 0) {
hbuilder[nid].Add(c[j].fvalue, gpair[ridx]);
}
}
} else {
for (bst_uint j = 0; j < c.length; ++j) {
const bst_uint ridx = c[j].index;
const int nid = this->position[ridx];
if (nid >= 0) {
hbuilder[nid].Add(c[j].fvalue, gpair, info, ridx);
}
}
}
}
@@ -536,11 +573,38 @@ class CQHistMaker: public HistMaker<TStats> {
sbuilder[nid].Init(max_size);
}
// second pass, build the sketch
for (bst_uint j = 0; j < c.length; ++j) {
const bst_uint ridx = c[j].index;
const int nid = this->position[ridx];
if (nid >= 0) {
sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size);
if (TStats::kSimpleStats != 0 && this->param.cache_opt != 0) {
const bst_uint kBuffer = 32;
bst_uint align_length = c.length / kBuffer * kBuffer;
int buf_position[kBuffer];
bst_float buf_hess[kBuffer];
for (bst_uint j = 0; j < align_length; j += kBuffer) {
for (bst_uint i = 0; i < kBuffer; ++i) {
bst_uint ridx = c[j + i].index;
buf_position[i] = this->position[ridx];
buf_hess[i] = gpair[ridx].hess;
}
for (bst_uint i = 0; i < kBuffer; ++i) {
const int nid = buf_position[i];
if (nid >= 0) {
sbuilder[nid].Push(c[j + i].fvalue, buf_hess[i], max_size);
}
}
}
for (bst_uint j = align_length; j < c.length; ++j) {
const bst_uint ridx = c[j].index;
const int nid = this->position[ridx];
if (nid >= 0) {
sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size);
}
}
} else {
for (bst_uint j = 0; j < c.length; ++j) {
const bst_uint ridx = c[j].index;
const int nid = this->position[ridx];
if (nid >= 0) {
sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size);
}
}
}
for (size_t i = 0; i < this->qexpand.size(); ++i) {