From 792cff5abc325ee841607ab41a04300e1040ec55 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 15 May 2015 23:54:03 -0700 Subject: [PATCH] checkin some micro optimization --- Makefile | 2 +- src/tree/param.h | 18 ++++- src/tree/updater_colmaker-inl.hpp | 103 ++++++++++++++++++++++++++++- src/tree/updater_histmaker-inl.hpp | 84 ++++++++++++++++++++--- 4 files changed, 193 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index 140537879..e568222c2 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ export CC = gcc export CXX = g++ export MPICXX = mpicxx export LDFLAGS= -pthread -lm -export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas +export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops ifeq ($(OS), Windows_NT) export CXX = g++ -m64 diff --git a/src/tree/param.h b/src/tree/param.h index 3458a93a4..118540119 100644 --- a/src/tree/param.h +++ b/src/tree/param.h @@ -48,6 +48,8 @@ struct TrainParam{ int size_leaf_vector; // option for parallelization int parallel_option; + // option to open cacheline optimization + int cache_opt; // number of threads to be used for tree construction, // if OpenMP is enabled, if equals 0, use system default int nthread; @@ -70,6 +72,7 @@ struct TrainParam{ parallel_option = 2; sketch_eps = 0.1f; sketch_ratio = 2.0f; + cache_opt = 0; } /*! * \brief set parameters from outside @@ -96,6 +99,7 @@ struct TrainParam{ if (!strcmp(name, "sketch_ratio")) sketch_ratio = static_cast(atof(val)); if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast(atof(val)); if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val); + if (!strcmp(name, "cache_opt")) cache_opt = atoi(val); if (!strcmp(name, "max_depth")) max_depth = atoi(val); if (!strcmp(name, "nthread")) nthread = atoi(val); if (!strcmp(name, "parallel_option")) parallel_option = atoi(val); @@ -192,6 +196,11 @@ struct GradStats { double sum_grad; /*! \brief sum hessian statistics */ double sum_hess; + /*! 
+ * \brief whether this is simply statistics and we only need to call + * Add(gpair), instead of Add(gpair, info, ridx) + */ + static const int kSimpleStats = 1; /*! \brief constructor, the object must be cleared during construction */ explicit GradStats(const TrainParam &param) { this->Clear(); @@ -204,7 +213,14 @@ struct GradStats { inline static void CheckInfo(const BoosterInfo &info) { } /*! - * \brief accumulate statistics, + * \brief accumulate statistics + * \param p the gradient pair + */ + inline void Add(bst_gpair p) { + this->Add(p.grad, p.hess); + } + /*! + * \brief accumulate statistics, more complicated version * \param gpair the vector storing the gradient statistics * \param info the additional information * \param ridx instance index of this instance diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp index f657c0335..b52842a93 100644 --- a/src/tree/updater_colmaker-inl.hpp +++ b/src/tree/updater_colmaker-inl.hpp @@ -356,7 +356,100 @@ class ColMaker: public IUpdater { } } } - } + } + // update enumeration solution + inline void UpdateEnumeration(int nid, bst_gpair gstats, + float fvalue, int d_step, bst_uint fid, + TStats &c, std::vector &temp) { + // get the statistics of nid + ThreadEntry &e = temp[nid]; + // test if first hit, this is fine, because we set 0 during init + if (e.stats.Empty()) { + e.stats.Add(gstats); + e.last_fvalue = fvalue; + } else { + // try to find a split + if (std::abs(fvalue - e.last_fvalue) > rt_2eps && e.stats.sum_hess >= param.min_child_weight) { + c.SetSubstract(snode[nid].stats, e.stats); + if (c.sum_hess >= param.min_child_weight) { + bst_float loss_chg = static_cast(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain); + e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, d_step == -1); + } + } + // update the statistics + e.stats.Add(gstats); + e.last_fvalue = fvalue; + } + } + // same as EnumerateSplit, with cacheline prefetch optimization + inline void 
EnumerateSplitCacheOpt(const ColBatch::Entry *begin, + const ColBatch::Entry *end, + int d_step, + bst_uint fid, + const std::vector &gpair, + std::vector &temp) { + const std::vector &qexpand = qexpand_; + // clear all the temp statistics + for (size_t j = 0; j < qexpand.size(); ++j) { + temp[qexpand[j]].stats.Clear(); + } + // left statistics + TStats c(param); + // local cache buffer for position and gradient pair + const int kBuffer = 32; + int buf_position[kBuffer]; + bst_gpair buf_gpair[kBuffer]; + // aligned ending position + const ColBatch::Entry *align_end; + if (d_step > 0) { + align_end = begin + (end - begin) / kBuffer * kBuffer; + } else { + align_end = begin - (begin - end) / kBuffer * kBuffer; + } + int i; + const ColBatch::Entry *it; + const int align_step = d_step * kBuffer; + // internal cached loop + for (it = begin; it != align_end; it += align_step) { + const ColBatch::Entry *p; + for (i = 0, p = it; i < kBuffer; ++i, p += d_step) { + buf_position[i] = position[p->index]; + buf_gpair[i] = gpair[p->index]; + } + for (i = 0, p = it; i < kBuffer; ++i, p += d_step) { + const int nid = buf_position[i]; + if (nid < 0) continue; + this->UpdateEnumeration(nid, buf_gpair[i], + p->fvalue, d_step, + fid, c, temp); + } + } + // finish up the ending piece + for (it = align_end, i = 0; it != end; ++i, it += d_step) { + buf_position[i] = position[it->index]; + buf_gpair[i] = gpair[it->index]; + } + for (it = align_end, i = 0; it != end; ++i, it += d_step) { + const int nid = buf_position[i]; + if (nid < 0) continue; + this->UpdateEnumeration(nid, buf_gpair[i], + it->fvalue, d_step, + fid, c, temp); + } + // finish updating all statistics, check if it is possible to include all sum statistics + for (size_t i = 0; i < qexpand.size(); ++i) { + const int nid = qexpand[i]; + ThreadEntry &e = temp[nid]; + c.SetSubstract(snode[nid].stats, e.stats); + if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) { + bst_float loss_chg = 
static_cast(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain); + const float gap = std::abs(e.last_fvalue) + rt_eps; + const float delta = d_step == +1 ? gap: -gap; + e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1); + } + } + } + // enumerate the split values of specific feature inline void EnumerateSplit(const ColBatch::Entry *begin, const ColBatch::Entry *end, @@ -365,6 +458,11 @@ class ColMaker: public IUpdater { const std::vector &gpair, const BoosterInfo &info, std::vector &temp) { + // use cacheline aware optimization + if (TStats::kSimpleStats != 0 && param.cache_opt != 0) { + EnumerateSplitCacheOpt(begin, end, d_step, fid, gpair, temp); + return; + } const std::vector &qexpand = qexpand_; // clear all the temp statistics for (size_t j = 0; j < qexpand.size(); ++j) { @@ -411,6 +509,7 @@ class ColMaker: public IUpdater { } } } + // update the solution candidate virtual void UpdateSolution(const ColBatch &batch, const std::vector &gpair, @@ -550,8 +649,8 @@ class ColMaker: public IUpdater { #pragma omp parallel for schedule(static) for (bst_omp_uint j = 0; j < ndata; ++j) { const bst_uint ridx = col[j].index; - const float fvalue = col[j].fvalue; const int nid = this->DecodePosition(ridx); + const float fvalue = col[j].fvalue; // go back to parent, correct those who are not default if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) { if(fvalue < tree[nid].split_cond()) { diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp index d6279592f..f739f23f3 100644 --- a/src/tree/updater_histmaker-inl.hpp +++ b/src/tree/updater_histmaker-inl.hpp @@ -282,6 +282,16 @@ class CQHistMaker: public HistMaker { utils::Assert(istart != hist.size, "the bound variable must be max"); hist.data[istart].Add(gpair, info, ridx); } + /*! 
+ * \brief add a histogram to data, + * do linear scan, start from istart + */ + inline void Add(bst_float fv, + bst_gpair gstats) { + while (istart < hist.size && !(fv < hist.cut[istart])) ++istart; + utils::Assert(istart != hist.size, "the bound variable must be max"); + hist.data[istart].Add(gstats); + } }; // sketch type used for this typedef utils::WXQuantileSketch WXQSketch; @@ -479,11 +489,38 @@ class CQHistMaker: public HistMaker { hbuilder[nid].istart = 0; hbuilder[nid].hist = this->wspace.hset[0][fid_offset + wid * (fset.size()+1)]; } - for (bst_uint j = 0; j < c.length; ++j) { - const bst_uint ridx = c[j].index; - const int nid = this->position[ridx]; - if (nid >= 0) { - hbuilder[nid].Add(c[j].fvalue, gpair, info, ridx); + if (TStats::kSimpleStats != 0 && this->param.cache_opt != 0) { + const bst_uint kBuffer = 32; + bst_uint align_length = c.length / kBuffer * kBuffer; + int buf_position[kBuffer]; + bst_gpair buf_gpair[kBuffer]; + for (bst_uint j = 0; j < align_length; j += kBuffer) { + for (bst_uint i = 0; i < kBuffer; ++i) { + bst_uint ridx = c[j + i].index; + buf_position[i] = this->position[ridx]; + buf_gpair[i] = gpair[ridx]; + } + for (bst_uint i = 0; i < kBuffer; ++i) { + const int nid = buf_position[i]; + if (nid >= 0) { + hbuilder[nid].Add(c[j + i].fvalue, buf_gpair[i]); + } + } + } + for (bst_uint j = align_length; j < c.length; ++j) { + const bst_uint ridx = c[j].index; + const int nid = this->position[ridx]; + if (nid >= 0) { + hbuilder[nid].Add(c[j].fvalue, gpair[ridx]); + } + } + } else { + for (bst_uint j = 0; j < c.length; ++j) { + const bst_uint ridx = c[j].index; + const int nid = this->position[ridx]; + if (nid >= 0) { + hbuilder[nid].Add(c[j].fvalue, gpair, info, ridx); + } } } } @@ -536,11 +573,38 @@ class CQHistMaker: public HistMaker { sbuilder[nid].Init(max_size); } // second pass, build the sketch - for (bst_uint j = 0; j < c.length; ++j) { - const bst_uint ridx = c[j].index; - const int nid = this->position[ridx]; - if (nid >= 
0) { - sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size); + if (TStats::kSimpleStats != 0 && this->param.cache_opt != 0) { + const bst_uint kBuffer = 32; + bst_uint align_length = c.length / kBuffer * kBuffer; + int buf_position[kBuffer]; + bst_float buf_hess[kBuffer]; + for (bst_uint j = 0; j < align_length; j += kBuffer) { + for (bst_uint i = 0; i < kBuffer; ++i) { + bst_uint ridx = c[j + i].index; + buf_position[i] = this->position[ridx]; + buf_hess[i] = gpair[ridx].hess; + } + for (bst_uint i = 0; i < kBuffer; ++i) { + const int nid = buf_position[i]; + if (nid >= 0) { + sbuilder[nid].Push(c[j + i].fvalue, buf_hess[i], max_size); + } + } + } + for (bst_uint j = align_length; j < c.length; ++j) { + const bst_uint ridx = c[j].index; + const int nid = this->position[ridx]; + if (nid >= 0) { + sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size); + } + } + } else { + for (bst_uint j = 0; j < c.length; ++j) { + const bst_uint ridx = c[j].index; + const int nid = this->position[ridx]; + if (nid >= 0) { + sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size); + } } } for (size_t i = 0; i < this->qexpand.size(); ++i) {