From 32beb56ba3b4eb55e7270f2987066ff50f997982 Mon Sep 17 00:00:00 2001 From: tqchen Date: Tue, 18 Nov 2014 22:21:41 -0800 Subject: [PATCH] only need to add in create hist col base --- src/tree/updater_basemaker-inl.hpp | 77 ++++++++++- src/tree/updater_histmaker-inl.hpp | 215 ++++++++--------------------- src/tree/updater_skmaker-inl.hpp | 74 ---------- 3 files changed, 135 insertions(+), 231 deletions(-) diff --git a/src/tree/updater_basemaker-inl.hpp b/src/tree/updater_basemaker-inl.hpp index 8152b86cb..f414752d9 100644 --- a/src/tree/updater_basemaker-inl.hpp +++ b/src/tree/updater_basemaker-inl.hpp @@ -8,6 +8,7 @@ #include #include #include "../utils/random.h" +#include "../utils/quantile.h" namespace xgboost { namespace tree { @@ -238,7 +239,81 @@ class BaseMaker: public IUpdater { s.Add(thread_temp[tid][nid]); } } - } + } + /*! \brief common helper data structure to build sketch*/ + struct SketchEntry { + /*! \brief total sum of amount to be met */ + bst_float sum_total; + /*! \brief statistics used in the sketch */ + bst_float rmin, wmin; + /*! \brief last seen feature value */ + bst_float last_fvalue; + /*! \brief current size of sketch */ + bst_float next_goal; + // pointer to the sketch to put things in + utils::WXQuantileSketch *sketch; + // initialize the space + inline void Init(unsigned max_size) { + next_goal = -1.0f; + rmin = wmin = 0.0f; + sketch->temp.Reserve(max_size + 1); + sketch->temp.size = 0; + } + /*! 
+ * \brief push a new element to sketch + * \param fvalue feature value, comes in sorted ascending order + * \param w weight + * \param max_size upper bound on the number of entries Push may place in sketch->temp + */ + inline void Push(bst_float fvalue, bst_float w, unsigned max_size) { + if (next_goal == -1.0f) { + next_goal = 0.0f; + last_fvalue = fvalue; + wmin = w; + return; + } + if (last_fvalue != fvalue) { + bst_float rmax = rmin + wmin; + if (rmax >= next_goal) { + if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) { + // push to sketch + sketch->temp.data[sketch->temp.size] = + utils::WXQuantileSketch:: + Entry(rmin, rmax, wmin, last_fvalue); + utils::Assert(sketch->temp.size < max_size, + "invalid maximum size max_size=%u, stemp.size=%lu\n", + max_size, sketch->temp.size); + ++sketch->temp.size; + } + if (sketch->temp.size == max_size) { + next_goal = sum_total * 2.0f + 1e-5f; + } else{ + next_goal = static_cast(sketch->temp.size * sum_total / max_size); + } + } + rmin = rmax; + wmin = w; + last_fvalue = fvalue; + } else { + wmin += w; + } + } + /*! \brief push final unfinished value to the sketch, then call sketch->PushTemp() */ + inline void Finalize(unsigned max_size) { + bst_float rmax = rmin + wmin; + if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) { + utils::Assert(sketch->temp.size <= max_size, + "Finalize: invalid maximum size, max_size=%u, stemp.size=%lu", + max_size, sketch->temp.size); + // push to sketch + sketch->temp.data[sketch->temp.size] = + utils::WXQuantileSketch:: + Entry(rmin, rmax, wmin, last_fvalue); + ++sketch->temp.size; + } + sketch->PushTemp(); + } + }; /*! \brief training parameter of tree grower */ TrainParam param; /*! \brief queue of nodes to be expanded */ diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp index d893de52d..63b5e99f4 100644 --- a/src/tree/updater_histmaker-inl.hpp +++ b/src/tree/updater_histmaker-inl.hpp @@ -79,7 +79,7 @@ class HistMaker: public BaseMaker { /*! 
\brief cut field */ std::vector cut; // per thread histset - std::vector hset; + std::vector hset; // initialize the hist set inline void Init(const TrainParam &param, int nthread) { hset.resize(nthread); @@ -111,28 +111,26 @@ class HistMaker: public BaseMaker { inline size_t Size(void) const { return rptr.size() - 1; } - }; + }; // workspace of thread ThreadWSpace wspace; // reducer for histogram - sync::Reducer histred; - - // this function does two jobs - // (1) reset the position in array position, to be the latest leaf id - // (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly - virtual void ResetPosAndPropose(const std::vector &gpair, - IFMatrix *p_fmat, - const BoosterInfo &info, - const RegTree &tree) = 0; - + sync::Reducer histred; + // update function implementation virtual void Update(const std::vector &gpair, IFMatrix *p_fmat, const BoosterInfo &info, RegTree *p_tree) { this->InitData(gpair, *p_fmat, info.root_index, *p_tree); for (int depth = 0; depth < param.max_depth; ++depth) { + // reset and propose candidate split + this->ResetPosAndPropose(gpair, p_fmat, info, *p_tree); + // create histogram + this->CreateHist(gpair, p_fmat, info, *p_tree); + // find split based on histogram statistics this->FindSplit(depth, gpair, p_fmat, info, p_tree); - this->ResetPositionCol(this->qexpand, p_fmat, *p_tree); + // reset position after split + this->ResetPositionAfterSplit(p_fmat, *p_tree); this->UpdateQueueExpand(*p_tree); // if nothing left to be expand, break if (qexpand.size() == 0) break; @@ -142,12 +140,21 @@ class HistMaker: public BaseMaker { (*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate); } } - - private: - inline void CreateHist(const std::vector &gpair, - IFMatrix *p_fmat, - const BoosterInfo &info, - const RegTree &tree) { + // this function does two jobs + // (1) reset the position in array position, to be the latest leaf id + // (2) propose a set of candidate cuts and set wspace.rptr wspace.cut 
correctly + virtual void ResetPosAndPropose(const std::vector &gpair, + IFMatrix *p_fmat, + const BoosterInfo &info, + const RegTree &tree) = 0; + // reset position after split, this is not a must, depending on implementation + virtual void ResetPositionAfterSplit(IFMatrix *p_fmat, + const RegTree &tree) { + } + virtual void CreateHist(const std::vector &gpair, + IFMatrix *p_fmat, + const BoosterInfo &info, + const RegTree &tree) { bst_uint num_feature = tree.param.num_feature; int nthread; #pragma omp parallel @@ -190,6 +197,8 @@ class HistMaker: public BaseMaker { // sync the histogram histred.AllReduce(BeginPtr(wspace.hset[0].data), wspace.hset[0].data.size()); } + + private: inline void EnumerateSplit(const HistUnit &hist, const TStats &node_sum, bst_uint fid, @@ -231,10 +240,6 @@ class HistMaker: public BaseMaker { const BoosterInfo &info, RegTree *p_tree) { const bst_uint num_feature = p_tree->param.num_feature; - // reset and propose candidate split - this->ResetPosAndPropose(gpair, p_fmat, info, *p_tree); - // create histogram - this->CreateHist(gpair, p_fmat, info, *p_tree); // get the best split condition for each node std::vector sol(qexpand.size()); std::vector left_sum(qexpand.size()); @@ -288,17 +293,23 @@ class HistMaker: public BaseMaker { template class CQHistMaker: public HistMaker { protected: + virtual void ResetPositionAfterSplit(IFMatrix *p_fmat, + const RegTree &tree) { + this->ResetPositionCol(this->qexpand, p_fmat, tree); + } virtual void ResetPosAndPropose(const std::vector &gpair, IFMatrix *p_fmat, const BoosterInfo &info, const RegTree &tree) { + this->GetNodeStats(gpair, *p_fmat, tree, info, + &thread_stats, &node_stats); sketchs.resize(this->qexpand.size() * tree.param.num_feature); for (size_t i = 0; i < sketchs.size(); ++i) { sketchs[i].Init(info.num_row, this->param.sketch_eps); } - thread_temp.resize(this->get_nthread()); - std::vector root_stats; - this->GetRootStats(gpair, *p_fmat, tree, &root_stats); + 
thread_sketch.resize(this->get_nthread()); + // number of rows in + const size_t nrows = p_fmat->buffered_rowset().size(); // start accumulating statistics utils::IIterator *iter = p_fmat->ColIterator(); iter->BeforeFirst(); @@ -308,15 +319,15 @@ class CQHistMaker: public HistMaker { const bst_omp_uint nsize = static_cast(batch.size); #pragma omp parallel for schedule(dynamic, 1) for (bst_omp_uint i = 0; i < nsize; ++i) { - this->MakeSketch(gpair, batch[i], tree, - root_stats, - batch.col_index[i], - p_fmat->GetColDensity(batch.col_index[i]), - &thread_temp[omp_get_thread_num()]); + this->UpdateSketchCol(gpair, batch[i], tree, + node_stats, + batch.col_index[i], + batch[i].length == nrows, + &thread_sketch[omp_get_thread_num()]); } } // setup maximum size - size_t max_size = static_cast(this->param.sketch_ratio / this->param.sketch_eps); + unsigned max_size = this->param.max_sketch_size(); // synchronize sketch summary_array.Init(sketchs.size(), max_size); for (size_t i = 0; i < sketchs.size(); ++i) { @@ -356,93 +367,18 @@ class CQHistMaker: public HistMaker { (tree.param.num_feature + 1) * this->qexpand.size() + 1, "cut space inconsistent"); } - // temporal space to build a sketch - struct SketchEntry { - /*! \brief total sum of */ - bst_float sum_total; - /*! \brief statistics used in the sketch */ - bst_float rmin, wmin; - /*! \brief last seen feature value */ - bst_float last_fvalue; - /*! \brief current size of sketch */ - bst_float next_goal; - // pointer to the sketch to put things in - utils::WXQuantileSketch *sketch; - // initialize the space - inline void Init(unsigned max_size) { - next_goal = 0.0f; - rmin = wmin = 0.0f; - sketch->temp.Reserve(max_size + 1); - sketch->temp.size = 0; - } - /*! 
- * \brief push a new element to sketch - * \param fvalue feature value, comes in sorted ascending order - * \param w weight - * \param max_size - */ - inline void Push(bst_float fvalue, bst_float w, unsigned max_size) { - if (wmin == 0.0f) { - last_fvalue = fvalue; - wmin = w; - return; - } - if (last_fvalue != fvalue) { - bst_float rmax = rmin + wmin; - if (rmax >= next_goal) { - if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) { - // push to sketch - sketch->temp.data[sketch->temp.size] = - utils::WXQuantileSketch:: - Entry(rmin, rmax, wmin, last_fvalue); - utils::Assert(sketch->temp.size < max_size, - "invalid maximum size max_size=%u, stemp.size=%lu\n", - max_size, sketch->temp.size); - ++sketch->temp.size; - } - if (sketch->temp.size == max_size) { - next_goal = sum_total * 2.0f + 1e-5f; - } else{ - next_goal = static_cast(sketch->temp.size * sum_total / max_size); - } - } - rmin = rmax; - wmin = w; - last_fvalue = fvalue; - } else { - wmin += w; - } - } - /*! 
\brief push final unfinished value to the sketch */ - inline void Finalize(unsigned max_size) { - bst_float rmax = rmin + wmin; - //utils::Assert(fabs(rmax - sum_total) < 1e-4 + sum_total * 1e-5, - //"invalid sum value, rmax=%f, sum_total=%lf", rmax, sum_total); - if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) { - utils::Assert(sketch->temp.size <= max_size, - "Finalize: invalid maximum size, max_size=%u, stemp.size=%lu", - sketch->temp.size, max_size ); - // push to sketch - sketch->temp.data[sketch->temp.size] = - utils::WXQuantileSketch:: - Entry(rmin, rmax, wmin, last_fvalue); - ++sketch->temp.size; - } - sketch->PushTemp(); - } - }; private: - inline void MakeSketch(const std::vector &gpair, - const ColBatch::Inst &c, - const RegTree &tree, - const std::vector &root_stats, - bst_uint fid, - float col_density, - std::vector *p_temp) { + inline void UpdateSketchCol(const std::vector &gpair, + const ColBatch::Inst &c, + const RegTree &tree, + const std::vector &nstats, + bst_uint fid, + bool col_full, + std::vector *p_temp) { if (c.length == 0) return; // initialize sbuilder for use - std::vector &sbuilder = *p_temp; + std::vector &sbuilder = *p_temp; sbuilder.resize(tree.param.num_nodes); for (size_t i = 0; i < this->qexpand.size(); ++i) { const unsigned nid = this->qexpand[i]; @@ -451,7 +387,7 @@ class CQHistMaker: public HistMaker { sbuilder[nid].sketch = &sketchs[wid * tree.param.num_feature + fid]; } - if (col_density != 1.0f) { + if (!col_full) { // first pass, get sum of weight, TODO, optimization to skip first pass for (bst_uint j = 0; j < c.length; ++j) { const bst_uint ridx = c[j].index; @@ -463,7 +399,7 @@ class CQHistMaker: public HistMaker { } else { for (size_t i = 0; i < this->qexpand.size(); ++i) { const unsigned nid = this->qexpand[i]; - sbuilder[nid].sum_total = root_stats[nid]; + sbuilder[nid].sum_total = nstats[nid].sum_hess; } } // if only one value, no need to do second pass @@ -475,7 +411,7 @@ 
class CQHistMaker: public HistMaker { return; } // two pass scan - unsigned max_size = static_cast(this->param.sketch_ratio / this->param.sketch_eps); + unsigned max_size = this->param.max_sketch_size(); for (size_t i = 0; i < this->qexpand.size(); ++i) { const int nid = this->qexpand[i]; sbuilder[nid].Init(max_size); @@ -493,47 +429,14 @@ class CQHistMaker: public HistMaker { sbuilder[nid].Finalize(max_size); } } - inline void GetRootStats(const std::vector &gpair, - const IFMatrix &fmat, - const RegTree &tree, - std::vector *p_snode) { - std::vector &snode = *p_snode; - thread_temp.resize(this->get_nthread()); - snode.resize(tree.param.num_nodes); - #pragma omp parallel - { - const int tid = omp_get_thread_num(); - thread_temp[tid].resize(tree.param.num_nodes); - for (size_t i = 0; i < this->qexpand.size(); ++i) { - const unsigned nid = this->qexpand[i]; - thread_temp[tid][nid].sum_total = 0.0f; - } - } - const std::vector &rowset = fmat.buffered_rowset(); - // setup position - const bst_omp_uint ndata = static_cast(rowset.size()); - #pragma omp parallel for schedule(static) - for (bst_omp_uint i = 0; i < ndata; ++i) { - const bst_uint ridx = rowset[i]; - const int tid = omp_get_thread_num(); - if (this->position[ridx] < 0) continue; - thread_temp[tid][this->position[ridx]].sum_total += gpair[ridx].hess; - } - // sum the per thread statistics together - for (size_t j = 0; j < this->qexpand.size(); ++j) { - const int nid = this->qexpand[j]; - double wsum = 0.0f; - for (size_t tid = 0; tid < thread_temp.size(); ++tid) { - wsum += thread_temp[tid][nid].sum_total; - } - // update node statistics - snode[nid] = static_cast(wsum); - } - } typedef utils::WXQuantileSketch WXQSketch; // thread temp data - std::vector< std::vector > thread_temp; + std::vector< std::vector > thread_sketch; + // used to hold statistics + std::vector< std::vector > thread_stats; + // node statistics + std::vector node_stats; // summary array WXQSketch::SummaryArray summary_array; // reducer 
for summary diff --git a/src/tree/updater_skmaker-inl.hpp b/src/tree/updater_skmaker-inl.hpp index 58f150ee6..dd23b22c1 100644 --- a/src/tree/updater_skmaker-inl.hpp +++ b/src/tree/updater_skmaker-inl.hpp @@ -130,80 +130,6 @@ class SketchMaker: public BaseMaker { inline void SetLeafVec(const TrainParam &param, bst_float *vec) const { } }; - // temporal space to build a sketch - struct SketchEntry { - /*! \brief total sum of amount to be met */ - bst_float sum_total; - /*! \brief statistics used in the sketch */ - bst_float rmin, wmin; - /*! \brief last seen feature value */ - bst_float last_fvalue; - /*! \brief current size of sketch */ - bst_float next_goal; - // pointer to the sketch to put things in - utils::WXQuantileSketch *sketch; - // initialize the space - inline void Init(unsigned max_size) { - next_goal = -1.0f; - rmin = wmin = 0.0f; - sketch->temp.Reserve(max_size + 1); - sketch->temp.size = 0; - } - /*! - * \brief push a new element to sketch - * \param fvalue feature value, comes in sorted ascending order - * \param w weight - * \param max_size - */ - inline void Push(bst_float fvalue, bst_float w, unsigned max_size) { - if (next_goal == -1.0f) { - next_goal = 0.0f; - last_fvalue = fvalue; - wmin = w; - return; - } - if (last_fvalue != fvalue) { - bst_float rmax = rmin + wmin; - if (rmax >= next_goal) { - if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) { - // push to sketch - sketch->temp.data[sketch->temp.size] = - utils::WXQuantileSketch:: - Entry(rmin, rmax, wmin, last_fvalue); - utils::Assert(sketch->temp.size < max_size, - "invalid maximum size max_size=%u, stemp.size=%lu\n", - max_size, sketch->temp.size); - ++sketch->temp.size; - } - if (sketch->temp.size == max_size) { - next_goal = sum_total * 2.0f + 1e-5f; - } else{ - next_goal = static_cast(sketch->temp.size * sum_total / max_size); - } - } - rmin = rmax; - wmin = w; - last_fvalue = fvalue; - } else { - wmin += w; - } - } - /*! 
\brief push final unfinished value to the sketch */ - inline void Finalize(unsigned max_size) { - bst_float rmax = rmin + wmin; - if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) { - utils::Assert(sketch->temp.size <= max_size, - "Finalize: invalid maximum size, max_size=%u, stemp.size=%lu", - sketch->temp.size, max_size ); - // push to sketch - sketch->temp.data[sketch->temp.size] = - utils::WXQuantileSketch:: - Entry(rmin, rmax, wmin, last_fvalue); - ++sketch->temp.size; - } - sketch->PushTemp(); - } - }; inline void BuildSketch(const std::vector &gpair, IFMatrix *p_fmat, const BoosterInfo &info,