From 9caccd3b36c4f811ae908c0d126b7f08522a1ed6 Mon Sep 17 00:00:00 2001
From: "tqchen@graphlab.com" <tqchen@graphlab.com>
Date: Tue, 19 Aug 2014 12:07:52 -0700
Subject: [PATCH] change row subsample to prob

---
 src/data.h                        | 65 ++++++++++++++++---------------
 src/learner/learner-inl.hpp       |  8 ++--
 src/tree/updater_colmaker-inl.hpp | 51 ++++++++++++++----------
 3 files changed, 69 insertions(+), 55 deletions(-)

diff --git a/src/data.h b/src/data.h
index 603334b5c..f5e02a562 100644
--- a/src/data.h
+++ b/src/data.h
@@ -14,6 +14,7 @@
 #include "utils/io.h"
 #include "utils/utils.h"
 #include "utils/iterator.h"
+#include "utils/random.h"
 #include "utils/matrix_csr.h"
 
 namespace xgboost {
@@ -184,7 +185,6 @@ class FMatrixS : public FMatrixInterface{
   /*! \brief constructor */
   FMatrixS(void) {
     iter_ = NULL;
-    num_buffered_row_ = 0;
   }
   // destructor
   ~FMatrixS(void) {
@@ -200,8 +200,8 @@ class FMatrixS : public FMatrixInterface{
     return col_ptr_.size() - 1;
   }
   /*! \brief get number of buffered rows */
-  inline size_t NumBufferedRow(void) const {
-    return num_buffered_row_;
+  inline const std::vector<bst_uint> buffered_rowset(void) const {
+    return buffered_rowset_;
   }
   /*! \brief get col sorted iterator */
   inline ColIter GetSortedCol(size_t cidx) const {
@@ -224,12 +224,12 @@ class FMatrixS : public FMatrixInterface{
   }
   /*! \brief get column density */
   inline float GetColDensity(size_t cidx) const {
-    size_t nmiss = num_buffered_row_ - (col_ptr_[cidx+1] - col_ptr_[cidx]);
-    return 1.0f - (static_cast<float>(nmiss)) / num_buffered_row_;
+    size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]);
+    return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
   }
-  inline void InitColAccess(size_t max_nrow = ULONG_MAX) {
+  inline void InitColAccess(float pkeep = 1.0f) {
     if (this->HaveColAccess()) return;
-    this->InitColData(max_nrow);
+    this->InitColData(pkeep);
   }
   /*!
    * \brief get the row iterator associated with FMatrix
@@ -248,8 +248,8 @@ class FMatrixS : public FMatrixInterface{
    * \param fo output stream to save to
    */
   inline void SaveColAccess(utils::IStream &fo) const {
-    fo.Write(&num_buffered_row_, sizeof(num_buffered_row_));
-    if (num_buffered_row_ != 0) {
+    fo.Write(buffered_rowset_);
+    if (buffered_rowset_.size() != 0) {
       SaveBinary(fo, col_ptr_, col_data_);
     }
   }
   /*!
    * \brief load column access data from stream
    * \param fo output stream to load from
    */
   inline void LoadColAccess(utils::IStream &fi) {
-    utils::Check(fi.Read(&num_buffered_row_, sizeof(num_buffered_row_)) != 0,
-                 "invalid input file format");
-    if (num_buffered_row_ != 0) {
+    utils::Check(fi.Read(&buffered_rowset_), "invalid input file format");
+    if (buffered_rowset_.size() != 0) {
       LoadBinary(fi, &col_ptr_, &col_data_);
     }
   }
@@ -304,39 +303,43 @@ class FMatrixS : public FMatrixInterface{
  protected:
   /*!
   * \brief intialize column data
-   * \param max_nrow maximum number of rows supported
+   * \param pkeep probability to keep a row
    */
-  inline void InitColData(size_t max_nrow) {
+  inline void InitColData(float pkeep) {
+    buffered_rowset_.clear();
     // note: this part of code is serial, todo, parallelize this transformer
     utils::SparseCSRMBuilder builder(col_ptr_, col_data_);
     builder.InitBudget(0);
     // start working
     iter_->BeforeFirst();
-    num_buffered_row_ = 0;
     while (iter_->Next()) {
       const SparseBatch &batch = iter_->Value();
-      if (batch.base_rowid >= max_nrow) break;
-      const size_t nbatch = std::min(batch.size, max_nrow - batch.base_rowid);
-      for (size_t i = 0; i < nbatch; ++i, ++num_buffered_row_) {
-        SparseBatch::Inst inst = batch[i];
-        for (bst_uint j = 0; j < inst.length; ++j) {
-          builder.AddBudget(inst[j].findex);
+      for (size_t i = 0; i < batch.size; ++i) {
+        if (pkeep==1.0f || random::SampleBinary(pkeep)) {
+          buffered_rowset_.push_back(batch.base_rowid+i);
+          SparseBatch::Inst inst = batch[i];
+          for (bst_uint j = 0; j < inst.length; ++j) {
+            builder.AddBudget(inst[j].findex);
+          }
         }
       }
     }
     builder.InitStorage();
     iter_->BeforeFirst();
+    size_t ktop = 0;
     while (iter_->Next()) {
       const SparseBatch &batch = iter_->Value();
-      if (batch.base_rowid >= max_nrow) break;
-      const size_t nbatch = std::min(batch.size, max_nrow - batch.base_rowid);
-      for (size_t i = 0; i < nbatch; ++i) {
-        SparseBatch::Inst inst = batch[i];
-        for (bst_uint j = 0; j < inst.length; ++j) {
-          builder.PushElem(inst[j].findex,
-                           Entry((bst_uint)(batch.base_rowid+i),
-                                 inst[j].fvalue));
+      for (size_t i = 0; i < batch.size; ++i) {
+        if (ktop < buffered_rowset_.size() &&
+            buffered_rowset_[ktop] == batch.base_rowid+i) {
+          ++ ktop;
+          SparseBatch::Inst inst = batch[i];
+          for (bst_uint j = 0; j < inst.length; ++j) {
+            builder.PushElem(inst[j].findex,
+                             Entry((bst_uint)(batch.base_rowid+i),
+                                   inst[j].fvalue));
+          }
         }
       }
     }
@@ -353,8 +356,8 @@ class FMatrixS : public FMatrixInterface{
  private:
   // --- data structure used to support InitColAccess --
   utils::IIterator<SparseBatch> *iter_;
-  /*! \brief number */
-  size_t num_buffered_row_;
+  /*! \brief list of row index that are buffered */
+  std::vector<bst_uint> buffered_rowset_;
   /*! \brief column pointer of CSC format */
   std::vector col_ptr_;
   /*! \brief column datas in CSC format */
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index a8cad7ebd..bd5cf6e3b 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -30,7 +30,7 @@ class BoostLearner {
     name_obj_ = "reg:linear";
     name_gbm_ = "gbtree";
     silent= 0;
-    max_buffer_row = std::numeric_limits<size_t>::max();
+    prob_buffer_row = 1.0f;
   }
   ~BoostLearner(void) {
     if (obj_ != NULL) delete obj_;
@@ -80,7 +80,7 @@ class BoostLearner {
    */
   inline void SetParam(const char *name, const char *val) {
     if (!strcmp(name, "silent")) silent = atoi(val);
-    if (!strcmp(name, "max_buffer_row")) sscanf(val, "%lu", &max_buffer_row);
+    if (!strcmp(name, "prob_buffer_row")) prob_buffer_row = static_cast<float>(atof(val));
     if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
     if (!strcmp("seed", name)) random::Seed(atoi(val));
     if (!strcmp(name, "num_class")) this->SetParam("num_output_group", val);
@@ -151,7 +151,7 @@ class BoostLearner {
    * \param p_train pointer to the matrix used by training
    */
   inline void CheckInit(DMatrix *p_train) {
-    p_train->fmat.InitColAccess(max_buffer_row);
+    p_train->fmat.InitColAccess(prob_buffer_row);
   }
   /*!
   * \brief update the model for one iteration
@@ -293,7 +293,7 @@ class BoostLearner {
   // silent during training
   int silent;
   // maximum buffred row value
-  size_t max_buffer_row;
+  float prob_buffer_row;
   // evaluation set
   EvalSet evaluator_;
   // model parameter
diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp
index b7d3f4e2f..35880b70e 100644
--- a/src/tree/updater_colmaker-inl.hpp
+++ b/src/tree/updater_colmaker-inl.hpp
@@ -80,13 +80,13 @@ class ColMaker: public IUpdater {
                       const std::vector<unsigned> &root_index,
                       RegTree *p_tree) {
     this->InitData(gpair, fmat, root_index, *p_tree);
-    this->InitNewNode(qexpand, gpair, *p_tree);
+    this->InitNewNode(qexpand, gpair, fmat, *p_tree);
     for (int depth = 0; depth < param.max_depth; ++depth) {
       this->FindSplit(depth, this->qexpand, gpair, fmat, p_tree);
       this->ResetPosition(this->qexpand, fmat, *p_tree);
       this->UpdateQueueExpand(*p_tree, &this->qexpand);
-      this->InitNewNode(qexpand, gpair, *p_tree);
+      this->InitNewNode(qexpand, gpair, fmat, *p_tree);
       // if nothing left to be expand, break
       if (qexpand.size() == 0) break;
     }
@@ -109,25 +109,31 @@ class ColMaker: public IUpdater {
                        const FMatrix &fmat,
                        const std::vector<unsigned> &root_index,
                        const RegTree &tree) {
     utils::Assert(tree.param.num_nodes == tree.param.num_roots, "ColMaker: can only grow new tree");
+    const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
     {// setup position
-      position.resize(fmat.NumBufferedRow());
+      position.resize(gpair.size());
       if (root_index.size() == 0) {
-        std::fill(position.begin(), position.end(), 0);
+        for (size_t i = 0; i < rowset.size(); ++i) {
+          position[rowset[i]] = 0;
+        }
       } else {
-        for (size_t i = 0; i < position.size(); ++i) {
-          position[i] = root_index[i];
-          utils::Assert(root_index[i] < (unsigned)tree.param.num_roots, "root index exceed setting");
+        for (size_t i = 0; i < rowset.size(); ++i) {
+          const bst_uint ridx = rowset[i];
+          position[ridx] = root_index[ridx];
+          utils::Assert(root_index[ridx] < (unsigned)tree.param.num_roots, "root index exceed setting");
         }
       }
       // mark delete for the deleted datas
-      for (size_t i = 0; i < position.size(); ++i) {
-        if (gpair[i].hess < 0.0f) position[i] = -1;
+      for (size_t i = 0; i < rowset.size(); ++i) {
+        const bst_uint ridx = rowset[i];
+        if (gpair[ridx].hess < 0.0f) position[ridx] = -1;
       }
       // mark subsample
       if (param.subsample < 1.0f) {
-        for (size_t i = 0; i < position.size(); ++i) {
-          if (gpair[i].hess < 0.0f) continue;
-          if (random::SampleBinary(param.subsample) == 0) position[i] = -1;
+        for (size_t i = 0; i < rowset.size(); ++i) {
+          const bst_uint ridx = rowset[i];
+          if (gpair[ridx].hess < 0.0f) continue;
+          if (random::SampleBinary(param.subsample) == 0) position[ridx] = -1;
         }
       }
     }
@@ -168,6 +174,7 @@ class ColMaker: public IUpdater {
   /*!
    * \brief initialize the base_weight, root_gain, and NodeEntry for all the new nodes in qexpand
    */
   inline void InitNewNode(const std::vector<int> &qexpand,
                           const std::vector<bst_gpair> &gpair,
+                          const FMatrix &fmat,
                           const RegTree &tree) {
     {// setup statistics space for each tree node
       for (size_t i = 0; i < stemp.size(); ++i) {
       }
       snode.resize(tree.param.num_nodes, NodeEntry());
     }
+    const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
     // setup position
-    const unsigned ndata = static_cast<unsigned>(position.size());
+    const unsigned ndata = static_cast<unsigned>(rowset.size());
     #pragma omp parallel for schedule(static)
     for (unsigned i = 0; i < ndata; ++i) {
+      const bst_uint ridx = rowset[i];
       const int tid = omp_get_thread_num();
-      if (position[i] < 0) continue;
-      stemp[tid][position[i]].stats.Add(gpair[i]);
+      if (position[ridx] < 0) continue;
+      stemp[tid][position[ridx]].stats.Add(gpair[ridx]);
     }
     // sum the per thread statistics together
     for (size_t j = 0; j < qexpand.size(); ++j) {
@@ -303,17 +312,19 @@ class ColMaker: public IUpdater {
   }
   // reset position of each data points after split is created in the tree
   inline void ResetPosition(const std::vector<int> &qexpand, const FMatrix &fmat, const RegTree &tree) {
+    const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
     // step 1, set default direct nodes to default, and leaf nodes to -1
-    const unsigned ndata = static_cast<unsigned>(position.size());
+    const unsigned ndata = static_cast<unsigned>(rowset.size());
     #pragma omp parallel for schedule(static)
-    for (unsigned i = 0; i < ndata; ++i) {
-      const int nid = position[i];
+    for (unsigned i = 0; i < ndata; ++i) {
+      const bst_uint ridx = rowset[i];
+      const int nid = position[ridx];
       if (nid >= 0) {
         if (tree[nid].is_leaf()) {
-          position[i] = -1;
+          position[ridx] = -1;
         } else {
          // push to default branch, correct latter
-          position[i] = tree[nid].default_left() ? tree[nid].cleft(): tree[nid].cright();
+          position[ridx] = tree[nid].default_left() ? tree[nid].cleft(): tree[nid].cright();
         }
       }
     }
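
Illustrative sketch (not part of the patch): the change above replaces "buffer the first max_buffer_row rows" with "keep each row independently with probability prob_buffer_row (pkeep)", and later passes stay aligned with the kept rows by walking the sorted rowset with a cursor, as InitColData does with ktop. The standalone C++ program below renders just that idea under stated assumptions: std::bernoulli_distribution stands in for xgboost's random::SampleBinary, and a plain vector of row indices stands in for FMatrixS::buffered_rowset_; none of these names come from the patched files except where noted in the comments.

    // Minimal sketch of probabilistic row buffering; not the xgboost implementation.
    #include <cstddef>
    #include <cstdio>
    #include <random>
    #include <vector>

    // Keep each row independently with probability pkeep and return the kept row ids
    // in increasing order, mirroring how InitColData(pkeep) fills buffered_rowset_.
    std::vector<unsigned> SampleRowset(std::size_t num_rows, float pkeep, std::mt19937 &rng) {
      std::bernoulli_distribution keep(pkeep);
      std::vector<unsigned> rowset;
      for (std::size_t i = 0; i < num_rows; ++i) {
        if (pkeep == 1.0f || keep(rng)) rowset.push_back(static_cast<unsigned>(i));
      }
      return rowset;
    }

    int main() {
      std::mt19937 rng(0);
      // prob_buffer_row = 0.5: roughly half the rows end up buffered for column access.
      std::vector<unsigned> rowset = SampleRowset(1000, 0.5f, rng);
      // A second pass over all rows stays aligned with the sample by advancing a
      // cursor over the sorted rowset, the same trick InitColData uses with ktop.
      std::size_t ktop = 0;
      for (unsigned ridx = 0; ridx < 1000; ++ridx) {
        if (ktop < rowset.size() && rowset[ktop] == ridx) {
          ++ktop;  // this row was kept; its entries would be pushed into the CSC builder
        }
      }
      std::printf("kept %zu of 1000 rows\n", rowset.size());
      return 0;
    }

Because the kept indices are generated in increasing row order, membership tests in later passes reduce to advancing a single cursor rather than a per-row lookup, which is the design the patch relies on both in InitColData and in the updater's rowset loops.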