From a59f8945dca8eca2985352888c9c8e9ed89b106c Mon Sep 17 00:00:00 2001
From: "tqchen@graphlab.com" <tqchen@graphlab.com>
Date: Wed, 27 Aug 2014 10:56:55 -0700
Subject: [PATCH] rename SparseBatch to RowBatch

---
 src/data.h                       | 46 +++++++++++++++++++-------------
 src/gbm/gblinear-inl.hpp         |  6 ++---
 src/gbm/gbtree-inl.hpp           |  6 ++---
 src/io/simple_dmatrix-inl.hpp    | 22 +++++++--------
 src/tree/model.h                 |  4 +--
 src/tree/updater_refresh-inl.hpp |  6 ++---
 wrapper/xgboost_wrapper.cpp      | 12 ++++-----
 7 files changed, 56 insertions(+), 46 deletions(-)

diff --git a/src/data.h b/src/data.h
index 9cd352584..e6c13ae03 100644
--- a/src/data.h
+++ b/src/data.h
@@ -96,8 +96,6 @@ struct SparseBatch {
   };
   /*! \brief batch size */
   size_t size;
-  /*! \brief the offset of rowid of this batch */
-  size_t base_rowid;
   /*! \brief array[size+1], row pointer of each of the elements */
   const size_t *row_ptr;
   /*! \brief array[row_ptr.back()], content of the sparse element */
@@ -107,7 +105,19 @@ struct SparseBatch {
     return Inst(data_ptr + row_ptr[i], static_cast<bst_uint>(row_ptr[i+1] - row_ptr[i]));
   }
 };
-
+/*! \brief read-only row batch, used to access row continuously */
+struct RowBatch : public SparseBatch {
+  /*! \brief the offset of rowid of this batch */
+  size_t base_rowid;
+};
+/*!
+ * \brief read-only column batch, used to access columns,
+ * the columns are not required to be continuous
+ */
+struct ColBatch : public RowBatch {
+  /*! \brief column index of each columns in the data */
+  bst_uint *col_index;
+};
 /**
  * \brief This is a interface convention via template, defining the way to access features,
  *        column access rule is defined by template, for efficiency purpose,
@@ -168,7 +178,7 @@ class FMatrixInterface {
    */
   inline float GetColDensity(size_t cidx) const;
   /*! \brief get the row iterator associated with FMatrix */
-  inline utils::IIterator<SparseBatch>* RowIterator(void) const;
+  inline utils::IIterator<RowBatch>* RowIterator(void) const;
 };

 /*!
@@ -176,7 +186,7 @@ class FMatrixInterface {
  */
 class FMatrixS : public FMatrixInterface<FMatrixS>{
  public:
-  typedef SparseBatch::Entry Entry;
+  typedef RowBatch::Entry Entry;
   /*! \brief row iterator */
   struct ColIter{
     const Entry *dptr_, *end_;
@@ -261,12 +271,12 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
   * \brief get the row iterator associated with FMatrix
   * this function is not threadsafe, returns iterator stored in FMatrixS
   */
-  inline utils::IIterator<SparseBatch>* RowIterator(void) const {
+  inline utils::IIterator<RowBatch>* RowIterator(void) const {
     iter_->BeforeFirst();
     return iter_;
   }
   /*! \brief set iterator */
-  inline void set_iter(utils::IIterator<SparseBatch> *iter) {
+  inline void set_iter(utils::IIterator<RowBatch> *iter) {
     this->iter_ = iter;
   }
   /*!
@@ -297,12 +307,12 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
    */
   inline static void SaveBinary(utils::IStream &fo,
                                 const std::vector<size_t> &ptr,
-                                const std::vector<SparseBatch::Entry> &data) {
+                                const std::vector<RowBatch::Entry> &data) {
     size_t nrow = ptr.size() - 1;
     fo.Write(&nrow, sizeof(size_t));
     fo.Write(&ptr[0], ptr.size() * sizeof(size_t));
     if (data.size() != 0) {
-      fo.Write(&data[0], data.size() * sizeof(SparseBatch::Entry));
+      fo.Write(&data[0], data.size() * sizeof(RowBatch::Entry));
     }
   }
   /*!
@@ -313,7 +323,7 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
    */
   inline static void LoadBinary(utils::IStream &fi,
                                 std::vector<size_t> *out_ptr,
-                                std::vector<SparseBatch::Entry> *out_data) {
+                                std::vector<RowBatch::Entry> *out_data) {
     size_t nrow;
     utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format");
     out_ptr->resize(nrow + 1);
@@ -321,7 +331,7 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
                  "invalid input file format");
     out_data->resize(out_ptr->back());
     if (out_data->size() != 0) {
-      utils::Assert(fi.Read(&(*out_data)[0], out_data->size() * sizeof(SparseBatch::Entry)) != 0,
+      utils::Assert(fi.Read(&(*out_data)[0], out_data->size() * sizeof(RowBatch::Entry)) != 0,
                     "invalid input file format");
     }
   }
@@ -334,16 +344,16 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
   inline void InitColData(float pkeep) {
     buffered_rowset_.clear();
     // note: this part of code is serial, todo, parallelize this transformer
-    utils::SparseCSRMBuilder<SparseBatch::Entry> builder(col_ptr_, col_data_);
+    utils::SparseCSRMBuilder<RowBatch::Entry> builder(col_ptr_, col_data_);
     builder.InitBudget(0);
     // start working
     iter_->BeforeFirst();
     while (iter_->Next()) {
-      const SparseBatch &batch = iter_->Value();
+      const RowBatch &batch = iter_->Value();
       for (size_t i = 0; i < batch.size; ++i) {
         if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
           buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
-          SparseBatch::Inst inst = batch[i];
+          RowBatch::Inst inst = batch[i];
           for (bst_uint j = 0; j < inst.length; ++j) {
             builder.AddBudget(inst[j].index);
           }
         }
@@ -355,12 +365,12 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
     iter_->BeforeFirst();
     size_t ktop = 0;
     while (iter_->Next()) {
-      const SparseBatch &batch = iter_->Value();
+      const RowBatch &batch = iter_->Value();
       for (size_t i = 0; i < batch.size; ++i) {
         if (ktop < buffered_rowset_.size() &&
             buffered_rowset_[ktop] == batch.base_rowid+i) {
           ++ktop;
-          SparseBatch::Inst inst = batch[i];
+          RowBatch::Inst inst = batch[i];
           for (bst_uint j = 0; j < inst.length; ++j) {
             builder.PushElem(inst[j].index,
                              Entry((bst_uint)(batch.base_rowid+i),
@@ -381,13 +391,13 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{

  private:
   // --- data structure used to support InitColAccess --
-  utils::IIterator<SparseBatch> *iter_;
+  utils::IIterator<RowBatch> *iter_;
   /*! \brief list of row index that are buffered */
   std::vector<bst_uint> buffered_rowset_;
   /*! \brief column pointer of CSC format */
   std::vector<size_t> col_ptr_;
   /*! \brief column datas in CSC format */
-  std::vector<SparseBatch::Entry> col_data_;
+  std::vector<RowBatch::Entry> col_data_;
 };
 }  // namespace xgboost
 #endif  // XGBOOST_DATA_H
diff --git a/src/gbm/gblinear-inl.hpp b/src/gbm/gblinear-inl.hpp
index 6fe0dcf83..07edda034 100644
--- a/src/gbm/gblinear-inl.hpp
+++ b/src/gbm/gblinear-inl.hpp
@@ -106,11 +106,11 @@ class GBLinear : public IGradBooster {
     std::vector<float> &preds = *out_preds;
     preds.resize(0);
     // start collecting the prediction
-    utils::IIterator<SparseBatch> *iter = fmat.RowIterator();
+    utils::IIterator<RowBatch> *iter = fmat.RowIterator();
     iter->BeforeFirst();
     const int ngroup = model.param.num_output_group;
     while (iter->Next()) {
-      const SparseBatch &batch = iter->Value();
+      const RowBatch &batch = iter->Value();
       utils::Assert(batch.base_rowid * ngroup == preds.size(),
                     "base_rowid is not set correctly");
       // output convention: nrow * k, where nrow is number of rows
@@ -146,7 +146,7 @@ class GBLinear : public IGradBooster {
     }
     random::Shuffle(feat_index);
   }
-  inline void Pred(const SparseBatch::Inst &inst, float *preds) {
+  inline void Pred(const RowBatch::Inst &inst, float *preds) {
     for (int gid = 0; gid < model.param.num_output_group; ++gid) {
       float psum = model.bias()[gid];
       for (bst_uint i = 0; i < inst.length; ++i) {
diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp
index 1f34aa55b..2660f2a5f 100644
--- a/src/gbm/gbtree-inl.hpp
+++ b/src/gbm/gbtree-inl.hpp
@@ -121,10 +121,10 @@ class GBTree : public IGradBooster {
     const size_t stride = info.num_row * mparam.num_output_group;
     preds.resize(stride * (mparam.size_leaf_vector+1));
     // start collecting the prediction
-    utils::IIterator<SparseBatch> *iter = fmat.RowIterator();
+    utils::IIterator<RowBatch> *iter = fmat.RowIterator();
     iter->BeforeFirst();
     while (iter->Next()) {
-      const SparseBatch &batch = iter->Value();
+      const RowBatch &batch = iter->Value();
       // parallel over local batch
       const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
       #pragma omp parallel for schedule(static)
@@ -208,7 +208,7 @@ class GBTree : public IGradBooster {
     mparam.num_trees += tparam.num_parallel_tree;
   }
   // make a prediction for a single instance
-  inline void Pred(const SparseBatch::Inst &inst,
+  inline void Pred(const RowBatch::Inst &inst,
                    int64_t buffer_index,
                    int bst_group,
                    unsigned root_index,
diff --git a/src/io/simple_dmatrix-inl.hpp b/src/io/simple_dmatrix-inl.hpp
index 583d4ba2a..87aabba4d 100644
--- a/src/io/simple_dmatrix-inl.hpp
+++ b/src/io/simple_dmatrix-inl.hpp
@@ -41,15 +41,15 @@ class DMatrixSimple : public DataMatrix {
     this->info = src.info;
     this->Clear();
     // clone data content in thos matrix
-    utils::IIterator<SparseBatch> *iter = src.fmat.RowIterator();
+    utils::IIterator<RowBatch> *iter = src.fmat.RowIterator();
     iter->BeforeFirst();
     while (iter->Next()) {
-      const SparseBatch &batch = iter->Value();
+      const RowBatch &batch = iter->Value();
       for (size_t i = 0; i < batch.size; ++i) {
-        SparseBatch::Inst inst = batch[i];
+        RowBatch::Inst inst = batch[i];
         row_data_.resize(row_data_.size() + inst.length);
         memcpy(&row_data_[row_ptr_.back()], inst.data,
-               sizeof(SparseBatch::Entry) * inst.length);
+               sizeof(RowBatch::Entry) * inst.length);
         row_ptr_.push_back(row_ptr_.back() + inst.length);
       }
     }
@@ -59,7 +59,7 @@ class DMatrixSimple : public DataMatrix {
    * \param feats features
    * \return the index of added row
    */
-  inline size_t AddRow(const std::vector<SparseBatch::Entry> &feats) {
+  inline size_t AddRow(const std::vector<RowBatch::Entry> &feats) {
     for (size_t i = 0; i < feats.size(); ++i) {
       row_data_.push_back(feats[i]);
       info.info.num_col = std::max(info.info.num_col, static_cast<size_t>(feats[i].index+1));
@@ -78,9 +78,9 @@ class DMatrixSimple : public DataMatrix {
     FILE* file = utils::FopenCheck(fname, "r");
     float label; bool init = true;
     char tmp[1024];
-    std::vector<SparseBatch::Entry> feats;
+    std::vector<RowBatch::Entry> feats;
     while (fscanf(file, "%s", tmp) == 1) {
-      SparseBatch::Entry e;
+      RowBatch::Entry e;
       if (sscanf(tmp, "%u:%f", &e.index, &e.fvalue) == 2) {
         feats.push_back(e);
       } else {
@@ -211,13 +211,13 @@ class DMatrixSimple : public DataMatrix {
   /*! \brief row pointer of CSR sparse storage */
   std::vector<size_t> row_ptr_;
   /*! \brief data in the row */
-  std::vector<SparseBatch::Entry> row_data_;
+  std::vector<RowBatch::Entry> row_data_;
   /*! \brief magic number used to identify DMatrix */
   static const int kMagic = 0xffffab01;

  protected:
   // one batch iterator that return content in the matrix
-  struct OneBatchIter: utils::IIterator<SparseBatch> {
+  struct OneBatchIter: utils::IIterator<RowBatch> {
     explicit OneBatchIter(DMatrixSimple *parent)
         : at_first_(true), parent_(parent) {}
     virtual ~OneBatchIter(void) {}
@@ -233,7 +233,7 @@ class DMatrixSimple : public DataMatrix {
       batch_.data_ptr = &parent_->row_data_[0];
       return true;
     }
-    virtual const SparseBatch &Value(void) const {
+    virtual const RowBatch &Value(void) const {
       return batch_;
     }

@@ -243,7 +243,7 @@ class DMatrixSimple : public DataMatrix {
     // pointer to parient
     DMatrixSimple *parent_;
     // temporal space for batch
-    SparseBatch batch_;
+    RowBatch batch_;
   };
 };
 }  // namespace io
diff --git a/src/tree/model.h b/src/tree/model.h
index 6d4935355..6d885faa7 100644
--- a/src/tree/model.h
+++ b/src/tree/model.h
@@ -491,13 +491,13 @@ class RegTree: public TreeModel<bst_float, RTreeNodeStat>{
       std::fill(data.begin(), data.end(), e);
     }
     /*! \brief fill the vector with sparse vector */
-    inline void Fill(const SparseBatch::Inst &inst) {
+    inline void Fill(const RowBatch::Inst &inst) {
       for (bst_uint i = 0; i < inst.length; ++i) {
         data[inst[i].index].fvalue = inst[i].fvalue;
       }
     }
     /*! \brief drop the trace after fill, must be called after fill */
-    inline void Drop(const SparseBatch::Inst &inst) {
+    inline void Drop(const RowBatch::Inst &inst) {
       for (bst_uint i = 0; i < inst.length; ++i) {
         data[inst[i].index].flag = -1;
       }
diff --git a/src/tree/updater_refresh-inl.hpp b/src/tree/updater_refresh-inl.hpp
index 299f8414a..2327070b5 100644
--- a/src/tree/updater_refresh-inl.hpp
+++ b/src/tree/updater_refresh-inl.hpp
@@ -50,16 +50,16 @@ class TreeRefresher: public IUpdater {
       fvec_temp[tid].Init(trees[0]->param.num_feature);
     }
     // start accumulating statistics
-    utils::IIterator<SparseBatch> *iter = fmat.RowIterator();
+    utils::IIterator<RowBatch> *iter = fmat.RowIterator();
     iter->BeforeFirst();
     while (iter->Next()) {
-      const SparseBatch &batch = iter->Value();
+      const RowBatch &batch = iter->Value();
       utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
                    "too large batch size ");
       const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
       #pragma omp parallel for schedule(static)
       for (bst_omp_uint i = 0; i < nbatch; ++i) {
-        SparseBatch::Inst inst = batch[i];
+        RowBatch::Inst inst = batch[i];
         const int tid = omp_get_thread_num();
         const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
         RegTree::FVec &feats = fvec_temp[tid];
diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp
index d2ae0b846..be19a57a0 100644
--- a/wrapper/xgboost_wrapper.cpp
+++ b/wrapper/xgboost_wrapper.cpp
@@ -89,7 +89,7 @@ extern "C"{
     }
     mat.row_data_.resize(nelem);
     for (bst_ulong i = 0; i < nelem; ++i) {
-      mat.row_data_[i] = SparseBatch::Entry(indices[i], data[i]);
+      mat.row_data_[i] = RowBatch::Entry(indices[i], data[i]);
       mat.info.info.num_col = std::max(mat.info.info.num_col,
                                        static_cast<size_t>(indices[i]+1));
     }
@@ -108,7 +108,7 @@ extern "C"{
       bst_ulong nelem = 0;
       for (bst_ulong j = 0; j < ncol; ++j) {
         if (data[j] != missing) {
-          mat.row_data_.push_back(SparseBatch::Entry(j, data[j]));
+          mat.row_data_.push_back(RowBatch::Entry(j, data[j]));
           ++nelem;
         }
       }
@@ -135,17 +135,17 @@ extern "C"{
     ret.info.info.num_row = len;
     ret.info.info.num_col = src.info.num_col();

-    utils::IIterator<SparseBatch> *iter = src.fmat.RowIterator();
+    utils::IIterator<RowBatch> *iter = src.fmat.RowIterator();
     iter->BeforeFirst();
     utils::Assert(iter->Next(), "slice");
-    const SparseBatch &batch = iter->Value();
+    const RowBatch &batch = iter->Value();
     for (bst_ulong i = 0; i < len; ++i) {
       const int ridx = idxset[i];
-      SparseBatch::Inst inst = batch[ridx];
+      RowBatch::Inst inst = batch[ridx];
       utils::Check(static_cast<size_t>(ridx) < batch.size, "slice index exceed number of rows");
       ret.row_data_.resize(ret.row_data_.size() + inst.length);
       memcpy(&ret.row_data_[ret.row_ptr_.back()], inst.data,
-             sizeof(SparseBatch::Entry) * inst.length);
+             sizeof(RowBatch::Entry) * inst.length);
       ret.row_ptr_.push_back(ret.row_ptr_.back() + inst.length);
       if (src.info.labels.size() != 0) {
         ret.info.labels.push_back(src.info.labels[ridx]);
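
For reviewers, a minimal standalone sketch of the CSR row-batch layout that src/data.h declares above (size, base_rowid, row_ptr, data_ptr over Entry records). RowBatchSketch is a hypothetical stand-in type, not the real RowBatch or any xgboost header; it only mirrors the row loops seen in gblinear-inl.hpp and simple_dmatrix-inl.hpp.

// Standalone illustration only: RowBatchSketch is a hypothetical stand-in
// for the RowBatch declared in src/data.h; it is not xgboost code.
#include <cstddef>
#include <cstdio>

typedef unsigned bst_uint;

struct Entry {            // same fields as SparseBatch::Entry
  bst_uint index;         // feature index
  float fvalue;           // feature value
};

struct RowBatchSketch {   // hypothetical stand-in for RowBatch
  size_t size;            // number of rows in the batch
  size_t base_rowid;      // row id offset of this batch
  const size_t *row_ptr;  // array[size+1], row pointers into data_ptr
  const Entry *data_ptr;  // entries of all rows, stored back to back
};

int main() {
  // Two rows in CSR form: row 0 = {0:1.0, 3:2.5}, row 1 = {1:0.5}.
  const Entry data[] = {{0, 1.0f}, {3, 2.5f}, {1, 0.5f}};
  const size_t ptr[] = {0, 2, 3};
  const RowBatchSketch batch = {2, 0, ptr, data};

  // Walk the batch row by row, as the row iterators in the patch do.
  for (size_t i = 0; i < batch.size; ++i) {
    for (size_t j = batch.row_ptr[i]; j < batch.row_ptr[i + 1]; ++j) {
      std::printf("row %zu -> %u:%g\n", batch.base_rowid + i,
                  batch.data_ptr[j].index, batch.data_ptr[j].fvalue);
    }
  }
  return 0;
}

The real RowBatch wraps the same fields behind operator[], which returns an Inst (entry pointer plus length); that is what the boosters in the patch index as inst[j].index and inst[j].fvalue.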