rename SparseBatch to RowBatch

This commit is contained in:
tqchen@graphlab.com 2014-08-27 10:56:55 -07:00
parent d5a5e0a42a
commit a59f8945dc
7 changed files with 56 additions and 46 deletions

View File

@ -96,8 +96,6 @@ struct SparseBatch {
}; };
/*! \brief batch size */ /*! \brief batch size */
size_t size; size_t size;
/*! \brief the offset of rowid of this batch */
size_t base_rowid;
/*! \brief array[size+1], row pointer of each of the elements */ /*! \brief array[size+1], row pointer of each of the elements */
const size_t *row_ptr; const size_t *row_ptr;
/*! \brief array[row_ptr.back()], content of the sparse element */ /*! \brief array[row_ptr.back()], content of the sparse element */
@ -107,7 +105,19 @@ struct SparseBatch {
return Inst(data_ptr + row_ptr[i], static_cast<bst_uint>(row_ptr[i+1] - row_ptr[i])); return Inst(data_ptr + row_ptr[i], static_cast<bst_uint>(row_ptr[i+1] - row_ptr[i]));
} }
}; };
/*!
 * \brief read-only row batch, used to access rows continuously;
 *  it is a SparseBatch (size, row_ptr, data_ptr and operator[] are inherited)
 *  extended with the row-id offset of the batch
 */
struct RowBatch : public SparseBatch {
/*!
 * \brief the offset of rowid of this batch;
 *  consumers in this change compute the global row index of
 *  local row i as base_rowid + i
 */
size_t base_rowid;
};
/*!
 * \brief read-only column batch, used to access columns,
 *  the columns are not required to be continuous;
 *  derives from RowBatch, so it also carries the inherited
 *  size/row_ptr/data_ptr members and base_rowid
 */
struct ColBatch : public RowBatch {
/*!
 * \brief column index of each column in the data
 *  NOTE(review): presumably one entry per column held in this batch;
 *  length and ownership of the array are not established here - confirm
 *  against the code that fills ColBatch
 */
bst_uint *col_index;
};
/** /**
* \brief This is an interface convention via template, defining the way to access features, * \brief This is an interface convention via template, defining the way to access features,
* column access rule is defined by template, for efficiency purpose, * column access rule is defined by template, for efficiency purpose,
@ -168,7 +178,7 @@ class FMatrixInterface {
*/ */
inline float GetColDensity(size_t cidx) const; inline float GetColDensity(size_t cidx) const;
/*! \brief get the row iterator associated with FMatrix */ /*! \brief get the row iterator associated with FMatrix */
inline utils::IIterator<SparseBatch>* RowIterator(void) const; inline utils::IIterator<RowBatch>* RowIterator(void) const;
}; };
/*! /*!
@ -176,7 +186,7 @@ class FMatrixInterface {
*/ */
class FMatrixS : public FMatrixInterface<FMatrixS>{ class FMatrixS : public FMatrixInterface<FMatrixS>{
public: public:
typedef SparseBatch::Entry Entry; typedef RowBatch::Entry Entry;
/*! \brief row iterator */ /*! \brief row iterator */
struct ColIter{ struct ColIter{
const Entry *dptr_, *end_; const Entry *dptr_, *end_;
@ -261,12 +271,12 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
* \brief get the row iterator associated with FMatrix * \brief get the row iterator associated with FMatrix
* this function is not threadsafe, returns iterator stored in FMatrixS * this function is not threadsafe, returns iterator stored in FMatrixS
*/ */
inline utils::IIterator<SparseBatch>* RowIterator(void) const { inline utils::IIterator<RowBatch>* RowIterator(void) const {
iter_->BeforeFirst(); iter_->BeforeFirst();
return iter_; return iter_;
} }
/*! \brief set iterator */ /*! \brief set iterator */
inline void set_iter(utils::IIterator<SparseBatch> *iter) { inline void set_iter(utils::IIterator<RowBatch> *iter) {
this->iter_ = iter; this->iter_ = iter;
} }
/*! /*!
@ -297,12 +307,12 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
*/ */
inline static void SaveBinary(utils::IStream &fo, inline static void SaveBinary(utils::IStream &fo,
const std::vector<size_t> &ptr, const std::vector<size_t> &ptr,
const std::vector<SparseBatch::Entry> &data) { const std::vector<RowBatch::Entry> &data) {
size_t nrow = ptr.size() - 1; size_t nrow = ptr.size() - 1;
fo.Write(&nrow, sizeof(size_t)); fo.Write(&nrow, sizeof(size_t));
fo.Write(&ptr[0], ptr.size() * sizeof(size_t)); fo.Write(&ptr[0], ptr.size() * sizeof(size_t));
if (data.size() != 0) { if (data.size() != 0) {
fo.Write(&data[0], data.size() * sizeof(SparseBatch::Entry)); fo.Write(&data[0], data.size() * sizeof(RowBatch::Entry));
} }
} }
/*! /*!
@ -313,7 +323,7 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
*/ */
inline static void LoadBinary(utils::IStream &fi, inline static void LoadBinary(utils::IStream &fi,
std::vector<size_t> *out_ptr, std::vector<size_t> *out_ptr,
std::vector<SparseBatch::Entry> *out_data) { std::vector<RowBatch::Entry> *out_data) {
size_t nrow; size_t nrow;
utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format"); utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format");
out_ptr->resize(nrow + 1); out_ptr->resize(nrow + 1);
@ -321,7 +331,7 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
"invalid input file format"); "invalid input file format");
out_data->resize(out_ptr->back()); out_data->resize(out_ptr->back());
if (out_data->size() != 0) { if (out_data->size() != 0) {
utils::Assert(fi.Read(&(*out_data)[0], out_data->size() * sizeof(SparseBatch::Entry)) != 0, utils::Assert(fi.Read(&(*out_data)[0], out_data->size() * sizeof(RowBatch::Entry)) != 0,
"invalid input file format"); "invalid input file format");
} }
} }
@ -334,16 +344,16 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
inline void InitColData(float pkeep) { inline void InitColData(float pkeep) {
buffered_rowset_.clear(); buffered_rowset_.clear();
// note: this part of code is serial, todo, parallelize this transformer // note: this part of code is serial, todo, parallelize this transformer
utils::SparseCSRMBuilder<SparseBatch::Entry> builder(col_ptr_, col_data_); utils::SparseCSRMBuilder<RowBatch::Entry> builder(col_ptr_, col_data_);
builder.InitBudget(0); builder.InitBudget(0);
// start working // start working
iter_->BeforeFirst(); iter_->BeforeFirst();
while (iter_->Next()) { while (iter_->Next()) {
const SparseBatch &batch = iter_->Value(); const RowBatch &batch = iter_->Value();
for (size_t i = 0; i < batch.size; ++i) { for (size_t i = 0; i < batch.size; ++i) {
if (pkeep == 1.0f || random::SampleBinary(pkeep)) { if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i)); buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
SparseBatch::Inst inst = batch[i]; RowBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) { for (bst_uint j = 0; j < inst.length; ++j) {
builder.AddBudget(inst[j].index); builder.AddBudget(inst[j].index);
} }
@ -355,12 +365,12 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
iter_->BeforeFirst(); iter_->BeforeFirst();
size_t ktop = 0; size_t ktop = 0;
while (iter_->Next()) { while (iter_->Next()) {
const SparseBatch &batch = iter_->Value(); const RowBatch &batch = iter_->Value();
for (size_t i = 0; i < batch.size; ++i) { for (size_t i = 0; i < batch.size; ++i) {
if (ktop < buffered_rowset_.size() && if (ktop < buffered_rowset_.size() &&
buffered_rowset_[ktop] == batch.base_rowid+i) { buffered_rowset_[ktop] == batch.base_rowid+i) {
++ktop; ++ktop;
SparseBatch::Inst inst = batch[i]; RowBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) { for (bst_uint j = 0; j < inst.length; ++j) {
builder.PushElem(inst[j].index, builder.PushElem(inst[j].index,
Entry((bst_uint)(batch.base_rowid+i), Entry((bst_uint)(batch.base_rowid+i),
@ -381,13 +391,13 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
private: private:
// --- data structure used to support InitColAccess -- // --- data structure used to support InitColAccess --
utils::IIterator<SparseBatch> *iter_; utils::IIterator<RowBatch> *iter_;
/*! \brief list of row index that are buffered */ /*! \brief list of row index that are buffered */
std::vector<bst_uint> buffered_rowset_; std::vector<bst_uint> buffered_rowset_;
/*! \brief column pointer of CSC format */ /*! \brief column pointer of CSC format */
std::vector<size_t> col_ptr_; std::vector<size_t> col_ptr_;
/*! \brief column datas in CSC format */ /*! \brief column datas in CSC format */
std::vector<SparseBatch::Entry> col_data_; std::vector<RowBatch::Entry> col_data_;
}; };
} // namespace xgboost } // namespace xgboost
#endif // XGBOOST_DATA_H #endif // XGBOOST_DATA_H

View File

@ -106,11 +106,11 @@ class GBLinear : public IGradBooster<FMatrix> {
std::vector<float> &preds = *out_preds; std::vector<float> &preds = *out_preds;
preds.resize(0); preds.resize(0);
// start collecting the prediction // start collecting the prediction
utils::IIterator<SparseBatch> *iter = fmat.RowIterator(); utils::IIterator<RowBatch> *iter = fmat.RowIterator();
iter->BeforeFirst(); iter->BeforeFirst();
const int ngroup = model.param.num_output_group; const int ngroup = model.param.num_output_group;
while (iter->Next()) { while (iter->Next()) {
const SparseBatch &batch = iter->Value(); const RowBatch &batch = iter->Value();
utils::Assert(batch.base_rowid * ngroup == preds.size(), utils::Assert(batch.base_rowid * ngroup == preds.size(),
"base_rowid is not set correctly"); "base_rowid is not set correctly");
// output convention: nrow * k, where nrow is number of rows // output convention: nrow * k, where nrow is number of rows
@ -146,7 +146,7 @@ class GBLinear : public IGradBooster<FMatrix> {
} }
random::Shuffle(feat_index); random::Shuffle(feat_index);
} }
inline void Pred(const SparseBatch::Inst &inst, float *preds) { inline void Pred(const RowBatch::Inst &inst, float *preds) {
for (int gid = 0; gid < model.param.num_output_group; ++gid) { for (int gid = 0; gid < model.param.num_output_group; ++gid) {
float psum = model.bias()[gid]; float psum = model.bias()[gid];
for (bst_uint i = 0; i < inst.length; ++i) { for (bst_uint i = 0; i < inst.length; ++i) {

View File

@ -121,10 +121,10 @@ class GBTree : public IGradBooster<FMatrix> {
const size_t stride = info.num_row * mparam.num_output_group; const size_t stride = info.num_row * mparam.num_output_group;
preds.resize(stride * (mparam.size_leaf_vector+1)); preds.resize(stride * (mparam.size_leaf_vector+1));
// start collecting the prediction // start collecting the prediction
utils::IIterator<SparseBatch> *iter = fmat.RowIterator(); utils::IIterator<RowBatch> *iter = fmat.RowIterator();
iter->BeforeFirst(); iter->BeforeFirst();
while (iter->Next()) { while (iter->Next()) {
const SparseBatch &batch = iter->Value(); const RowBatch &batch = iter->Value();
// parallel over local batch // parallel over local batch
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size); const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
@ -208,7 +208,7 @@ class GBTree : public IGradBooster<FMatrix> {
mparam.num_trees += tparam.num_parallel_tree; mparam.num_trees += tparam.num_parallel_tree;
} }
// make a prediction for a single instance // make a prediction for a single instance
inline void Pred(const SparseBatch::Inst &inst, inline void Pred(const RowBatch::Inst &inst,
int64_t buffer_index, int64_t buffer_index,
int bst_group, int bst_group,
unsigned root_index, unsigned root_index,

View File

@ -41,15 +41,15 @@ class DMatrixSimple : public DataMatrix {
this->info = src.info; this->info = src.info;
this->Clear(); this->Clear();
// clone data content in this matrix // clone data content in this matrix
utils::IIterator<SparseBatch> *iter = src.fmat.RowIterator(); utils::IIterator<RowBatch> *iter = src.fmat.RowIterator();
iter->BeforeFirst(); iter->BeforeFirst();
while (iter->Next()) { while (iter->Next()) {
const SparseBatch &batch = iter->Value(); const RowBatch &batch = iter->Value();
for (size_t i = 0; i < batch.size; ++i) { for (size_t i = 0; i < batch.size; ++i) {
SparseBatch::Inst inst = batch[i]; RowBatch::Inst inst = batch[i];
row_data_.resize(row_data_.size() + inst.length); row_data_.resize(row_data_.size() + inst.length);
memcpy(&row_data_[row_ptr_.back()], inst.data, memcpy(&row_data_[row_ptr_.back()], inst.data,
sizeof(SparseBatch::Entry) * inst.length); sizeof(RowBatch::Entry) * inst.length);
row_ptr_.push_back(row_ptr_.back() + inst.length); row_ptr_.push_back(row_ptr_.back() + inst.length);
} }
} }
@ -59,7 +59,7 @@ class DMatrixSimple : public DataMatrix {
* \param feats features * \param feats features
* \return the index of added row * \return the index of added row
*/ */
inline size_t AddRow(const std::vector<SparseBatch::Entry> &feats) { inline size_t AddRow(const std::vector<RowBatch::Entry> &feats) {
for (size_t i = 0; i < feats.size(); ++i) { for (size_t i = 0; i < feats.size(); ++i) {
row_data_.push_back(feats[i]); row_data_.push_back(feats[i]);
info.info.num_col = std::max(info.info.num_col, static_cast<size_t>(feats[i].index+1)); info.info.num_col = std::max(info.info.num_col, static_cast<size_t>(feats[i].index+1));
@ -78,9 +78,9 @@ class DMatrixSimple : public DataMatrix {
FILE* file = utils::FopenCheck(fname, "r"); FILE* file = utils::FopenCheck(fname, "r");
float label; bool init = true; float label; bool init = true;
char tmp[1024]; char tmp[1024];
std::vector<SparseBatch::Entry> feats; std::vector<RowBatch::Entry> feats;
while (fscanf(file, "%s", tmp) == 1) { while (fscanf(file, "%s", tmp) == 1) {
SparseBatch::Entry e; RowBatch::Entry e;
if (sscanf(tmp, "%u:%f", &e.index, &e.fvalue) == 2) { if (sscanf(tmp, "%u:%f", &e.index, &e.fvalue) == 2) {
feats.push_back(e); feats.push_back(e);
} else { } else {
@ -211,13 +211,13 @@ class DMatrixSimple : public DataMatrix {
/*! \brief row pointer of CSR sparse storage */ /*! \brief row pointer of CSR sparse storage */
std::vector<size_t> row_ptr_; std::vector<size_t> row_ptr_;
/*! \brief data in the row */ /*! \brief data in the row */
std::vector<SparseBatch::Entry> row_data_; std::vector<RowBatch::Entry> row_data_;
/*! \brief magic number used to identify DMatrix */ /*! \brief magic number used to identify DMatrix */
static const int kMagic = 0xffffab01; static const int kMagic = 0xffffab01;
protected: protected:
// one batch iterator that return content in the matrix // one batch iterator that return content in the matrix
struct OneBatchIter: utils::IIterator<SparseBatch> { struct OneBatchIter: utils::IIterator<RowBatch> {
explicit OneBatchIter(DMatrixSimple *parent) explicit OneBatchIter(DMatrixSimple *parent)
: at_first_(true), parent_(parent) {} : at_first_(true), parent_(parent) {}
virtual ~OneBatchIter(void) {} virtual ~OneBatchIter(void) {}
@ -233,7 +233,7 @@ class DMatrixSimple : public DataMatrix {
batch_.data_ptr = &parent_->row_data_[0]; batch_.data_ptr = &parent_->row_data_[0];
return true; return true;
} }
virtual const SparseBatch &Value(void) const { virtual const RowBatch &Value(void) const {
return batch_; return batch_;
} }
@ -243,7 +243,7 @@ class DMatrixSimple : public DataMatrix {
// pointer to parent // pointer to parent
DMatrixSimple *parent_; DMatrixSimple *parent_;
// temporary space for batch // temporary space for batch
SparseBatch batch_; RowBatch batch_;
}; };
}; };
} // namespace io } // namespace io

View File

@ -491,13 +491,13 @@ class RegTree: public TreeModel<bst_float, RTreeNodeStat>{
std::fill(data.begin(), data.end(), e); std::fill(data.begin(), data.end(), e);
} }
/*! \brief fill the vector with sparse vector */ /*! \brief fill the vector with sparse vector */
inline void Fill(const SparseBatch::Inst &inst) { inline void Fill(const RowBatch::Inst &inst) {
for (bst_uint i = 0; i < inst.length; ++i) { for (bst_uint i = 0; i < inst.length; ++i) {
data[inst[i].index].fvalue = inst[i].fvalue; data[inst[i].index].fvalue = inst[i].fvalue;
} }
} }
/*! \brief drop the trace after fill, must be called after fill */ /*! \brief drop the trace after fill, must be called after fill */
inline void Drop(const SparseBatch::Inst &inst) { inline void Drop(const RowBatch::Inst &inst) {
for (bst_uint i = 0; i < inst.length; ++i) { for (bst_uint i = 0; i < inst.length; ++i) {
data[inst[i].index].flag = -1; data[inst[i].index].flag = -1;
} }

View File

@ -50,16 +50,16 @@ class TreeRefresher: public IUpdater<FMatrix> {
fvec_temp[tid].Init(trees[0]->param.num_feature); fvec_temp[tid].Init(trees[0]->param.num_feature);
} }
// start accumulating statistics // start accumulating statistics
utils::IIterator<SparseBatch> *iter = fmat.RowIterator(); utils::IIterator<RowBatch> *iter = fmat.RowIterator();
iter->BeforeFirst(); iter->BeforeFirst();
while (iter->Next()) { while (iter->Next()) {
const SparseBatch &batch = iter->Value(); const RowBatch &batch = iter->Value();
utils::Check(batch.size < std::numeric_limits<unsigned>::max(), utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
"too large batch size "); "too large batch size ");
const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size); const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nbatch; ++i) { for (bst_omp_uint i = 0; i < nbatch; ++i) {
SparseBatch::Inst inst = batch[i]; RowBatch::Inst inst = batch[i];
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i); const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
RegTree::FVec &feats = fvec_temp[tid]; RegTree::FVec &feats = fvec_temp[tid];

View File

@ -89,7 +89,7 @@ extern "C"{
} }
mat.row_data_.resize(nelem); mat.row_data_.resize(nelem);
for (bst_ulong i = 0; i < nelem; ++i) { for (bst_ulong i = 0; i < nelem; ++i) {
mat.row_data_[i] = SparseBatch::Entry(indices[i], data[i]); mat.row_data_[i] = RowBatch::Entry(indices[i], data[i]);
mat.info.info.num_col = std::max(mat.info.info.num_col, mat.info.info.num_col = std::max(mat.info.info.num_col,
static_cast<size_t>(indices[i]+1)); static_cast<size_t>(indices[i]+1));
} }
@ -108,7 +108,7 @@ extern "C"{
bst_ulong nelem = 0; bst_ulong nelem = 0;
for (bst_ulong j = 0; j < ncol; ++j) { for (bst_ulong j = 0; j < ncol; ++j) {
if (data[j] != missing) { if (data[j] != missing) {
mat.row_data_.push_back(SparseBatch::Entry(j, data[j])); mat.row_data_.push_back(RowBatch::Entry(j, data[j]));
++nelem; ++nelem;
} }
} }
@ -135,17 +135,17 @@ extern "C"{
ret.info.info.num_row = len; ret.info.info.num_row = len;
ret.info.info.num_col = src.info.num_col(); ret.info.info.num_col = src.info.num_col();
utils::IIterator<SparseBatch> *iter = src.fmat.RowIterator(); utils::IIterator<RowBatch> *iter = src.fmat.RowIterator();
iter->BeforeFirst(); iter->BeforeFirst();
utils::Assert(iter->Next(), "slice"); utils::Assert(iter->Next(), "slice");
const SparseBatch &batch = iter->Value(); const RowBatch &batch = iter->Value();
for (bst_ulong i = 0; i < len; ++i) { for (bst_ulong i = 0; i < len; ++i) {
const int ridx = idxset[i]; const int ridx = idxset[i];
SparseBatch::Inst inst = batch[ridx]; RowBatch::Inst inst = batch[ridx];
utils::Check(static_cast<bst_ulong>(ridx) < batch.size, "slice index exceed number of rows"); utils::Check(static_cast<bst_ulong>(ridx) < batch.size, "slice index exceed number of rows");
ret.row_data_.resize(ret.row_data_.size() + inst.length); ret.row_data_.resize(ret.row_data_.size() + inst.length);
memcpy(&ret.row_data_[ret.row_ptr_.back()], inst.data, memcpy(&ret.row_data_[ret.row_ptr_.back()], inst.data,
sizeof(SparseBatch::Entry) * inst.length); sizeof(RowBatch::Entry) * inst.length);
ret.row_ptr_.push_back(ret.row_ptr_.back() + inst.length); ret.row_ptr_.push_back(ret.row_ptr_.back() + inst.length);
if (src.info.labels.size() != 0) { if (src.info.labels.size() != 0) {
ret.info.labels.push_back(src.info.labels[ridx]); ret.info.labels.push_back(src.info.labels[ridx]);