rename SparseBatch to RowBatch
This commit is contained in:
parent
d5a5e0a42a
commit
a59f8945dc
46
src/data.h
46
src/data.h
@ -96,8 +96,6 @@ struct SparseBatch {
|
||||
};
|
||||
/*! \brief batch size */
|
||||
size_t size;
|
||||
/*! \brief the offset of rowid of this batch */
|
||||
size_t base_rowid;
|
||||
/*! \brief array[size+1], row pointer of each of the elements */
|
||||
const size_t *row_ptr;
|
||||
/*! \brief array[row_ptr.back()], content of the sparse element */
|
||||
@ -107,7 +105,19 @@ struct SparseBatch {
|
||||
return Inst(data_ptr + row_ptr[i], static_cast<bst_uint>(row_ptr[i+1] - row_ptr[i]));
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief read-only row batch, used to access rows continuously */
|
||||
struct RowBatch : public SparseBatch {
|
||||
/*! \brief the offset of rowid of this batch */
|
||||
size_t base_rowid;
|
||||
};
|
||||
/*!
|
||||
* \brief read-only column batch, used to access columns,
|
||||
* the columns are not required to be continuous
|
||||
*/
|
||||
struct ColBatch : public RowBatch {
|
||||
/*! \brief column index of each column in the data */
|
||||
bst_uint *col_index;
|
||||
};
|
||||
/**
|
||||
* \brief This is an interface convention via template, defining the way to access features,
|
||||
* column access rule is defined by template, for efficiency purpose,
|
||||
@ -168,7 +178,7 @@ class FMatrixInterface {
|
||||
*/
|
||||
inline float GetColDensity(size_t cidx) const;
|
||||
/*! \brief get the row iterator associated with FMatrix */
|
||||
inline utils::IIterator<SparseBatch>* RowIterator(void) const;
|
||||
inline utils::IIterator<RowBatch>* RowIterator(void) const;
|
||||
};
|
||||
|
||||
/*!
|
||||
@ -176,7 +186,7 @@ class FMatrixInterface {
|
||||
*/
|
||||
class FMatrixS : public FMatrixInterface<FMatrixS>{
|
||||
public:
|
||||
typedef SparseBatch::Entry Entry;
|
||||
typedef RowBatch::Entry Entry;
|
||||
/*! \brief row iterator */
|
||||
struct ColIter{
|
||||
const Entry *dptr_, *end_;
|
||||
@ -261,12 +271,12 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
|
||||
* \brief get the row iterator associated with FMatrix
|
||||
* this function is not threadsafe, returns iterator stored in FMatrixS
|
||||
*/
|
||||
inline utils::IIterator<SparseBatch>* RowIterator(void) const {
|
||||
inline utils::IIterator<RowBatch>* RowIterator(void) const {
|
||||
iter_->BeforeFirst();
|
||||
return iter_;
|
||||
}
|
||||
/*! \brief set iterator */
|
||||
inline void set_iter(utils::IIterator<SparseBatch> *iter) {
|
||||
inline void set_iter(utils::IIterator<RowBatch> *iter) {
|
||||
this->iter_ = iter;
|
||||
}
|
||||
/*!
|
||||
@ -297,12 +307,12 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
|
||||
*/
|
||||
inline static void SaveBinary(utils::IStream &fo,
|
||||
const std::vector<size_t> &ptr,
|
||||
const std::vector<SparseBatch::Entry> &data) {
|
||||
const std::vector<RowBatch::Entry> &data) {
|
||||
size_t nrow = ptr.size() - 1;
|
||||
fo.Write(&nrow, sizeof(size_t));
|
||||
fo.Write(&ptr[0], ptr.size() * sizeof(size_t));
|
||||
if (data.size() != 0) {
|
||||
fo.Write(&data[0], data.size() * sizeof(SparseBatch::Entry));
|
||||
fo.Write(&data[0], data.size() * sizeof(RowBatch::Entry));
|
||||
}
|
||||
}
|
||||
/*!
|
||||
@ -313,7 +323,7 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
|
||||
*/
|
||||
inline static void LoadBinary(utils::IStream &fi,
|
||||
std::vector<size_t> *out_ptr,
|
||||
std::vector<SparseBatch::Entry> *out_data) {
|
||||
std::vector<RowBatch::Entry> *out_data) {
|
||||
size_t nrow;
|
||||
utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format");
|
||||
out_ptr->resize(nrow + 1);
|
||||
@ -321,7 +331,7 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
|
||||
"invalid input file format");
|
||||
out_data->resize(out_ptr->back());
|
||||
if (out_data->size() != 0) {
|
||||
utils::Assert(fi.Read(&(*out_data)[0], out_data->size() * sizeof(SparseBatch::Entry)) != 0,
|
||||
utils::Assert(fi.Read(&(*out_data)[0], out_data->size() * sizeof(RowBatch::Entry)) != 0,
|
||||
"invalid input file format");
|
||||
}
|
||||
}
|
||||
@ -334,16 +344,16 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
|
||||
inline void InitColData(float pkeep) {
|
||||
buffered_rowset_.clear();
|
||||
// note: this part of code is serial, todo, parallelize this transformer
|
||||
utils::SparseCSRMBuilder<SparseBatch::Entry> builder(col_ptr_, col_data_);
|
||||
utils::SparseCSRMBuilder<RowBatch::Entry> builder(col_ptr_, col_data_);
|
||||
builder.InitBudget(0);
|
||||
// start working
|
||||
iter_->BeforeFirst();
|
||||
while (iter_->Next()) {
|
||||
const SparseBatch &batch = iter_->Value();
|
||||
const RowBatch &batch = iter_->Value();
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
|
||||
buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
|
||||
SparseBatch::Inst inst = batch[i];
|
||||
RowBatch::Inst inst = batch[i];
|
||||
for (bst_uint j = 0; j < inst.length; ++j) {
|
||||
builder.AddBudget(inst[j].index);
|
||||
}
|
||||
@ -355,12 +365,12 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
|
||||
iter_->BeforeFirst();
|
||||
size_t ktop = 0;
|
||||
while (iter_->Next()) {
|
||||
const SparseBatch &batch = iter_->Value();
|
||||
const RowBatch &batch = iter_->Value();
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
if (ktop < buffered_rowset_.size() &&
|
||||
buffered_rowset_[ktop] == batch.base_rowid+i) {
|
||||
++ktop;
|
||||
SparseBatch::Inst inst = batch[i];
|
||||
RowBatch::Inst inst = batch[i];
|
||||
for (bst_uint j = 0; j < inst.length; ++j) {
|
||||
builder.PushElem(inst[j].index,
|
||||
Entry((bst_uint)(batch.base_rowid+i),
|
||||
@ -381,13 +391,13 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
|
||||
|
||||
private:
|
||||
// --- data structure used to support InitColAccess --
|
||||
utils::IIterator<SparseBatch> *iter_;
|
||||
utils::IIterator<RowBatch> *iter_;
|
||||
/*! \brief list of row index that are buffered */
|
||||
std::vector<bst_uint> buffered_rowset_;
|
||||
/*! \brief column pointer of CSC format */
|
||||
std::vector<size_t> col_ptr_;
|
||||
/*! \brief column data in CSC format */
|
||||
std::vector<SparseBatch::Entry> col_data_;
|
||||
std::vector<RowBatch::Entry> col_data_;
|
||||
};
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_DATA_H
|
||||
|
||||
@ -106,11 +106,11 @@ class GBLinear : public IGradBooster<FMatrix> {
|
||||
std::vector<float> &preds = *out_preds;
|
||||
preds.resize(0);
|
||||
// start collecting the prediction
|
||||
utils::IIterator<SparseBatch> *iter = fmat.RowIterator();
|
||||
utils::IIterator<RowBatch> *iter = fmat.RowIterator();
|
||||
iter->BeforeFirst();
|
||||
const int ngroup = model.param.num_output_group;
|
||||
while (iter->Next()) {
|
||||
const SparseBatch &batch = iter->Value();
|
||||
const RowBatch &batch = iter->Value();
|
||||
utils::Assert(batch.base_rowid * ngroup == preds.size(),
|
||||
"base_rowid is not set correctly");
|
||||
// output convention: nrow * k, where nrow is number of rows
|
||||
@ -146,7 +146,7 @@ class GBLinear : public IGradBooster<FMatrix> {
|
||||
}
|
||||
random::Shuffle(feat_index);
|
||||
}
|
||||
inline void Pred(const SparseBatch::Inst &inst, float *preds) {
|
||||
inline void Pred(const RowBatch::Inst &inst, float *preds) {
|
||||
for (int gid = 0; gid < model.param.num_output_group; ++gid) {
|
||||
float psum = model.bias()[gid];
|
||||
for (bst_uint i = 0; i < inst.length; ++i) {
|
||||
|
||||
@ -121,10 +121,10 @@ class GBTree : public IGradBooster<FMatrix> {
|
||||
const size_t stride = info.num_row * mparam.num_output_group;
|
||||
preds.resize(stride * (mparam.size_leaf_vector+1));
|
||||
// start collecting the prediction
|
||||
utils::IIterator<SparseBatch> *iter = fmat.RowIterator();
|
||||
utils::IIterator<RowBatch> *iter = fmat.RowIterator();
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
const SparseBatch &batch = iter->Value();
|
||||
const RowBatch &batch = iter->Value();
|
||||
// parallel over local batch
|
||||
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
|
||||
#pragma omp parallel for schedule(static)
|
||||
@ -208,7 +208,7 @@ class GBTree : public IGradBooster<FMatrix> {
|
||||
mparam.num_trees += tparam.num_parallel_tree;
|
||||
}
|
||||
// make a prediction for a single instance
|
||||
inline void Pred(const SparseBatch::Inst &inst,
|
||||
inline void Pred(const RowBatch::Inst &inst,
|
||||
int64_t buffer_index,
|
||||
int bst_group,
|
||||
unsigned root_index,
|
||||
|
||||
@ -41,15 +41,15 @@ class DMatrixSimple : public DataMatrix {
|
||||
this->info = src.info;
|
||||
this->Clear();
|
||||
// clone data content in this matrix
|
||||
utils::IIterator<SparseBatch> *iter = src.fmat.RowIterator();
|
||||
utils::IIterator<RowBatch> *iter = src.fmat.RowIterator();
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
const SparseBatch &batch = iter->Value();
|
||||
const RowBatch &batch = iter->Value();
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
SparseBatch::Inst inst = batch[i];
|
||||
RowBatch::Inst inst = batch[i];
|
||||
row_data_.resize(row_data_.size() + inst.length);
|
||||
memcpy(&row_data_[row_ptr_.back()], inst.data,
|
||||
sizeof(SparseBatch::Entry) * inst.length);
|
||||
sizeof(RowBatch::Entry) * inst.length);
|
||||
row_ptr_.push_back(row_ptr_.back() + inst.length);
|
||||
}
|
||||
}
|
||||
@ -59,7 +59,7 @@ class DMatrixSimple : public DataMatrix {
|
||||
* \param feats features
|
||||
* \return the index of added row
|
||||
*/
|
||||
inline size_t AddRow(const std::vector<SparseBatch::Entry> &feats) {
|
||||
inline size_t AddRow(const std::vector<RowBatch::Entry> &feats) {
|
||||
for (size_t i = 0; i < feats.size(); ++i) {
|
||||
row_data_.push_back(feats[i]);
|
||||
info.info.num_col = std::max(info.info.num_col, static_cast<size_t>(feats[i].index+1));
|
||||
@ -78,9 +78,9 @@ class DMatrixSimple : public DataMatrix {
|
||||
FILE* file = utils::FopenCheck(fname, "r");
|
||||
float label; bool init = true;
|
||||
char tmp[1024];
|
||||
std::vector<SparseBatch::Entry> feats;
|
||||
std::vector<RowBatch::Entry> feats;
|
||||
while (fscanf(file, "%s", tmp) == 1) {
|
||||
SparseBatch::Entry e;
|
||||
RowBatch::Entry e;
|
||||
if (sscanf(tmp, "%u:%f", &e.index, &e.fvalue) == 2) {
|
||||
feats.push_back(e);
|
||||
} else {
|
||||
@ -211,13 +211,13 @@ class DMatrixSimple : public DataMatrix {
|
||||
/*! \brief row pointer of CSR sparse storage */
|
||||
std::vector<size_t> row_ptr_;
|
||||
/*! \brief data in the row */
|
||||
std::vector<SparseBatch::Entry> row_data_;
|
||||
std::vector<RowBatch::Entry> row_data_;
|
||||
/*! \brief magic number used to identify DMatrix */
|
||||
static const int kMagic = 0xffffab01;
|
||||
|
||||
protected:
|
||||
// one batch iterator that return content in the matrix
|
||||
struct OneBatchIter: utils::IIterator<SparseBatch> {
|
||||
struct OneBatchIter: utils::IIterator<RowBatch> {
|
||||
explicit OneBatchIter(DMatrixSimple *parent)
|
||||
: at_first_(true), parent_(parent) {}
|
||||
virtual ~OneBatchIter(void) {}
|
||||
@ -233,7 +233,7 @@ class DMatrixSimple : public DataMatrix {
|
||||
batch_.data_ptr = &parent_->row_data_[0];
|
||||
return true;
|
||||
}
|
||||
virtual const SparseBatch &Value(void) const {
|
||||
virtual const RowBatch &Value(void) const {
|
||||
return batch_;
|
||||
}
|
||||
|
||||
@ -243,7 +243,7 @@ class DMatrixSimple : public DataMatrix {
|
||||
// pointer to parent
|
||||
DMatrixSimple *parent_;
|
||||
// temporary space for batch
|
||||
SparseBatch batch_;
|
||||
RowBatch batch_;
|
||||
};
|
||||
};
|
||||
} // namespace io
|
||||
|
||||
@ -491,13 +491,13 @@ class RegTree: public TreeModel<bst_float, RTreeNodeStat>{
|
||||
std::fill(data.begin(), data.end(), e);
|
||||
}
|
||||
/*! \brief fill the vector with sparse vector */
|
||||
inline void Fill(const SparseBatch::Inst &inst) {
|
||||
inline void Fill(const RowBatch::Inst &inst) {
|
||||
for (bst_uint i = 0; i < inst.length; ++i) {
|
||||
data[inst[i].index].fvalue = inst[i].fvalue;
|
||||
}
|
||||
}
|
||||
/*! \brief drop the trace after fill, must be called after fill */
|
||||
inline void Drop(const SparseBatch::Inst &inst) {
|
||||
inline void Drop(const RowBatch::Inst &inst) {
|
||||
for (bst_uint i = 0; i < inst.length; ++i) {
|
||||
data[inst[i].index].flag = -1;
|
||||
}
|
||||
|
||||
@ -50,16 +50,16 @@ class TreeRefresher: public IUpdater<FMatrix> {
|
||||
fvec_temp[tid].Init(trees[0]->param.num_feature);
|
||||
}
|
||||
// start accumulating statistics
|
||||
utils::IIterator<SparseBatch> *iter = fmat.RowIterator();
|
||||
utils::IIterator<RowBatch> *iter = fmat.RowIterator();
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
const SparseBatch &batch = iter->Value();
|
||||
const RowBatch &batch = iter->Value();
|
||||
utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
|
||||
"too large batch size ");
|
||||
const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint i = 0; i < nbatch; ++i) {
|
||||
SparseBatch::Inst inst = batch[i];
|
||||
RowBatch::Inst inst = batch[i];
|
||||
const int tid = omp_get_thread_num();
|
||||
const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
RegTree::FVec &feats = fvec_temp[tid];
|
||||
|
||||
@ -89,7 +89,7 @@ extern "C"{
|
||||
}
|
||||
mat.row_data_.resize(nelem);
|
||||
for (bst_ulong i = 0; i < nelem; ++i) {
|
||||
mat.row_data_[i] = SparseBatch::Entry(indices[i], data[i]);
|
||||
mat.row_data_[i] = RowBatch::Entry(indices[i], data[i]);
|
||||
mat.info.info.num_col = std::max(mat.info.info.num_col,
|
||||
static_cast<size_t>(indices[i]+1));
|
||||
}
|
||||
@ -108,7 +108,7 @@ extern "C"{
|
||||
bst_ulong nelem = 0;
|
||||
for (bst_ulong j = 0; j < ncol; ++j) {
|
||||
if (data[j] != missing) {
|
||||
mat.row_data_.push_back(SparseBatch::Entry(j, data[j]));
|
||||
mat.row_data_.push_back(RowBatch::Entry(j, data[j]));
|
||||
++nelem;
|
||||
}
|
||||
}
|
||||
@ -135,17 +135,17 @@ extern "C"{
|
||||
ret.info.info.num_row = len;
|
||||
ret.info.info.num_col = src.info.num_col();
|
||||
|
||||
utils::IIterator<SparseBatch> *iter = src.fmat.RowIterator();
|
||||
utils::IIterator<RowBatch> *iter = src.fmat.RowIterator();
|
||||
iter->BeforeFirst();
|
||||
utils::Assert(iter->Next(), "slice");
|
||||
const SparseBatch &batch = iter->Value();
|
||||
const RowBatch &batch = iter->Value();
|
||||
for (bst_ulong i = 0; i < len; ++i) {
|
||||
const int ridx = idxset[i];
|
||||
SparseBatch::Inst inst = batch[ridx];
|
||||
RowBatch::Inst inst = batch[ridx];
|
||||
utils::Check(static_cast<bst_ulong>(ridx) < batch.size, "slice index exceed number of rows");
|
||||
ret.row_data_.resize(ret.row_data_.size() + inst.length);
|
||||
memcpy(&ret.row_data_[ret.row_ptr_.back()], inst.data,
|
||||
sizeof(SparseBatch::Entry) * inst.length);
|
||||
sizeof(RowBatch::Entry) * inst.length);
|
||||
ret.row_ptr_.push_back(ret.row_ptr_.back() + inst.length);
|
||||
if (src.info.labels.size() != 0) {
|
||||
ret.info.labels.push_back(src.info.labels[ridx]);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user