current progress

tqchen 2015-04-15 22:28:43 -07:00
parent e8f6f3b541
commit a514340c96
5 changed files with 381 additions and 165 deletions

.gitignore (vendored, 1 line changed)

@@ -57,4 +57,3 @@ R-package.Rproj
 *.cache*
 R-package/inst
 R-package/src

src/io/page_dmatrix-inl.hpp (renamed from src/io/page_row_iter-inl.hpp)

@@ -1,7 +1,7 @@
-#ifndef XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
-#define XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
+#ifndef XGBOOST_IO_PAGE_DMATRIX_INL_HPP_
+#define XGBOOST_IO_PAGE_DMATRIX_INL_HPP_
 /*!
- * \file page_row_iter-inl.hpp
+ * \file page_dmatrix-inl.hpp
  * row iterator based on sparse page
  * \author Tianqi Chen
  */
@@ -10,97 +10,11 @@
 #include "../utils/iterator.h"
 #include "../utils/thread_buffer.h"
 #include "./simple_fmatrix-inl.hpp"
+#include "./sparse_batch_page.h"
+#include "./page_fmatrix-inl.hpp"
 namespace xgboost {
 namespace io {
-/*! \brief page structure that can be used to store a rowbatch */
-struct RowBatchPage {
- public:
-  explicit RowBatchPage(size_t page_size) : kPageSize(page_size) {
-    data_ = new int[kPageSize];
-    utils::Assert(data_ != NULL, "fail to allocate row batch page");
-    this->Clear();
-  }
-  ~RowBatchPage(void) {
-    if (data_ != NULL) delete [] data_;
-  }
-  /*!
-   * \brief Push one row into page
-   * \param row an instance row
-   * \return false or true to push into
-   */
-  inline bool PushRow(const RowBatch::Inst &row) {
-    const size_t dsize = row.length * sizeof(RowBatch::Entry);
-    if (FreeBytes() < dsize+ sizeof(int)) return false;
-    row_ptr(Size() + 1) = row_ptr(Size()) + row.length;
-    memcpy(data_ptr(row_ptr(Size())) , row.data, dsize);
-    ++data_[0];
-    return true;
-  }
-  /*!
-   * \brief get a row batch representation from the page
-   * \param p_rptr a temporal space that can be used to provide
-   *  ind_ptr storage for RowBatch
-   * \return a new RowBatch object
-   */
-  inline RowBatch GetRowBatch(std::vector<size_t> *p_rptr, size_t base_rowid) {
-    RowBatch batch;
-    batch.base_rowid = base_rowid;
-    batch.data_ptr = this->data_ptr(0);
-    batch.size = static_cast<size_t>(this->Size());
-    std::vector<size_t> &rptr = *p_rptr;
-    rptr.resize(this->Size() + 1);
-    for (size_t i = 0; i < rptr.size(); ++i) {
-      rptr[i] = static_cast<size_t>(this->row_ptr(static_cast<int>(i)));
-    }
-    batch.ind_ptr = &rptr[0];
-    return batch;
-  }
-  /*! \brief get i-th row from the batch */
-  inline RowBatch::Inst operator[](int i) {
-    return RowBatch::Inst(data_ptr(0) + row_ptr(i),
-                          static_cast<bst_uint>(row_ptr(i+1) - row_ptr(i)));
-  }
-  /*!
-   * \brief clear the page, cleanup the content
-   */
-  inline void Clear(void) {
-    memset(&data_[0], 0, sizeof(int) * kPageSize);
-  }
-  /*!
-   * \brief load one page form instream
-   * \return true if loading is successful
-   */
-  inline bool Load(utils::IStream &fi) {
-    return fi.Read(&data_[0], sizeof(int) * kPageSize) != 0;
-  }
-  /*! \brief save one page into outstream */
-  inline void Save(utils::IStream &fo) {
-    fo.Write(&data_[0], sizeof(int) * kPageSize);
-  }
-  /*! \return number of elements */
-  inline int Size(void) const {
-    return data_[0];
-  }
- protected:
-  /*! \return number of elements */
-  inline size_t FreeBytes(void) {
-    return (kPageSize - (Size() + 2)) * sizeof(int) -
-        row_ptr(Size()) * sizeof(RowBatch::Entry);
-  }
-  /*! \brief equivalent row pointer at i */
-  inline int& row_ptr(int i) {
-    return data_[kPageSize - i - 1];
-  }
-  inline RowBatch::Entry* data_ptr(int i) {
-    return (RowBatch::Entry*)(&data_[1]) + i;
-  }
-  // content of data
-  int *data_;
-  // page size
-  const size_t kPageSize;
-};
 /*! \brief thread buffer iterator */
 class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
  public:
@@ -118,7 +32,10 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
   }
   virtual bool Next(void) {
     if (!itr.Next(page_)) return false;
-    out_ = page_->GetRowBatch(&tmp_ptr_, base_rowid_);
+    out_.base_rowid = base_rowid_;
+    out_.ind_ptr = BeginPtr(page_->offset);
+    out_.data_ptr = BeginPtr(page_->data);
+    out_.size = page_->offset.size() - 1;
     base_rowid_ += out_.size;
     return true;
   }
@@ -127,76 +44,18 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
   }
   /*! \brief load and initialize the iterator with fi */
   inline void Load(const utils::FileStream &fi) {
-    itr.get_factory().SetFile(fi);
+    itr.get_factory().SetFile(fi, 0);
     itr.Init();
     this->BeforeFirst();
   }
-  /*!
-   * \brief save a row iterator to output stream, in row iterator format
-   */
-  inline static void Save(utils::IIterator<RowBatch> *iter,
-                          utils::IStream &fo) {
-    RowBatchPage page(kPageSize);
-    iter->BeforeFirst();
-    while (iter->Next()) {
-      const RowBatch &batch = iter->Value();
-      for (size_t i = 0; i < batch.size; ++i) {
-        if (!page.PushRow(batch[i])) {
-          page.Save(fo);
-          page.Clear();
-          utils::Check(page.PushRow(batch[i]), "row is too big");
-        }
-      }
-    }
-    if (page.Size() != 0) page.Save(fo);
-  }
-  /*! \brief page size 64 MB */
-  static const size_t kPageSize = 64 << 18;
  private:
   // base row id
   size_t base_rowid_;
-  // temporal ptr
-  std::vector<size_t> tmp_ptr_;
   // output data
   RowBatch out_;
-  // page pointer type
-  typedef RowBatchPage* PagePtr;
-  // loader factory for page
-  struct Factory {
-   public:
-    size_t file_begin_;
-    utils::FileStream fi;
-    Factory(void) {}
-    inline void SetFile(const utils::FileStream &fi) {
-      this->fi = fi;
-      file_begin_ = this->fi.Tell();
-    }
-    inline bool Init(void) {
-      return true;
-    }
-    inline void SetParam(const char *name, const char *val) {}
-    inline bool LoadNext(PagePtr &val) {
-      return val->Load(fi);
-    }
-    inline PagePtr Create(void) {
-      PagePtr a = new RowBatchPage(kPageSize);
-      return a;
-    }
-    inline void FreeSpace(PagePtr &a) {
-      delete a;
-    }
-    inline void Destroy(void) {
-      fi.Close();
-    }
-    inline void BeforeFirst(void) {
-      fi.Seek(file_begin_);
-    }
-  };
- protected:
-  PagePtr page_;
-  utils::ThreadBuffer<PagePtr, Factory> itr;
+  SparsePage *page_;
+  utils::ThreadBuffer<SparsePage*, SparsePageFactory> itr;
 };
 /*! \brief data matrix using page */
@@ -247,8 +106,20 @@ class DMatrixPageBase : public DataMatrix {
     mat.info.SaveBinary(fs);
     fs.Close();
     fname += ".row.blob";
+    utils::IIterator<RowBatch> *iter = mat.fmat()->RowIterator();
     utils::FileStream fbin(utils::FopenCheck(fname.c_str(), "wb"));
-    ThreadRowPageIterator::Save(mat.fmat()->RowIterator(), fbin);
+    SparsePage page;
+    iter->BeforeFirst();
+    while (iter->Next()) {
+      const RowBatch &batch = iter->Value();
+      for (size_t i = 0; i < batch.size; ++i) {
+        page.Push(batch[i]);
+        if (page.MemCostBytes() >= kPageSize) {
+          page.Save(&fbin); page.Clear();
+        }
+      }
+    }
+    if (page.data.size() != 0) page.Save(&fbin);
     fbin.Close();
     if (!silent) {
       utils::Printf("DMatrixPage: %lux%lu is saved to %s\n",
@@ -268,7 +139,7 @@ class DMatrixPageBase : public DataMatrix {
     }
     std::string fname_row = std::string(cache_file) + ".row.blob";
     utils::FileStream fo(utils::FopenCheck(fname_row.c_str(), "wb"));
-    RowBatchPage page(ThreadRowPageIterator::kPageSize);
+    SparsePage page;
     dmlc::InputSplit *in =
         dmlc::InputSplit::Create(uri, rank, npart);
     std::string line;
@@ -286,10 +157,9 @@ class DMatrixPageBase : public DataMatrix {
         feats.push_back(e);
       }
       RowBatch::Inst row(BeginPtr(feats), feats.size());
-      if (!page.PushRow(row)) {
-        page.Save(fo);
-        page.Clear();
-        utils::Check(page.PushRow(row), "row is too big");
+      page.Push(row);
+      if (page.MemCostBytes() >= kPageSize) {
+        page.Save(&fo); page.Clear();
       }
       for (size_t i = 0; i < feats.size(); ++i) {
         info.info.num_col = std::max(info.info.num_col,
@@ -298,8 +168,8 @@ class DMatrixPageBase : public DataMatrix {
       this->info.labels.push_back(label);
      info.info.num_row += 1;
     }
-    if (page.Size() != 0) {
-      page.Save(fo);
+    if (page.data.size() != 0) {
+      page.Save(&fo);
     }
     delete in;
     fo.Close();
@@ -319,7 +189,8 @@ class DMatrixPageBase : public DataMatrix {
   }
   /*! \brief magic number used to identify DMatrix */
   static const int kMagic = TKMagic;
+  /*! \brief page size, 16 MB (64 << 18 bytes) */
+  static const size_t kPageSize = 64 << 18;
  protected:
   /*! \brief row iterator */
   ThreadRowPageIterator *iter_;
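The hunks above replace the fixed-size RowBatchPage (which could reject a row when full) with a growable SparsePage that is flushed to disk whenever MemCostBytes() crosses kPageSize. A self-contained sketch of that flush-on-threshold pattern, with simplified stand-in types (Entry, MiniPage, WritePaged and the binary layout here are illustrative, not xgboost's):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

struct Entry { uint32_t index; float fvalue; };   // stand-in for SparseBatch::Entry

struct MiniPage {                                  // mirrors SparsePage's offset/data layout
  std::vector<size_t> offset{0};                   // offset[i]..offset[i+1] delimits row i
  std::vector<Entry> data;                         // concatenated entries of all rows
  void Push(const std::vector<Entry> &row) {
    data.insert(data.end(), row.begin(), row.end());
    offset.push_back(offset.back() + row.size());
  }
  size_t MemCostBytes() const {
    return offset.size() * sizeof(size_t) + data.size() * sizeof(Entry);
  }
  void Save(std::FILE *fo) const {                 // simplified dump: array size + raw bytes
    size_t n = offset.size();
    std::fwrite(&n, sizeof(n), 1, fo);
    std::fwrite(offset.data(), sizeof(size_t), n, fo);
    if (!data.empty()) std::fwrite(data.data(), sizeof(Entry), data.size(), fo);
  }
  void Clear() { offset.assign(1, 0); data.clear(); }
};

// Append rows and flush a page whenever it grows past page_size bytes;
// the trailing partial page is flushed at the end, as in SaveBinary above.
inline void WritePaged(const std::vector<std::vector<Entry> > &rows,
                       std::FILE *fo, size_t page_size = 64 << 18) {
  MiniPage page;
  for (size_t r = 0; r < rows.size(); ++r) {
    page.Push(rows[r]);
    if (page.MemCostBytes() >= page_size) {
      page.Save(fo);
      page.Clear();
    }
  }
  if (!page.data.empty()) page.Save(fo);
}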

src/io/page_fmatrix-inl.hpp (new file, 130 lines)

@@ -0,0 +1,130 @@
#ifndef XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
#define XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
/*!
* \file page_fmatrix-inl.hpp
* col iterator based on sparse page
* \author Tianqi Chen
*/
namespace xgboost {
namespace io {
/*! \brief thread buffer iterator */
class ThreadColPageIterator: public utils::IIterator<ColBatch> {
public:
ThreadColPageIterator(void) {
itr.SetParam("buffer_size", "2");
page_ = NULL;
}
virtual ~ThreadColPageIterator(void) {}
virtual void Init(void) {}
virtual void BeforeFirst(void) {
itr.BeforeFirst();
}
virtual bool Next(void) {
if (!itr.Next(page_)) return false;
out_.col_index = BeginPtr(itr.get_factory().index_set());
col_data_.resize(page_->offset.size() - 1, SparseBatch::Inst(NULL, 0));
for (size_t i = 0; i < col_data_.size(); ++i) {
col_data_[i] = SparseBatch::Inst
(BeginPtr(page_->data) + page_->offset[i],
page_->offset[i + 1] - page_->offset[i]);
}
out_.col_data = BeginPtr(col_data_);
out_.size = col_data_.size();
return true;
}
virtual const ColBatch &Value(void) const {
return out_;
}
/*! \brief load and initialize the iterator with fi */
inline void SetFile(const utils::FileStream &fi) {
itr.get_factory().SetFile(fi, 0);
itr.Init();
}
// set index set
inline void SetIndexSet(const std::vector<bst_uint> &fset) {
itr.get_factory().SetIndexSet(fset);
}
private:
// output data
ColBatch out_;
SparsePage *page_;
std::vector<SparseBatch::Inst> col_data_;
utils::ThreadBuffer<SparsePage*, SparsePageFactory> itr;
};
/*!
* \brief sparse matrix that support column access, CSC
*/
class FMatrixS : public IFMatrix {
public:
typedef SparseBatch::Entry Entry;
/*! \brief constructor */
FMatrixS(utils::IIterator<RowBatch> *iter) {
this->iter_ = iter;
}
// destructor
virtual ~FMatrixS(void) {
if (iter_ != NULL) delete iter_;
}
/*! \return whether column access is enabled */
virtual bool HaveColAccess(void) const {
return col_ptr_.size() != 0;
}
/*! \brief get number of columns */
virtual size_t NumCol(void) const {
utils::Check(this->HaveColAccess(), "NumCol:need column access");
return col_ptr_.size() - 1;
}
/*! \brief get number of buffered rows */
virtual const std::vector<bst_uint> &buffered_rowset(void) const {
return buffered_rowset_;
}
/*! \brief get column size */
virtual size_t GetColSize(size_t cidx) const {
return col_ptr_[cidx+1] - col_ptr_[cidx];
}
/*! \brief get column density */
virtual float GetColDensity(size_t cidx) const {
size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]);
return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
}
virtual void InitColAccess(const std::vector<bool> &enabled,
float pkeep = 1.0f) {
if (this->HaveColAccess()) return;
this->InitColData(pkeep, enabled);
}
/*!
* \brief get the row iterator associated with FMatrix
*/
virtual utils::IIterator<RowBatch>* RowIterator(void) {
iter_->BeforeFirst();
return iter_;
}
/*!
* \brief get the column based iterator
*/
virtual utils::IIterator<ColBatch>* ColIterator(void) {
size_t ncol = this->NumCol();
col_iter_.col_index_.resize(ncol);
for (size_t i = 0; i < ncol; ++i) {
col_iter_.col_index_[i] = static_cast<bst_uint>(i);
}
col_iter_.SetBatch(col_ptr_, col_data_);
return &col_iter_;
}
/*!
* \brief column based iterator
*/
virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
size_t ncol = this->NumCol();
col_iter_.col_index_.resize(0);
for (size_t i = 0; i < fset.size(); ++i) {
if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]);
}
col_iter_.SetBatch(col_ptr_, col_data_);
return &col_iter_;
}
};
} // namespace io
} // namespace xgboost
#endif // XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
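For orientation, the ColBatch handed out by ThreadColPageIterator::Next is just a set of non-owning views into the loaded page: column i spans data[offset[i], offset[i + 1]). A minimal sketch of that slicing step, using stand-in Entry/Inst types rather than the xgboost definitions:

#include <cstddef>
#include <cstdint>
#include <vector>

struct Entry { uint32_t index; float fvalue; };       // stand-in for SparseBatch::Entry
struct Inst { const Entry *data; uint32_t length; };  // stand-in for SparseBatch::Inst

// Build one (pointer, length) view per column from a page's CSC arrays.
// Assumes the page invariant offset.size() >= 1 and offset[0] == 0.
inline std::vector<Inst> MakeColumnViews(const std::vector<size_t> &offset,
                                         const std::vector<Entry> &data) {
  std::vector<Inst> cols(offset.size() - 1);
  for (size_t i = 0; i + 1 < offset.size(); ++i) {
    cols[i].data = data.data() + offset[i];                             // column start
    cols[i].length = static_cast<uint32_t>(offset[i + 1] - offset[i]);  // entries in column i
  }
  return cols;  // views alias the page buffer; they stay valid only while the page does
}

That aliasing is why the iterator keeps page_ around between Next and Value instead of copying the entries out.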

src/io/simple_fmatrix-inl.hpp

@@ -16,7 +16,7 @@ namespace io {
 /*!
  * \brief sparse matrix that support column access, CSC
  */
-class FMatrixS : public IFMatrix{
+class FMatrixS : public IFMatrix {
  public:
   typedef SparseBatch::Entry Entry;
   /*! \brief constructor */
@@ -238,7 +238,7 @@ class FMatrixS : public IFMatrix{
     inline void SetBatch(const std::vector<size_t> &ptr,
                          const std::vector<ColBatch::Entry> &data) {
       batch_.size = col_index_.size();
-      col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL,0));
+      col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL, 0));
       for (size_t i = 0; i < col_data_.size(); ++i) {
         const bst_uint ridx = col_index_[i];
         col_data_[i] = SparseBatch::Inst(&data[0] + ptr[ridx],

src/io/sparse_batch_page.h (new file, 216 lines)

@@ -0,0 +1,216 @@
#ifndef XGBOOST_IO_SPARSE_BATCH_PAGE_H_
#define XGBOOST_IO_SPARSE_BATCH_PAGE_H_
/*!
* \file sparse_batch_page.h
* content holder of sparse batch that can be saved to disk
* the representation can be used effectively
* in external memory computation
* \author Tianqi Chen
*/
#include <vector>
#include <algorithm>
#include <cstring>
#include "../data.h"
namespace xgboost {
namespace io {
/*!
* \brief storage unit of sparse batch
*/
class SparsePage {
public:
/*! \brief offset of the segments */
std::vector<size_t> offset;
/*! \brief the data of the segments */
std::vector<SparseBatch::Entry> data;
/*! \brief constructor */
SparsePage() {
this->Clear();
}
/*!
* \brief load the page by providing a list of interested segments
* only the interested segments are loaded
* \param fi the input stream of the file
* \param sorted_index_set sorted index of segments we are interested in
* \return true if the loading was successful, false if end of file was reached
*/
inline bool Load(utils::ISeekStream *fi,
const std::vector<bst_uint> &sorted_index_set) {
if (!fi->Read(&disk_offset_)) return false;
// setup the offset
offset.clear(); offset.push_back(0);
for (size_t i = 0; i < sorted_index_set.size(); ++i) {
bst_uint fid = sorted_index_set[i];
size_t size = disk_offset_[fid + 1] - disk_offset_[fid];
offset.push_back(offset.back() + size);
}
data.resize(offset.back());
// read in the data
size_t begin = fi->Tell();
size_t curr_offset = 0;
for (size_t i = 0; i < sorted_index_set.size();) {
bst_uint fid = sorted_index_set[i];
if (disk_offset_[fid] != curr_offset) {
utils::Assert(disk_offset_[fid] > curr_offset, "fset index was not sorted");
fi->Seek(begin + disk_offset_[fid]);
curr_offset = disk_offset_[fid];
}
size_t j, size_to_read = 0;
for (j = i; j < sorted_index_set.size(); ++j) {
if (disk_offset_[sorted_index_set[j]] == disk_offset_[fid] + size_to_read) {
size_to_read += offset[j + 1] - offset[j];
} else {
break;
}
}
if (size_to_read != 0) {
utils::Check(fi->Read(BeginPtr(data) + offset[i],
size_to_read * sizeof(SparseBatch::Entry)) != 0,
"Invalid SparsePage file");
curr_offset += size_to_read;
}
i = j;
}
return true;
}
/*!
* \brief load all the segments
* \param fi the input stream of the file
* \return true if the loading was successful, false if end of file was reached
*/
inline bool Load(utils::IStream *fi) {
if (!fi->Read(&offset)) return false;
utils::Check(offset.size() != 0, "Invalid SparsePage file");
data.resize(offset.back());
if (data.size() != 0) {
utils::Check(fi->Read(BeginPtr(data), data.size() * sizeof(SparseBatch::Entry)) != 0,
"Invalid SparsePage file");
}
return true;
}
/*!
* \brief save the data to fo; when a page is written
* to disk it must contain all the elements in the page
* \param fo output stream
*/
inline void Save(utils::IStream *fo) const {
utils::Assert(offset.size() != 0 && offset[0] == 0, "bad offset");
utils::Assert(offset.back() == data.size(), "inconsistent SparsePage");
fo->Write(offset);
if (data.size() != 0) {
fo->Write(BeginPtr(data), data.size() * sizeof(SparseBatch::Entry));
}
}
/*! \return estimation of memory cost of this page */
inline size_t MemCostBytes(void) const {
return offset.size() * sizeof(size_t) + data.size() * sizeof(SparseBatch::Entry);
}
/*! \brief clear the page */
inline void Clear(void) {
offset.clear();
offset.push_back(0);
data.clear();
}
/*!
* \brief load all the segments and add it to existing batch
* \param fi the input stream of the file
* \return true if the loading was successful, false if end of file was reached
*/
inline bool PushLoad(utils::IStream *fi) {
if (!fi->Read(&disk_offset_)) return false;
data.resize(offset.back() + disk_offset_.back());
if (disk_offset_.back() != 0) {
utils::Check(fi->Read(BeginPtr(data) + offset.back(),
disk_offset_.back() * sizeof(SparseBatch::Entry)) != 0,
"Invalid SparsePage file");
}
size_t top = offset.back();
size_t begin = offset.size();
// disk_offset_[0] is always 0; append one offset per segment loaded from disk
offset.resize(offset.size() + disk_offset_.size() - 1);
for (size_t i = 0; i + 1 < disk_offset_.size(); ++i) {
offset[i + begin] = top + disk_offset_[i + 1];
}
return true;
}
/*!
* \brief Push row batch into the page
* \param batch the row batch
*/
inline void Push(const RowBatch &batch) {
// number of entries in this batch (ind_ptr is an offset array over entries)
const size_t ndata = batch.ind_ptr[batch.size] - batch.ind_ptr[0];
data.resize(offset.back() + ndata);
std::memcpy(BeginPtr(data) + offset.back(),
batch.data_ptr + batch.ind_ptr[0],
sizeof(SparseBatch::Entry) * ndata);
size_t top = offset.back();
size_t begin = offset.size();
offset.resize(offset.size() + batch.size);
for (size_t i = 0; i < batch.size; ++i) {
// record the end position of row i, relative to the start of this batch
offset[i + begin] = top + batch.ind_ptr[i + 1] - batch.ind_ptr[0];
}
}
/*!
* \brief Push one instance into page
* \param row an instance row
*/
inline void Push(const SparseBatch::Inst &inst) {
offset.push_back(offset.back() + inst.length);
size_t begin = data.size();
data.resize(begin + inst.length);
std::memcpy(BeginPtr(data) + begin, inst.data,
sizeof(SparseBatch::Entry) * inst.length);
}
private:
/*! \brief external memory column offset */
std::vector<size_t> disk_offset_;
};
/*!
* \brief factory class for SparsePage,
* used in threadbuffer template
*/
class SparsePageFactory {
public:
SparsePageFactory(void) {}
inline void SetFile(const utils::FileStream &fi,
size_t file_begin = 0) {
fi_ = fi;
file_begin_ = file_begin;
}
inline const std::vector<bst_uint> &index_set(void) const {
return action_index_set_;
}
// set the index set; it takes effect after the next call to BeforeFirst
inline void SetIndexSet(const std::vector<bst_uint> &index_set) {
set_index_set_ = index_set;
std::sort(set_index_set_.begin(), set_index_set_.end());
}
inline bool Init(void) {
return true;
}
inline void SetParam(const char *name, const char *val) {}
inline bool LoadNext(SparsePage *val) {
if (action_index_set_.size() != 0) {
return val->Load(&fi_, action_index_set_);
} else {
return val->Load(&fi_);
}
}
inline SparsePage *Create(void) {
return new SparsePage();
}
inline void FreeSpace(SparsePage *a) {
delete a;
}
inline void Destroy(void) {
fi_.Close();
}
inline void BeforeFirst(void) {
fi_.Seek(file_begin_);
action_index_set_ = set_index_set_;
}
private:
size_t file_begin_;
utils::FileStream fi_;
std::vector<bst_uint> action_index_set_;
std::vector<bst_uint> set_index_set_;
};
} // namespace io
} // namespace xgboost
#endif // XGBOOST_IO_SPARSE_BATCH_PAGE_H_
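SparsePageFactory is the producer side that gets plugged into utils::ThreadBuffer, which prefetches pages on a background thread. A rough, single-threaded sketch of the factory contract as it can be inferred from the two factories in this commit; SerialBuffer is an illustrative stand-in, not the real ThreadBuffer, which additionally double-buffers and synchronizes:

template <typename Elem, typename Factory>
class SerialBuffer {                 // stand-in for utils::ThreadBuffer<Elem, Factory>
 public:
  bool Init() {
    if (!factory_.Init()) return false;
    elem_ = factory_.Create();       // the buffer owns the element it loads into
    return true;
  }
  Factory &get_factory() { return factory_; }
  void BeforeFirst() { factory_.BeforeFirst(); }
  // Load the next page into the owned element; false means end of data.
  bool Next(Elem &out) {
    if (!factory_.LoadNext(elem_)) return false;
    out = elem_;
    return true;
  }
  ~SerialBuffer() {
    factory_.FreeSpace(elem_);       // release the Create()'d element
    factory_.Destroy();              // close the underlying file
  }
 private:
  Factory factory_;
  Elem elem_ = Elem();
};

Under this contract, ThreadRowPageIterator and ThreadColPageIterator differ mainly in which Load overload the factory ends up calling: once an index set is registered via SetIndexSet, LoadNext reads only the selected column segments of each page.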