untested version of cpage

This commit is contained in:
tqchen 2014-09-02 15:34:11 -07:00
parent 4b9aeea89c
commit a89e3063e6
3 changed files with 65 additions and 22 deletions

View File

@ -7,6 +7,7 @@ using namespace std;
#include "../utils/utils.h"
#include "simple_dmatrix-inl.hpp"
#include "page_dmatrix-inl.hpp"
#include "page_fmatrix-inl.hpp"
// implements data loads using dmatrix simple for now
@ -30,6 +31,12 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
// the file pointer is held by the page matrix
return dmat;
}
if (magic == DMatrixColPage::kMagic) {
DMatrixColPage *dmat = new DMatrixColPage(fname);
dmat->Load(fs, silent, fname);
// the file pointer is held by the page matrix
return dmat;
}
fs.Close();
DMatrixSimple *dmat = new DMatrixSimple();
@ -42,6 +49,10 @@ void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) {
DMatrixPage::Save(fname, dmat, silent);
return;
}
// File names ending in ".cpage" are saved through DMatrixColPage::Save.
// The length is checked first: for a name shorter than the 6-character
// suffix, fname + strlen(fname) - 6 would point before the start of the
// string, which is an out-of-bounds read.
if (strlen(fname) >= 6 &&
    !strcmp(fname + strlen(fname) - 6, ".cpage")) {
  DMatrixColPage::Save(fname, dmat, silent);
  return;
}
if (dmat.magic == DMatrixSimple::kMagic) {
const DMatrixSimple *p_dmat = static_cast<const DMatrixSimple*>(&dmat);
p_dmat->SaveBinary(fname, silent);

View File

@ -9,7 +9,6 @@
#include "../utils/iterator.h"
#include "../utils/thread_buffer.h"
#include "./simple_fmatrix-inl.hpp"
#include "./page_fmatrix-inl.hpp"
namespace xgboost {
namespace io {
@ -200,26 +199,24 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
};
/*! \brief data matrix using page */
class DMatrixPage : public DataMatrix {
template<int TKMagic>
class DMatrixPageBase : public DataMatrix {
public:
DMatrixPage(void) : DataMatrix(kMagic) {
DMatrixPageBase(void) : DataMatrix(kMagic) {
iter_ = new ThreadRowPageIterator();
fmat_ = new FMatrixS(iter_);
}
// virtual destructor
virtual ~DMatrixPage(void) {
delete fmat_;
}
virtual IFMatrix *fmat(void) const {
return fmat_;
virtual ~DMatrixPageBase(void) {
// do not delete row iterator, since it is owned by fmat
// to be cleaned up in a more clear way
}
/*! \brief load and initialize the iterator with fi */
inline void Load(utils::FileStream &fi,
bool silent = false,
const char *fname = NULL){
int magic;
utils::Check(fi.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
utils::Check(magic == kMagic, "invalid format,magic number mismatch");
int tmagic;
utils::Check(fi.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
utils::Check(tmagic == magic, "invalid format,magic number mismatch");
this->info.LoadBinary(fi);
iter_->Load(fi);
if (!silent) {
@ -250,12 +247,27 @@ class DMatrixPage : public DataMatrix {
static_cast<unsigned long>(mat.info.num_col()), fname);
}
}
/*! \brief the real fmatrix */
FMatrixS *fmat_;
/*! \brief magic number used to identify DMatrix */
static const int kMagic = TKMagic;
protected:
/*! \brief row iterator */
ThreadRowPageIterator *iter_;
/*! \brief magic number used to identify DMatrix */
static const int kMagic = 0xffffab02;
};
/*!
 * \brief page based data matrix identified by magic number 0xffffab02,
 *  its feature matrix is an FMatrixS built on the row iterator of the base
 */
class DMatrixPage : public DMatrixPageBase<0xffffab02> {
 public:
  /*! \brief constructor, creates the feature matrix from the base row iterator */
  DMatrixPage(void) {
    this->fmat_ = new FMatrixS(this->iter_);
  }
  /*! \brief destructor, releases the feature matrix created in the constructor */
  virtual ~DMatrixPage(void) {
    delete this->fmat_;
  }
  /*! \return the feature matrix held by this data matrix */
  virtual IFMatrix *fmat(void) const {
    return this->fmat_;
  }
  /*! \brief the real fmatrix */
  IFMatrix *fmat_;
};
} // namespace io
} // namespace xgboost

View File

@ -199,7 +199,8 @@ class ThreadColPageIterator : public utils::IIterator<ColBatch> {
class FMatrixPage : public IFMatrix {
public:
/*! \brief constructor */
FMatrixPage(utils::IIterator<RowBatch> *iter, std::string fname_buffer) {
FMatrixPage(utils::IIterator<RowBatch> *iter, std::string fname_buffer)
: fname_cbuffer_(fname_buffer) {
this->row_iter_ = iter;
this->col_iter_ = NULL;
this->fi_ = NULL;
@ -238,7 +239,8 @@ class FMatrixPage : public IFMatrix {
}
virtual void InitColAccess(float pkeep = 1.0f) {
if (this->HaveColAccess()) return;
this->InitColData(pkeep);
this->InitColData(pkeep, fname_cbuffer_.c_str(),
64 << 20, 5);
}
/*!
* \brief get the row iterator associated with FMatrix
@ -281,11 +283,12 @@ class FMatrixPage : public IFMatrix {
* \brief initialize column data
* \param pkeep probability to keep a row
*/
inline void InitColData(float pkeep) {
inline void InitColData(float pkeep, const char *fname,
size_t buffer_size, size_t col_step) {
buffered_rowset_.clear();
utils::FileStream fo(utils::FopenCheck(fname_cbuffer_.c_str(), "wb+"));
utils::FileStream fo(utils::FopenCheck(fname, "wb+"));
// buffer size is supplied by the caller through buffer_size
utils::SparseCSRFileBuilder<ColBatch::Entry> builder(&fo, 64<<20);
utils::SparseCSRFileBuilder<ColBatch::Entry> builder(&fo, buffer_size);
// start working
row_iter_->BeforeFirst();
@ -322,7 +325,7 @@ class FMatrixPage : public IFMatrix {
}
}
builder.Finalize();
builder.SortRows(ColBatch::Entry::CmpValue, 5);
builder.SortRows(ColBatch::Entry::CmpValue, col_step);
fo.Close();
}
@ -339,6 +342,23 @@ class FMatrixPage : public IFMatrix {
std::vector<bst_uint> buffered_rowset_;
};
/*!
 * \brief page based data matrix identified by magic number 0xffffab03,
 *  its feature matrix is an FMatrixPage that is given the file name
 *  fname + ".col" as its column buffer file
 */
class DMatrixColPage : public DMatrixPageBase<0xffffab03> {
 public:
  /*!
   * \brief constructor
   * \param fname name of the data file, ".col" is appended to it to
   *  form the file name handed to the FMatrixPage
   */
  DMatrixColPage(const char *fname) {
    const std::string col_path = std::string(fname) + ".col";
    this->fmat_ = new FMatrixPage(this->iter_, col_path);
  }
  /*! \brief destructor, releases the feature matrix created in the constructor */
  virtual ~DMatrixColPage(void) {
    delete this->fmat_;
  }
  /*! \return the feature matrix held by this data matrix */
  virtual IFMatrix *fmat(void) const {
    return this->fmat_;
  }
  /*! \brief the real fmatrix */
  IFMatrix *fmat_;
};
} // namespace io
} // namespace xgboost
#endif // XGBOOST_IO_PAGE_FMATRIX_INL_HPP_