From a89e3063e6cc327f1552cdea154864fa510f8040 Mon Sep 17 00:00:00 2001 From: tqchen Date: Tue, 2 Sep 2014 15:34:11 -0700 Subject: [PATCH] untested version of cpage --- src/io/io.cpp | 11 ++++++++++ src/io/page_dmatrix-inl.hpp | 44 +++++++++++++++++++++++-------------- src/io/page_fmatrix-inl.hpp | 32 ++++++++++++++++++++++----- 3 files changed, 65 insertions(+), 22 deletions(-) diff --git a/src/io/io.cpp b/src/io/io.cpp index c2d9e26d3..faed31f13 100644 --- a/src/io/io.cpp +++ b/src/io/io.cpp @@ -7,6 +7,7 @@ using namespace std; #include "../utils/utils.h" #include "simple_dmatrix-inl.hpp" #include "page_dmatrix-inl.hpp" +#include "page_fmatrix-inl.hpp" // implements data loads using dmatrix simple for now @@ -30,6 +31,12 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) { // the file pointer is hold in page matrix return dmat; } + if (magic == DMatrixColPage::kMagic) { + DMatrixColPage *dmat = new DMatrixColPage(fname); + dmat->Load(fs, silent, fname); + // the file pointer is hold in page matrix + return dmat; + } fs.Close(); DMatrixSimple *dmat = new DMatrixSimple(); @@ -42,6 +49,10 @@ void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) { DMatrixPage::Save(fname, dmat, silent); return; } + if (!strcmp(fname + strlen(fname) - 6, ".cpage")) { + DMatrixColPage::Save(fname, dmat, silent); + return; + } if (dmat.magic == DMatrixSimple::kMagic) { const DMatrixSimple *p_dmat = static_cast(&dmat); p_dmat->SaveBinary(fname, silent); diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp index 83c745599..63010d882 100644 --- a/src/io/page_dmatrix-inl.hpp +++ b/src/io/page_dmatrix-inl.hpp @@ -9,7 +9,6 @@ #include "../utils/iterator.h" #include "../utils/thread_buffer.h" #include "./simple_fmatrix-inl.hpp" -#include "./page_fmatrix-inl.hpp" namespace xgboost { namespace io { @@ -200,26 +199,24 @@ class ThreadRowPageIterator: public utils::IIterator { }; /*! \brief data matrix using page */ -class DMatrixPage : public DataMatrix { +template +class DMatrixPageBase : public DataMatrix { public: - DMatrixPage(void) : DataMatrix(kMagic) { + DMatrixPageBase(void) : DataMatrix(kMagic) { iter_ = new ThreadRowPageIterator(); - fmat_ = new FMatrixS(iter_); } // virtual destructor - virtual ~DMatrixPage(void) { - delete fmat_; - } - virtual IFMatrix *fmat(void) const { - return fmat_; + virtual ~DMatrixPageBase(void) { + // do not delete row iterator, since it is owned by fmat + // to be cleaned up in a more clear way } /*! \brief load and initialize the iterator with fi */ inline void Load(utils::FileStream &fi, bool silent = false, const char *fname = NULL){ - int magic; - utils::Check(fi.Read(&magic, sizeof(magic)) != 0, "invalid input file format"); - utils::Check(magic == kMagic, "invalid format,magic number mismatch"); + int tmagic; + utils::Check(fi.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format"); + utils::Check(tmagic == magic, "invalid format,magic number mismatch"); this->info.LoadBinary(fi); iter_->Load(fi); if (!silent) { @@ -250,12 +247,27 @@ class DMatrixPage : public DataMatrix { static_cast(mat.info.num_col()), fname); } } - /*! \brief the real fmatrix */ - FMatrixS *fmat_; + /*! \brief magic number used to identify DMatrix */ + static const int kMagic = TKMagic; + protected: + /*! \brief row iterator */ ThreadRowPageIterator *iter_; - /*! \brief magic number used to identify DMatrix */ - static const int kMagic = 0xffffab02; +}; + +class DMatrixPage : public DMatrixPageBase<0xffffab02> { + public: + DMatrixPage(void) { + fmat_ = new FMatrixS(iter_); + } + virtual ~DMatrixPage(void) { + delete fmat_; + } + virtual IFMatrix *fmat(void) const { + return fmat_; + } + /*! \brief the real fmatrix */ + IFMatrix *fmat_; }; } // namespace io } // namespace xgboost diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp index 7e9903be4..4189c0c85 100644 --- a/src/io/page_fmatrix-inl.hpp +++ b/src/io/page_fmatrix-inl.hpp @@ -199,7 +199,8 @@ class ThreadColPageIterator : public utils::IIterator { class FMatrixPage : public IFMatrix { public: /*! \brief constructor */ - FMatrixPage(utils::IIterator *iter, std::string fname_buffer) { + FMatrixPage(utils::IIterator *iter, std::string fname_buffer) + : fname_cbuffer_(fname_buffer) { this->row_iter_ = iter; this->col_iter_ = NULL; this->fi_ = NULL; @@ -238,7 +239,8 @@ class FMatrixPage : public IFMatrix { } virtual void InitColAccess(float pkeep = 1.0f) { if (this->HaveColAccess()) return; - this->InitColData(pkeep); + this->InitColData(pkeep, fname_cbuffer_.c_str(), + 64 << 20, 5); } /*! * \brief get the row iterator associated with FMatrix @@ -281,11 +283,12 @@ class FMatrixPage : public IFMatrix { * \brief intialize column data * \param pkeep probability to keep a row */ - inline void InitColData(float pkeep) { + inline void InitColData(float pkeep, const char *fname, + size_t buffer_size, size_t col_step) { buffered_rowset_.clear(); - utils::FileStream fo(utils::FopenCheck(fname_cbuffer_.c_str(), "wb+")); + utils::FileStream fo(utils::FopenCheck(fname, "wb+")); // use 64M buffer - utils::SparseCSRFileBuilder builder(&fo, 64<<20); + utils::SparseCSRFileBuilder builder(&fo, buffer_size); // start working row_iter_->BeforeFirst(); @@ -322,7 +325,7 @@ class FMatrixPage : public IFMatrix { } } builder.Finalize(); - builder.SortRows(ColBatch::Entry::CmpValue, 5); + builder.SortRows(ColBatch::Entry::CmpValue, col_step); fo.Close(); } @@ -339,6 +342,23 @@ class FMatrixPage : public IFMatrix { std::vector buffered_rowset_; }; +class DMatrixColPage : public DMatrixPageBase<0xffffab03> { + public: + DMatrixColPage(const char *fname) { + std::string fext = fname; + fext += ".col"; + fmat_ = new FMatrixPage(iter_, fext.c_str()); + } + virtual ~DMatrixColPage(void) { + delete fmat_; + } + virtual IFMatrix *fmat(void) const { + return fmat_; + } + /*! \brief the real fmatrix */ + IFMatrix *fmat_; +}; + } // namespace io } // namespace xgboost #endif // XGBOOST_IO_PAGE_FMATRIX_INL_HPP_