some lint
This commit is contained in:
parent
e6e467ad60
commit
401d648372
@ -5,6 +5,7 @@
|
|||||||
* row iterator based on sparse page
|
* row iterator based on sparse page
|
||||||
* \author Tianqi Chen
|
* \author Tianqi Chen
|
||||||
*/
|
*/
|
||||||
|
#include <vector>
|
||||||
#include "../data.h"
|
#include "../data.h"
|
||||||
#include "../utils/iterator.h"
|
#include "../utils/iterator.h"
|
||||||
#include "../utils/thread_buffer.h"
|
#include "../utils/thread_buffer.h"
|
||||||
@ -15,7 +16,7 @@ namespace io {
|
|||||||
/*! \brief page structure that can be used to store a rowbatch */
|
/*! \brief page structure that can be used to store a rowbatch */
|
||||||
struct RowBatchPage {
|
struct RowBatchPage {
|
||||||
public:
|
public:
|
||||||
RowBatchPage(size_t page_size) : kPageSize(page_size) {
|
explicit RowBatchPage(size_t page_size) : kPageSize(page_size) {
|
||||||
data_ = new int[kPageSize];
|
data_ = new int[kPageSize];
|
||||||
utils::Assert(data_ != NULL, "fail to allocate row batch page");
|
utils::Assert(data_ != NULL, "fail to allocate row batch page");
|
||||||
this->Clear();
|
this->Clear();
|
||||||
@ -31,10 +32,10 @@ struct RowBatchPage {
|
|||||||
inline bool PushRow(const RowBatch::Inst &row) {
|
inline bool PushRow(const RowBatch::Inst &row) {
|
||||||
const size_t dsize = row.length * sizeof(RowBatch::Entry);
|
const size_t dsize = row.length * sizeof(RowBatch::Entry);
|
||||||
if (FreeBytes() < dsize+ sizeof(int)) return false;
|
if (FreeBytes() < dsize+ sizeof(int)) return false;
|
||||||
row_ptr(Size() + 1) = row_ptr(Size()) + row.length;
|
row_ptr(Size() + 1) = row_ptr(Size()) + row.length;
|
||||||
memcpy(data_ptr(row_ptr(Size())) , row.data, dsize);
|
memcpy(data_ptr(row_ptr(Size())) , row.data, dsize);
|
||||||
++ data_[0];
|
++data_[0];
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
* \brief get a row batch representation from the page
|
* \brief get a row batch representation from the page
|
||||||
@ -43,7 +44,7 @@ struct RowBatchPage {
|
|||||||
* \return a new RowBatch object
|
* \return a new RowBatch object
|
||||||
*/
|
*/
|
||||||
inline RowBatch GetRowBatch(std::vector<size_t> *p_rptr, size_t base_rowid) {
|
inline RowBatch GetRowBatch(std::vector<size_t> *p_rptr, size_t base_rowid) {
|
||||||
RowBatch batch;
|
RowBatch batch;
|
||||||
batch.base_rowid = base_rowid;
|
batch.base_rowid = base_rowid;
|
||||||
batch.data_ptr = this->data_ptr(0);
|
batch.data_ptr = this->data_ptr(0);
|
||||||
batch.size = static_cast<size_t>(this->Size());
|
batch.size = static_cast<size_t>(this->Size());
|
||||||
@ -57,7 +58,7 @@ struct RowBatchPage {
|
|||||||
}
|
}
|
||||||
/*! \brief get i-th row from the batch */
|
/*! \brief get i-th row from the batch */
|
||||||
inline RowBatch::Inst operator[](int i) {
|
inline RowBatch::Inst operator[](int i) {
|
||||||
return RowBatch::Inst(data_ptr(0) + row_ptr(i),
|
return RowBatch::Inst(data_ptr(0) + row_ptr(i),
|
||||||
static_cast<bst_uint>(row_ptr(i+1) - row_ptr(i)));
|
static_cast<bst_uint>(row_ptr(i+1) - row_ptr(i)));
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
@ -85,8 +86,8 @@ struct RowBatchPage {
|
|||||||
private:
|
private:
|
||||||
/*! \return number of free bytes left in the page */
|
/*! \return number of free bytes left in the page */
|
||||||
inline size_t FreeBytes(void) {
|
inline size_t FreeBytes(void) {
|
||||||
return (kPageSize - (Size() + 2)) * sizeof(int)
|
return (kPageSize - (Size() + 2)) * sizeof(int) -
|
||||||
- row_ptr(Size()) * sizeof(RowBatch::Entry) ;
|
row_ptr(Size()) * sizeof(RowBatch::Entry);
|
||||||
}
|
}
|
||||||
/*! \brief equivalent row pointer at i */
|
/*! \brief equivalent row pointer at i */
|
||||||
inline int& row_ptr(int i) {
|
inline int& row_ptr(int i) {
|
||||||
@ -98,7 +99,7 @@ struct RowBatchPage {
|
|||||||
// page size
|
// page size
|
||||||
const size_t kPageSize;
|
const size_t kPageSize;
|
||||||
// content of data
|
// content of data
|
||||||
int *data_;
|
int *data_;
|
||||||
};
|
};
|
||||||
/*! \brief thread buffer iterator */
|
/*! \brief thread buffer iterator */
|
||||||
class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
|
class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
|
||||||
@ -108,8 +109,7 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
|
|||||||
page_ = NULL;
|
page_ = NULL;
|
||||||
base_rowid_ = 0;
|
base_rowid_ = 0;
|
||||||
}
|
}
|
||||||
virtual ~ThreadRowPageIterator(void) {
|
virtual ~ThreadRowPageIterator(void) {}
|
||||||
}
|
|
||||||
virtual void Init(void) {
|
virtual void Init(void) {
|
||||||
}
|
}
|
||||||
virtual void BeforeFirst(void) {
|
virtual void BeforeFirst(void) {
|
||||||
@ -117,12 +117,12 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
|
|||||||
base_rowid_ = 0;
|
base_rowid_ = 0;
|
||||||
}
|
}
|
||||||
virtual bool Next(void) {
|
virtual bool Next(void) {
|
||||||
if(!itr.Next(page_)) return false;
|
if (!itr.Next(page_)) return false;
|
||||||
out_ = page_->GetRowBatch(&tmp_ptr_, base_rowid_);
|
out_ = page_->GetRowBatch(&tmp_ptr_, base_rowid_);
|
||||||
base_rowid_ += out_.size;
|
base_rowid_ += out_.size;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
virtual const RowBatch &Value(void) const{
|
virtual const RowBatch &Value(void) const {
|
||||||
return out_;
|
return out_;
|
||||||
}
|
}
|
||||||
/*! \brief load and initialize the iterator with fi */
|
/*! \brief load and initialize the iterator with fi */
|
||||||
@ -152,6 +152,7 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
|
|||||||
}
|
}
|
||||||
/*! \brief page size 64 MB */
|
/*! \brief page size 64 MB */
|
||||||
static const size_t kPageSize = 64 << 18;
|
static const size_t kPageSize = 64 << 18;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// base row id
|
// base row id
|
||||||
size_t base_rowid_;
|
size_t base_rowid_;
|
||||||
@ -195,7 +196,7 @@ class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
|
|||||||
|
|
||||||
protected:
|
protected:
|
||||||
PagePtr page_;
|
PagePtr page_;
|
||||||
utils::ThreadBuffer<PagePtr,Factory> itr;
|
utils::ThreadBuffer<PagePtr, Factory> itr;
|
||||||
};
|
};
|
||||||
|
|
||||||
/*! \brief data matrix using page */
|
/*! \brief data matrix using page */
|
||||||
@ -213,10 +214,10 @@ class DMatrixPageBase : public DataMatrix {
|
|||||||
/*! \brief load and initialize the iterator with fi */
|
/*! \brief load and initialize the iterator with fi */
|
||||||
inline void Load(utils::FileStream &fi,
|
inline void Load(utils::FileStream &fi,
|
||||||
bool silent = false,
|
bool silent = false,
|
||||||
const char *fname = NULL){
|
const char *fname = NULL) {
|
||||||
int tmagic;
|
int tmagic;
|
||||||
utils::Check(fi.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
|
utils::Check(fi.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
|
||||||
utils::Check(tmagic == magic, "invalid format,magic number mismatch");
|
utils::Check(tmagic == magic, "invalid format,magic number mismatch");
|
||||||
this->info.LoadBinary(fi);
|
this->info.LoadBinary(fi);
|
||||||
iter_->Load(fi);
|
iter_->Load(fi);
|
||||||
if (!silent) {
|
if (!silent) {
|
||||||
@ -229,7 +230,7 @@ class DMatrixPageBase : public DataMatrix {
|
|||||||
utils::Printf("\n");
|
utils::Printf("\n");
|
||||||
}
|
}
|
||||||
if (info.group_ptr.size() != 0) {
|
if (info.group_ptr.size() != 0) {
|
||||||
utils::Printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1);
|
utils::Printf("data contains %u groups\n", (unsigned)info.group_ptr.size() - 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -249,8 +250,8 @@ class DMatrixPageBase : public DataMatrix {
|
|||||||
}
|
}
|
||||||
/*! \brief magic number used to identify DMatrix */
|
/*! \brief magic number used to identify DMatrix */
|
||||||
static const int kMagic = TKMagic;
|
static const int kMagic = TKMagic;
|
||||||
protected:
|
|
||||||
|
|
||||||
|
protected:
|
||||||
/*! \brief row iterator */
|
/*! \brief row iterator */
|
||||||
ThreadRowPageIterator *iter_;
|
ThreadRowPageIterator *iter_;
|
||||||
};
|
};
|
||||||
|
|||||||
@ -5,6 +5,9 @@
|
|||||||
* sparse page manager for fmatrix
|
* sparse page manager for fmatrix
|
||||||
* \author Tianqi Chen
|
* \author Tianqi Chen
|
||||||
*/
|
*/
|
||||||
|
#include <vector>
|
||||||
|
#include <string>
|
||||||
|
#include <algorithm>
|
||||||
#include "../data.h"
|
#include "../data.h"
|
||||||
#include "../utils/iterator.h"
|
#include "../utils/iterator.h"
|
||||||
#include "../utils/io.h"
|
#include "../utils/io.h"
|
||||||
@ -34,7 +37,7 @@ class CSCMatrixManager {
|
|||||||
/*! \brief column index */
|
/*! \brief column index */
|
||||||
std::vector<bst_uint> col_index;
|
std::vector<bst_uint> col_index;
|
||||||
/*! \brief column data */
|
/*! \brief column data */
|
||||||
std::vector<ColBatch::Inst> col_data;
|
std::vector<ColBatch::Inst> col_data;
|
||||||
/*! \brief number of free entries */
|
/*! \brief number of free entries */
|
||||||
inline size_t NumFreeEntry(void) const {
|
inline size_t NumFreeEntry(void) const {
|
||||||
return buffer.size() - num_entry;
|
return buffer.size() - num_entry;
|
||||||
@ -52,6 +55,7 @@ class CSCMatrixManager {
|
|||||||
batch.col_data = BeginPtr(col_data);
|
batch.col_data = BeginPtr(col_data);
|
||||||
return batch;
|
return batch;
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/*! \brief buffer space, not to be changed since ready */
|
/*! \brief buffer space, not to be changed since ready */
|
||||||
std::vector<ColBatch::Entry> buffer;
|
std::vector<ColBatch::Entry> buffer;
|
||||||
@ -80,7 +84,7 @@ class CSCMatrixManager {
|
|||||||
col_index_ = col_todo_;
|
col_index_ = col_todo_;
|
||||||
read_top_ = 0;
|
read_top_ = 0;
|
||||||
}
|
}
|
||||||
inline bool LoadNext(PagePtr &val) {
|
inline bool LoadNext(PagePtr &val) {
|
||||||
val->Clear();
|
val->Clear();
|
||||||
if (read_top_ >= col_index_.size()) return false;
|
if (read_top_ >= col_index_.size()) return false;
|
||||||
while (read_top_ < col_index_.size()) {
|
while (read_top_ < col_index_.size()) {
|
||||||
@ -106,7 +110,7 @@ class CSCMatrixManager {
|
|||||||
psmax = std::max(psmax, col_ptr_[i+1] - col_ptr_[i]);
|
psmax = std::max(psmax, col_ptr_[i+1] - col_ptr_[i]);
|
||||||
}
|
}
|
||||||
utils::Check(page_ratio >= 1.0f, "col_page_ratio must be at least 1");
|
utils::Check(page_ratio >= 1.0f, "col_page_ratio must be at least 1");
|
||||||
page_size_ = std::max(static_cast<size_t>(psmax * page_ratio), psmax);
|
page_size_ = std::max(static_cast<size_t>(psmax * page_ratio), psmax);
|
||||||
}
|
}
|
||||||
inline void SetColSet(const std::vector<bst_uint> &cset, bool setall) {
|
inline void SetColSet(const std::vector<bst_uint> &cset, bool setall) {
|
||||||
if (!setall) {
|
if (!setall) {
|
||||||
@ -124,6 +128,7 @@ class CSCMatrixManager {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/*! \brief fill a page with */
|
/*! \brief fill a page with */
|
||||||
inline bool TryFill(size_t cidx, Page *p_page) {
|
inline bool TryFill(size_t cidx, Page *p_page) {
|
||||||
@ -173,21 +178,22 @@ class ThreadColPageIterator : public utils::IIterator<ColBatch> {
|
|||||||
}
|
}
|
||||||
virtual void BeforeFirst(void) {
|
virtual void BeforeFirst(void) {
|
||||||
itr_.BeforeFirst();
|
itr_.BeforeFirst();
|
||||||
}
|
}
|
||||||
virtual bool Next(void) {
|
virtual bool Next(void) {
|
||||||
// page to be loaded
|
// page to be loaded
|
||||||
CSCMatrixManager::PagePtr page;
|
CSCMatrixManager::PagePtr page;
|
||||||
if(!itr_.Next(page)) return false;
|
if (!itr_.Next(page)) return false;
|
||||||
out_ = page->GetBatch();
|
out_ = page->GetBatch();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
virtual const ColBatch &Value(void) const{
|
virtual const ColBatch &Value(void) const {
|
||||||
return out_;
|
return out_;
|
||||||
}
|
}
|
||||||
inline const std::vector<size_t> &col_ptr(void) const {
|
inline const std::vector<size_t> &col_ptr(void) const {
|
||||||
return itr_.get_factory().col_ptr();
|
return itr_.get_factory().col_ptr();
|
||||||
}
|
}
|
||||||
inline void SetColSet(const std::vector<bst_uint> &cset, bool setall = false) {
|
inline void SetColSet(const std::vector<bst_uint> &cset,
|
||||||
|
bool setall = false) {
|
||||||
itr_.get_factory().SetColSet(cset, setall);
|
itr_.get_factory().SetColSet(cset, setall);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -195,9 +201,8 @@ class ThreadColPageIterator : public utils::IIterator<ColBatch> {
|
|||||||
// output data
|
// output data
|
||||||
ColBatch out_;
|
ColBatch out_;
|
||||||
// internal iterator
|
// internal iterator
|
||||||
utils::ThreadBuffer<CSCMatrixManager::PagePtr,CSCMatrixManager> itr_;
|
utils::ThreadBuffer<CSCMatrixManager::PagePtr, CSCMatrixManager> itr_;
|
||||||
};
|
};
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* \brief sparse matrix that supports column access
|
* \brief sparse matrix that supports column access
|
||||||
*/
|
*/
|
||||||
@ -216,7 +221,7 @@ class FMatrixPage : public IFMatrix {
|
|||||||
if (col_iter_ != NULL) delete col_iter_;
|
if (col_iter_ != NULL) delete col_iter_;
|
||||||
if (fi_ != NULL) {
|
if (fi_ != NULL) {
|
||||||
fi_->Close(); delete fi_;
|
fi_->Close(); delete fi_;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/*! \return whether column access is enabled */
|
/*! \return whether column access is enabled */
|
||||||
virtual bool HaveColAccess(void) const {
|
virtual bool HaveColAccess(void) const {
|
||||||
@ -272,7 +277,7 @@ class FMatrixPage : public IFMatrix {
|
|||||||
col_iter_->BeforeFirst();
|
col_iter_->BeforeFirst();
|
||||||
return col_iter_;
|
return col_iter_;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
/*!
|
/*!
|
||||||
* \brief try load column data from file
|
* \brief try load column data from file
|
||||||
@ -282,25 +287,24 @@ class FMatrixPage : public IFMatrix {
|
|||||||
if (fp == NULL) return false;
|
if (fp == NULL) return false;
|
||||||
fi_ = new utils::FileStream(fp);
|
fi_ = new utils::FileStream(fp);
|
||||||
static_cast<utils::IStream*>(fi_)->Read(&buffered_rowset_);
|
static_cast<utils::IStream*>(fi_)->Read(&buffered_rowset_);
|
||||||
col_iter_ = new ThreadColPageIterator(fi_, 2.0f, false);
|
col_iter_ = new ThreadColPageIterator(fi_, 2.0f, false);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
* \brief initialize column data
|
* \brief initialize column data
|
||||||
* \param pkeep probability to keep a row
|
* \param pkeep probability to keep a row
|
||||||
*/
|
*/
|
||||||
inline void InitColData(float pkeep, const char *fname,
|
inline void InitColData(float pkeep, const char *fname,
|
||||||
size_t buffer_size, size_t col_step) {
|
size_t buffer_size, size_t col_step) {
|
||||||
buffered_rowset_.clear();
|
buffered_rowset_.clear();
|
||||||
utils::FileStream fo(utils::FopenCheck(fname, "wb+"));
|
utils::FileStream fo(utils::FopenCheck(fname, "wb+"));
|
||||||
// use 64M buffer
|
// use 64M buffer
|
||||||
utils::SparseCSRFileBuilder<ColBatch::Entry> builder(&fo, buffer_size);
|
utils::SparseCSRFileBuilder<ColBatch::Entry> builder(&fo, buffer_size);
|
||||||
|
|
||||||
// start working
|
// start working
|
||||||
row_iter_->BeforeFirst();
|
row_iter_->BeforeFirst();
|
||||||
while (row_iter_->Next()) {
|
while (row_iter_->Next()) {
|
||||||
const RowBatch &batch = row_iter_->Value();
|
const RowBatch &batch = row_iter_->Value();
|
||||||
for (size_t i = 0; i < batch.size; ++i) {
|
for (size_t i = 0; i < batch.size; ++i) {
|
||||||
if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
|
if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
|
||||||
buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
|
buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
|
||||||
RowBatch::Inst inst = batch[i];
|
RowBatch::Inst inst = batch[i];
|
||||||
@ -350,7 +354,7 @@ class FMatrixPage : public IFMatrix {
|
|||||||
|
|
||||||
class DMatrixColPage : public DMatrixPageBase<0xffffab03> {
|
class DMatrixColPage : public DMatrixPageBase<0xffffab03> {
|
||||||
public:
|
public:
|
||||||
DMatrixColPage(const char *fname) {
|
explicit DMatrixColPage(const char *fname) {
|
||||||
std::string fext = fname;
|
std::string fext = fname;
|
||||||
fext += ".col";
|
fext += ".col";
|
||||||
fmat_ = new FMatrixPage(iter_, fext.c_str());
|
fmat_ = new FMatrixPage(iter_, fext.c_str());
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user