Merge remote-tracking branch 'origin/unity'
Conflicts: R-package/src/Makevars R-package/src/Makevars.win src/utils/io.h wrapper/xgboost.py
This commit is contained in:
@@ -6,6 +6,8 @@ using namespace std;
|
||||
#include "../utils/io.h"
|
||||
#include "../utils/utils.h"
|
||||
#include "simple_dmatrix-inl.hpp"
|
||||
#include "page_dmatrix-inl.hpp"
|
||||
|
||||
// implements data loads using dmatrix simple for now
|
||||
|
||||
namespace xgboost {
|
||||
@@ -21,7 +23,13 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
|
||||
dmat->LoadBinary(fs, silent, fname);
|
||||
fs.Close();
|
||||
return dmat;
|
||||
}
|
||||
}
|
||||
if (magic == DMatrixPage::kMagic) {
|
||||
DMatrixPage *dmat = new DMatrixPage();
|
||||
dmat->Load(fs, silent, fname);
|
||||
// the file pointer is held by the page matrix, so it is not closed here
|
||||
return dmat;
|
||||
}
|
||||
fs.Close();
|
||||
|
||||
DMatrixSimple *dmat = new DMatrixSimple();
|
||||
@@ -30,11 +38,17 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
|
||||
}
|
||||
|
||||
void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) {
|
||||
if (!strcmp(fname + strlen(fname) - 5, ".page")) {
|
||||
DMatrixPage::Save(fname, dmat, silent);
|
||||
return;
|
||||
}
|
||||
if (dmat.magic == DMatrixSimple::kMagic) {
|
||||
const DMatrixSimple *p_dmat = static_cast<const DMatrixSimple*>(&dmat);
|
||||
p_dmat->SaveBinary(fname, silent);
|
||||
} else {
|
||||
utils::Error("not implemented");
|
||||
DMatrixSimple smat;
|
||||
smat.CopyFrom(dmat);
|
||||
smat.SaveBinary(fname, silent);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
262
src/io/page_dmatrix-inl.hpp
Normal file
262
src/io/page_dmatrix-inl.hpp
Normal file
@@ -0,0 +1,262 @@
|
||||
#ifndef XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
|
||||
#define XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
|
||||
/*!
|
||||
* \file page_dmatrix-inl.hpp
|
||||
* row iterator based on sparse page
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include "../data.h"
|
||||
#include "../utils/iterator.h"
|
||||
#include "../utils/thread_buffer.h"
|
||||
#include "./simple_fmatrix-inl.hpp"
|
||||
#include "./page_fmatrix-inl.hpp"
|
||||
|
||||
namespace xgboost {
|
||||
namespace io {
|
||||
/*! \brief page structure that can be used to store a rowbatch */
|
||||
struct RowBatchPage {
|
||||
public:
|
||||
RowBatchPage(size_t page_size) : kPageSize(page_size) {
|
||||
data_ = new int[kPageSize];
|
||||
utils::Assert(data_ != NULL, "fail to allocate row batch page");
|
||||
this->Clear();
|
||||
}
|
||||
~RowBatchPage(void) {
|
||||
if (data_ != NULL) delete [] data_;
|
||||
}
|
||||
/*!
|
||||
* \brief Push one row into page
|
||||
* \param row an instance row
|
||||
* \return false or true to push into
|
||||
*/
|
||||
inline bool PushRow(const RowBatch::Inst &row) {
|
||||
const size_t dsize = row.length * sizeof(RowBatch::Entry);
|
||||
if (FreeBytes() < dsize+ sizeof(int)) return false;
|
||||
row_ptr(Size() + 1) = row_ptr(Size()) + row.length;
|
||||
memcpy(data_ptr(row_ptr(Size())) , row.data, dsize);
|
||||
++ data_[0];
|
||||
return true;
|
||||
}
|
||||
/*!
|
||||
* \brief get a row batch representation from the page
|
||||
* \param p_rptr a temporal space that can be used to provide
|
||||
* ind_ptr storage for RowBatch
|
||||
* \return a new RowBatch object
|
||||
*/
|
||||
inline RowBatch GetRowBatch(std::vector<size_t> *p_rptr, size_t base_rowid) {
|
||||
RowBatch batch;
|
||||
batch.base_rowid = base_rowid;
|
||||
batch.data_ptr = this->data_ptr(0);
|
||||
batch.size = static_cast<size_t>(this->Size());
|
||||
std::vector<size_t> &rptr = *p_rptr;
|
||||
rptr.resize(this->Size() + 1);
|
||||
for (size_t i = 0; i < rptr.size(); ++i) {
|
||||
rptr[i] = static_cast<size_t>(this->row_ptr(static_cast<int>(i)));
|
||||
}
|
||||
batch.ind_ptr = &rptr[0];
|
||||
return batch;
|
||||
}
|
||||
/*! \brief get i-th row from the batch */
|
||||
inline RowBatch::Inst operator[](int i) {
|
||||
return RowBatch::Inst(data_ptr(0) + row_ptr(i),
|
||||
static_cast<bst_uint>(row_ptr(i+1) - row_ptr(i)));
|
||||
}
|
||||
/*!
|
||||
* \brief clear the page, cleanup the content
|
||||
*/
|
||||
inline void Clear(void) {
|
||||
memset(&data_[0], 0, sizeof(int) * kPageSize);
|
||||
}
|
||||
/*!
|
||||
* \brief load one page form instream
|
||||
* \return true if loading is successful
|
||||
*/
|
||||
inline bool Load(utils::IStream &fi) {
|
||||
return fi.Read(&data_[0], sizeof(int) * kPageSize) != 0;
|
||||
}
|
||||
/*! \brief save one page into outstream */
|
||||
inline void Save(utils::IStream &fo) {
|
||||
fo.Write(&data_[0], sizeof(int) * kPageSize);
|
||||
}
|
||||
/*! \return number of elements */
|
||||
inline int Size(void) const {
|
||||
return data_[0];
|
||||
}
|
||||
|
||||
private:
|
||||
/*! \return number of elements */
|
||||
inline size_t FreeBytes(void) {
|
||||
return (kPageSize - (Size() + 2)) * sizeof(int)
|
||||
- row_ptr(Size()) * sizeof(RowBatch::Entry) ;
|
||||
}
|
||||
/*! \brief equivalent row pointer at i */
|
||||
inline int& row_ptr(int i) {
|
||||
return data_[kPageSize - i - 1];
|
||||
}
|
||||
inline RowBatch::Entry* data_ptr(int i) {
|
||||
return (RowBatch::Entry*)(&data_[1]) + i;
|
||||
}
|
||||
// page size
|
||||
const size_t kPageSize;
|
||||
// content of data
|
||||
int *data_;
|
||||
};
|
||||
/*! \brief thread buffer iterator */
|
||||
class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
|
||||
public:
|
||||
ThreadRowPageIterator(void) {
|
||||
itr.SetParam("buffer_size", "2");
|
||||
page_ = NULL;
|
||||
base_rowid_ = 0;
|
||||
}
|
||||
virtual ~ThreadRowPageIterator(void) {
|
||||
}
|
||||
virtual void Init(void) {
|
||||
}
|
||||
virtual void BeforeFirst(void) {
|
||||
itr.BeforeFirst();
|
||||
base_rowid_ = 0;
|
||||
}
|
||||
virtual bool Next(void) {
|
||||
if(!itr.Next(page_)) return false;
|
||||
out_ = page_->GetRowBatch(&tmp_ptr_, base_rowid_);
|
||||
base_rowid_ += out_.size;
|
||||
return true;
|
||||
}
|
||||
virtual const RowBatch &Value(void) const{
|
||||
return out_;
|
||||
}
|
||||
/*! \brief load and initialize the iterator with fi */
|
||||
inline void Load(const utils::FileStream &fi) {
|
||||
itr.get_factory().SetFile(fi);
|
||||
itr.Init();
|
||||
this->BeforeFirst();
|
||||
}
|
||||
/*!
|
||||
* \brief save a row iterator to output stream, in row iterator format
|
||||
*/
|
||||
inline static void Save(utils::IIterator<RowBatch> *iter,
|
||||
utils::IStream &fo) {
|
||||
RowBatchPage page(kPageSize);
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
const RowBatch &batch = iter->Value();
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
if (!page.PushRow(batch[i])) {
|
||||
page.Save(fo);
|
||||
page.Clear();
|
||||
utils::Check(page.PushRow(batch[i]), "row is too big");
|
||||
}
|
||||
}
|
||||
}
|
||||
if (page.Size() != 0) page.Save(fo);
|
||||
}
|
||||
/*! \brief page size 64 MB */
|
||||
static const size_t kPageSize = 64 << 18;
|
||||
private:
|
||||
// base row id
|
||||
size_t base_rowid_;
|
||||
// temporal ptr
|
||||
std::vector<size_t> tmp_ptr_;
|
||||
// output data
|
||||
RowBatch out_;
|
||||
// page pointer type
|
||||
typedef RowBatchPage* PagePtr;
|
||||
// loader factory for page
|
||||
struct Factory {
|
||||
public:
|
||||
long file_begin_;
|
||||
utils::FileStream fi;
|
||||
Factory(void) {}
|
||||
inline void SetFile(const utils::FileStream &fi) {
|
||||
this->fi = fi;
|
||||
file_begin_ = this->fi.Tell();
|
||||
}
|
||||
inline bool Init(void) {
|
||||
return true;
|
||||
}
|
||||
inline void SetParam(const char *name, const char *val) {}
|
||||
inline bool LoadNext(PagePtr &val) {
|
||||
return val->Load(fi);
|
||||
}
|
||||
inline PagePtr Create(void) {
|
||||
PagePtr a = new RowBatchPage(kPageSize);
|
||||
return a;
|
||||
}
|
||||
inline void FreeSpace(PagePtr &a) {
|
||||
delete a;
|
||||
}
|
||||
inline void Destroy(void) {
|
||||
fi.Close();
|
||||
}
|
||||
inline void BeforeFirst(void) {
|
||||
fi.Seek(file_begin_);
|
||||
}
|
||||
};
|
||||
|
||||
protected:
|
||||
PagePtr page_;
|
||||
utils::ThreadBuffer<PagePtr,Factory> itr;
|
||||
};
|
||||
|
||||
/*! \brief data matrix using page */
|
||||
class DMatrixPage : public DataMatrix {
|
||||
public:
|
||||
DMatrixPage(void) : DataMatrix(kMagic) {
|
||||
iter_ = new ThreadRowPageIterator();
|
||||
fmat_ = new FMatrixS(iter_);
|
||||
}
|
||||
// virtual destructor
|
||||
virtual ~DMatrixPage(void) {
|
||||
delete fmat_;
|
||||
}
|
||||
virtual IFMatrix *fmat(void) const {
|
||||
return fmat_;
|
||||
}
|
||||
/*! \brief load and initialize the iterator with fi */
|
||||
inline void Load(utils::FileStream &fi,
|
||||
bool silent = false,
|
||||
const char *fname = NULL){
|
||||
int magic;
|
||||
utils::Check(fi.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
|
||||
utils::Check(magic == kMagic, "invalid format,magic number mismatch");
|
||||
this->info.LoadBinary(fi);
|
||||
iter_->Load(fi);
|
||||
if (!silent) {
|
||||
printf("DMatrixPage: %lux%lu matrix is loaded",
|
||||
static_cast<unsigned long>(info.num_row()),
|
||||
static_cast<unsigned long>(info.num_col()));
|
||||
if (fname != NULL) {
|
||||
printf(" from %s\n", fname);
|
||||
} else {
|
||||
printf("\n");
|
||||
}
|
||||
if (info.group_ptr.size() != 0) {
|
||||
printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
/*! \brief save a DataMatrix as DMatrixPage*/
|
||||
inline static void Save(const char* fname, const DataMatrix &mat, bool silent) {
|
||||
utils::FileStream fs(utils::FopenCheck(fname, "wb"));
|
||||
int magic = kMagic;
|
||||
fs.Write(&magic, sizeof(magic));
|
||||
mat.info.SaveBinary(fs);
|
||||
ThreadRowPageIterator::Save(mat.fmat()->RowIterator(), fs);
|
||||
fs.Close();
|
||||
if (!silent) {
|
||||
printf("DMatrixPage: %lux%lu is saved to %s\n",
|
||||
static_cast<unsigned long>(mat.info.num_row()),
|
||||
static_cast<unsigned long>(mat.info.num_col()), fname);
|
||||
}
|
||||
}
|
||||
/*! \brief the real fmatrix */
|
||||
FMatrixS *fmat_;
|
||||
/*! \brief row iterator */
|
||||
ThreadRowPageIterator *iter_;
|
||||
/*! \brief magic number used to identify DMatrix */
|
||||
static const int kMagic = 0xffffab02;
|
||||
};
|
||||
} // namespace io
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
|
||||
316
src/io/page_fmatrix-inl.hpp
Normal file
316
src/io/page_fmatrix-inl.hpp
Normal file
@@ -0,0 +1,316 @@
|
||||
#ifndef XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
|
||||
#define XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
|
||||
/*!
|
||||
* \file page_fmatrix-inl.hpp
|
||||
* sparse page manager for fmatrix
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include "../data.h"
|
||||
#include "../utils/iterator.h"
|
||||
#include "../utils/thread_buffer.h"
|
||||
namespace xgboost {
|
||||
namespace io {
|
||||
|
||||
class CSCMatrixManager {
|
||||
public:
|
||||
/*! \brief in memory page */
|
||||
struct Page {
|
||||
public:
|
||||
/*! \brief initialize the page */
|
||||
explicit Page(size_t size) {
|
||||
buffer.resize(size);
|
||||
col_index.reserve(10);
|
||||
col_data.reserve(10);
|
||||
}
|
||||
/*! \brief clear the page */
|
||||
inline void Clear(void) {
|
||||
num_entry = 0;
|
||||
col_index.clear();
|
||||
col_data.clear();
|
||||
}
|
||||
/*! \brief number of used entries */
|
||||
size_t num_entry;
|
||||
/*! \brief column index */
|
||||
std::vector<bst_uint> col_index;
|
||||
/*! \brief column data */
|
||||
std::vector<ColBatch::Inst> col_data;
|
||||
/*! \brief number of free entries */
|
||||
inline size_t NumFreeEntry(void) const {
|
||||
return buffer.size() - num_entry;
|
||||
}
|
||||
inline ColBatch::Entry* AllocEntry(size_t len) {
|
||||
ColBatch::Entry *p_data = &buffer[0] + num_entry;
|
||||
num_entry += len;
|
||||
return p_data;
|
||||
}
|
||||
/*! \brief get underlying batch */
|
||||
inline ColBatch GetBatch(void) const {
|
||||
ColBatch batch;
|
||||
batch.col_index = &col_index[0];
|
||||
batch.col_data = &col_data[0];
|
||||
return batch;
|
||||
}
|
||||
private:
|
||||
/*! \brief buffer space, not to be changed since ready */
|
||||
std::vector<ColBatch::Entry> buffer;
|
||||
};
|
||||
/*! \brief define type of page pointer */
|
||||
typedef Page *PagePtr;
|
||||
/*! \brief get column pointer */
|
||||
inline const std::vector<size_t> &col_ptr(void) const {
|
||||
return col_ptr_;
|
||||
}
|
||||
inline void SetParam(const char *name, const char *val) {
|
||||
}
|
||||
inline PagePtr Create(void) {
|
||||
return new Page(page_size_);
|
||||
}
|
||||
inline void FreeSpace(PagePtr &a) {
|
||||
delete a;
|
||||
}
|
||||
inline void Destroy(void) {
|
||||
}
|
||||
inline void BeforeFirst(void) {
|
||||
col_index_ = col_todo_;
|
||||
read_top_ = 0;
|
||||
}
|
||||
inline bool LoadNext(PagePtr &val) {
|
||||
val->Clear();
|
||||
if (read_top_ >= col_index_.size()) return false;
|
||||
while (read_top_ < col_index_.size()) {
|
||||
if (!this->TryFill(col_index_[read_top_], val)) return true;
|
||||
++read_top_;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
inline bool Init(void) {
|
||||
this->BeforeFirst();
|
||||
return true;
|
||||
}
|
||||
inline void Setup(utils::ISeekStream *fi, double page_ratio) {
|
||||
fi_ = fi;
|
||||
fi_->Read(&begin_meta_ , sizeof(size_t));
|
||||
fi_->Seek(begin_meta_);
|
||||
fi_->Read(&col_ptr_);
|
||||
size_t psmax = 0;
|
||||
for (size_t i = 0; i < col_ptr_.size() - 1; ++i) {
|
||||
psmax = std::max(psmax, col_ptr_[i+1] - col_ptr_[i]);
|
||||
}
|
||||
utils::Check(page_ratio >= 1.0f, "col_page_ratio must be at least 1");
|
||||
page_size_ = std::max(static_cast<size_t>(psmax * page_ratio), psmax);
|
||||
}
|
||||
inline void SetColSet(const std::vector<bst_uint> &cset, bool setall) {
|
||||
if (!setall) {
|
||||
col_todo_.resize(cset.size());
|
||||
for (size_t i = 0; i < cset.size(); ++i) {
|
||||
col_todo_[i] = cset[i];
|
||||
utils::Assert(col_todo_[i] < static_cast<bst_uint>(col_ptr_.size() - 1),
|
||||
"CSCMatrixManager: column index exceed bound");
|
||||
}
|
||||
std::sort(col_todo_.begin(), col_todo_.end());
|
||||
} else {
|
||||
col_todo_.resize(col_ptr_.size()-1);
|
||||
for (size_t i = 0; i < col_todo_.size(); ++i) {
|
||||
col_todo_[i] = static_cast<bst_uint>(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
private:
|
||||
/*! \brief fill a page with */
|
||||
inline bool TryFill(size_t cidx, Page *p_page) {
|
||||
size_t len = col_ptr_[cidx+1] - col_ptr_[cidx];
|
||||
if (p_page->NumFreeEntry() < len) return false;
|
||||
ColBatch::Entry *p_data = p_page->AllocEntry(len);
|
||||
fi_->Seek(col_ptr_[cidx] * sizeof(ColBatch::Entry) + sizeof(size_t));
|
||||
utils::Check(fi_->Read(p_data, sizeof(ColBatch::Entry) * len) != 0,
|
||||
"invalid column buffer format");
|
||||
p_page->col_data.push_back(ColBatch::Inst(p_data, len));
|
||||
p_page->col_index.push_back(cidx);
|
||||
}
|
||||
// the following are in memory auxiliary data structure
|
||||
/*! \brief top of reader position */
|
||||
size_t read_top_;
|
||||
/*! \brief size of page */
|
||||
size_t page_size_;
|
||||
/*! \brief column index to be loaded */
|
||||
std::vector<bst_uint> col_index_;
|
||||
/*! \brief column index to be after calling before first */
|
||||
std::vector<bst_uint> col_todo_;
|
||||
// the following are input content
|
||||
/*! \brief size of data content */
|
||||
size_t begin_meta_;
|
||||
/*! \brief input stream */
|
||||
utils::ISeekStream *fi_;
|
||||
/*! \brief column pointer of CSC format */
|
||||
std::vector<size_t> col_ptr_;
|
||||
};
|
||||
|
||||
/*!
 * \brief column batch iterator that pulls pages out of a utils::ThreadBuffer
 *  whose factory is a CSCMatrixManager reading from the column page file
 */
class ThreadColPageIterator : public utils::IIterator<ColBatch> {
 public:
  ThreadColPageIterator(void) {
    itr_.SetParam("buffer_size", "2");
    page_ = NULL;
    fi_ = NULL;
    silent = 0;
    // smallest ratio accepted by CSCMatrixManager::Setup; previously the
    // member was read uninitialised when the parameter was never set
    col_pageratio_ = 1.0f;
  }
  /*! \brief close and release the column page file if it was opened */
  virtual ~ThreadColPageIterator(void) {
    if (fi_ != NULL) {
      fi_->Close(); delete fi_;
    }
  }
  /*!
   * \brief open the column page file and set up the page factory
   *  col_pagefile must have been set through SetParam before this call
   *  NOTE(review): unlike ThreadRowPageIterator::Load this does not call
   *  Init() on the thread buffer itself, confirm whether that is intended
   */
  virtual void Init(void) {
    fi_ = new utils::FileStream(utils::FopenCheck(col_pagefile_.c_str(), "rb"));
    itr_.get_factory().Setup(fi_, col_pageratio_);
    if (silent == 0) {
      printf("ThreadColPageIterator: finish initialzing from %s, %u columns\n",
             col_pagefile_.c_str(), static_cast<unsigned>(col_ptr().size() - 1));
    }
  }
  /*!
   * \brief set a parameter of the iterator
   * \param name parameter name, one of col_pageratio, col_pagefile, silent;
   *  other names are ignored
   * \param val parameter value as text
   */
  virtual void SetParam(const char *name, const char *val) {
    // match on the parameter name; comparing against val never matched
    if (!strcmp("col_pageratio", name)) col_pageratio_ = atof(val);
    if (!strcmp("col_pagefile", name)) col_pagefile_ = val;
    if (!strcmp("silent", name)) silent = atoi(val);
  }
  /*! \brief restart the iteration */
  virtual void BeforeFirst(void) {
    itr_.BeforeFirst();
  }
  /*!
   * \brief advance to the next page of columns
   * \return false when the buffer has no further page
   */
  virtual bool Next(void) {
    if (!itr_.Next(page_)) return false;
    out_ = page_->GetBatch();
    return true;
  }
  /*! \return the batch produced by the last successful Next */
  virtual const ColBatch &Value(void) const {
    return out_;
  }
  /*! \brief column pointer held by the page factory */
  inline const std::vector<size_t> &col_ptr(void) const {
    return itr_.get_factory().col_ptr();
  }
  /*!
   * \brief choose the columns to iterate over
   * \param cset requested column indices, ignored when setall is true
   * \param setall when true all columns are selected
   */
  inline void SetColSet(const std::vector<bst_uint> &cset, bool setall = false) {
    itr_.get_factory().SetColSet(cset, setall);
  }

 private:
  // shutup
  int silent;
  // input file
  utils::FileStream *fi_;
  // page capacity relative to the largest column
  float col_pageratio_;
  // name of file
  std::string col_pagefile_;
  // output data
  ColBatch out_;
  // page to be loaded
  CSCMatrixManager::PagePtr page_;
  // internal iterator
  utils::ThreadBuffer<CSCMatrixManager::PagePtr,CSCMatrixManager> itr_;
};
|
||||
|
||||
/*!
|
||||
* \brief sparse matrix that support column access
|
||||
*/
|
||||
class FMatrixPage : public IFMatrix {
|
||||
public:
|
||||
/*! \brief constructor */
|
||||
FMatrixPage(utils::IIterator<RowBatch> *iter) {
|
||||
this->row_iter_ = iter;
|
||||
this->col_iter_ = NULL;
|
||||
}
|
||||
// destructor
|
||||
virtual ~FMatrixPage(void) {
|
||||
if (row_iter_ != NULL) delete row_iter_;
|
||||
if (col_iter_ != NULL) delete col_iter_;
|
||||
}
|
||||
/*! \return whether column access is enabled */
|
||||
virtual bool HaveColAccess(void) const {
|
||||
return col_iter_ != NULL;
|
||||
}
|
||||
/*! \brief get number of colmuns */
|
||||
virtual size_t NumCol(void) const {
|
||||
utils::Check(this->HaveColAccess(), "NumCol:need column access");
|
||||
return col_iter_->col_ptr().size() - 1;
|
||||
}
|
||||
/*! \brief get number of buffered rows */
|
||||
virtual const std::vector<bst_uint> &buffered_rowset(void) const {
|
||||
return buffered_rowset_;
|
||||
}
|
||||
/*! \brief get column size */
|
||||
virtual size_t GetColSize(size_t cidx) const {
|
||||
const std::vector<size_t> &col_ptr = col_iter_->col_ptr();
|
||||
return col_ptr[cidx+1] - col_ptr[cidx];
|
||||
}
|
||||
/*! \brief get column density */
|
||||
virtual float GetColDensity(size_t cidx) const {
|
||||
const std::vector<size_t> &col_ptr = col_iter_->col_ptr();
|
||||
size_t nmiss = buffered_rowset_.size() - (col_ptr[cidx+1] - col_ptr[cidx]);
|
||||
return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
|
||||
}
|
||||
virtual void InitColAccess(float pkeep = 1.0f) {
|
||||
if (this->HaveColAccess()) return;
|
||||
this->InitColData(pkeep);
|
||||
}
|
||||
/*!
|
||||
* \brief get the row iterator associated with FMatrix
|
||||
*/
|
||||
virtual utils::IIterator<RowBatch>* RowIterator(void) {
|
||||
row_iter_->BeforeFirst();
|
||||
return row_iter_;
|
||||
}
|
||||
/*!
|
||||
* \brief get the column based iterator
|
||||
*/
|
||||
virtual utils::IIterator<ColBatch>* ColIterator(void) {
|
||||
std::vector<bst_uint> cset;
|
||||
col_iter_->SetColSet(cset, true);
|
||||
col_iter_->BeforeFirst();
|
||||
return col_iter_;
|
||||
}
|
||||
/*!
|
||||
* \brief colmun based iterator
|
||||
*/
|
||||
virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
|
||||
col_iter_->SetColSet(fset, false);
|
||||
col_iter_->BeforeFirst();
|
||||
return col_iter_;
|
||||
}
|
||||
|
||||
protected:
|
||||
/*!
|
||||
* \brief intialize column data
|
||||
* \param pkeep probability to keep a row
|
||||
*/
|
||||
inline void InitColData(float pkeep) {
|
||||
buffered_rowset_.clear();
|
||||
// start working
|
||||
row_iter_->BeforeFirst();
|
||||
while (row_iter_->Next()) {
|
||||
const RowBatch &batch = row_iter_->Value();
|
||||
|
||||
}
|
||||
row_iter_->BeforeFirst();
|
||||
size_t ktop = 0;
|
||||
while (row_iter_->Next()) {
|
||||
const RowBatch &batch = row_iter_->Value();
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
if (ktop < buffered_rowset_.size() &&
|
||||
buffered_rowset_[ktop] == batch.base_rowid + i) {
|
||||
++ktop;
|
||||
// TODO1
|
||||
}
|
||||
}
|
||||
}
|
||||
// sort columns
|
||||
}
|
||||
|
||||
private:
|
||||
// row iterator
|
||||
utils::IIterator<RowBatch> *row_iter_;
|
||||
// column iterator
|
||||
ThreadColPageIterator *col_iter_;
|
||||
/*! \brief list of row index that are buffered */
|
||||
std::vector<bst_uint> buffered_rowset_;
|
||||
};
|
||||
|
||||
} // namespace io
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
|
||||
@@ -44,8 +44,8 @@ class DMatrixSimple : public DataMatrix {
|
||||
}
|
||||
/*! \brief copy content data from source matrix */
|
||||
inline void CopyFrom(const DataMatrix &src) {
|
||||
this->info = src.info;
|
||||
this->Clear();
|
||||
this->info = src.info;
|
||||
// clone data content in this matrix
|
||||
utils::IIterator<RowBatch> *iter = src.fmat()->RowIterator();
|
||||
iter->BeforeFirst();
|
||||
|
||||
@@ -150,7 +150,7 @@ class FMatrixS : public IFMatrix{
|
||||
iter_->BeforeFirst();
|
||||
while (iter_->Next()) {
|
||||
const RowBatch &batch = iter_->Value();
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
|
||||
buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
|
||||
RowBatch::Inst inst = batch[i];
|
||||
|
||||
Reference in New Issue
Block a user