simplify and parallelize data builder
This commit is contained in:
@@ -6,10 +6,6 @@
|
||||
#include "../utils/io.h"
|
||||
#include "../utils/utils.h"
|
||||
#include "simple_dmatrix-inl.hpp"
|
||||
#ifndef XGBOOST_STRICT_CXX98_
|
||||
#include "page_dmatrix-inl.hpp"
|
||||
#include "page_fmatrix-inl.hpp"
|
||||
#endif
|
||||
// implements data loading using DMatrixSimple for now
|
||||
|
||||
namespace xgboost {
|
||||
@@ -28,43 +24,12 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent,
|
||||
utils::FileStream fs(utils::FopenCheck(fname, "rb"));
|
||||
utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
|
||||
fs.Seek(0);
|
||||
|
||||
if (magic == DMatrixSimple::kMagic) {
|
||||
DMatrixSimple *dmat = new DMatrixSimple();
|
||||
dmat->LoadBinary(fs, silent, fname);
|
||||
fs.Close();
|
||||
return dmat;
|
||||
}
|
||||
#ifndef XGBOOST_STRICT_CXX98_
|
||||
std::string tmp_fname;
|
||||
const char *fname_ext = NULL;
|
||||
if (strchr(fname, ';') != NULL) {
|
||||
tmp_fname = fname;
|
||||
char *ptr = strchr(&tmp_fname[0], ';');
|
||||
ptr[0] = '\0'; fname = &tmp_fname[0];
|
||||
fname_ext = ptr + 1;
|
||||
}
|
||||
if (magic == DMatrixPage::kMagic) {
|
||||
if (fname_ext == NULL) {
|
||||
DMatrixPage *dmat = new DMatrixPage();
|
||||
dmat->Load(fs, silent, fname);
|
||||
return dmat;
|
||||
} else {
|
||||
DMatrixColPage *dmat = new DMatrixColPage(fname_ext);
|
||||
dmat->Load(fs, silent, fname, true);
|
||||
return dmat;
|
||||
}
|
||||
}
|
||||
if (magic == DMatrixColPage::kMagic) {
|
||||
std::string sfname = fname;
|
||||
if (fname_ext == NULL) {
|
||||
sfname += ".col"; fname_ext = sfname.c_str();
|
||||
}
|
||||
DMatrixColPage *dmat = new DMatrixColPage(fname_ext);
|
||||
dmat->Load(fs, silent, fname);
|
||||
return dmat;
|
||||
}
|
||||
#endif
|
||||
fs.Close();
|
||||
DMatrixSimple *dmat = new DMatrixSimple();
|
||||
dmat->CacheLoad(fname, silent, savebuffer);
|
||||
@@ -72,16 +37,6 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent,
|
||||
}
|
||||
|
||||
void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) {
|
||||
#ifndef XGBOOST_STRICT_CXX98_
|
||||
if (!strcmp(fname + strlen(fname) - 5, ".page")) {
|
||||
DMatrixPage::Save(fname, dmat, silent);
|
||||
return;
|
||||
}
|
||||
if (!strcmp(fname + strlen(fname) - 6, ".cpage")) {
|
||||
DMatrixColPage::Save(fname, dmat, silent);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
if (dmat.magic == DMatrixSimple::kMagic) {
|
||||
const DMatrixSimple *p_dmat = static_cast<const DMatrixSimple*>(&dmat);
|
||||
p_dmat->SaveBinary(fname, silent);
|
||||
|
||||
@@ -9,7 +9,8 @@
|
||||
#include "../utils/utils.h"
|
||||
#include "../utils/random.h"
|
||||
#include "../utils/omp.h"
|
||||
#include "../utils/matrix_csr.h"
|
||||
#include "../utils/group_data.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace io {
|
||||
/*!
|
||||
@@ -147,21 +148,40 @@ class FMatrixS : public IFMatrix{
|
||||
* \param pkeep probability to keep a row
|
||||
*/
|
||||
inline void InitColData(float pkeep, const std::vector<bool> &enabled) {
|
||||
// clear rowset
|
||||
buffered_rowset_.clear();
|
||||
// note: this part of code is serial, todo, parallelize this transformer
|
||||
utils::SparseCSRMBuilder<RowBatch::Entry> builder(col_ptr_, col_data_);
|
||||
builder.InitBudget(0);
|
||||
// bit map
|
||||
int nthread;
|
||||
std::vector<bool> bmap;
|
||||
#pragma omp parallel
|
||||
{
|
||||
nthread = omp_get_num_threads();
|
||||
}
|
||||
// build the column matrix in parallel
|
||||
utils::ParallelGroupBuilder<RowBatch::Entry> builder(&col_ptr_, &col_data_);
|
||||
builder.InitBudget(0, nthread);
|
||||
// start working
|
||||
iter_->BeforeFirst();
|
||||
while (iter_->Next()) {
|
||||
const RowBatch &batch = iter_->Value();
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
bmap.resize(bmap.size() + batch.size, true);
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
|
||||
buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
|
||||
buffered_rowset_.push_back(ridx);
|
||||
} else {
|
||||
bmap[i] = false;
|
||||
}
|
||||
}
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
int tid = omp_get_thread_num();
|
||||
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
if (bmap[ridx]) {
|
||||
RowBatch::Inst inst = batch[i];
|
||||
for (bst_uint j = 0; j < inst.length; ++j) {
|
||||
if (enabled[inst[j].index]){
|
||||
builder.AddBudget(inst[j].index);
|
||||
builder.AddBudget(inst[j].index, tid);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -170,19 +190,19 @@ class FMatrixS : public IFMatrix{
|
||||
builder.InitStorage();
|
||||
|
||||
iter_->BeforeFirst();
|
||||
size_t ktop = 0;
|
||||
while (iter_->Next()) {
|
||||
const RowBatch &batch = iter_->Value();
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
if (ktop < buffered_rowset_.size() &&
|
||||
buffered_rowset_[ktop] == batch.base_rowid+i) {
|
||||
++ktop;
|
||||
int tid = omp_get_thread_num();
|
||||
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
if (bmap[ridx]) {
|
||||
RowBatch::Inst inst = batch[i];
|
||||
for (bst_uint j = 0; j < inst.length; ++j) {
|
||||
if (enabled[inst[j].index]) {
|
||||
builder.PushElem(inst[j].index,
|
||||
Entry((bst_uint)(batch.base_rowid+i),
|
||||
inst[j].fvalue));
|
||||
builder.Push(inst[j].index,
|
||||
Entry((bst_uint)(batch.base_rowid+i),
|
||||
inst[j].fvalue), tid);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user