This commit is contained in:
tqchen
2015-07-03 19:35:23 -07:00
parent aba41d07cd
commit 1123253f79
10 changed files with 178 additions and 143 deletions

View File

@@ -1,6 +1,8 @@
// Copyright by Contributors
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#define NOMINMAX
#include <string>
#include "../utils/io.h"
// implements a single no split version of DMLC
@@ -9,7 +11,7 @@
namespace xgboost {
namespace utils {
/*!
* \brief line split implementation from single FILE
* \brief line split implementation from single FILE
* simply returns lines of files, used for stdin
*/
class SingleFileSplit : public dmlc::InputSplit {
@@ -32,7 +34,7 @@ class SingleFileSplit : public dmlc::InputSplit {
}
/*!
 * \brief read raw bytes from the file handle fp_
 * \param ptr destination buffer
 * \param size maximum number of bytes to read
 * \return number of bytes actually read, as reported by std::fread
 *         (element size is 1, so the element count equals the byte count)
 */
virtual size_t Read(void *ptr, size_t size) {
return std::fread(ptr, 1, size, fp_);
}
}
/*!
 * \brief writing is not supported on an input split;
 *        every call reports an error through utils::Error
 * \param ptr unused
 * \param size unused
 */
virtual void Write(const void *ptr, size_t size) {
utils::Error("cannot do write in inputsplit");
}
@@ -47,13 +49,13 @@ class SingleFileSplit : public dmlc::InputSplit {
chunk_end_);
out_rec->dptr = chunk_begin_;
out_rec->size = next - chunk_begin_;
chunk_begin_ = next;
chunk_begin_ = next;
return true;
}
virtual bool NextChunk(Blob *out_chunk) {
if (chunk_begin_ == chunk_end_) {
if (!LoadChunk()) return false;
}
}
out_chunk->dptr = chunk_begin_;
out_chunk->size = chunk_end_ - chunk_begin_;
chunk_begin_ = chunk_end_;
@@ -64,8 +66,8 @@ class SingleFileSplit : public dmlc::InputSplit {
if (max_size <= overflow_.length()) {
*size = 0; return true;
}
if (overflow_.length() != 0) {
std::memcpy(buf, BeginPtr(overflow_), overflow_.length());
if (overflow_.length() != 0) {
std::memcpy(buf, BeginPtr(overflow_), overflow_.length());
}
size_t olen = overflow_.length();
overflow_.resize(0);
@@ -88,13 +90,13 @@ class SingleFileSplit : public dmlc::InputSplit {
return true;
}
}
protected:
/*!
 * \brief locate where the last record inside [begin, end) starts
 *
 *  The range is scanned backwards from the last byte. The first
 *  '\n' or '\r' that is met marks the end of the previous record,
 *  so the position right after it is returned.
 *  The byte stored at begin itself is never inspected.
 *
 * \param begin start of the buffer
 * \param end one past the last byte of the buffer
 * \return pointer just after the last line terminator found,
 *         or begin when the range is empty or holds no terminator
 *         after its first byte
 */
inline const char* FindLastRecordBegin(const char *begin,
                                       const char *end) {
  if (begin != end) {
    const char *cursor = end;
    // pre-decrement first, so the walk covers end - 1 down to begin + 1
    while (--cursor != begin) {
      const char ch = *cursor;
      if (ch == '\n' || ch == '\r') return cursor + 1;
    }
  }
  return begin;
}
@@ -143,7 +145,7 @@ class StdFile : public dmlc::Stream {
public:
/*!
 * \brief wrap an already opened C file handle
 * \param fp file handle to store
 * \param use_stdio flag stored next to the handle
 *        NOTE(review): presumably marks a stdin/stdout handle - confirm against Close()
 */
explicit StdFile(std::FILE *fp, bool use_stdio)
: fp(fp), use_stdio(use_stdio) {
}
}
/*! \brief destructor, calls Close() on this object */
virtual ~StdFile(void) {
this->Close();
}
@@ -154,7 +156,7 @@ class StdFile : public dmlc::Stream {
std::fwrite(ptr, size, 1, fp);
}
/*!
 * \brief move the file position of fp to an absolute offset
 * \param pos offset measured from the start of the file (SEEK_SET);
 *        it is narrowed to long before being handed to std::fseek
 * \note the return value of std::fseek is not checked
 */
virtual void Seek(size_t pos) {
std::fseek(fp, static_cast<long>(pos), SEEK_SET);
std::fseek(fp, static_cast<long>(pos), SEEK_SET); // NOLINT(*)
}
virtual size_t Tell(void) {
return std::ftell(fp);
@@ -197,7 +199,7 @@ Stream *Stream::Create(const char *fname, const char * const mode, bool allow_nu
"to use hdfs, s3 or distributed version, compile with make dmlc=1";
utils::Check(strncmp(fname, "s3://", 5) != 0, msg);
utils::Check(strncmp(fname, "hdfs://", 7) != 0, msg);
std::FILE *fp = NULL;
bool use_stdio = false;
using namespace std;

View File

@@ -1,3 +1,4 @@
// Copyright 2014 by Contributors
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#define NOMINMAX
@@ -17,7 +18,7 @@ DataMatrix* LoadDataMatrix(const char *fname,
const char *cache_file) {
using namespace std;
std::string fname_ = fname;
const char *dlm = strchr(fname, '#');
if (dlm != NULL) {
utils::Check(strchr(dlm + 1, '#') == NULL,
@@ -29,7 +30,7 @@ DataMatrix* LoadDataMatrix(const char *fname,
cache_file = dlm +1;
}
if (cache_file == NULL) {
if (cache_file == NULL) {
if (!std::strcmp(fname, "stdin") ||
!std::strncmp(fname, "s3://", 5) ||
!std::strncmp(fname, "hdfs://", 7) ||
@@ -42,7 +43,7 @@ DataMatrix* LoadDataMatrix(const char *fname,
utils::FileStream fs(utils::FopenCheck(fname, "rb"));
utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
fs.Seek(0);
if (magic == DMatrixSimple::kMagic) {
if (magic == DMatrixSimple::kMagic) {
DMatrixSimple *dmat = new DMatrixSimple();
dmat->LoadBinary(fs, silent, fname);
fs.Close();
@@ -81,7 +82,7 @@ DataMatrix* LoadDataMatrix(const char *fname,
}
}
void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) {
void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) {
if (dmat.magic == DMatrixSimple::kMagic) {
const DMatrixSimple *p_dmat = static_cast<const DMatrixSimple*>(&dmat);
p_dmat->SaveBinary(fname, silent);

View File

@@ -22,7 +22,7 @@ namespace io {
/*! \brief page returned by libsvm parser */
struct LibSVMPage : public SparsePage {
std::vector<float> label;
// overload clear
// overload clear
inline void Clear() {
SparsePage::Clear();
label.clear();
@@ -35,7 +35,7 @@ struct LibSVMPage : public SparsePage {
*/
class LibSVMPageFactory {
public:
explicit LibSVMPageFactory()
LibSVMPageFactory()
: bytes_read_(0), at_head_(true) {
}
inline bool Init(void) {
@@ -85,7 +85,7 @@ class LibSVMPageFactory {
data->resize(nthread);
bytes_read_ += chunk.size;
utils::Assert(chunk.size != 0, "LibSVMParser.FileData");
char *head = reinterpret_cast<char*>(chunk.dptr);
char *head = reinterpret_cast<char*>(chunk.dptr);
#pragma omp parallel num_threads(nthread_)
{
// threadid
@@ -150,7 +150,7 @@ class LibSVMPageFactory {
}
return begin;
}
private:
// nthread
int nthread_;
@@ -199,12 +199,13 @@ class LibSVMParser : public utils::IIterator<LibSVMPage> {
/*! \return the value of bytes_read() reported by the factory obtained from itr */
inline size_t bytes_read(void) const {
return itr.get_factory().bytes_read();
}
private:
bool at_end_;
size_t data_ptr_;
std::vector<LibSVMPage> *data_;
utils::ThreadBuffer<std::vector<LibSVMPage>*, LibSVMPageFactory> itr;
};
};
} // namespace io
} // namespace xgboost

View File

@@ -1,11 +1,15 @@
#ifndef XGBOOST_IO_PAGE_DMATRIX_INL_HPP_
#define XGBOOST_IO_PAGE_DMATRIX_INL_HPP_
/*!
* Copyright (c) 2014 by Contributors
* \file page_dmatrix-inl.hpp
* row iterator based on sparse page
* \author Tianqi Chen
*/
#ifndef XGBOOST_IO_PAGE_DMATRIX_INL_HPP_
#define XGBOOST_IO_PAGE_DMATRIX_INL_HPP_
#include <vector>
#include <string>
#include <algorithm>
#include "../data.h"
#include "../utils/iterator.h"
#include "../utils/thread_buffer.h"
@@ -94,12 +98,12 @@ class DMatrixPageBase : public DataMatrix {
fbin.Close();
if (!silent) {
utils::Printf("DMatrixPage: %lux%lu is saved to %s\n",
static_cast<unsigned long>(mat.info.num_row()),
static_cast<unsigned long>(mat.info.num_col()), fname_);
static_cast<unsigned long>(mat.info.num_row()), // NOLINT(*)
static_cast<unsigned long>(mat.info.num_col()), fname_); // NOLINT(*)
}
}
/*! \brief load and initialize the iterator with fi */
inline void LoadBinary(utils::FileStream &fi,
inline void LoadBinary(utils::FileStream &fi, // NOLINT(*)
bool silent,
const char *fname_) {
this->set_cache_file(fname_);
@@ -114,8 +118,8 @@ class DMatrixPageBase : public DataMatrix {
iter_->Load(fs);
if (!silent) {
utils::Printf("DMatrixPage: %lux%lu matrix is loaded",
static_cast<unsigned long>(info.num_row()),
static_cast<unsigned long>(info.num_col()));
static_cast<unsigned long>(info.num_row()), // NOLINT(*)
static_cast<unsigned long>(info.num_col())); // NOLINT(*)
if (fname_ != NULL) {
utils::Printf(" from %s\n", fname_);
} else {
@@ -141,7 +145,7 @@ class DMatrixPageBase : public DataMatrix {
}
this->set_cache_file(cache_file);
std::string fname_row = std::string(cache_file) + ".row.blob";
utils::FileStream fo(utils::FopenCheck(fname_row.c_str(), "wb"));
utils::FileStream fo(utils::FopenCheck(fname_row.c_str(), "wb"));
SparsePage page;
size_t bytes_write = 0;
double tstart = rabit::utils::GetTime();
@@ -178,8 +182,8 @@ class DMatrixPageBase : public DataMatrix {
if (page.data.size() != 0) {
page.Save(&fo);
}
fo.Close();
iter_->Load(utils::FileStream(utils::FopenCheck(fname_row.c_str(), "rb")));
fo.Close();
iter_->Load(utils::FileStream(utils::FopenCheck(fname_row.c_str(), "rb")));
// save data matrix
utils::FileStream fs(utils::FopenCheck(cache_file, "wb"));
int tmagic = kMagic;
@@ -188,8 +192,8 @@ class DMatrixPageBase : public DataMatrix {
fs.Close();
if (!silent) {
utils::Printf("DMatrixPage: %lux%lu is parsed from %s\n",
static_cast<unsigned long>(info.num_row()),
static_cast<unsigned long>(info.num_col()),
static_cast<unsigned long>(info.num_row()), // NOLINT(*)
static_cast<unsigned long>(info.num_col()), // NOLINT(*)
uri);
}
}
@@ -241,12 +245,12 @@ class DMatrixHalfRAM : public DMatrixPageBase<0xffffab03> {
/*! \return the stored fmat_ pointer */
virtual IFMatrix *fmat(void) const {
return fmat_;
}
virtual void set_cache_file(const std::string &cache_file) {
virtual void set_cache_file(const std::string &cache_file) {
}
virtual void CheckMagic(int tmagic) {
utils::Check(tmagic == DMatrixPageBase<0xffffab02>::kMagic ||
tmagic == DMatrixPageBase<0xffffab03>::kMagic,
"invalid format,magic number mismatch");
"invalid format,magic number mismatch");
}
/*! \brief the real fmatrix */
IFMatrix *fmat_;

View File

@@ -1,10 +1,16 @@
#ifndef XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
#define XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
/*!
* Copyright (c) 2014 by Contributors
* \file page_fmatrix-inl.hpp
* col iterator based on sparse page
* \author Tianqi Chen
*/
#ifndef XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
#define XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
#include <vector>
#include <string>
#include <algorithm>
namespace xgboost {
namespace io {
/*! \brief thread buffer iterator */
@@ -42,9 +48,9 @@ class ThreadColPageIterator: public utils::IIterator<ColBatch> {
}
// set index set
/*!
 * \brief forward the index set to the factory returned by itr.get_factory()
 * \param fset index set, passed through unchanged
 * \param load_all flag, passed through unchanged
 */
inline void SetIndexSet(const std::vector<bst_uint> &fset, bool load_all) {
itr.get_factory().SetIndexSet(fset, load_all);
itr.get_factory().SetIndexSet(fset, load_all);
}
private:
// output data
ColBatch out_;
@@ -96,7 +102,7 @@ struct ColConvertFactory {
return true;
}
}
if (tmp_.Size() != 0){
if (tmp_.Size() != 0) {
this->MakeColPage(tmp_, BeginPtr(*buffered_rowset_) + btop,
*enabled_, val);
return true;
@@ -104,7 +110,7 @@ struct ColConvertFactory {
return false;
}
}
inline void Destroy(void) {}
inline void Destroy(void) {}
inline void BeforeFirst(void) {}
inline void MakeColPage(const SparsePage &prow,
const bst_uint *ridx,
@@ -115,7 +121,7 @@ struct ColConvertFactory {
#pragma omp parallel
{
nthread = omp_get_num_threads();
int max_nthread = std::max(omp_get_num_procs() / 2 - 4, 1);
int max_nthread = std::max(omp_get_num_procs() / 2 - 4, 1);
if (nthread > max_nthread) {
nthread = max_nthread;
}
@@ -130,10 +136,10 @@ struct ColConvertFactory {
int tid = omp_get_thread_num();
for (size_t j = prow.offset[i]; j < prow.offset[i+1]; ++j) {
const SparseBatch::Entry &e = prow.data[j];
if (enabled[e.index]) {
if (enabled[e.index]) {
builder.AddBudget(e.index, tid);
}
}
}
}
builder.InitStorage();
#pragma omp parallel for schedule(static) num_threads(nthread)
@@ -169,7 +175,7 @@ struct ColConvertFactory {
// buffered rowset
std::vector<bst_uint> *buffered_rowset_;
// enabled marks
const std::vector<bool> *enabled_;
const std::vector<bool> *enabled_;
// internal temp cache
SparsePage tmp_;
/*! \brief page size 256 M */
@@ -191,7 +197,7 @@ class FMatrixPage : public IFMatrix {
if (iter_ != NULL) delete iter_;
}
/*! \return whether column access is enabled */
virtual bool HaveColAccess(void) const {
virtual bool HaveColAccess(void) const {
return col_size_.size() != 0;
}
/*! \brief get number of columns */
@@ -212,7 +218,7 @@ class FMatrixPage : public IFMatrix {
size_t nmiss = num_buffered_row_ - (col_size_[cidx]);
return 1.0f - (static_cast<float>(nmiss)) / num_buffered_row_;
}
virtual void InitColAccess(const std::vector<bool> &enabled,
virtual void InitColAccess(const std::vector<bool> &enabled,
float pkeep, size_t max_row_perbatch) {
if (this->HaveColAccess()) return;
if (TryLoadColData()) return;
@@ -242,11 +248,11 @@ class FMatrixPage : public IFMatrix {
/*!
* \brief column based iterator
*/
virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
size_t ncol = this->NumCol();
col_index_.resize(0);
for (size_t i = 0; i < fset.size(); ++i) {
if (fset[i] < ncol) col_index_.push_back(fset[i]);
if (fset[i] < ncol) col_index_.push_back(fset[i]);
}
col_iter_.SetIndexSet(col_index_, false);
col_iter_.BeforeFirst();
@@ -255,13 +261,13 @@ class FMatrixPage : public IFMatrix {
// set the cache file name
/*!
 * \brief derive the column cache file names from a common prefix
 * \param cache_file prefix; ".col.blob" is appended for col_data_name_
 *        and ".col.meta" is appended for col_meta_name_
 */
inline void set_cache_file(const std::string &cache_file) {
col_data_name_ = std::string(cache_file) + ".col.blob";
col_meta_name_ = std::string(cache_file) + ".col.meta";
col_meta_name_ = std::string(cache_file) + ".col.meta";
}
protected:
inline bool TryLoadColData(void) {
std::FILE *fi = fopen64(col_meta_name_.c_str(), "rb");
if (fi == NULL) return false;
if (fi == NULL) return false;
utils::FileStream fs(fi);
LoadMeta(&fs);
fs.Close();
@@ -306,12 +312,12 @@ class FMatrixPage : public IFMatrix {
SparsePage *pcol;
while (citer.Next(pcol)) {
for (size_t i = 0; i < pcol->Size(); ++i) {
col_size_[i] += pcol->offset[i + 1] - pcol->offset[i];
col_size_[i] += pcol->offset[i + 1] - pcol->offset[i];
}
pcol->Save(&fo);
size_t spage = pcol->MemCostBytes();
bytes_write += spage;
double tnow = rabit::utils::GetTime();
double tnow = rabit::utils::GetTime();
double tdiff = tnow - tstart;
utils::Printf("Writting to %s in %g MB/s, %lu MB written current speed:%g MB/s\n",
col_data_name_.c_str(),

View File

@@ -1,13 +1,15 @@
#ifndef XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_
#define XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_
/*!
* Copyright 2014 by Contributors
* \file simple_dmatrix-inl.hpp
* \brief simple implementation of DMatrixS that can be used
* \brief simple implementation of DMatrixS that can be used
* the data format of xgboost is templatized, which means it can accept
* any data structure that implements the function defined by FMatrix
* this file is a specific implementation of input data structure that can be used by BoostLearner
* \author Tianqi Chen
*/
#ifndef XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_
#define XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_
#include <string>
#include <cstring>
#include <vector>
@@ -119,13 +121,13 @@ class DMatrixSimple : public DataMatrix {
for (size_t i = 0; i < batch.data.size(); ++i) {
info.info.num_col = std::max(info.info.num_col,
static_cast<size_t>(batch.data[i].index+1));
}
}
}
if (!silent) {
utils::Printf("%lux%lu matrix with %lu entries is loaded from %s\n",
static_cast<unsigned long>(info.num_row()),
static_cast<unsigned long>(info.num_col()),
static_cast<unsigned long>(row_data_.size()), uri);
static_cast<unsigned long>(info.num_row()), // NOLINT(*)
static_cast<unsigned long>(info.num_col()), // NOLINT(*)
static_cast<unsigned long>(row_data_.size()), uri); // NOLINT(*)
}
// try to load in additional file
if (!loadsplit) {
@@ -141,7 +143,7 @@ class DMatrixSimple : public DataMatrix {
"DMatrix: weight data does not match the number of rows in features");
}
std::string mname = name + ".base_margin";
if (info.TryLoadFloatInfo("base_margin", mname.c_str(), silent)) {
if (info.TryLoadFloatInfo("base_margin", mname.c_str(), silent)) {
}
}
}
@@ -165,10 +167,11 @@ class DMatrixSimple : public DataMatrix {
* \param silent whether print information during loading
* \param fname file name, used to print message
*/
inline void LoadBinary(utils::IStream &fs, bool silent = false, const char *fname = NULL) {
inline void LoadBinary(utils::IStream &fs, bool silent = false, const char *fname = NULL) { // NOLINT(*)
int tmagic;
utils::Check(fs.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
utils::Check(tmagic == kMagic, "\"%s\" invalid format, magic number mismatch", fname == NULL ? "" : fname);
utils::Check(tmagic == kMagic, "\"%s\" invalid format, magic number mismatch",
fname == NULL ? "" : fname);
info.LoadBinary(fs);
LoadBinary(fs, &row_ptr_, &row_data_);
@@ -176,9 +179,9 @@ class DMatrixSimple : public DataMatrix {
if (!silent) {
utils::Printf("%lux%lu matrix with %lu entries is loaded",
static_cast<unsigned long>(info.num_row()),
static_cast<unsigned long>(info.num_col()),
static_cast<unsigned long>(row_data_.size()));
static_cast<unsigned long>(info.num_row()), // NOLINT(*)
static_cast<unsigned long>(info.num_col()), // NOLINT(*)
static_cast<unsigned long>(row_data_.size())); // NOLINT(*)
if (fname != NULL) {
utils::Printf(" from %s\n", fname);
} else {
@@ -205,9 +208,9 @@ class DMatrixSimple : public DataMatrix {
if (!silent) {
utils::Printf("%lux%lu matrix with %lu entries is saved to %s\n",
static_cast<unsigned long>(info.num_row()),
static_cast<unsigned long>(info.num_col()),
static_cast<unsigned long>(row_data_.size()), fname);
static_cast<unsigned long>(info.num_row()), // NOLINT(*)
static_cast<unsigned long>(info.num_col()), // NOLINT(*)
static_cast<unsigned long>(row_data_.size()), fname); // NOLINT(*)
if (info.group_ptr.size() != 0) {
utils::Printf("data contains %u groups\n",
static_cast<unsigned>(info.group_ptr.size()-1));
@@ -256,7 +259,7 @@ class DMatrixSimple : public DataMatrix {
* \param ptr pointer data
* \param data data content
*/
inline static void SaveBinary(utils::IStream &fo,
inline static void SaveBinary(utils::IStream &fo, // NOLINT(*)
const std::vector<size_t> &ptr,
const std::vector<RowBatch::Entry> &data) {
size_t nrow = ptr.size() - 1;
@@ -272,7 +275,7 @@ class DMatrixSimple : public DataMatrix {
* \param out_ptr pointer data
* \param out_data data content
*/
inline static void LoadBinary(utils::IStream &fi,
inline static void LoadBinary(utils::IStream &fi, // NOLINT(*)
std::vector<size_t> *out_ptr,
std::vector<RowBatch::Entry> *out_data) {
size_t nrow;
@@ -314,7 +317,7 @@ class DMatrixSimple : public DataMatrix {
DMatrixSimple *parent_;
// temporary space for batch
RowBatch batch_;
};
};
};
} // namespace io
} // namespace xgboost

View File

@@ -1,11 +1,15 @@
#ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_
#define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_
/*!
* Copyright 2014 by Contributors
* \file simple_fmatrix-inl.hpp
* \brief the input data structure for gradient boosting
* \author Tianqi Chen
*/
#ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_
#define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_
#include <limits>
#include <algorithm>
#include <vector>
#include "../data.h"
#include "../utils/utils.h"
#include "../utils/random.h"
@@ -30,7 +34,7 @@ class FMatrixS : public IFMatrix {
}
// destructor
virtual ~FMatrixS(void) {
if (iter_ != NULL) delete iter_;
if (iter_ != NULL) delete iter_;
}
/*! \return whether column access is enabled */
virtual bool HaveColAccess(void) const {
@@ -54,7 +58,7 @@ class FMatrixS : public IFMatrix {
size_t nmiss = buffered_rowset_.size() - col_size_[cidx];
return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
}
virtual void InitColAccess(const std::vector<bool> &enabled,
virtual void InitColAccess(const std::vector<bool> &enabled,
float pkeep, size_t max_row_perbatch) {
if (this->HaveColAccess()) return;
this->InitColData(enabled, pkeep, max_row_perbatch);
@@ -85,7 +89,7 @@ class FMatrixS : public IFMatrix {
size_t ncol = this->NumCol();
col_iter_.col_index_.resize(0);
for (size_t i = 0; i < fset.size(); ++i) {
if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]);
if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]);
}
col_iter_.BeforeFirst();
return &col_iter_;
@@ -94,7 +98,7 @@ class FMatrixS : public IFMatrix {
* \brief save column access data into stream
* \param fo output stream to save to
*/
inline void SaveColAccess(utils::IStream &fo) const {
inline void SaveColAccess(utils::IStream &fo) const { // NOLINT(*)
size_t n = 0;
fo.Write(&n, sizeof(n));
}
@@ -102,10 +106,10 @@ class FMatrixS : public IFMatrix {
* \brief load column access data from stream
* \param fi input stream to load from
*/
inline void LoadColAccess(utils::IStream &fi) {
inline void LoadColAccess(utils::IStream &fi) { // NOLINT(*)
// do nothing in load col access
}
protected:
/*!
* \brief initialize column data
@@ -129,7 +133,7 @@ class FMatrixS : public IFMatrix {
for (size_t i = 0; i < col_iter_.cpages_.size(); ++i) {
SparsePage *pcol = col_iter_.cpages_[i];
for (size_t j = 0; j < pcol->Size(); ++j) {
col_size_[j] += pcol->offset[j + 1] - pcol->offset[j];
col_size_[j] += pcol->offset[j + 1] - pcol->offset[j];
}
}
}
@@ -139,7 +143,7 @@ class FMatrixS : public IFMatrix {
* \param pcol the target column
*/
inline void MakeOneBatch(const std::vector<bool> &enabled,
float pkeep,
float pkeep,
SparsePage *pcol) {
// clear rowset
buffered_rowset_.clear();
@@ -159,8 +163,8 @@ class FMatrixS : public IFMatrix {
while (iter_->Next()) {
const RowBatch &batch = iter_->Value();
bmap.resize(bmap.size() + batch.size, true);
long batch_size = static_cast<long>(batch.size);
for (long i = 0; i < batch_size; ++i) {
long batch_size = static_cast<long>(batch.size); // NOLINT(*)
for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
buffered_rowset_.push_back(ridx);
@@ -169,13 +173,13 @@ class FMatrixS : public IFMatrix {
}
}
#pragma omp parallel for schedule(static)
for (long i = 0; i < batch_size; ++i) {
for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
int tid = omp_get_thread_num();
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
if (bmap[ridx]) {
RowBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) {
if (enabled[inst[j].index]){
if (enabled[inst[j].index]) {
builder.AddBudget(inst[j].index, tid);
}
}
@@ -183,18 +187,18 @@ class FMatrixS : public IFMatrix {
}
}
builder.InitStorage();
iter_->BeforeFirst();
while (iter_->Next()) {
const RowBatch &batch = iter_->Value();
#pragma omp parallel for schedule(static)
for (long i = 0; i < static_cast<long>(batch.size); ++i) {
for (long i = 0; i < static_cast<long>(batch.size); ++i) { // NOLINT(*)
int tid = omp_get_thread_num();
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
if (bmap[ridx]) {
RowBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) {
if (enabled[inst[j].index]) {
if (enabled[inst[j].index]) {
builder.Push(inst[j].index,
Entry((bst_uint)(batch.base_rowid+i),
inst[j].fvalue), tid);
@@ -261,7 +265,7 @@ class FMatrixS : public IFMatrix {
#pragma omp parallel
{
nthread = omp_get_num_threads();
int max_nthread = std::max(omp_get_num_procs() / 2 - 2, 1);
int max_nthread = std::max(omp_get_num_procs() / 2 - 2, 1);
if (nthread > max_nthread) {
nthread = max_nthread;
}
@@ -277,7 +281,7 @@ class FMatrixS : public IFMatrix {
RowBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) {
const SparseBatch::Entry &e = inst[j];
if (enabled[e.index]) {
if (enabled[e.index]) {
builder.AddBudget(e.index, tid);
}
}
@@ -330,10 +334,10 @@ class FMatrixS : public IFMatrix {
static_cast<bst_uint>(pcol->offset[ridx + 1] - pcol->offset[ridx]));
}
batch_.col_index = BeginPtr(col_index_);
batch_.col_data = BeginPtr(col_data_);
batch_.col_data = BeginPtr(col_data_);
return true;
}
virtual const ColBatch &Value(void) const {
virtual const ColBatch &Value(void) const {
return batch_;
}
inline void Clear(void) {
@@ -347,7 +351,7 @@ class FMatrixS : public IFMatrix {
// column content
std::vector<ColBatch::Inst> col_data_;
// column sparse pages
std::vector<SparsePage*> cpages_;
std::vector<SparsePage*> cpages_;
// data pointer
size_t data_ptr_;
// temporary space for batch
@@ -357,7 +361,7 @@ class FMatrixS : public IFMatrix {
// column iterator
ColBatchIter col_iter_;
// shared meta info with DMatrix
const learner::MetaInfo &info_;
const learner::MetaInfo &info_;
// row iterator
utils::IIterator<RowBatch> *iter_;
/*! \brief list of row index that are buffered */
@@ -367,4 +371,4 @@ class FMatrixS : public IFMatrix {
};
} // namespace io
} // namespace xgboost
#endif // XGBOOST_IO_SLICE_FMATRIX_INL_HPP
#endif // XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_

View File

@@ -1,18 +1,22 @@
#ifndef XGBOOST_IO_SPARSE_BATCH_PAGE_H_
#define XGBOOST_IO_SPARSE_BATCH_PAGE_H_
/*!
* Copyright (c) 2014 by Contributors
* \file sparse_batch_page.h
* content holder of sparse batch that can be saved to disk
* the representation can be used effectively
* in external memory computation
* \author Tianqi Chen
*/
#ifndef XGBOOST_IO_SPARSE_BATCH_PAGE_H_
#define XGBOOST_IO_SPARSE_BATCH_PAGE_H_
#include <vector>
#include <algorithm>
#include "../data.h"
namespace xgboost {
namespace io {
/*!
* \brief storage unit of sparse batch
* \brief storage unit of sparse batch
*/
class SparsePage {
public:
@@ -96,7 +100,7 @@ class SparsePage {
}
/*!
* \brief save the data to fo, when a page was written
* to disk it must contain all the elements in the
* to disk it must contain all the elements in the
* \param fo output stream
*/
inline void Save(utils::IStream *fo) const {
@@ -124,7 +128,7 @@ class SparsePage {
*/
inline bool PushLoad(utils::IStream *fi) {
if (!fi->Read(&disk_offset_)) return false;
data.resize(offset.back() + disk_offset_.back());
data.resize(offset.back() + disk_offset_.back());
if (disk_offset_.back() != 0) {
utils::Check(fi->Read(BeginPtr(data) + offset.back(),
disk_offset_.back() * sizeof(SparseBatch::Entry)) != 0,
@@ -138,7 +142,7 @@ class SparsePage {
}
return true;
}
/*!
/*!
* \brief Push row batch into the page
* \param batch the row batch
*/
@@ -154,7 +158,7 @@ class SparsePage {
offset[i + begin] = top + batch.ind_ptr[i + 1] - batch.ind_ptr[0];
}
}
/*!
/*!
* \brief Push a sparse page
* \param batch the row page
*/
@@ -170,7 +174,7 @@ class SparsePage {
offset[i + begin] = top + batch.offset[i + 1];
}
}
/*!
/*!
* \brief Push one instance into page
* \param row an instance row
*/
@@ -202,7 +206,7 @@ class SparsePage {
};
/*!
* \brief factory class for SparsePage,
* used in threadbuffer template
* used in threadbuffer template
*/
class SparsePageFactory {
public:
@@ -217,7 +221,7 @@ class SparsePageFactory {
return action_index_set_;
}
// set index set, will be used after next before first
inline void SetIndexSet(const std::vector<bst_uint> &index_set,
inline void SetIndexSet(const std::vector<bst_uint> &index_set,
bool load_all) {
set_load_all_ = load_all;
if (!set_load_all_) {
@@ -229,7 +233,7 @@ class SparsePageFactory {
return true;
}
// no parameter is recognised here: the body is empty, so name and val are ignored
inline void SetParam(const char *name, const char *val) {}
inline bool LoadNext(SparsePage *val) {
inline bool LoadNext(SparsePage *val) {
if (!action_load_all_) {
if (action_index_set_.size() == 0) {
return false;