[REFACTOR] cleanup structure
This commit is contained in:
229
old_src/io/dmlc_simple.cpp
Normal file
229
old_src/io/dmlc_simple.cpp
Normal file
@@ -0,0 +1,229 @@
|
||||
// Copyright by Contributors
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define _CRT_SECURE_NO_DEPRECATE
|
||||
#define NOMINMAX
|
||||
#include <string>
|
||||
#include "../utils/io.h"
|
||||
|
||||
// implements a single no split version of DMLC
|
||||
// in case we want to avoid dependency on dmlc-core
|
||||
|
||||
namespace xgboost {
|
||||
namespace utils {
|
||||
/*!
 * \brief line split implementation from single FILE
 * simply returns lines of files, used for stdin
 */
class SingleFileSplit : public dmlc::InputSplit {
 public:
  // Open fname in binary read mode; the special name "stdin" binds to the
  // process stdin (only when not compiled in strict C++98 mode).
  explicit SingleFileSplit(const char *fname)
      : use_stdin_(false),
        chunk_begin_(NULL), chunk_end_(NULL) {
    if (!std::strcmp(fname, "stdin")) {
#ifndef XGBOOST_STRICT_CXX98_
      use_stdin_ = true; fp_ = stdin;
#endif
    }
    if (!use_stdin_) {
      fp_ = utils::FopenCheck(fname, "rb");
    }
    buffer_.resize(kBufferSize);
  }
  virtual ~SingleFileSplit(void) {
    // stdin is borrowed, not owned; only close handles we opened
    if (!use_stdin_) std::fclose(fp_);
  }
  // Raw read of up to size bytes; returns the number of bytes read.
  virtual size_t Read(void *ptr, size_t size) {
    return std::fread(ptr, 1, size, fp_);
  }
  // InputSplit is read-only; any write is an error.
  virtual void Write(const void *ptr, size_t size) {
    utils::Error("cannot do write in inputsplit");
  }
  virtual void BeforeFirst(void) {
    // NOTE(review): fseek on stdin fails silently; rewinding is only
    // meaningful for regular files — confirm callers never rewind stdin.
    std::fseek(fp_, 0, SEEK_SET);
  }
  // Return the next newline-delimited record within the current chunk,
  // loading a fresh chunk when the current one is exhausted.
  virtual bool NextRecord(Blob *out_rec) {
    if (chunk_begin_ == chunk_end_) {
      if (!LoadChunk()) return false;
    }
    char *next = FindNextRecord(chunk_begin_,
                                chunk_end_);
    out_rec->dptr = chunk_begin_;
    out_rec->size = next - chunk_begin_;
    chunk_begin_ = next;
    return true;
  }
  // Hand out the remainder of the current chunk in one piece.
  virtual bool NextChunk(Blob *out_chunk) {
    if (chunk_begin_ == chunk_end_) {
      if (!LoadChunk()) return false;
    }
    out_chunk->dptr = chunk_begin_;
    out_chunk->size = chunk_end_ - chunk_begin_;
    chunk_begin_ = chunk_end_;
    return true;
  }
  // Fill buf with at most *size bytes, cutting at the last complete record
  // boundary; the tail after that boundary is stashed in overflow_ and
  // prepended to the next call.  Returns false at end of input.
  // *size == 0 on return signals the caller's buffer is too small.
  inline bool ReadChunk(void *buf, size_t *size) {
    size_t max_size = *size;
    if (max_size <= overflow_.length()) {
      *size = 0; return true;
    }
    if (overflow_.length() != 0) {
      std::memcpy(buf, BeginPtr(overflow_), overflow_.length());
    }
    size_t olen = overflow_.length();
    overflow_.resize(0);
    size_t nread = this->Read(reinterpret_cast<char*>(buf) + olen,
                              max_size - olen);
    nread += olen;
    if (nread == 0) return false;
    if (nread != max_size) {
      // short read: end of file, everything read is complete
      *size = nread;
      return true;
    } else {
      const char *bptr = reinterpret_cast<const char*>(buf);
      // return the last position where a record starts
      const char *bend = this->FindLastRecordBegin(bptr, bptr + max_size);
      *size = bend - bptr;
      overflow_.resize(max_size - *size);
      if (overflow_.length() != 0) {
        std::memcpy(BeginPtr(overflow_), bend, overflow_.length());
      }
      return true;
    }
  }

 protected:
  // Scan backward from end for the start of the last (possibly partial)
  // record; returns begin when no newline exists in the range.
  inline const char* FindLastRecordBegin(const char *begin,
                                         const char *end) {
    if (begin == end) return begin;
    for (const char *p = end - 1; p != begin; --p) {
      if (*p == '\n' || *p == '\r') return p + 1;
    }
    return begin;
  }
  // Advance past the current record and any run of newline characters,
  // returning the start of the next record (or end).
  inline char* FindNextRecord(char *begin, char *end) {
    char *p;
    for (p = begin; p != end; ++p) {
      if (*p == '\n' || *p == '\r') break;
    }
    for (; p != end; ++p) {
      if (*p != '\n' && *p != '\r') return p;
    }
    return end;
  }
  // Refill buffer_ with the next chunk; doubles the buffer when it is too
  // small to hold even one record.  Returns false at end of input.
  inline bool LoadChunk(void) {
    while (true) {
      size_t size = buffer_.length();
      if (!ReadChunk(BeginPtr(buffer_), &size)) return false;
      if (size == 0) {
        buffer_.resize(buffer_.length() * 2);
      } else {
        chunk_begin_ = reinterpret_cast<char *>(BeginPtr(buffer_));
        chunk_end_ = chunk_begin_ + size;
        break;
      }
    }
    return true;
  }

 private:
  // buffer size
  static const size_t kBufferSize = 1 << 18UL;
  // file
  std::FILE *fp_;
  // whether fp_ is the borrowed stdin handle
  bool use_stdin_;
  // internal overflow: bytes past the last record boundary of a full read
  std::string overflow_;
  // internal buffer
  std::string buffer_;
  // beginning of chunk
  char *chunk_begin_;
  // end of chunk
  char *chunk_end_;
};
|
||||
|
||||
class StdFile : public dmlc::Stream {
|
||||
public:
|
||||
explicit StdFile(std::FILE *fp, bool use_stdio)
|
||||
: fp(fp), use_stdio(use_stdio) {
|
||||
}
|
||||
virtual ~StdFile(void) {
|
||||
this->Close();
|
||||
}
|
||||
virtual size_t Read(void *ptr, size_t size) {
|
||||
return std::fread(ptr, 1, size, fp);
|
||||
}
|
||||
virtual void Write(const void *ptr, size_t size) {
|
||||
Check(std::fwrite(ptr, size, 1, fp) == 1, "StdFile::Write: fwrite error!");
|
||||
}
|
||||
virtual void Seek(size_t pos) {
|
||||
std::fseek(fp, static_cast<long>(pos), SEEK_SET); // NOLINT(*)
|
||||
}
|
||||
virtual size_t Tell(void) {
|
||||
return std::ftell(fp);
|
||||
}
|
||||
virtual bool AtEnd(void) const {
|
||||
return std::feof(fp) != 0;
|
||||
}
|
||||
inline void Close(void) {
|
||||
if (fp != NULL && !use_stdio) {
|
||||
std::fclose(fp); fp = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
std::FILE *fp;
|
||||
bool use_stdio;
|
||||
};
|
||||
} // namespace utils
|
||||
} // namespace xgboost
|
||||
|
||||
namespace dmlc {
|
||||
// Factory for the dependency-free local build: only a single-file local
// split is supported, so remote URIs and multi-way splits are rejected.
InputSplit* InputSplit::Create(const char *uri,
                               unsigned part,
                               unsigned nsplit,
                               const char *type) {
  using namespace std;
  using namespace xgboost;
  const char *msg = "xgboost is compiled in local mode\n"\
      "to use hdfs, s3 or distributed version, compile with make dmlc=1";
  const bool is_s3 = strncmp(uri, "s3://", 5) == 0;
  const bool is_hdfs = strncmp(uri, "hdfs://", 7) == 0;
  utils::Check(!is_s3, msg);
  utils::Check(!is_hdfs, msg);
  utils::Check(nsplit == 1, msg);
  return new utils::SingleFileSplit(uri);
}
|
||||
|
||||
// Open a stream over a local file, or over stdin/stdout when fname names
// them.  Remote URIs are rejected in this dependency-free build.
// Returns NULL only when the open fails and allow_null is set; otherwise
// a failed open aborts via utils::Check.
Stream *Stream::Create(const char *fname, const char * const mode, bool allow_null) {
  using namespace std;
  using namespace xgboost;
  const char *msg = "xgboost is compiled in local mode\n"\
      "to use hdfs, s3 or distributed version, compile with make dmlc=1";
  utils::Check(strncmp(fname, "s3://", 5) != 0, msg);
  utils::Check(strncmp(fname, "hdfs://", 7) != 0, msg);

  std::FILE *fp = NULL;
  bool use_stdio = false;
  // (removed a duplicate `using namespace std;` that shadowed the one above)
#ifndef XGBOOST_STRICT_CXX98_
  if (!strcmp(fname, "stdin")) {
    use_stdio = true; fp = stdin;
  }
  if (!strcmp(fname, "stdout")) {
    use_stdio = true; fp = stdout;
  }
#endif
  // strip the explicit local-file scheme prefix
  if (!strncmp(fname, "file://", 7)) fname += 7;
  if (!use_stdio) {
    std::string flag = mode;
    // force binary mode so behavior matches across platforms (Windows
    // would otherwise translate line endings)
    if (flag == "w") flag = "wb";
    if (flag == "r") flag = "rb";
    fp = fopen64(fname, flag.c_str());
  }
  if (fp != NULL) {
    return new utils::StdFile(fp, use_stdio);
  } else {
    utils::Check(allow_null, "fail to open file %s", fname);
    return NULL;
  }
}
|
||||
} // namespace dmlc
|
||||
|
||||
97
old_src/io/io.cpp
Normal file
97
old_src/io/io.cpp
Normal file
@@ -0,0 +1,97 @@
|
||||
// Copyright 2014 by Contributors
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define _CRT_SECURE_NO_DEPRECATE
|
||||
#define NOMINMAX
|
||||
#include <string>
|
||||
#include "./io.h"
|
||||
#include "../utils/io.h"
|
||||
#include "../utils/utils.h"
|
||||
#include "simple_dmatrix-inl.hpp"
|
||||
#include "page_dmatrix-inl.hpp"
|
||||
|
||||
namespace xgboost {
|
||||
namespace io {
|
||||
// Load a DataMatrix from fname (full parameter contract in io.h).
// Chooses between the in-memory DMatrixSimple and the external-memory
// DMatrixPage/DMatrixHalfRAM variants based on cache_file and the input
// format detected from the file's magic number.
DataMatrix* LoadDataMatrix(const char *fname,
                           bool silent,
                           bool savebuffer,
                           bool loadsplit,
                           const char *cache_file) {
  using namespace std;
  std::string fname_ = fname;

  // "path#cachefile" syntax: split the uri into data path and cache file
  const char *dlm = strchr(fname, '#');
  if (dlm != NULL) {
    utils::Check(strchr(dlm + 1, '#') == NULL,
                 "only one `#` is allowed in file path for cachefile specification");
    utils::Check(cache_file == NULL,
                 "can only specify the cachefile with `#` or argument, not both");
    fname_ = std::string(fname, dlm - fname);
    fname = fname_.c_str();
    cache_file = dlm + 1;
  }

  if (cache_file == NULL) {
    // in-memory path: stdin/remote/split sources must be parsed as text
    if (!std::strcmp(fname, "stdin") ||
        !std::strncmp(fname, "s3://", 5) ||
        !std::strncmp(fname, "hdfs://", 7) ||
        loadsplit) {
      DMatrixSimple *dmat = new DMatrixSimple();
      dmat->LoadText(fname, silent, loadsplit);
      return dmat;
    }
    // peek at the leading magic number to detect the binary format,
    // then rewind so LoadBinary re-reads it
    int magic;
    utils::FileStream fs(utils::FopenCheck(fname, "rb"));
    utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
    fs.Seek(0);
    if (magic == DMatrixSimple::kMagic) {
      DMatrixSimple *dmat = new DMatrixSimple();
      dmat->LoadBinary(fs, silent, fname);
      fs.Close();
      return dmat;
    }
    fs.Close();
    // not a known binary format: parse as text, optionally writing a
    // binary buffer file for faster reloads
    DMatrixSimple *dmat = new DMatrixSimple();
    dmat->CacheLoad(fname, silent, savebuffer);
    return dmat;
  } else {
    // external-memory path: in split mode each worker gets its own
    // rank-suffixed cache file
    std::string cache_fname = cache_file;
    if (loadsplit) {
      std::ostringstream os;
      os << cache_file << ".r" << rabit::GetRank();
      cache_fname = os.str();
      cache_file = cache_fname.c_str();
    }
    FILE *fi = fopen64(cache_file, "rb");
    if (fi != NULL) {
      // cache already built on a previous run: reuse it
      DMatrixPage *dmat = new DMatrixPage();
      utils::FileStream fs(fi);
      dmat->LoadBinary(fs, silent, cache_file);
      fs.Close();
      return dmat;
    } else {
      // a leading '!' on the data path selects the half-RAM variant
      if (fname[0] == '!') {
        DMatrixHalfRAM *dmat = new DMatrixHalfRAM();
        dmat->LoadText(fname + 1, cache_file, false, loadsplit);
        return dmat;
      } else {
        DMatrixPage *dmat = new DMatrixPage();
        dmat->LoadText(fname, cache_file, false, loadsplit);
        return dmat;
      }
    }
  }
}
|
||||
|
||||
void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) {
|
||||
if (dmat.magic == DMatrixSimple::kMagic) {
|
||||
const DMatrixSimple *p_dmat = static_cast<const DMatrixSimple*>(&dmat);
|
||||
p_dmat->SaveBinary(fname, silent);
|
||||
} else {
|
||||
DMatrixSimple smat;
|
||||
smat.CopyFrom(dmat);
|
||||
smat.SaveBinary(fname, silent);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace io
|
||||
} // namespace xgboost
|
||||
47
old_src/io/io.h
Normal file
47
old_src/io/io.h
Normal file
@@ -0,0 +1,47 @@
|
||||
/*!
|
||||
* Copyright 2014 by Contributors
|
||||
* \file io.h
|
||||
* \brief handles input data format of xgboost
|
||||
* I/O module handles a specific DMatrix format
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#ifndef XGBOOST_IO_IO_H_
|
||||
#define XGBOOST_IO_IO_H_
|
||||
|
||||
#include "../data.h"
|
||||
#include "../learner/dmatrix.h"
|
||||
|
||||
namespace xgboost {
|
||||
/*! \brief namespace related to data format */
|
||||
namespace io {
|
||||
/*! \brief DMatrix object that I/O module support save/load */
|
||||
typedef learner::DMatrix DataMatrix;
|
||||
/*!
|
||||
* \brief load DataMatrix from stream
|
||||
* \param fname file name to be loaded
|
||||
* \param silent whether print message during loading
|
||||
* \param savebuffer whether temporal buffer the file if the file is in text format
|
||||
* \param loadsplit whether we only load a split of input files
|
||||
* such that each worker node get a split of the data
|
||||
* \param cache_file name of cache_file, used by external memory version
|
||||
* can be NULL, if cache_file is specified, this will be the temporal
|
||||
* space that can be re-used to store intermediate data
|
||||
* \return a loaded DMatrix
|
||||
*/
|
||||
DataMatrix* LoadDataMatrix(const char *fname,
|
||||
bool silent,
|
||||
bool savebuffer,
|
||||
bool loadsplit,
|
||||
const char *cache_file = NULL);
|
||||
/*!
|
||||
* \brief save DataMatrix into stream,
|
||||
* note: the saved dmatrix format may not be in exactly same as input
|
||||
* SaveDMatrix will choose the best way to materialize the dmatrix.
|
||||
* \param dmat the dmatrix to be saved
|
||||
* \param fname file name to be saved
|
||||
* \param silent whether print message during saving
|
||||
*/
|
||||
void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent = false);
|
||||
} // namespace io
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_IO_IO_H_
|
||||
212
old_src/io/libsvm_parser.h
Normal file
212
old_src/io/libsvm_parser.h
Normal file
@@ -0,0 +1,212 @@
|
||||
/*!
|
||||
* Copyright (c) 2015 by Contributors
|
||||
* \file libsvm_parser.h
|
||||
* \brief iterator parser to parse libsvm format
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#ifndef XGBOOST_IO_LIBSVM_PARSER_H_
|
||||
#define XGBOOST_IO_LIBSVM_PARSER_H_
|
||||
#define NOMINMAX
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
#include <cctype>
|
||||
#include <algorithm>
|
||||
#include "../utils/omp.h"
|
||||
#include "../utils/utils.h"
|
||||
#include "../sync/sync.h"
|
||||
#include "../utils/thread_buffer.h"
|
||||
#include "./sparse_batch_page.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace io {
|
||||
/*! \brief page returned by libsvm parser */
struct LibSVMPage : public SparsePage {
  // one label per row in this page, parallel to the sparse row offsets
  std::vector<float> label;
  // overload clear: reset both the sparse rows and the labels
  inline void Clear() {
    SparsePage::Clear();
    label.clear();
  }
};
|
||||
/*!
|
||||
* \brief libsvm parser that parses the input lines
|
||||
* and returns rows in input data
|
||||
* factory that was used by threadbuffer template
|
||||
*/
|
||||
class LibSVMPageFactory {
|
||||
public:
|
||||
LibSVMPageFactory()
|
||||
: bytes_read_(0), at_head_(true) {
|
||||
}
|
||||
inline bool Init(void) {
|
||||
return true;
|
||||
}
|
||||
inline void Setup(dmlc::InputSplit *source,
|
||||
int nthread) {
|
||||
source_ = source;
|
||||
int maxthread;
|
||||
#pragma omp parallel
|
||||
{
|
||||
maxthread = omp_get_num_procs();
|
||||
}
|
||||
maxthread = std::max(maxthread / 2, 1);
|
||||
nthread_ = std::min(maxthread, nthread);
|
||||
}
|
||||
inline void SetParam(const char *name, const char *val) {}
|
||||
inline bool LoadNext(std::vector<LibSVMPage> *data) {
|
||||
return FillData(data);
|
||||
}
|
||||
inline void FreeSpace(std::vector<LibSVMPage> *a) {
|
||||
delete a;
|
||||
}
|
||||
inline std::vector<LibSVMPage> *Create(void) {
|
||||
return new std::vector<LibSVMPage>();
|
||||
}
|
||||
inline void BeforeFirst(void) {
|
||||
utils::Assert(at_head_, "cannot call beforefirst");
|
||||
}
|
||||
inline void Destroy(void) {
|
||||
delete source_;
|
||||
}
|
||||
inline size_t bytes_read(void) const {
|
||||
return bytes_read_;
|
||||
}
|
||||
|
||||
protected:
|
||||
inline bool FillData(std::vector<LibSVMPage> *data) {
|
||||
dmlc::InputSplit::Blob chunk;
|
||||
if (!source_->NextChunk(&chunk)) return false;
|
||||
int nthread;
|
||||
#pragma omp parallel num_threads(nthread_)
|
||||
{
|
||||
nthread = omp_get_num_threads();
|
||||
}
|
||||
// reserve space for data
|
||||
data->resize(nthread);
|
||||
bytes_read_ += chunk.size;
|
||||
utils::Assert(chunk.size != 0, "LibSVMParser.FileData");
|
||||
char *head = reinterpret_cast<char*>(chunk.dptr);
|
||||
#pragma omp parallel num_threads(nthread_)
|
||||
{
|
||||
// threadid
|
||||
int tid = omp_get_thread_num();
|
||||
size_t nstep = (chunk.size + nthread - 1) / nthread;
|
||||
size_t sbegin = std::min(tid * nstep, chunk.size);
|
||||
size_t send = std::min((tid + 1) * nstep, chunk.size);
|
||||
char *pbegin = BackFindEndLine(head + sbegin, head);
|
||||
char *pend;
|
||||
if (tid + 1 == nthread) {
|
||||
pend = head + send;
|
||||
} else {
|
||||
pend = BackFindEndLine(head + send, head);
|
||||
}
|
||||
ParseBlock(pbegin, pend, &(*data)[tid]);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
/*!
|
||||
* \brief parse data into out
|
||||
* \param begin beginning of buffer
|
||||
* \param end end of buffer
|
||||
*/
|
||||
inline void ParseBlock(char *begin,
|
||||
char *end,
|
||||
LibSVMPage *out) {
|
||||
using namespace std;
|
||||
out->Clear();
|
||||
char *p = begin;
|
||||
while (p != end) {
|
||||
while (isspace(*p) && p != end) ++p;
|
||||
if (p == end) break;
|
||||
char *head = p;
|
||||
while (isdigit(*p) && p != end) ++p;
|
||||
if (*p == ':') {
|
||||
out->data.push_back(SparseBatch::Entry(atol(head),
|
||||
static_cast<bst_float>(atof(p + 1))));
|
||||
} else {
|
||||
if (out->label.size() != 0) {
|
||||
out->offset.push_back(out->data.size());
|
||||
}
|
||||
out->label.push_back(static_cast<float>(atof(head)));
|
||||
}
|
||||
while (!isspace(*p) && p != end) ++p;
|
||||
}
|
||||
if (out->label.size() != 0) {
|
||||
out->offset.push_back(out->data.size());
|
||||
}
|
||||
utils::Check(out->label.size() + 1 == out->offset.size(),
|
||||
"LibSVMParser inconsistent");
|
||||
}
|
||||
/*!
|
||||
* \brief start from bptr, go backward and find first endof line
|
||||
* \param bptr end position to go backward
|
||||
* \param begin the beginning position of buffer
|
||||
* \return position of first endof line going backward
|
||||
*/
|
||||
inline char* BackFindEndLine(char *bptr,
|
||||
char *begin) {
|
||||
for (; bptr != begin; --bptr) {
|
||||
if (*bptr == '\n' || *bptr == '\r') return bptr;
|
||||
}
|
||||
return begin;
|
||||
}
|
||||
|
||||
private:
|
||||
// nthread
|
||||
int nthread_;
|
||||
// number of bytes readed
|
||||
size_t bytes_read_;
|
||||
// at beginning, at end of stream
|
||||
bool at_head_;
|
||||
// source split that provides the data
|
||||
dmlc::InputSplit *source_;
|
||||
};
|
||||
|
||||
// Iterator adapter that exposes the threaded LibSVMPageFactory as a
// simple IIterator over non-empty parsed pages.
class LibSVMParser : public utils::IIterator<LibSVMPage> {
 public:
  explicit LibSVMParser(dmlc::InputSplit *source,
                        int nthread)
      : at_end_(false), data_ptr_(0), data_(NULL) {
    itr.SetParam("buffer_size", "2");
    itr.get_factory().Setup(source, nthread);
    itr.Init();
  }
  virtual void BeforeFirst(void) {
    itr.BeforeFirst();
  }
  // Advance to the next non-empty page; sets at_end_ once the underlying
  // factory is exhausted.
  virtual bool Next(void) {
    if (at_end_) return false;
    while (true) {
      // current batch used up (or none yet): fetch the next one
      if (data_ == NULL || data_ptr_ >= data_->size()) {
        if (!itr.Next(data_)) {
          at_end_ = true; return false;
        } else {
          data_ptr_ = 0;
        }
      }
      // skip pages that came back empty from idle worker threads
      while (data_ptr_ < data_->size()) {
        data_ptr_ += 1;
        if ((*data_)[data_ptr_ - 1].Size() != 0) {
          return true;
        }
      }
    }
    return true;  // unreachable: the loop above always returns
  }
  // Valid only after Next() returned true.
  virtual const LibSVMPage &Value(void) const {
    return (*data_)[data_ptr_ - 1];
  }
  /*! \brief number of bytes consumed from the input so far */
  inline size_t bytes_read(void) const {
    return itr.get_factory().bytes_read();
  }

 private:
  // whether the source is exhausted
  bool at_end_;
  // cursor into *data_; points one PAST the current page
  size_t data_ptr_;
  // current batch of pages, owned by the thread buffer
  std::vector<LibSVMPage> *data_;
  utils::ThreadBuffer<std::vector<LibSVMPage>*, LibSVMPageFactory> itr;
};
|
||||
|
||||
} // namespace io
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_IO_LIBSVM_PARSER_H_
|
||||
260
old_src/io/page_dmatrix-inl.hpp
Normal file
260
old_src/io/page_dmatrix-inl.hpp
Normal file
@@ -0,0 +1,260 @@
|
||||
/*!
|
||||
* Copyright (c) 2014 by Contributors
|
||||
* \file page_dmatrix-inl.hpp
|
||||
* row iterator based on sparse page
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#ifndef XGBOOST_IO_PAGE_DMATRIX_INL_HPP_
|
||||
#define XGBOOST_IO_PAGE_DMATRIX_INL_HPP_
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include "../data.h"
|
||||
#include "../utils/iterator.h"
|
||||
#include "../utils/thread_buffer.h"
|
||||
#include "./simple_fmatrix-inl.hpp"
|
||||
#include "./sparse_batch_page.h"
|
||||
#include "./page_fmatrix-inl.hpp"
|
||||
#include "./libsvm_parser.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace io {
|
||||
/*! \brief thread buffer iterator over row pages loaded from disk */
class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
 public:
  ThreadRowPageIterator(void) {
    itr.SetParam("buffer_size", "4");
    page_ = NULL;
    base_rowid_ = 0;
  }
  virtual ~ThreadRowPageIterator(void) {}
  virtual void Init(void) {
  }
  virtual void BeforeFirst(void) {
    itr.BeforeFirst();
    // row ids restart from zero on rewind
    base_rowid_ = 0;
  }
  // Fetch the next page and expose it as a RowBatch whose base row id
  // continues from the previous batch.
  virtual bool Next(void) {
    if (!itr.Next(page_)) return false;
    out_ = page_->GetRowBatch(base_rowid_);
    base_rowid_ += out_.size;
    return true;
  }
  // Valid only after Next() returned true.
  virtual const RowBatch &Value(void) const {
    return out_;
  }
  /*! \brief load and initialize the iterator with fi */
  inline void Load(const utils::FileStream &fi) {
    itr.get_factory().SetFile(fi, 0);
    itr.Init();
    this->BeforeFirst();
  }

 private:
  // base row id
  size_t base_rowid_;
  // output data
  RowBatch out_;
  // current page, owned by the thread buffer
  SparsePage *page_;
  utils::ThreadBuffer<SparsePage*, SparsePageFactory> itr;
};
|
||||
|
||||
/*! \brief data matrix using page */
template<int TKMagic>
class DMatrixPageBase : public DataMatrix {
 public:
  DMatrixPageBase(void) : DataMatrix(kMagic) {
    iter_ = new ThreadRowPageIterator();
  }
  // virtual destructor
  virtual ~DMatrixPageBase(void) {
    // do not delete row iterator, since it is owned by fmat
    // to be cleaned up in a more clear way
  }
  /*! \brief save a DataMatrix as DMatrixPage: meta info goes into fname_,
   *  the row data goes into a side file fname_ + ".row.blob" */
  inline static void Save(const char *fname_, const DataMatrix &mat, bool silent) {
    std::string fname = fname_;
    // meta file: magic number + matrix info
    utils::FileStream fs(utils::FopenCheck(fname.c_str(), "wb"));
    int magic = kMagic;
    fs.Write(&magic, sizeof(magic));
    mat.info.SaveBinary(fs);
    fs.Close();
    // row data written page by page into the side file
    fname += ".row.blob";
    utils::IIterator<RowBatch> *iter = mat.fmat()->RowIterator();
    utils::FileStream fbin(utils::FopenCheck(fname.c_str(), "wb"));
    SparsePage page;
    iter->BeforeFirst();
    while (iter->Next()) {
      const RowBatch &batch = iter->Value();
      for (size_t i = 0; i < batch.size; ++i) {
        page.Push(batch[i]);
        // flush once the in-memory page reaches the size limit
        if (page.MemCostBytes() >= kPageSize) {
          page.Save(&fbin); page.Clear();
        }
      }
    }
    // flush the final partial page
    if (page.data.size() != 0) page.Save(&fbin);
    fbin.Close();
    if (!silent) {
      utils::Printf("DMatrixPage: %lux%lu is saved to %s\n",
                    static_cast<unsigned long>(mat.info.num_row()), // NOLINT(*)
                    static_cast<unsigned long>(mat.info.num_col()), fname_); // NOLINT(*)
    }
  }
  /*! \brief load meta info from fi and bind the row iterator to the
   *  matching ".row.blob" side file */
  inline void LoadBinary(utils::FileStream &fi, // NOLINT(*)
                         bool silent,
                         const char *fname_) {
    this->set_cache_file(fname_);
    std::string fname = fname_;
    int tmagic;
    utils::Check(fi.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
    this->CheckMagic(tmagic);
    this->info.LoadBinary(fi);
    // load in the row data file
    fname += ".row.blob";
    utils::FileStream fs(utils::FopenCheck(fname.c_str(), "rb"));
    iter_->Load(fs);
    if (!silent) {
      utils::Printf("DMatrixPage: %lux%lu matrix is loaded",
                    static_cast<unsigned long>(info.num_row()), // NOLINT(*)
                    static_cast<unsigned long>(info.num_col())); // NOLINT(*)
      if (fname_ != NULL) {
        utils::Printf(" from %s\n", fname_);
      } else {
        utils::Printf("\n");
      }
      if (info.group_ptr.size() != 0) {
        utils::Printf("data contains %u groups\n", (unsigned)info.group_ptr.size() - 1);
      }
    }
  }
  /*! \brief parse a LibSVM text file (or split of it) into the paged
   *  on-disk representation rooted at cache_file */
  inline void LoadText(const char *uri,
                       const char* cache_file,
                       bool silent,
                       bool loadsplit) {
    if (!silent) {
      utils::Printf("start generate text file from %s\n", uri);
    }
    // in distributed mode each worker parses only its own input part
    int rank = 0, npart = 1;
    if (loadsplit) {
      rank = rabit::GetRank();
      npart = rabit::GetWorldSize();
    }
    this->set_cache_file(cache_file);
    std::string fname_row = std::string(cache_file) + ".row.blob";
    utils::FileStream fo(utils::FopenCheck(fname_row.c_str(), "wb"));
    SparsePage page;
    size_t bytes_write = 0;
    double tstart = rabit::utils::GetTime();
    LibSVMParser parser(
        dmlc::InputSplit::Create(uri, rank, npart, "text"), 16);
    info.Clear();
    while (parser.Next()) {
      const LibSVMPage &batch = parser.Value();
      // append the batch labels to the meta info
      size_t nlabel = info.labels.size();
      info.labels.resize(nlabel + batch.label.size());
      if (batch.label.size() != 0) {
        std::memcpy(BeginPtr(info.labels) + nlabel,
                    BeginPtr(batch.label),
                    batch.label.size() * sizeof(float));
      }
      page.Push(batch);
      // track the widest feature index seen so far
      for (size_t i = 0; i < batch.data.size(); ++i) {
        info.info.num_col = std::max(info.info.num_col,
                                     static_cast<size_t>(batch.data[i].index+1));
      }
      // flush full pages and report throughput
      if (page.MemCostBytes() >= kPageSize) {
        bytes_write += page.MemCostBytes();
        page.Save(&fo);
        page.Clear();
        double tdiff = rabit::utils::GetTime() - tstart;
        if (!silent) {
          utils::Printf("Writting to %s in %g MB/s, %lu MB written\n",
                        cache_file, (bytes_write >> 20UL) / tdiff,
                        (bytes_write >> 20UL));
        }
      }
      info.info.num_row += batch.label.size();
    }
    // flush the final partial page
    if (page.data.size() != 0) {
      page.Save(&fo);
    }
    fo.Close();
    iter_->Load(utils::FileStream(utils::FopenCheck(fname_row.c_str(), "rb")));
    // save data matrix
    utils::FileStream fs(utils::FopenCheck(cache_file, "wb"));
    int tmagic = kMagic;
    fs.Write(&tmagic, sizeof(tmagic));
    this->info.SaveBinary(fs);
    fs.Close();
    if (!silent) {
      utils::Printf("DMatrixPage: %lux%lu is parsed from %s\n",
                    static_cast<unsigned long>(info.num_row()), // NOLINT(*)
                    static_cast<unsigned long>(info.num_col()), // NOLINT(*)
                    uri);
    }
  }
  /*! \brief magic number used to identify DMatrix */
  static const int kMagic = TKMagic;
  /*! \brief page size 32 MB */
  static const size_t kPageSize = 32UL << 20UL;

 protected:
  // bind the cache file to the subclass's feature matrix implementation
  virtual void set_cache_file(const std::string &cache_file) = 0;
  // verify a magic number read from disk matches this format
  virtual void CheckMagic(int tmagic) = 0;
  /*! \brief row iterator */
  ThreadRowPageIterator *iter_;
};
|
||||
|
||||
// Fully external-memory DMatrix: feature matrix is the page-backed
// FMatrixPage, keyed by the cache file.
class DMatrixPage : public DMatrixPageBase<0xffffab02> {
 public:
  DMatrixPage(void) {
    fmat_ = new FMatrixPage(iter_, this->info);
  }
  virtual ~DMatrixPage(void) {
    delete fmat_;
  }
  virtual IFMatrix *fmat(void) const {
    return fmat_;
  }
  virtual void set_cache_file(const std::string &cache_file) {
    fmat_->set_cache_file(cache_file);
  }
  // accept either page-based magic (DMatrixPage or DMatrixHalfRAM)
  virtual void CheckMagic(int tmagic) {
    utils::Check(tmagic == DMatrixPageBase<0xffffab02>::kMagic ||
                 tmagic == DMatrixPageBase<0xffffab03>::kMagic,
                 "invalid format,magic number mismatch");
  }
  /*! \brief the real fmatrix */
  FMatrixPage *fmat_;
};
|
||||
|
||||
// mix of FMatrix S and DMatrix
// cost half of ram usually as DMatrixSimple
class DMatrixHalfRAM : public DMatrixPageBase<0xffffab03> {
 public:
  DMatrixHalfRAM(void) {
    // rows stream from disk (iter_), columns live in RAM via FMatrixS
    fmat_ = new FMatrixS(iter_, this->info);
  }
  virtual ~DMatrixHalfRAM(void) {
    delete fmat_;
  }
  virtual IFMatrix *fmat(void) const {
    return fmat_;
  }
  // in-memory feature matrix needs no cache file; intentionally a no-op
  virtual void set_cache_file(const std::string &cache_file) {
  }
  // accept either page-based magic (DMatrixPage or DMatrixHalfRAM)
  virtual void CheckMagic(int tmagic) {
    utils::Check(tmagic == DMatrixPageBase<0xffffab02>::kMagic ||
                 tmagic == DMatrixPageBase<0xffffab03>::kMagic,
                 "invalid format,magic number mismatch");
  }
  /*! \brief the real fmatrix */
  IFMatrix *fmat_;
};
|
||||
} // namespace io
|
||||
} // namespace xgboost
|
||||
#endif  // XGBOOST_IO_PAGE_DMATRIX_INL_HPP_
|
||||
360
old_src/io/page_fmatrix-inl.hpp
Normal file
360
old_src/io/page_fmatrix-inl.hpp
Normal file
@@ -0,0 +1,360 @@
|
||||
/*!
|
||||
* Copyright (c) 2014 by Contributors
|
||||
* \file page_fmatrix-inl.hpp
|
||||
* col iterator based on sparse page
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#ifndef XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
|
||||
#define XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
|
||||
namespace xgboost {
|
||||
namespace io {
|
||||
/*! \brief thread buffer iterator over column pages loaded from disk */
class ThreadColPageIterator: public utils::IIterator<ColBatch> {
 public:
  ThreadColPageIterator(void) {
    itr.SetParam("buffer_size", "2");
    page_ = NULL;
  }
  virtual ~ThreadColPageIterator(void) {}
  virtual void Init(void) {}
  virtual void BeforeFirst(void) {
    itr.BeforeFirst();
  }
  // Fetch the next column page and rebuild the per-column instance views
  // that ColBatch exposes.
  virtual bool Next(void) {
    if (!itr.Next(page_)) return false;
    out_.col_index = BeginPtr(itr.get_factory().index_set());
    col_data_.resize(page_->offset.size() - 1, SparseBatch::Inst(NULL, 0));
    for (size_t i = 0; i < col_data_.size(); ++i) {
      // slice [offset[i], offset[i+1]) of the page data is column i
      col_data_[i] = SparseBatch::Inst
          (BeginPtr(page_->data) + page_->offset[i],
           static_cast<bst_uint>(page_->offset[i + 1] - page_->offset[i]));
    }
    out_.col_data = BeginPtr(col_data_);
    out_.size = col_data_.size();
    return true;
  }
  // Valid only after Next() returned true; views point into page_.
  virtual const ColBatch &Value(void) const {
    return out_;
  }
  /*! \brief load and initialize the iterator with fi */
  inline void SetFile(const utils::FileStream &fi) {
    itr.get_factory().SetFile(fi);
    itr.Init();
  }
  // set index set
  inline void SetIndexSet(const std::vector<bst_uint> &fset, bool load_all) {
    itr.get_factory().SetIndexSet(fset, load_all);
  }

 private:
  // output data
  ColBatch out_;
  // current page, owned by the thread buffer
  SparsePage *page_;
  // per-column instance views into the current page
  std::vector<SparseBatch::Inst> col_data_;
  utils::ThreadBuffer<SparsePage*, SparsePageFactory> itr;
};
|
||||
|
||||
struct ColConvertFactory {
|
||||
inline bool Init(void) {
|
||||
return true;
|
||||
}
|
||||
inline void Setup(float pkeep,
|
||||
size_t max_row_perbatch,
|
||||
size_t num_col,
|
||||
utils::IIterator<RowBatch> *iter,
|
||||
std::vector<bst_uint> *buffered_rowset,
|
||||
const std::vector<bool> *enabled) {
|
||||
pkeep_ = pkeep;
|
||||
max_row_perbatch_ = max_row_perbatch;
|
||||
num_col_ = num_col;
|
||||
iter_ = iter;
|
||||
buffered_rowset_ = buffered_rowset;
|
||||
enabled_ = enabled;
|
||||
}
|
||||
inline SparsePage *Create(void) {
|
||||
return new SparsePage();
|
||||
}
|
||||
inline void FreeSpace(SparsePage *a) {
|
||||
delete a;
|
||||
}
|
||||
inline void SetParam(const char *name, const char *val) {}
|
||||
inline bool LoadNext(SparsePage *val) {
|
||||
tmp_.Clear();
|
||||
size_t btop = buffered_rowset_->size();
|
||||
while (iter_->Next()) {
|
||||
const RowBatch &batch = iter_->Value();
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
if (pkeep_ == 1.0f || random::SampleBinary(pkeep_)) {
|
||||
buffered_rowset_->push_back(ridx);
|
||||
tmp_.Push(batch[i]);
|
||||
}
|
||||
}
|
||||
if (tmp_.MemCostBytes() >= kPageSize ||
|
||||
tmp_.Size() >= max_row_perbatch_) {
|
||||
this->MakeColPage(tmp_, BeginPtr(*buffered_rowset_) + btop,
|
||||
*enabled_, val);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if (tmp_.Size() != 0) {
|
||||
this->MakeColPage(tmp_, BeginPtr(*buffered_rowset_) + btop,
|
||||
*enabled_, val);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
inline void Destroy(void) {}
|
||||
inline void BeforeFirst(void) {}
|
||||
inline void MakeColPage(const SparsePage &prow,
|
||||
const bst_uint *ridx,
|
||||
const std::vector<bool> &enabled,
|
||||
SparsePage *pcol) {
|
||||
pcol->Clear();
|
||||
int nthread;
|
||||
#pragma omp parallel
|
||||
{
|
||||
nthread = omp_get_num_threads();
|
||||
int max_nthread = std::max(omp_get_num_procs() / 2 - 4, 1);
|
||||
if (nthread > max_nthread) {
|
||||
nthread = max_nthread;
|
||||
}
|
||||
}
|
||||
pcol->Clear();
|
||||
utils::ParallelGroupBuilder<SparseBatch::Entry>
|
||||
builder(&pcol->offset, &pcol->data);
|
||||
builder.InitBudget(num_col_, nthread);
|
||||
bst_omp_uint ndata = static_cast<bst_uint>(prow.Size());
|
||||
#pragma omp parallel for schedule(static) num_threads(nthread)
|
||||
for (bst_omp_uint i = 0; i < ndata; ++i) {
|
||||
int tid = omp_get_thread_num();
|
||||
for (size_t j = prow.offset[i]; j < prow.offset[i+1]; ++j) {
|
||||
const SparseBatch::Entry &e = prow.data[j];
|
||||
if (enabled[e.index]) {
|
||||
builder.AddBudget(e.index, tid);
|
||||
}
|
||||
}
|
||||
}
|
||||
builder.InitStorage();
|
||||
#pragma omp parallel for schedule(static) num_threads(nthread)
|
||||
for (bst_omp_uint i = 0; i < ndata; ++i) {
|
||||
int tid = omp_get_thread_num();
|
||||
for (size_t j = prow.offset[i]; j < prow.offset[i+1]; ++j) {
|
||||
const SparseBatch::Entry &e = prow.data[j];
|
||||
builder.Push(e.index,
|
||||
SparseBatch::Entry(ridx[i], e.fvalue),
|
||||
tid);
|
||||
}
|
||||
}
|
||||
utils::Assert(pcol->Size() == num_col_, "inconsistent col data");
|
||||
// sort columns
|
||||
bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
|
||||
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
|
||||
for (bst_omp_uint i = 0; i < ncol; ++i) {
|
||||
if (pcol->offset[i] < pcol->offset[i + 1]) {
|
||||
std::sort(BeginPtr(pcol->data) + pcol->offset[i],
|
||||
BeginPtr(pcol->data) + pcol->offset[i + 1],
|
||||
SparseBatch::Entry::CmpValue);
|
||||
}
|
||||
}
|
||||
}
|
||||
// probability of keep
|
||||
float pkeep_;
|
||||
// maximum number of rows per batch
|
||||
size_t max_row_perbatch_;
|
||||
// number of columns
|
||||
size_t num_col_;
|
||||
// row batch iterator
|
||||
utils::IIterator<RowBatch> *iter_;
|
||||
// buffered rowset
|
||||
std::vector<bst_uint> *buffered_rowset_;
|
||||
// enabled marks
|
||||
const std::vector<bool> *enabled_;
|
||||
// internal temp cache
|
||||
SparsePage tmp_;
|
||||
/*! \brief page size 256 M */
|
||||
static const size_t kPageSize = 256 << 20UL;
|
||||
};
|
||||
/*!
|
||||
* \brief sparse matrix that support column access, CSC
|
||||
*/
|
||||
class FMatrixPage : public IFMatrix {
|
||||
public:
|
||||
typedef SparseBatch::Entry Entry;
|
||||
/*! \brief constructor */
|
||||
FMatrixPage(utils::IIterator<RowBatch> *iter,
|
||||
const learner::MetaInfo &info) : info(info) {
|
||||
this->iter_ = iter;
|
||||
}
|
||||
// destructor
|
||||
virtual ~FMatrixPage(void) {
|
||||
if (iter_ != NULL) delete iter_;
|
||||
}
|
||||
/*! \return whether column access is enabled */
|
||||
virtual bool HaveColAccess(void) const {
|
||||
return col_size_.size() != 0;
|
||||
}
|
||||
/*! \brief get number of columns */
|
||||
virtual size_t NumCol(void) const {
|
||||
utils::Check(this->HaveColAccess(), "NumCol:need column access");
|
||||
return col_size_.size();
|
||||
}
|
||||
/*! \brief get number of buffered rows */
|
||||
virtual const std::vector<bst_uint> &buffered_rowset(void) const {
|
||||
return buffered_rowset_;
|
||||
}
|
||||
/*! \brief get column size */
|
||||
virtual size_t GetColSize(size_t cidx) const {
|
||||
return col_size_[cidx];
|
||||
}
|
||||
/*! \brief get column density */
|
||||
virtual float GetColDensity(size_t cidx) const {
|
||||
size_t nmiss = num_buffered_row_ - (col_size_[cidx]);
|
||||
return 1.0f - (static_cast<float>(nmiss)) / num_buffered_row_;
|
||||
}
|
||||
virtual void InitColAccess(const std::vector<bool> &enabled,
|
||||
float pkeep, size_t max_row_perbatch) {
|
||||
if (this->HaveColAccess()) return;
|
||||
if (TryLoadColData()) return;
|
||||
this->InitColData(enabled, pkeep, max_row_perbatch);
|
||||
utils::Check(TryLoadColData(), "failed on creating col.blob");
|
||||
}
|
||||
/*!
|
||||
* \brief get the row iterator associated with FMatrix
|
||||
*/
|
||||
virtual utils::IIterator<RowBatch>* RowIterator(void) {
|
||||
iter_->BeforeFirst();
|
||||
return iter_;
|
||||
}
|
||||
/*!
|
||||
* \brief get the column based iterator
|
||||
*/
|
||||
virtual utils::IIterator<ColBatch>* ColIterator(void) {
|
||||
size_t ncol = this->NumCol();
|
||||
col_index_.resize(0);
|
||||
for (size_t i = 0; i < ncol; ++i) {
|
||||
col_index_.push_back(static_cast<bst_uint>(i));
|
||||
}
|
||||
col_iter_.SetIndexSet(col_index_, false);
|
||||
col_iter_.BeforeFirst();
|
||||
return &col_iter_;
|
||||
}
|
||||
/*!
|
||||
* \brief column based iterator
|
||||
*/
|
||||
virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
|
||||
size_t ncol = this->NumCol();
|
||||
col_index_.resize(0);
|
||||
for (size_t i = 0; i < fset.size(); ++i) {
|
||||
if (fset[i] < ncol) col_index_.push_back(fset[i]);
|
||||
}
|
||||
col_iter_.SetIndexSet(col_index_, false);
|
||||
col_iter_.BeforeFirst();
|
||||
return &col_iter_;
|
||||
}
|
||||
// set the cache file name
|
||||
inline void set_cache_file(const std::string &cache_file) {
|
||||
col_data_name_ = std::string(cache_file) + ".col.blob";
|
||||
col_meta_name_ = std::string(cache_file) + ".col.meta";
|
||||
}
|
||||
|
||||
protected:
|
||||
inline bool TryLoadColData(void) {
|
||||
std::FILE *fi = fopen64(col_meta_name_.c_str(), "rb");
|
||||
if (fi == NULL) return false;
|
||||
utils::FileStream fs(fi);
|
||||
LoadMeta(&fs);
|
||||
fs.Close();
|
||||
fi = utils::FopenCheck(col_data_name_.c_str(), "rb");
|
||||
if (fi == NULL) return false;
|
||||
col_iter_.SetFile(utils::FileStream(fi));
|
||||
return true;
|
||||
}
|
||||
inline void LoadMeta(utils::IStream *fi) {
|
||||
utils::Check(fi->Read(&num_buffered_row_, sizeof(num_buffered_row_)) != 0,
|
||||
"invalid col.blob file");
|
||||
utils::Check(fi->Read(&buffered_rowset_),
|
||||
"invalid col.blob file");
|
||||
utils::Check(fi->Read(&col_size_),
|
||||
"invalid col.blob file");
|
||||
}
|
||||
inline void SaveMeta(utils::IStream *fo) {
|
||||
fo->Write(&num_buffered_row_, sizeof(num_buffered_row_));
|
||||
fo->Write(buffered_rowset_);
|
||||
fo->Write(col_size_);
|
||||
}
|
||||
/*!
|
||||
* \brief initialize column data
|
||||
* \param enabled the list of enabled columns
|
||||
* \param pkeep probability to keep a row
|
||||
* \param max_row_perbatch maximum row per batch
|
||||
*/
|
||||
inline void InitColData(const std::vector<bool> &enabled,
|
||||
float pkeep, size_t max_row_perbatch) {
|
||||
// clear rowset
|
||||
buffered_rowset_.clear();
|
||||
col_size_.resize(info.num_col());
|
||||
std::fill(col_size_.begin(), col_size_.end(), 0);
|
||||
utils::FileStream fo;
|
||||
fo = utils::FileStream(utils::FopenCheck(col_data_name_.c_str(), "wb"));
|
||||
iter_->BeforeFirst();
|
||||
double tstart = rabit::utils::GetTime();
|
||||
size_t bytes_write = 0;
|
||||
utils::ThreadBuffer<SparsePage*, ColConvertFactory> citer;
|
||||
citer.SetParam("buffer_size", "2");
|
||||
citer.get_factory().Setup(pkeep, max_row_perbatch, info.num_col(),
|
||||
iter_, &buffered_rowset_, &enabled);
|
||||
citer.Init();
|
||||
SparsePage *pcol;
|
||||
while (citer.Next(pcol)) {
|
||||
for (size_t i = 0; i < pcol->Size(); ++i) {
|
||||
col_size_[i] += pcol->offset[i + 1] - pcol->offset[i];
|
||||
}
|
||||
pcol->Save(&fo);
|
||||
size_t spage = pcol->MemCostBytes();
|
||||
bytes_write += spage;
|
||||
double tnow = rabit::utils::GetTime();
|
||||
double tdiff = tnow - tstart;
|
||||
utils::Printf("Writing to %s in %g MB/s, %lu MB written\n",
|
||||
col_data_name_.c_str(),
|
||||
(bytes_write >> 20UL) / tdiff,
|
||||
(bytes_write >> 20UL));
|
||||
}
|
||||
fo.Close();
|
||||
num_buffered_row_ = buffered_rowset_.size();
|
||||
fo = utils::FileStream(utils::FopenCheck(col_meta_name_.c_str(), "wb"));
|
||||
this->SaveMeta(&fo);
|
||||
fo.Close();
|
||||
}
|
||||
|
||||
private:
|
||||
/*! \brief page size 256 M */
|
||||
static const size_t kPageSize = 256 << 20UL;
|
||||
// shared meta info with DMatrix
|
||||
const learner::MetaInfo &info;
|
||||
// row iterator
|
||||
utils::IIterator<RowBatch> *iter_;
|
||||
/*! \brief column based data file name */
|
||||
std::string col_data_name_;
|
||||
/*! \brief column based data file name */
|
||||
std::string col_meta_name_;
|
||||
/*! \brief list of row index that are buffered */
|
||||
std::vector<bst_uint> buffered_rowset_;
|
||||
// number of buffered rows
|
||||
size_t num_buffered_row_;
|
||||
// count for column data
|
||||
std::vector<size_t> col_size_;
|
||||
// internal column index for output
|
||||
std::vector<bst_uint> col_index_;
|
||||
// internal thread backed col iterator
|
||||
ThreadColPageIterator col_iter_;
|
||||
};
|
||||
} // namespace io
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
|
||||
324
old_src/io/simple_dmatrix-inl.hpp
Normal file
324
old_src/io/simple_dmatrix-inl.hpp
Normal file
@@ -0,0 +1,324 @@
|
||||
/*!
|
||||
* Copyright 2014 by Contributors
|
||||
* \file simple_dmatrix-inl.hpp
|
||||
* \brief simple implementation of DMatrixS that can be used
|
||||
* the data format of xgboost is templatized, which means it can accept
|
||||
* any data structure that implements the function defined by FMatrix
|
||||
* this file is a specific implementation of input data structure that can be used by BoostLearner
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#ifndef XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_
|
||||
#define XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_
|
||||
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
#include <algorithm>
|
||||
#include "../data.h"
|
||||
#include "../utils/utils.h"
|
||||
#include "../learner/dmatrix.h"
|
||||
#include "./io.h"
|
||||
#include "./simple_fmatrix-inl.hpp"
|
||||
#include "../sync/sync.h"
|
||||
#include "./libsvm_parser.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace io {
|
||||
/*! \brief implementation of DataMatrix, in CSR format */
|
||||
class DMatrixSimple : public DataMatrix {
|
||||
public:
|
||||
// constructor
|
||||
DMatrixSimple(void) : DataMatrix(kMagic) {
|
||||
fmat_ = new FMatrixS(new OneBatchIter(this), this->info);
|
||||
this->Clear();
|
||||
}
|
||||
// virtual destructor
|
||||
virtual ~DMatrixSimple(void) {
|
||||
delete fmat_;
|
||||
}
|
||||
virtual IFMatrix *fmat(void) const {
|
||||
return fmat_;
|
||||
}
|
||||
/*! \brief clear the storage */
|
||||
inline void Clear(void) {
|
||||
row_ptr_.clear();
|
||||
row_ptr_.push_back(0);
|
||||
row_data_.clear();
|
||||
info.Clear();
|
||||
}
|
||||
/*! \brief copy content data from source matrix */
|
||||
inline void CopyFrom(const DataMatrix &src) {
|
||||
this->Clear();
|
||||
this->info = src.info;
|
||||
// clone data contents from src matrix
|
||||
utils::IIterator<RowBatch> *iter = src.fmat()->RowIterator();
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
const RowBatch &batch = iter->Value();
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
RowBatch::Inst inst = batch[i];
|
||||
row_data_.resize(row_data_.size() + inst.length);
|
||||
if (inst.length != 0) {
|
||||
std::memcpy(&row_data_[row_ptr_.back()], inst.data,
|
||||
sizeof(RowBatch::Entry) * inst.length);
|
||||
}
|
||||
row_ptr_.push_back(row_ptr_.back() + inst.length);
|
||||
}
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief add a row to the matrix
|
||||
* \param feats features
|
||||
* \return the index of added row
|
||||
*/
|
||||
inline size_t AddRow(const std::vector<RowBatch::Entry> &feats) {
|
||||
for (size_t i = 0; i < feats.size(); ++i) {
|
||||
row_data_.push_back(feats[i]);
|
||||
info.info.num_col = std::max(info.info.num_col,
|
||||
static_cast<size_t>(feats[i].index+1));
|
||||
}
|
||||
row_ptr_.push_back(row_ptr_.back() + feats.size());
|
||||
info.info.num_row += 1;
|
||||
return row_ptr_.size() - 2;
|
||||
}
|
||||
/*!
|
||||
* \brief load split of input, used in distributed mode
|
||||
* \param uri the uri of input
|
||||
* \param loadsplit whether loadsplit of data or all the data
|
||||
* \param silent whether print information or not
|
||||
*/
|
||||
inline void LoadText(const char *uri, bool silent = false, bool loadsplit = false) {
|
||||
int rank = 0, npart = 1;
|
||||
if (loadsplit) {
|
||||
rank = rabit::GetRank();
|
||||
npart = rabit::GetWorldSize();
|
||||
}
|
||||
LibSVMParser parser(
|
||||
dmlc::InputSplit::Create(uri, rank, npart, "text"), 16);
|
||||
this->Clear();
|
||||
while (parser.Next()) {
|
||||
const LibSVMPage &batch = parser.Value();
|
||||
size_t nlabel = info.labels.size();
|
||||
info.labels.resize(nlabel + batch.label.size());
|
||||
if (batch.label.size() != 0) {
|
||||
std::memcpy(BeginPtr(info.labels) + nlabel,
|
||||
BeginPtr(batch.label),
|
||||
batch.label.size() * sizeof(float));
|
||||
}
|
||||
size_t ndata = row_data_.size();
|
||||
row_data_.resize(ndata + batch.data.size());
|
||||
if (batch.data.size() != 0) {
|
||||
std::memcpy(BeginPtr(row_data_) + ndata,
|
||||
BeginPtr(batch.data),
|
||||
batch.data.size() * sizeof(RowBatch::Entry));
|
||||
}
|
||||
row_ptr_.resize(row_ptr_.size() + batch.label.size());
|
||||
for (size_t i = 0; i < batch.label.size(); ++i) {
|
||||
row_ptr_[nlabel + i + 1] = row_ptr_[nlabel] + batch.offset[i + 1];
|
||||
}
|
||||
info.info.num_row += batch.Size();
|
||||
for (size_t i = 0; i < batch.data.size(); ++i) {
|
||||
info.info.num_col = std::max(info.info.num_col,
|
||||
static_cast<size_t>(batch.data[i].index+1));
|
||||
}
|
||||
}
|
||||
if (!silent) {
|
||||
utils::Printf("%lux%lu matrix with %lu entries is loaded from %s\n",
|
||||
static_cast<unsigned long>(info.num_row()), // NOLINT(*)
|
||||
static_cast<unsigned long>(info.num_col()), // NOLINT(*)
|
||||
static_cast<unsigned long>(row_data_.size()), uri); // NOLINT(*)
|
||||
}
|
||||
// try to load in additional file
|
||||
if (!loadsplit) {
|
||||
std::string name = uri;
|
||||
std::string gname = name + ".group";
|
||||
if (info.TryLoadGroup(gname.c_str(), silent)) {
|
||||
utils::Check(info.group_ptr.back() == info.num_row(),
|
||||
"DMatrix: group data does not match the number of rows in features");
|
||||
}
|
||||
std::string wname = name + ".weight";
|
||||
if (info.TryLoadFloatInfo("weight", wname.c_str(), silent)) {
|
||||
utils::Check(info.weights.size() == info.num_row(),
|
||||
"DMatrix: weight data does not match the number of rows in features");
|
||||
}
|
||||
std::string mname = name + ".base_margin";
|
||||
if (info.TryLoadFloatInfo("base_margin", mname.c_str(), silent)) {
|
||||
}
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief load from binary file
|
||||
* \param fname name of binary data
|
||||
* \param silent whether print information or not
|
||||
* \return whether loading is success
|
||||
*/
|
||||
inline bool LoadBinary(const char* fname, bool silent = false) {
|
||||
std::FILE *fp = fopen64(fname, "rb");
|
||||
if (fp == NULL) return false;
|
||||
utils::FileStream fs(fp);
|
||||
this->LoadBinary(fs, silent, fname);
|
||||
fs.Close();
|
||||
return true;
|
||||
}
|
||||
/*!
|
||||
* \brief load from binary stream
|
||||
* \param fs input file stream
|
||||
* \param silent whether print information during loading
|
||||
* \param fname file name, used to print message
|
||||
*/
|
||||
inline void LoadBinary(utils::IStream &fs, bool silent = false, const char *fname = NULL) { // NOLINT(*)
|
||||
int tmagic;
|
||||
utils::Check(fs.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
|
||||
utils::Check(tmagic == kMagic, "\"%s\" invalid format, magic number mismatch",
|
||||
fname == NULL ? "" : fname);
|
||||
|
||||
info.LoadBinary(fs);
|
||||
LoadBinary(fs, &row_ptr_, &row_data_);
|
||||
fmat_->LoadColAccess(fs);
|
||||
|
||||
if (!silent) {
|
||||
utils::Printf("%lux%lu matrix with %lu entries is loaded",
|
||||
static_cast<unsigned long>(info.num_row()), // NOLINT(*)
|
||||
static_cast<unsigned long>(info.num_col()), // NOLINT(*)
|
||||
static_cast<unsigned long>(row_data_.size())); // NOLINT(*)
|
||||
if (fname != NULL) {
|
||||
utils::Printf(" from %s\n", fname);
|
||||
} else {
|
||||
utils::Printf("\n");
|
||||
}
|
||||
if (info.group_ptr.size() != 0) {
|
||||
utils::Printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief save to binary file
|
||||
* \param fname name of binary data
|
||||
* \param silent whether print information or not
|
||||
*/
|
||||
inline void SaveBinary(const char* fname, bool silent = false) const {
|
||||
utils::FileStream fs(utils::FopenCheck(fname, "wb"));
|
||||
int tmagic = kMagic;
|
||||
fs.Write(&tmagic, sizeof(tmagic));
|
||||
info.SaveBinary(fs);
|
||||
SaveBinary(fs, row_ptr_, row_data_);
|
||||
fmat_->SaveColAccess(fs);
|
||||
fs.Close();
|
||||
|
||||
if (!silent) {
|
||||
utils::Printf("%lux%lu matrix with %lu entries is saved to %s\n",
|
||||
static_cast<unsigned long>(info.num_row()), // NOLINT(*)
|
||||
static_cast<unsigned long>(info.num_col()), // NOLINT(*)
|
||||
static_cast<unsigned long>(row_data_.size()), fname); // NOLINT(*)
|
||||
if (info.group_ptr.size() != 0) {
|
||||
utils::Printf("data contains %u groups\n",
|
||||
static_cast<unsigned>(info.group_ptr.size()-1));
|
||||
}
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief cache load data given a file name, if filename ends with .buffer, direct load binary
|
||||
* otherwise the function will first check if fname + '.buffer' exists,
|
||||
* if binary buffer exists, it will reads from binary buffer, otherwise, it will load from text file,
|
||||
* and try to create a buffer file
|
||||
* \param fname name of binary data
|
||||
* \param silent whether print information or not
|
||||
* \param savebuffer whether do save binary buffer if it is text
|
||||
*/
|
||||
inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true) {
|
||||
using namespace std;
|
||||
size_t len = strlen(fname);
|
||||
if (len > 8 && !strcmp(fname + len - 7, ".buffer")) {
|
||||
if (!this->LoadBinary(fname, silent)) {
|
||||
utils::Error("can not open file \"%s\"", fname);
|
||||
}
|
||||
return;
|
||||
}
|
||||
char bname[1024];
|
||||
utils::SPrintf(bname, sizeof(bname), "%s.buffer", fname);
|
||||
if (!this->LoadBinary(bname, silent)) {
|
||||
this->LoadText(fname, silent);
|
||||
if (savebuffer) this->SaveBinary(bname, silent);
|
||||
}
|
||||
}
|
||||
// data fields
|
||||
/*! \brief row pointer of CSR sparse storage */
|
||||
std::vector<size_t> row_ptr_;
|
||||
/*! \brief data in the row */
|
||||
std::vector<RowBatch::Entry> row_data_;
|
||||
/*! \brief the real fmatrix */
|
||||
FMatrixS *fmat_;
|
||||
/*! \brief magic number used to identify DMatrix */
|
||||
static const int kMagic = 0xffffab01;
|
||||
|
||||
protected:
|
||||
/*!
|
||||
* \brief save data to binary stream
|
||||
* \param fo output stream
|
||||
* \param ptr pointer data
|
||||
* \param data data content
|
||||
*/
|
||||
inline static void SaveBinary(utils::IStream &fo, // NOLINT(*)
|
||||
const std::vector<size_t> &ptr,
|
||||
const std::vector<RowBatch::Entry> &data) {
|
||||
size_t nrow = ptr.size() - 1;
|
||||
fo.Write(&nrow, sizeof(size_t));
|
||||
fo.Write(BeginPtr(ptr), ptr.size() * sizeof(size_t));
|
||||
if (data.size() != 0) {
|
||||
fo.Write(BeginPtr(data), data.size() * sizeof(RowBatch::Entry));
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief load data from binary stream
|
||||
* \param fi input stream
|
||||
* \param out_ptr pointer data
|
||||
* \param out_data data content
|
||||
*/
|
||||
inline static void LoadBinary(utils::IStream &fi, // NOLINT(*)
|
||||
std::vector<size_t> *out_ptr,
|
||||
std::vector<RowBatch::Entry> *out_data) {
|
||||
size_t nrow;
|
||||
utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format");
|
||||
out_ptr->resize(nrow + 1);
|
||||
utils::Check(fi.Read(BeginPtr(*out_ptr), out_ptr->size() * sizeof(size_t)) != 0,
|
||||
"invalid input file format");
|
||||
out_data->resize(out_ptr->back());
|
||||
if (out_data->size() != 0) {
|
||||
utils::Assert(fi.Read(BeginPtr(*out_data), out_data->size() * sizeof(RowBatch::Entry)) != 0,
|
||||
"invalid input file format");
|
||||
}
|
||||
}
|
||||
// one batch iterator that return content in the matrix
|
||||
struct OneBatchIter: utils::IIterator<RowBatch> {
|
||||
explicit OneBatchIter(DMatrixSimple *parent)
|
||||
: at_first_(true), parent_(parent) {}
|
||||
virtual ~OneBatchIter(void) {}
|
||||
virtual void BeforeFirst(void) {
|
||||
at_first_ = true;
|
||||
}
|
||||
virtual bool Next(void) {
|
||||
if (!at_first_) return false;
|
||||
at_first_ = false;
|
||||
batch_.size = parent_->row_ptr_.size() - 1;
|
||||
batch_.base_rowid = 0;
|
||||
batch_.ind_ptr = BeginPtr(parent_->row_ptr_);
|
||||
batch_.data_ptr = BeginPtr(parent_->row_data_);
|
||||
return true;
|
||||
}
|
||||
virtual const RowBatch &Value(void) const {
|
||||
return batch_;
|
||||
}
|
||||
|
||||
private:
|
||||
// whether is at first
|
||||
bool at_first_;
|
||||
// pointer to parent
|
||||
DMatrixSimple *parent_;
|
||||
// temporal space for batch
|
||||
RowBatch batch_;
|
||||
};
|
||||
};
|
||||
} // namespace io
|
||||
} // namespace xgboost
|
||||
#endif // namespace XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_
|
||||
374
old_src/io/simple_fmatrix-inl.hpp
Normal file
374
old_src/io/simple_fmatrix-inl.hpp
Normal file
@@ -0,0 +1,374 @@
|
||||
/*!
|
||||
* Copyright 2014 by Contributors
|
||||
* \file simple_fmatrix-inl.hpp
|
||||
* \brief the input data structure for gradient boosting
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_
|
||||
#define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_
|
||||
|
||||
#include <limits>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include "../data.h"
|
||||
#include "../utils/utils.h"
|
||||
#include "../utils/random.h"
|
||||
#include "../utils/omp.h"
|
||||
#include "../learner/dmatrix.h"
|
||||
#include "../utils/group_data.h"
|
||||
#include "./sparse_batch_page.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace io {
|
||||
/*!
|
||||
* \brief sparse matrix that support column access, CSC
|
||||
*/
|
||||
class FMatrixS : public IFMatrix {
|
||||
public:
|
||||
typedef SparseBatch::Entry Entry;
|
||||
/*! \brief constructor */
|
||||
FMatrixS(utils::IIterator<RowBatch> *iter,
|
||||
const learner::MetaInfo &info)
|
||||
: info_(info) {
|
||||
this->iter_ = iter;
|
||||
}
|
||||
// destructor
|
||||
virtual ~FMatrixS(void) {
|
||||
if (iter_ != NULL) delete iter_;
|
||||
}
|
||||
/*! \return whether column access is enabled */
|
||||
virtual bool HaveColAccess(void) const {
|
||||
return col_size_.size() != 0;
|
||||
}
|
||||
/*! \brief get number of columns */
|
||||
virtual size_t NumCol(void) const {
|
||||
utils::Check(this->HaveColAccess(), "NumCol:need column access");
|
||||
return col_size_.size();
|
||||
}
|
||||
/*! \brief get number of buffered rows */
|
||||
virtual const std::vector<bst_uint> &buffered_rowset(void) const {
|
||||
return buffered_rowset_;
|
||||
}
|
||||
/*! \brief get column size */
|
||||
virtual size_t GetColSize(size_t cidx) const {
|
||||
return col_size_[cidx];
|
||||
}
|
||||
/*! \brief get column density */
|
||||
virtual float GetColDensity(size_t cidx) const {
|
||||
size_t nmiss = buffered_rowset_.size() - col_size_[cidx];
|
||||
return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
|
||||
}
|
||||
virtual void InitColAccess(const std::vector<bool> &enabled,
|
||||
float pkeep, size_t max_row_perbatch) {
|
||||
if (this->HaveColAccess()) return;
|
||||
this->InitColData(enabled, pkeep, max_row_perbatch);
|
||||
}
|
||||
/*!
|
||||
* \brief get the row iterator associated with FMatrix
|
||||
*/
|
||||
virtual utils::IIterator<RowBatch>* RowIterator(void) {
|
||||
iter_->BeforeFirst();
|
||||
return iter_;
|
||||
}
|
||||
/*!
|
||||
* \brief get the column based iterator
|
||||
*/
|
||||
virtual utils::IIterator<ColBatch>* ColIterator(void) {
|
||||
size_t ncol = this->NumCol();
|
||||
col_iter_.col_index_.resize(ncol);
|
||||
for (size_t i = 0; i < ncol; ++i) {
|
||||
col_iter_.col_index_[i] = static_cast<bst_uint>(i);
|
||||
}
|
||||
col_iter_.BeforeFirst();
|
||||
return &col_iter_;
|
||||
}
|
||||
/*!
|
||||
* \brief column based iterator
|
||||
*/
|
||||
virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
|
||||
size_t ncol = this->NumCol();
|
||||
col_iter_.col_index_.resize(0);
|
||||
for (size_t i = 0; i < fset.size(); ++i) {
|
||||
if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]);
|
||||
}
|
||||
col_iter_.BeforeFirst();
|
||||
return &col_iter_;
|
||||
}
|
||||
/*!
|
||||
* \brief save column access data into stream
|
||||
* \param fo output stream to save to
|
||||
*/
|
||||
inline void SaveColAccess(utils::IStream &fo) const { // NOLINT(*)
|
||||
size_t n = 0;
|
||||
fo.Write(&n, sizeof(n));
|
||||
}
|
||||
/*!
|
||||
* \brief load column access data from stream
|
||||
* \param fo output stream to load from
|
||||
*/
|
||||
inline void LoadColAccess(utils::IStream &fi) { // NOLINT(*)
|
||||
// do nothing in load col access
|
||||
}
|
||||
|
||||
protected:
|
||||
/*!
|
||||
* \brief initialize column data
|
||||
* \param enabled the list of enabled columns
|
||||
* \param pkeep probability to keep a row
|
||||
* \param max_row_perbatch maximum row per batch
|
||||
*/
|
||||
inline void InitColData(const std::vector<bool> &enabled,
|
||||
float pkeep, size_t max_row_perbatch) {
|
||||
col_iter_.Clear();
|
||||
if (info_.num_row() < max_row_perbatch) {
|
||||
SparsePage *page = new SparsePage();
|
||||
this->MakeOneBatch(enabled, pkeep, page);
|
||||
col_iter_.cpages_.push_back(page);
|
||||
} else {
|
||||
this->MakeManyBatch(enabled, pkeep, max_row_perbatch);
|
||||
}
|
||||
// setup col-size
|
||||
col_size_.resize(info_.num_col());
|
||||
std::fill(col_size_.begin(), col_size_.end(), 0);
|
||||
for (size_t i = 0; i < col_iter_.cpages_.size(); ++i) {
|
||||
SparsePage *pcol = col_iter_.cpages_[i];
|
||||
for (size_t j = 0; j < pcol->Size(); ++j) {
|
||||
col_size_[j] += pcol->offset[j + 1] - pcol->offset[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief make column page from iterator
|
||||
* \param pkeep probability to keep a row
|
||||
* \param pcol the target column
|
||||
*/
|
||||
inline void MakeOneBatch(const std::vector<bool> &enabled,
|
||||
float pkeep,
|
||||
SparsePage *pcol) {
|
||||
// clear rowset
|
||||
buffered_rowset_.clear();
|
||||
// bit map
|
||||
int nthread;
|
||||
std::vector<bool> bmap;
|
||||
#pragma omp parallel
|
||||
{
|
||||
nthread = omp_get_num_threads();
|
||||
}
|
||||
pcol->Clear();
|
||||
utils::ParallelGroupBuilder<SparseBatch::Entry>
|
||||
builder(&pcol->offset, &pcol->data);
|
||||
builder.InitBudget(info_.num_col(), nthread);
|
||||
// start working
|
||||
iter_->BeforeFirst();
|
||||
while (iter_->Next()) {
|
||||
const RowBatch &batch = iter_->Value();
|
||||
bmap.resize(bmap.size() + batch.size, true);
|
||||
long batch_size = static_cast<long>(batch.size); // NOLINT(*)
|
||||
for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
|
||||
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
|
||||
buffered_rowset_.push_back(ridx);
|
||||
} else {
|
||||
bmap[i] = false;
|
||||
}
|
||||
}
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
|
||||
int tid = omp_get_thread_num();
|
||||
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
if (bmap[ridx]) {
|
||||
RowBatch::Inst inst = batch[i];
|
||||
for (bst_uint j = 0; j < inst.length; ++j) {
|
||||
if (enabled[inst[j].index]) {
|
||||
builder.AddBudget(inst[j].index, tid);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
builder.InitStorage();
|
||||
|
||||
iter_->BeforeFirst();
|
||||
while (iter_->Next()) {
|
||||
const RowBatch &batch = iter_->Value();
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (long i = 0; i < static_cast<long>(batch.size); ++i) { // NOLINT(*)
|
||||
int tid = omp_get_thread_num();
|
||||
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
if (bmap[ridx]) {
|
||||
RowBatch::Inst inst = batch[i];
|
||||
for (bst_uint j = 0; j < inst.length; ++j) {
|
||||
if (enabled[inst[j].index]) {
|
||||
builder.Push(inst[j].index,
|
||||
Entry((bst_uint)(batch.base_rowid+i),
|
||||
inst[j].fvalue), tid);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
utils::Assert(pcol->Size() == info_.num_col(),
|
||||
"inconsistent col data");
|
||||
// sort columns
|
||||
bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
|
||||
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
|
||||
for (bst_omp_uint i = 0; i < ncol; ++i) {
|
||||
if (pcol->offset[i] < pcol->offset[i + 1]) {
|
||||
std::sort(BeginPtr(pcol->data) + pcol->offset[i],
|
||||
BeginPtr(pcol->data) + pcol->offset[i + 1],
|
||||
SparseBatch::Entry::CmpValue);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void MakeManyBatch(const std::vector<bool> &enabled,
|
||||
float pkeep, size_t max_row_perbatch) {
|
||||
size_t btop = 0;
|
||||
buffered_rowset_.clear();
|
||||
// internal temp cache
|
||||
SparsePage tmp; tmp.Clear();
|
||||
iter_->BeforeFirst();
|
||||
while (iter_->Next()) {
|
||||
const RowBatch &batch = iter_->Value();
|
||||
for (size_t i = 0; i < batch.size; ++i) {
|
||||
bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||
if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
|
||||
buffered_rowset_.push_back(ridx);
|
||||
tmp.Push(batch[i]);
|
||||
}
|
||||
if (tmp.Size() >= max_row_perbatch) {
|
||||
SparsePage *page = new SparsePage();
|
||||
this->MakeColPage(tmp.GetRowBatch(0),
|
||||
BeginPtr(buffered_rowset_) + btop,
|
||||
enabled, page);
|
||||
col_iter_.cpages_.push_back(page);
|
||||
btop = buffered_rowset_.size();
|
||||
tmp.Clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (tmp.Size() != 0) {
|
||||
SparsePage *page = new SparsePage();
|
||||
this->MakeColPage(tmp.GetRowBatch(0),
|
||||
BeginPtr(buffered_rowset_) + btop,
|
||||
enabled, page);
|
||||
col_iter_.cpages_.push_back(page);
|
||||
}
|
||||
}
|
||||
// make column page from subset of rowbatchs
|
||||
inline void MakeColPage(const RowBatch &batch,
|
||||
const bst_uint *ridx,
|
||||
const std::vector<bool> &enabled,
|
||||
SparsePage *pcol) {
|
||||
int nthread;
|
||||
#pragma omp parallel
|
||||
{
|
||||
nthread = omp_get_num_threads();
|
||||
int max_nthread = std::max(omp_get_num_procs() / 2 - 2, 1);
|
||||
if (nthread > max_nthread) {
|
||||
nthread = max_nthread;
|
||||
}
|
||||
}
|
||||
pcol->Clear();
|
||||
utils::ParallelGroupBuilder<SparseBatch::Entry>
|
||||
builder(&pcol->offset, &pcol->data);
|
||||
builder.InitBudget(info_.num_col(), nthread);
|
||||
bst_omp_uint ndata = static_cast<bst_uint>(batch.size);
|
||||
#pragma omp parallel for schedule(static) num_threads(nthread)
|
||||
for (bst_omp_uint i = 0; i < ndata; ++i) {
|
||||
int tid = omp_get_thread_num();
|
||||
RowBatch::Inst inst = batch[i];
|
||||
for (bst_uint j = 0; j < inst.length; ++j) {
|
||||
const SparseBatch::Entry &e = inst[j];
|
||||
if (enabled[e.index]) {
|
||||
builder.AddBudget(e.index, tid);
|
||||
}
|
||||
}
|
||||
}
|
||||
builder.InitStorage();
|
||||
#pragma omp parallel for schedule(static) num_threads(nthread)
|
||||
for (bst_omp_uint i = 0; i < ndata; ++i) {
|
||||
int tid = omp_get_thread_num();
|
||||
RowBatch::Inst inst = batch[i];
|
||||
for (bst_uint j = 0; j < inst.length; ++j) {
|
||||
const SparseBatch::Entry &e = inst[j];
|
||||
builder.Push(e.index,
|
||||
SparseBatch::Entry(ridx[i], e.fvalue),
|
||||
tid);
|
||||
}
|
||||
}
|
||||
utils::Assert(pcol->Size() == info_.num_col(), "inconsistent col data");
|
||||
// sort columns
|
||||
bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
|
||||
#pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
|
||||
for (bst_omp_uint i = 0; i < ncol; ++i) {
|
||||
if (pcol->offset[i] < pcol->offset[i + 1]) {
|
||||
std::sort(BeginPtr(pcol->data) + pcol->offset[i],
|
||||
BeginPtr(pcol->data) + pcol->offset[i + 1],
|
||||
SparseBatch::Entry::CmpValue);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
// One-batch iterator that returns column content of the matrix.
// Owns the SparsePage objects in cpages_ (deleted in Clear()).
struct ColBatchIter: utils::IIterator<ColBatch> {
  ColBatchIter(void) : data_ptr_(0) {}
  virtual ~ColBatchIter(void) {
    this->Clear();
  }
  // Rewind to the first page; pages themselves are kept.
  virtual void BeforeFirst(void) {
    data_ptr_ = 0;
  }
  // Advance to the next column page and rebuild batch_ as views into it.
  // Returns false when all pages have been consumed.
  virtual bool Next(void) {
    if (data_ptr_ >= cpages_.size()) return false;
    // advance first, then read the page just passed (data_ptr_ - 1)
    data_ptr_ += 1;
    SparsePage *pcol = cpages_[data_ptr_ - 1];
    batch_.size = col_index_.size();
    // resize only grows/shrinks; existing Inst slots are overwritten below
    col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL, 0));
    for (size_t i = 0; i < col_data_.size(); ++i) {
      // NOTE(review): despite the name, ridx here is a column (feature)
      // index into pcol->offset -- confirm against col_index_'s producer.
      const bst_uint ridx = col_index_[i];
      // each Inst is a non-owning view into pcol's data; it dangles if
      // pcol is freed while batch_ is still in use
      col_data_[i] = SparseBatch::Inst
          (BeginPtr(pcol->data) + pcol->offset[ridx],
           static_cast<bst_uint>(pcol->offset[ridx + 1] - pcol->offset[ridx]));
    }
    batch_.col_index = BeginPtr(col_index_);
    batch_.col_data = BeginPtr(col_data_);
    return true;
  }
  // Current batch; only valid after a successful Next().
  virtual const ColBatch &Value(void) const {
    return batch_;
  }
  // Free all owned column pages.
  inline void Clear(void) {
    for (size_t i = 0; i < cpages_.size(); ++i) {
      delete cpages_[i];
    }
    cpages_.clear();
  }
  // column indices exposed through each batch
  std::vector<bst_uint> col_index_;
  // per-column views into the current page (rebuilt by Next())
  std::vector<ColBatch::Inst> col_data_;
  // owned column sparse pages
  std::vector<SparsePage*> cpages_;
  // index of the NEXT page to serve (0-based)
  size_t data_ptr_;
  // temporal space for the batch returned by Value()
  ColBatch batch_;
};
|
||||
// --- data structure used to support InitColAccess --
|
||||
// column iterator
|
||||
ColBatchIter col_iter_;
|
||||
// shared meta info with DMatrix
|
||||
const learner::MetaInfo &info_;
|
||||
// row iterator
|
||||
utils::IIterator<RowBatch> *iter_;
|
||||
/*! \brief list of row index that are buffered */
|
||||
std::vector<bst_uint> buffered_rowset_;
|
||||
// count for column data
|
||||
std::vector<size_t> col_size_;
|
||||
};
|
||||
} // namespace io
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_IO_SLICE_FMATRIX_INL_HPP_
|
||||
272
old_src/io/sparse_batch_page.h
Normal file
272
old_src/io/sparse_batch_page.h
Normal file
@@ -0,0 +1,272 @@
|
||||
/*!
|
||||
* Copyright (c) 2014 by Contributors
|
||||
* \file sparse_batch_page.h
|
||||
* content holder of sparse batch that can be saved to disk
|
||||
* the representation can be effectively
|
||||
* use in external memory computation
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#ifndef XGBOOST_IO_SPARSE_BATCH_PAGE_H_
|
||||
#define XGBOOST_IO_SPARSE_BATCH_PAGE_H_
|
||||
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include "../data.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace io {
|
||||
/*!
 * \brief storage unit of a sparse batch: a CSR-like page of segments
 *   (rows or columns) that can be (de)serialized as one record.
 *   Invariant: offset.front() == 0 and offset.back() == data.size();
 *   segment i spans data[offset[i], offset[i+1]).
 */
class SparsePage {
 public:
  /*! \brief offset of the segments (size = #segments + 1) */
  std::vector<size_t> offset;
  /*! \brief the data of the segments */
  std::vector<SparseBatch::Entry> data;
  /*! \brief constructor; establishes the one-sentinel-offset invariant */
  SparsePage() {
    this->Clear();
  }
  /*! \return number of instance (segments) in the page */
  inline size_t Size() const {
    return offset.size() - 1;
  }
  /*!
   * \brief load only the segments we are interested in
   * \param fi the input stream of the file
   * \param sorted_index_set sorted index of segments we are interested in
   * \return true if the loading was successful, false if end of file was reached
   */
  inline bool Load(utils::ISeekStream *fi,
                   const std::vector<bst_uint> &sorted_index_set) {
    // on-disk offsets of ALL segments; an EOF here means no more pages
    if (!fi->Read(&disk_offset_)) return false;
    // setup the in-memory offset array for just the selected segments
    offset.clear(); offset.push_back(0);
    for (size_t i = 0; i < sorted_index_set.size(); ++i) {
      bst_uint fid = sorted_index_set[i];
      utils::Check(fid + 1 < disk_offset_.size(), "bad col.blob format");
      size_t size = disk_offset_[fid + 1] - disk_offset_[fid];
      offset.push_back(offset.back() + size);
    }
    data.resize(offset.back());
    // read in the data, coalescing reads of segments that are contiguous
    // on disk into a single fi->Read call
    size_t begin = fi->Tell();
    size_t curr_offset = 0;  // current position, in entries past `begin`
    for (size_t i = 0; i < sorted_index_set.size();) {
      bst_uint fid = sorted_index_set[i];
      if (disk_offset_[fid] != curr_offset) {
        // gap before this segment: skip unwanted entries via a seek
        utils::Assert(disk_offset_[fid] > curr_offset, "fset index was not sorted");
        fi->Seek(begin + disk_offset_[fid] * sizeof(SparseBatch::Entry));
        curr_offset = disk_offset_[fid];
      }
      // extend [i, j) while segments stay contiguous on disk
      size_t j, size_to_read = 0;
      for (j = i; j < sorted_index_set.size(); ++j) {
        if (disk_offset_[sorted_index_set[j]] == disk_offset_[fid] + size_to_read) {
          size_to_read += offset[j + 1] - offset[j];
        } else {
          break;
        }
      }
      if (size_to_read != 0) {
        utils::Check(fi->Read(BeginPtr(data) + offset[i],
                              size_to_read * sizeof(SparseBatch::Entry)) != 0,
                     "Invalid SparsePage file");
        curr_offset += size_to_read;
      }
      i = j;
    }
    // seek to end of record so the stream is positioned at the next page
    if (curr_offset != disk_offset_.back()) {
      fi->Seek(begin + disk_offset_.back() * sizeof(SparseBatch::Entry));
    }
    return true;
  }
  /*!
   * \brief load all the segments
   * \param fi the input stream of the file
   * \return true if the loading was successful, false if end of file was reached
   */
  inline bool Load(utils::IStream *fi) {
    if (!fi->Read(&offset)) return false;
    utils::Check(offset.size() != 0, "Invalid SparsePage file");
    data.resize(offset.back());
    if (data.size() != 0) {
      utils::Check(fi->Read(BeginPtr(data), data.size() * sizeof(SparseBatch::Entry)) != 0,
                   "Invalid SparsePage file");
    }
    return true;
  }
  /*!
   * \brief save the data to fo; when a page is written
   *   to disk it must contain all of its elements
   * \param fo output stream
   */
  inline void Save(utils::IStream *fo) const {
    utils::Assert(offset.size() != 0 && offset[0] == 0, "bad offset");
    utils::Assert(offset.back() == data.size(), "in consistent SparsePage");
    fo->Write(offset);
    if (data.size() != 0) {
      fo->Write(BeginPtr(data), data.size() * sizeof(SparseBatch::Entry));
    }
  }
  /*! \return estimation of memory cost of this page (ignores disk_offset_) */
  inline size_t MemCostBytes(void) const {
    return offset.size() * sizeof(size_t) + data.size() * sizeof(SparseBatch::Entry);
  }
  /*! \brief clear the page back to zero segments (offset keeps its sentinel 0) */
  inline void Clear(void) {
    offset.clear();
    offset.push_back(0);
    data.clear();
  }
  /*!
   * \brief load all the segments and append them to the existing content
   * \param fi the input stream of the file
   * \return true if the loading was successful, false if end of file was reached
   */
  inline bool PushLoad(utils::IStream *fi) {
    if (!fi->Read(&disk_offset_)) return false;
    data.resize(offset.back() + disk_offset_.back());
    if (disk_offset_.back() != 0) {
      utils::Check(fi->Read(BeginPtr(data) + offset.back(),
                            disk_offset_.back() * sizeof(SparseBatch::Entry)) != 0,
                   "Invalid SparsePage file");
    }
    // rebase the loaded offsets on top of what is already stored;
    // disk_offset_[0] == 0 duplicates offset.back(), matching the
    // existing sentinel layout
    size_t top = offset.back();
    size_t begin = offset.size();
    offset.resize(offset.size() + disk_offset_.size());
    for (size_t i = 0; i < disk_offset_.size(); ++i) {
      offset[i + begin] = top + disk_offset_[i];
    }
    return true;
  }
  /*!
   * \brief Push row batch into the page
   * \param batch the row batch
   */
  inline void Push(const RowBatch &batch) {
    // batch.ind_ptr[0] may be nonzero when the batch is a view; all
    // copied offsets are rebased relative to it
    data.resize(offset.back() + batch.ind_ptr[batch.size]);
    std::memcpy(BeginPtr(data) + offset.back(),
                batch.data_ptr + batch.ind_ptr[0],
                sizeof(SparseBatch::Entry) * batch.ind_ptr[batch.size]);
    size_t top = offset.back();
    size_t begin = offset.size();
    offset.resize(offset.size() + batch.size);
    for (size_t i = 0; i < batch.size; ++i) {
      offset[i + begin] = top + batch.ind_ptr[i + 1] - batch.ind_ptr[0];
    }
  }
  /*!
   * \brief Push (append) another sparse page
   * \param batch the row page
   */
  inline void Push(const SparsePage &batch) {
    size_t top = offset.back();
    data.resize(top + batch.data.size());
    std::memcpy(BeginPtr(data) + top,
                BeginPtr(batch.data),
                sizeof(SparseBatch::Entry) * batch.data.size());
    // skip batch.offset[0] (always 0); append the remaining offsets rebased
    size_t begin = offset.size();
    offset.resize(begin + batch.Size());
    for (size_t i = 0; i < batch.Size(); ++i) {
      offset[i + begin] = top + batch.offset[i + 1];
    }
  }
  /*!
   * \brief Push one instance into page
   * \param inst an instance row
   */
  inline void Push(const SparseBatch::Inst &inst) {
    offset.push_back(offset.back() + inst.length);
    size_t begin = data.size();
    data.resize(begin + inst.length);
    if (inst.length != 0) {
      std::memcpy(BeginPtr(data) + begin, inst.data,
                  sizeof(SparseBatch::Entry) * inst.length);
    }
  }
  /*!
   * \param base_rowid base_rowid of the data
   * \return row batch representation of the page; a non-owning view that
   *   is invalidated by any mutation of this page
   */
  inline RowBatch GetRowBatch(size_t base_rowid) const {
    RowBatch out;
    out.base_rowid = base_rowid;
    out.ind_ptr = BeginPtr(offset);
    out.data_ptr = BeginPtr(data);
    out.size = offset.size() - 1;
    return out;
  }

 private:
  /*! \brief scratch buffer for on-disk segment offsets used by the loaders */
  std::vector<size_t> disk_offset_;
};
|
||||
/*!
|
||||
* \brief factory class for SparsePage,
|
||||
* used in threadbuffer template
|
||||
*/
|
||||
class SparsePageFactory {
|
||||
public:
|
||||
SparsePageFactory(void)
|
||||
: action_load_all_(true), set_load_all_(true) {}
|
||||
inline void SetFile(const utils::FileStream &fi,
|
||||
size_t file_begin = 0) {
|
||||
fi_ = fi;
|
||||
file_begin_ = file_begin;
|
||||
}
|
||||
inline const std::vector<bst_uint> &index_set(void) const {
|
||||
return action_index_set_;
|
||||
}
|
||||
// set index set, will be used after next before first
|
||||
inline void SetIndexSet(const std::vector<bst_uint> &index_set,
|
||||
bool load_all) {
|
||||
set_load_all_ = load_all;
|
||||
if (!set_load_all_) {
|
||||
set_index_set_ = index_set;
|
||||
std::sort(set_index_set_.begin(), set_index_set_.end());
|
||||
}
|
||||
}
|
||||
inline bool Init(void) {
|
||||
return true;
|
||||
}
|
||||
inline void SetParam(const char *name, const char *val) {}
|
||||
inline bool LoadNext(SparsePage *val) {
|
||||
if (!action_load_all_) {
|
||||
if (action_index_set_.size() == 0) {
|
||||
return false;
|
||||
} else {
|
||||
return val->Load(&fi_, action_index_set_);
|
||||
}
|
||||
} else {
|
||||
return val->Load(&fi_);
|
||||
}
|
||||
}
|
||||
inline SparsePage *Create(void) {
|
||||
return new SparsePage();
|
||||
}
|
||||
inline void FreeSpace(SparsePage *a) {
|
||||
delete a;
|
||||
}
|
||||
inline void Destroy(void) {
|
||||
fi_.Close();
|
||||
}
|
||||
inline void BeforeFirst(void) {
|
||||
fi_.Seek(file_begin_);
|
||||
action_load_all_ = set_load_all_;
|
||||
if (!set_load_all_) {
|
||||
action_index_set_ = set_index_set_;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
bool action_load_all_, set_load_all_;
|
||||
size_t file_begin_;
|
||||
utils::FileStream fi_;
|
||||
std::vector<bst_uint> action_index_set_;
|
||||
std::vector<bst_uint> set_index_set_;
|
||||
};
|
||||
} // namespace io
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_IO_SPARSE_BATCH_PAGE_H_
|
||||
Reference in New Issue
Block a user