check in io module

This commit is contained in:
tqchen 2014-08-16 14:06:31 -07:00
parent ac1cc15b90
commit c4acb4fe01
10 changed files with 417 additions and 33 deletions

View File

@ -4,14 +4,14 @@ export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas
# specify tensor path
BIN = xgunity.exe
OBJ =
OBJ = io.o
.PHONY: clean all
all: $(BIN) $(OBJ)
export LDFLAGS= -pthread -lm
xgunity.exe: src/xgunity.cpp
io.o: src/io/io.cpp
$(BIN) :
$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
@ -23,4 +23,5 @@ install:
cp -f -r $(BIN) $(INSTALL_PATH)
clean:
$(RM) $(OBJ) $(BIN) *~ */*~
$(RM) $(OBJ) $(BIN) *~ */*~ */*/*~

View File

@ -20,7 +20,8 @@ Build
Project Logical Layout
=======
* Dependency order: learner->gbm->tree
* Dependency order: io->learner->gbm->tree
- All modules depend on data.h
* tree contains implementations of tree construction algorithms.
* gbm is the gradient boosting interface, which takes trees and other base learners to do boosting.
- gbm only takes gradient as sufficient statistics, it does not compute the gradient.

View File

@ -1,5 +1,5 @@
#ifndef XGBOOST_UNITY_DATA_H
#define XGBOOST_UNITY_DATA_H
#ifndef XGBOOST_DATA_H
#define XGBOOST_DATA_H
/*!
* \file data.h
* \brief the input data structure for gradient boosting
@ -8,6 +8,7 @@
#include <cstdio>
#include <vector>
#include <limits>
#include <climits>
#include <algorithm>
#include "utils/io.h"
#include "utils/utils.h"
@ -27,7 +28,7 @@ const float rt_eps = 1e-5f;
const float rt_2eps = rt_eps * 2.0f;
/*! \brief gradient statistics pair usually needed in gradient boosting */
struct bst_gpair{
struct bst_gpair {
/*! \brief gradient statistics */
bst_float grad;
/*! \brief second order gradient statistics */
@ -139,7 +140,7 @@ class FMatrixInterface {
*/
inline float GetColDensity(size_t cidx) const;
/*! \brief get the row iterator associated with FMatrix */
virtual utils::IIterator<SparseBatch>* RowIterator(void) const = 0;
inline utils::IIterator<SparseBatch>* RowIterator(void) const;
};
/*!
@ -180,11 +181,13 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
}
};
/*! \brief constructor */
explicit FMatrixS(utils::IIterator<SparseBatch> *base_iter)
: iter_(base_iter) {}
FMatrixS(void) {
iter_ = NULL;
num_buffered_row_ = 0;
}
// destructor
virtual ~FMatrixS(void) {
delete iter_;
~FMatrixS(void) {
if (iter_ != NULL) delete iter_;
}
/*! \return whether column access is enabled */
inline bool HaveColAccess(void) const {
@ -219,15 +222,75 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
size_t nmiss = num_buffered_row_ - (col_ptr_[cidx+1] - col_ptr_[cidx]);
return 1.0f - (static_cast<float>(nmiss)) / num_buffered_row_;
}
virtual void InitColAccess(void) {
inline void InitColAccess(size_t max_nrow = ULONG_MAX) {
if (this->HaveColAccess()) return;
const size_t max_nrow = std::numeric_limits<bst_uint>::max();
this->InitColData(max_nrow);
}
/*! \brief get the row iterator associated with FMatrix */
virtual utils::IIterator<SparseBatch>* RowIterator(void) const {
inline utils::IIterator<SparseBatch>* RowIterator(void) const {
return iter_;
}
/*!
 * \brief replace the row iterator used by this matrix
 *  the previously stored pointer is overwritten without being deleted
 * \param iter the new row iterator
 */
inline void set_iter(utils::IIterator<SparseBatch> *iter) {
  iter_ = iter;
}
/*!
 * \brief write the column access structure to a stream
 *  the buffered row count is always written; the column pointer and
 *  column data follow only when that count is non-zero
 * \param fo stream that receives the data
 */
inline void SaveColAccess(utils::IStream &fo) {
  fo.Write(&num_buffered_row_, sizeof(num_buffered_row_));
  // nothing else to store when no rows were buffered
  if (num_buffered_row_ == 0) return;
  SaveBinary(fo, col_ptr_, col_data_);
}
/*!
 * \brief read the column access structure back from a stream
 *  mirrors SaveColAccess: the buffered row count comes first, and the
 *  column pointer and column data are read only when it is non-zero
 * \param fi input stream to load from
 */
inline void LoadColAccess(utils::IStream &fi) {
  const bool got_count =
      fi.Read(&num_buffered_row_, sizeof(num_buffered_row_)) != 0;
  utils::Check(got_count, "invalid input file format");
  // a zero count means no column data was stored
  if (num_buffered_row_ == 0) return;
  LoadBinary(fi, &col_ptr_, &col_data_);
}
/*!
 * \brief save a compressed sparse structure to a binary stream
 *  layout: number of rows (ptr.size() - 1), then the whole pointer array,
 *  then the entries (the entries are skipped when data is empty)
 * \param fo output stream
 * \param ptr pointer data, must contain at least one element
 * \param data data content
 */
inline static void SaveBinary(utils::IStream &fo,
                              const std::vector<size_t> &ptr,
                              const std::vector<SparseBatch::Entry> &data) {
  // an empty pointer array would underflow ptr.size() - 1 and make &ptr[0] invalid
  utils::Check(ptr.size() != 0, "SaveBinary: pointer array must not be empty");
  size_t nrow = ptr.size() - 1;
  fo.Write(&nrow, sizeof(size_t));
  fo.Write(&ptr[0], ptr.size() * sizeof(size_t));
  if (data.size() != 0) {
    fo.Write(&data[0], data.size() * sizeof(SparseBatch::Entry));
  }
}
/*!
 * \brief load a compressed sparse structure from a binary stream
 *  expects the layout written by SaveBinary: number of rows, the pointer
 *  array with nrow + 1 elements, then out_ptr->back() entries
 * \param fi input stream
 * \param out_ptr receives the pointer data
 * \param out_data receives the data content
 */
inline static void LoadBinary(utils::IStream &fi,
                              std::vector<size_t> *out_ptr,
                              std::vector<SparseBatch::Entry> *out_data) {
  size_t nrow;
  utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format");
  out_ptr->resize(nrow + 1);
  utils::Check(fi.Read(&(*out_ptr)[0], out_ptr->size() * sizeof(size_t)) != 0,
               "invalid input file format");
  // the last pointer value is the total number of stored entries
  out_data->resize(out_ptr->back());
  if (out_data->size() != 0) {
    // a failed read is an input problem, so report it with Check like the reads above
    utils::Check(fi.Read(&(*out_data)[0], out_data->size() * sizeof(SparseBatch::Entry)) != 0,
                 "invalid input file format");
  }
}
protected:
/*!
@ -278,16 +341,15 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
&col_data_[col_ptr_[i + 1]], Entry::CmpValue);
}
}
private:
// --- data structure used to support InitColAccess --
utils::IIterator<SparseBatch> *iter_;
/*! \brief number */
size_t num_buffered_row_;
/*! \brief column pointer of CSC format */
std::vector<size_t> col_ptr_;
std::vector<size_t> col_ptr_;
/*! \brief column datas in CSC format */
std::vector<SparseBatch::Entry> col_data_;
std::vector<SparseBatch::Entry> col_data_;
};
} // namespace xgboost
#endif
#endif // XGBOOST_DATA_H

16
src/io/io.cpp Normal file
View File

@ -0,0 +1,16 @@
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#include <string>
#include "./io.h"
#include "simple_dmatrix-inl.hpp"
// implements data loads using dmatrix simple for now
namespace xgboost {
namespace io {
// builds a DMatrixSimple on the heap, fills it through CacheLoad and
// returns it to the caller as a DataMatrix pointer
DataMatrix* LoadDataMatrix(const char *fname) {
  DMatrixSimple *loaded = new DMatrixSimple();
  loaded->CacheLoad(fname);
  return loaded;
}
} // namespace io
} // namespace xgboost

34
src/io/io.h Normal file
View File

@ -0,0 +1,34 @@
#ifndef XGBOOST_IO_IO_H_
#define XGBOOST_IO_IO_H_
/*!
* \file io.h
* \brief handles input data format of xgboost
* I/O module handles a specific DMatrix format
* \author Tianqi Chen
*/
#include "../data.h"
#include "../learner/dmatrix.h"
namespace xgboost {
/*! \brief namespace related to data format */
namespace io {
/*! \brief DMatrix object that I/O module support save/load */
typedef learner::DMatrix<FMatrixS> DataMatrix;
/*!
* \brief load a DataMatrix from a file
* \param fname name of the file to be loaded
* \return the loaded DataMatrix
*/
DataMatrix* LoadDataMatrix(const char *fname);
/*!
* \brief save a DataMatrix to a file,
* note: the saved format may not be exactly the same as the input format;
* SaveDMatrix will choose the best way to materialize the dmatrix.
* \param dmat the dmatrix to be saved
* \param fname name of the file to be saved
*/
void SaveDMatrix(const DataMatrix &dmat, const char *fname);
} // namespace io
} // namespace xgboost
#endif // XGBOOST_IO_IO_H_

View File

@ -0,0 +1,216 @@
#ifndef XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_
#define XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_
/*!
* \file simple_dmatrix-inl.hpp
* \brief simple in-memory implementation of DataMatrix (DMatrixSimple).
* The data format of xgboost is templatized, which means it can accept
* any data structure that implements the functions defined by FMatrix.
* This file provides one specific implementation of the input data structure that can be used by BoostLearner.
* \author Tianqi Chen
*/
#include <string>
#include <cstring>
#include <vector>
#include <algorithm>
#include "../data.h"
#include "../utils/utils.h"
#include "../learner/dmatrix.h"
#include "./io.h"
namespace xgboost {
namespace io {
/*!
 * \brief implementation of DataMatrix that keeps all rows in memory in CSR format
 *  rows live in row_ptr_ / row_data_ and are exposed to fmat through a
 *  row iterator that yields the whole matrix as one batch
 */
class DMatrixSimple : public DataMatrix {
 public:
  /*! \brief constructor: hands a one-batch row iterator to fmat and resets the storage */
  DMatrixSimple(void) {
    this->fmat.set_iter(new OneBatchIter(this));
    this->Clear();
  }
  // virtual destructor
  virtual ~DMatrixSimple(void) {}
  /*! \brief clear the storage; afterwards row_ptr_ holds the single element 0 */
  inline void Clear(void) {
    row_ptr_.clear();
    row_ptr_.push_back(0);
    row_data_.clear();
    info.Clear();
  }
  /*!
   * \brief add a row to the matrix
   *  info.num_col grows to cover the largest feature index seen and
   *  info.num_row is incremented by one
   * \param feats features
   * \return the index of added row
   */
  inline size_t AddRow(const std::vector<SparseBatch::Entry> &feats) {
    for (size_t i = 0; i < feats.size(); ++i) {
      row_data_.push_back(feats[i]);
      info.num_col = std::max(info.num_col, static_cast<size_t>(feats[i].findex+1));
    }
    row_ptr_.push_back(row_ptr_.back() + feats.size());
    info.num_row += 1;
    return row_ptr_.size() - 2;
  }
  /*!
   * \brief load from text file
   *  the file is read as whitespace separated tokens: a token of the form
   *  index:value is a feature of the current row, any other token must parse
   *  as a float and starts a new row with that label.
   *  fname + ".group" and fname + ".weight" are loaded as well when present
   * \param fname name of text data
   * \param silent whether print information or not
   */
  inline void LoadText(const char* fname, bool silent = false) {
    this->Clear();
    FILE* file = utils::FopenCheck(fname, "r");
    // label of the row being assembled; only meaningful once has_label is true
    float label = 0.0f;
    bool has_label = false;
    char tmp[1024];
    std::vector<SparseBatch::Entry> feats;
    // the field width keeps fscanf from writing past the end of tmp
    while (fscanf(file, "%1023s", tmp) == 1) {
      SparseBatch::Entry e;
      if (sscanf(tmp, "%u:%f", &e.findex, &e.fvalue) == 2) {
        feats.push_back(e);
      } else {
        // a new label begins: flush the row collected so far, if any
        if (has_label) {
          info.labels.push_back(label);
          this->AddRow(feats);
        }
        feats.clear();
        utils::Check(sscanf(tmp, "%f", &label) == 1, "invalid LibSVM format");
        has_label = true;
      }
    }
    // flush the last row; skipped when no label was ever read so that an
    // empty file does not produce a row with an indeterminate label
    if (has_label) {
      info.labels.push_back(label);
      this->AddRow(feats);
    }
    if (!silent) {
      // size_t is not unsigned long on every platform; cast to match %lu
      printf("%lux%lu matrix with %lu entries is loaded from %s\n",
             static_cast<unsigned long>(info.num_row),
             static_cast<unsigned long>(info.num_col),
             static_cast<unsigned long>(row_data_.size()), fname);
    }
    fclose(file);
    // try to load in additional file
    std::string name = fname;
    std::string gname = name + ".group";
    if (info.TryLoadGroup(gname.c_str(), silent)) {
      utils::Check(info.group_ptr.back() == info.num_row,
                   "DMatrix: group data does not match the number of rows in features");
    }
    std::string wname = name + ".weight";
    if (info.TryLoadWeight(wname.c_str(), silent)) {
      utils::Check(info.weights.size() == info.num_row,
                   "DMatrix: weight data does not match the number of rows in features");
    }
  }
  /*!
   * \brief load from binary file
   *  reads, in order: the magic number, the meta info, the CSR row data and
   *  the column access data
   * \param fname name of binary data
   * \param silent whether print information or not
   * \return whether loading is success; false means the file could not be opened
   */
  inline bool LoadBinary(const char* fname, bool silent = false) {
    FILE *fp = fopen64(fname, "rb");
    if (fp == NULL) return false;
    utils::FileStream fs(fp);
    int magic;
    utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
    utils::Check(magic == kMagic, "invalid format,magic number mismatch");
    info.LoadBinary(fs);
    FMatrixS::LoadBinary(fs, &row_ptr_, &row_data_);
    fmat.LoadColAccess(fs);
    fs.Close();
    if (!silent) {
      // size_t is not unsigned long on every platform; cast to match %lu
      printf("%lux%lu matrix with %lu entries is loaded from %s\n",
             static_cast<unsigned long>(info.num_row),
             static_cast<unsigned long>(info.num_col),
             static_cast<unsigned long>(row_data_.size()), fname);
      if (info.group_ptr.size() != 0) {
        printf("data contains %u groups\n",
               static_cast<unsigned>(info.group_ptr.size()) - 1);
      }
    }
    return true;
  }
  /*!
   * \brief save to binary file
   *  writes, in order: the magic number, the meta info, the CSR row data and
   *  the column access data
   * \param fname name of binary data
   * \param silent whether print information or not
   */
  inline void SaveBinary(const char* fname, bool silent = false) {
    utils::FileStream fs(utils::FopenCheck(fname, "wb"));
    int magic = kMagic;
    fs.Write(&magic, sizeof(magic));
    info.SaveBinary(fs);
    FMatrixS::SaveBinary(fs, row_ptr_, row_data_);
    fmat.SaveColAccess(fs);
    fs.Close();
    if (!silent) {
      // size_t is not unsigned long on every platform; cast to match %lu
      printf("%lux%lu matrix with %lu entries is saved to %s\n",
             static_cast<unsigned long>(info.num_row),
             static_cast<unsigned long>(info.num_col),
             static_cast<unsigned long>(row_data_.size()), fname);
      if (info.group_ptr.size() != 0) {
        printf("data contains %lu groups\n",
               static_cast<unsigned long>(info.group_ptr.size() - 1));
      }
    }
  }
  /*!
   * \brief cache load data given a file name
   *  if fname is a non-empty base name followed by ".buffer", it is loaded
   *  directly as binary and failure to open it is an error.
   *  otherwise fname + ".buffer" is tried as a binary buffer first; when that
   *  cannot be opened the data is loaded from the text file fname and,
   *  if requested, saved to fname + ".buffer"
   * \param fname name of the data file
   * \param silent whether print information or not
   * \param savebuffer whether do save binary buffer if it is text
   */
  inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true) {
    const char *ext = ".buffer";
    const size_t ext_len = strlen(ext);
    const size_t len = strlen(fname);
    // suffix test: any name strictly longer than the extension that ends with it
    if (len > ext_len && !strcmp(fname + len - ext_len, ext)) {
      if (!this->LoadBinary(fname, silent)) {
        utils::Error("can not open file \"%s\"", fname);
      }
      return;
    }
    // std::string avoids truncating long paths in a fixed-size buffer
    const std::string bname = std::string(fname) + ext;
    if (!this->LoadBinary(bname.c_str(), silent)) {
      this->LoadText(fname, silent);
      if (savebuffer) this->SaveBinary(bname.c_str(), silent);
    }
  }
  // data fields
  /*! \brief row pointer of CSR sparse storage */
  std::vector<size_t> row_ptr_;
  /*! \brief data in the row */
  std::vector<SparseBatch::Entry> row_data_;
  /*! \brief magic number used to identify DMatrix */
  static const int kMagic = 0xff01;

 protected:
  // one batch iterator that returns the whole content of the matrix as a single batch
  struct OneBatchIter: utils::IIterator<SparseBatch> {
    explicit OneBatchIter(DMatrixSimple *parent)
        : at_first_(true), parent_(parent) {}
    virtual ~OneBatchIter(void) {}
    virtual void BeforeFirst(void) {
      at_first_ = true;
    }
    // returns true exactly once after each BeforeFirst, pointing batch_ at
    // the parent's CSR storage
    virtual bool Next(void) {
      if (!at_first_) return false;
      at_first_ = false;
      batch_.size = parent_->row_ptr_.size() - 1;
      batch_.base_rowid = 0;
      batch_.row_ptr = &parent_->row_ptr_[0];
      // indexing element 0 of an empty vector is undefined, so hand out NULL
      // when the matrix holds no entries
      batch_.data_ptr = parent_->row_data_.empty() ? NULL : &parent_->row_data_[0];
      return true;
    }
    virtual const SparseBatch &Value(void) const {
      return batch_;
    }

   private:
    // whether the iterator is positioned before the single batch
    bool at_first_;
    // pointer to the matrix that owns the data
    DMatrixSimple *parent_;
    // temporal space for batch
    SparseBatch batch_;
  };
};
} // namespace io
} // namespace xgboost
#endif  // XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_

View File

@ -10,10 +10,14 @@
namespace xgboost {
namespace learner {
/*!
/*!
* \brief meta information needed in training, including label, weight
*/
struct MetaInfo {
/*! \brief number of rows in the data */
size_t num_row;
/*! \brief number of columns in the data */
size_t num_col;
/*! \brief label of each instance */
std::vector<float> labels;
/*!
@ -28,6 +32,15 @@ struct MetaInfo {
* can be used for multi task setting
*/
std::vector<unsigned> root_index;
MetaInfo(void) : num_row(0), num_col(0) {}
/*! \brief reset the meta information: zero the dimensions and empty every per-instance array */
inline void Clear(void) {
  num_row = 0;
  num_col = 0;
  labels.clear();
  weights.clear();
  group_ptr.clear();
  root_index.clear();
}
/*! \brief get weight of each instances */
inline float GetWeight(size_t i) const {
if(weights.size() != 0) {
@ -45,20 +58,53 @@ struct MetaInfo {
}
}
/*!
* \brief write the meta information to a binary stream
*  the fields are written in a fixed order: num_row, num_col, labels,
*  group_ptr, weights, root_index; LoadBinary reads them back in the same order
* \param fo output stream
*/
inline void SaveBinary(utils::IStream &fo) {
fo.Write(&num_row, sizeof(num_row));
fo.Write(&num_col, sizeof(num_col));
fo.Write(labels);
fo.Write(group_ptr);
fo.Write(weights);
fo.Write(root_index);
}
/*!
* \brief read the meta information from a binary stream
*  the fields are read in the order SaveBinary writes them: num_row, num_col,
*  labels, group_ptr, weights, root_index; each read is validated with
*  utils::Check using the message "MetaInfo: invalid format"
* \param fi input stream
*/
inline void LoadBinary(utils::IStream &fi) {
utils::Check(fi.Read(&num_row, sizeof(num_row)), "MetaInfo: invalid format");
utils::Check(fi.Read(&num_col, sizeof(num_col)), "MetaInfo: invalid format");
utils::Check(fi.Read(&labels), "MetaInfo: invalid format");
utils::Check(fi.Read(&group_ptr), "MetaInfo: invalid format");
utils::Check(fi.Read(&weights), "MetaInfo: invalid format");
utils::Check(fi.Read(&root_index), "MetaInfo: invalid format");
}
/*!
 * \brief try to load group information from file, if exists
 *  every unsigned integer read from the file is taken as the size of one
 *  group; group_ptr receives the running offsets, starting with 0
 * \param fname name of the group file
 * \param silent whether print information or not
 * \return false when the file cannot be opened (group_ptr is left untouched),
 *  true otherwise
 */
inline bool TryLoadGroup(const char* fname, bool silent = false) {
  FILE *fi = fopen64(fname, "r");
  if (fi == NULL) return false;
  // start from a clean array so that loading again replaces the previous
  // groups instead of appending a second run of offsets after them
  group_ptr.clear();
  group_ptr.push_back(0);
  unsigned nline;
  while (fscanf(fi, "%u", &nline) == 1) {
    group_ptr.push_back(group_ptr.back() + nline);
  }
  if (!silent) {
    // size_t is not unsigned long on every platform; cast to match %lu
    printf("%lu groups are loaded from %s\n",
           static_cast<unsigned long>(group_ptr.size() - 1), fname);
  }
  fclose(fi);
  return true;
}
/*!
 * \brief try to load weight information from file, if exists
 *  every float that can be read from the file is appended to weights
 * \param fname name of the weight file
 * \param silent whether print information or not
 * \return false when the file cannot be opened, true otherwise
 */
inline bool TryLoadWeight(const char* fname, bool silent = false) {
  FILE *fin = fopen64(fname, "r");
  if (fin == NULL) return false;
  // keep reading until fscanf fails to parse another float
  for (float w; fscanf(fin, "%f", &w) == 1; ) {
    weights.push_back(w);
  }
  if (!silent) printf("loading weight from %s\n", fname);
  fclose(fin);
  return true;
}
};
/*!
/*!
* \brief data object used for learning,
* \tparam FMatrix type of feature data source
*/
@ -66,8 +112,6 @@ template<typename FMatrix>
struct DMatrix {
/*! \brief meta information about the dataset */
MetaInfo info;
/*! \brief number of rows in the DMatrix */
size_t num_row;
/*! \brief feature matrix about data content */
FMatrix fmat;
/*!
@ -77,6 +121,8 @@ struct DMatrix {
void *cache_learner_ptr_;
/*! \brief default constructor */
DMatrix(void) : cache_learner_ptr_(NULL) {}
// virtual destructor
virtual ~DMatrix(void){}
};
} // namespace learner

View File

@ -55,9 +55,9 @@ class BoostLearner {
if (dupilicate) continue;
// set mats[i]'s cache learner pointer to this
mats[i]->cache_learner_ptr_ = this;
cache_.push_back(CacheEntry(mats[i], buffer_size, mats[i]->num_row));
buffer_size += mats[i]->num_row;
num_feature = std::max(num_feature, static_cast<unsigned>(mats[i]->num_col));
cache_.push_back(CacheEntry(mats[i], buffer_size, mats[i]->info.num_row));
buffer_size += mats[i]->info.num_row;
num_feature = std::max(num_feature, static_cast<unsigned>(mats[i]->info.num_col));
}
char str_temp[25];
if (num_feature > mparam.num_feature) {

View File

@ -42,7 +42,9 @@ class IStream {
inline void Write(const std::vector<T> &vec) {
uint64_t sz = vec.size();
this->Write(&sz, sizeof(sz));
this->Write(&vec[0], sizeof(T) * sz);
if (sz != 0) {
this->Write(&vec[0], sizeof(T) * sz);
}
}
/*!
* \brief binary load a vector
@ -54,7 +56,9 @@ class IStream {
uint64_t sz;
if (this->Read(&sz, sizeof(sz)) == 0) return false;
out_vec->resize(sz);
if (this->Read(&(*out_vec)[0], sizeof(T) * sz) == 0) return false;
if (sz != 0) {
if (this->Read(&(*out_vec)[0], sizeof(T) * sz) == 0) return false;
}
return true;
}
/*!
@ -64,7 +68,9 @@ class IStream {
inline void Write(const std::string &str) {
uint64_t sz = str.length();
this->Write(&sz, sizeof(sz));
this->Write(&str[0], sizeof(char) * sz);
if (sz != 0) {
this->Write(&str[0], sizeof(char) * sz);
}
}
/*!
* \brief binary load a string
@ -75,7 +81,9 @@ class IStream {
uint64_t sz;
if (this->Read(&sz, sizeof(sz)) == 0) return false;
out_str->resize(sz);
if (this->Read(&(*out_str)[0], sizeof(char) * sz) == 0) return false;
if (sz != 0) {
if (this->Read(&(*out_str)[0], sizeof(char) * sz) == 0) return false;
}
return true;
}
};

View File

@ -18,11 +18,11 @@ class IIterator {
/*!
* \brief set the parameter
* \param name name of parameter
* \param val value of parameter
* \param val value of parameter
*/
virtual void SetParam(const char *name, const char *val) = 0;
virtual void SetParam(const char *name, const char *val) {}
/*! \brief initalize the iterator so that we can use the iterator */
virtual void Init(void) = 0;
virtual void Init(void) {}
/*! \brief set before first of the item */
virtual void BeforeFirst(void) = 0;
/*! \brief move to next item */