diff --git a/Makefile b/Makefile
index 61d3f6de0..a8eb89c12 100644
--- a/Makefile
+++ b/Makefile
@@ -4,14 +4,14 @@ export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas
 # specify tensor path
 BIN = xgunity.exe
-OBJ =
+OBJ = io.o
 .PHONY: clean all
 
 all: $(BIN) $(OBJ)
 
 export LDFLAGS= -pthread -lm
 
 xgunity.exe: src/xgunity.cpp
-
+io.o: src/io/io.cpp
 
 $(BIN) :
	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
@@ -23,4 +23,5 @@ install:
	cp -f -r $(BIN) $(INSTALL_PATH)
 
 clean:
-	$(RM) $(OBJ) $(BIN) *~ */*~
+	$(RM) $(OBJ) $(BIN) *~ */*~ */*/*~
+
diff --git a/README.md b/README.md
index 9a0a77a17..f5b64b78a 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,8 @@ Build
 
 Project Logical Layout
 =======
-* Dependency order: learner->gbm->tree
+* Dependency order: io->learner->gbm->tree
+  - All modules depend on data.h
 * tree are implementations of tree construction algorithms.
 * gbm is gradient boosting interface, that takes trees and other base learner to do boosting.
   - gbm only takes gradient as sufficient statistics, it does not compute the gradient.
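The new dependency order is easiest to read bottom-up from a caller. The sketch below is not part of the patch; the driver, command-line handling, file name and include path are illustrative only, but the types and calls (io::LoadDataMatrix, io::DataMatrix, MetaInfo) are the ones this diff introduces.

```cpp
// Hypothetical driver: the io layer materializes a learner::DMatrix
// (aliased as io::DataMatrix); everything below it only sees data.h types.
#include <cstdio>
#include "io/io.h"

int main(int argc, char *argv[]) {
  if (argc < 2) return 1;
  // io: parse LibSVM text or a .buffer binary into an in-memory matrix
  xgboost::io::DataMatrix *dmat = xgboost::io::LoadDataMatrix(argv[1]);
  // learner: meta information now lives in dmat->info
  printf("loaded %lux%lu matrix\n", dmat->info.num_row, dmat->info.num_col);
  delete dmat;  // safe: DMatrix gains a virtual destructor in this patch
  return 0;
}
```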
diff --git a/src/data.h b/src/data.h
index 468482446..e37565a20 100644
--- a/src/data.h
+++ b/src/data.h
@@ -1,5 +1,5 @@
-#ifndef XGBOOST_UNITY_DATA_H
-#define XGBOOST_UNITY_DATA_H
+#ifndef XGBOOST_DATA_H
+#define XGBOOST_DATA_H
 /*!
  * \file data.h
  * \brief the input data structure for gradient boosting
@@ -8,6 +8,7 @@
 #include <cstdio>
 #include <vector>
 #include <limits>
+#include <climits>
 #include <cstring>
 #include "utils/io.h"
 #include "utils/utils.h"
@@ -27,7 +28,7 @@ const float rt_eps = 1e-5f;
 const float rt_2eps = rt_eps * 2.0f;
 
 /*! \brief gradient statistics pair usually needed in gradient boosting */
-struct bst_gpair{
+struct bst_gpair {
   /*! \brief gradient statistics */
   bst_float grad;
   /*! \brief second order gradient statistics */
@@ -139,7 +140,7 @@ class FMatrixInterface {
   */
  inline float GetColDensity(size_t cidx) const;
  /*! \brief get the row iterator associated with FMatrix */
- virtual utils::IIterator<SparseBatch>* RowIterator(void) const = 0;
+ inline utils::IIterator<SparseBatch>* RowIterator(void) const;
};
/*!
 *
@@ -180,11 +181,13 @@ class FMatrixS : public FMatrixInterface{
     }
   };
   /*! \brief constructor */
-  explicit FMatrixS(utils::IIterator<SparseBatch> *base_iter)
-      : iter_(base_iter) {}
+  FMatrixS(void) {
+    iter_ = NULL;
+    num_buffered_row_ = 0;
+  }
   // destructor
-  virtual ~FMatrixS(void) {
-    delete iter_;
+  ~FMatrixS(void) {
+    if (iter_ != NULL) delete iter_;
   }
   /*! \return whether column access is enabled */
   inline bool HaveColAccess(void) const {
@@ -219,15 +222,75 @@
     size_t nmiss = num_buffered_row_ - (col_ptr_[cidx+1] - col_ptr_[cidx]);
     return 1.0f - (static_cast<float>(nmiss)) / num_buffered_row_;
   }
-  virtual void InitColAccess(void) {
+  inline void InitColAccess(size_t max_nrow = ULONG_MAX) {
     if (this->HaveColAccess()) return;
-    const size_t max_nrow = std::numeric_limits<size_t>::max();
     this->InitColData(max_nrow);
   }
   /*! \brief get the row iterator associated with FMatrix */
-  virtual utils::IIterator<SparseBatch>* RowIterator(void) const {
+  inline utils::IIterator<SparseBatch>* RowIterator(void) const {
     return iter_;
   }
+  /*! \brief set iterator */
+  inline void set_iter(utils::IIterator<SparseBatch> *iter) {
+    this->iter_ = iter;
+  }
+  /*!
+   * \brief save column access data into stream
+   * \param fo output stream to save to
+   */
+  inline void SaveColAccess(utils::IStream &fo) {
+    fo.Write(&num_buffered_row_, sizeof(num_buffered_row_));
+    if (num_buffered_row_ != 0) {
+      SaveBinary(fo, col_ptr_, col_data_);
+    }
+  }
+  /*!
+   * \brief load column access data from stream
+   * \param fi input stream to load from
+   */
+  inline void LoadColAccess(utils::IStream &fi) {
+    utils::Check(fi.Read(&num_buffered_row_, sizeof(num_buffered_row_)) != 0,
+                 "invalid input file format");
+    if (num_buffered_row_ != 0) {
+      LoadBinary(fi, &col_ptr_, &col_data_);
+    }
+  }
+  /*!
+   * \brief save data to binary stream
+   * \param fo output stream
+   * \param ptr pointer data
+   * \param data data content
+   */
+  inline static void SaveBinary(utils::IStream &fo,
+                                const std::vector<size_t> &ptr,
+                                const std::vector<SparseBatch::Entry> &data) {
+    size_t nrow = ptr.size() - 1;
+    fo.Write(&nrow, sizeof(size_t));
+    fo.Write(&ptr[0], ptr.size() * sizeof(size_t));
+    if (data.size() != 0) {
+      fo.Write(&data[0], data.size() * sizeof(SparseBatch::Entry));
+    }
+  }
+  /*!
+   * \brief load data from binary stream
+   * \param fi input stream
+   * \param out_ptr pointer data
+   * \param out_data data content
+   */
+  inline static void LoadBinary(utils::IStream &fi,
+                                std::vector<size_t> *out_ptr,
+                                std::vector<SparseBatch::Entry> *out_data) {
+    size_t nrow;
+    utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format");
+    out_ptr->resize(nrow + 1);
+    utils::Check(fi.Read(&(*out_ptr)[0], out_ptr->size() * sizeof(size_t)) != 0,
+                 "invalid input file format");
+    out_data->resize(out_ptr->back());
+    if (out_data->size() != 0) {
+      utils::Assert(fi.Read(&(*out_data)[0], out_data->size() * sizeof(SparseBatch::Entry)) != 0,
+                    "invalid input file format");
+    }
+  }
 
  protected:
  /*!
@@ -278,16 +341,15 @@ class FMatrixS : public FMatrixInterface{
                 &col_data_[col_ptr_[i + 1]], Entry::CmpValue);
     }
   }
- private:
   // --- data structure used to support InitColAccess --
   utils::IIterator<SparseBatch> *iter_;
   /*! \brief number */
   size_t num_buffered_row_;
   /*! \brief column pointer of CSC format */
-  std::vector<size_t> col_ptr_;
+  std::vector<size_t> col_ptr_;
   /*! \brief column datas in CSC format */
-  std::vector<SparseBatch::Entry> col_data_;
+  std::vector<SparseBatch::Entry> col_data_;
};
 
 } // namespace xgboost
-#endif
+#endif  // XGBOOST_DATA_H
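The static FMatrixS::SaveBinary/LoadBinary helpers serialize any ptr/data pair in the offset-plus-entries layout used for both CSR rows and CSC columns: a length count, the n+1 offsets, then the flat entry array (skipped when empty). A minimal round-trip sketch, not part of the patch; the file name and include paths are placeholders:

```cpp
#include <vector>
#include "data.h"
#include "utils/io.h"

void RoundTripPtrData() {
  using namespace xgboost;
  std::vector<size_t> ptr(1, 0);          // offsets always start at 0
  std::vector<SparseBatch::Entry> data;
  SparseBatch::Entry e;
  e.findex = 3; e.fvalue = 0.5f;          // hypothetical feature 3 with value 0.5
  data.push_back(e);
  ptr.push_back(data.size());             // one slice holding one entry

  utils::FileStream fo(utils::FopenCheck("tiny.bin", "wb"));
  FMatrixS::SaveBinary(fo, ptr, data);
  fo.Close();

  std::vector<size_t> ptr2;
  std::vector<SparseBatch::Entry> data2;
  utils::FileStream fi(utils::FopenCheck("tiny.bin", "rb"));
  FMatrixS::LoadBinary(fi, &ptr2, &data2);  // ptr2 == ptr, data2 == data
  fi.Close();
}
```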
diff --git a/src/io/io.cpp b/src/io/io.cpp
new file mode 100644
index 000000000..93d91a61c
--- /dev/null
+++ b/src/io/io.cpp
@@ -0,0 +1,16 @@
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE
+#include <string>
+#include "./io.h"
+#include "simple_dmatrix-inl.hpp"
+// implements data loading using DMatrixSimple for now
+
+namespace xgboost {
+namespace io {
+DataMatrix* LoadDataMatrix(const char *fname) {
+  DMatrixSimple *dmat = new DMatrixSimple();
+  dmat->CacheLoad(fname);
+  return dmat;
+}
+}  // namespace io
+}  // namespace xgboost
diff --git a/src/io/io.h b/src/io/io.h
new file mode 100644
index 000000000..81f89de89
--- /dev/null
+++ b/src/io/io.h
@@ -0,0 +1,34 @@
+#ifndef XGBOOST_IO_IO_H_
+#define XGBOOST_IO_IO_H_
+/*!
+ * \file io.h
+ * \brief handles the input data formats of xgboost;
+ *   the I/O module handles a specific DMatrix format
+ * \author Tianqi Chen
+ */
+#include "../data.h"
+#include "../learner/dmatrix.h"
+
+namespace xgboost {
+/*! \brief namespace related to data format */
+namespace io {
+/*! \brief the DMatrix type that the I/O module supports for save/load */
+typedef learner::DMatrix<FMatrixS> DataMatrix;
+/*!
+ * \brief load a DataMatrix from file
+ * \param fname file name to be loaded
+ * \return a loaded DMatrix
+ */
+DataMatrix* LoadDataMatrix(const char *fname);
+/*!
+ * \brief save a DataMatrix into file;
+ *   note: the saved dmatrix format may not be exactly the same as the input,
+ *   SaveDMatrix will choose the best way to materialize the dmatrix
+ * \param dmat the dmatrix to be saved
+ * \param fname file name to save to
+ */
+void SaveDMatrix(const DataMatrix &dmat, const char *fname);
+
+}  // namespace io
+}  // namespace xgboost
+#endif  // XGBOOST_IO_IO_H_
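io.cpp only defines LoadDataMatrix; SaveDMatrix is declared but left for a later change. One possible shape for it, assembled purely from pieces visible in this patch (the FMatrixS row iterator, the SparseBatch fields used by OneBatchIter in simple_dmatrix-inl.hpp, and DMatrixSimple::AddRow/SaveBinary). This is a sketch of the idea, not the implementation the patch ships.

```cpp
#include <vector>
#include "./io.h"
#include "simple_dmatrix-inl.hpp"

namespace xgboost {
namespace io {
void SaveDMatrix(const DataMatrix &dmat, const char *fname) {
  DMatrixSimple tmp;
  utils::IIterator<SparseBatch> *iter = dmat.fmat.RowIterator();
  iter->BeforeFirst();
  while (iter->Next()) {
    const SparseBatch &batch = iter->Value();
    for (size_t i = 0; i < batch.size; ++i) {
      // copy one row out of the batch and append it to the temporary matrix
      std::vector<SparseBatch::Entry> feats(batch.data_ptr + batch.row_ptr[i],
                                            batch.data_ptr + batch.row_ptr[i + 1]);
      tmp.AddRow(feats);
    }
  }
  tmp.info = dmat.info;   // keep labels, weights, groups and the row/col counts
  tmp.SaveBinary(fname);  // writes the same magic-tagged binary format CacheLoad reads
}
}  // namespace io
}  // namespace xgboost
```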
diff --git a/src/io/simple_dmatrix-inl.hpp b/src/io/simple_dmatrix-inl.hpp
new file mode 100644
index 000000000..5da6d1c0b
--- /dev/null
+++ b/src/io/simple_dmatrix-inl.hpp
@@ -0,0 +1,216 @@
+#ifndef XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_
+#define XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_
+/*!
+ * \file simple_dmatrix-inl.hpp
+ * \brief simple implementation of DMatrixS that can be used.
+ *   The data format of xgboost is templatized, which means it can accept
+ *   any data structure that implements the functions defined by FMatrix;
+ *   this file is a specific implementation of the input data structure that can be used by BoostLearner
+ * \author Tianqi Chen
+ */
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <vector>
+#include "../data.h"
+#include "../utils/utils.h"
+#include "../learner/dmatrix.h"
+#include "./io.h"
+
+namespace xgboost {
+namespace io {
+/*! \brief implementation of DataMatrix, in CSR format */
+class DMatrixSimple : public DataMatrix {
+ public:
+  // constructor
+  DMatrixSimple(void) {
+    this->fmat.set_iter(new OneBatchIter(this));
+    this->Clear();
+  }
+  // virtual destructor
+  virtual ~DMatrixSimple(void) {}
+  /*! \brief clear the storage */
+  inline void Clear(void) {
+    row_ptr_.clear();
+    row_ptr_.push_back(0);
+    row_data_.clear();
+    info.Clear();
+  }
+  /*!
+   * \brief add a row to the matrix
+   * \param feats features
+   * \return the index of the added row
+   */
+  inline size_t AddRow(const std::vector<SparseBatch::Entry> &feats) {
+    for (size_t i = 0; i < feats.size(); ++i) {
+      row_data_.push_back(feats[i]);
+      info.num_col = std::max(info.num_col, static_cast<size_t>(feats[i].findex+1));
+    }
+    row_ptr_.push_back(row_ptr_.back() + feats.size());
+    info.num_row += 1;
+    return row_ptr_.size() - 2;
+  }
+  /*!
+   * \brief load from text file
+   * \param fname name of text data
+   * \param silent whether to print information or not
+   */
+  inline void LoadText(const char* fname, bool silent = false) {
+    this->Clear();
+    FILE* file = utils::FopenCheck(fname, "r");
+    float label; bool init = true;
+    char tmp[1024];
+    std::vector<SparseBatch::Entry> feats;
+    while (fscanf(file, "%s", tmp) == 1) {
+      SparseBatch::Entry e;
+      if (sscanf(tmp, "%u:%f", &e.findex, &e.fvalue) == 2) {
+        feats.push_back(e);
+      } else {
+        if (!init) {
+          info.labels.push_back(label);
+          this->AddRow(feats);
+        }
+        feats.clear();
+        utils::Check(sscanf(tmp, "%f", &label) == 1, "invalid LibSVM format");
+        init = false;
+      }
+    }
+
+    info.labels.push_back(label);
+    this->AddRow(feats);
+
+    if (!silent) {
+      printf("%lux%lu matrix with %lu entries is loaded from %s\n",
+             info.num_row, info.num_col, row_data_.size(), fname);
+    }
+    fclose(file);
+    // try to load in additional files
+    std::string name = fname;
+    std::string gname = name + ".group";
+    if (info.TryLoadGroup(gname.c_str(), silent)) {
+      utils::Check(info.group_ptr.back() == info.num_row,
+                   "DMatrix: group data does not match the number of rows in features");
+    }
+    std::string wname = name + ".weight";
+    if (info.TryLoadWeight(wname.c_str(), silent)) {
+      utils::Check(info.weights.size() == info.num_row,
+                   "DMatrix: weight data does not match the number of rows in features");
+    }
+  }
+  /*!
+   * \brief load from binary file
+   * \param fname name of binary data
+   * \param silent whether to print information or not
+   * \return whether loading succeeded
+   */
+  inline bool LoadBinary(const char* fname, bool silent = false) {
+    FILE *fp = fopen64(fname, "rb");
+    if (fp == NULL) return false;
+    utils::FileStream fs(fp);
+    int magic;
+    utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
+    utils::Check(magic == kMagic, "invalid format, magic number mismatch");
+
+    info.LoadBinary(fs);
+    FMatrixS::LoadBinary(fs, &row_ptr_, &row_data_);
+    fmat.LoadColAccess(fs);
+    fs.Close();
+
+    if (!silent) {
+      printf("%lux%lu matrix with %lu entries is loaded from %s\n",
+             info.num_row, info.num_col, row_data_.size(), fname);
+      if (info.group_ptr.size() != 0) {
+        printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1);
+      }
+    }
+    return true;
+  }
+  /*!
+   * \brief save to binary file
+   * \param fname name of binary data
+   * \param silent whether to print information or not
+   */
+  inline void SaveBinary(const char* fname, bool silent = false) {
+    utils::FileStream fs(utils::FopenCheck(fname, "wb"));
+    int magic = kMagic;
+    fs.Write(&magic, sizeof(magic));
+
+    info.SaveBinary(fs);
+    FMatrixS::SaveBinary(fs, row_ptr_, row_data_);
+    fmat.SaveColAccess(fs);
+    fs.Close();
+
+    if (!silent) {
+      printf("%lux%lu matrix with %lu entries is saved to %s\n",
+             info.num_row, info.num_col, row_data_.size(), fname);
+      if (info.group_ptr.size() != 0) {
+        printf("data contains %lu groups\n", info.group_ptr.size()-1);
+      }
+    }
+  }
+  /*!
+   * \brief cache-load data given a file name: if the file name ends with .buffer, load the binary directly;
+   *   otherwise the function first checks whether fname + '.buffer' exists,
+   *   reads from the binary buffer if it does, and otherwise loads from the text file
+   *   and tries to create a buffer file
+   * \param fname name of the data file
+   * \param silent whether to print information or not
+   * \param savebuffer whether to save a binary buffer if the input is text
+   */
+  inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true) {
+    int len = strlen(fname);
+    if (len > 8 && !strcmp(fname + len - 7, ".buffer")) {
+      if (!this->LoadBinary(fname, silent)) {
+        utils::Error("can not open file \"%s\"", fname);
+      }
+      return;
+    }
+    char bname[1024];
+    snprintf(bname, sizeof(bname), "%s.buffer", fname);
+    if (!this->LoadBinary(bname, silent)) {
+      this->LoadText(fname, silent);
+      if (savebuffer) this->SaveBinary(bname, silent);
+    }
+  }
+  // data fields
+  /*! \brief row pointer of CSR sparse storage */
+  std::vector<size_t> row_ptr_;
+  /*! \brief data in the rows */
+  std::vector<SparseBatch::Entry> row_data_;
+  /*! \brief magic number used to identify DMatrix */
+  static const int kMagic = 0xff01;
+
+ protected:
+  // one-batch iterator that returns the content of the matrix
+  struct OneBatchIter : utils::IIterator<SparseBatch> {
+    OneBatchIter(DMatrixSimple *parent)
+        : at_first_(true), parent_(parent) {}
+    virtual ~OneBatchIter(void) {}
+    virtual void BeforeFirst(void) {
+      at_first_ = true;
+    }
+    virtual bool Next(void) {
+      if (!at_first_) return false;
+      at_first_ = false;
+      batch_.size = parent_->row_ptr_.size() - 1;
+      batch_.base_rowid = 0;
+      batch_.row_ptr = &parent_->row_ptr_[0];
+      batch_.data_ptr = &parent_->row_data_[0];
+      return true;
+    }
+    virtual const SparseBatch &Value(void) const {
+      return batch_;
+    }
+
+   private:
+    // whether the iterator is at the first batch
+    bool at_first_;
+    // pointer to parent
+    DMatrixSimple *parent_;
+    // temporary space for the batch
+    SparseBatch batch_;
+  };
+};
+}  // namespace io
+}  // namespace xgboost
+#endif  // XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_
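DMatrixSimple can also be filled programmatically instead of going through LoadText; the feature values, label and buffer name below are made up for illustration, but every call is part of the API added in this patch.

```cpp
#include <vector>
#include "io/simple_dmatrix-inl.hpp"

void BuildTinyMatrix() {
  using namespace xgboost;
  io::DMatrixSimple dmat;
  std::vector<SparseBatch::Entry> row;
  SparseBatch::Entry e;
  e.findex = 0; e.fvalue = 1.0f; row.push_back(e);
  e.findex = 4; e.fvalue = 0.5f; row.push_back(e);
  dmat.AddRow(row);                  // bumps info.num_row and info.num_col
  dmat.info.labels.push_back(1.0f);  // one label per row, mirroring LoadText
  dmat.SaveBinary("tiny.buffer");    // a later run can CacheLoad("tiny.buffer") directly
}
```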
diff --git a/src/learner/dmatrix.h b/src/learner/dmatrix.h
index 522be9b95..88a865399 100644
--- a/src/learner/dmatrix.h
+++ b/src/learner/dmatrix.h
@@ -10,10 +10,14 @@
 namespace xgboost {
 namespace learner {
 
-/*! 
+/*!
  * \brief meta information needed in training, including label, weight
 */
 struct MetaInfo {
+  /*! \brief number of rows in the data */
+  size_t num_row;
+  /*! \brief number of columns in the data */
+  size_t num_col;
   /*! \brief label of each instance */
   std::vector<float> labels;
   /*!
@@ -28,6 +32,15 @@ struct MetaInfo {
   * can be used for multi task setting
   */
  std::vector<unsigned> root_index;
+  MetaInfo(void) : num_row(0), num_col(0) {}
+  /*! \brief clear all the information */
+  inline void Clear(void) {
+    labels.clear();
+    group_ptr.clear();
+    weights.clear();
+    root_index.clear();
+    num_row = num_col = 0;
+  }
   /*! \brief get weight of each instances */
   inline float GetWeight(size_t i) const {
     if(weights.size() != 0) {
@@ -45,20 +58,53 @@ struct MetaInfo {
     }
   }
   inline void SaveBinary(utils::IStream &fo) {
+    fo.Write(&num_row, sizeof(num_row));
+    fo.Write(&num_col, sizeof(num_col));
     fo.Write(labels);
     fo.Write(group_ptr);
     fo.Write(weights);
     fo.Write(root_index);
   }
   inline void LoadBinary(utils::IStream &fi) {
+    utils::Check(fi.Read(&num_row, sizeof(num_row)), "MetaInfo: invalid format");
+    utils::Check(fi.Read(&num_col, sizeof(num_col)), "MetaInfo: invalid format");
     utils::Check(fi.Read(&labels), "MetaInfo: invalid format");
     utils::Check(fi.Read(&group_ptr), "MetaInfo: invalid format");
     utils::Check(fi.Read(&weights), "MetaInfo: invalid format");
     utils::Check(fi.Read(&root_index), "MetaInfo: invalid format");
   }
+  // try to load group information from file, if it exists
+  inline bool TryLoadGroup(const char* fname, bool silent = false) {
+    FILE *fi = fopen64(fname, "r");
+    if (fi == NULL) return false;
+    group_ptr.push_back(0);
+    unsigned nline;
+    while (fscanf(fi, "%u", &nline) == 1) {
+      group_ptr.push_back(group_ptr.back()+nline);
+    }
+    if (!silent) {
+      printf("%lu groups are loaded from %s\n", group_ptr.size()-1, fname);
+    }
+    fclose(fi);
+    return true;
+  }
+  // try to load weight information from file, if it exists
+  inline bool TryLoadWeight(const char* fname, bool silent = false) {
+    FILE *fi = fopen64(fname, "r");
+    if (fi == NULL) return false;
+    float wt;
+    while (fscanf(fi, "%f", &wt) == 1) {
+      weights.push_back(wt);
+    }
+    if (!silent) {
+      printf("loading weight from %s\n", fname);
+    }
+    fclose(fi);
+    return true;
+  }
 };
 
-/*! 
+/*!
 * \brief data object used for learning,
 * \tparam FMatrix type of feature data source
 */
@@ -66,8 +112,6 @@ template<typename FMatrix>
 struct DMatrix {
   /*! \brief meta information about the dataset */
   MetaInfo info;
-  /*! \brief number of rows in the DMatrix */
-  size_t num_row;
   /*! \brief feature matrix about data content */
   FMatrix fmat;
   /*!
@@ -77,6 +121,8 @@ struct DMatrix {
   void *cache_learner_ptr_;
   /*! \brief default constructor */
   DMatrix(void) : cache_learner_ptr_(NULL) {}
+  // virtual destructor
+  virtual ~DMatrix(void) {}
 };
 } // namespace learner
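TryLoadGroup and TryLoadWeight pick up optional side files next to the training data. Assuming a hypothetical train.txt.group that contains the counts 3, 2 and 4 (one per line), the loop accumulates them into offsets, so group_ptr becomes {0, 3, 5, 9}, and DMatrixSimple::LoadText then verifies that group_ptr.back() equals num_row. A small usage sketch; the paths are placeholders:

```cpp
#include "learner/dmatrix.h"

void LoadRankingSideFiles() {
  xgboost::learner::MetaInfo info;
  // *.group: one group size per line; *.weight: one float weight per instance
  if (info.TryLoadGroup("train.txt.group")) {
    // info.group_ptr now holds cumulative offsets, e.g. {0, 3, 5, 9}
  }
  if (info.TryLoadWeight("train.txt.weight")) {
    // GetWeight(i) now returns weights[i] instead of the default weight
  }
}
```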
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index 62f852a12..e26f6a52d 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -55,9 +55,9 @@ class BoostLearner {
       if (dupilicate) continue;
       // set mats[i]'s cache learner pointer to this
       mats[i]->cache_learner_ptr_ = this;
-      cache_.push_back(CacheEntry(mats[i], buffer_size, mats[i]->num_row));
-      buffer_size += mats[i]->num_row;
-      num_feature = std::max(num_feature, static_cast<unsigned>(mats[i]->num_col));
+      cache_.push_back(CacheEntry(mats[i], buffer_size, mats[i]->info.num_row));
+      buffer_size += mats[i]->info.num_row;
+      num_feature = std::max(num_feature, static_cast<unsigned>(mats[i]->info.num_col));
     }
     char str_temp[25];
     if (num_feature > mparam.num_feature) {
diff --git a/src/utils/io.h b/src/utils/io.h
index b52acf764..a18e6067a 100644
--- a/src/utils/io.h
+++ b/src/utils/io.h
@@ -42,7 +42,9 @@ class IStream {
   inline void Write(const std::vector<T> &vec) {
     uint64_t sz = vec.size();
     this->Write(&sz, sizeof(sz));
-    this->Write(&vec[0], sizeof(T) * sz);
+    if (sz != 0) {
+      this->Write(&vec[0], sizeof(T) * sz);
+    }
   }
   /*!
    * \brief binary load a vector
@@ -54,7 +56,9 @@
     uint64_t sz;
     if (this->Read(&sz, sizeof(sz)) == 0) return false;
     out_vec->resize(sz);
-    if (this->Read(&(*out_vec)[0], sizeof(T) * sz) == 0) return false;
+    if (sz != 0) {
+      if (this->Read(&(*out_vec)[0], sizeof(T) * sz) == 0) return false;
+    }
     return true;
   }
   /*!
@@ -64,7 +68,9 @@
   inline void Write(const std::string &str) {
     uint64_t sz = str.length();
     this->Write(&sz, sizeof(sz));
-    this->Write(&str[0], sizeof(char) * sz);
+    if (sz != 0) {
+      this->Write(&str[0], sizeof(char) * sz);
+    }
   }
   /*!
    * \brief binary load a string
@@ -75,7 +81,9 @@
     uint64_t sz;
     if (this->Read(&sz, sizeof(sz)) == 0) return false;
     out_str->resize(sz);
-    if (this->Read(&(*out_str)[0], sizeof(char) * sz) == 0) return false;
+    if (sz != 0) {
+      if (this->Read(&(*out_str)[0], sizeof(char) * sz) == 0) return false;
+    }
     return true;
   }
 };
diff --git a/src/utils/iterator.h b/src/utils/iterator.h
index 32ab64aa9..3f5b23310 100644
--- a/src/utils/iterator.h
+++ b/src/utils/iterator.h
@@ -18,11 +18,11 @@ class IIterator {
   /*!
    * \brief set the parameter
    * \param name name of parameter
-   * \param val value of parameter 
+   * \param val value of parameter
    */
-  virtual void SetParam(const char *name, const char *val) = 0;
+  virtual void SetParam(const char *name, const char *val) {}
   /*! \brief initalize the iterator so that we can use the iterator */
-  virtual void Init(void) = 0;
+  virtual void Init(void) {}
   /*! \brief set before first of the item */
   virtual void BeforeFirst(void) = 0;
   /*! \brief move to next item */
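The new sz != 0 guards in IStream matter because taking &vec[0] on an empty std::vector is undefined behaviour, and fields such as MetaInfo::group_ptr, weights and root_index are frequently empty. A hedged sketch of the round trip that now works; any concrete IStream, e.g. utils::FileStream, can stand in for the two streams:

```cpp
#include <vector>
#include "utils/io.h"

void RoundTripEmptyVector(xgboost::utils::IStream &fo, xgboost::utils::IStream &fi) {
  std::vector<float> empty_weights;  // e.g. MetaInfo::weights with no .weight side file
  fo.Write(empty_weights);           // writes only the 64-bit length 0, never touches &vec[0]
  std::vector<float> loaded;
  fi.Read(&loaded);                  // reads the length, skips the element read, leaves it empty
}
```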