#ifndef XGBOOST_DATA_H #define XGBOOST_DATA_H /*! * \file xgboost_data.h * \brief the input data structure for gradient boosting * \author Tianqi Chen: tianqi.tchen@gmail.com */ #include #include #include "../utils/xgboost_utils.h" #include "../utils/xgboost_stream.h" #include "../utils/xgboost_matrix_csr.h" namespace xgboost{ namespace booster{ /*! \brief interger type used in boost */ typedef int bst_int; /*! \brief unsigned interger type used in boost */ typedef unsigned bst_uint; /*! \brief float type used in boost */ typedef float bst_float; /*! \brief debug option for booster */ const bool bst_debug = false; }; }; namespace xgboost{ namespace booster{ /** * \brief This is a interface, defining the way to access features, * by column or by row. This interface is used to make implementation * of booster does not depend on how feature is stored. * * Why template instead of virtual class: for efficiency * feature matrix is going to be used by most inner loop of the algorithm * * \tparam Derived type of actual implementation * \sa FMatrixS: most of time FMatrixS is sufficient, refer to it if you find it confusing */ template struct FMatrix{ public: /*! \brief exmaple iterator over one row */ struct RowIter{ /*! * \brief move to next position * \return whether there is element in next position */ inline bool Next(void); /*! \return feature index in current position */ inline bst_uint findex(void) const; /*! \return feature value in current position */ inline bst_float fvalue(void) const; }; /*! \brief example iterator over one column */ struct ColIter{ /*! * \brief move to next position * \return whether there is element in next position */ inline bool Next(void); /*! \return row index of current position */ inline bst_uint rindex(void) const; /*! \return feature value in current position */ inline bst_float fvalue(void) const; }; /*! \brief backward iterator over column */ struct ColBackIter : public ColIter {}; public: /*! * \brief get number of rows * \return number of rows */ inline size_t NumRow(void) const; /*! * \brief get number of columns * \return number of columns */ inline size_t NumCol(void) const; /*! * \brief get row iterator * \param ridx row index * \return row iterator */ inline RowIter GetRow(size_t ridx) const; /*! * \brief get number of column groups, this ise used together with GetRow( ridx, gid ) * \return number of column group */ inline unsigned NumColGroup(void) const{ return 1; } /*! * \brief get row iterator, return iterator of specific column group * \param ridx row index * \param gid colmun group id * \return row iterator, only iterates over features of specified column group */ inline RowIter GetRow(size_t ridx, unsigned gid) const; /*! \return whether column access is enabled */ inline bool HaveColAccess(void) const; /*! * \brief get column iterator, the columns must be sorted by feature value * \param ridx column index * \return column iterator */ inline ColIter GetSortedCol(size_t ridx) const; /*! * \brief get column backward iterator, starts from biggest fvalue, and iterator back * \param ridx column index * \return reverse column iterator */ inline ColBackIter GetReverseSortedCol(size_t ridx) const; }; }; }; namespace xgboost{ namespace booster{ /*! * \brief feature matrix to store training instance, in sparse CSR format */ class FMatrixS : public FMatrix{ public: /*! \brief one entry in a row */ struct REntry{ /*! \brief feature index */ bst_uint findex; /*! \brief feature value */ bst_float fvalue; /*! \brief constructor */ REntry(void){} /*! \brief constructor */ REntry(bst_uint findex, bst_float fvalue) : findex(findex), fvalue(fvalue){} inline static bool cmp_fvalue(const REntry &a, const REntry &b){ return a.fvalue < b.fvalue; } }; /*! \brief one row of sparse feature matrix */ struct Line{ /*! \brief array of feature index */ const REntry *data_; /*! \brief size of the data */ bst_uint len; /*! \brief get k-th element */ inline const REntry& operator[](unsigned i) const{ return data_[i]; } }; /*! \brief row iterator */ struct RowIter{ const REntry *dptr_, *end_; RowIter(const REntry* dptr, const REntry* end) :dptr_(dptr), end_(end){} inline bool Next(void){ if (dptr_ == end_) return false; else{ ++dptr_; return true; } } inline bst_uint findex(void) const{ return dptr_->findex; } inline bst_float fvalue(void) const{ return dptr_->fvalue; } }; /*! \brief column iterator */ struct ColIter : public RowIter{ ColIter(const REntry* dptr, const REntry* end) :RowIter(dptr, end){} inline bst_uint rindex(void) const{ return this->findex(); } }; /*! \brief reverse column iterator */ struct ColBackIter : public ColIter{ ColBackIter(const REntry* dptr, const REntry* end) :ColIter(dptr, end){} // shadows RowIter::Next inline bool Next(void){ if (dptr_ == end_) return false; else{ --dptr_; return true; } } }; public: /*! \brief constructor */ FMatrixS(void){ this->Clear(); } /*! \brief get number of rows */ inline size_t NumRow(void) const{ return row_ptr_.size() - 1; } /*! * \brief get number of nonzero entries * \return number of nonzero entries */ inline size_t NumEntry(void) const{ return row_data_.size(); } /*! \brief clear the storage */ inline void Clear(void){ row_ptr_.clear(); row_ptr_.push_back(0); row_data_.clear(); col_ptr_.clear(); col_data_.clear(); } /*! \brief get sparse part of current row */ inline Line operator[](size_t sidx) const{ Line sp; utils::Assert(!bst_debug || sidx < this->NumRow(), "row id exceed bound"); sp.len = static_cast(row_ptr_[sidx + 1] - row_ptr_[sidx]); sp.data_ = &row_data_[row_ptr_[sidx]]; return sp; } /*! * \brief add a row to the matrix, with data stored in STL container * \param findex feature index * \param fvalue feature value * \param fstart start bound of feature * \param fend end bound range of feature * \return the row id added line */ inline size_t AddRow(const std::vector &findex, const std::vector &fvalue, unsigned fstart = 0, unsigned fend = UINT_MAX){ utils::Assert(findex.size() == fvalue.size()); unsigned cnt = 0; for (size_t i = 0; i < findex.size(); i++){ if (findex[i] < fstart || findex[i] >= fend) continue; row_data_.push_back(REntry(findex[i], fvalue[i])); cnt++; } row_ptr_.push_back(row_ptr_.back() + cnt); return row_ptr_.size() - 2; } /*! \brief get row iterator*/ inline RowIter GetRow(size_t ridx) const{ utils::Assert(!bst_debug || ridx < this->NumRow(), "row id exceed bound"); return RowIter(&row_data_[row_ptr_[ridx]] - 1, &row_data_[row_ptr_[ridx + 1]] - 1); } /*! \brief get row iterator*/ inline RowIter GetRow(size_t ridx, unsigned gid) const{ utils::Assert(gid == 0, "FMatrixS only have 1 column group"); return FMatrixS::GetRow(ridx); } public: /*! \return whether column access is enabled */ inline bool HaveColAccess(void) const{ return col_ptr_.size() != 0 && col_data_.size() == row_data_.size(); } /*! \brief get number of colmuns */ inline size_t NumCol(void) const{ utils::Assert(this->HaveColAccess()); return col_ptr_.size() - 1; } /*! \brief get col iterator*/ inline ColIter GetSortedCol(size_t cidx) const{ utils::Assert(!bst_debug || cidx < this->NumCol(), "col id exceed bound"); return ColIter(&col_data_[col_ptr_[cidx]] - 1, &col_data_[col_ptr_[cidx + 1]] - 1); } /*! \brief get col iterator */ inline ColBackIter GetReverseSortedCol(size_t cidx) const{ utils::Assert(!bst_debug || cidx < this->NumCol(), "col id exceed bound"); return ColBackIter(&col_data_[col_ptr_[cidx + 1]], &col_data_[col_ptr_[cidx]]); } /*! * \brief intialize the data so that we have both column and row major * access, call this whenever we need column access */ inline void InitData(void){ utils::SparseCSRMBuilder builder(col_ptr_, col_data_); builder.InitBudget(0); for (size_t i = 0; i < this->NumRow(); i++){ for (RowIter it = this->GetRow(i); it.Next();){ builder.AddBudget(it.findex()); } } builder.InitStorage(); for (size_t i = 0; i < this->NumRow(); i++){ for (RowIter it = this->GetRow(i); it.Next();){ builder.PushElem(it.findex(), REntry((bst_uint)i, it.fvalue())); } } // sort columns unsigned ncol = static_cast(this->NumCol()); for (unsigned i = 0; i < ncol; i++){ std::sort(&col_data_[col_ptr_[i]], &col_data_[col_ptr_[i + 1]], REntry::cmp_fvalue); } } /*! * \brief save data to binary stream * note: since we have size_t in ptr, * the function is not consistent between 64bit and 32bit machine * \param fo output stream */ inline void SaveBinary(utils::IStream &fo) const{ FMatrixS::SaveBinary(fo, row_ptr_, row_data_); int col_access = this->HaveColAccess() ? 1 : 0; fo.Write(&col_access, sizeof(int)); if (col_access != 0){ FMatrixS::SaveBinary(fo, col_ptr_, col_data_); } } /*! * \brief load data from binary stream * note: since we have size_t in ptr, * the function is not consistent between 64bit and 32bit machin * \param fi input stream */ inline void LoadBinary(utils::IStream &fi){ FMatrixS::LoadBinary(fi, row_ptr_, row_data_); int col_access; fi.Read(&col_access, sizeof(int)); if (col_access != 0){ FMatrixS::LoadBinary(fi, col_ptr_, col_data_); } } /*! * \brief load from text file * \param fi input file pointer */ inline void LoadText(FILE *fi){ this->Clear(); int ninst; while (fscanf(fi, "%d", &ninst) == 1){ std::vector findex; std::vector fvalue; while (ninst--){ unsigned index; float value; utils::Assert(fscanf(fi, "%u:%f", &index, &value) == 2, "load Text"); findex.push_back(index); fvalue.push_back(value); } this->AddRow(findex, fvalue); } // initialize column support as well this->InitData(); } private: /*! * \brief save data to binary stream * \param fo output stream * \param ptr pointer data * \param data data content */ inline static void SaveBinary(utils::IStream &fo, const std::vector &ptr, const std::vector &data){ size_t nrow = ptr.size() - 1; fo.Write(&nrow, sizeof(size_t)); fo.Write(&ptr[0], ptr.size() * sizeof(size_t)); if (data.size() != 0){ fo.Write(&data[0], data.size() * sizeof(REntry)); } } /*! * \brief load data from binary stream * \param fi input stream * \param ptr pointer data * \param data data content */ inline static void LoadBinary(utils::IStream &fi, std::vector &ptr, std::vector &data){ size_t nrow; utils::Assert(fi.Read(&nrow, sizeof(size_t)) != 0, "Load FMatrixS"); ptr.resize(nrow + 1); utils::Assert(fi.Read(&ptr[0], ptr.size() * sizeof(size_t)), "Load FMatrixS"); data.resize(ptr.back()); if (data.size() != 0){ utils::Assert(fi.Read(&data[0], data.size() * sizeof(REntry)), "Load FMatrixS"); } } protected: /*! \brief row pointer of CSR sparse storage */ std::vector row_ptr_; /*! \brief data in the row */ std::vector row_data_; /*! \brief column pointer of CSC format */ std::vector col_ptr_; /*! \brief column datas */ std::vector col_data_; }; }; }; #endif