From 7ff91fe5f94b8dfe6fe4a8066e41f2d1e557abde Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 27 Nov 2015 10:25:33 -0800 Subject: [PATCH] Data interface ready --- include/xgboost/base.h | 80 +++++++++++++++- include/xgboost/data.h | 206 ++++++++++++++++++++++++++++++++++++++++- old_src/data.h | 166 --------------------------------- old_src/io/io.h | 47 ---------- 4 files changed, 284 insertions(+), 215 deletions(-) delete mode 100644 old_src/data.h delete mode 100644 old_src/io/io.h diff --git a/include/xgboost/base.h b/include/xgboost/base.h index 26219bae5..b283c93cb 100644 --- a/include/xgboost/base.h +++ b/include/xgboost/base.h @@ -1,7 +1,7 @@ /*! * Copyright (c) 2015 by Contributors * \file base.h - * \brief defines configuration macros of xgboost + * \brief defines configuration macros of xgboost. */ #ifndef XGBOOST_BASE_H_ #define XGBOOST_BASE_H_ @@ -9,7 +9,85 @@ #include namespace xgboost { +/*! + * \brief unsigned integer type used in boost, + * used for feature index and row index. + */ +typedef uint32_t bst_uint; +/*! \brief float type, used for storing statistics */ +typedef float bst_float; +const float rt_eps = 1e-5f; +// min gap between feature values to allow a split happen +const float rt_2eps = rt_eps * 2.0f; +/*! \brief read-only sparse instance batch in CSR format */ +struct SparseBatch { + /*! \brief an entry of sparse vector */ + struct Entry { + /*! \brief feature index */ + bst_uint index; + /*! \brief feature value */ + bst_float fvalue; + /*! \brief default constructor */ + Entry() {} + /*! + * \brief constructor with index and value + * \param index The feature or row index. + * \param fvalue The feature value. + */ + Entry(bst_uint index, bst_float fvalue) : index(index), fvalue(fvalue) {} + /*! \brief compare entries by feature value; true when a.fvalue < b.fvalue */ + inline static bool CmpValue(const Entry &a, const Entry &b) { + return a.fvalue < b.fvalue; + } + }; + + /*! \brief an instance of sparse vector in the batch */ + struct Inst { + /*! 
\brief pointer to the elements*/ + const Entry *data; + /*! \brief length of the instance */ + bst_uint length; + /*! \brief constructor */ + Inst(const Entry *data, bst_uint length) : data(data), length(length) {} + /*! \brief get i-th pair in the sparse vector*/ + inline const Entry& operator[](size_t i) const { + return data[i]; + } + }; + + /*! \brief batch size */ + size_t size; +}; + +/*! \brief read-only row batch, used to access row continuously */ +struct RowBatch : public SparseBatch { + /*! \brief the offset of rowid of this batch */ + size_t base_rowid; + /*! \brief array[size+1], row pointer of each of the elements */ + const size_t *ind_ptr; + /*! \brief array[ind_ptr.back()], content of the sparse element */ + const Entry *data_ptr; + /*! \brief get i-th row from the batch */ + inline Inst operator[](size_t i) const { + return Inst(data_ptr + ind_ptr[i], static_cast(ind_ptr[i+1] - ind_ptr[i])); + } +}; + +/*! + * \brief read-only column batch, used to access columns, + * the columns are not required to be continuous + */ +struct ColBatch : public SparseBatch { + /*! \brief column index of each columns in the data */ + const bst_uint *col_index; + /*! \brief pointer to the column data */ + const Inst *col_data; + /*! \brief get i-th column from the batch */ + inline Inst operator[](size_t i) const { + return col_data[i]; + } +}; } // namespace xgboost #endif // XGBOOST_BASE_H_ diff --git a/include/xgboost/data.h b/include/xgboost/data.h index 29fcd4bb5..354563b5c 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -1,8 +1,212 @@ /*! * Copyright (c) 2014 by Contributors * \file data.h - * \brief the input data structure for gradient boosting + * \brief The input data structure for gradient boosting. * \author Tianqi Chen */ #ifndef XGBOOST_DATA_H_ #define XGBOOST_DATA_H_ + +#include +#include +#include +#include "./base.h" + +namespace xgboost { +// forward declare learner. +class Learner; + +/*! 
\brief data type accepted by xgboost interface */ +enum DataType { + kFloat32 = 1, + kDouble = 2, + kUInt32 = 3, + kUInt64 = 4 +}; + +/*! + * \brief Meta information about dataset, always sit in memory. + */ +struct MetaInfo { + /*! \brief number of rows in the data */ + size_t num_row; + /*! \brief number of columns in the data */ + size_t num_col; + /*! \brief label of each instance */ + std::vector labels; + /*! + * \brief specified root index of each instance, + * can be used for multi task setting + */ + std::vector root_index; + /*! + * \brief the index of begin and end of a group + * needed when the learning task is ranking. + */ + std::vector group_ptr; + /*! \brief weights of each instance, optional */ + std::vector weights; + /*! + * \brief initialized margins, + * if specified, xgboost will start from this init margin + * can be used to specify initial prediction to boost from. + */ + std::vector base_margin; + /*! \brief version flag, used to check version of this info */ + static const int kVersion = 0; + /*! \brief default constructor */ + MetaInfo() : num_row(0), num_col(0) {} + /*! + * \brief Get weight of each instances. + * \param i Instance index. + * \return The weight. + */ + inline float GetWeight(size_t i) const { + return weights.size() != 0 ? weights[i] : 1.0f; + } + /*! + * \brief Get the root index of i-th instance. + * \param i Instance index. + * \return The pre-defined root index of i-th instance. + */ + inline unsigned GetRoot(size_t i) const { + return root_index.size() != 0 ? root_index[i] : 0U; + } + /*! \brief clear all the information */ + void Clear(); + /*! + * \brief Load the Meta info from binary stream. + * \param fi The input stream + */ + void LoadBinary(dmlc::Stream *fi); + /*! + * \brief Save the Meta info to binary stream + * \param fo The output stream. + */ + void SaveBinary(dmlc::Stream *fo) const; + /*! + * \brief Set information in the meta info. + * \param key The key of the information. 
+ * \param dptr The data pointer of the source array. + * \param dtype The type of the source data. + * \param num Number of elements in the source array. + */ + void SetInfo(const char* key, const void* dptr, DataType dtype, size_t num); + /*! + * \brief Get information from meta info. + * \param key The key of the information. + * \param dptr The output data pointer of the source array. + * \param dtype The output data type of the information array. + * \param num Number of elements in the array. + */ + void GetInfo(const char* key, const void** dptr, DataType* dtype, size_t* num) const; +}; + +/*! + * \brief This is the data structure that a user can pass to DMatrix::Create + * to create a DMatrix for training, user can create this data structure + * for customized Data Loading on single machine. + */ +struct DataSource { + /*! + * \brief Used to initialize the meta information of DMatrix + * The created DMatrix can change its own info later. + */ + MetaInfo info; + /*! + * \brief Used for row based iteration of DMatrix, + */ + std::unique_ptr > row_iter; +}; + +/*! + * \brief Internal data structure used by XGBoost during training. + * There are two ways to create a customized DMatrix that reads in user defined-format. + * + * - Define a new dmlc::Parser and register by DMLC_REGISTER_DATA_PARSER; + * This works best for user defined data input source, such as data-base, filesystem. + * - Provide a DataSource, that can be passed to DMatrix::Create + * This can be used to re-use in-memory data structure into DMatrix. + */ +class DMatrix { + public: + /*! \brief meta information that is always stored in DMatrix */ + MetaInfo info; + /*! + * \brief get the row iterator, reset to beginning position + * \note Only either RowIterator or column Iterator can be active. + */ + virtual dmlc::DataIter* RowIterator() = 0; + /*!\brief get column iterator, reset to the beginning position */ + virtual dmlc::DataIter* ColIterator() = 0; + /*! 
+ * \brief get the column iterator associated with subset of column features. + * \param fset is the list of column index set that must be contained in the returning Column iterator + * \return the column iterator, initialized so that it reads the elements in fset + */ + virtual dmlc::DataIter* ColIterator(const std::vector& fset) = 0; + /*! + * \brief check if column access is supported, if not, initialize column access. + * \param enabled whether certain feature should be included in column access. + * \param subsample subsample ratio when generating column access. + * \param max_row_perbatch auxiliary information, maximum row used in each column batch. + * this is a hint information that can be ignored by the implementation. + */ + virtual void InitColAccess(const std::vector& enabled, + float subsample, + size_t max_row_perbatch) = 0; + // the following are column meta data, should be able to answer them fast. + /*! \return whether column access is enabled */ + virtual bool HaveColAccess() const = 0; + /*! \brief get number of non-missing entries in column */ + virtual size_t GetColSize(size_t cidx) const = 0; + /*! \brief get column density */ + virtual float GetColDensity(size_t cidx) const = 0; + /*! \return reference of buffered rowset, in column access */ + virtual const std::vector &buffered_rowset() const = 0; + /*! \brief virtual destructor */ + virtual ~DMatrix() {} + /*! + * \brief Save DMatrix to local file. + * The saved file only works for non-sharded dataset(single machine training). + * \param fname The file name to be saved. + * \return The created DMatrix. + */ + virtual void SaveToLocalFile(const char* fname); + /*! + * \brief Load DMatrix from URI. + * \param uri The URI of input. + * \param silent Whether print information during loading. + * \param load_row_split Flag to read in part of rows, divided among the workers in distributed mode. + * \return The created DMatrix. 
+ */ + static DMatrix* Load(const char* uri, + bool silent, + bool load_row_split); + /*! + * \brief create a new DMatrix, by wrapping a row_iterator, and meta info. + * \param source The source iterator of the data, the create function takes ownership of the source. + * \param info The meta information in the DMatrix, need to move ownership to DMatrix. + * \param cache_prefix The path to prefix of temporary cache file of the DMatrix when used in external memory mode. + * This can be nullptr for common cases, and in-memory mode will be used. + * \return a Created DMatrix. + */ + static DMatrix* Create(DataSource&& source, + const char* cache_prefix=nullptr); + /*! + * \brief Create a DMatrix by loading data from parser. + * Parser can later be deleted after the DMatrix is created. + * \param parser The input data parser + * \param cache_prefix The path to prefix of temporary cache file of the DMatrix when used in external memory mode. + * This can be nullptr for common cases, and in-memory mode will be used. + * \sa dmlc::Parser + * \note dmlc-core provides efficient distributed data parser for libsvm format. + * User can create and register customized parser to load their own format using DMLC_REGISTER_DATA_PARSER. + * See "dmlc-core/include/dmlc/data.h" for detail. + * \return A created DMatrix. + */ + static DMatrix* Create(dmlc::Parser* parser, + const char* cache_prefix=nullptr); +}; +} // namespace xgboost +#endif // XGBOOST_DATA_H_ diff --git a/old_src/data.h b/old_src/data.h deleted file mode 100644 index 9bcb84ced..000000000 --- a/old_src/data.h +++ /dev/null @@ -1,166 +0,0 @@ -/*! - * Copyright (c) 2014 by Contributors - * \file data.h - * \brief the input data structure for gradient boosting - * \author Tianqi Chen - */ -#ifndef XGBOOST_DATA_H_ -#define XGBOOST_DATA_H_ - -#include -#include -#include "utils/utils.h" -#include "utils/iterator.h" - -namespace xgboost { -/*! 
- * \brief unsigned integer type used in boost, - * used for feature index and row index - */ -typedef unsigned bst_uint; -/*! \brief float type, used for storing statistics */ -typedef float bst_float; -const float rt_eps = 1e-5f; -// min gap between feature values to allow a split happen -const float rt_2eps = rt_eps * 2.0f; - -/*! \brief gradient statistics pair usually needed in gradient boosting */ -struct bst_gpair { - /*! \brief gradient statistics */ - bst_float grad; - /*! \brief second order gradient statistics */ - bst_float hess; - bst_gpair(void) {} - bst_gpair(bst_float grad, bst_float hess) : grad(grad), hess(hess) {} -}; - -/*! - * \brief extra information that might be needed by gbm and tree module - * this information is not necessarily present, and can be empty - */ -struct BoosterInfo { - /*! \brief number of rows in the data */ - size_t num_row; - /*! \brief number of columns in the data */ - size_t num_col; - /*! - * \brief specified root index of each instance, - * can be used for multi task setting - */ - std::vector root_index; - /*! \brief set fold indicator */ - std::vector fold_index; - /*! \brief number of rows, number of columns */ - BoosterInfo(void) : num_row(0), num_col(0) { - } - /*! \brief get root of i-th instance */ - inline unsigned GetRoot(size_t i) const { - return root_index.size() == 0 ? 0 : root_index[i]; - } -}; - -/*! \brief read-only sparse instance batch in CSR format */ -struct SparseBatch { - /*! \brief an entry of sparse vector */ - struct Entry { - /*! \brief feature index */ - bst_uint index; - /*! \brief feature value */ - bst_float fvalue; - // default constructor - Entry(void) {} - Entry(bst_uint index, bst_float fvalue) : index(index), fvalue(fvalue) {} - /*! \brief reversely compare feature values */ - inline static bool CmpValue(const Entry &a, const Entry &b) { - return a.fvalue < b.fvalue; - } - }; - /*! \brief an instance of sparse vector in the batch */ - struct Inst { - /*! 
\brief pointer to the elements*/ - const Entry *data; - /*! \brief length of the instance */ - bst_uint length; - /*! \brief constructor */ - Inst(const Entry *data, bst_uint length) : data(data), length(length) {} - /*! \brief get i-th pair in the sparse vector*/ - inline const Entry& operator[](size_t i) const { - return data[i]; - } - }; - /*! \brief batch size */ - size_t size; -}; -/*! \brief read-only row batch, used to access row continuously */ -struct RowBatch : public SparseBatch { - /*! \brief the offset of rowid of this batch */ - size_t base_rowid; - /*! \brief array[size+1], row pointer of each of the elements */ - const size_t *ind_ptr; - /*! \brief array[ind_ptr.back()], content of the sparse element */ - const Entry *data_ptr; - /*! \brief get i-th row from the batch */ - inline Inst operator[](size_t i) const { - return Inst(data_ptr + ind_ptr[i], static_cast(ind_ptr[i+1] - ind_ptr[i])); - } -}; -/*! - * \brief read-only column batch, used to access columns, - * the columns are not required to be continuous - */ -struct ColBatch : public SparseBatch { - /*! \brief column index of each columns in the data */ - const bst_uint *col_index; - /*! \brief pointer to the column data */ - const Inst *col_data; - /*! \brief get i-th column from the batch */ - inline Inst operator[](size_t i) const { - return col_data[i]; - } -}; -/** - * \brief interface of feature matrix, needed for tree construction - * this interface defines two ways to access features: - * row access is defined by iterator of RowBatch - * col access is optional, checked by HaveColAccess, and defined by iterator of ColBatch - */ -class IFMatrix { - public: - // the interface only need to guarantee row iter - // column iter is active, when ColIterator is called, row_iter can be disabled - /*! \brief get the row iterator associated with FMatrix */ - virtual utils::IIterator *RowIterator(void) = 0; - /*!\brief get column iterator */ - virtual utils::IIterator *ColIterator(void) = 0; - /*! 
- * \brief get the column iterator associated with FMatrix with subset of column features - * \param fset is the list of column index set that must be contained in the returning Column iterator - * \return the column iterator, initialized so that it reads the elements in fset - */ - virtual utils::IIterator *ColIterator(const std::vector &fset) = 0; - /*! - * \brief check if column access is supported, if not, initialize column access - * \param enabled whether certain feature should be included in column access - * \param subsample subsample ratio when generating column access - * \param max_row_perbatch auxiliary information, maximum row used in each column batch - * this is a hint information that can be ignored by the implementation - */ - virtual void InitColAccess(const std::vector &enabled, - float subsample, - size_t max_row_perbatch) = 0; - // the following are column meta data, should be able to answer them fast - /*! \return whether column access is enabled */ - virtual bool HaveColAccess(void) const = 0; - /*! \return number of columns in the FMatrix */ - virtual size_t NumCol(void) const = 0; - /*! \brief get number of non-missing entries in column */ - virtual size_t GetColSize(size_t cidx) const = 0; - /*! \brief get column density */ - virtual float GetColDensity(size_t cidx) const = 0; - /*! \brief reference of buffered rowset */ - virtual const std::vector &buffered_rowset(void) const = 0; - // virtual destructor - virtual ~IFMatrix(void){} -}; -} // namespace xgboost -#endif // XGBOOST_DATA_H_ diff --git a/old_src/io/io.h b/old_src/io/io.h deleted file mode 100644 index 6ceff2698..000000000 --- a/old_src/io/io.h +++ /dev/null @@ -1,47 +0,0 @@ -/*! 
- * Copyright 2014 by Contributors - * \file io.h - * \brief handles input data format of xgboost - * I/O module handles a specific DMatrix format - * \author Tianqi Chen - */ -#ifndef XGBOOST_IO_IO_H_ -#define XGBOOST_IO_IO_H_ - -#include "../data.h" -#include "../learner/dmatrix.h" - -namespace xgboost { -/*! \brief namespace related to data format */ -namespace io { -/*! \brief DMatrix object that I/O module support save/load */ -typedef learner::DMatrix DataMatrix; -/*! - * \brief load DataMatrix from stream - * \param fname file name to be loaded - * \param silent whether print message during loading - * \param savebuffer whether temporal buffer the file if the file is in text format - * \param loadsplit whether we only load a split of input files - * such that each worker node get a split of the data - * \param cache_file name of cache_file, used by external memory version - * can be NULL, if cache_file is specified, this will be the temporal - * space that can be re-used to store intermediate data - * \return a loaded DMatrix - */ -DataMatrix* LoadDataMatrix(const char *fname, - bool silent, - bool savebuffer, - bool loadsplit, - const char *cache_file = NULL); -/*! - * \brief save DataMatrix into stream, - * note: the saved dmatrix format may not be in exactly same as input - * SaveDMatrix will choose the best way to materialize the dmatrix. - * \param dmat the dmatrix to be saved - * \param fname file name to be saved - * \param silent whether print message during saving - */ -void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent = false); -} // namespace io -} // namespace xgboost -#endif // XGBOOST_IO_IO_H_