[DATA] basic data refactor done, basic version of csr source.

This commit is contained in:
tqchen
2015-12-30 03:28:25 -08:00
parent 3d708e4788
commit 46bcba7173
7 changed files with 337 additions and 417 deletions

View File

@@ -16,78 +16,20 @@ namespace xgboost {
typedef uint32_t bst_uint;
/*! \brief float type, used for storing statistics */
typedef float bst_float;
/*! \brief base epsilon (1e-5); rt_2eps below is derived from it */
const float rt_eps = 1e-5f;
/*! \brief minimum gap between feature values for a split to be allowed (2 * rt_eps) */
const float rt_2eps = rt_eps * 2.0f;
/*! \brief read-only batch of sparse instances, stored in CSR format */
struct SparseBatch {
  /*! \brief one (index, value) element of a sparse vector */
  struct Entry {
    /*! \brief index of the feature */
    bst_uint index;
    /*! \brief value of the feature */
    bst_float fvalue;
    /*! \brief default constructor, does not assign either field */
    Entry() {}
    /*!
     * \brief construct an entry from an index and a value
     * \param index The feature or row index.
     * \param fvalue The feature value.
     */
    Entry(bst_uint index, bst_float fvalue)
        : index(index), fvalue(fvalue) {}
    /*!
     * \brief ordering predicate on feature values
     * \return true when the value of a is strictly smaller than the value of b
     */
    static bool CmpValue(const Entry& a, const Entry& b) {
      return b.fvalue > a.fvalue;
    }
  };
  /*! \brief one sparse vector of the batch: element pointer plus element count */
  struct Inst {
    /*! \brief pointer to the first element */
    const Entry* data;
    /*! \brief number of elements in the instance */
    bst_uint length;
    /*! \brief construct from an element pointer and an element count */
    Inst(const Entry* data, bst_uint length)
        : data(data), length(length) {}
    /*! \brief access the i-th element; no bounds check is performed */
    const Entry& operator[](size_t i) const {
      return *(data + i);
    }
  };
  /*! \brief batch size */
  size_t size;
};
/*! \brief read-only batch of rows, used to access rows continuously */
struct RowBatch : public SparseBatch {
  /*! \brief row id offset of this batch */
  size_t base_rowid;
  /*! \brief array[size+1], row pointer of each of the elements */
  const size_t* ind_ptr;
  /*! \brief array[ind_ptr.back()], the sparse elements themselves */
  const Entry* data_ptr;
  /*!
   * \brief get the i-th row of the batch
   * \param i position of the row inside this batch
   * \return instance starting at data_ptr[ind_ptr[i]] with
   *  ind_ptr[i + 1] - ind_ptr[i] elements
   */
  Inst operator[](size_t i) const {
    const size_t row_begin = ind_ptr[i];
    const size_t row_end = ind_ptr[i + 1];
    return Inst(data_ptr + row_begin,
                static_cast<bst_uint>(row_end - row_begin));
  }
};
/*!
* \brief read-only column batch, used to access columns,
* the columns are not required to be continuous
* \brief define compatible keywords in g++
* Used to support g++-4.6 and g++4.7
*/
struct ColBatch : public SparseBatch {
  /*! \brief column index of each column in the data */
  const bst_uint* col_index;
  /*! \brief pointer to the array of column instances */
  const Inst* col_data;
  /*!
   * \brief get the i-th column of the batch
   * \param i position of the column inside this batch
   * \return copy of the i-th entry of col_data
   */
  Inst operator[](size_t i) const {
    return *(col_data + i);
  }
};
// Applies only when DMLC_USE_CXX11 is set and the compiler defines __GNUC__
// without defining __clang_version__.
#if DMLC_USE_CXX11 && defined(__GNUC__) && !defined(__clang_version__)
// GCC 4.x releases older than 4.8: expand override/final to nothing so that
// declarations using these keywords still compile.
#if __GNUC__ == 4 && __GNUC_MINOR__ < 8
#define override
#define final
#endif  // __GNUC__ == 4 && __GNUC_MINOR__ < 8
#endif  // DMLC_USE_CXX11 && defined(__GNUC__) && !defined(__clang_version__)
} // namespace xgboost
#endif // XGBOOST_BASE_H_

View File

@@ -1,7 +1,7 @@
/*!
* Copyright (c) 2014 by Contributors
* Copyright (c) 2015 by Contributors
* \file data.h
* \brief The input data structure for gradient boosting.
* \brief The input data structure of xgboost.
* \author Tianqi Chen
*/
#ifndef XGBOOST_DATA_H_
@@ -13,9 +13,6 @@
#include "./base.h"
namespace xgboost {
// forward declare learner.
class Learner;
/*! \brief data type accepted by xgboost interface */
enum DataType {
kFloat32 = 1,
@@ -29,9 +26,11 @@ enum DataType {
*/
struct MetaInfo {
/*! \brief number of rows in the data */
size_t num_row;
uint64_t num_row;
/*! \brief number of columns in the data */
size_t num_col;
uint64_t num_col;
/*! \brief number of nonzero entries in the data */
uint64_t num_nonzero;
/*! \brief label of each instance */
std::vector<bst_float> labels;
/*!
@@ -53,7 +52,7 @@ struct MetaInfo {
*/
std::vector<bst_float> base_margin;
/*! \brief version flag, used to check version of this info */
static const int kVersion = 0;
static const int kVersion = 1;
/*!
 * \brief default constructor, sets every count field to zero.
 *  num_nonzero is initialized together with num_row and num_col so that
 *  a freshly constructed MetaInfo never exposes an indeterminate count.
 */
MetaInfo() : num_row(0), num_col(0), num_nonzero(0) {}
/*!
@@ -78,12 +77,12 @@ struct MetaInfo {
* \brief Load the Meta info from binary stream.
* \param fi The input stream
*/
void LoadBinary(dmlc::Stream *fi);
void LoadBinary(dmlc::Stream* fi);
/*!
* \brief Save the Meta info to binary stream
* \param fo The output stream.
*/
void SaveBinary(dmlc::Stream *fo) const;
void SaveBinary(dmlc::Stream* fo) const;
/*!
* \brief Set information in the meta info.
* \param key The key of the information.
@@ -102,36 +101,105 @@ struct MetaInfo {
void GetInfo(const char* key, const void** dptr, DataType* dtype, size_t* num) const;
};
/*! \brief read-only batch of sparse instances, stored in CSR format */
struct SparseBatch {
  /*! \brief one (index, value) element of a sparse vector */
  struct Entry {
    /*! \brief index of the feature */
    bst_uint index;
    /*! \brief value of the feature */
    bst_float fvalue;
    /*! \brief default constructor, does not assign either field */
    Entry() {}
    /*!
     * \brief construct an entry from an index and a value
     * \param index The feature or row index.
     * \param fvalue The feature value.
     */
    Entry(bst_uint index, bst_float fvalue)
        : index(index), fvalue(fvalue) {}
    /*!
     * \brief ordering predicate on feature values
     * \return true when the value of a is strictly smaller than the value of b
     */
    static bool CmpValue(const Entry& a, const Entry& b) {
      return b.fvalue > a.fvalue;
    }
  };
  /*! \brief one sparse vector of the batch: element pointer plus element count */
  struct Inst {
    /*! \brief pointer to the first element */
    const Entry* data;
    /*! \brief number of elements in the instance */
    bst_uint length;
    /*! \brief construct from an element pointer and an element count */
    Inst(const Entry* data, bst_uint length)
        : data(data), length(length) {}
    /*! \brief access the i-th element; no bounds check is performed */
    const Entry& operator[](size_t i) const {
      return *(data + i);
    }
  };
  /*! \brief batch size */
  size_t size;
};
/*! \brief read-only batch of rows, used to access rows continuously */
struct RowBatch : public SparseBatch {
  /*! \brief row id offset of this batch */
  size_t base_rowid;
  /*! \brief array[size+1], row pointer of each of the elements */
  const size_t* ind_ptr;
  /*! \brief array[ind_ptr.back()], the sparse elements themselves */
  const Entry* data_ptr;
  /*!
   * \brief get the i-th row of the batch
   * \param i position of the row inside this batch
   * \return instance starting at data_ptr[ind_ptr[i]] with
   *  ind_ptr[i + 1] - ind_ptr[i] elements
   */
  Inst operator[](size_t i) const {
    const size_t row_begin = ind_ptr[i];
    const size_t row_end = ind_ptr[i + 1];
    return Inst(data_ptr + row_begin,
                static_cast<bst_uint>(row_end - row_begin));
  }
};
/*!
 * \brief read-only batch of columns, used for column access;
 *  the columns in a batch are not required to be continuous
 */
struct ColBatch : public SparseBatch {
  /*! \brief column index of each column in the data */
  const bst_uint* col_index;
  /*! \brief pointer to the array of column instances */
  const Inst* col_data;
  /*!
   * \brief get the i-th column of the batch
   * \param i position of the column inside this batch
   * \return copy of the i-th entry of col_data
   */
  Inst operator[](size_t i) const {
    return *(col_data + i);
  }
};
/*!
* \brief This is data structure that user can pass to DMatrix::Create
* to create a DMatrix for training, user can create this data structure
* for customized Data Loading on single machine.
*
* In a distributed setting, a customized dmlc::Parser is usually needed instead.
*/
struct DataSource {
class DataSource : public dmlc::DataIter<RowBatch> {
public:
/*!
* \brief Used to initialize the meta information of DMatrix
* The created DMatrix can change its own info later.
* \brief Meta information about the dataset
* The subclass need to be able to load this correctly from data.
*/
MetaInfo info;
/*!
* \brief Used for row based iteration of DMatrix,
*/
std::unique_ptr<dmlc::DataIter<RowBatch> > row_iter;
};
/*!
* \brief Internal data structured used by XGBoost during training.
* There are two ways to create a customized DMatrix that reads in user defined-format.
*
* - Define a new dmlc::Parser and register by DMLC_REGISTER_DATA_PARSER;
* This works best for user defined data input source, such as data-base, filesystem.
* - Provide a dmlc::Parser and pass into the DMatrix::Create
* - Alternatively, if data can be represented by an URL, define a new dmlc::Parser and register by DMLC_REGISTER_DATA_PARSER;
* - This works best for user defined data input source, such as data-base, filesystem.
* - Provide a DataSource that can be passed to DMatrix::Create.
* This can be used to re-use an in-memory data structure as a DMatrix.
*/
class DMatrix {
public:
/*! \brief meta information that is always stored in DMatrix */
MetaInfo info;
/*! \brief meta information of the dataset */
virtual MetaInfo& info() = 0;
/*!
* \brief get the row iterator, reset to beginning position
* \note Only either RowIterator or column Iterator can be active.
@@ -163,12 +231,13 @@ class DMatrix {
/*! \brief get column density */
virtual float GetColDensity(size_t cidx) const = 0;
/*! \return reference of buffered rowset, in column access */
virtual const std::vector<bst_uint> &buffered_rowset() const = 0;
virtual const std::vector<bst_uint>& buffered_rowset() const = 0;
/*! \brief virtual destructor */
virtual ~DMatrix() {}
/*!
* \brief Save DMatrix to local file.
* The saved file only works for non-sharded dataset(single machine training).
* This API is deprecated and its use is discouraged.
* \param fname The file name to be saved.
* \return The created DMatrix.
*/
@@ -191,7 +260,7 @@ class DMatrix {
* This can be nullptr for common cases, and in-memory mode will be used.
* \return a Created DMatrix.
*/
static DMatrix* Create(DataSource&& source,
static DMatrix* Create(std::unique_ptr<DataSource>&& source,
const char* cache_prefix=nullptr);
/*!
* \brief Create a DMatrix by loading data from parser.
@@ -208,5 +277,10 @@ class DMatrix {
static DMatrix* Create(dmlc::Parser<uint32_t>* parser,
const char* cache_prefix=nullptr);
};
} // namespace xgboost
namespace dmlc {
// Declares the dmlc is_pod trait as true for xgboost::SparseBatch::Entry.
// NOTE(review): presumably this lets dmlc handle Entry as plain data when
// reading/writing streams — confirm against the dmlc-core type traits.
DMLC_DECLARE_TRAITS(is_pod, xgboost::SparseBatch::Entry, true);
}  // namespace dmlc
#endif // XGBOOST_DATA_H_