xgboost/data.h
2014-08-15 20:15:58 -07:00

294 lines
9.3 KiB
C++

#ifndef XGBOOST_UNITY_DATA_H
#define XGBOOST_UNITY_DATA_H
/*!
* \file data.h
* \brief the input data structure for gradient boosting
* \author Tianqi Chen
*/
#include <cstdio>
#include <vector>
#include <limits>
#include <algorithm>
#include "utils/io.h"
#include "utils/utils.h"
#include "utils/iterator.h"
#include "utils/matrix_csr.h"
namespace xgboost {
/*!
 * \brief unsigned integer type used in boost,
 *  used for feature index and row index
 */
typedef unsigned int bst_uint;
/*! \brief floating point type, used for storing statistics */
typedef float bst_float;
/*! \brief small epsilon constant (1e-5) */
const float rt_eps = 1e-5f;
/*! \brief twice rt_eps: minimum gap between feature values that allows a split to happen */
const float rt_2eps = 2.0f * rt_eps;
/*! \brief pair of first and second order gradient statistics, as usually needed in gradient boosting */
struct bst_gpair {
  /*! \brief first order gradient statistics */
  bst_float grad;
  /*! \brief second order gradient statistics */
  bst_float hess;
  /*! \brief default constructor, does not assign grad or hess */
  bst_gpair() {}
  /*!
   * \brief construct the pair from explicit values
   * \param grad first order gradient statistics
   * \param hess second order gradient statistics
   */
  bst_gpair(bst_float grad, bst_float hess)
      : grad(grad),
        hess(hess) {}
};
/*! \brief read-only sparse instance batch in CSR format */
struct SparseBatch {
/*! \brief an entry of sparse vector */
struct Entry {
/*! \brief feature index */
bst_uint findex;
/*! \brief feature value */
bst_float fvalue;
// default constructor
Entry(void) {}
Entry(bst_uint findex, bst_float fvalue) : findex(findex), fvalue(fvalue) {}
/*! \brief reversely compare feature values */
inline static bool CmpValue(const Entry &a, const Entry &b) {
return a.fvalue < b.fvalue;
}
};
/*! \brief an instance of sparse vector in the batch */
struct Inst {
/*! \brief pointer to the elements*/
const Entry *data;
/*! \brief length of the instance */
const bst_uint length;
/*! \brief constructor */
Inst(const Entry *data, bst_uint length) : data(data), length(length) {}
/*! \brief get i-th pair in the sparse vector*/
inline const Entry& operator[](size_t i) const {
return data[i];
}
};
/*! \brief batch size */
size_t size;
/*! \brief the offset of rowid of this batch */
size_t base_rowid;
/*! \brief array[size+1], row pointer of each of the elements */
const size_t *row_ptr;
/*! \brief array[row_ptr.back()], content of the sparse element */
const Entry *data_ptr;
/*! \brief get i-th row from the batch */
inline Inst operator[](size_t i) const {
return Inst(data_ptr + row_ptr[i], row_ptr[i+1] - row_ptr[i]);
}
};
/**
 * \brief This is an interface convention via template, defining the way to access features.
 *  Column access rule is defined by template, for efficiency purpose;
 *  row access is defined by an iterator over sparse batches.
 *  The non-virtual members below are only declared here: they describe what
 *  an implementation is expected to provide.
 * \tparam Derived type of actual implementation
 */
template<typename Derived>
class FMatrixInterface {
 public:
  /*! \brief example iterator over one column */
  struct ColIter {
    /*!
     * \brief move to next position
     * \return whether there is element in next position
     */
    inline bool Next(void);
    /*! \return row index of current position */
    inline bst_uint rindex(void) const;
    /*! \return feature value in current position */
    inline bst_float fvalue(void) const;
  };
  /*! \brief backward iterator over column */
  struct ColBackIter : public ColIter {};

 public:
  /*!
   * \brief virtual destructor: the class declares a virtual method,
   *  so destroying an implementation through a pointer to this
   *  interface must run the destructor of the implementation
   */
  virtual ~FMatrixInterface(void) {}
  // column access is needed by some of tree construction algorithms
  /*!
   * \brief get column iterator, the columns must be sorted by feature value
   * \param cidx column index
   * \return column iterator
   */
  inline ColIter GetSortedCol(size_t cidx) const;
  /*!
   * \brief get column backward iterator, starts from biggest fvalue, and iterates back
   * \param cidx column index
   * \return reverse column iterator
   */
  inline ColBackIter GetReverseSortedCol(size_t cidx) const;
  /*!
   * \brief get number of columns
   * \return number of columns
   */
  inline size_t NumCol(void) const;
  /*!
   * \brief check if column access is supported, if not, initialize column access
   */
  inline void InitColAccess(void);
  /*! \return whether column access is enabled */
  inline bool HaveColAccess(void) const;
  /*!
   * \brief get the number of entries in a column
   * \param cidx column index
   * \return #entries-in-col
   */
  inline size_t GetColSize(size_t cidx) const;
  /*!
   * \brief get #entries-in-col / #rows
   * \param cidx column index
   *  this function is only used to help speedup;
   *  an implementation that is not sure about the value may return 0.0
   * \return column density
   */
  inline float GetColDensity(size_t cidx) const;
  /*! \brief get the row iterator associated with FMatrix */
  virtual utils::IIterator<SparseBatch>* RowIterator(void) const = 0;
};
/*!
 * \brief sparse matrix that supports column access, CSC
 *
 *  Rows are served by the wrapped batch iterator; the column-major copy
 *  (col_ptr_ / col_data_) is built on demand by InitColAccess.
 *  NOTE: the destructor deletes the wrapped iterator and no copy
 *  constructor / assignment is defined, so copying an FMatrixS would make
 *  two objects delete the same iterator.
 */
class FMatrixS : public FMatrixInterface<FMatrixS> {
 public:
  typedef SparseBatch::Entry Entry;
  /*!
   * \brief forward column iterator;
   *  it is created one slot before the first entry of the column,
   *  so Next() must be called before the first element is read
   */
  struct ColIter {
    /*! \brief current position and last valid position of the iteration */
    const Entry *dptr_, *end_;
    /*!
     * \brief constructor
     * \param begin position the iterator starts from, Next() moves away from it
     * \param end position at which Next() reports the end of the column
     */
    ColIter(const Entry* begin, const Entry* end)
        : dptr_(begin), end_(end) {}
    /*!
     * \brief move forward by one entry
     * \return false when the iterator already sits on end_, true otherwise
     */
    inline bool Next(void) {
      if (dptr_ == end_) {
        return false;
      } else {
        ++dptr_; return true;
      }
    }
    /*! \return row index of current entry; column storage keeps the row index in Entry::findex */
    inline bst_uint rindex(void) const {
      return dptr_->findex;
    }
    /*! \return feature value of current entry */
    inline bst_float fvalue(void) const {
      return dptr_->fvalue;
    }
  };
  /*!
   * \brief reverse column iterator;
   *  it is created one slot past the last entry of the column and moves backward
   */
  struct ColBackIter : public ColIter {
    /*!
     * \brief constructor
     * \param dptr position the iterator starts from, Next() moves away from it
     * \param end position at which Next() reports the end of the column
     */
    ColBackIter(const Entry* dptr, const Entry* end) : ColIter(dptr, end) {}
    // shadows ColIter::Next
    /*!
     * \brief move backward by one entry
     * \return false when the iterator already sits on end_, true otherwise
     */
    inline bool Next(void) {
      if (dptr_ == end_) {
        return false;
      } else {
        --dptr_; return true;
      }
    }
  };
  /*!
   * \brief constructor
   * \param base_iter row iterator to wrap; it is deleted by the destructor
   */
  explicit FMatrixS(utils::IIterator<SparseBatch> *base_iter)
      : iter_(base_iter), num_buffered_row_(0) {}
  // destructor, releases the wrapped row iterator
  virtual ~FMatrixS(void) {
    delete iter_;
  }
  /*! \return whether column access is enabled */
  inline bool HaveColAccess(void) const {
    return col_ptr_.size() != 0;
  }
  /*! \brief get number of columns, requires column access to be initialized */
  inline size_t NumCol(void) const {
    utils::Check(this->HaveColAccess(), "NumCol:need column access");
    return col_ptr_.size() - 1;
  }
  /*!
   * \brief get col sorted iterator
   * \param cidx column index
   * \return iterator over the column, in the order produced by InitColData
   */
  inline ColIter GetSortedCol(size_t cidx) const {
    utils::Assert(cidx < this->NumCol(), "col id exceed bound");
    // both ends are shifted back by one slot: the iterator advances
    // before an element is read and stops once it sits on the last entry
    return ColIter(&col_data_[col_ptr_[cidx]] - 1,
                   &col_data_[col_ptr_[cidx + 1]] - 1);
  }
  /*!
   * \brief get reversed col iterator,
   *   this function will be deprecated at some point
   * \param cidx column index
   * \return iterator that walks the column from its last entry to its first
   */
  inline ColBackIter GetReverseSortedCol(size_t cidx) const {
    utils::Assert(cidx < this->NumCol(), "col id exceed bound");
    // starts one past the last entry, stops once it sits on the first entry
    return ColBackIter(&col_data_[col_ptr_[cidx + 1]],
                       &col_data_[col_ptr_[cidx]]);
  }
  /*!
   * \brief get col size
   * \param cidx column index, not bound checked
   * \return number of entries stored for the column
   */
  inline size_t GetColSize(size_t cidx) const {
    return col_ptr_[cidx+1] - col_ptr_[cidx];
  }
  /*!
   * \brief get column density
   * \param cidx column index, not bound checked
   * \return #entries-in-col / #buffered-rows
   */
  inline float GetColDensity(size_t cidx) const {
    size_t nmiss = num_buffered_row_ - (col_ptr_[cidx+1] - col_ptr_[cidx]);
    return 1.0f - (static_cast<float>(nmiss)) / num_buffered_row_;
  }
  /*! \brief build the column data unless column access is already available */
  virtual void InitColAccess(void) {
    if (this->HaveColAccess()) return;
    // row ids are stored as bst_uint, so that is the largest usable row count
    const size_t max_nrow = std::numeric_limits<bst_uint>::max();
    this->InitColData(max_nrow);
  }
  /*! \brief get the row iterator associated with FMatrix */
  virtual utils::IIterator<SparseBatch>* RowIterator(void) const {
    return iter_;
  }

 protected:
  /*!
   * \brief initialize column data
   *
   *  Makes two passes over the row iterator: the first one registers every
   *  entry with the builder budget, the second one pushes every entry into
   *  the storage of its column, tagged with the id of the row it came from.
   *  Each column is then sorted with Entry::CmpValue.
   *  Rows whose id is not smaller than max_nrow are ignored.
   * \param max_nrow maximum number of rows supported
   */
  inline void InitColData(size_t max_nrow) {
    // note: this part of code is serial, todo, parallelize this transformer
    utils::SparseCSRMBuilder<SparseBatch::Entry> builder(col_ptr_, col_data_);
    builder.InitBudget(0);
    // start working
    iter_->BeforeFirst();
    num_buffered_row_ = 0;
    while (iter_->Next()) {
      const SparseBatch &batch = iter_->Value();
      if (batch.base_rowid >= max_nrow) break;
      const size_t nbatch = std::min(batch.size, max_nrow - batch.base_rowid);
      for (size_t i = 0; i < nbatch; ++i, ++num_buffered_row_) {
        SparseBatch::Inst inst = batch[i];
        // walk the entries of this row only: the bound is the row length,
        // not the number of rows in the batch
        for (bst_uint j = 0; j < inst.length; ++j) {
          builder.AddBudget(inst[j].findex);
        }
      }
    }
    builder.InitStorage();
    iter_->BeforeFirst();
    while (iter_->Next()) {
      const SparseBatch &batch = iter_->Value();
      if (batch.base_rowid >= max_nrow) break;
      const size_t nbatch = std::min(batch.size, max_nrow - batch.base_rowid);
      for (size_t i = 0; i < nbatch; ++i) {
        SparseBatch::Inst inst = batch[i];
        // the row id of every entry of this instance is base_rowid + i
        const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
        for (bst_uint j = 0; j < inst.length; ++j) {
          // in column storage the findex field of Entry holds the row id
          builder.PushElem(inst[j].findex,
                           Entry(ridx, inst[j].fvalue));
        }
      }
    }
    // sort columns
    unsigned ncol = static_cast<unsigned>(this->NumCol());
    #pragma omp parallel for schedule(static)
    for (unsigned i = 0; i < ncol; ++i) {
      std::sort(&col_data_[col_ptr_[i]],
                &col_data_[col_ptr_[i + 1]], Entry::CmpValue);
    }
  }

 private:
  // --- data structure used to support InitColAccess --
  /*! \brief wrapped row iterator, deleted in the destructor */
  utils::IIterator<SparseBatch> *iter_;
  /*! \brief number of rows visited when the column data was built */
  size_t num_buffered_row_;
  /*! \brief column pointer of CSC format */
  std::vector<size_t> col_ptr_;
  /*! \brief column data in CSC format */
  std::vector<SparseBatch::Entry> col_data_;
};
} // namespace xgboost
#endif