complete refactor of data.h, now relies on iterators to access columns

This commit is contained in:
tqchen@graphlab.com 2014-08-27 17:00:21 -07:00
parent a59f8945dc
commit 605269133e
15 changed files with 216 additions and 492 deletions

View File

@@ -10,23 +10,22 @@ else
 export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp -funroll-loops
 endif
-# expose these flags to R CMD SHLIB
-export PKG_CPPFLAGS = $(CFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_
 # specify tensor path
-BIN = xgboost
-OBJ =
-SLIB = wrapper/libxgboostwrapper.so
-RLIB = wrapper/libxgboostR.so
+BIN =
+OBJ = updater.o gbm.o xgboost_main.o
+#SLIB = wrapper/libxgboostwrapper.so
+#RLIB = wrapper/libxgboostR.so
 .PHONY: clean all R python
-all: $(BIN) wrapper/libxgboostwrapper.so
-R: wrapper/libxgboostR.so
-python: wrapper/libxgboostwrapper.so
-xgboost: src/xgboost_main.cpp src/io/io.cpp src/data.h src/tree/*.h src/tree/*.hpp src/gbm/*.h src/gbm/*.hpp src/utils/*.h src/learner/*.h src/learner/*.hpp
+all: $(BIN) $(OBJ)
+#python: wrapper/libxgboostwrapper.so
+#xgboost: src/xgboost_main.cpp src/io/io.cpp src/data.h src/tree/*.h src/tree/*.hpp src/gbm/*.h src/gbm/*.hpp src/utils/*.h src/learner/*.h src/learner/*.hpp
 # now the wrapper takes in two files. io and wrapper part
-wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/io/io.cpp src/*.h src/*/*.hpp src/*/*.h
-wrapper/libxgboostR.so: wrapper/xgboost_wrapper.cpp wrapper/xgboost_R.cpp src/io/io.cpp src/*.h src/*/*.hpp src/*/*.h
+#wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/io/io.cpp src/*.h src/*/*.hpp src/*/*.h
+updater.o: src/tree/updater.cpp
+gbm.o: src/gbm/gbm.cpp
+xgboost_main.o: src/xgboost_main.cpp
 $(BIN) :
 	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
@@ -34,9 +33,6 @@ $(BIN) :
 $(SLIB) :
 	$(CXX) $(CFLAGS) -fPIC $(LDFLAGS) -shared -o $@ $(filter %.cpp %.o %.c, $^)
-$(RLIB) :
-	R CMD SHLIB -c -o $@ $(filter %.cpp %.o %.c, $^)
 $(OBJ) :
 	$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
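Reading the Makefile change together with the source diffs below: the build now goes through real object files (updater.o, gbm.o, xgboost_main.o) that hold the new out-of-line factory definitions, while the python wrapper and R shared-library targets are temporarily commented out while the refactor is in flight.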

View File

@@ -7,16 +7,8 @@
  */
 #include <cstdio>
 #include <vector>
-#include <limits>
-#include <climits>
-#include <cstring>
-#include <algorithm>
-#include "utils/io.h"
-#include "utils/omp.h"
 #include "utils/utils.h"
 #include "utils/iterator.h"
-#include "utils/random.h"
-#include "utils/matrix_csr.h"
 namespace xgboost {
 /*!
@@ -96,308 +88,72 @@ struct SparseBatch {
   };
   /*! \brief batch size */
   size_t size;
-  /*! \brief array[size+1], row pointer of each of the elements */
-  const size_t *row_ptr;
-  /*! \brief array[row_ptr.back()], content of the sparse element */
-  const Entry *data_ptr;
-  /*! \brief get i-th row from the batch */
-  inline Inst operator[](size_t i) const {
-    return Inst(data_ptr + row_ptr[i], static_cast<bst_uint>(row_ptr[i+1] - row_ptr[i]));
-  }
 };
 /*! \brief read-only row batch, used to access rows continuously */
 struct RowBatch : public SparseBatch {
   /*! \brief the offset of rowid of this batch */
   size_t base_rowid;
+  /*! \brief array[size+1], row pointer of each of the elements */
+  const size_t *ind_ptr;
+  /*! \brief array[ind_ptr.back()], content of the sparse elements */
+  const Entry *data_ptr;
+  /*! \brief get i-th row from the batch */
+  inline Inst operator[](size_t i) const {
+    return Inst(data_ptr + ind_ptr[i], static_cast<bst_uint>(ind_ptr[i+1] - ind_ptr[i]));
+  }
 };
 /*!
  * \brief read-only column batch, used to access columns;
  * the columns are not required to be continuous
  */
-struct ColBatch : public RowBatch {
+struct ColBatch : public SparseBatch {
   /*! \brief column index of each of the columns in the data */
-  bst_uint *col_index;
+  const bst_uint *col_index;
+  /*! \brief pointer to the column data */
+  const Inst *col_data;
+  /*! \brief get i-th column from the batch */
+  inline Inst operator[](size_t i) const {
+    return col_data[i];
+  }
 };
 /**
- * \brief This is an interface convention via template, defining the way to access features;
- * the column access rule is defined by template, for efficiency purposes,
- * row access is defined by iterator of sparse batches
- * \tparam Derived type of actual implementation
+ * \brief interface of feature matrix, needed for tree construction;
+ * this interface defines two ways to access features:
+ * row access is defined by iterator of RowBatch,
+ * col access is optional, checked by HaveColAccess, and defined by iterator of ColBatch
  */
-template<typename Derived>
-class FMatrixInterface {
+class IFMatrix {
  public:
-  /*! \brief example iterator over one column */
-  struct ColIter{
-    /*!
-     * \brief move to next position
-     * \return whether there is an element in the next position
-     */
-    inline bool Next(void);
-    /*! \return row index of current position */
-    inline bst_uint rindex(void) const;
-    /*! \return feature value in current position */
-    inline bst_float fvalue(void) const;
-  };
-  /*! \brief backward iterator over column */
-  struct ColBackIter : public ColIter {};
- public:
-  // column access is needed by some of the tree construction algorithms
+  // the interface only needs to guarantee the row iter;
+  // while a column iter is active, i.e. after ColIterator is called, the row iter can be disabled
+  /*! \brief get the row iterator associated with FMatrix */
+  virtual utils::IIterator<RowBatch> *RowIterator(void) = 0;
+  /*! \brief get column iterator */
+  virtual utils::IIterator<ColBatch> *ColIterator(void) = 0;
   /*!
-   * \brief get column iterator, the columns must be sorted by feature value
-   * \param cidx column index
-   * \return column iterator
+   * \brief get the column iterator associated with FMatrix with subset of column features
+   * \param fset is the list of column index set that must be contained in the returned column iterator
+   * \return the column iterator, initialized so that it reads the elements in fset
    */
-  inline ColIter GetSortedCol(size_t cidx) const;
-  /*!
-   * \brief get column backward iterator, starts from biggest fvalue, and iterates back
-   * \param cidx column index
-   * \return reverse column iterator
-   */
-  inline ColBackIter GetReverseSortedCol(size_t cidx) const;
-  /*!
-   * \brief get number of columns
-   * \return number of columns
-   */
-  inline size_t NumCol(void) const;
+  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) = 0;
   /*!
    * \brief check if column access is supported, if not, initialize column access
-   * \param max_rows maximum number of rows allowed in constructor
+   * \param subsample subsample ratio when generating column access
    */
-  inline void InitColAccess(void);
+  virtual void InitColAccess(float subsample) = 0;
+  // the following are column meta data, should be able to answer them fast
   /*! \return whether column access is enabled */
-  inline bool HaveColAccess(void) const;
-  /*! \brief return #entries-in-col */
-  inline size_t GetColSize(size_t cidx) const;
-  /*!
-   * \brief return #entries-in-col / #rows
-   * \param cidx column index
-   * this function is used to help speedup;
-   * it does not necessarily need to be implemented, if not sure, return 0.0
-   * \return column density
-   */
-  inline float GetColDensity(size_t cidx) const;
-  /*! \brief get the row iterator associated with FMatrix */
-  inline utils::IIterator<RowBatch>* RowIterator(void) const;
-};
+  virtual bool HaveColAccess(void) const = 0;
+  /*! \return number of columns in the FMatrix */
+  virtual size_t NumCol(void) const = 0;
+  /*! \brief get number of non-missing entries in column */
+  virtual float GetColSize(size_t cidx) const = 0;
+  /*! \brief get column density */
+  virtual float GetColDensity(size_t cidx) const = 0;
+  /*! \brief reference of buffered rowset */
+  virtual const std::vector<bst_uint> &buffered_rowset(void) const = 0;
+  // virtual destructor
+  virtual ~IFMatrix(void){}
-/*!
- * \brief sparse matrix that supports column access, CSC
- */
-class FMatrixS : public FMatrixInterface<FMatrixS>{
- public:
-  typedef RowBatch::Entry Entry;
-  /*! \brief column iterator */
-  struct ColIter{
-    const Entry *dptr_, *end_;
-    ColIter(const Entry* begin, const Entry* end)
-        : dptr_(begin), end_(end) {}
-    inline bool Next(void) {
-      if (dptr_ == end_) {
-        return false;
-      } else {
-        ++dptr_; return true;
-      }
-    }
-    inline bst_uint rindex(void) const {
-      return dptr_->index;
-    }
-    inline bst_float fvalue(void) const {
-      return dptr_->fvalue;
-    }
-  };
-  /*! \brief reverse column iterator */
-  struct ColBackIter : public ColIter {
-    ColBackIter(const Entry* dptr, const Entry* end) : ColIter(dptr, end) {}
-    // shadows ColIter::Next
-    inline bool Next(void) {
-      if (dptr_ == end_) {
-        return false;
-      } else {
-        --dptr_; return true;
-      }
-    }
-  };
-  /*! \brief constructor */
-  FMatrixS(void) {
-    iter_ = NULL;
-  }
-  // destructor
-  ~FMatrixS(void) {
-    if (iter_ != NULL) delete iter_;
-  }
-  /*! \return whether column access is enabled */
-  inline bool HaveColAccess(void) const {
-    return col_ptr_.size() != 0;
-  }
-  /*! \brief get number of columns */
-  inline size_t NumCol(void) const {
-    utils::Check(this->HaveColAccess(), "NumCol:need column access");
-    return col_ptr_.size() - 1;
-  }
-  /*! \brief get number of buffered rows */
-  inline const std::vector<bst_uint> buffered_rowset(void) const {
-    return buffered_rowset_;
-  }
-  /*! \brief get col sorted iterator */
-  inline ColIter GetSortedCol(size_t cidx) const {
-    utils::Assert(cidx < this->NumCol(), "col id exceed bound");
-    return ColIter(&col_data_[0] + col_ptr_[cidx] - 1,
-                   &col_data_[0] + col_ptr_[cidx + 1] - 1);
-  }
-  /*!
-   * \brief get reversed col iterator,
-   * this function will be deprecated at some point
-   */
-  inline ColBackIter GetReverseSortedCol(size_t cidx) const {
-    utils::Assert(cidx < this->NumCol(), "col id exceed bound");
-    return ColBackIter(&col_data_[0] + col_ptr_[cidx + 1],
-                       &col_data_[0] + col_ptr_[cidx]);
-  }
-  /*! \brief get col size */
-  inline size_t GetColSize(size_t cidx) const {
-    return col_ptr_[cidx+1] - col_ptr_[cidx];
-  }
-  /*! \brief get column density */
-  inline float GetColDensity(size_t cidx) const {
-    size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]);
-    return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
-  }
-  inline void InitColAccess(float pkeep = 1.0f) {
-    if (this->HaveColAccess()) return;
-    this->InitColData(pkeep);
-  }
-  /*!
-   * \brief get the row iterator associated with FMatrix;
-   * this function is not threadsafe, it returns the iterator stored in FMatrixS
-   */
-  inline utils::IIterator<RowBatch>* RowIterator(void) const {
-    iter_->BeforeFirst();
-    return iter_;
-  }
-  /*! \brief set iterator */
-  inline void set_iter(utils::IIterator<RowBatch> *iter) {
-    this->iter_ = iter;
-  }
-  /*!
-   * \brief save column access data into stream
-   * \param fo output stream to save to
-   */
-  inline void SaveColAccess(utils::IStream &fo) const {
-    fo.Write(buffered_rowset_);
-    if (buffered_rowset_.size() != 0) {
-      SaveBinary(fo, col_ptr_, col_data_);
-    }
-  }
-  /*!
-   * \brief load column access data from stream
-   * \param fi input stream to load from
-   */
-  inline void LoadColAccess(utils::IStream &fi) {
-    utils::Check(fi.Read(&buffered_rowset_), "invalid input file format");
-    if (buffered_rowset_.size() != 0) {
-      LoadBinary(fi, &col_ptr_, &col_data_);
-    }
-  }
-  /*!
-   * \brief save data to binary stream
-   * \param fo output stream
-   * \param ptr pointer data
-   * \param data data content
-   */
-  inline static void SaveBinary(utils::IStream &fo,
-                                const std::vector<size_t> &ptr,
-                                const std::vector<RowBatch::Entry> &data) {
-    size_t nrow = ptr.size() - 1;
-    fo.Write(&nrow, sizeof(size_t));
-    fo.Write(&ptr[0], ptr.size() * sizeof(size_t));
-    if (data.size() != 0) {
-      fo.Write(&data[0], data.size() * sizeof(RowBatch::Entry));
-    }
-  }
-  /*!
-   * \brief load data from binary stream
-   * \param fi input stream
-   * \param out_ptr pointer data
-   * \param out_data data content
-   */
-  inline static void LoadBinary(utils::IStream &fi,
-                                std::vector<size_t> *out_ptr,
-                                std::vector<RowBatch::Entry> *out_data) {
-    size_t nrow;
-    utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format");
-    out_ptr->resize(nrow + 1);
-    utils::Check(fi.Read(&(*out_ptr)[0], out_ptr->size() * sizeof(size_t)) != 0,
-                 "invalid input file format");
-    out_data->resize(out_ptr->back());
-    if (out_data->size() != 0) {
-      utils::Assert(fi.Read(&(*out_data)[0], out_data->size() * sizeof(RowBatch::Entry)) != 0,
-                    "invalid input file format");
-    }
-  }
-
- protected:
-  /*!
-   * \brief initialize column data
-   * \param pkeep probability to keep a row
-   */
-  inline void InitColData(float pkeep) {
-    buffered_rowset_.clear();
-    // note: this part of code is serial; todo: parallelize this transformer
-    utils::SparseCSRMBuilder<RowBatch::Entry> builder(col_ptr_, col_data_);
-    builder.InitBudget(0);
-    // start working
-    iter_->BeforeFirst();
-    while (iter_->Next()) {
-      const RowBatch &batch = iter_->Value();
-      for (size_t i = 0; i < batch.size; ++i) {
-        if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
-          buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
-          RowBatch::Inst inst = batch[i];
-          for (bst_uint j = 0; j < inst.length; ++j) {
-            builder.AddBudget(inst[j].index);
-          }
-        }
-      }
-    }
-    builder.InitStorage();
-    iter_->BeforeFirst();
-    size_t ktop = 0;
-    while (iter_->Next()) {
-      const RowBatch &batch = iter_->Value();
-      for (size_t i = 0; i < batch.size; ++i) {
-        if (ktop < buffered_rowset_.size() &&
-            buffered_rowset_[ktop] == batch.base_rowid+i) {
-          ++ktop;
-          RowBatch::Inst inst = batch[i];
-          for (bst_uint j = 0; j < inst.length; ++j) {
-            builder.PushElem(inst[j].index,
-                             Entry((bst_uint)(batch.base_rowid+i),
-                                   inst[j].fvalue));
-          }
-        }
-      }
-    }
-    // sort columns
-    bst_omp_uint ncol = static_cast<bst_omp_uint>(this->NumCol());
-    #pragma omp parallel for schedule(static)
-    for (bst_omp_uint i = 0; i < ncol; ++i) {
-      std::sort(&col_data_[0] + col_ptr_[i],
-                &col_data_[0] + col_ptr_[i + 1], Entry::CmpValue);
-    }
-  }
-
- private:
-  // --- data structures used to support InitColAccess ---
-  utils::IIterator<RowBatch> *iter_;
-  /*! \brief list of row indices that are buffered */
-  std::vector<bst_uint> buffered_rowset_;
-  /*! \brief column pointer of CSC format */
-  std::vector<size_t> col_ptr_;
-  /*! \brief column data in CSC format */
-  std::vector<RowBatch::Entry> col_data_;
 };
 } // namespace xgboost
 #endif // XGBOOST_DATA_H
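For orientation, this is roughly how a consumer now walks columns through the iterator interface. SumColumn is a hypothetical helper written for this note, not code from the commit; it only uses the members visible in the diff above:

// sketch: sum all values of feature fid via the new iterator-based column access
#include <cstddef>
#include <vector>
#include "data.h"   // IFMatrix, ColBatch, assuming xgboost's src/ layout
inline double SumColumn(xgboost::IFMatrix *fmat, xgboost::bst_uint fid) {
  fmat->InitColAccess(1.0f);   // subsample = 1.0f buffers every row
  std::vector<xgboost::bst_uint> fset;
  fset.push_back(fid);
  xgboost::utils::IIterator<xgboost::ColBatch> *iter = fmat->ColIterator(fset);
  double sum = 0.0;
  while (iter->Next()) {
    const xgboost::ColBatch &batch = iter->Value();
    for (size_t i = 0; i < batch.size; ++i) {
      xgboost::ColBatch::Inst col = batch[i];   // entries of one column, sorted by fvalue
      for (xgboost::bst_uint j = 0; j < col.length; ++j) {
        sum += col[j].fvalue;                   // col[j].index is the row id
      }
    }
  }
  return sum;
}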

View File

@@ -18,8 +18,7 @@ namespace gbm {
  * \brief gradient boosted linear model
  * \tparam FMatrix the data type updater taking
  */
-template<typename FMatrix>
-class GBLinear : public IGradBooster<FMatrix> {
+class GBLinear : public IGradBooster {
  public:
   virtual ~GBLinear(void) {
   }
@@ -41,13 +40,12 @@ class GBLinear : public IGradBooster<FMatrix> {
   virtual void InitModel(void) {
     model.InitModel();
   }
-  virtual void DoBoost(const FMatrix &fmat,
+  virtual void DoBoost(IFMatrix *p_fmat,
                        const BoosterInfo &info,
                        std::vector<bst_gpair> *in_gpair) {
-    this->InitFeatIndex(fmat);
     std::vector<bst_gpair> &gpair = *in_gpair;
     const int ngroup = model.param.num_output_group;
-    const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
+    const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
     // for all the output group
     for (int gid = 0; gid < ngroup; ++gid) {
       double sum_grad = 0.0, sum_hess = 0.0;
@@ -72,42 +70,46 @@
        }
      }
    }
-    // number of features
-    const bst_omp_uint nfeat = static_cast<bst_omp_uint>(feat_index.size());
-    #pragma omp parallel for schedule(static)
-    for (bst_omp_uint i = 0; i < nfeat; ++i) {
-      const bst_uint fid = feat_index[i];
-      for (int gid = 0; gid < ngroup; ++gid) {
-        double sum_grad = 0.0, sum_hess = 0.0;
-        for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
-          const float v = it.fvalue();
-          bst_gpair &p = gpair[it.rindex() * ngroup + gid];
-          if (p.hess < 0.0f) continue;
-          sum_grad += p.grad * v;
-          sum_hess += p.hess * v * v;
-        }
-        float &w = model[fid][gid];
-        bst_float dw = static_cast<bst_float>(param.learning_rate * param.CalcDelta(sum_grad, sum_hess, w));
-        w += dw;
-        // update grad value
-        for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
-          bst_gpair &p = gpair[it.rindex() * ngroup + gid];
-          if (p.hess < 0.0f) continue;
-          p.grad += p.hess * it.fvalue() * dw;
-        }
-      }
-    }
-  }
+    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
+    while (iter->Next()) {
+      // number of features
+      const ColBatch &batch = iter->Value();
+      const bst_omp_uint nfeat = static_cast<bst_omp_uint>(batch.size);
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < nfeat; ++i) {
+        const bst_uint fid = batch.col_index[i];
+        ColBatch::Inst col = batch[i];
+        for (int gid = 0; gid < ngroup; ++gid) {
+          double sum_grad = 0.0, sum_hess = 0.0;
+          for (bst_uint j = 0; j < col.length; ++j) {
+            const float v = col[j].fvalue;
+            bst_gpair &p = gpair[col[j].index * ngroup + gid];
+            if (p.hess < 0.0f) continue;
+            sum_grad += p.grad * v;
+            sum_hess += p.hess * v * v;
+          }
+          float &w = model[fid][gid];
+          bst_float dw = static_cast<bst_float>(param.learning_rate * param.CalcDelta(sum_grad, sum_hess, w));
+          w += dw;
+          // update grad value
+          for (bst_uint j = 0; j < col.length; ++j) {
+            bst_gpair &p = gpair[col[j].index * ngroup + gid];
+            if (p.hess < 0.0f) continue;
+            p.grad += p.hess * col[j].fvalue * dw;
+          }
+        }
+      }
+    }
+  }
-  virtual void Predict(const FMatrix &fmat,
+  virtual void Predict(IFMatrix *p_fmat,
                        int64_t buffer_offset,
                        const BoosterInfo &info,
                        std::vector<float> *out_preds) {
     std::vector<float> &preds = *out_preds;
     preds.resize(0);
     // start collecting the prediction
-    utils::IIterator<RowBatch> *iter = fmat.RowIterator();
+    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
+    iter->BeforeFirst();
     const int ngroup = model.param.num_output_group;
     while (iter->Next()) {
       const RowBatch &batch = iter->Value();
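The rewritten DoBoost is the same coordinate-descent pass, just expressed over ColBatch instead of per-column iterators: for a column with entries x_i it accumulates sum_grad = sum_i g_i * x_i and sum_hess = sum_i h_i * x_i^2, moves the weight by dw = learning_rate * CalcDelta(sum_grad, sum_hess, w), and then replays the same column to keep the buffered gradients consistent with the new weight, g_i <- g_i + h_i * x_i * dw, so features processed later in the sweep see the effect of this update.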
@@ -134,18 +136,6 @@ class GBLinear : public IGradBooster<FMatrix> {
   }
  protected:
-  inline void InitFeatIndex(const FMatrix &fmat) {
-    if (feat_index.size() != 0) return;
-    // initialize feature index
-    unsigned ncol = static_cast<unsigned>(fmat.NumCol());
-    feat_index.reserve(ncol);
-    for (unsigned i = 0; i < ncol; ++i) {
-      if (fmat.GetColSize(i) != 0) {
-        feat_index.push_back(i);
-      }
-    }
-    random::Shuffle(feat_index);
-  }
   inline void Pred(const RowBatch::Inst &inst, float *preds) {
     for (int gid = 0; gid < model.param.num_output_group; ++gid) {
       float psum = model.bias()[gid];

View File

@@ -7,6 +7,7 @@
  */
 #include <vector>
 #include "../data.h"
+#include "../utils/io.h"
 #include "../utils/fmap.h"
 namespace xgboost {
@@ -14,9 +15,7 @@ namespace xgboost {
 namespace gbm {
 /*!
  * \brief interface of gradient boosting model
- * \tparam FMatrix the data type updater taking
  */
-template<typename FMatrix>
 class IGradBooster {
  public:
   /*!
@@ -41,17 +40,17 @@ class IGradBooster {
   virtual void InitModel(void) = 0;
   /*!
    * \brief perform update to the model (boosting)
-   * \param fmat feature matrix that provides access to features
+   * \param p_fmat feature matrix that provides access to features
    * \param info meta information about training
    * \param in_gpair address of the gradient pair statistics of the data;
    *        the booster may change content of gpair
    */
-  virtual void DoBoost(const FMatrix &fmat,
+  virtual void DoBoost(IFMatrix *p_fmat,
                        const BoosterInfo &info,
                        std::vector<bst_gpair> *in_gpair) = 0;
   /*!
    * \brief generate predictions for given feature matrix
-   * \param fmat feature matrix
+   * \param p_fmat feature matrix
    * \param buffer_offset buffer index offset of these instances, if equals -1
    *        this means we do not have buffer index allocated to the gbm;
    *        a buffer index is assigned to each instance that requires repetitive prediction
@@ -59,7 +58,7 @@ class IGradBooster {
    * \param info extra side information that may be needed for prediction
    * \param out_preds output vector to hold the predictions
    */
-  virtual void Predict(const FMatrix &fmat,
+  virtual void Predict(IFMatrix *p_fmat,
                        int64_t buffer_offset,
                        const BoosterInfo &info,
                        std::vector<float> *out_preds) = 0;
@@ -73,21 +72,11 @@ class IGradBooster {
   // destructor
   virtual ~IGradBooster(void){}
 };
-} // namespace gbm
-} // namespace xgboost
-
-#include "gbtree-inl.hpp"
-#include "gblinear-inl.hpp"
-
-namespace xgboost {
-namespace gbm {
-template<typename FMatrix>
-inline IGradBooster<FMatrix>* CreateGradBooster(const char *name) {
-  if (!strcmp("gbtree", name)) return new GBTree<FMatrix>();
-  if (!strcmp("gblinear", name)) return new GBLinear<FMatrix>();
-  utils::Error("unknown booster type: %s", name);
-  return NULL;
-}
+/*!
+ * \brief create a gradient booster from given name
+ * \param name name of gradient booster
+ */
+IGradBooster* CreateGradBooster(const char *name);
 } // namespace gbm
 } // namespace xgboost
 #endif // XGBOOST_GBM_GBM_H_
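The factory is now only declared here; its definition presumably lives in the new src/gbm/gbm.cpp that the Makefile compiles into gbm.o (that file is not part of this diff). A sketch of what it would contain, mirroring the removed inline version with the template parameter dropped:

#include <cstring>
#include "./gbm.h"
#include "./gbtree-inl.hpp"
#include "./gblinear-inl.hpp"
namespace xgboost {
namespace gbm {
// dispatch by name, same logic as the deleted header-inline factory
IGradBooster* CreateGradBooster(const char *name) {
  if (!strcmp("gbtree", name)) return new GBTree();
  if (!strcmp("gblinear", name)) return new GBLinear();
  utils::Error("unknown booster type: %s", name);
  return NULL;
}
}  // namespace gbm
}  // namespace xgboost

Moving the dispatch out of the header also removes the header-only dependency of gbm.h on the concrete booster implementations.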

View File

@@ -9,16 +9,15 @@
 #include <utility>
 #include <string>
 #include "./gbm.h"
+#include "../utils/omp.h"
 #include "../tree/updater.h"
 namespace xgboost {
 namespace gbm {
 /*!
  * \brief gradient boosted tree
- * \tparam FMatrix the data type updater taking
  */
-template<typename FMatrix>
-class GBTree : public IGradBooster<FMatrix> {
+class GBTree : public IGradBooster {
  public:
   virtual ~GBTree(void) {
     this->Clear();
@@ -82,12 +81,12 @@ class GBTree : public IGradBooster<FMatrix> {
     utils::Assert(mparam.num_trees == 0, "GBTree: model already initialized");
     utils::Assert(trees.size() == 0, "GBTree: model already initialized");
   }
-  virtual void DoBoost(const FMatrix &fmat,
+  virtual void DoBoost(IFMatrix *p_fmat,
                        const BoosterInfo &info,
                        std::vector<bst_gpair> *in_gpair) {
     const std::vector<bst_gpair> &gpair = *in_gpair;
     if (mparam.num_output_group == 1) {
-      this->BoostNewTrees(gpair, fmat, info, 0);
+      this->BoostNewTrees(gpair, p_fmat, info, 0);
     } else {
       const int ngroup = mparam.num_output_group;
       utils::Check(gpair.size() % ngroup == 0,
@@ -99,11 +98,11 @@ class GBTree : public IGradBooster<FMatrix> {
         for (bst_omp_uint i = 0; i < nsize; ++i) {
           tmp[i] = gpair[i * ngroup + gid];
         }
-        this->BoostNewTrees(tmp, fmat, info, gid);
+        this->BoostNewTrees(tmp, p_fmat, info, gid);
       }
     }
   }
-  virtual void Predict(const FMatrix &fmat,
+  virtual void Predict(IFMatrix *p_fmat,
                        int64_t buffer_offset,
                        const BoosterInfo &info,
                        std::vector<float> *out_preds) {
@@ -121,7 +120,7 @@ class GBTree : public IGradBooster<FMatrix> {
     const size_t stride = info.num_row * mparam.num_output_group;
     preds.resize(stride * (mparam.size_leaf_vector+1));
     // start collecting the prediction
-    utils::IIterator<RowBatch> *iter = fmat.RowIterator();
+    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
     iter->BeforeFirst();
     while (iter->Next()) {
       const RowBatch &batch = iter->Value();
@@ -172,7 +171,7 @@ class GBTree : public IGradBooster<FMatrix> {
     char *pstr;
     pstr = strtok(&tval[0], ",");
     while (pstr != NULL) {
-      updaters.push_back(tree::CreateUpdater<FMatrix>(pstr));
+      updaters.push_back(tree::CreateUpdater(pstr));
       for (size_t j = 0; j < cfg.size(); ++j) {
         // set parameters
         updaters.back()->SetParam(cfg[j].first.c_str(), cfg[j].second.c_str());
@@ -183,7 +182,7 @@ class GBTree : public IGradBooster<FMatrix> {
   }
   // do group specific boost
   inline void BoostNewTrees(const std::vector<bst_gpair> &gpair,
-                            const FMatrix &fmat,
+                            IFMatrix *p_fmat,
                             const BoosterInfo &info,
                             int bst_group) {
     this->InitUpdater();
@@ -198,7 +197,7 @@ class GBTree : public IGradBooster<FMatrix> {
     }
     // update the trees
     for (size_t i = 0; i < updaters.size(); ++i) {
-      updaters[i]->Update(gpair, fmat, info, new_trees);
+      updaters[i]->Update(gpair, p_fmat, info, new_trees);
     }
     // push back to model
     for (size_t i = 0; i < new_trees.size(); ++i) {
@@ -361,7 +360,7 @@ class GBTree : public IGradBooster<FMatrix> {
   // temporal storage per thread
   std::vector<tree::RegTree::FVec> thread_temp;
   // the updaters that can be applied to each of the trees
-  std::vector< tree::IUpdater<FMatrix>* > updaters;
+  std::vector<tree::IUpdater*> updaters;
 };
 } // namespace gbm

View File

@@ -13,7 +13,7 @@ namespace xgboost {
 /*! \brief namespace related to data format */
 namespace io {
 /*! \brief DMatrix object that the I/O module can save/load */
-typedef learner::DMatrix<FMatrixS> DataMatrix;
+typedef learner::DMatrix DataMatrix;
 /*!
  * \brief load DataMatrix from stream
  * \param fname file name to be loaded

View File

@@ -229,7 +229,7 @@ class DMatrixSimple : public DataMatrix {
     at_first_ = false;
     batch_.size = parent_->row_ptr_.size() - 1;
     batch_.base_rowid = 0;
-    batch_.row_ptr = &parent_->row_ptr_[0];
+    batch_.ind_ptr = &parent_->row_ptr_[0];
     batch_.data_ptr = &parent_->row_data_[0];
     return true;
   }
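The one-shot iterator pattern wired up here is small enough to sketch standalone. Assuming utils::IIterator's usual BeforeFirst/Next/Value contract and DMatrixSimple-style in-memory CSR members (the class name below is illustrative):

// minimal one-shot RowBatch iterator over in-memory CSR storage
class OneBatchIter : public utils::IIterator<RowBatch> {
 public:
  OneBatchIter(const std::vector<size_t> *row_ptr,
               const std::vector<RowBatch::Entry> *data)
      : row_ptr_(row_ptr), data_(data), at_first_(true) {}
  virtual void BeforeFirst(void) { at_first_ = true; }
  virtual bool Next(void) {
    if (!at_first_) return false;   // the whole matrix is a single batch
    at_first_ = false;
    batch_.size = row_ptr_->size() - 1;
    batch_.base_rowid = 0;
    batch_.ind_ptr = &(*row_ptr_)[0];   // array[size+1] of row offsets
    batch_.data_ptr = &(*data_)[0];     // array[ind_ptr[size]] of entries
    return true;
  }
  virtual const RowBatch &Value(void) const { return batch_; }
 private:
  const std::vector<size_t> *row_ptr_;
  const std::vector<RowBatch::Entry> *data_;
  bool at_first_;
  RowBatch batch_;
};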

View File

@@ -8,7 +8,7 @@
  */
 #include <vector>
 #include "../data.h"
+#include "../utils/io.h"
 namespace xgboost {
 namespace learner {
 /*!
@@ -142,7 +142,6 @@ struct MetaInfo {
  * \brief data object used for learning
- * \tparam FMatrix type of feature data source
  */
-template<typename FMatrix>
 struct DMatrix {
/*! /*!
* \brief magic number associated with this object * \brief magic number associated with this object
@@ -152,7 +151,7 @@ struct DMatrix {
   /*! \brief meta information about the dataset */
   MetaInfo info;
   /*! \brief feature matrix about data content */
-  FMatrix fmat;
+  IFMatrix *fmat;
   /*!
    * \brief cache pointer to verify if the data structure is cached in some learner;
    *        used to verify if DMatrix is cached
@@ -161,7 +160,9 @@ struct DMatrix {
   /*! \brief default constructor */
   explicit DMatrix(int magic) : magic(magic), cache_learner_ptr_(NULL) {}
   // virtual destructor
-  virtual ~DMatrix(void){}
+  virtual ~DMatrix(void){
+    delete fmat;
+  }
 };
 } // namespace learner
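Because fmat is now an owning pointer (note the delete in the destructor), whichever module materializes a DMatrix, e.g. the io layer, is expected to heap-allocate the concrete IFMatrix implementation and hand it over.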

View File

@@ -21,7 +21,6 @@ namespace learner {
  * \brief learner that performs gradient boosting on a specific objective function,
  * and does training and prediction
  */
-template<typename FMatrix>
 class BoostLearner {
  public:
   BoostLearner(void) {
@@ -44,7 +43,7 @@ class BoostLearner {
    *  data matrices to continue training, otherwise it will cause an error
    * \param mats array of pointers to matrices whose prediction results need to be cached
    */
-  inline void SetCacheData(const std::vector<DMatrix<FMatrix>*>& mats) {
+  inline void SetCacheData(const std::vector<DMatrix*>& mats) {
     // estimate feature bound
     unsigned num_feature = 0;
     // assign buffer index
@@ -158,15 +157,15 @@ class BoostLearner {
    *  if not, initialize it
    * \param p_train pointer to the matrix used by training
    */
-  inline void CheckInit(DMatrix<FMatrix> *p_train) {
-    p_train->fmat.InitColAccess(prob_buffer_row);
+  inline void CheckInit(DMatrix *p_train) {
+    p_train->fmat->InitColAccess(prob_buffer_row);
   }
   /*!
    * \brief update the model for one iteration
    * \param iter current iteration number
    * \param p_train pointer to the data matrix
    */
-  inline void UpdateOneIter(int iter, const DMatrix<FMatrix> &train) {
+  inline void UpdateOneIter(int iter, const DMatrix &train) {
     this->PredictRaw(train, &preds_);
     obj_->GetGradient(preds_, train.info, iter, &gpair_);
     gbm_->DoBoost(train.fmat, train.info.info, &gpair_);
@@ -179,7 +178,7 @@ class BoostLearner {
    * \return a string corresponding to the evaluation result
    */
   inline std::string EvalOneIter(int iter,
-                                 const std::vector<const DMatrix<FMatrix>*> &evals,
+                                 const std::vector<const DMatrix*> &evals,
                                  const std::vector<std::string> &evname) {
     std::string res;
     char tmp[256];
@@ -198,7 +197,7 @@ class BoostLearner {
    * \param metric name of metric
    * \return a pair of <evaluation name, result>
    */
-  std::pair<std::string, float> Evaluate(const DMatrix<FMatrix> &data, std::string metric) {
+  std::pair<std::string, float> Evaluate(const DMatrix &data, std::string metric) {
     if (metric == "auto") metric = obj_->DefaultEvalMetric();
     IEvaluator *ev = CreateEvaluator(metric.c_str());
     this->PredictRaw(data, &preds_);
@@ -213,7 +212,7 @@ class BoostLearner {
    * \param output_margin whether to only predict margin value instead of transformed prediction
    * \param out_preds output vector that stores the prediction
    */
-  inline void Predict(const DMatrix<FMatrix> &data,
+  inline void Predict(const DMatrix &data,
                       bool output_margin,
                       std::vector<float> *out_preds) const {
     this->PredictRaw(data, out_preds);
@@ -235,7 +234,7 @@ class BoostLearner {
     if (obj_ != NULL) return;
     utils::Assert(gbm_ == NULL, "GBM and obj should be NULL");
     obj_ = CreateObjFunction(name_obj_.c_str());
-    gbm_ = gbm::CreateGradBooster<FMatrix>(name_gbm_.c_str());
+    gbm_ = gbm::CreateGradBooster(name_gbm_.c_str());
     for (size_t i = 0; i < cfg_.size(); ++i) {
       obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
       gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
@@ -247,7 +246,7 @@ class BoostLearner {
    * \param data training data matrix
    * \param out_preds output vector that stores the prediction
    */
-  inline void PredictRaw(const DMatrix<FMatrix> &data,
+  inline void PredictRaw(const DMatrix &data,
                          std::vector<float> *out_preds) const {
     gbm_->Predict(data.fmat, this->FindBufferOffset(data),
                   data.info.info, out_preds);
@@ -307,7 +306,7 @@ class BoostLearner {
   // model parameter
   ModelParam mparam;
   // gbm model that backs everything
-  gbm::IGradBooster<FMatrix> *gbm_;
+  gbm::IGradBooster *gbm_;
   // name of gbm model used for training
   std::string name_gbm_;
   // objective function
@@ -324,14 +323,14 @@ class BoostLearner {
  private:
   // cache entry object that helps handle feature caching
   struct CacheEntry {
-    const DMatrix<FMatrix> *mat_;
+    const DMatrix *mat_;
     size_t buffer_offset_;
     size_t num_row_;
-    CacheEntry(const DMatrix<FMatrix> *mat, size_t buffer_offset, size_t num_row)
+    CacheEntry(const DMatrix *mat, size_t buffer_offset, size_t num_row)
         : mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row) {}
   };
   // find internal buffer offset for certain matrix; if it does not exist, return -1
-  inline int64_t FindBufferOffset(const DMatrix<FMatrix> &mat) const {
+  inline int64_t FindBufferOffset(const DMatrix &mat) const {
     for (size_t i = 0; i < cache_.size(); ++i) {
       if (cache_[i].mat_ == &mat && mat.cache_learner_ptr_ == this) {
         if (cache_[i].num_row_ == mat.info.num_row()) {

View File

@@ -14,9 +14,7 @@ namespace xgboost {
 namespace tree {
 /*!
  * \brief interface of tree update module, that performs update of a tree
- * \tparam FMatrix the data type updater taking
  */
-template<typename FMatrix>
 class IUpdater {
  public:
   /*!
@@ -28,7 +26,7 @@ class IUpdater {
   /*!
    * \brief perform update to the tree models
    * \param gpair the gradient pair statistics of the data
-   * \param fmat feature matrix that provides access to features
+   * \param p_fmat feature matrix that provides access to features
    * \param info extra side information that may be needed, such as root index
    * \param trees pointer to the trees to be updated; the updater will change the content of the trees
    *   note: all the trees in the vector are updated, with the same statistics,
@@ -36,36 +34,18 @@ class IUpdater {
    *   there can be multiple trees when we train random forest style model
    */
   virtual void Update(const std::vector<bst_gpair> &gpair,
-                      const FMatrix &fmat,
+                      IFMatrix *p_fmat,
                       const BoosterInfo &info,
                       const std::vector<RegTree*> &trees) = 0;
   // destructor
   virtual ~IUpdater(void) {}
 };
-} // namespace tree
-} // namespace xgboost
-
-#include "./updater_prune-inl.hpp"
-#include "./updater_refresh-inl.hpp"
-#include "./updater_colmaker-inl.hpp"
-
-namespace xgboost {
-namespace tree {
 /*!
  * \brief create a updater based on name
  * \param name name of updater
  * \return the updater instance
  */
-template<typename FMatrix>
-inline IUpdater<FMatrix>* CreateUpdater(const char *name) {
-  if (!strcmp(name, "prune")) return new TreePruner<FMatrix>();
-  if (!strcmp(name, "refresh")) return new TreeRefresher<FMatrix, GradStats>();
-  if (!strcmp(name, "grow_colmaker")) return new ColMaker<FMatrix, GradStats>();
-  utils::Error("unknown updater:%s", name);
-  return NULL;
-}
+IUpdater* CreateUpdater(const char *name);
 } // namespace tree
 } // namespace xgboost
 #endif // XGBOOST_TREE_UPDATER_H_
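As with the gbm factory, CreateUpdater is now only declared; the new src/tree/updater.cpp (the updater.o Makefile target, not shown in this diff) presumably carries the definition. A sketch of that translation unit, following the removed inline dispatch with the FMatrix template parameter dropped:

#include <cstring>
#include "./updater.h"
#include "./updater_prune-inl.hpp"
#include "./updater_refresh-inl.hpp"
#include "./updater_colmaker-inl.hpp"
namespace xgboost {
namespace tree {
// name-based dispatch, same cases as the deleted header-inline factory
IUpdater* CreateUpdater(const char *name) {
  if (!strcmp(name, "prune")) return new TreePruner();
  if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
  if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
  utils::Error("unknown updater:%s", name);
  return NULL;
}
}  // namespace tree
}  // namespace xgboost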

View File

@@ -15,8 +15,8 @@
 namespace xgboost {
 namespace tree {
 /*! \brief updater that grows a tree by enumerating splits over sorted columns */
-template<typename FMatrix, typename TStats>
-class ColMaker: public IUpdater<FMatrix> {
+template<typename TStats>
+class ColMaker: public IUpdater {
  public:
   virtual ~ColMaker(void) {}
   // set training parameter
@@ -24,7 +24,7 @@ class ColMaker: public IUpdater<FMatrix> {
     param.SetParam(name, val);
   }
   virtual void Update(const std::vector<bst_gpair> &gpair,
-                      const FMatrix &fmat,
+                      IFMatrix *p_fmat,
                       const BoosterInfo &info,
                       const std::vector<RegTree*> &trees) {
     TStats::CheckInfo(info);
@@ -34,7 +34,7 @@ class ColMaker: public IUpdater<FMatrix> {
     // build tree
     for (size_t i = 0; i < trees.size(); ++i) {
       Builder builder(param);
-      builder.Update(gpair, fmat, info, trees[i]);
+      builder.Update(gpair, p_fmat, info, trees[i]);
     }
     param.learning_rate = lr;
   }
@@ -77,16 +77,16 @@ class ColMaker: public IUpdater<FMatrix> {
     explicit Builder(const TrainParam &param) : param(param) {}
     // update one tree, growing
     virtual void Update(const std::vector<bst_gpair> &gpair,
-                        const FMatrix &fmat,
+                        IFMatrix *p_fmat,
                         const BoosterInfo &info,
                         RegTree *p_tree) {
-      this->InitData(gpair, fmat, info.root_index, *p_tree);
-      this->InitNewNode(qexpand, gpair, fmat, info, *p_tree);
+      this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
+      this->InitNewNode(qexpand, gpair, *p_fmat, info, *p_tree);
       for (int depth = 0; depth < param.max_depth; ++depth) {
-        this->FindSplit(depth, this->qexpand, gpair, fmat, info, p_tree);
-        this->ResetPosition(this->qexpand, fmat, *p_tree);
+        this->FindSplit(depth, this->qexpand, gpair, p_fmat, info, p_tree);
+        this->ResetPosition(this->qexpand, p_fmat, *p_tree);
         this->UpdateQueueExpand(*p_tree, &this->qexpand);
-        this->InitNewNode(qexpand, gpair, fmat, info, *p_tree);
+        this->InitNewNode(qexpand, gpair, *p_fmat, info, *p_tree);
         // if nothing left to be expanded, break
         if (qexpand.size() == 0) break;
       }
@@ -107,7 +107,7 @@ class ColMaker: public IUpdater<FMatrix> {
    private:
     // initialize temp data structure
     inline void InitData(const std::vector<bst_gpair> &gpair,
-                         const FMatrix &fmat,
+                         const IFMatrix &fmat,
                          const std::vector<unsigned> &root_index, const RegTree &tree) {
       utils::Assert(tree.param.num_nodes == tree.param.num_roots, "ColMaker: can only grow new tree");
       const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
@@ -137,8 +137,7 @@ class ColMaker: public IUpdater<FMatrix> {
             if (random::SampleBinary(param.subsample) == 0) position[ridx] = -1;
           }
         }
       }
-      }
       {
         // initialize feature index
         unsigned ncol = static_cast<unsigned>(fmat.NumCol());
@@ -175,7 +174,7 @@ class ColMaker: public IUpdater<FMatrix> {
     /*! \brief initialize the base_weight, root_gain, and NodeEntry for all the new nodes in qexpand */
     inline void InitNewNode(const std::vector<int> &qexpand,
                             const std::vector<bst_gpair> &gpair,
-                            const FMatrix &fmat,
+                            const IFMatrix &fmat,
                             const BoosterInfo &info,
                             const RegTree &tree) {
       { // setup statistics space for each tree node
@@ -222,24 +221,25 @@ class ColMaker: public IUpdater<FMatrix> {
       qexpand = newnodes;
     }
     // enumerate the split values of a specific feature
-    template<typename Iter>
-    inline void EnumerateSplit(Iter it, unsigned fid,
+    inline void EnumerateSplit(const ColBatch::Entry *begin,
+                               const ColBatch::Entry *end,
+                               int d_step,
+                               bst_uint fid,
                                const std::vector<bst_gpair> &gpair,
                                const BoosterInfo &info,
-                               std::vector<ThreadEntry> &temp,
-                               bool is_forward_search) {
+                               std::vector<ThreadEntry> &temp) {
       // clear all the temp statistics
       for (size_t j = 0; j < qexpand.size(); ++j) {
        temp[qexpand[j]].stats.Clear();
       }
       // left statistics
       TStats c(param);
-      while (it.Next()) {
-        const bst_uint ridx = it.rindex();
+      for (const ColBatch::Entry *it = begin; it != end; it += d_step) {
+        const bst_uint ridx = it->index;
         const int nid = position[ridx];
         if (nid < 0) continue;
         // start working
-        const float fvalue = it.fvalue();
+        const float fvalue = it->fvalue;
         // get the statistics of nid
         ThreadEntry &e = temp[nid];
         // test if first hit, this is fine, because we set 0 during init
@@ -252,7 +252,7 @@ class ColMaker: public IUpdater<FMatrix> {
           c.SetSubstract(snode[nid].stats, e.stats);
           if (c.sum_hess >= param.min_child_weight) {
             bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
-            e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, !is_forward_search);
+            e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, d_step == -1);
           }
         }
         // update the statistics
@@ -267,38 +267,46 @@ class ColMaker: public IUpdater<FMatrix> {
           c.SetSubstract(snode[nid].stats, e.stats);
           if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) {
             bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
-            const float delta = is_forward_search ? rt_eps : -rt_eps;
-            e.best.Update(loss_chg, fid, e.last_fvalue + delta, !is_forward_search);
+            const float delta = d_step == +1 ? rt_eps : -rt_eps;
+            e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1);
           }
         }
       }
     }
     // find splits at current level, do split per level
-    inline void FindSplit(int depth, const std::vector<int> &qexpand,
+    inline void FindSplit(int depth,
+                          const std::vector<int> &qexpand,
                           const std::vector<bst_gpair> &gpair,
-                          const FMatrix &fmat,
+                          IFMatrix *p_fmat,
                           const BoosterInfo &info,
                           RegTree *p_tree) {
-      std::vector<unsigned> feat_set = feat_index;
+      std::vector<bst_uint> feat_set = feat_index;
       if (param.colsample_bylevel != 1.0f) {
         random::Shuffle(feat_set);
         unsigned n = static_cast<unsigned>(param.colsample_bylevel * feat_index.size());
         utils::Check(n > 0, "colsample_bylevel is too small that no feature can be included");
         feat_set.resize(n);
       }
-      // start enumeration
-      const bst_omp_uint nsize = static_cast<bst_omp_uint>(feat_set.size());
-      #if defined(_OPENMP)
-      const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
-      #endif
-      #pragma omp parallel for schedule(dynamic, batch_size)
-      for (bst_omp_uint i = 0; i < nsize; ++i) {
-        const unsigned fid = feat_set[i];
-        const int tid = omp_get_thread_num();
-        if (param.need_forward_search(fmat.GetColDensity(fid))) {
-          this->EnumerateSplit(fmat.GetSortedCol(fid), fid, gpair, info, stemp[tid], true);
-        }
-        if (param.need_backward_search(fmat.GetColDensity(fid))) {
-          this->EnumerateSplit(fmat.GetReverseSortedCol(fid), fid, gpair, info, stemp[tid], false);
-        }
-      }
+      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(feat_set);
+      while (iter->Next()) {
+        const ColBatch &batch = iter->Value();
+        // start enumeration
+        const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+        #if defined(_OPENMP)
+        const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
+        #endif
+        #pragma omp parallel for schedule(dynamic, batch_size)
+        for (bst_omp_uint i = 0; i < nsize; ++i) {
+          const bst_uint fid = batch.col_index[i];
+          const int tid = omp_get_thread_num();
+          const ColBatch::Inst c = batch[i];
+          if (param.need_forward_search(p_fmat->GetColDensity(fid))) {
+            this->EnumerateSplit(c.data, c.data + c.length, +1,
+                                 fid, gpair, info, stemp[tid]);
+          }
+          if (param.need_backward_search(p_fmat->GetColDensity(fid))) {
+            this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1,
                                 fid, gpair, info, stemp[tid]);
+          }
+        }
+      }
      // after this, each thread's stemp will hold the best candidates; aggregate results
@@ -318,8 +326,8 @@ class ColMaker: public IUpdater<FMatrix> {
       }
     }
     // reset position of each data point after the split is created in the tree
-    inline void ResetPosition(const std::vector<int> &qexpand, const FMatrix &fmat, const RegTree &tree) {
-      const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
+    inline void ResetPosition(const std::vector<int> &qexpand, IFMatrix *p_fmat, const RegTree &tree) {
+      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
       // step 1, set default direct nodes to default, and leaf nodes to -1
       const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
       #pragma omp parallel for schedule(static)
@@ -343,22 +351,28 @@ class ColMaker: public IUpdater<FMatrix> {
       }
       std::sort(fsplits.begin(), fsplits.end());
       fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
-      // start putting things into the right place
-      const bst_omp_uint nfeats = static_cast<bst_omp_uint>(fsplits.size());
-      #pragma omp parallel for schedule(dynamic, 1)
-      for (bst_omp_uint i = 0; i < nfeats; ++i) {
-        const unsigned fid = fsplits[i];
-        for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
-          const bst_uint ridx = it.rindex();
-          int nid = position[ridx];
-          if (nid == -1) continue;
-          // go back to parent, correct those who are not default
-          nid = tree[nid].parent();
-          if (tree[nid].split_index() == fid) {
-            if (it.fvalue() < tree[nid].split_cond()) {
-              position[ridx] = tree[nid].cleft();
-            } else {
-              position[ridx] = tree[nid].cright();
-            }
-          }
-        }
-      }
+      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
+      while (iter->Next()) {
+        const ColBatch &batch = iter->Value();
+        for (size_t i = 0; i < batch.size; ++i) {
+          ColBatch::Inst col = batch[i];
+          const bst_uint fid = batch.col_index[i];
+          const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
+          #pragma omp parallel for schedule(static)
+          for (bst_omp_uint j = 0; j < ndata; ++j) {
+            const bst_uint ridx = col[j].index;
+            const float fvalue = col[j].fvalue;
+            int nid = position[ridx];
+            if (nid == -1) continue;
+            // go back to parent, correct those who are not default
+            nid = tree[nid].parent();
+            if (tree[nid].split_index() == fid) {
+              if (fvalue < tree[nid].split_cond()) {
+                position[ridx] = tree[nid].cleft();
+              } else {
+                position[ridx] = tree[nid].cright();
+              }
+            }
+          }
+        }
+      }
@@ -369,7 +383,7 @@ class ColMaker: public IUpdater<FMatrix> {
     // number of omp threads used during training
     int nthread;
     // Per feature: shuffled index of each feature
-    std::vector<unsigned> feat_index;
+    std::vector<bst_uint> feat_index;
     // Instance Data: current node position in the tree of each instance
     std::vector<int> position;
     // PerThread x PerTreeNode: statistics for per thread construction
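Worth noting from the diff above: the single pointer-stepping EnumerateSplit now covers both scan directions. The forward pass calls EnumerateSplit(c.data, c.data + c.length, +1, ...), the backward pass (which replaced the old GetReverseSortedCol iterator) calls EnumerateSplit(c.data + c.length - 1, c.data - 1, -1, ...), and d_step == -1 doubles as the default-direction flag that used to be !is_forward_search.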

View File

@@ -12,8 +12,7 @@
 namespace xgboost {
 namespace tree {
 /*! \brief pruner that prunes a tree after growing finishes */
-template<typename FMatrix>
-class TreePruner: public IUpdater<FMatrix> {
+class TreePruner: public IUpdater {
  public:
   virtual ~TreePruner(void) {}
   // set training parameter
@@ -23,7 +22,7 @@ class TreePruner: public IUpdater<FMatrix> {
   }
   // update the tree, do pruning
   virtual void Update(const std::vector<bst_gpair> &gpair,
-                      const FMatrix &fmat,
+                      IFMatrix *p_fmat,
                       const BoosterInfo &info,
                       const std::vector<RegTree*> &trees) {
     // rescale learning rate according to size of trees
@@ -75,7 +74,6 @@ class TreePruner: public IUpdater<FMatrix> {
   // training parameter
   TrainParam param;
 };
-
 } // namespace tree
 } // namespace xgboost
 #endif // XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_

View File

@@ -9,12 +9,13 @@
 #include <limits>
 #include "./param.h"
 #include "./updater.h"
+#include "../utils/omp.h"
 namespace xgboost {
 namespace tree {
 /*! \brief updater that refreshes the statistics stored in a tree */
-template<typename FMatrix, typename TStats>
-class TreeRefresher: public IUpdater<FMatrix> {
+template<typename TStats>
+class TreeRefresher: public IUpdater {
  public:
   virtual ~TreeRefresher(void) {}
   // set training parameter
@@ -23,7 +24,7 @@ class TreeRefresher: public IUpdater<FMatrix> {
   }
   // update the tree, refresh its statistics
   virtual void Update(const std::vector<bst_gpair> &gpair,
-                      const FMatrix &fmat,
+                      IFMatrix *p_fmat,
                       const BoosterInfo &info,
                       const std::vector<RegTree*> &trees) {
     if (trees.size() == 0) return;
@@ -50,7 +51,7 @@ class TreeRefresher: public IUpdater<FMatrix> {
       fvec_temp[tid].Init(trees[0]->param.num_feature);
     }
     // start accumulating statistics
-    utils::IIterator<RowBatch> *iter = fmat.RowIterator();
+    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
     iter->BeforeFirst();
     while (iter->Next()) {
       const RowBatch &batch = iter->Value();

View File

@@ -8,6 +8,7 @@
 #define _CRT_SECURE_NO_WARNINGS
 #include <cstdio>
 #include <cstdarg>
+#include <string>
 #include <cstdlib>
 #ifdef _MSC_VER
 #define fopen64 fopen

View File

@@ -234,7 +234,7 @@ class BoostLearnTask{
   std::vector<io::DataMatrix*> deval;
   std::vector<const io::DataMatrix*> devalall;
   utils::FeatMap fmap;
-  learner::BoostLearner<FMatrixS> learner;
+  learner::BoostLearner learner;
 };
 }