complete refactor of data.h, now relies on iterators to access columns

This commit is contained in:
tqchen@graphlab.com
2014-08-27 17:00:21 -07:00
parent a59f8945dc
commit 605269133e
15 changed files with 216 additions and 492 deletions

View File

@@ -14,9 +14,7 @@ namespace xgboost {
namespace tree {
/*!
* \brief interface of tree update module, that performs update of a tree
* \tparam FMatrix the data type updater taking
*/
template<typename FMatrix>
class IUpdater {
public:
/*!
@@ -28,7 +26,7 @@ class IUpdater {
/*!
* \brief perform update to the tree models
* \param gpair the gradient pair statistics of the data
* \param fmat feature matrix that provide access to features
* \param p_fmat feature matrix that provide access to features
* \param info extra side information that may be needed, such as root index
* \param trees pointer to the trees to be updated, updater will change the content of the tree
* note: all the trees in the vector are updated, with the same statistics,
@@ -36,36 +34,18 @@ class IUpdater {
* there can be multiple trees when we train random forest style model
*/
virtual void Update(const std::vector<bst_gpair> &gpair,
const FMatrix &fmat,
IFMatrix *p_fmat,
const BoosterInfo &info,
const std::vector<RegTree*> &trees) = 0;
// destructor
virtual ~IUpdater(void) {}
};
} // namespace tree
} // namespace xgboost
#include "./updater_prune-inl.hpp"
#include "./updater_refresh-inl.hpp"
#include "./updater_colmaker-inl.hpp"
namespace xgboost {
namespace tree {
/*!
* \brief create an updater based on name
* \param name name of updater
* \return return the updater instance
*/
template<typename FMatrix>
inline IUpdater<FMatrix>* CreateUpdater(const char *name) {
if (!strcmp(name, "prune")) return new TreePruner<FMatrix>();
if (!strcmp(name, "refresh")) return new TreeRefresher<FMatrix, GradStats>();
if (!strcmp(name, "grow_colmaker")) return new ColMaker<FMatrix, GradStats>();
utils::Error("unknown updater:%s", name);
return NULL;
}
IUpdater* CreateUpdater(const char *name);
} // namespace tree
} // namespace xgboost
#endif // XGBOOST_TREE_UPDATER_H_

View File

@@ -15,8 +15,8 @@
namespace xgboost {
namespace tree {
/*! \brief pruner that prunes a tree after growing finishes */
template<typename FMatrix, typename TStats>
class ColMaker: public IUpdater<FMatrix> {
template<typename TStats>
class ColMaker: public IUpdater {
public:
virtual ~ColMaker(void) {}
// set training parameter
@@ -24,7 +24,7 @@ class ColMaker: public IUpdater<FMatrix> {
param.SetParam(name, val);
}
virtual void Update(const std::vector<bst_gpair> &gpair,
const FMatrix &fmat,
IFMatrix *p_fmat,
const BoosterInfo &info,
const std::vector<RegTree*> &trees) {
TStats::CheckInfo(info);
@@ -34,7 +34,7 @@ class ColMaker: public IUpdater<FMatrix> {
// build tree
for (size_t i = 0; i < trees.size(); ++i) {
Builder builder(param);
builder.Update(gpair, fmat, info, trees[i]);
builder.Update(gpair, p_fmat, info, trees[i]);
}
param.learning_rate = lr;
}
@@ -77,16 +77,16 @@ class ColMaker: public IUpdater<FMatrix> {
explicit Builder(const TrainParam &param) : param(param) {}
// update one tree, growing
virtual void Update(const std::vector<bst_gpair> &gpair,
const FMatrix &fmat,
IFMatrix *p_fmat,
const BoosterInfo &info,
RegTree *p_tree) {
this->InitData(gpair, fmat, info.root_index, *p_tree);
this->InitNewNode(qexpand, gpair, fmat, info, *p_tree);
this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
this->InitNewNode(qexpand, gpair, *p_fmat, info, *p_tree);
for (int depth = 0; depth < param.max_depth; ++depth) {
this->FindSplit(depth, this->qexpand, gpair, fmat, info, p_tree);
this->ResetPosition(this->qexpand, fmat, *p_tree);
this->FindSplit(depth, this->qexpand, gpair, p_fmat, info, p_tree);
this->ResetPosition(this->qexpand, p_fmat, *p_tree);
this->UpdateQueueExpand(*p_tree, &this->qexpand);
this->InitNewNode(qexpand, gpair, fmat, info, *p_tree);
this->InitNewNode(qexpand, gpair, *p_fmat, info, *p_tree);
// if nothing is left to expand, break
if (qexpand.size() == 0) break;
}
@@ -107,7 +107,7 @@ class ColMaker: public IUpdater<FMatrix> {
private:
// initialize temp data structure
inline void InitData(const std::vector<bst_gpair> &gpair,
const FMatrix &fmat,
const IFMatrix &fmat,
const std::vector<unsigned> &root_index, const RegTree &tree) {
utils::Assert(tree.param.num_nodes == tree.param.num_roots, "ColMaker: can only grow new tree");
const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
@@ -137,8 +137,7 @@ class ColMaker: public IUpdater<FMatrix> {
if (random::SampleBinary(param.subsample) == 0) position[ridx] = -1;
}
}
}
}
{
// initialize feature index
unsigned ncol = static_cast<unsigned>(fmat.NumCol());
@@ -175,7 +174,7 @@ class ColMaker: public IUpdater<FMatrix> {
/*! \brief initialize the base_weight, root_gain, and NodeEntry for all the new nodes in qexpand */
inline void InitNewNode(const std::vector<int> &qexpand,
const std::vector<bst_gpair> &gpair,
const FMatrix &fmat,
const IFMatrix &fmat,
const BoosterInfo &info,
const RegTree &tree) {
{// setup statistics space for each tree node
@@ -222,24 +221,25 @@ class ColMaker: public IUpdater<FMatrix> {
qexpand = newnodes;
}
// enumerate the split values of specific feature
template<typename Iter>
inline void EnumerateSplit(Iter it, unsigned fid,
inline void EnumerateSplit(const ColBatch::Entry *begin,
const ColBatch::Entry *end,
int d_step,
bst_uint fid,
const std::vector<bst_gpair> &gpair,
const BoosterInfo &info,
std::vector<ThreadEntry> &temp,
bool is_forward_search) {
std::vector<ThreadEntry> &temp) {
// clear all the temp statistics
for (size_t j = 0; j < qexpand.size(); ++j) {
temp[qexpand[j]].stats.Clear();
}
// left statistics
TStats c(param);
while (it.Next()) {
const bst_uint ridx = it.rindex();
for(const ColBatch::Entry *it = begin; it != end; it += d_step) {
const bst_uint ridx = it->index;
const int nid = position[ridx];
if (nid < 0) continue;
// start working
const float fvalue = it.fvalue();
const float fvalue = it->fvalue;
// get the statistics of nid
ThreadEntry &e = temp[nid];
// test if first hit, this is fine, because we set 0 during init
@@ -252,7 +252,7 @@ class ColMaker: public IUpdater<FMatrix> {
c.SetSubstract(snode[nid].stats, e.stats);
if (c.sum_hess >= param.min_child_weight) {
bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, !is_forward_search);
e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, d_step == -1);
}
}
// update the statistics
@@ -267,38 +267,46 @@ class ColMaker: public IUpdater<FMatrix> {
c.SetSubstract(snode[nid].stats, e.stats);
if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) {
bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
const float delta = is_forward_search ? rt_eps : -rt_eps;
e.best.Update(loss_chg, fid, e.last_fvalue + delta, !is_forward_search);
const float delta = d_step == +1 ? rt_eps : -rt_eps;
e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1);
}
}
}
// find splits at current level, do split per level
inline void FindSplit(int depth, const std::vector<int> &qexpand,
inline void FindSplit(int depth,
const std::vector<int> &qexpand,
const std::vector<bst_gpair> &gpair,
const FMatrix &fmat,
IFMatrix *p_fmat,
const BoosterInfo &info,
RegTree *p_tree) {
std::vector<unsigned> feat_set = feat_index;
std::vector<bst_uint> feat_set = feat_index;
if (param.colsample_bylevel != 1.0f) {
random::Shuffle(feat_set);
unsigned n = static_cast<unsigned>(param.colsample_bylevel * feat_index.size());
utils::Check(n > 0, "colsample_bylevel is too small that no feature can be included");
feat_set.resize(n);
}
// start enumeration
const bst_omp_uint nsize = static_cast<bst_omp_uint>(feat_set.size());
#if defined(_OPENMP)
const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
#endif
#pragma omp parallel for schedule(dynamic, batch_size)
for (bst_omp_uint i = 0; i < nsize; ++i) {
const unsigned fid = feat_set[i];
const int tid = omp_get_thread_num();
if (param.need_forward_search(fmat.GetColDensity(fid))) {
this->EnumerateSplit(fmat.GetSortedCol(fid), fid, gpair, info, stemp[tid], true);
}
if (param.need_backward_search(fmat.GetColDensity(fid))) {
this->EnumerateSplit(fmat.GetReverseSortedCol(fid), fid, gpair, info, stemp[tid], false);
utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(feat_set);
while (iter->Next()) {
const ColBatch &batch = iter->Value();
// start enumeration
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#if defined(_OPENMP)
const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
#endif
#pragma omp parallel for schedule(dynamic, batch_size)
for (bst_omp_uint i = 0; i < nsize; ++i) {
const bst_uint fid = batch.col_index[i];
const int tid = omp_get_thread_num();
const ColBatch::Inst c = batch[i];
if (param.need_forward_search(p_fmat->GetColDensity(fid))) {
this->EnumerateSplit(c.data, c.data + c.length, +1,
fid, gpair, info, stemp[tid]);
}
if (param.need_backward_search(p_fmat->GetColDensity(fid))) {
this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1,
fid, gpair, info, stemp[tid]);
}
}
}
// after this each thread's stemp will get the best candidates, aggregate results
@@ -318,8 +326,8 @@ class ColMaker: public IUpdater<FMatrix> {
}
}
// reset position of each data points after split is created in the tree
inline void ResetPosition(const std::vector<int> &qexpand, const FMatrix &fmat, const RegTree &tree) {
const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
inline void ResetPosition(const std::vector<int> &qexpand, IFMatrix *p_fmat, const RegTree &tree) {
const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
// step 1, set default direct nodes to default, and leaf nodes to -1
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
#pragma omp parallel for schedule(static)
@@ -343,22 +351,28 @@ class ColMaker: public IUpdater<FMatrix> {
}
std::sort(fsplits.begin(), fsplits.end());
fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
// start putting things into the right place
const bst_omp_uint nfeats = static_cast<bst_omp_uint>(fsplits.size());
#pragma omp parallel for schedule(dynamic, 1)
for (bst_omp_uint i = 0; i < nfeats; ++i) {
const unsigned fid = fsplits[i];
for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
const bst_uint ridx = it.rindex();
int nid = position[ridx];
if (nid == -1) continue;
// go back to parent, correct those who are not default
nid = tree[nid].parent();
if (tree[nid].split_index() == fid) {
if (it.fvalue() < tree[nid].split_cond()) {
position[ridx] = tree[nid].cleft();
} else {
position[ridx] = tree[nid].cright();
utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
while (iter->Next()) {
const ColBatch &batch = iter->Value();
for (size_t i = 0; i < batch.size; ++i) {
ColBatch::Inst col = batch[i];
const bst_uint fid = batch.col_index[i];
const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
#pragma omp parallel for schedule(static)
for (bst_omp_uint j = 0; j < ndata; ++j) {
const bst_uint ridx = col[j].index;
const float fvalue = col[j].fvalue;
int nid = position[ridx];
if (nid == -1) continue;
// go back to parent, correct those who are not default
nid = tree[nid].parent();
if (tree[nid].split_index() == fid) {
if (fvalue < tree[nid].split_cond()) {
position[ridx] = tree[nid].cleft();
} else {
position[ridx] = tree[nid].cright();
}
}
}
}
@@ -369,7 +383,7 @@ class ColMaker: public IUpdater<FMatrix> {
// number of omp threads used during training
int nthread;
// Per feature: shuffle index of each feature index
std::vector<unsigned> feat_index;
std::vector<bst_uint> feat_index;
// Instance Data: current node position in the tree of each instance
std::vector<int> position;
// PerThread x PerTreeNode: statistics for per thread construction

View File

@@ -12,8 +12,7 @@
namespace xgboost {
namespace tree {
/*! \brief pruner that prunes a tree after growing finishes */
template<typename FMatrix>
class TreePruner: public IUpdater<FMatrix> {
class TreePruner: public IUpdater {
public:
virtual ~TreePruner(void) {}
// set training parameter
@@ -23,7 +22,7 @@ class TreePruner: public IUpdater<FMatrix> {
}
// update the tree, do pruning
virtual void Update(const std::vector<bst_gpair> &gpair,
const FMatrix &fmat,
IFMatrix *p_fmat,
const BoosterInfo &info,
const std::vector<RegTree*> &trees) {
// rescale learning rate according to size of trees
@@ -75,7 +74,6 @@ class TreePruner: public IUpdater<FMatrix> {
// training parameter
TrainParam param;
};
} // namespace tree
} // namespace xgboost
#endif // XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_

View File

@@ -9,12 +9,13 @@
#include <limits>
#include "./param.h"
#include "./updater.h"
#include "../utils/omp.h"
namespace xgboost {
namespace tree {
/*! \brief pruner that prunes a tree after growing finishes */
template<typename FMatrix, typename TStats>
class TreeRefresher: public IUpdater<FMatrix> {
template<typename TStats>
class TreeRefresher: public IUpdater {
public:
virtual ~TreeRefresher(void) {}
// set training parameter
@@ -23,7 +24,7 @@ class TreeRefresher: public IUpdater<FMatrix> {
}
// update the tree, do pruning
virtual void Update(const std::vector<bst_gpair> &gpair,
const FMatrix &fmat,
IFMatrix *p_fmat,
const BoosterInfo &info,
const std::vector<RegTree*> &trees) {
if (trees.size() == 0) return;
@@ -50,7 +51,7 @@ class TreeRefresher: public IUpdater<FMatrix> {
fvec_temp[tid].Init(trees[0]->param.num_feature);
}
// start accumulating statistics
utils::IIterator<RowBatch> *iter = fmat.RowIterator();
utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
iter->BeforeFirst();
while (iter->Next()) {
const RowBatch &batch = iter->Value();