change dense fvec logic to tree

This commit is contained in:
tqchen 2014-08-18 19:03:32 -07:00
parent 1d8c2391e8
commit dbf3a21942
2 changed files with 57 additions and 35 deletions

View File

@ -110,7 +110,11 @@ class GBTree : public IGradBooster<FMatrix> {
{
nthread = omp_get_num_threads();
}
this->InitThreadTemp(nthread);
thread_temp.resize(nthread, tree::RegTree::FVec());
for (int i = 0; i < nthread; ++i) {
thread_temp[i].Init(mparam.num_feature);
}
std::vector<float> &preds = *out_preds;
preds.resize(0);
// start collecting the prediction
@ -128,7 +132,7 @@ class GBTree : public IGradBooster<FMatrix> {
#pragma omp parallel for schedule(static)
for (unsigned i = 0; i < nsize; ++i) {
const int tid = omp_get_thread_num();
std::vector<float> &feats = thread_temp[tid];
tree::RegTree::FVec &feats = thread_temp[tid];
const size_t ridx = batch.base_rowid + i;
const unsigned root_idx = root_index.size() == 0 ? 0 : root_index[ridx];
// loop over output groups
@ -210,7 +214,7 @@ class GBTree : public IGradBooster<FMatrix> {
int64_t buffer_index,
int bst_group,
unsigned root_index,
std::vector<float> *p_feats) {
tree::RegTree::FVec *p_feats) {
size_t itop = 0;
float psum = 0.0f;
const int bid = mparam.BufferOffset(buffer_index, bst_group);
@ -220,13 +224,13 @@ class GBTree : public IGradBooster<FMatrix> {
psum = pred_buffer[bid];
}
if (itop != trees.size()) {
FillThreadTemp(inst, p_feats);
p_feats->Fill(inst);
for (size_t i = itop; i < trees.size(); ++i) {
if (tree_info[i] == bst_group) {
psum += trees[i]->Predict(*p_feats, root_index);
}
}
DropThreadTemp(inst, p_feats);
p_feats->Drop(inst);
}
// updated the buffered results
if (bid >= 0) {
@ -235,30 +239,6 @@ class GBTree : public IGradBooster<FMatrix> {
}
return psum;
}
// initialize thread local space for prediction
// each worker thread gets its own dense feature buffer, sized to the
// model's feature count and filled entirely with NAN — the marker this
// class uses for "feature missing" (see FillThreadTemp/DropThreadTemp)
inline void InitThreadTemp(int nthread) {
thread_temp.resize(nthread);
for (size_t tid = 0; tid < thread_temp.size(); ++tid) {
std::vector<float> &buf = thread_temp[tid];
// assign == resize + fill: every slot ends up NAN at size num_feature
buf.assign(mparam.num_feature, NAN);
}
}
// fill in a thread local dense vector using a sparse instance
// scatters each (findex, fvalue) pair of inst into the dense vector;
// entries whose index does not appear in inst are left untouched, so the
// vector must have been prepared by InitThreadTemp (all slots NAN) and
// cleaned by DropThreadTemp after the previous instance
// \param inst sparse feature vector of one data row
// \param p_feats dense output buffer, pre-sized to num_feature
inline static void FillThreadTemp(const SparseBatch::Inst &inst,
std::vector<float> *p_feats) {
std::vector<float> &feats = *p_feats;
for (bst_uint i = 0; i < inst.length; ++i) {
feats[inst[i].findex] = inst[i].fvalue;
}
}
// clear up a thread local dense vector
// resets exactly the entries that FillThreadTemp wrote (same inst) back to
// NAN, the missing-value marker — an O(inst.length) cleanup instead of an
// O(num_feature) refill, so the buffer can be reused for the next row
// \param inst the same sparse instance previously passed to FillThreadTemp
// \param p_feats the dense buffer to restore to its all-missing state
inline static void DropThreadTemp(const SparseBatch::Inst &inst,
std::vector<float> *p_feats) {
std::vector<float> &feats = *p_feats;
for (bst_uint i = 0; i < inst.length; ++i) {
feats[inst[i].findex] = NAN;
}
}
// --- data structure ---
/*! \brief training parameters */
struct TrainParam {
@ -361,7 +341,7 @@ class GBTree : public IGradBooster<FMatrix> {
// configurations for tree
std::vector< std::pair<std::string, std::string> > cfg;
// temporal storage for per thread
std::vector< std::vector<float> > thread_temp;
std::vector<tree::RegTree::FVec> thread_temp;
// the updaters that can be applied to each of tree
std::vector< tree::IUpdater<FMatrix>* > updaters;
};

View File

@ -422,7 +422,7 @@ class TreeModel {
};
/*! \brief node statistics used in regression tree */
struct RTreeNodeStat{
struct RTreeNodeStat {
/*! \brief loss chg caused by current split */
float loss_chg;
/*! \brief sum of hessian values, used to measure coverage of data */
@ -444,20 +444,61 @@ struct RTreeNodeStat{
/*! \brief define regression tree to be the most common tree model */
class RegTree: public TreeModel<bst_float, RTreeNodeStat>{
public:
/*!
* \brief dense feature vector that can be taken by RegTree
* to do traversal efficiently,
* and can be constructed from a sparse feature vector
*/
struct FVec {
/*!
* \brief a union of value and flag;
* when flag == -1 the entry is missing, otherwise fvalue holds the value
*/
union Entry{
float fvalue;
int flag;
};
// dense storage, one Entry per feature index
std::vector<Entry> data;
/*! \brief initialize the vector to the given size, every entry missing */
inline void Init(size_t size) {
Entry e; e.flag = -1;
data.resize(size);
std::fill(data.begin(), data.end(), e);
}
/*! \brief fill the vector with the entries present in a sparse vector;
* indices not in inst keep their previous (missing) state */
inline void Fill(const SparseBatch::Inst &inst) {
for (bst_uint i = 0; i < inst.length; ++i) {
data[inst[i].findex].fvalue = inst[i].fvalue;
}
}
/*! \brief drop the trace after fill — must be called after Fill with the
* same inst, restoring the touched entries to missing for reuse */
inline void Drop(const SparseBatch::Inst &inst) {
for (bst_uint i = 0; i < inst.length; ++i) {
data[inst[i].findex].flag = -1;
}
}
/*! \brief get i-th value; meaningful only when !is_missing(i) */
inline float fvalue(size_t i) const {
return data[i].fvalue;
}
/*! \brief check whether i-th entry is missing */
inline bool is_missing(size_t i) const {
return data[i].flag == -1;
}
};
/*!
* \brief get the leaf index
* \param feat dense feature vector; missing features are flagged as missing inside the vector
* \param root_id starting root index of the instance
* \return the leaf index of the given feature
*/
inline int GetLeafIndex(const std::vector<float> &feat, unsigned root_id = 0) const {
inline int GetLeafIndex(const FVec&feat, unsigned root_id = 0) const {
// start from groups that belongs to current data
int pid = static_cast<int>(root_id);
// traverse the tree
while (!(*this)[ pid ].is_leaf()) {
unsigned split_index = (*this)[pid].split_index();
const float fvalue = feat[split_index];
pid = this->GetNext(pid, fvalue, std::isnan(fvalue));
pid = this->GetNext(pid, feat.fvalue(split_index), feat.is_missing(split_index));
}
return pid;
}
@ -467,10 +508,11 @@ class RegTree: public TreeModel<bst_float, RTreeNodeStat>{
* \param root_gid starting root index of the instance
* \return the leaf index of the given feature
*/
inline float Predict(const std::vector<float> &feat, unsigned root_id = 0) const {
inline float Predict(const FVec &feat, unsigned root_id = 0) const {
int pid = this->GetLeafIndex(feat, root_id);
return (*this)[pid].leaf_value();
}
private:
/*! \brief get next position of the tree given current pid */
inline int GetNext(int pid, float fvalue, bool is_unknown) const {