From c640485f1d0d90ca8742eb1d641f7ba1ed5d21d7 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Sun, 24 Aug 2014 18:48:19 -0700
Subject: [PATCH] initial correction for vec tree

---
 src/gbm/gbtree-inl.hpp         | 58 ++++++++++++++++++++++------------
 src/learner/evaluation-inl.hpp |  5 +--
 2 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp
index 3d5ca4492..dfc317d8d 100644
--- a/src/gbm/gbtree-inl.hpp
+++ b/src/gbm/gbtree-inl.hpp
@@ -117,17 +117,13 @@ class GBTree : public IGradBooster {
     }
     std::vector<float> &preds = *out_preds;
-    preds.resize(0);
+    const size_t stride = info.num_row * mparam.num_output_group;
+    preds.resize(stride * (mparam.size_leaf_vector+1));
     // start collecting the prediction
     utils::IIterator<SparseBatch> *iter = fmat.RowIterator();
     iter->BeforeFirst();
     while (iter->Next()) {
       const SparseBatch &batch = iter->Value();
-      utils::Assert(batch.base_rowid * mparam.num_output_group == preds.size(),
-                    "base_rowid is not set correctly");
-      // output convention: nrow * k, where nrow is number of rows
-      // k is number of group
-      preds.resize(preds.size() + batch.size * mparam.num_output_group);
       // parallel over local batch
       const unsigned nsize = static_cast<unsigned>(batch.size);
       #pragma omp parallel for schedule(static)
@@ -135,13 +131,13 @@ class GBTree : public IGradBooster {
         const int tid = omp_get_thread_num();
         tree::RegTree::FVec &feats = thread_temp[tid];
         const size_t ridx = batch.base_rowid + i;
-        const unsigned root_idx = info.GetRoot(ridx);
+        utils::Assert(ridx < info.num_row, "data row index exceed bound");
         // loop over output groups
         for (int gid = 0; gid < mparam.num_output_group; ++gid) {
-          preds[ridx * mparam.num_output_group + gid] =
-              this->Pred(batch[i],
-                         buffer_offset < 0 ? -1 : buffer_offset+ridx,
-                         gid, root_idx, &feats);
+          this->Pred(batch[i],
+                     buffer_offset < 0 ? -1 : buffer_offset + ridx,
+                     gid, info.GetRoot(ridx), &feats,
+                     &preds[ridx * mparam.num_output_group + gid], stride);
         }
       }
     }
@@ -211,24 +207,34 @@ class GBTree : public IGradBooster {
     mparam.num_trees += tparam.num_parallel_tree;
   }
   // make a prediction for a single instance
-  inline float Pred(const SparseBatch::Inst &inst,
-                    int64_t buffer_index,
-                    int bst_group,
-                    unsigned root_index,
-                    tree::RegTree::FVec *p_feats) {
+  inline void Pred(const SparseBatch::Inst &inst,
+                   int64_t buffer_index,
+                   int bst_group,
+                   unsigned root_index,
+                   tree::RegTree::FVec *p_feats,
+                   float *out_pred, size_t stride) {
     size_t itop = 0;
     float psum = 0.0f;
+    // sum of leaf vector
+    std::vector<float> vec_psum(mparam.size_leaf_vector, 0.0f);
     const int bid = mparam.BufferOffset(buffer_index, bst_group);
     // load buffered results if any
     if (bid >= 0) {
       itop = pred_counter[bid];
       psum = pred_buffer[bid];
+      for (int i = 0; i < mparam.size_leaf_vector; ++i) {
+        vec_psum[i] = pred_buffer[bid + i + 1];
+      }
     }
     if (itop != trees.size()) {
       p_feats->Fill(inst);
       for (size_t i = itop; i < trees.size(); ++i) {
         if (tree_info[i] == bst_group) {
-          psum += trees[i]->Predict(*p_feats, root_index);
+          int tid = trees[i]->GetLeafIndex(*p_feats, root_index);
+          psum += (*trees[i])[tid].leaf_value();
+          for (int j = 0; j < mparam.size_leaf_vector; ++j) {
+            vec_psum[j] += trees[i]->leafvec(tid)[j];
+          }
         }
       }
       p_feats->Drop(inst);
@@ -237,8 +243,14 @@ class GBTree : public IGradBooster {
     if (bid >= 0) {
       pred_counter[bid] = static_cast<unsigned>(trees.size());
       pred_buffer[bid] = psum;
+      for (int i = 0; i < mparam.size_leaf_vector; ++i) {
+        pred_buffer[bid + i + 1] = vec_psum[i];
+      }
+    }
+    out_pred[0] = psum;
+    for (int i = 0; i < mparam.size_leaf_vector; ++i) {
+      out_pred[stride * (i + 1)] = vec_psum[i];
     }
-    return psum;
   }
   // --- data structure ---
   /*! \brief training parameters */
@@ -291,14 +303,17 @@ class GBTree : public IGradBooster {
     *  suppose we have n instance and k group, output will be k*n
     */
    int num_output_group;
+   /*! \brief size of leaf vector needed in tree */
+   int size_leaf_vector;
    /*! \brief reserved parameters */
-   int reserved[32];
+   int reserved[31];
    /*! \brief constructor */
    ModelParam(void) {
      num_trees = 0;
      num_roots = num_feature = 0;
      num_pbuffer = 0;
      num_output_group = 1;
+     size_leaf_vector = 0;
      memset(reserved, 0, sizeof(reserved));
    }
    /*!
@@ -311,10 +326,11 @@ class GBTree : public IGradBooster {
      if (!strcmp("num_output_group", name)) num_output_group = atol(val);
      if (!strcmp("bst:num_roots", name)) num_roots = atoi(val);
      if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
+     if (!strcmp("bst:size_leaf_vector", name)) size_leaf_vector = atoi(val);
    }
    /*! \return size of prediction buffer actually needed */
    inline size_t PredBufferSize(void) const {
-     return num_output_group * num_pbuffer;
+     return num_output_group * num_pbuffer * (size_leaf_vector + 1);
    }
    /*!
    * \brief get the buffer offset given a buffer index and group id
@@ -323,7 +339,7 @@ class GBTree : public IGradBooster {
    inline size_t BufferOffset(int64_t buffer_index, int bst_group) const {
      if (buffer_index < 0) return -1;
      utils::Check(buffer_index < num_pbuffer, "buffer_index exceed num_pbuffer");
-     return buffer_index + num_pbuffer * bst_group;
+     return (buffer_index + num_pbuffer * bst_group) * (size_leaf_vector + 1);
    }
  };
  // training parameter
diff --git a/src/learner/evaluation-inl.hpp b/src/learner/evaluation-inl.hpp
index 72085be46..4df9cec4c 100644
--- a/src/learner/evaluation-inl.hpp
+++ b/src/learner/evaluation-inl.hpp
@@ -24,9 +24,10 @@ template<typename Derived>
 struct EvalEWiseBase : public IEvaluator {
   virtual float Eval(const std::vector<float> &preds,
                      const MetaInfo &info) const {
-    utils::Check(preds.size() == info.labels.size(),
+    utils::Check(info.labels.size() != 0, "label set cannot be empty");
+    utils::Check(preds.size() % info.labels.size() == 0,
                  "label and prediction size not match");
-    const unsigned ndata = static_cast<unsigned>(preds.size());
+    const unsigned ndata = static_cast<unsigned>(info.labels.size());
     float sum = 0.0, wsum = 0.0;
     #pragma omp parallel for reduction(+: sum, wsum) schedule(static)
     for (unsigned i = 0; i < ndata; ++i) {
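
Notes on the output convention this patch introduces: Predict() now sizes
out_preds once up front, to stride * (size_leaf_vector + 1) entries with
stride = num_row * num_output_group. The scalar prediction for (row ridx,
group gid) stays at ridx * num_output_group + gid, and the k-th leaf-vector
component for the same slot lands (k + 1) stride-blocks further in. A minimal
standalone sketch of that indexing; the helpers ScalarIndex/LeafVecIndex are
illustrative names, not from the patch:

#include <cstddef>
#include <cstdio>
#include <vector>

// Scalar prediction slot for (row, group); same convention as before.
inline size_t ScalarIndex(size_t ridx, int gid, int num_output_group) {
  return ridx * num_output_group + gid;
}
// k-th leaf-vector component: (k + 1) stride-blocks past the scalar block.
inline size_t LeafVecIndex(size_t ridx, int gid, int num_output_group,
                           size_t stride, int k) {
  return ScalarIndex(ridx, gid, num_output_group) + stride * (k + 1);
}

int main(void) {
  const size_t num_row = 4;
  const int num_output_group = 2;
  const int size_leaf_vector = 3;
  const size_t stride = num_row * num_output_group;
  // Mirrors the resize in the patched Predict().
  std::vector<float> preds(stride * (size_leaf_vector + 1), 0.0f);
  // Row 2, group 1: scalar at index 5; vector parts at 13, 21, 29.
  std::printf("%zu entries: %zu %zu %zu %zu\n", preds.size(),
              ScalarIndex(2, 1, num_output_group),
              LeafVecIndex(2, 1, num_output_group, stride, 0),
              LeafVecIndex(2, 1, num_output_group, stride, 1),
              LeafVecIndex(2, 1, num_output_group, stride, 2));
  return 0;
}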
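The prediction buffer grows the same way: each (buffer_index, bst_group) slot
now spans size_leaf_vector + 1 consecutive floats, psum first and the cached
leaf-vector sums after it. That is why PredBufferSize() and BufferOffset()
both scale by (size_leaf_vector + 1), and why Pred() reads and writes
pred_buffer[bid + i + 1]. A compact restatement of that arithmetic; the
struct name BufferLayout is illustrative:

#include <cstdint>
#include <cstdio>

// Restates the ModelParam buffer arithmetic from the patch.
struct BufferLayout {
  int64_t num_pbuffer;   // number of bufferable rows
  int num_output_group;  // number of output groups
  int size_leaf_vector;  // extra cached floats per slot

  // One slot of (size_leaf_vector + 1) floats per (row, group) pair.
  int64_t PredBufferSize(void) const {
    return num_output_group * num_pbuffer * (size_leaf_vector + 1);
  }
  // Slot start; layout is [psum, vec[0], ..., vec[size_leaf_vector - 1]].
  int64_t BufferOffset(int64_t buffer_index, int bst_group) const {
    if (buffer_index < 0) return -1;
    return (buffer_index + num_pbuffer * bst_group) * (size_leaf_vector + 1);
  }
};

int main(void) {
  const BufferLayout layout = {1000, 2, 3};
  // Row 10, group 1: slot starts at (10 + 1000 * 1) * 4 = 4040, holding
  // psum at 4040 and the three leaf-vector sums at 4041..4043.
  std::printf("%lld %lld\n",
              (long long)layout.PredBufferSize(),   // 8000
              (long long)layout.BufferOffset(10, 1)); // 4040
  return 0;
}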
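The evaluation-inl.hpp hunk follows from the layout change: labels has one
entry per row, while preds can now carry extra stride-blocks on top, so the
old exact-size check would reject valid input. The metric therefore only
requires preds.size() to be a multiple of labels.size() and scores the first
labels.size() entries. A toy check of that invariant; plain squared error
stands in for the Derived::EvalRow of the real evaluator:

#include <cassert>
#include <cstdio>
#include <vector>

int main(void) {
  const std::vector<float> labels(100, 1.0f);
  // With size_leaf_vector = 2, preds is three stride-blocks long.
  const std::vector<float> preds(100 * 3, 0.5f);
  assert(!labels.empty());                   // new: label set cannot be empty
  assert(preds.size() % labels.size() == 0); // relaxed from == to divisibility
  const unsigned ndata = static_cast<unsigned>(labels.size());
  float sum = 0.0f;
  for (unsigned i = 0; i < ndata; ++i) {     // only the scalar block is scored
    const float d = preds[i] - labels[i];
    sum += d * d;                            // squared error as a stand-in
  }
  std::printf("mse: %g\n", sum / ndata);     // 0.25 for this toy data
  return 0;
}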