diff --git a/python/xgboost_wrapper.cpp b/python/xgboost_wrapper.cpp index 7f2365ba3..df05d9521 100644 --- a/python/xgboost_wrapper.cpp +++ b/python/xgboost_wrapper.cpp @@ -37,7 +37,7 @@ class Booster: public learner::BoostLearner { for (unsigned j = 0; j < ndata; ++j) { gpair_[j] = bst_gpair(grad[j], hess[j]); } - gbm_->DoBoost(gpair_, train.fmat, train.info.root_index); + gbm_->DoBoost(gpair_, train.fmat, train.info.info); } inline void CheckInitModel(void) { if (!init_model) { @@ -151,8 +151,8 @@ extern "C"{ if (src.info.weights.size() != 0) { ret.info.weights.push_back(src.info.weights[ridx]); } - if (src.info.root_index.size() != 0) { - ret.info.weights.push_back(src.info.root_index[ridx]); + if (src.info.info.root_index.size() != 0) { + ret.info.info.root_index.push_back(src.info.info.root_index[ridx]); } } return p_ret; diff --git a/src/data.h b/src/data.h index fea3f7a48..61d61e6a0 100644 --- a/src/data.h +++ b/src/data.h @@ -39,6 +39,24 @@ struct bst_gpair { bst_gpair(bst_float grad, bst_float hess) : grad(grad), hess(hess) {} }; +/*! + * \brief extra information that might needed by gbm and tree module + * these information are not necessarily presented, and can be empty + */ +struct BoosterInfo { + /*! + * \brief specified root index of each instance, + * can be used for multi task setting + */ + std::vector<unsigned> root_index; + /*! \brief set fold indicator */ + std::vector<unsigned> fold_index; + /*! \brief get root of ith instance */ + inline unsigned GetRoot(size_t i) const { + return root_index.size() == 0 ? 0 : root_index[i]; + } +}; + /*! \brief read-only sparse instance batch in CSR format */ struct SparseBatch { /*!
\brief an entry of sparse vector */ diff --git a/src/gbm/gbm.h b/src/gbm/gbm.h index dcc204868..f47adfdd2 100644 --- a/src/gbm/gbm.h +++ b/src/gbm/gbm.h @@ -43,12 +43,11 @@ class IGradBooster { * \brief peform update to the model(boosting) * \param gpair the gradient pair statistics of the data * \param fmat feature matrix that provide access to features - * \param root_index pre-partitioned root_index of each instance, - * root_index.size() can be 0 which indicates that no pre-partition involved + * \param info meta information about training */ virtual void DoBoost(const std::vector<bst_gpair> &gpair, const FMatrix &fmat, - const std::vector<unsigned> &root_index) = 0; + const BoosterInfo &info) = 0; /*! * \brief generate predictions for given feature matrix * \param fmat feature matrix @@ -56,13 +55,12 @@ class IGradBooster { * this means we do not have buffer index allocated to the gbm * a buffer index is assigned to each instance that requires repeative prediction * the size of buffer is set by convention using IGradBooster.SetParam("num_pbuffer","size") - * \param root_index pre-partitioned root_index of each instance, - * root_index.size() can be 0 which indicates that no pre-partition involved + * \param info extra side information that may be needed for prediction * \param out_preds output vector to hold the predictions */ virtual void Predict(const FMatrix &fmat, int64_t buffer_offset, - const std::vector<unsigned> &root_index, + const BoosterInfo &info, std::vector<float> *out_preds) = 0; /*!
* \brief dump the model in text format diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp index 876e13c2d..3fa0f4dd7 100644 --- a/src/gbm/gbtree-inl.hpp +++ b/src/gbm/gbtree-inl.hpp @@ -84,9 +84,9 @@ class GBTree : public IGradBooster { } virtual void DoBoost(const std::vector<bst_gpair> &gpair, const FMatrix &fmat, - const std::vector<unsigned> &root_index) { + const BoosterInfo &info) { if (mparam.num_output_group == 1) { - this->BoostNewTrees(gpair, fmat, root_index, 0); + this->BoostNewTrees(gpair, fmat, info, 0); } else { const int ngroup = mparam.num_output_group; utils::Check(gpair.size() % ngroup == 0, @@ -97,13 +97,13 @@ class GBTree : public IGradBooster { for (size_t i = 0; i < tmp.size(); ++i) { tmp[i] = gpair[i * ngroup + gid]; } - this->BoostNewTrees(tmp, fmat, root_index, gid); + this->BoostNewTrees(tmp, fmat, info, gid); } } } virtual void Predict(const FMatrix &fmat, int64_t buffer_offset, - const std::vector<unsigned> &root_index, + const BoosterInfo &info, std::vector<float> *out_preds) { int nthread; #pragma omp parallel @@ -134,7 +134,7 @@ class GBTree : public IGradBooster { const int tid = omp_get_thread_num(); tree::RegTree::FVec &feats = thread_temp[tid]; const size_t ridx = batch.base_rowid + i; - const unsigned root_idx = root_index.size() == 0 ?
0 : root_index[ridx]; + const unsigned root_idx = info.GetRoot(ridx); // loop over output groups for (int gid = 0; gid < mparam.num_output_group; ++gid) { preds[ridx * mparam.num_output_group + gid] = @@ -186,7 +186,7 @@ class GBTree : public IGradBooster { // do group specific group inline void BoostNewTrees(const std::vector<bst_gpair> &gpair, const FMatrix &fmat, - const std::vector<unsigned> &root_index, + const BoosterInfo &info, int bst_group) { this->InitUpdater(); // create the trees @@ -200,7 +200,7 @@ class GBTree : public IGradBooster { } // update the trees for (size_t i = 0; i < updaters.size(); ++i) { - updaters[i]->Update(gpair, fmat, root_index, new_trees); + updaters[i]->Update(gpair, fmat, info, new_trees); } // push back to model for (size_t i = 0; i < new_trees.size(); ++i) { diff --git a/src/learner/dmatrix.h b/src/learner/dmatrix.h index 5786fe6a1..b66cf86d0 100644 --- a/src/learner/dmatrix.h +++ b/src/learner/dmatrix.h @@ -28,11 +28,8 @@ struct MetaInfo { std::vector<bst_uint> group_ptr; /*! \brief weights of each instance, optional */ std::vector<float> weights; - /*! - * \brief specified root index of each instance, - * can be used for multi task setting - */ - std::vector<unsigned> root_index; + /*! \brief information needed by booster */ + BoosterInfo info; /*! * \brief initialized margins, * if specified, xgboost will start from this init margin @@ -48,7 +45,7 @@ labels.clear(); group_ptr.clear(); weights.clear(); - root_index.clear(); + info.root_index.clear(); base_margin.clear(); num_row = num_col = 0; } @@ -60,14 +57,6 @@ return 1.0f; } } - /*!
\brief get root index of i-th instance */ - inline float GetRoot(size_t i) const { - if (root_index.size() != 0) { - return static_cast<float>(root_index[i]); - } else { - return 0; - } - } inline void SaveBinary(utils::IStream &fo) const { int version = kVersion; fo.Write(&version, sizeof(version)); @@ -76,7 +65,7 @@ fo.Write(labels); fo.Write(group_ptr); fo.Write(weights); - fo.Write(root_index); + fo.Write(info.root_index); fo.Write(base_margin); } inline void LoadBinary(utils::IStream &fi) { @@ -87,7 +76,7 @@ utils::Check(fi.Read(&labels), "MetaInfo: invalid format"); utils::Check(fi.Read(&group_ptr), "MetaInfo: invalid format"); utils::Check(fi.Read(&weights), "MetaInfo: invalid format"); - utils::Check(fi.Read(&root_index), "MetaInfo: invalid format"); + utils::Check(fi.Read(&info.root_index), "MetaInfo: invalid format"); utils::Check(fi.Read(&base_margin), "MetaInfo: invalid format"); } // try to load group information from file, if exists diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp index bd5cf6e3b..18611bea5 100644 --- a/src/learner/learner-inl.hpp +++ b/src/learner/learner-inl.hpp @@ -161,7 +161,7 @@ class BoostLearner { inline void UpdateOneIter(int iter, const DMatrix &train) { this->PredictRaw(train, &preds_); obj_->GetGradient(preds_, train.info, iter, &gpair_); - gbm_->DoBoost(gpair_, train.fmat, train.info.root_index); + gbm_->DoBoost(gpair_, train.fmat, train.info.info); } /*!
* \brief evaluate the model for specific iteration @@ -242,7 +242,7 @@ class BoostLearner { inline void PredictRaw(const DMatrix &data, std::vector<float> *out_preds) const { gbm_->Predict(data.fmat, this->FindBufferOffset(data), - data.info.root_index, out_preds); + data.info.info, out_preds); // add base margin std::vector<float> &preds = *out_preds; const unsigned ndata = static_cast<unsigned>(preds.size()); diff --git a/src/tree/updater.h b/src/tree/updater.h index 2664e2ebd..cdb625266 100644 --- a/src/tree/updater.h +++ b/src/tree/updater.h @@ -29,8 +29,7 @@ class IUpdater { * \brief peform update to the tree models * \param gpair the gradient pair statistics of the data * \param fmat feature matrix that provide access to features - * \param root_index pre-partitioned root_index of each instance, - * root_index.size() can be 0 which indicates that no pre-partition involved + * \param info extra side information that may be need, such as root index * \param trees pointer to the trese to be updated, upater will change the content of the tree * note: all the trees in the vector are updated, with the same statistics, * but maybe different random seeds, usually one tree is passed in at a time, */ virtual void Update(const std::vector<bst_gpair> &gpair, const FMatrix &fmat, - const std::vector<unsigned> &root_index, + const BoosterInfo &info, const std::vector<RegTree*> &trees) = 0; // destructor virtual ~IUpdater(void) {} diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp index 35880b70e..919dfcc28 100644 --- a/src/tree/updater_colmaker-inl.hpp +++ b/src/tree/updater_colmaker-inl.hpp @@ -25,7 +25,7 @@ class ColMaker: public IUpdater { } virtual void Update(const std::vector<bst_gpair> &gpair, const FMatrix &fmat, - const std::vector<unsigned> &root_index, + const BoosterInfo &info, const std::vector<RegTree*> &trees) { // rescale learning rate according to size of trees float lr = param.learning_rate; @@ -33,7 +33,7 @@ // build tree for (size_t i =
0; i < trees.size(); ++i) { Builder builder(param); - builder.Update(gpair, fmat, root_index, trees[i]); + builder.Update(gpair, fmat, info, trees[i]); } param.learning_rate = lr; } @@ -77,9 +77,9 @@ class ColMaker: public IUpdater { // update one tree, growing virtual void Update(const std::vector<bst_gpair> &gpair, const FMatrix &fmat, - const std::vector<unsigned> &root_index, + const BoosterInfo &info, RegTree *p_tree) { - this->InitData(gpair, fmat, root_index, *p_tree); + this->InitData(gpair, fmat, info.root_index, *p_tree); this->InitNewNode(qexpand, gpair, fmat, *p_tree); for (int depth = 0; depth < param.max_depth; ++depth) { diff --git a/src/tree/updater_prune-inl.hpp b/src/tree/updater_prune-inl.hpp index 363d6eec1..d92685ad7 100644 --- a/src/tree/updater_prune-inl.hpp +++ b/src/tree/updater_prune-inl.hpp @@ -24,7 +24,7 @@ class TreePruner: public IUpdater { // update the tree, do pruning virtual void Update(const std::vector<bst_gpair> &gpair, const FMatrix &fmat, - const std::vector<unsigned> &root_index, + const BoosterInfo &info, const std::vector<RegTree*> &trees) { // rescale learning rate according to size of trees float lr = param.learning_rate; diff --git a/src/tree/updater_refresh-inl.hpp b/src/tree/updater_refresh-inl.hpp index 12bbcf864..e23174e51 100644 --- a/src/tree/updater_refresh-inl.hpp +++ b/src/tree/updater_refresh-inl.hpp @@ -24,7 +24,7 @@ class TreeRefresher: public IUpdater { // update the tree, do pruning virtual void Update(const std::vector<bst_gpair> &gpair, const FMatrix &fmat, - const std::vector<unsigned> &root_index, + const BoosterInfo &info, const std::vector<RegTree*> &trees) { if (trees.size() == 0) return; // number of threads @@ -66,7 +66,7 @@ feats.Fill(inst); for (size_t j = 0; j < trees.size(); ++j) { AddStats(*trees[j], feats, gpair[ridx], - root_index.size() == 0 ? 0 : root_index[ridx], + info.GetRoot(ridx), &stemp[tid * trees.size() + j]); } feats.Drop(inst);