From b762231b0280feecedff132ff9f96c2ae4f340ed Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 15 Jan 2015 21:32:31 -0800 Subject: [PATCH] change makefile to lazy checkpt, fix col splt code --- Makefile | 8 ++-- .../col-split/mushroom-col-rabit-mock.sh | 2 +- multi-node/col-split/mushroom-col-rabit.sh | 8 ++-- .../row-split/machine-row-rabit-mock.sh | 2 +- src/gbm/gbm.h | 9 +++- src/gbm/gbtree-inl.hpp | 48 +++++++++++++------ src/learner/learner-inl.hpp | 6 +++ src/xgboost_main.cpp | 34 +++++++++---- 8 files changed, 84 insertions(+), 33 deletions(-) diff --git a/Makefile b/Makefile index 7c89d24da..9716f8149 100644 --- a/Makefile +++ b/Makefile @@ -18,9 +18,9 @@ endif # specify tensor path BIN = xgboost -MOCKBIN = xgboost-mock +MOCKBIN = xgboost.mock OBJ = updater.o gbm.o io.o main.o -MPIBIN = xgboost-mpi +MPIBIN = xgboost.mpi SLIB = wrapper/libxgboostwrapper.so .PHONY: clean all mpi python Rpack librabit librabit_mpi @@ -42,8 +42,8 @@ updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h src/utils/* gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h -xgboost-mpi: updater.o gbm.o io.o main.o librabit_mpi -xgboost-mock: updater.o gbm.o io.o main.o librabit +xgboost.mpi: updater.o gbm.o io.o main.o librabit_mpi +xgboost.mock: updater.o gbm.o io.o main.o librabit xgboost: updater.o gbm.o io.o main.o librabit wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h updater.o gbm.o io.o librabit diff --git a/multi-node/col-split/mushroom-col-rabit-mock.sh b/multi-node/col-split/mushroom-col-rabit-mock.sh index 269967419..65e62309a 100755 --- a/multi-node/col-split/mushroom-col-rabit-mock.sh +++ b/multi-node/col-split/mushroom-col-rabit-mock.sh @@ -16,7 +16,7 @@ k=$1 python splitsvm.py ../../demo/data/agaricus.txt.train train $k # run xgboost mpi -../../rabit/tracker/rabit_mpi.py $k local ../../rabit/test/keepalive.sh ../../xgboost-mock mushroom-col.conf dsplit=col mock=0,1,0,0 mock=1,1,0,0 +../../rabit/tracker/rabit_demo.py -n $k ../../xgboost.mock mushroom-col.conf dsplit=col mock=0,2,0,0 mock=1,2,0,0 mock=2,2,8,0 mock=2,3,0,0 # the model can be directly loaded by single machine xgboost solver, as usuall #../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt diff --git a/multi-node/col-split/mushroom-col-rabit.sh b/multi-node/col-split/mushroom-col-rabit.sh index b9595e5b7..f958305aa 100755 --- a/multi-node/col-split/mushroom-col-rabit.sh +++ b/multi-node/col-split/mushroom-col-rabit.sh @@ -16,13 +16,13 @@ k=$1 python splitsvm.py ../../demo/data/agaricus.txt.train train $k # run xgboost mpi -../../rabit/tracker/rabit_mpi.py $k local ../../xgboost mushroom-col.conf dsplit=col +../../rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf dsplit=col # the model can be directly loaded by single machine xgboost solver, as usuall -../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt +../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt # run for one round, and continue training -../../rabit/tracker/rabit_mpi.py $k local ../../xgboost mushroom-col.conf dsplit=col num_round=1 -../../rabit/tracker/rabit_mpi.py $k local ../../xgboost mushroom-col.conf mushroom-col.conf dsplit=col model_in=0001.model +../../rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf dsplit=col num_round=1 +../../rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf mushroom-col.conf dsplit=col model_in=0001.model cat dump.nice.$k.txt diff --git a/multi-node/row-split/machine-row-rabit-mock.sh b/multi-node/row-split/machine-row-rabit-mock.sh index b08e7d4e6..b8ef10b2d 100755 --- a/multi-node/row-split/machine-row-rabit-mock.sh +++ b/multi-node/row-split/machine-row-rabit-mock.sh @@ -17,4 +17,4 @@ cd - python splitrows.py ../../demo/regression/machine.txt.train train-machine $k # run xgboost mpi -../../rabit/tracker/rabit_mpi.py -n $k ../../rabit/test/keepalive.sh ../../xgboost-mock machine-row.conf dsplit=row num_round=3 mock=1,1,1,0 +../../rabit/tracker/rabit_demo.py -n $k ../../xgboost.mock machine-row.conf dsplit=row num_round=3 mock=1,1,1,0 mock=0,0,3,0 mock=2,2,3,0 diff --git a/src/gbm/gbm.h b/src/gbm/gbm.h index 8799a7af0..57b8c0573 100644 --- a/src/gbm/gbm.h +++ b/src/gbm/gbm.h @@ -46,6 +46,14 @@ class IGradBooster { * and recalculate from scratch */ virtual void ResetPredBuffer(size_t num_pbuffer) {} + /*! + * \brief whether the model allow lazy checkpoint + * return true if model is only updated in DoBoost + * after all Allreduce calls + */ + virtual bool AllowLazyCheckPoint(void) const { + return false; + } /*! * \brief peform update to the model(boosting) * \param p_fmat feature matrix that provide access to features @@ -76,7 +84,6 @@ class IGradBooster { const BoosterInfo &info, std::vector *out_preds, unsigned ntree_limit = 0) = 0; - /*! * \brief predict the leaf index of each tree, the output will be nsample * ntree vector * this is only valid in gbtree predictor diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp index e8f1b1933..c08d15dd7 100644 --- a/src/gbm/gbtree-inl.hpp +++ b/src/gbm/gbtree-inl.hpp @@ -90,13 +90,17 @@ class GBTree : public IGradBooster { pred_buffer.resize(mparam.PredBufferSize(), 0.0f); pred_counter.resize(mparam.PredBufferSize(), 0); } + virtual bool AllowLazyCheckPoint(void) const { + return !(tparam.distcol_mode != 0 && mparam.num_output_group != 1); + } virtual void DoBoost(IFMatrix *p_fmat, int64_t buffer_offset, const BoosterInfo &info, std::vector *in_gpair) { const std::vector &gpair = *in_gpair; - if (mparam.num_output_group == 1) { - this->BoostNewTrees(gpair, p_fmat, buffer_offset, info, 0); + std::vector > new_trees; + if (mparam.num_output_group == 1) { + new_trees.push_back(BoostNewTrees(gpair, p_fmat, buffer_offset, info, 0)); } else { const int ngroup = mparam.num_output_group; utils::Check(gpair.size() % ngroup == 0, @@ -108,9 +112,12 @@ class GBTree : public IGradBooster { for (bst_omp_uint i = 0; i < nsize; ++i) { tmp[i] = gpair[i * ngroup + gid]; } - this->BoostNewTrees(tmp, p_fmat, buffer_offset, info, gid); + new_trees.push_back(BoostNewTrees(tmp, p_fmat, buffer_offset, info, gid)); } } + for (int gid = 0; gid < mparam.num_output_group; ++gid) { + this->CommitModel(new_trees[gid], gid); + } } virtual void Predict(IFMatrix *p_fmat, int64_t buffer_offset, @@ -208,14 +215,15 @@ class GBTree : public IGradBooster { tparam.updater_initialized = 1; } // do group specific group - inline void BoostNewTrees(const std::vector &gpair, - IFMatrix *p_fmat, - int64_t buffer_offset, - const BoosterInfo &info, - int bst_group) { + inline std::vector + BoostNewTrees(const std::vector &gpair, + IFMatrix *p_fmat, + int64_t buffer_offset, + const BoosterInfo &info, + int bst_group) { + std::vector new_trees; this->InitUpdater(); // create the trees - std::vector new_trees; for (int i = 0; i < tparam.num_parallel_tree; ++i) { new_trees.push_back(new tree::RegTree()); for (size_t j = 0; j < cfg.size(); ++j) { @@ -226,9 +234,12 @@ class GBTree : public IGradBooster { // update the trees for (size_t i = 0; i < updaters.size(); ++i) { updaters[i]->Update(gpair, p_fmat, info, new_trees); - } + } // optimization, update buffer, if possible - if (buffer_offset >= 0 && + // this is only under distributed column mode + // for safety check of lazy checkpoint + if ( + buffer_offset >= 0 && new_trees.size() == 1 && updaters.size() > 0 && updaters.back()->GetLeafPosition() != NULL) { utils::Check(info.num_row == p_fmat->buffered_rowset().size(), @@ -238,12 +249,15 @@ class GBTree : public IGradBooster { *new_trees[0], updaters.back()->GetLeafPosition()); } - // push back to model + return new_trees; + } + // commit new trees all at once + inline void CommitModel(const std::vector &new_trees, int bst_group) { for (size_t i = 0; i < new_trees.size(); ++i) { trees.push_back(new_trees[i]); tree_info.push_back(bst_group); } - mparam.num_trees += tparam.num_parallel_tree; + mparam.num_trees += static_cast(new_trees.size()); } // update buffer by pre-cached position inline void UpdateBufferByPosition(IFMatrix *p_fmat, @@ -264,7 +278,7 @@ class GBTree : public IGradBooster { for (int i = 0; i < mparam.size_leaf_vector; ++i) { pred_buffer[bid + i + 1] += new_tree.leafvec(tid)[i]; } - pred_counter[bid] += 1; + pred_counter[bid] += tparam.num_parallel_tree; } } // make a prediction for a single instance @@ -362,6 +376,8 @@ class GBTree : public IGradBooster { int num_parallel_tree; /*! \brief whether updater is already initialized */ int updater_initialized; + /*! \brief distributed column mode */ + int distcol_mode; /*! \brief tree updater sequence */ std::string updater_seq; // construction @@ -370,6 +386,7 @@ class GBTree : public IGradBooster { updater_seq = "grow_colmaker,prune"; num_parallel_tree = 1; updater_initialized = 0; + distcol_mode = 0; } inline void SetParam(const char *name, const char *val){ using namespace std; @@ -378,6 +395,9 @@ class GBTree : public IGradBooster { updater_seq = val; updater_initialized = 0; } + if (!strcmp(name, "dsplit") && !strcmp(val, "col")) { + distcol_mode = 1; + } if (!strcmp(name, "nthread")) { omp_set_num_threads(nthread = atoi(val)); } diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp index ae0967ce8..5e3622e4d 100644 --- a/src/learner/learner-inl.hpp +++ b/src/learner/learner-inl.hpp @@ -270,6 +270,12 @@ class BoostLearner : public rabit::ISerializable { obj_->GetGradient(preds_, train.info, iter, &gpair_); gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_); } + /*! + * \brief whether model allow lazy checkpoint + */ + inline bool AllowLazyCheckPoint(void) const { + return gbm_->AllowLazyCheckPoint(); + } /*! * \brief evaluate the model for specific iteration * \param iter iteration number diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp index db37cbd1d..94e6d6bc1 100644 --- a/src/xgboost_main.cpp +++ b/src/xgboost_main.cpp @@ -48,7 +48,7 @@ class BoostLearnTask { std::string pname = rabit::GetProcessorName(); fprintf(stderr, "start %s:%d\n", pname.c_str(), rabit::GetRank()); } - if (rabit::IsDistributed()) { + if (rabit::IsDistributed() && data_split == "NONE") { this->SetParam("dsplit", "row"); } if (rabit::GetRank() != 0) { @@ -89,6 +89,7 @@ class BoostLearnTask { if (!strcmp("fmap", name)) name_fmap = val; if (!strcmp("name_dump", name)) name_dump = val; if (!strcmp("name_pred", name)) name_pred = val; + if (!strcmp("dsplit", name)) data_split = val; if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val); if (!strncmp("eval[", name, 5)) { char evname[256]; @@ -116,6 +117,7 @@ class BoostLearnTask { name_pred = "pred.txt"; name_dump = "dump.txt"; model_dir_path = "./"; + data_split = "NONE"; load_part = 0; data = NULL; } @@ -172,14 +174,24 @@ class BoostLearnTask { inline void TaskTrain(void) { int version = rabit::LoadCheckPoint(&learner); if (version == 0) this->InitLearner(); - const time_t start = time(NULL); unsigned long elapsed = 0; learner.CheckInit(data); - for (int i = version; i < num_round; ++i) { + + bool allow_lazy = learner.AllowLazyCheckPoint(); + for (int i = version / 2; i < num_round; ++i) { elapsed = (unsigned long)(time(NULL) - start); - if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed); - learner.UpdateOneIter(i, *data); + if (version % 2 == 0) { + if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed); + learner.UpdateOneIter(i, *data); + if (allow_lazy) { + rabit::LazyCheckPoint(&learner); + } else { + rabit::CheckPoint(&learner); + } + version += 1; + } + utils::Assert(version == rabit::VersionNumber(), "consistent check"); std::string res = learner.EvalOneIter(i, devalall, eval_data_names); if (rabit::IsDistributed()){ if (rabit::GetRank() == 0) { @@ -193,9 +205,13 @@ class BoostLearnTask { if (save_period != 0 && (i + 1) % save_period == 0) { this->SaveModel(i); } - utils::Assert(rabit::VersionNumber() == i, "incorrect version number"); - // checkpoint the model - rabit::CheckPoint(&learner); + if (allow_lazy) { + rabit::LazyCheckPoint(&learner); + } else { + rabit::CheckPoint(&learner); + } + version += 1; + utils::Assert(version == rabit::VersionNumber(), "consistent check"); elapsed = (unsigned long)(time(NULL) - start); } // always save final round @@ -272,6 +288,8 @@ class BoostLearnTask { std::string task; /*! \brief name of predict file */ std::string name_pred; + /*! \brief data split mode */ + std::string data_split; /*!\brief limit number of trees in prediction */ int ntree_limit; /*!\brief whether to directly output margin value */