From b762231b0280feecedff132ff9f96c2ae4f340ed Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Thu, 15 Jan 2015 21:32:31 -0800
Subject: [PATCH] change makefile to lazy checkpoint, fix col split code

---
 Makefile                                      |  8 ++--
 .../col-split/mushroom-col-rabit-mock.sh      |  2 +-
 multi-node/col-split/mushroom-col-rabit.sh    |  8 ++--
 .../row-split/machine-row-rabit-mock.sh       |  2 +-
 src/gbm/gbm.h                                 |  9 +++-
 src/gbm/gbtree-inl.hpp                        | 48 +++++++++++++------
 src/learner/learner-inl.hpp                   |  6 +++
 src/xgboost_main.cpp                          | 34 +++++++++----
 8 files changed, 84 insertions(+), 33 deletions(-)

diff --git a/Makefile b/Makefile
index 7c89d24da..9716f8149 100644
--- a/Makefile
+++ b/Makefile
@@ -18,9 +18,9 @@ endif
 
 # specify tensor path
 BIN = xgboost 
-MOCKBIN = xgboost-mock
+MOCKBIN = xgboost.mock
 OBJ = updater.o gbm.o io.o main.o 
-MPIBIN = xgboost-mpi
+MPIBIN = xgboost.mpi
 SLIB = wrapper/libxgboostwrapper.so 
 
 .PHONY: clean all mpi python Rpack librabit librabit_mpi
@@ -42,8 +42,8 @@ updater.o: src/tree/updater.cpp  src/tree/*.hpp src/*.h src/tree/*.h src/utils/*
 gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h 
 io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
 main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h 
-xgboost-mpi:  updater.o gbm.o io.o main.o librabit_mpi
-xgboost-mock: updater.o gbm.o io.o main.o librabit
+xgboost.mpi:  updater.o gbm.o io.o main.o librabit_mpi
+xgboost.mock: updater.o gbm.o io.o main.o librabit
 xgboost:  updater.o gbm.o io.o main.o  librabit
 wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h  updater.o gbm.o io.o librabit
 
diff --git a/multi-node/col-split/mushroom-col-rabit-mock.sh b/multi-node/col-split/mushroom-col-rabit-mock.sh
index 269967419..65e62309a 100755
--- a/multi-node/col-split/mushroom-col-rabit-mock.sh
+++ b/multi-node/col-split/mushroom-col-rabit-mock.sh
@@ -16,7 +16,7 @@ k=$1
 python splitsvm.py ../../demo/data/agaricus.txt.train train $k
 
 # run xgboost mpi
-../../rabit/tracker/rabit_mpi.py $k local ../../rabit/test/keepalive.sh ../../xgboost-mock mushroom-col.conf dsplit=col mock=0,1,0,0 mock=1,1,0,0
+../../rabit/tracker/rabit_demo.py -n $k  ../../xgboost.mock mushroom-col.conf dsplit=col mock=0,2,0,0 mock=1,2,0,0 mock=2,2,8,0 mock=2,3,0,0
 
 # the model can be directly loaded by single machine xgboost solver, as usuall
 #../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
diff --git a/multi-node/col-split/mushroom-col-rabit.sh b/multi-node/col-split/mushroom-col-rabit.sh
index b9595e5b7..f958305aa 100755
--- a/multi-node/col-split/mushroom-col-rabit.sh
+++ b/multi-node/col-split/mushroom-col-rabit.sh
@@ -16,13 +16,13 @@ k=$1
 python splitsvm.py ../../demo/data/agaricus.txt.train train $k
 
 # run xgboost mpi
-../../rabit/tracker/rabit_mpi.py $k local ../../xgboost mushroom-col.conf dsplit=col
+../../rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf dsplit=col
 
 # the model can be directly loaded by single machine xgboost solver, as usuall
-../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
+../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt 
 
 # run for one round, and continue training
-../../rabit/tracker/rabit_mpi.py $k local  ../../xgboost mushroom-col.conf dsplit=col num_round=1
-../../rabit/tracker/rabit_mpi.py $k local  ../../xgboost mushroom-col.conf  mushroom-col.conf dsplit=col model_in=0001.model
+../../rabit/tracker/rabit_demo.py -n $k  ../../xgboost mushroom-col.conf dsplit=col num_round=1
+../../rabit/tracker/rabit_demo.py -n $k  ../../xgboost mushroom-col.conf  mushroom-col.conf dsplit=col model_in=0001.model
 
 cat dump.nice.$k.txt
diff --git a/multi-node/row-split/machine-row-rabit-mock.sh b/multi-node/row-split/machine-row-rabit-mock.sh
index b08e7d4e6..b8ef10b2d 100755
--- a/multi-node/row-split/machine-row-rabit-mock.sh
+++ b/multi-node/row-split/machine-row-rabit-mock.sh
@@ -17,4 +17,4 @@ cd -
 python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
 
 # run xgboost mpi
-../../rabit/tracker/rabit_mpi.py -n $k ../../rabit/test/keepalive.sh ../../xgboost-mock machine-row.conf dsplit=row num_round=3 mock=1,1,1,0 
+../../rabit/tracker/rabit_demo.py -n $k  ../../xgboost.mock machine-row.conf dsplit=row num_round=3 mock=1,1,1,0  mock=0,0,3,0 mock=2,2,3,0
diff --git a/src/gbm/gbm.h b/src/gbm/gbm.h
index 8799a7af0..57b8c0573 100644
--- a/src/gbm/gbm.h
+++ b/src/gbm/gbm.h
@@ -46,6 +46,14 @@ class IGradBooster {
    * and recalculate from scratch
    */
   virtual void ResetPredBuffer(size_t num_pbuffer) {}
+  /*!
+   * \brief whether the model allows lazy checkpoint
+   * \return true if the model is only updated in DoBoost
+   *   after all Allreduce calls
+   */
+  virtual bool AllowLazyCheckPoint(void) const {
+    return false;
+  }
   /*!
    * \brief peform update to the model(boosting)
    * \param p_fmat feature matrix that provide access to features
@@ -76,7 +84,6 @@ class IGradBooster {
                        const BoosterInfo &info,
                        std::vector<float> *out_preds,
                        unsigned ntree_limit = 0) = 0;
-  
   /*!
    * \brief predict the leaf index of each tree, the output will be nsample * ntree vector
    *        this is only valid in gbtree predictor
diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp
index e8f1b1933..c08d15dd7 100644
--- a/src/gbm/gbtree-inl.hpp
+++ b/src/gbm/gbtree-inl.hpp
@@ -90,13 +90,17 @@ class GBTree : public IGradBooster {
     pred_buffer.resize(mparam.PredBufferSize(), 0.0f);
     pred_counter.resize(mparam.PredBufferSize(), 0);
   }
+  virtual bool AllowLazyCheckPoint(void) const {
+    return !(tparam.distcol_mode != 0  && mparam.num_output_group != 1);
+  }
   virtual void DoBoost(IFMatrix *p_fmat,
                        int64_t buffer_offset,
                        const BoosterInfo &info,
                        std::vector<bst_gpair> *in_gpair) {
     const std::vector<bst_gpair> &gpair = *in_gpair;
-    if (mparam.num_output_group == 1) {
-      this->BoostNewTrees(gpair, p_fmat, buffer_offset, info, 0);
+    std::vector<std::vector<tree::RegTree*> > new_trees;
+    if (mparam.num_output_group == 1) {      
+      new_trees.push_back(BoostNewTrees(gpair, p_fmat, buffer_offset, info, 0));
     } else {
       const int ngroup = mparam.num_output_group;
       utils::Check(gpair.size() % ngroup == 0,
@@ -108,9 +112,12 @@ class GBTree : public IGradBooster {
         for (bst_omp_uint i = 0; i < nsize; ++i) {
           tmp[i] = gpair[i * ngroup + gid];
         }
-        this->BoostNewTrees(tmp, p_fmat, buffer_offset, info, gid);
+        new_trees.push_back(BoostNewTrees(tmp, p_fmat, buffer_offset, info, gid));
       }
     }
+    for (int gid = 0; gid < mparam.num_output_group; ++gid) {
+      this->CommitModel(new_trees[gid], gid);
+    }
   }
   virtual void Predict(IFMatrix *p_fmat,
                        int64_t buffer_offset,
@@ -208,14 +215,15 @@ class GBTree : public IGradBooster {
     tparam.updater_initialized = 1;
   }
   // do group specific group
-  inline void BoostNewTrees(const std::vector<bst_gpair> &gpair,
-                            IFMatrix *p_fmat,
-                            int64_t buffer_offset,
-                            const BoosterInfo &info,
-                            int bst_group) {
+  inline std::vector<tree::RegTree*>
+  BoostNewTrees(const std::vector<bst_gpair> &gpair,
+                IFMatrix *p_fmat,
+                int64_t buffer_offset,
+                const BoosterInfo &info,
+                int bst_group) {
+    std::vector<tree::RegTree *> new_trees;
     this->InitUpdater();
     // create the trees
-    std::vector<tree::RegTree *> new_trees;
     for (int i = 0; i < tparam.num_parallel_tree; ++i) {
       new_trees.push_back(new tree::RegTree());
       for (size_t j = 0; j < cfg.size(); ++j) {
@@ -226,9 +234,12 @@ class GBTree : public IGradBooster {
     // update the trees
     for (size_t i = 0; i < updaters.size(); ++i) {
       updaters[i]->Update(gpair, p_fmat, info, new_trees);
-    }
+    }    
     // optimization, update buffer, if possible
-    if (buffer_offset >= 0 &&
+    // this is only under distributed column mode
+    // for safety check of lazy checkpoint
+    if (
+        buffer_offset >= 0 &&
         new_trees.size() == 1 && updaters.size() > 0 &&
         updaters.back()->GetLeafPosition() != NULL) {
       utils::Check(info.num_row == p_fmat->buffered_rowset().size(),
@@ -238,12 +249,15 @@ class GBTree : public IGradBooster {
                                    *new_trees[0],
                                    updaters.back()->GetLeafPosition());
     }
-    // push back to model
+    return new_trees;
+  }
+  // commit new trees all at once
+  inline void CommitModel(const std::vector<tree::RegTree*> &new_trees, int bst_group) {
     for (size_t i = 0; i < new_trees.size(); ++i) {
       trees.push_back(new_trees[i]);
       tree_info.push_back(bst_group);
     }
-    mparam.num_trees += tparam.num_parallel_tree;
+    mparam.num_trees += static_cast<int>(new_trees.size());
   }
   // update buffer by pre-cached position
   inline void UpdateBufferByPosition(IFMatrix *p_fmat,
@@ -264,7 +278,7 @@ class GBTree : public IGradBooster {
       for (int i = 0; i < mparam.size_leaf_vector; ++i) {
         pred_buffer[bid + i + 1] += new_tree.leafvec(tid)[i];
       }
-      pred_counter[bid] += 1;
+      pred_counter[bid] += tparam.num_parallel_tree;
     }
   }
   // make a prediction for a single instance
@@ -362,6 +376,8 @@ class GBTree : public IGradBooster {
     int num_parallel_tree;
     /*! \brief whether updater is already initialized */
     int updater_initialized;
+    /*! \brief distributed column mode */
+    int distcol_mode;
     /*! \brief tree updater sequence */
     std::string updater_seq;
     // construction
@@ -370,6 +386,7 @@ class GBTree : public IGradBooster {
       updater_seq = "grow_colmaker,prune";
       num_parallel_tree = 1;
       updater_initialized = 0;
+      distcol_mode = 0;
     }
     inline void SetParam(const char *name, const char *val){
       using namespace std;
@@ -378,6 +395,9 @@ class GBTree : public IGradBooster {
         updater_seq = val;
         updater_initialized = 0;
       }
+      if (!strcmp(name, "dsplit") && !strcmp(val, "col")) {
+        distcol_mode = 1;
+      }
       if (!strcmp(name, "nthread")) {
         omp_set_num_threads(nthread = atoi(val));
       }
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index ae0967ce8..5e3622e4d 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -270,6 +270,12 @@ class BoostLearner : public rabit::ISerializable {
     obj_->GetGradient(preds_, train.info, iter, &gpair_);
     gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_);
   }
+  /*!
+   * \brief whether the model allows lazy checkpoint
+   */
+  inline bool AllowLazyCheckPoint(void) const {
+    return gbm_->AllowLazyCheckPoint();
+  }
   /*!
    * \brief evaluate the model for specific iteration
    * \param iter iteration number
diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp
index db37cbd1d..94e6d6bc1 100644
--- a/src/xgboost_main.cpp
+++ b/src/xgboost_main.cpp
@@ -48,7 +48,7 @@ class BoostLearnTask {
       std::string pname = rabit::GetProcessorName();
       fprintf(stderr, "start %s:%d\n", pname.c_str(), rabit::GetRank());
     }
-    if (rabit::IsDistributed()) {
+    if (rabit::IsDistributed() && data_split == "NONE") {
       this->SetParam("dsplit", "row");
     }
     if (rabit::GetRank() != 0) {
@@ -89,6 +89,7 @@ class BoostLearnTask {
     if (!strcmp("fmap", name)) name_fmap = val;
     if (!strcmp("name_dump", name)) name_dump = val;
     if (!strcmp("name_pred", name)) name_pred = val;
+    if (!strcmp("dsplit", name)) data_split = val;
     if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val);
     if (!strncmp("eval[", name, 5)) {
       char evname[256];
@@ -116,6 +117,7 @@ class BoostLearnTask {
     name_pred = "pred.txt";
     name_dump = "dump.txt";
     model_dir_path = "./";
+    data_split = "NONE";
     load_part = 0;
     data = NULL;
   }
@@ -172,14 +174,24 @@ class BoostLearnTask {
   inline void TaskTrain(void) {
     int version = rabit::LoadCheckPoint(&learner);
     if (version == 0) this->InitLearner();
-
     const time_t start = time(NULL);
     unsigned long elapsed = 0;
     learner.CheckInit(data);
-    for (int i = version; i < num_round; ++i) {
+
+    bool allow_lazy = learner.AllowLazyCheckPoint();
+    for (int i = version / 2; i < num_round; ++i) {
       elapsed = (unsigned long)(time(NULL) - start);
-      if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
-      learner.UpdateOneIter(i, *data); 
+      if (version % 2 == 0) { 
+        if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
+        learner.UpdateOneIter(i, *data);
+        if (allow_lazy) {
+          rabit::LazyCheckPoint(&learner);
+        } else {
+          rabit::CheckPoint(&learner);
+        }
+        version += 1;
+      }
+      utils::Assert(version == rabit::VersionNumber(), "consistent check");
       std::string res = learner.EvalOneIter(i, devalall, eval_data_names);
       if (rabit::IsDistributed()){
         if (rabit::GetRank() == 0) {
@@ -193,9 +205,13 @@ class BoostLearnTask {
       if (save_period != 0 && (i + 1) % save_period == 0) {
         this->SaveModel(i);
       }
-      utils::Assert(rabit::VersionNumber() == i, "incorrect version number");
-      // checkpoint the model
-      rabit::CheckPoint(&learner);
+      if (allow_lazy) {
+        rabit::LazyCheckPoint(&learner);
+      } else {
+        rabit::CheckPoint(&learner);
+      }
+      version += 1;
+      utils::Assert(version == rabit::VersionNumber(), "consistent check");
       elapsed = (unsigned long)(time(NULL) - start);
     }
     // always save final round
@@ -272,6 +288,8 @@ class BoostLearnTask {
   std::string task;
   /*! \brief name of predict file */
   std::string name_pred;
+  /*! \brief data split mode */
+  std::string data_split;
   /*!\brief limit number of trees in prediction */
   int ntree_limit;
   /*!\brief whether to directly output margin value */