Fix the row split recovery; add per-iteration random number seed

This commit is contained in:
tqchen 2014-12-21 17:31:42 -08:00
parent eff5c6baa8
commit 677475529f
6 changed files with 34 additions and 23 deletions

View File

@ -13,7 +13,7 @@ endif
# by default use c++11
ifeq ($(no_cxx11),1)
else
CFLAGS += -std=c++11
CFLAGS +=
endif
# specify tensor path
@ -30,7 +30,7 @@ mpi: $(MPIBIN)
# rules to get rabit library
librabit:
if [ ! -d rabit ]; then git clone https://github.com/tqchen/rabit.git; fi
cd rabit;make lib/librabit.a; cd -
cd rabit;make lib/librabit.a lib/librabit_mock.a; cd -
librabit_mpi:
if [ ! -d rabit ]; then git clone https://github.com/tqchen/rabit.git; fi
cd rabit;make lib/librabit_mpi.a; cd -

View File

@ -16,7 +16,7 @@ k=$1
python splitsvm.py ../../demo/data/agaricus.txt.train train $k
# run xgboost mpi
../../rabit/tracker/rabit_mpi.py $k local ../../rabit/test/keepalive.sh ../../xgboost mushroom-col.conf dsplit=col mock=0,0,1,0 mock=1,1,0,0
../../rabit/tracker/rabit_mpi.py $k local ../../rabit/test/keepalive.sh ../../xgboost mushroom-col.conf dsplit=col mock=0,1,0,0 mock=1,1,0,0
# the model can be directly loaded by single machine xgboost solver, as usual
#../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt

View File

@ -34,6 +34,8 @@ class BoostLearner : public rabit::ISerializable {
prob_buffer_row = 1.0f;
distributed_mode = 0;
pred_buffer_size = 0;
seed_per_iteration = 0;
seed = 0;
}
virtual ~BoostLearner(void) {
if (obj_ != NULL) delete obj_;
@ -102,7 +104,10 @@ class BoostLearner : public rabit::ISerializable {
this->SetParam("updater", "grow_colmaker,refresh,prune");
}
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
if (!strcmp("seed", name)) random::Seed(atoi(val));
if (!strcmp("seed", name)) {
this->seed = seed; random::Seed(atoi(val));
}
if (!strcmp("seed_per_iter", name)) seed_per_iteration = atoi(val);
if (!strcmp(name, "num_class")) this->SetParam("num_output_group", val);
if (!strcmp(name, "nthread")) {
omp_set_num_threads(atoi(val));
@ -222,6 +227,9 @@ class BoostLearner : public rabit::ISerializable {
* \param p_train pointer to the data matrix
*/
inline void UpdateOneIter(int iter, const DMatrix &train) {
if (seed_per_iteration || rabit::IsDistributed()) {
random::Seed(this->seed * kRandSeedMagic);
}
this->PredictRaw(train, &preds_);
obj_->GetGradient(preds_, train.info, iter, &gpair_);
gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_);
@ -369,6 +377,12 @@ class BoostLearner : public rabit::ISerializable {
}
};
// data fields
// stored random seed
int seed;
// whether seed the PRNG each iteration
// this is important for restart from existing iterations
// default set to no, but will auto switch on in distributed mode
int seed_per_iteration;
// silent during training
int silent;
// distributed learning mode, if any, 0:none, 1:col, 2:row
@ -397,6 +411,8 @@ class BoostLearner : public rabit::ISerializable {
std::vector<bst_gpair> gpair_;
protected:
// magic number to transform random seed
const static int kRandSeedMagic = 127;
// cache entry object that helps handle feature caching
struct CacheEntry {
const DMatrix *mat_;

View File

@ -76,19 +76,15 @@ class BaseMaker: public IUpdater {
unsigned n = static_cast<unsigned>(p * findex.size());
random::Shuffle(findex);
findex.resize(n);
if (n != findex.size()) {
// sync the findex if it is subsample
std::string s_cache;
utils::MemoryBufferStream fc(&s_cache);
utils::IStream &fs = fc;
if (rabit::GetRank() == 0) {
fs.Write(findex);
rabit::Broadcast(&s_cache, 0);
} else {
rabit::Broadcast(&s_cache, 0);
fs.Read(&findex);
}
// sync the findex if it is subsample
std::string s_cache;
utils::MemoryBufferStream fc(&s_cache);
utils::IStream &fs = fc;
if (rabit::GetRank() == 0) {
fs.Write(findex);
}
rabit::Broadcast(&s_cache, 0);
fs.Read(&findex);
}
private:

View File

@ -40,12 +40,11 @@ class TreeSyncher: public IUpdater {
for (size_t i = 0; i < trees.size(); ++i) {
trees[i]->SaveModel(fs);
}
rabit::Broadcast(&s_model, 0);
} else {
rabit::Broadcast(&s_model, 0);
for (size_t i = 0; i < trees.size(); ++i) {
trees[i]->LoadModel(fs);
}
}
fs.Seek(0);
rabit::Broadcast(&s_model, 0);
for (size_t i = 0; i < trees.size(); ++i) {
trees[i]->LoadModel(fs);
}
}
};

View File

@ -284,8 +284,8 @@ class BoostLearnTask {
}
int main(int argc, char *argv[]){
xgboost::random::Seed(0);
xgboost::BoostLearnTask tsk;
tsk.SetParam("seed", "0");
int ret = tsk.Run(argc, argv);
rabit::Finalize();
return ret;