get multinode in
parent 7c3a392136
commit c42ba8d281
@@ -28,6 +28,7 @@ Design Choice
 this will reduce the communication overhead and improve the performance.
 - One way to do this is to limit the MPI slots on each machine to 1, or to reserve nthread processors for each process.
 
-Examples
+Usage
 ====
 * [Column-based version](col-split)
+* [Row-based version](row-split)
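The one-slot-per-machine suggestion above can be made concrete with an MPI hostfile. A minimal sketch, assuming Open MPI (hostfile syntax and flags differ between MPI implementations, and the host names and thread count here are made up):

```
# hosts.txt: one MPI slot per machine, so exactly one multi-threaded
# xgboost-mpi process lands on each node
cat > hosts.txt <<'EOF'
node1 slots=1
node2 slots=1
node3 slots=1
EOF

# one process per machine, each using 8 local threads
mpirun --hostfile hosts.txt -n 3 ../../xgboost-mpi mushroom-row.conf dsplit=row nthread=8
```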
@@ -1,6 +1,6 @@
 Distributed XGBoost: Column Split Version
 ====
-* run ```bash run-mushroom.sh```
+* run ```bash mushroom-row.sh <n-mpi-process>```
 
 How to Use
 ====
multi-node/row-split/README.md (new file, 17 lines)
@@ -0,0 +1,17 @@
Distributed XGBoost: Row Split Version
====
* Mushroom: run ```bash mushroom-row.sh <n-mpi-process>```
* Machine: run ```bash machine-row.sh <n-mpi-process>```

How to Use
====
* First split the data by rows (a shell sketch of this step follows this file).
* In the config, specify the data file with a wildcard %d, where %d stands for the rank of the node; each node will then load its own part of the data.
* Enable row split mode by ```dsplit=row```.

Notes
====
* The code is multi-threaded, so you will want to run one xgboost-mpi process per node.
* The row-based solver splits the data by row, and each node works on its subset of rows. It uses an approximate histogram count algorithm, and will only examine a subset of the potential split points, as opposed to all of them.
* ```colsample_bytree``` is not enabled in row split mode so far.
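The row-splitting step is performed by the helper splitrows.py that the scripts below invoke; its exact behavior is not shown in this commit. A minimal shell equivalent, assuming a plain-text LIBSVM input, a round-robin split, and three processes:

```
# distribute the lines of the LIBSVM file over train.row0 .. train.row2;
# the names match the data="train.row%d" wildcard, so rank 0 reads
# train.row0, rank 1 reads train.row1, and so on
awk '{ print > ("train.row" ((NR - 1) % 3)) }' ../../demo/data/agaricus.txt.train

wc -l train.row*   # sanity check: three shards of roughly equal size
```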
multi-node/row-split/machine-row.conf (new file, 31 lines)
@@ -0,0 +1,31 @@
# General Parameters, see comment for each definition
# choose the tree booster, can also change to gblinear
booster = gbtree
# this is the only difference from classification: use reg:linear to do linear regression
# when labels are in [0,1] we can also use reg:logistic
objective = reg:linear

# Tree Booster Parameters
# step size shrinkage
eta = 1.0
# minimum loss reduction required to make a further partition
gamma = 1.0
# minimum sum of instance weight (hessian) needed in a child
min_child_weight = 1
# maximum depth of a tree
max_depth = 3

# Task Parameters
# the number of rounds of boosting
num_round = 2
# 0 means do not save any model except the final-round model
save_period = 0
use_buffer = 0

# The path of training data; %d is the wildcard for the rank of the node
data = "train-machine.row%d"
# The path of validation data, used to monitor the training process; here [test] sets the name of the validation set
eval[test] = "../../demo/regression/machine.txt.test"
# The path of test data
test:data = "../../demo/regression/machine.txt.test"
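Any key=value setting in this conf can also be appended to the xgboost-mpi command line, where it overrides the file; the scripts in this commit use that mechanism for dsplit and nthread. A sketch with illustrative values:

```
# trailing key=value arguments take precedence over the conf file
mpirun -n 2 ../../xgboost-mpi machine-row.conf dsplit=row num_round=4 max_depth=4
```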
multi-node/row-split/machine-row.sh (new executable file, 21 lines)
@@ -0,0 +1,21 @@
#!/bin/bash
if [[ $# -ne 1 ]]
then
    echo "Usage: $0 <nprocess>"
    exit 1
fi

rm -rf train-machine.row* *.model
k=$1
# make the machine data
cd ../../demo/regression/
python mapfeat.py
python mknfold.py machine.txt 1
cd -

# split the LIBSVM file into k subfiles
python splitrows.py ../../demo/regression/machine.txt.train train-machine $k

# run xgboost via MPI
mpirun -n $k ../../xgboost-mpi machine-row.conf dsplit=row
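For example, a four-process run (the process count is arbitrary):

```
# builds the machine data, splits it into train-machine.row0 .. train-machine.row3,
# then trains with one xgboost-mpi process per shard
bash machine-row.sh 4
```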
multi-node/row-split/mushroom-row.conf (new file, 35 lines)
@@ -0,0 +1,35 @@
# General Parameters, see comment for each definition
# choose the booster, can be gbtree or gblinear
booster = gbtree
# choose logistic regression loss function for binary classification
objective = binary:logistic

# Tree Booster Parameters
# step size shrinkage
eta = 1.0
# minimum loss reduction required to make a further partition
gamma = 1.0
# minimum sum of instance weight (hessian) needed in a child
min_child_weight = 1
# maximum depth of a tree
max_depth = 3

# Task Parameters
# the number of rounds of boosting
num_round = 2
# 0 means do not save any model except the final-round model
save_period = 0
use_buffer = 0

# The path of training data; %d is the wildcard for the rank of the node
# The idea is that each process takes a feature matrix with a subset of rows
#
data = "train.row%d"

# The path of validation data, used to monitor the training process; here [test] sets the name of the validation set
eval[test] = "../../demo/data/agaricus.txt.test"
# evaluate on the training data as well each round
eval_train = 1

# The path of test data; this needs the full test data, so try not to use it, or keep a subsampled version
test:data = "../../demo/data/agaricus.txt.test"
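A quick way to produce the subsampled test set that the comment above recommends (the 10% rate and output name are illustrative):

```
# keep every 10th line of the test set so each process evaluates a smaller
# file; point test:data at the result
awk 'NR % 10 == 0' ../../demo/data/agaricus.txt.test > agaricus.txt.test.sub
```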
multi-node/row-split/mushroom-row.sh (new executable file, 19 lines)
@@ -0,0 +1,19 @@
#!/bin/bash
if [[ $# -ne 1 ]]
then
    echo "Usage: $0 <nprocess>"
    exit 1
fi

rm -rf train.row* *.model
k=$1

# split the LIBSVM file into k subfiles
python splitrows.py ../../demo/data/agaricus.txt.train train $k

# run xgboost via MPI
mpirun -n $k ../../xgboost-mpi mushroom-row.conf dsplit=row nthread=1

# the model can be loaded directly by the single-machine xgboost solver, as usual
../../xgboost mushroom-row.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
cat dump.nice.$k.txt
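For instance, with three processes:

```
# trains on train.row0 .. train.row2, then dumps the final model
# (0002.model, since num_round = 2 with save_period = 0) as readable trees
bash mushroom-row.sh 3
cat dump.nice.3.txt
```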
@@ -160,13 +160,13 @@ class SerializeReducer {
   inline void AllReduce(DType *sendrecvobj, size_t max_n4byte, size_t count) {
     buffer.resize(max_n4byte * count);
     for (size_t i = 0; i < count; ++i) {
-      utils::MemoryFixSizeBuffer fs(BeginPtr(buffer) + i * max_n4byte * 4, max_n4byte * 4);
-      sendrecvobj[i]->Save(fs);
+      utils::MemoryFixSizeBuffer fs(BeginPtr(buffer) + i * max_n4byte, max_n4byte * 4);
+      sendrecvobj[i].Save(fs);
     }
     handle.AllReduce(BeginPtr(buffer), max_n4byte, count);
     for (size_t i = 0; i < count; ++i) {
-      utils::MemoryFixSizeBuffer fs(BeginPtr(buffer) + i * max_n4byte * 4, max_n4byte * 4);
-      sendrecvobj[i]->Load(fs);
+      utils::MemoryFixSizeBuffer fs(BeginPtr(buffer) + i * max_n4byte, max_n4byte * 4);
+      sendrecvobj[i].Load(fs);
     }
   }
@@ -178,12 +178,12 @@ class SerializeReducer {
     // temp space
     DType tsrc, tdst;
     for (int i = 0; i < len_; ++i) {
-      utils::MemoryFixSizeBuffer fsrc((void*)(src_) + i * nbytes, nbytes);
-      utils::MemoryFixSizeBuffer fdst(dst_ + i * nbytes, nbytes);
+      utils::MemoryFixSizeBuffer fsrc((char*)(src_) + i * nbytes, nbytes);
+      utils::MemoryFixSizeBuffer fdst((char*)(dst_) + i * nbytes, nbytes);
       tsrc.Load(fsrc);
       tdst.Load(fdst);
       // govern const check
-      tdst.Reduce(static_cast<const DType &>(tsrc));
+      tdst.Reduce(static_cast<const DType &>(tsrc), nbytes);
       fdst.Seek(0);
       tdst.Save(fdst);
     }
@@ -38,6 +38,9 @@ void Bcast(std::string *sendrecv_data, int root) {
 
 ReduceHandle::ReduceHandle(void) : handle(NULL) {}
 ReduceHandle::~ReduceHandle(void) {}
+int ReduceHandle::TypeSize(const MPI::Datatype &dtype) {
+  return 0;
+}
 void ReduceHandle::Init(ReduceFunction redfunc, size_t type_n4bytes, bool commute) {}
 void ReduceHandle::AllReduce(void *sendrecvbuf, size_t type_n4bytes, size_t n4byte) {}
 }  // namespace sync
@@ -97,9 +97,12 @@ void ReduceHandle::AllReduce(void *sendrecvbuf, size_t type_n4bytes, size_t coun
   utils::Assert(handle != NULL, "must intialize handle to call AllReduce");
   MPI::Op *op = reinterpret_cast<MPI::Op*>(handle);
   MPI::Datatype *dtype = reinterpret_cast<MPI::Datatype*>(htype);
 
-  if (created_type_n4bytes != type_n4bytes || htype == NULL) {
-    dtype->Free();
+  if (created_type_n4bytes != type_n4bytes || dtype == NULL) {
+    if (dtype == NULL) {
+      dtype = new MPI::Datatype();
+    } else {
+      dtype->Free();
+    }
     *dtype = MPI::INT.Create_contiguous(type_n4bytes);
     dtype->Commit();
     created_type_n4bytes = type_n4bytes;
@@ -18,7 +18,7 @@ IUpdater* CreateUpdater(const char *name) {
   if (!strcmp(name, "sync")) return new TreeSyncher();
   if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
   if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
-  //if (!strcmp(name, "grow_histmaker")) return new CQHistMaker<GradStats>();
+  if (!strcmp(name, "grow_histmaker")) return new CQHistMaker<GradStats>();
   //if (!strcmp(name, "grow_skmaker")) return new SketchMaker();
   if (!strcmp(name, "distcol")) return new DistColMaker<GradStats>();
@@ -507,7 +507,7 @@ class CQHistMaker: public HistMaker<TStats> {
   // node statistics
   std::vector<TStats> node_stats;
   // summary array
-  std::vector< WXQSketch::SummaryContainer> summary_array;
+  std::vector<WXQSketch::SummaryContainer> summary_array;
   // reducer for summary
   sync::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
   // per node, per feature sketch
@@ -517,6 +517,7 @@ class CQHistMaker: public HistMaker<TStats> {
 template<typename TStats>
 class QuantileHistMaker: public HistMaker<TStats> {
  protected:
+  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
   virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
                                   IFMatrix *p_fmat,
                                   const BoosterInfo &info,
@@ -624,9 +625,8 @@ class QuantileHistMaker: public HistMaker<TStats> {
   }
 
  private:
-  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
   // summary array
-  std::vector< WXQSketch::SummaryContainer> summary_array;
+  std::vector<WXQSketch::SummaryContainer> summary_array;
   // reducer for summary
   sync::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
   // local temp column data structure
@@ -106,7 +106,7 @@ struct MemoryFixSizeBuffer : public ISeekStream {
   }
   virtual ~MemoryFixSizeBuffer(void) {}
   virtual size_t Read(void *ptr, size_t size) {
-    utils::Assert(curr_ptr_ <= buffer_size_,
+    utils::Assert(curr_ptr_ + size <= buffer_size_,
                   "read can not have position excceed buffer length");
     size_t nread = std::min(buffer_size_ - curr_ptr_, size);
     if (nread != 0) memcpy(ptr, p_buffer_ + curr_ptr_, nread);
@@ -519,12 +519,12 @@ class QuantileSketchTemplate {
   /*! \brief same as summary, but use STL to backup the space */
   struct SummaryContainer : public Summary {
     std::vector<Entry> space;
-    explicit SummaryContainer(void) : Summary(NULL, 0) {
-    }
-    explicit SummaryContainer(const SummaryContainer &src) : Summary(NULL, src.size) {
+    SummaryContainer(const SummaryContainer &src) : Summary(NULL, src.size) {
       this->space = src.space;
       this->data = BeginPtr(this->space);
     }
+    SummaryContainer(void) : Summary(NULL, 0) {
+    }
     /*! \brief reserve space for summary */
     inline void Reserve(size_t size) {
       if (size > space.size()) {
@@ -576,13 +576,17 @@ class QuantileSketchTemplate {
     /*! \brief save the data structure into stream */
     inline void Save(IStream &fo) const {
       fo.Write(&(this->size), sizeof(this->size));
-      fo.Write(data, this->size * sizeof(Entry));
+      if (this->size != 0) {
+        fo.Write(this->data, this->size * sizeof(Entry));
+      }
     }
     /*! \brief load data structure from input stream */
     inline void Load(IStream &fi) {
       utils::Check(fi.Read(&this->size, sizeof(this->size)) != 0, "invalid SummaryArray 1");
       this->Reserve(this->size);
-      utils::Check(fi.Read(data, this->size * sizeof(Entry)) != 0, "invalid SummaryArray 2");
+      if (this->size != 0) {
+        utils::Check(fi.Read(this->data, this->size * sizeof(Entry)) != 0, "invalid SummaryArray 2");
+      }
     }
   };
   /*!