Merge branch 'unity'

Conflicts: .gitignore R-package/src/xgboost_R.cpp src/gbm/gblinear-inl.hpp tools/xgcombine_buffer.cpp
2015-01-18 20:09:21 -08:00
parent d50079f993 b898672753
commit f49fd88de8
70 changed files with 6411 additions and 275 deletions
--- a/src/tree/model.h
+++ b/src/tree/model.h
@@ -68,8 +68,9 @@ class TreeModel {
    }
  };
  /*! \brief tree node */
-  class Node{
+  class Node {
   public:
+	Node(void) : sindex_(0) {}
    /*! \brief index of left child */
    inline int cleft(void) const {
      return this->cleft_;
@@ -110,6 +111,10 @@ class TreeModel {
    inline bool is_left_child(void) const {
      return (parent_ & (1U << 31)) != 0;
    }
+    /*! \brief whether this node is deleted */
+    inline bool is_deleted(void) const {
+      return sindex_ == std::numeric_limits<unsigned>::max();
+    }
    /*! \brief whether current node is root */
    inline bool is_root(void) const {
      return parent_ == -1;
@@ -144,7 +149,11 @@ class TreeModel {
      this->cleft_ = -1;
      this->cright_ = right;
    }
-
+    /*! \brief mark that this node is deleted */
+    inline void mark_delete(void) {
+      this->sindex_ = std::numeric_limits<unsigned>::max();
+    }
+    
   private:
    friend class TreeModel<TSplitCond, TNodeStat>;
    /*! 
@@ -197,11 +206,11 @@ class TreeModel {
    leaf_vector.resize(param.num_nodes * param.size_leaf_vector); 
    return nd;
  }
-  // delete a tree node
+  // delete a tree node, keep the parent field to allow trace back
  inline void DeleteNode(int nid) {
    utils::Assert(nid >= param.num_roots, "can not delete root");
    deleted_nodes.push_back(nid);
-    nodes[nid].set_parent(-1);
+    nodes[nid].mark_delete();
    ++param.num_deleted;
  }

@@ -296,11 +305,12 @@ class TreeModel {
    }
    // chg deleted nodes
    deleted_nodes.resize(0);
-    for (int i = param.num_roots; i < param.num_nodes; i ++) {
-      if (nodes[i].is_root()) deleted_nodes.push_back(i);
+    for (int i = param.num_roots; i < param.num_nodes; ++i) {
+      if (nodes[i].is_deleted()) deleted_nodes.push_back(i);
    }
    utils::Assert(static_cast<int>(deleted_nodes.size()) == param.num_deleted,
-                  "number of deleted nodes do not match");
+                  "number of deleted nodes do not match, num_deleted=%d, dnsize=%lu, num_nodes=%d",
+                  param.num_deleted, deleted_nodes.size(), param.num_nodes);
  }
  /*! 
   * \brief save model to stream
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -36,8 +36,14 @@ struct TrainParam{
  float colsample_bytree;
  // speed optimization for dense column
  float opt_dense_col;
+  // accuracy of sketch
+  float sketch_eps;
+  // size multiplier of sketch, max sketch size = sketch_ratio / sketch_eps
+  float sketch_ratio;
  // leaf vector size
-  int size_leaf_vector;
+  int size_leaf_vector;  
+  // option for parallelization
+  int parallel_option;
  // number of threads to be used for tree construction,
  // if OpenMP is enabled, if equals 0, use system default
  int nthread;
@@ -55,6 +61,9 @@ struct TrainParam{
    opt_dense_col = 1.0f;
    nthread = 0;
    size_leaf_vector = 0;
+    parallel_option = 2;
+    sketch_eps = 0.1f;
+    sketch_ratio = 2.0f;
  }
  /*! 
   * \brief set parameters from outside 
@@ -76,10 +85,13 @@ struct TrainParam{
    if (!strcmp(name, "subsample")) subsample = static_cast<float>(atof(val));
    if (!strcmp(name, "colsample_bylevel")) colsample_bylevel = static_cast<float>(atof(val));
    if (!strcmp(name, "colsample_bytree")) colsample_bytree  = static_cast<float>(atof(val));
+    if (!strcmp(name, "sketch_eps")) sketch_eps  = static_cast<float>(atof(val));
+    if (!strcmp(name, "sketch_ratio")) sketch_ratio  = static_cast<float>(atof(val));
    if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val));
    if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val);
    if (!strcmp(name, "max_depth")) max_depth = atoi(val);
    if (!strcmp(name, "nthread")) nthread = atoi(val);
+    if (!strcmp(name, "parallel_option")) parallel_option = atoi(val);
    if (!strcmp(name, "default_direction")) {
      if (!strcmp(val, "learn")) default_direction = 0;
      if (!strcmp(val, "left")) default_direction = 1;
@@ -132,6 +144,12 @@ struct TrainParam{
  inline bool cannot_split(double sum_hess, int depth) const {
    return sum_hess < this->min_child_weight * 2.0;
  }
+  /*! \brief maximum sketch size */
+  inline unsigned max_sketch_size(void) const {
+    unsigned ret = static_cast<unsigned>(sketch_ratio / sketch_eps);
+    utils::Check(ret > 0, "sketch_ratio/sketch_eps must be bigger than 1");
+    return ret;
+  }

 protected:
  // functions for L1 cost
@@ -186,6 +204,10 @@ struct GradStats {
  inline void Add(const GradStats &b) {
    this->Add(b.sum_grad, b.sum_hess);
  }
+  /*! \brief same as add, reduce is used in All Reduce */
+  inline void Reduce(const GradStats &b) {
+    this->Add(b);
+  }
  /*! \brief set current value to a - b */
  inline void SetSubstract(const GradStats &a, const GradStats &b) {
    sum_grad = a.sum_grad - b.sum_grad;
@@ -262,6 +284,10 @@ struct CVGradStats : public GradStats {
      valid[i].Add(b.valid[i]);
    }
  }
+  /*! \brief same as add, reduce is used in All Reduce */
+  inline void Reduce(const CVGradStats &b) {
+    this->Add(b);
+  }
  /*! \brief set current value to a - b */
  inline void SetSubstract(const CVGradStats &a, const CVGradStats &b) {
    GradStats::SetSubstract(a, b);
@@ -341,6 +367,10 @@ struct SplitEntry{
      return false;
    }
  }
+  /*! \brief same as update, used by AllReduce*/
+  inline void Reduce(const SplitEntry &e) {
+    this->Update(e);
+  }
  /*!\return feature index to split on */
  inline unsigned split_index(void) const {
    return sindex & ((1U << 31) - 1U);
--- a/src/tree/updater.cpp
+++ b/src/tree/updater.cpp
@@ -1,18 +1,28 @@
 #define _CRT_SECURE_NO_WARNINGS
 #define _CRT_SECURE_NO_DEPRECATE
+#define NOMINMAX
 #include <cstring>
 #include "./updater.h"
+#include "./updater_sync-inl.hpp"
 #include "./updater_prune-inl.hpp"
 #include "./updater_refresh-inl.hpp"
 #include "./updater_colmaker-inl.hpp"
+#include "./updater_distcol-inl.hpp"
+#include "./updater_histmaker-inl.hpp"
+//#include "./updater_skmaker-inl.hpp"

 namespace xgboost {
 namespace tree {
 IUpdater* CreateUpdater(const char *name) {
  using namespace std;
  if (!strcmp(name, "prune")) return new TreePruner();
+  if (!strcmp(name, "sync")) return new TreeSyncher();
  if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
  if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
+  if (!strcmp(name, "grow_histmaker")) return new CQHistMaker<GradStats>();
+  //if (!strcmp(name, "grow_skmaker")) return new SketchMaker();
+  if (!strcmp(name, "distcol")) return new DistColMaker<GradStats>();
+
  utils::Error("unknown updater:%s", name);
  return NULL;
 }
--- a/src/tree/updater.h
+++ b/src/tree/updater.h
@@ -37,6 +37,16 @@ class IUpdater {
                      IFMatrix *p_fmat,
                      const BoosterInfo &info,
                      const std::vector<RegTree*> &trees) = 0;
+
+  /*! 
+   * \brief this is simply a function for optimizing performance
+   * this function asks the updater to return the leaf position of each instance in the p_fmat,
+   * if it is cached in the updater, if it is not available, return NULL
+   * \return array of leaf position of each instance in the last updated tree
+   */
+  virtual const int* GetLeafPosition(void) const {
+    return NULL;
+  }
  // destructor
  virtual ~IUpdater(void) {}
 };
--- a/src/tree/updater_basemaker-inl.hpp
+++ b/src/tree/updater_basemaker-inl.hpp
@@ -0,0 +1,409 @@
+#ifndef XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
+#define XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
+/*!
+ * \file updater_basemaker-inl.hpp
+ * \brief implement a common tree constructor
+ * \author Tianqi Chen
+ */
+#include <vector>
+#include <algorithm>
+#include <limits>
+#include <rabit.h>
+#include "../utils/random.h"
+#include "../utils/quantile.h"
+
+namespace xgboost {
+namespace tree {
+/*! 
+ * \brief base tree maker class that defines common operation
+ *  needed in tree making
+ */
+class BaseMaker: public IUpdater {
+ public:
+  // destructor
+  virtual ~BaseMaker(void) {}
+  // set training parameter
+  virtual void SetParam(const char *name, const char *val) {
+    param.SetParam(name, val);
+  }
+   
+ protected:
+  // helper to collect and query feature meta information
+  struct FMetaHelper {
+   public:
+    /*! \brief find type of each feature, use column format */
+    inline void InitByCol(IFMatrix *p_fmat,
+                          const RegTree &tree) {
+      fminmax.resize(tree.param.num_feature * 2);
+      std::fill(fminmax.begin(), fminmax.end(),
+                -std::numeric_limits<bst_float>::max());
+      // start accumulating statistics
+      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
+      iter->BeforeFirst();
+      while (iter->Next()) {
+        const ColBatch &batch = iter->Value();
+        for (bst_uint i = 0; i < batch.size; ++i) {
+          const bst_uint fid = batch.col_index[i];
+          const ColBatch::Inst &c = batch[i];
+          if (c.length != 0) {
+            fminmax[fid * 2 + 0] = std::max(-c[0].fvalue, fminmax[fid * 2 + 0]);
+            fminmax[fid * 2 + 1] = std::max(c[c.length - 1].fvalue, fminmax[fid * 2 + 1]);
+          }
+        }
+      }      
+      rabit::Allreduce<rabit::op::Max>(BeginPtr(fminmax), fminmax.size());
+    }
+    // get feature type, 0:empty 1:binary 2:real
+    inline int Type(bst_uint fid) const {
+      utils::Assert(fid * 2 + 1 < fminmax.size(),
+                    "FeatHelper fid exceed query bound ");
+      bst_float a = fminmax[fid * 2];
+      bst_float b = fminmax[fid * 2 + 1];
+      if (a == -std::numeric_limits<bst_float>::max()) return 0;
+      if (-a == b) return 1;
+      else return 2;
+    }
+    inline bst_float MaxValue(bst_uint fid) const {
+      return fminmax[fid *2 + 1];
+    }
+    inline void SampleCol(float p, std::vector<bst_uint> *p_findex) const {
+      std::vector<bst_uint> &findex = *p_findex;
+      findex.clear();
+      for (size_t i = 0; i < fminmax.size(); i += 2) {
+		const bst_uint fid = static_cast<bst_uint>(i / 2);
+        if (this->Type(fid) != 0) findex.push_back(fid);
+      }
+      unsigned n = static_cast<unsigned>(p * findex.size());
+      random::Shuffle(findex);
+      findex.resize(n);
+      // sync the findex if it is subsample
+      std::string s_cache;
+      utils::MemoryBufferStream fc(&s_cache);
+      utils::IStream &fs = fc;
+      if (rabit::GetRank() == 0) {
+        fs.Write(findex);
+      }
+      rabit::Broadcast(&s_cache, 0);
+      fs.Read(&findex);
+    }
+    
+   private:
+    std::vector<bst_float> fminmax;
+  };
+  // ------static helper functions ------
+  // helper function to get to next level of the tree
+  /*! \brief this is  helper function for row based data*/
+  inline static int NextLevel(const RowBatch::Inst &inst, const RegTree &tree, int nid) {
+    const RegTree::Node &n = tree[nid];
+    bst_uint findex = n.split_index();
+    for (unsigned i = 0; i < inst.length; ++i) {
+      if (findex == inst[i].index) {
+        if (inst[i].fvalue < n.split_cond()) {
+          return n.cleft();
+        } else {
+          return n.cright();
+        }
+      }
+    }
+    return n.cdefault();
+  }
+  /*! \brief get number of omp thread in current context */
+  inline static int get_nthread(void) {
+    int nthread;
+    #pragma omp parallel
+    {
+      nthread = omp_get_num_threads();
+    }
+    return nthread;
+  }
+  // ------class member helpers---------
+  /*! \brief initialize temp data structure */
+  inline void InitData(const std::vector<bst_gpair> &gpair,
+                       const IFMatrix &fmat,
+                       const std::vector<unsigned> &root_index,
+                       const RegTree &tree) {
+    utils::Assert(tree.param.num_nodes == tree.param.num_roots,
+                  "TreeMaker: can only grow new tree");
+    {// setup position
+      position.resize(gpair.size());
+      if (root_index.size() == 0) {
+        std::fill(position.begin(), position.end(), 0);
+      } else {
+        for (size_t i = 0; i < position.size(); ++i) {
+          position[i] = root_index[i];
+          utils::Assert(root_index[i] < (unsigned)tree.param.num_roots,
+                        "root index exceed setting");
+        }
+      }
+      // mark delete for the deleted datas
+      for (size_t i = 0; i < position.size(); ++i) {
+        if (gpair[i].hess < 0.0f) position[i] = ~position[i];
+      }
+      // mark subsample
+      if (param.subsample < 1.0f) {
+        for (size_t i = 0; i < position.size(); ++i) {
+          if (gpair[i].hess < 0.0f) continue;
+          if (random::SampleBinary(param.subsample) == 0) position[i] = ~position[i];
+        }
+      }
+    }
+    {// expand query
+      qexpand.reserve(256); qexpand.clear();
+      for (int i = 0; i < tree.param.num_roots; ++i) {
+        qexpand.push_back(i);
+      }
+      this->UpdateNode2WorkIndex(tree);
+    }
+  }
+  /*! \brief update queue expand add in new leaves */
+  inline void UpdateQueueExpand(const RegTree &tree) {
+    std::vector<int> newnodes;
+    for (size_t i = 0; i < qexpand.size(); ++i) {
+      const int nid = qexpand[i];
+      if (!tree[nid].is_leaf()) {
+        newnodes.push_back(tree[nid].cleft());
+        newnodes.push_back(tree[nid].cright());
+      }
+    }
+    // use new nodes for qexpand
+    qexpand = newnodes;
+    this->UpdateNode2WorkIndex(tree);
+  }
+  // return decoded position
+  inline int DecodePosition(bst_uint ridx) const{
+    const int pid = position[ridx];
+    return pid < 0 ? ~pid : pid;
+  }
+  // encode the encoded position value for ridx
+  inline void SetEncodePosition(bst_uint ridx, int nid) {
+    if (position[ridx] < 0) {
+      position[ridx] = ~nid;
+    } else {
+      position[ridx] = nid;
+    }
+  }
+  /*! 
+   * \brief this is helper function uses column based data structure,
+   *        reset the positions to the latest one
+   * \param nodes the set of nodes that contains the split to be used
+   * \param p_fmat feature matrix needed for tree construction
+   * \param tree the regression tree structure
+   */
+  inline void ResetPositionCol(const std::vector<int> &nodes, IFMatrix *p_fmat, const RegTree &tree) {
+    // set the positions in the nondefault
+    this->SetNonDefaultPositionCol(nodes, p_fmat, tree);
+    // set rest of instances to default position
+    const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
+    // set default direct nodes to default
+    // for leaf nodes that are not fresh, mark them as ~nid,
+    // so that they are ignored in future statistics collection
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+    
+    #pragma omp parallel for schedule(static)
+    for (bst_omp_uint i = 0; i < ndata; ++i) {
+      const bst_uint ridx = rowset[i];
+      const int nid = this->DecodePosition(ridx);
+      if (tree[nid].is_leaf()) {
+        // mark finish when it is not a fresh leaf
+        if (tree[nid].cright() == -1) {
+          position[ridx] = ~nid;
+        }
+        } else {
+        // push to default branch
+        if (tree[nid].default_left()) {
+          this->SetEncodePosition(ridx, tree[nid].cleft());
+        } else {
+          this->SetEncodePosition(ridx, tree[nid].cright());
+        }
+      }
+    }
+  }
+  /*!
+   * \brief this is helper function uses column based data structure,
+   *        update all positions into nondefault branch, if any, ignore the default branch
+   * \param nodes the set of nodes that contains the split to be used
+   * \param p_fmat feature matrix needed for tree construction
+   * \param tree the regression tree structure
+   */
+  virtual void SetNonDefaultPositionCol(const std::vector<int> &nodes,
+                                        IFMatrix *p_fmat, const RegTree &tree) {
+    // step 1, classify the non-default data into right places
+    std::vector<unsigned> fsplits;
+    for (size_t i = 0; i < nodes.size(); ++i) {
+      const int nid = nodes[i];
+      if (!tree[nid].is_leaf()) {
+        fsplits.push_back(tree[nid].split_index());
+      }
+    }
+    std::sort(fsplits.begin(), fsplits.end());
+    fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
+    
+    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
+    while (iter->Next()) {
+      const ColBatch &batch = iter->Value();
+      for (size_t i = 0; i < batch.size; ++i) {
+        ColBatch::Inst col = batch[i];
+        const bst_uint fid = batch.col_index[i];
+        const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
+        #pragma omp parallel for schedule(static)
+        for (bst_omp_uint j = 0; j < ndata; ++j) {
+          const bst_uint ridx = col[j].index;
+          const float fvalue = col[j].fvalue;
+          const int nid = this->DecodePosition(ridx);
+          // go back to parent, correct those who are not default
+          if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
+            if(fvalue < tree[nid].split_cond()) {
+              this->SetEncodePosition(ridx, tree[nid].cleft());
+            } else {
+              this->SetEncodePosition(ridx, tree[nid].cright());
+            }
+          }
+        }
+      }
+    }
+  }
+  /*! \brief helper function to get statistics from a tree */
+  template<typename TStats>
+  inline void GetNodeStats(const std::vector<bst_gpair> &gpair,
+                           const IFMatrix &fmat,
+                           const RegTree &tree,
+                           const BoosterInfo &info,
+                           std::vector< std::vector<TStats> > *p_thread_temp,
+                           std::vector<TStats> *p_node_stats) {
+    std::vector< std::vector<TStats> > &thread_temp = *p_thread_temp;
+    thread_temp.resize(this->get_nthread());
+    p_node_stats->resize(tree.param.num_nodes);
+    #pragma omp parallel
+    {
+      const int tid = omp_get_thread_num();
+      thread_temp[tid].resize(tree.param.num_nodes, TStats(param));
+      for (size_t i = 0; i < qexpand.size(); ++i) {
+        const unsigned nid = qexpand[i];
+        thread_temp[tid][nid].Clear();
+      }
+    }
+    const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
+    // setup position
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+    #pragma omp parallel for schedule(static)
+    for (bst_omp_uint i = 0; i < ndata; ++i) {
+      const bst_uint ridx = rowset[i];
+      const int nid = position[ridx];
+      const int tid = omp_get_thread_num();
+      if (nid >= 0) {
+        thread_temp[tid][nid].Add(gpair, info, ridx);
+      }
+    }
+    // sum the per thread statistics together
+    for (size_t j = 0; j < qexpand.size(); ++j) {
+      const int nid = qexpand[j];
+      TStats &s = (*p_node_stats)[nid];
+      s.Clear();
+      for (size_t tid = 0; tid < thread_temp.size(); ++tid) {
+        s.Add(thread_temp[tid][nid]);
+      }
+    }
+  }
+  /*! \brief common helper data structure to build sketch*/
+  struct SketchEntry {
+    /*! \brief total sum of amount to be met */
+    bst_float sum_total;
+    /*! \brief statistics used in the sketch */
+    bst_float rmin, wmin;
+    /*! \brief last seen feature value */
+    bst_float last_fvalue;
+    /*! \brief current size of sketch */
+    bst_float next_goal;
+    // pointer to the sketch to put things in
+    utils::WXQuantileSketch<bst_float, bst_float> *sketch;
+    // initialize the space
+    inline void Init(unsigned max_size) {
+      next_goal = -1.0f;
+      rmin = wmin = 0.0f;
+      sketch->temp.Reserve(max_size + 1);
+      sketch->temp.size = 0;
+    }
+    /*!
+     * \brief accumulate weight for fvalue; once fvalue changes, the previous value may be pushed to sketch->temp
+     * \param fvalue feature value, comes in sorted ascending order
+     * \param w weight
+     * \param max_size maximum number of entries allowed in sketch->temp, checked before each store
+     */
+    inline void Push(bst_float fvalue, bst_float w, unsigned max_size) {
+      if (next_goal == -1.0f) {
+        next_goal = 0.0f;
+        last_fvalue = fvalue;
+        wmin = w;
+        return;
+      }
+      if (last_fvalue != fvalue) {
+        bst_float rmax = rmin + wmin;
+        if (rmax >= next_goal) {
+          if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
+            utils::Assert(sketch->temp.size < max_size,
+                          "invalid maximum size max_size=%u, stemp.size=%lu\n",
+                          max_size, sketch->temp.size);
+            // push to sketch, the bound was checked above before the store
+            sketch->temp.data[sketch->temp.size] =
+                utils::WXQuantileSketch<bst_float, bst_float>::
+                Entry(rmin, rmax, wmin, last_fvalue);
+            ++sketch->temp.size;
+          }
+          if (sketch->temp.size == max_size) {
+            next_goal = sum_total * 2.0f + 1e-5f;
+          } else{
+            next_goal = static_cast<bst_float>(sketch->temp.size * sum_total / max_size);
+          }
+        }
+        rmin = rmax;
+        wmin = w;
+        last_fvalue = fvalue;
+      } else {
+        wmin += w;
+      }
+    }
+    /*! \brief push final unfinished value to the sketch, then call PushTemp; max_size bounds temp.size */
+    inline void Finalize(unsigned max_size) {
+      bst_float rmax = rmin + wmin;
+      if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
+        utils::Assert(sketch->temp.size <= max_size,
+                      "Finalize: invalid maximum size, max_size=%u, stemp.size=%lu",
+                      max_size, sketch->temp.size);
+        // push to sketch
+        sketch->temp.data[sketch->temp.size] =
+            utils::WXQuantileSketch<bst_float, bst_float>::
+            Entry(rmin, rmax, wmin, last_fvalue);
+        ++sketch->temp.size;
+      }
+      sketch->PushTemp();
+    }
+  };
+  /*! \brief training parameter of tree grower */
+  TrainParam param;
+  /*! \brief queue of nodes to be expanded */
+  std::vector<int> qexpand;
+  /*!
+   * \brief map active node to its working index offset in qexpand,
+   *   can be -1, which means the node is not actively expanding
+   */
+  std::vector<int> node2workindex;
+  /*!
+   * \brief position of each instance in the tree
+   *   can be negative, which means this position is no longer expanding
+   *   see also Decode/EncodePosition
+   */
+  std::vector<int> position;
+
+ private:
+  inline void UpdateNode2WorkIndex(const RegTree &tree) {
+    // rebuild node2workindex: -1 for every node, then the offset of each entry of qexpand;
+    // assign() also sets slots added by growth to -1 (fill + resize left them at 0)
+    node2workindex.assign(static_cast<size_t>(tree.param.num_nodes), -1);
+    for (size_t i = 0; i < qexpand.size(); ++i) {
+      node2workindex[qexpand[i]] = static_cast<int>(i);
+    }
+  }
+};
+}  // namespace tree
+}  // namespace xgboost
+#endif // XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
--- a/src/tree/updater_colmaker-inl.hpp
+++ b/src/tree/updater_colmaker-inl.hpp
@@ -14,7 +14,7 @@

 namespace xgboost {
 namespace tree {
-/*! \brief pruner that prunes a tree after growing finishs */
+/*! \brief columnwise update to construct a tree */
 template<typename TStats>
 class ColMaker: public IUpdater {
 public:
@@ -36,24 +36,29 @@ class ColMaker: public IUpdater {
      Builder builder(param);
      builder.Update(gpair, p_fmat, info, trees[i]);
    }
+
    param.learning_rate = lr;
  }

- private:
+ protected:
  // training parameter
  TrainParam param;
  // data structure
  /*! \brief per thread x per node entry to store tmp data */
  struct ThreadEntry {
-    /*! \brief statistics of data*/
+    /*! \brief statistics of data */
    TStats stats;
+    /*! \brief extra statistics of data */
+    TStats stats_extra;
    /*! \brief last feature value scanned */
    float  last_fvalue;
+    /*! \brief first feature value scanned */
+    float  first_fvalue;
    /*! \brief current best solution */
    SplitEntry best;
    // constructor
    explicit ThreadEntry(const TrainParam &param)
-        : stats(param) {
+        : stats(param), stats_extra(param) {
    }
  };
  struct NodeEntry {
@@ -104,7 +109,7 @@ class ColMaker: public IUpdater {
      }
    }

-   private:
+   protected:
    // initialize temp data structure
    inline void InitData(const std::vector<bst_gpair> &gpair,
                         const IFMatrix &fmat,
@@ -127,17 +132,17 @@ class ColMaker: public IUpdater {
        // mark delete for the deleted datas
        for (size_t i = 0; i < rowset.size(); ++i) {
          const bst_uint ridx = rowset[i];
-          if (gpair[ridx].hess < 0.0f) position[ridx] = -1;
+          if (gpair[ridx].hess < 0.0f) position[ridx] = ~position[ridx];
        }
        // mark subsample
        if (param.subsample < 1.0f) {
          for (size_t i = 0; i < rowset.size(); ++i) {
            const bst_uint ridx = rowset[i];
            if (gpair[ridx].hess < 0.0f) continue;
-            if (random::SampleBinary(param.subsample) == 0) position[ridx] = -1;
+            if (random::SampleBinary(param.subsample) == 0) position[ridx] = ~position[ridx];
          }
        }
-      }    
+      }
      {
        // initialize feature index
        unsigned ncol = static_cast<unsigned>(fmat.NumCol());
@@ -219,7 +224,138 @@ class ColMaker: public IUpdater {
      }
      // use new nodes for qexpand
      qexpand = newnodes;
-    }
+    }    
+    // find the best split of feature fid, scanning col in one contiguous chunk per OpenMP thread
+    // this function does not support nested functions (it opens its own omp parallel regions)
+    inline void ParallelFindSplit(const ColBatch::Inst &col,
+                                  bst_uint fid,
+                                  const IFMatrix &fmat,
+                                  const std::vector<bst_gpair> &gpair,
+                                  const BoosterInfo &info) {
+      bool need_forward = param.need_forward_search(fmat.GetColDensity(fid));
+      bool need_backward = param.need_backward_search(fmat.GetColDensity(fid));
+      const std::vector<int> &qexpand = qexpand_;
+      int nthread;
+      #pragma omp parallel
+      {
+        const int tid = omp_get_thread_num();
+        std::vector<ThreadEntry> &temp = stemp[tid];
+        // clear this thread's statistics for every node being expanded
+        for (size_t j = 0; j < qexpand.size(); ++j) {
+          temp[qexpand[j]].stats.Clear();
+        }
+        nthread = omp_get_num_threads();
+        bst_uint step = (col.length + nthread - 1) / nthread;
+        bst_uint end = std::min(col.length, step * (tid + 1));
+        for (bst_uint i = tid * step; i < end; ++i) {
+          const bst_uint ridx = col[i].index;
+          const int nid = position[ridx];
+          if (nid < 0) continue;
+          const float fvalue = col[i].fvalue;
+          if (temp[nid].stats.Empty()) {
+            temp[nid].first_fvalue = fvalue;
+          }
+          temp[nid].stats.Add(gpair, info, ridx);
+          temp[nid].last_fvalue = fvalue;
+        }
+      }
+      // turn per-thread sums into exclusive prefix sums over threads, then try chunk-boundary splits
+      bst_omp_uint nnode = static_cast<bst_omp_uint>(qexpand.size());
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint j = 0; j < nnode; ++j) {
+        const int nid = qexpand[j];
+        TStats sum(param), tmp(param), c(param);
+        for (int tid = 0; tid < nthread; ++tid) {
+          tmp = stemp[tid][nid].stats;
+          stemp[tid][nid].stats = sum;
+          sum.Add(tmp);
+          if (tid != 0) {
+            std::swap(stemp[tid - 1][nid].last_fvalue, stemp[tid][nid].first_fvalue);
+          }
+        }
+        for (int tid = 0; tid < nthread; ++tid) {
+          stemp[tid][nid].stats_extra = sum;
+          ThreadEntry &e = stemp[tid][nid];
+          float fsplit;
+          if (tid != 0) {
+            if(fabsf(stemp[tid - 1][nid].last_fvalue - e.first_fvalue) > rt_2eps) {
+              fsplit = (stemp[tid - 1][nid].last_fvalue + e.first_fvalue) * 0.5f;  // midpoint
+            } else {
+              continue;
+            }
+          } else {
+            fsplit = e.first_fvalue - rt_eps;
+          }
+          if (need_forward && tid != 0) {
+            c.SetSubstract(snode[nid].stats, e.stats);
+            if (c.sum_hess >= param.min_child_weight && e.stats.sum_hess >= param.min_child_weight) {
+              bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+              e.best.Update(loss_chg, fid, fsplit, false);
+            }
+          }
+          if (need_backward) {
+            tmp.SetSubstract(sum, e.stats);
+            c.SetSubstract(snode[nid].stats, tmp);
+            if (c.sum_hess >= param.min_child_weight && tmp.sum_hess >= param.min_child_weight) {
+              bst_float loss_chg = static_cast<bst_float>(tmp.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+              e.best.Update(loss_chg, fid, fsplit, true);
+            }
+          }
+        }
+        if (need_backward) {
+          tmp = sum;
+          ThreadEntry &e = stemp[nthread-1][nid];
+          c.SetSubstract(snode[nid].stats, tmp);
+          if (c.sum_hess >= param.min_child_weight && tmp.sum_hess >= param.min_child_weight) {
+            bst_float loss_chg = static_cast<bst_float>(tmp.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+            e.best.Update(loss_chg, fid, e.last_fvalue + rt_eps, true);
+          }
+        }
+      }
+      // rescan each chunk, continuing from its prefix sum, to generate candidate splits
+      #pragma omp parallel
+      {
+        TStats c(param), cright(param);
+        const int tid = omp_get_thread_num();
+        std::vector<ThreadEntry> &temp = stemp[tid];
+        nthread = static_cast<bst_uint>(omp_get_num_threads());
+        bst_uint step = (col.length + nthread - 1) / nthread;
+        bst_uint end = std::min(col.length, step * (tid + 1));
+        for (bst_uint i = tid * step; i < end; ++i) {
+          const bst_uint ridx = col[i].index;
+          const int nid = position[ridx];
+          if (nid < 0) continue;
+          const float fvalue = col[i].fvalue;
+          // get the statistics of nid
+          ThreadEntry &e = temp[nid];
+          if (e.stats.Empty()) {
+            e.stats.Add(gpair, info, ridx);
+            e.first_fvalue = fvalue;
+          } else {
+            // forward default right
+            if (fabsf(fvalue - e.first_fvalue) > rt_2eps){
+              if (need_forward) { 
+                c.SetSubstract(snode[nid].stats, e.stats);
+                if (c.sum_hess >= param.min_child_weight && e.stats.sum_hess >= param.min_child_weight) {
+                  bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+                  e.best.Update(loss_chg, fid, (fvalue + e.first_fvalue) * 0.5f, false);
+                }
+              }
+              if (need_backward) {
+                cright.SetSubstract(e.stats_extra, e.stats);
+                c.SetSubstract(snode[nid].stats, cright);
+                if (c.sum_hess >= param.min_child_weight && cright.sum_hess >= param.min_child_weight) {
+                  bst_float loss_chg = static_cast<bst_float>(cright.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+                  e.best.Update(loss_chg, fid, (fvalue + e.first_fvalue) * 0.5f, true);
+                }
+              }
+            }
+            e.stats.Add(gpair, info, ridx);
+            e.first_fvalue = fvalue;
+          }
+        }
+      }
+    }
    // enumerate the split values of specific feature
    inline void EnumerateSplit(const ColBatch::Entry *begin,
                               const ColBatch::Entry *end,
@@ -273,6 +409,42 @@ class ColMaker: public IUpdater {
        }
      }
    }
+    // update the split candidates with one batch of columns (virtual, can be overridden)
+    virtual void UpdateSolution(const ColBatch &batch,
+                                const std::vector<bst_gpair> &gpair,
+                                const IFMatrix &fmat,
+                                const BoosterInfo &info) {
+      // start enumeration, nsize is the number of columns in this batch
+      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+      #if defined(_OPENMP)                                                                
+      const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
+      #endif
+      int poption = param.parallel_option;
+      if (poption == 2) {  // option 2: pick option 0 or 1 from the batch size
+        poption = nsize * 2 < nthread ? 1 : 0;
+      }
+      if (poption == 0) {  // parallel over columns, each thread scans whole columns
+        #pragma omp parallel for schedule(dynamic, batch_size)
+        for (bst_omp_uint i = 0; i < nsize; ++i) {
+          const bst_uint fid = batch.col_index[i];
+          const int tid = omp_get_thread_num();  // results go to the per-thread stemp[tid]
+          const ColBatch::Inst c = batch[i];
+          if (param.need_forward_search(fmat.GetColDensity(fid))) {
+            this->EnumerateSplit(c.data, c.data + c.length, +1, 
+                                 fid, gpair, info, stemp[tid]);
+          }
+          if (param.need_backward_search(fmat.GetColDensity(fid))) {
+            this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1, 
+                                 fid, gpair, info, stemp[tid]);
+          }
+        }
+      } else {  // columns are handled one at a time by ParallelFindSplit
+        for (bst_omp_uint i = 0; i < nsize; ++i) {
+          this->ParallelFindSplit(batch[i], batch.col_index[i],
+                                  fmat, gpair, info);
+        }
+      }      
+    }
    // find splits at current level, do split per level
    inline void FindSplit(int depth,
                          const std::vector<int> &qexpand,
@@ -289,66 +461,76 @@ class ColMaker: public IUpdater {
      }
      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(feat_set);
      while (iter->Next()) {
-        const ColBatch &batch = iter->Value();
-        // start enumeration
-        const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
-        #if defined(_OPENMP)                                                                
-        const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
-        #endif
-        #pragma omp parallel for schedule(dynamic, batch_size)
-        for (bst_omp_uint i = 0; i < nsize; ++i) {
-          const bst_uint fid = batch.col_index[i];
-          const int tid = omp_get_thread_num();
-          const ColBatch::Inst c = batch[i];
-          if (param.need_forward_search(p_fmat->GetColDensity(fid))) {            
-            this->EnumerateSplit(c.data, c.data + c.length, +1, 
-                                 fid, gpair, info, stemp[tid]);
+        this->UpdateSolution(iter->Value(), gpair, *p_fmat, info);
+      }
+      // after this each thread's stemp will get the best candidates, aggregate results
+      this->SyncBestSolution(qexpand);
+      // get the best result, we can synchronize the solution
+      for (size_t i = 0; i < qexpand.size(); ++i) {
+        const int nid = qexpand[i];
+        NodeEntry &e = snode[nid];        
+        // now we know the solution in snode[nid], set split
+        if (e.best.loss_chg > rt_eps) {
+          p_tree->AddChilds(nid);
+          (*p_tree)[nid].set_split(e.best.split_index(), e.best.split_value, e.best.default_left());
+          // mark right child as 0, to indicate fresh leaf
+          (*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0);
+          (*p_tree)[(*p_tree)[nid].cright()].set_leaf(0.0f, 0);
+        } else {
+          (*p_tree)[nid].set_leaf(e.weight * param.learning_rate);
+        }
+      } 
+    }
+    // reset position of each data point after split is created in the tree
+    inline void ResetPosition(const std::vector<int> &qexpand, IFMatrix *p_fmat, const RegTree &tree) {
+      // first set the positions of instances that take the non-default branch
+      this->SetNonDefaultPosition(qexpand, p_fmat, tree);      
+      // set rest of instances to default position
+      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
+      // instances still sitting in a split node are pushed to its default child;
+      // for leaf nodes that are not fresh, mark them to ~nid, 
+      // so that they are ignored in future statistics collection
+      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+      
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < ndata; ++i) {
+        const bst_uint ridx = rowset[i];
+        const int nid = this->DecodePosition(ridx);
+        if (tree[nid].is_leaf()) {
+          // mark finish when it is not a fresh leaf (fresh leaves are created with cright == 0)
+          if (tree[nid].cright() == -1) {
+            position[ridx] = ~nid;
          }
-          if (param.need_backward_search(p_fmat->GetColDensity(fid))) {
-            this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1, 
-                                 fid, gpair, info, stemp[tid]);
+        } else {
+          // push to default branch
+          if (tree[nid].default_left()) {
+            this->SetEncodePosition(ridx, tree[nid].cleft());
+          } else {
+            this->SetEncodePosition(ridx, tree[nid].cright());
          }
        }
      }
-      // after this each thread's stemp will get the best candidates, aggregate results
+    }
+    // customization part: the virtual functions below can be overridden by subclasses
+    // synchronize the best solution of each node: merge every thread's candidate into snode[nid].best
+    virtual void SyncBestSolution(const std::vector<int> &qexpand) {
      for (size_t i = 0; i < qexpand.size(); ++i) {
        const int nid = qexpand[i];
        NodeEntry &e = snode[nid];
        for (int tid = 0; tid < this->nthread; ++tid) {
          e.best.Update(stemp[tid][nid].best);
        }
-        // now we know the solution in snode[nid], set split
-        if (e.best.loss_chg > rt_eps) {
-          p_tree->AddChilds(nid);
-          (*p_tree)[nid].set_split(e.best.split_index(), e.best.split_value, e.best.default_left());
-        } else {
-          (*p_tree)[nid].set_leaf(e.weight * param.learning_rate);
-        }
      }
    }
-    // reset position of each data points after split is created in the tree
-    inline void ResetPosition(const std::vector<int> &qexpand, IFMatrix *p_fmat, const RegTree &tree) {
-      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
-      // step 1, set default direct nodes to default, and leaf nodes to -1
-      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < ndata; ++i) {
-        const bst_uint ridx = rowset[i];
-        const int nid = position[ridx];
-        if (nid >= 0) {
-          if (tree[nid].is_leaf()) {
-            position[ridx] = -1;
-          } else {
-            // push to default branch, correct latter
-            position[ridx] = tree[nid].default_left() ? tree[nid].cleft(): tree[nid].cright();
-          }
-        }
-      }
-      // step 2, classify the non-default data into right places
+    virtual void SetNonDefaultPosition(const std::vector<int> &qexpand,
+                                       IFMatrix *p_fmat, const RegTree &tree) {
+      // step 1, classify the non-default data into right places
      std::vector<unsigned> fsplits;
      for (size_t i = 0; i < qexpand.size(); ++i) {
        const int nid = qexpand[i];
-        if (!tree[nid].is_leaf()) fsplits.push_back(tree[nid].split_index());
+        if (!tree[nid].is_leaf()) {
+          fsplits.push_back(tree[nid].split_index());
+        }
      }
      std::sort(fsplits.begin(), fsplits.end());
      fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
@@ -364,21 +546,33 @@ class ColMaker: public IUpdater {
          for (bst_omp_uint j = 0; j < ndata; ++j) {
            const bst_uint ridx = col[j].index;
            const float fvalue = col[j].fvalue;
-            int nid = position[ridx];
-            if (nid == -1) continue;
+            const int nid = this->DecodePosition(ridx);
            // go back to parent, correct those who are not default
-            nid = tree[nid].parent();
-            if (tree[nid].split_index() == fid) {
-              if (fvalue < tree[nid].split_cond()) {
-                position[ridx] = tree[nid].cleft();
+            if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
+              if(fvalue < tree[nid].split_cond()) {
+                this->SetEncodePosition(ridx, tree[nid].cleft());
              } else {
-                position[ridx] = tree[nid].cright();
+                this->SetEncodePosition(ridx, tree[nid].cright());
              }
            }
          }
        }
      }
    }
+    // utils to get/set position, with encoded format: a negative entry stores ~nid
+    // return the decoded (non-negative) node id stored for row ridx
+    inline int DecodePosition(bst_uint ridx) const{
+      const int pid = position[ridx];
+      return pid < 0 ? ~pid : pid;  // undo the bitwise complement of negative entries
+    }
+    // store node id nid for ridx, keeping the encoding (plain or ~nid) of the current entry
+    inline void SetEncodePosition(bst_uint ridx, int nid) {
+      if (position[ridx] < 0) {  // entry is currently in complemented form, keep it that way
+        position[ridx] = ~nid;
+      } else {
+        position[ridx] = nid;
+      }
+    }
    //--data fields--
    const TrainParam &param;
    // number of omp thread used during training
--- a/src/tree/updater_distcol-inl.hpp
+++ b/src/tree/updater_distcol-inl.hpp
@@ -0,0 +1,169 @@
+#ifndef XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_
+#define XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_
+/*!
+ * \file updater_distcol-inl.hpp
+ * \brief beta distributed version that takes a sub-column 
+ *        and construct a tree
+ * \author Tianqi Chen
+ */
+#include <rabit.h>
+#include "../utils/bitmap.h"
+#include "../utils/io.h"
+#include "./updater_colmaker-inl.hpp"
+#include "./updater_prune-inl.hpp"
+
+namespace xgboost {
+namespace tree {
+template<typename TStats>
+class DistColMaker : public ColMaker<TStats> {
+ public:
+  DistColMaker(void) : builder(param) {}  // the builder is constructed from the param member
+  virtual ~DistColMaker(void) {}  // virtual destructor, nothing to release here
+  // set training parameter, the setting is forwarded to both param and the pruner
+  virtual void SetParam(const char *name, const char *val) {
+    param.SetParam(name, val);
+    pruner.SetParam(name, val);
+  }
+  virtual void Update(const std::vector<bst_gpair> &gpair,
+                      IFMatrix *p_fmat,
+                      const BoosterInfo &info,
+                      const std::vector<RegTree*> &trees) {    
+    TStats::CheckInfo(info);
+    utils::Check(trees.size() == 1, "DistColMaker: only support one tree at a time");
+    // build the tree (exactly one tree, see the check above)
+    builder.Update(gpair, p_fmat, info, trees[0]);
+    // prune the tree, note that pruner will sync the tree
+    pruner.Update(gpair, p_fmat, info, trees);
+    // update position after the tree is pruned: rows in deleted nodes move up to an ancestor
+    builder.UpdatePosition(p_fmat, *trees[0]);
+  }
+  virtual const int* GetLeafPosition(void) const {  // forwards to the builder
+    return builder.GetLeafPosition();
+  }  
+ private:
+  struct Builder : public ColMaker<TStats>::Builder {
+   public:
+    Builder(const TrainParam &param) 
+        : ColMaker<TStats>::Builder(param) {  // forward the parameter to the ColMaker builder
+    }
+    inline void UpdatePosition(IFMatrix *p_fmat, const RegTree &tree) {  // fix positions that point to deleted nodes
+      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
+      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < ndata; ++i) {
+        const bst_uint ridx = rowset[i];
+        int nid = this->DecodePosition(ridx);
+        while (tree[nid].is_deleted()) {  // walk up until a node that is not deleted
+          nid = tree[nid].parent();
+          utils::Assert(nid >=0, "distributed learning error");
+        }
+        this->position[ridx] = nid;  // stored as plain node id, without the ~nid encoding
+      }
+    }
+    virtual const int* GetLeafPosition(void) const {  // pointer to the position array, indexed by row
+      return BeginPtr(this->position);
+    }
+   protected:    
+    virtual void SetNonDefaultPosition(const std::vector<int> &qexpand,
+                                       IFMatrix *p_fmat, const RegTree &tree) {
+      // classify the non-default data into right places, collect the split features first
+      std::vector<unsigned> fsplits;
+      for (size_t i = 0; i < qexpand.size(); ++i) {
+        const int nid = qexpand[i];
+        if (!tree[nid].is_leaf()) {
+          fsplits.push_back(tree[nid].split_index());
+        }
+      }
+      // get the candidate split index, drop the ones that are >= p_fmat->NumCol()
+      std::sort(fsplits.begin(), fsplits.end());
+      fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
+      while (fsplits.size() != 0 && fsplits.back() >= p_fmat->NumCol()) {
+        fsplits.pop_back();
+      }
+      // bitmap is only word concurrent, set to bool first
+      {
+        bst_omp_uint ndata = static_cast<bst_omp_uint>(this->position.size());
+        boolmap.resize(ndata);
+        #pragma omp parallel for schedule(static)
+        for (bst_omp_uint j = 0; j < ndata; ++j) {
+            boolmap[j] = 0;
+        }        
+      }
+      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
+      while (iter->Next()) {
+        const ColBatch &batch = iter->Value();
+        for (size_t i = 0; i < batch.size; ++i) {
+          ColBatch::Inst col = batch[i];
+          const bst_uint fid = batch.col_index[i];
+          const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
+          #pragma omp parallel for schedule(static)
+          for (bst_omp_uint j = 0; j < ndata; ++j) {
+            const bst_uint ridx = col[j].index;
+            const float fvalue = col[j].fvalue;
+            const int nid = this->DecodePosition(ridx);
+            if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
+              if (fvalue < tree[nid].split_cond()) {  // row goes left, flag it when left is not the default
+                if (!tree[nid].default_left()) boolmap[ridx] = 1;
+              } else {  // row goes right, flag it when right is not the default
+                if (tree[nid].default_left()) boolmap[ridx] = 1;
+              }
+            }
+          }
+        }
+      }
+      
+      bitmap.InitFromBool(boolmap);
+      // communicate bitmap, bits are combined with bitwise OR
+      rabit::Allreduce<rabit::op::BitOR>(BeginPtr(bitmap.data), bitmap.data.size());
+      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
+      // get the new position
+      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < ndata; ++i) {
+        const bst_uint ridx = rowset[i];
+        const int nid = this->DecodePosition(ridx);
+        if (bitmap.Get(ridx)) {  // a set bit means: take the branch opposite to the default
+          utils::Assert(!tree[nid].is_leaf(), "inconsistent reduce information");
+          if (tree[nid].default_left()) {
+            this->SetEncodePosition(ridx, tree[nid].cright());
+          } else {
+            this->SetEncodePosition(ridx, tree[nid].cleft());
+          }
+        }
+      }
+    }
+    // synchronize the best solution of each node: merge thread results, then Allreduce them
+    virtual void SyncBestSolution(const std::vector<int> &qexpand) {
+      std::vector<SplitEntry> vec;  // vec[i] is the local best split of node qexpand[i]
+      for (size_t i = 0; i < qexpand.size(); ++i) {
+        const int nid = qexpand[i];
+        for (int tid = 0; tid < this->nthread; ++tid) {
+          this->snode[nid].best.Update(this->stemp[tid][nid].best);
+        }
+        vec.push_back(this->snode[nid].best);
+      }
+      // TODO, lazy version
+      // communicate best solution
+      reducer.Allreduce(BeginPtr(vec), vec.size());
+      // assign solution back, vec is in the same order as qexpand
+      for (size_t i = 0; i < qexpand.size(); ++i) {
+        const int nid = qexpand[i];
+        this->snode[nid].best = vec[i];
+      }
+    }
+    
+   private:
+    utils::BitMap bitmap;
+    std::vector<int> boolmap;
+    rabit::Reducer<SplitEntry> reducer;
+  };
+  // we directly introduce pruner here
+  TreePruner pruner;
+  // training parameter
+  TrainParam param;
+  // pointer to the builder
+  Builder builder; 
+};
+}  // namespace tree
+}  // namespace xgboost
+#endif
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -0,0 +1,701 @@
+#ifndef XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_
+#define XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_
+/*!
+ * \file updater_histmaker-inl.hpp
+ * \brief use histogram counting to construct a tree
+ * \author Tianqi Chen
+ */
+#include <vector>
+#include <algorithm>
+#include <rabit.h>
+#include "../utils/quantile.h"
+#include "../utils/group_data.h"
+#include "./updater_basemaker-inl.hpp"
+
+namespace xgboost {
+namespace tree {
+template<typename TStats>
+class HistMaker: public BaseMaker {
+ public:
+  virtual ~HistMaker(void) {}
+  virtual void Update(const std::vector<bst_gpair> &gpair,
+                      IFMatrix *p_fmat,
+                      const BoosterInfo &info,
+                      const std::vector<RegTree*> &trees) {
+    TStats::CheckInfo(info);
+    // rescale learning rate according to size of trees
+    float lr = param.learning_rate;
+    param.learning_rate = lr / trees.size();
+    // build tree, one call of the single-tree Update per entry of trees
+    for (size_t i = 0; i < trees.size(); ++i) {
+      this->Update(gpair, p_fmat, info, trees[i]);
+    }
+    param.learning_rate = lr;  // restore the original learning rate
+  }
+
+ protected:
+  /*! \brief a single histogram, a non-owning view of `size` cut points and statistics */
+  struct HistUnit {
+    /*! \brief cutting point of histogram, contains maximum point */
+    const bst_float *cut;
+    /*! \brief content of statistics data */    
+    TStats *data;
+    /*! \brief size of histogram */
+    unsigned size;
+    // default constructor, creates an empty unit so that no member is left uninitialized
+    HistUnit(void) : cut(0), data(0), size(0) {}
+    // constructor, the unit only keeps the pointers and does not own the memory
+    HistUnit(const bst_float *cut, TStats *data, unsigned size)
+        : cut(cut), data(data), size(size) {}
+    /*! \brief add the statistics of row ridx to the bin that feature value fv falls into */
+    inline void Add(bst_float fv, 
+                    const std::vector<bst_gpair> &gpair,
+                    const BoosterInfo &info,
+                    const bst_uint ridx) {
+      unsigned i = std::upper_bound(cut, cut + size, fv) - cut;  // first cut that is greater than fv
+      utils::Assert(size != 0, "try insert into size=0");
+      utils::Assert(i < size, 
+                    "maximum value must be in cut, fv = %g, cutmax=%g", fv, cut[size-1]);
+      data[i].Add(gpair, info, ridx);
+    }
+  };
+  /*! \brief a set of histograms from different index */
+  struct HistSet {
+    /*! \brief the index pointer of each histunit */
+    const unsigned *rptr;
+    /*! \brief cutting points in each histunit */
+    const bst_float *cut;
+    /*! \brief data in different hist unit */
+    std::vector<TStats> data;
+    /*! \brief get the fid-th unit, it covers entries rptr[fid] .. rptr[fid+1]-1 of cut and data */
+    inline HistUnit operator[](size_t fid) {
+      return HistUnit(cut + rptr[fid],
+                      &data[0] + rptr[fid],
+                      rptr[fid+1] - rptr[fid]);
+    }
+  };
+  // thread workspace: shared cut layout (rptr, cut) plus one HistSet per thread
+  struct ThreadWSpace {
+    /*! \brief actual unit pointer */
+    std::vector<unsigned> rptr;
+    /*! \brief cut field */
+    std::vector<bst_float> cut;
+    // per thread histset
+    std::vector<HistSet> hset;
+    // initialize the hist set, every HistSet points to the shared rptr and cut
+    inline void Init(const TrainParam &param, int nthread) {
+      hset.resize(nthread);
+      // cleanup statistics
+      for (int tid = 0; tid < nthread; ++tid) {
+        for (size_t i = 0; i < hset[tid].data.size(); ++i) {
+          hset[tid].data[i].Clear();
+        }
+        hset[tid].rptr = BeginPtr(rptr);
+        hset[tid].cut = BeginPtr(cut);
+        hset[tid].data.resize(cut.size(), TStats(param));        
+      }
+    }
+    // aggregate all statistics to hset[0]
+    inline void Aggregate(void) {
+      bst_omp_uint nsize = static_cast<bst_omp_uint>(cut.size());
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < nsize; ++i) {
+        for (size_t tid = 1; tid < hset.size(); ++tid) {  // hset[0] is the destination
+          hset[0].data[i].Add(hset[tid].data[i]);
+        }
+      }
+    }
+    /*! \brief clear the workspace, leaves cut empty and rptr = {0} */
+    inline void Clear(void) {
+      cut.clear(); rptr.resize(1); rptr[0] = 0;
+    }
+    /*! \brief total size, rptr.size() - 1 */
+    inline size_t Size(void) const {
+      return rptr.size() - 1;
+    }
+  };
+  // workspace of thread
+  ThreadWSpace wspace;
+  // reducer for histogram
+  rabit::Reducer<TStats> histred;
+  // set of working features
+  std::vector<bst_uint> fwork_set;
+  // update function implementation for a single tree, works level by level up to max_depth
+  virtual void Update(const std::vector<bst_gpair> &gpair,
+                      IFMatrix *p_fmat,
+                      const BoosterInfo &info,
+                      RegTree *p_tree) {
+    this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
+    this->InitWorkSet(p_fmat, *p_tree, &fwork_set);
+    for (int depth = 0; depth < param.max_depth; ++depth) {
+      // reset and propose candidate split
+      this->ResetPosAndPropose(gpair, p_fmat, info, fwork_set, *p_tree);
+      // create histogram
+      this->CreateHist(gpair, p_fmat, info, fwork_set, *p_tree);
+      // find split based on histogram statistics
+      this->FindSplit(depth, gpair, p_fmat, info, fwork_set, p_tree);
+      // reset position after split
+      this->ResetPositionAfterSplit(p_fmat, *p_tree);
+      this->UpdateQueueExpand(*p_tree);
+      // if nothing left to be expand, break
+      if (qexpand.size() == 0) break;
+    }
+    for (size_t i = 0; i < qexpand.size(); ++i) {  // nodes still queued after the loop become leaves
+      const int nid = qexpand[i];
+      (*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
+    }
+  }
+  // this function does two jobs (pure virtual, must be implemented by subclasses)
+  // (1) reset the position in array position, to be the latest leaf id
+  // (2) propose a set of candidate cuts and set wspace.rptr and wspace.cut correctly
+  virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
+                                  IFMatrix *p_fmat,
+                                  const BoosterInfo &info,
+                                  const std::vector <bst_uint> &fset,
+                                  const RegTree &tree) = 0;
+  // initialize the current working set of features in this round, default is 0..num_feature-1
+  virtual void InitWorkSet(IFMatrix *p_fmat,
+                           const RegTree &tree,
+                           std::vector<bst_uint> *p_fset) {
+    p_fset->resize(tree.param.num_feature);
+    for (size_t i = 0; i < p_fset->size(); ++i) {
+      (*p_fset)[i] = static_cast<unsigned>(i);
+    }
+  }
+  // reset position after split, this is not a must, depending on implementation
+  virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
+                                       const RegTree &tree) {
+  }  // the default implementation does nothing
+  virtual void CreateHist(const std::vector<bst_gpair> &gpair,
+                          IFMatrix *p_fmat,
+                          const BoosterInfo &info,
+                          const std::vector <bst_uint> &fset,
+                          const RegTree &tree)  = 0;  // pure virtual, implemented by subclasses
+ private:
+  inline void EnumerateSplit(const HistUnit &hist,  // histogram of one feature
+                             const TStats &node_sum,  // statistics of the whole node
+                             bst_uint fid,  // feature index recorded in the split
+                             SplitEntry *best,  // in/out: best split found so far
+                             TStats *left_sum) {  // out: left child statistics of the best split
+    if (hist.size == 0) return;
+
+    double root_gain = node_sum.CalcGain(param);
+    TStats s(param), c(param);
+    for (bst_uint i = 0; i < hist.size; ++i) {  // forward scan: s accumulates bins 0..i
+      s.Add(hist.data[i]);
+      if (s.sum_hess >= param.min_child_weight) {
+        c.SetSubstract(node_sum, s);
+        if (c.sum_hess >= param.min_child_weight) {
+          double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
+          if (best->Update(static_cast<float>(loss_chg), fid, hist.cut[i], false)) {
+            *left_sum = s;
+          }
+        }
+      }
+    }
+    s.Clear();
+    for (bst_uint i = hist.size - 1; i != 0; --i) {  // backward scan: s accumulates bins size-1..i
+      s.Add(hist.data[i]);
+      if (s.sum_hess >= param.min_child_weight) {
+        c.SetSubstract(node_sum, s);
+        if (c.sum_hess >= param.min_child_weight) {
+          double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
+          if (best->Update(static_cast<float>(loss_chg), fid, hist.cut[i-1], true)) {
+            *left_sum = c;
+          }
+        }
+      }
+    }
+  }
+  inline void FindSplit(int depth,
+                        const std::vector<bst_gpair> &gpair,
+                        IFMatrix *p_fmat,
+                        const BoosterInfo &info,
+                        const std::vector <bst_uint> &fset,
+                        RegTree *p_tree) {
+    const size_t num_feature = fset.size();  // each node owns num_feature + 1 histogram units
+    // get the best split condition for each node
+    std::vector<SplitEntry> sol(qexpand.size());
+    std::vector<TStats> left_sum(qexpand.size());    
+    bst_omp_uint nexpand = static_cast<bst_omp_uint>(qexpand.size());
+    #pragma omp parallel for schedule(dynamic, 1)
+    for (bst_omp_uint wid = 0; wid < nexpand; ++ wid) {
+      const int nid = qexpand[wid];
+      utils::Assert(node2workindex[nid] == static_cast<int>(wid),
+                    "node2workindex inconsistent");
+      SplitEntry &best = sol[wid];
+      TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0];
+      for (size_t i = 0; i < fset.size(); ++ i) {
+        EnumerateSplit(this->wspace.hset[0][i + wid * (num_feature+1)],
+                       node_sum, fset[i], &best, &left_sum[wid]);
+      }
+    }
+    // get the best result, we can synchronize the solution
+    for (bst_omp_uint wid = 0; wid < nexpand; ++ wid) {
+      const int nid = qexpand[wid];
+      const SplitEntry &best = sol[wid];
+      const TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0];
+      this->SetStats(p_tree, nid, node_sum);
+      // set up the values
+      p_tree->stat(nid).loss_chg = best.loss_chg;
+      // now we know the solution in sol[wid], set split
+      if (best.loss_chg > rt_eps) {
+        p_tree->AddChilds(nid);
+        (*p_tree)[nid].set_split(best.split_index(),
+                                 best.split_value, best.default_left());
+        // mark right child as 0, to indicate fresh leaf
+        (*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0);        
+        (*p_tree)[(*p_tree)[nid].cright()].set_leaf(0.0f, 0);
+        // right side sum, obtained as node_sum minus left_sum
+        TStats right_sum;
+        right_sum.SetSubstract(node_sum, left_sum[wid]);
+        this->SetStats(p_tree, (*p_tree)[nid].cleft(), left_sum[wid]);
+        this->SetStats(p_tree, (*p_tree)[nid].cright(), right_sum);
+      } else {
+        (*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
+      }
+    }
+  }
+  
+  inline void SetStats(RegTree *p_tree, int nid, const TStats &node_sum) {  // copy node_sum into the stat of nid
+    p_tree->stat(nid).base_weight = static_cast<float>(node_sum.CalcWeight(param));
+    p_tree->stat(nid).sum_hess = static_cast<float>(node_sum.sum_hess);
+    node_sum.SetLeafVec(param, p_tree->leafvec(nid));    
+  }
+};
+
+template<typename TStats>
+class CQHistMaker: public HistMaker<TStats> {
+ protected:
+  struct HistEntry {  // a histogram unit together with a scan cursor
+    typename HistMaker<TStats>::HistUnit hist;
+    unsigned istart;  // index of the bin where the linear scan resumes
+    /*! 
+     * \brief add a histogram to data,
+     * do linear scan, start from istart
+     */
+    inline void Add(bst_float fv,
+                    const std::vector<bst_gpair> &gpair,
+                    const BoosterInfo &info,
+                    const bst_uint ridx) {
+      while (istart < hist.size && !(fv < hist.cut[istart])) ++istart;  // advance to first cut > fv
+      utils::Assert(istart != hist.size, "the bound variable must be max");
+      hist.data[istart].Add(gpair, info, ridx);
+    }
+  };
+  // sketch type used for this
+  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
+  // initialize the work set of tree: features are sampled with colsample_bytree via feat_helper
+  virtual void InitWorkSet(IFMatrix *p_fmat,
+                           const RegTree &tree,
+                           std::vector<bst_uint> *p_fset) {
+    feat_helper.InitByCol(p_fmat, tree);
+    feat_helper.SampleCol(this->param.colsample_bytree, p_fset);
+  }
+  // code to create histogram: accumulate per-column statistics, then Allreduce hset[0]
+  virtual void CreateHist(const std::vector<bst_gpair> &gpair,
+                          IFMatrix *p_fmat,
+                          const BoosterInfo &info,
+                          const std::vector<bst_uint> &fset,
+                          const RegTree &tree) {
+    // fill in reverse map, feat2workindex[fid] is the index of fid in fset, or -1
+    feat2workindex.resize(tree.param.num_feature);
+    std::fill(feat2workindex.begin(), feat2workindex.end(), -1);
+    for (size_t i = 0; i < fset.size(); ++i) {
+      feat2workindex[fset[i]] = static_cast<int>(i);
+    } 
+    // start to work
+    this->wspace.Init(this->param, 1);
+    // if it is C++11, use lazy evaluation for Allreduce,
+    // to gain speedup in recovery; otherwise the block below is executed right here
+#if __cplusplus >= 201103L
+    auto lazy_get_hist = [&]()
+#endif
+    {
+      thread_hist.resize(this->get_nthread());
+      // start accumulating statistics
+      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fset);
+      iter->BeforeFirst();
+      while (iter->Next()) {
+        const ColBatch &batch = iter->Value();
+        // start enumeration
+        const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+        #pragma omp parallel for schedule(dynamic, 1)
+        for (bst_omp_uint i = 0; i < nsize; ++i) {
+          int offset = feat2workindex[batch.col_index[i]];
+          if (offset >= 0) {  // skip columns that are not in fset
+            this->UpdateHistCol(gpair, batch[i], info, tree,
+                                fset, offset,
+                                &thread_hist[omp_get_thread_num()]);
+          }
+        }
+      }
+      for (size_t i = 0; i < this->qexpand.size(); ++i) {  // extra unit of each node stores node_stats
+        const int nid = this->qexpand[i];
+        const int wid = this->node2workindex[nid];
+        this->wspace.hset[0][fset.size() + wid * (fset.size()+1)]
+            .data[0] = node_stats[nid];
+      }
+    };
+    // sync the histogram
+    // if it is C++11, use lazy evaluation for Allreduce
+#if __cplusplus >= 201103L
+    this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data), 
+                            this->wspace.hset[0].data.size(), lazy_get_hist);
+#else
+    this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data), this->wspace.hset[0].data.size());   
+#endif    
+  }
+  virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
+                                       const RegTree &tree) {
+    this->ResetPositionCol(this->qexpand, p_fmat, tree);  // delegate to ResetPositionCol for the queued nodes
+  }
+  virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
+                                  IFMatrix *p_fmat,
+                                  const BoosterInfo &info,
+                                  const std::vector<bst_uint> &fset,
+                                  const RegTree &tree) {
+    // fill in reverse map
+    feat2workindex.resize(tree.param.num_feature);
+    std::fill(feat2workindex.begin(), feat2workindex.end(), -1);
+    freal_set.clear();
+    for (size_t i = 0; i < fset.size(); ++i) {
+      if (feat_helper.Type(fset[i]) == 2) {
+        feat2workindex[fset[i]] = static_cast<int>(freal_set.size());
+        freal_set.push_back(fset[i]);
+      } else {
+        feat2workindex[fset[i]] = -2;  
+      }
+    }      
+    this->GetNodeStats(gpair, *p_fmat, tree, info,
+                       &thread_stats, &node_stats);       
+    sketchs.resize(this->qexpand.size() * freal_set.size());
+    for (size_t i = 0; i < sketchs.size(); ++i) {
+      sketchs[i].Init(info.num_row, this->param.sketch_eps);
+    }
+    // initialize the summary array
+    summary_array.resize(sketchs.size());
+    // setup maximum size
+    unsigned max_size = this->param.max_sketch_size();
+    for (size_t i = 0; i < sketchs.size(); ++i) {
+      summary_array[i].Reserve(max_size);
+    }
+    // if it is C++11, use lazy evaluation for Allreduce
+#if __cplusplus >= 201103L
+    auto lazy_get_summary = [&]()
+#endif
+    {// get smmary
+      thread_sketch.resize(this->get_nthread());
+      // number of rows in
+      const size_t nrows = p_fmat->buffered_rowset().size();
+      // start accumulating statistics
+      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(freal_set);
+      iter->BeforeFirst();
+      while (iter->Next()) {
+        const ColBatch &batch = iter->Value();
+        // start enumeration
+        const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+        #pragma omp parallel for schedule(dynamic, 1)
+        for (bst_omp_uint i = 0; i < nsize; ++i) {
+          int offset = feat2workindex[batch.col_index[i]];
+          if (offset >= 0) {
+            this->UpdateSketchCol(gpair, batch[i], tree,
+                                  node_stats,
+                                  freal_set, offset,
+                                  batch[i].length == nrows,
+                                  &thread_sketch[omp_get_thread_num()]);
+          }
+        }
+      }
+      for (size_t i = 0; i < sketchs.size(); ++i) {
+        utils::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
+        sketchs[i].GetSummary(&out);
+        summary_array[i].SetPrune(out, max_size);
+      }
+      utils::Assert(summary_array.size() == sketchs.size(), "shape mismatch");
+    };
+    if (summary_array.size() != 0) {
+      size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
+#if __cplusplus >= 201103L
+      sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size(), lazy_get_summary);
+#else
+      sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size());
+#endif
+    }
+    // now we get the final result of sketch, setup the cut
+    this->wspace.cut.clear();
+    this->wspace.rptr.clear();
+    this->wspace.rptr.push_back(0);
+    for (size_t wid = 0; wid < this->qexpand.size(); ++wid) {
+      for (size_t i = 0; i < fset.size(); ++i) {
+        int offset = feat2workindex[fset[i]];
+        if (offset >= 0) {
+          const WXQSketch::Summary &a = summary_array[wid * freal_set.size() + offset];
+          for (size_t i = 1; i < a.size; ++i) {
+            bst_float cpt = a.data[i].value - rt_eps;
+            if (i == 1 || cpt > this->wspace.cut.back()) {
+              this->wspace.cut.push_back(cpt);
+            }
+          }
+          // push a value that is greater than anything
+          if (a.size != 0) {
+            bst_float cpt = a.data[a.size - 1].value;
+            // this must be bigger than last value in a scale
+            bst_float last = cpt + fabs(cpt) + rt_eps;
+            this->wspace.cut.push_back(last);
+          }
+          this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
+        } else {
+          utils::Assert(offset == -2, "BUG in mark");
+          bst_float cpt = feat_helper.MaxValue(fset[i]);        
+          this->wspace.cut.push_back(cpt + fabs(cpt) + rt_eps);
+          this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));        
+        }
+      }
+      // reserve last value for global statistics
+      this->wspace.cut.push_back(0.0f);
+      this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
+    }
+    utils::Assert(this->wspace.rptr.size() ==
+                  (fset.size() + 1) * this->qexpand.size() + 1,
+                  "cut space inconsistent");
+  }
+  
+ private:
+  inline void UpdateHistCol(const std::vector<bst_gpair> &gpair,  // accumulates one feature column into the per-node histograms
+                            const ColBatch::Inst &c,
+                            const BoosterInfo &info,
+                            const RegTree &tree,
+                            const std::vector<bst_uint> &fset,
+                            bst_uint fid_offset,
+                            std::vector<HistEntry> *p_temp) {
+    if (c.length == 0) return;
+    // initialize hbuilder (caller-provided scratch space, indexed by node id) for use
+    std::vector<HistEntry> &hbuilder = *p_temp;
+    hbuilder.resize(tree.param.num_nodes);
+    for (size_t i = 0; i < this->qexpand.size(); ++i) {
+      const unsigned nid = this->qexpand[i];
+      const unsigned wid = this->node2workindex[nid];
+      hbuilder[nid].istart = 0;
+      hbuilder[nid].hist = this->wspace.hset[0][fid_offset + wid * (fset.size()+1)];  // each work node owns fset.size()+1 consecutive histogram units
+    }
+    for (bst_uint j = 0; j < c.length; ++j) {
+      const bst_uint ridx = c[j].index;
+      const int nid = this->position[ridx];
+      if (nid >= 0) {  // rows with a negative position are skipped
+        hbuilder[nid].Add(c[j].fvalue, gpair, info, ridx);
+      }
+    }
+  }
+  inline void UpdateSketchCol(const std::vector<bst_gpair> &gpair,  // feeds one feature column into the hessian-weighted sketch of each expanding node
+                              const ColBatch::Inst &c,
+                              const RegTree &tree,
+                              const std::vector<TStats> &nstats,
+                              const std::vector<bst_uint> &frealset,
+                              bst_uint offset,
+                              bool col_full,
+                              std::vector<BaseMaker::SketchEntry> *p_temp) {
+    if (c.length == 0) return;
+    // initialize sbuilder (caller-provided scratch space, indexed by node id) for use
+    std::vector<BaseMaker::SketchEntry> &sbuilder = *p_temp;
+    sbuilder.resize(tree.param.num_nodes);
+    for (size_t i = 0; i < this->qexpand.size(); ++i) {
+      const unsigned nid = this->qexpand[i];
+      const unsigned wid = this->node2workindex[nid];
+      sbuilder[nid].sum_total = 0.0f;
+      sbuilder[nid].sketch = &sketchs[wid * frealset.size() + offset];  // sketch slot of (work node, feature offset)
+    }
+
+    if (!col_full) {
+      // first pass, get sum of weight, TODO, optimization to skip first pass
+      for (bst_uint j = 0; j < c.length; ++j) {
+        const bst_uint ridx = c[j].index;
+        const int nid = this->position[ridx];
+        if (nid >= 0) {
+          sbuilder[nid].sum_total += gpair[ridx].hess;
+        }
+      }
+    } else {
+      for (size_t i = 0; i < this->qexpand.size(); ++i) {  // column marked full by the caller: reuse the node's total hessian
+        const unsigned nid = this->qexpand[i];        
+        sbuilder[nid].sum_total = static_cast<bst_float>(nstats[nid].sum_hess);
+      } 
+    }
+    // if only one value (first and last entries equal; presumably the column is sorted by fvalue - confirm), no need to do second pass
+    if (c[0].fvalue  == c[c.length-1].fvalue) {
+      for (size_t i = 0; i < this->qexpand.size(); ++i) {
+        const int nid = this->qexpand[i];
+        sbuilder[nid].sketch->Push(c[0].fvalue, sbuilder[nid].sum_total);
+      }
+      return;
+    }
+    // two pass scan
+    unsigned max_size = this->param.max_sketch_size();
+    for (size_t i = 0; i < this->qexpand.size(); ++i) {
+      const int nid = this->qexpand[i];
+      sbuilder[nid].Init(max_size);
+    }
+    // second pass, build the sketch from (feature value, hessian) of every row with a non-negative position
+    for (bst_uint j = 0; j < c.length; ++j) {
+      const bst_uint ridx = c[j].index;
+      const int nid = this->position[ridx];
+      if (nid >= 0) {
+        sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size);
+      }
+    }
+    for (size_t i = 0; i < this->qexpand.size(); ++i) {
+      const int nid = this->qexpand[i];
+      sbuilder[nid].Finalize(max_size);
+    }
+  }
+  // feature helper
+  BaseMaker::FMetaHelper feat_helper;
+  // temp space to map feature id to working index
+  std::vector<int> feat2workindex;
+  // set of index from fset that are real
+  std::vector<bst_uint> freal_set; 
+  // thread temp data
+  std::vector< std::vector<BaseMaker::SketchEntry> > thread_sketch;
+  // used to hold statistics
+  std::vector< std::vector<TStats> > thread_stats;
+  // used to hold start pointer
+  std::vector< std::vector<HistEntry> > thread_hist;
+  // node statistics
+  std::vector<TStats> node_stats;
+  // summary array
+  std::vector<WXQSketch::SummaryContainer> summary_array;
+  // reducer for summary
+  rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
+  // per node, per feature sketch
+  std::vector< utils::WXQuantileSketch<bst_float, bst_float> > sketchs;  
+};
+
+template<typename TStats>
+class QuantileHistMaker: public HistMaker<TStats> {  // HistMaker whose cut proposal scans row batches and sketches every feature
+ protected:
+  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
+  virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,  // updates row positions, then fills wspace.cut / wspace.rptr
+                                  IFMatrix *p_fmat,
+                                  const BoosterInfo &info,
+                                  const std::vector <bst_uint> &fset,
+                                  const RegTree &tree) {
+    // initialize the data structure: one sketch per (work node, feature)
+    int nthread = BaseMaker::get_nthread();
+    sketchs.resize(this->qexpand.size() * tree.param.num_feature);
+    for (size_t i = 0; i < sketchs.size(); ++i) {
+      sketchs[i].Init(info.num_row, this->param.sketch_eps);
+    }
+    // start accumulating statistics
+    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
+    iter->BeforeFirst();
+    while (iter->Next()) {
+      const RowBatch &batch = iter->Value();
+      // parallel convert to column major format
+      utils::ParallelGroupBuilder<SparseBatch::Entry> builder(&col_ptr, &col_data, &thread_col_ptr);
+      builder.InitBudget(tree.param.num_feature, nthread);
+
+      const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < nbatch; ++i) {
+        RowBatch::Inst inst = batch[i];
+        const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
+        int nid = this->position[ridx];
+        if (nid >= 0) {
+          if (!tree[nid].is_leaf()) {
+            this->position[ridx] = nid = HistMaker<TStats>::NextLevel(inst, tree, nid);
+          }
+          if (this->node2workindex[nid] < 0) {
+            this->position[ridx] = ~nid;  // node is not being expanded: store the complement so the row is skipped below
+          } else{
+            for (bst_uint j = 0; j < inst.length; ++j) {
+              builder.AddBudget(inst[j].index, omp_get_thread_num());
+            }
+          }
+        }
+      }
+      builder.InitStorage();
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < nbatch; ++i) {
+        RowBatch::Inst inst = batch[i];
+        const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
+        const int nid = this->position[ridx];
+        if (nid >= 0) {
+          for (bst_uint j = 0; j < inst.length; ++j) {
+            builder.Push(inst[j].index,
+                         SparseBatch::Entry(ridx, inst[j].fvalue),
+                         omp_get_thread_num());
+          }
+        }
+      }
+      // start putting things into sketch
+      const bst_omp_uint nfeat = col_ptr.size() - 1;
+      #pragma omp parallel for schedule(dynamic, 1)
+      for (bst_omp_uint k = 0; k < nfeat; ++k) {
+        for (size_t i = col_ptr[k]; i < col_ptr[k+1]; ++i) {
+          const SparseBatch::Entry &e = col_data[i];  // e.index is the row index, so gpair[e.index] is that row's gradient pair
+          const int wid = this->node2workindex[this->position[e.index]];
+          sketchs[wid * tree.param.num_feature + k].Push(e.fvalue, gpair[e.index].hess);
+        }
+      }
+    }
+    // setup maximum size
+    unsigned max_size = this->param.max_sketch_size();
+    // synchronize sketch: prune every local summary to max_size, then Allreduce
+    summary_array.resize(sketchs.size());
+    for (size_t i = 0; i < sketchs.size(); ++i) {
+      utils::WQuantileSketch<bst_float, bst_float>::SummaryContainer out;
+      sketchs[i].GetSummary(&out);
+      summary_array[i].Reserve(max_size);
+      summary_array[i].SetPrune(out, max_size);
+    }
+
+    size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
+    sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size());
+    // now we get the final result of sketch, setup the cut
+    this->wspace.cut.clear();
+    this->wspace.rptr.clear();
+    this->wspace.rptr.push_back(0);
+    for (size_t wid = 0; wid < this->qexpand.size(); ++wid) {
+      for (int fid = 0; fid < tree.param.num_feature; ++fid) {
+        const WXQSketch::Summary &a = summary_array[wid * tree.param.num_feature + fid];
+        for (size_t i = 1; i < a.size; ++i) {
+          bst_float cpt = a.data[i].value - rt_eps;
+          if (i == 1 || cpt > this->wspace.cut.back()) {
+            this->wspace.cut.push_back(cpt);
+          }
+        }
+        // push a value that is greater than anything
+        if (a.size != 0) {
+          bst_float cpt = a.data[a.size - 1].value;
+          // this must be bigger than last value in a scale
+          bst_float last = cpt + fabs(cpt) + rt_eps;
+          this->wspace.cut.push_back(last);
+        }
+        this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
+      }
+      // reserve last value for global statistics
+      this->wspace.cut.push_back(0.0f);
+      this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
+    }
+    utils::Assert(this->wspace.rptr.size() ==  // (num_feature + 1) segments per work node, plus the leading 0
+                  (tree.param.num_feature + 1) * this->qexpand.size() + 1,
+                  "cut space inconsistent");
+  }
+
+ private:
+  // summary array, one pruned summary per sketch
+  std::vector<WXQSketch::SummaryContainer> summary_array;
+  // reducer for summary
+  rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
+  // local temp column data structure: start offsets of each feature inside col_data
+  std::vector<size_t> col_ptr;
+  // local storage of column data; Entry.index holds the row index
+  std::vector<SparseBatch::Entry> col_data;
+  std::vector< std::vector<size_t> > thread_col_ptr;  // per-thread scratch handed to ParallelGroupBuilder
+  // per node, per feature sketch
+  std::vector< utils::WQuantileSketch<bst_float, bst_float> > sketchs;
+};
+
+}  // namespace tree
+}  // namespace xgboost
+#endif  // XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_
--- a/src/tree/updater_prune-inl.hpp
+++ b/src/tree/updater_prune-inl.hpp
@@ -8,6 +8,7 @@
 #include <vector>
 #include "./param.h"
 #include "./updater.h"
+#include "./updater_sync-inl.hpp"

 namespace xgboost {
 namespace tree {
@@ -19,6 +20,7 @@ class TreePruner: public IUpdater {
  virtual void SetParam(const char *name, const char *val) {
    using namespace std;
    param.SetParam(name, val);
+    syncher.SetParam(name, val);
    if (!strcmp(name, "silent")) silent = atoi(val);
  }
  // update the tree, do pruning
@@ -33,8 +35,8 @@ class TreePruner: public IUpdater {
      this->DoPrune(*trees[i]);
    }
    param.learning_rate = lr;
+    syncher.Update(gpair, p_fmat, info, trees);
  }
-
 private:
  // try to prune off current leaf
  inline int TryPruneLeaf(RegTree &tree, int nid, int depth, int npruned) {
@@ -70,6 +72,8 @@ class TreePruner: public IUpdater {
  }

 private:
+  // synchronizer
+  TreeSyncher syncher;
  // shutup
  int silent;
  // training parameter
--- a/src/tree/updater_refresh-inl.hpp
+++ b/src/tree/updater_refresh-inl.hpp
@@ -7,6 +7,7 @@
 */
 #include <vector>
 #include <limits>
+#include <rabit.h>
 #include "./param.h"
 #include "./updater.h"
 #include "../utils/omp.h"
@@ -26,7 +27,7 @@ class TreeRefresher: public IUpdater {
  virtual void Update(const std::vector<bst_gpair> &gpair,
                      IFMatrix *p_fmat,
                      const BoosterInfo &info,
-                      const std::vector<RegTree*> &trees) {    
+                      const std::vector<RegTree*> &trees) {        
    if (trees.size() == 0) return;
    // number of threads
    // thread temporal space
@@ -39,54 +40,71 @@ class TreeRefresher: public IUpdater {
      nthread = omp_get_num_threads();
    }
    fvec_temp.resize(nthread, RegTree::FVec());
-    stemp.resize(trees.size() * nthread, std::vector<TStats>());
+    stemp.resize(nthread, std::vector<TStats>());
    #pragma omp parallel
    {
      int tid = omp_get_thread_num();
+      int num_nodes = 0;
      for (size_t i = 0; i < trees.size(); ++i) {
-        std::vector<TStats> &vec = stemp[tid * trees.size() + i];
-        vec.resize(trees[i]->param.num_nodes, TStats(param));
-        std::fill(vec.begin(), vec.end(), TStats(param));
+        num_nodes += trees[i]->param.num_nodes;
      }
+      stemp[tid].resize(num_nodes, TStats(param));
+      std::fill(stemp[tid].begin(), stemp[tid].end(), TStats(param));
      fvec_temp[tid].Init(trees[0]->param.num_feature);
    }
-    // start accumulating statistics
-    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
-    iter->BeforeFirst();
-    while (iter->Next()) {
-      const RowBatch &batch = iter->Value();
-      utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
-                   "too large batch size ");
-      const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < nbatch; ++i) {
-        RowBatch::Inst inst = batch[i];
-        const int tid = omp_get_thread_num();
-        const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
-        RegTree::FVec &feats = fvec_temp[tid];
-        feats.Fill(inst);
-        for (size_t j = 0; j < trees.size(); ++j) {
-          AddStats(*trees[j], feats, gpair, info, ridx,
-                   &stemp[tid * trees.size() + j]);
+    // if it is C++11, use lazy evaluation for Allreduce,
+    // to gain speedup in recovery
+#if __cplusplus >= 201103L
+    auto lazy_get_stats = [&]()
+#endif
+    {
+      // start accumulating statistics
+      utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
+      iter->BeforeFirst();
+      while (iter->Next()) {
+        const RowBatch &batch = iter->Value();
+        utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
+                     "too large batch size ");
+        const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
+        #pragma omp parallel for schedule(static)
+        for (bst_omp_uint i = 0; i < nbatch; ++i) {
+          RowBatch::Inst inst = batch[i];
+          const int tid = omp_get_thread_num();
+          const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
+          RegTree::FVec &feats = fvec_temp[tid];
+          feats.Fill(inst);
+          int offset = 0;
+          for (size_t j = 0; j < trees.size(); ++j) {
+            AddStats(*trees[j], feats, gpair, info, ridx,
+                     BeginPtr(stemp[tid]) + offset);
+            offset += trees[j]->param.num_nodes;
+          }
+          feats.Drop(inst);
        }
-        feats.Drop(inst);
      }
-    }
-    // start update the trees using the statistics
+      // aggregate the statistics
+      int num_nodes = static_cast<int>(stemp[0].size());
+      #pragma omp parallel for schedule(static)
+      for (int nid = 0; nid < num_nodes; ++nid) {
+        for (int tid = 1; tid < nthread; ++tid) {
+          stemp[0][nid].Add(stemp[tid][nid]);
+        }
+      }
+    };
+#if __cplusplus >= 201103L
+    reducer.Allreduce(BeginPtr(stemp[0]), stemp[0].size(), lazy_get_stats);
+#else
+    reducer.Allreduce(BeginPtr(stemp[0]), stemp[0].size());
+#endif
    // rescale learning rate according to size of trees
    float lr = param.learning_rate;
    param.learning_rate = lr / trees.size();
-    for (size_t i = 0; i < trees.size(); ++i) {
-      // aggregate
-      #pragma omp parallel for schedule(static)
-      for (int nid = 0; nid < trees[i]->param.num_nodes; ++nid) {
-        for (int tid = 1; tid < nthread; ++tid) {
-          stemp[i][nid].Add(stemp[tid * trees.size() + i][nid]);
-        }
-      }
+    int offset = 0;
+    for (size_t i = 0; i < trees.size(); ++i) {      
      for (int rid = 0; rid < trees[i]->param.num_roots; ++rid) {
-        this->Refresh(stemp[i], rid, trees[i]);
+        this->Refresh(BeginPtr(stemp[0]) + offset, rid, trees[i]);
      }
+      offset += trees[i]->param.num_nodes;
    }
    // set learning rate back
    param.learning_rate = lr;
@@ -98,8 +116,7 @@ class TreeRefresher: public IUpdater {
                              const std::vector<bst_gpair> &gpair,
                              const BoosterInfo &info,
                              const bst_uint ridx,
-                              std::vector<TStats> *p_gstats) {
-    std::vector<TStats> &gstats = *p_gstats;
+                              TStats *gstats) {
    // start from groups that belongs to current data
    int pid = static_cast<int>(info.GetRoot(ridx));
    gstats[pid].Add(gpair, info, ridx);
@@ -110,7 +127,7 @@ class TreeRefresher: public IUpdater {
      gstats[pid].Add(gpair, info, ridx);
    }
  }
-  inline void Refresh(const std::vector<TStats> &gstats,
+  inline void Refresh(const TStats *gstats,
                      int nid, RegTree *p_tree) {
    RegTree &tree = *p_tree;
    tree.stat(nid).base_weight = static_cast<float>(gstats[nid].CalcWeight(param));
@@ -129,6 +146,8 @@ class TreeRefresher: public IUpdater {
  }
  // training parameter
  TrainParam param;
+  // reducer
+  rabit::Reducer<TStats> reducer;  
 };

 }  // namespace tree
--- a/src/tree/updater_skmaker-inl.hpp
+++ b/src/tree/updater_skmaker-inl.hpp
@@ -0,0 +1,393 @@
+#ifndef XGBOOST_TREE_UPDATER_SKMAKER_INL_HPP_
+#define XGBOOST_TREE_UPDATER_SKMAKER_INL_HPP_
+/*!
+ * \file updater_skmaker-inl.hpp
+ * \brief use approximation sketch to construct a tree,
+          a refresh is needed to make the statistics exactly correct
+ * \author Tianqi Chen
+ */
+#include <vector>
+#include <algorithm>
+#include <rabit.h>
+#include "../utils/quantile.h"
+#include "./updater_basemaker-inl.hpp"
+
+namespace xgboost {
+namespace tree {
+class SketchMaker: public BaseMaker {
+ public:
+  virtual ~SketchMaker(void) {}
+  virtual void Update(const std::vector<bst_gpair> &gpair,  // IUpdater-style entry point: grows every tree in `trees` in turn
+                      IFMatrix *p_fmat,
+                      const BoosterInfo &info,
+                      const std::vector<RegTree*> &trees) {
+    // rescale learning rate according to size of trees (divided by the number of trees)
+    float lr = param.learning_rate;
+    param.learning_rate = lr / trees.size();
+    // build each tree with the single-tree overload
+    for (size_t i = 0; i < trees.size(); ++i) {
+      this->Update(gpair, p_fmat, info, trees[i]);
+    }
+    param.learning_rate = lr;  // restore the caller-visible learning rate
+  }
+ 
+ protected:
+  inline void Update(const std::vector<bst_gpair> &gpair,  // grows a single tree level by level, up to param.max_depth levels
+                      IFMatrix *p_fmat,
+                      const BoosterInfo &info,
+                      RegTree *p_tree) {
+    this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
+    for (int depth = 0; depth < param.max_depth; ++depth) {
+      this->GetNodeStats(gpair, *p_fmat, *p_tree, info,
+                         &thread_stats, &node_stats);
+      this->BuildSketch(gpair, p_fmat, info, *p_tree);  // uses the local node_stats, before they are all-reduced
+      this->SyncNodeStats();
+      this->FindSplit(depth, gpair, p_fmat, info, p_tree);
+      this->ResetPositionCol(qexpand, p_fmat, *p_tree);
+      this->UpdateQueueExpand(*p_tree);
+      // if nothing left to be expand, break
+      if (qexpand.size() == 0) break;
+    }
+    if (qexpand.size() != 0) {  // max_depth reached with nodes still queued: refresh their statistics once more
+      this->GetNodeStats(gpair, *p_fmat, *p_tree, info,
+                         &thread_stats, &node_stats);
+      this->SyncNodeStats();
+    }
+    // set all statistics correctly; loss_chg of an internal node is recomputed from its children's gain
+    for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) {
+      this->SetStats(nid, node_stats[nid], p_tree);
+      if (!(*p_tree)[nid].is_leaf()) {
+        p_tree->stat(nid).loss_chg =
+            node_stats[(*p_tree)[nid].cleft()].CalcGain(param) +
+            node_stats[(*p_tree)[nid].cright()].CalcGain(param) -
+            node_stats[nid].CalcGain(param);
+      }
+    }
+    // turn the nodes still left in qexpand into leaves
+    for (size_t i = 0; i < qexpand.size(); ++i) {
+      const int nid = qexpand[i];
+      (*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
+    }
+  }
+  // define the sketch we want to use
+  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
+
+ private:
+  // statistics needed in the gradient calculation
+  struct SKStats {
+    /*! \brief sum of all positive (non-negative) gradient */
+    double pos_grad;
+    /*! \brief sum of the magnitudes of all negative gradient, kept as a non-negative number */
+    double neg_grad;
+    /*! \brief sum of hessian statistics */    
+    double sum_hess;
+    explicit SKStats(void) {}  // leaves all three members uninitialized
+    // constructor; param is unused, statistics start at zero
+    explicit SKStats(const TrainParam &param) {
+      this->Clear();
+    }
+    /*! \brief clear the statistics */
+    inline void Clear(void) {
+      neg_grad = pos_grad = sum_hess = 0.0f;
+    }
+    // accumulate statistics of row ridx; info is unused here
+    inline void Add(const std::vector<bst_gpair> &gpair,
+                    const BoosterInfo &info,
+                    bst_uint ridx) {
+      const bst_gpair &b = gpair[ridx];
+      if (b.grad >= 0.0f) {
+        pos_grad += b.grad;
+      } else {
+        neg_grad -= b.grad;  // subtracting a negative value: neg_grad grows by |grad|
+      }
+      sum_hess += b.hess;
+    }
+    /*! \brief calculate gain of the solution, using the net gradient pos_grad - neg_grad */
+    inline double CalcGain(const TrainParam &param) const {
+      return param.CalcGain(pos_grad - neg_grad, sum_hess);
+    }
+    /*! \brief set current value to a - b */
+    inline void SetSubstract(const SKStats &a, const SKStats &b) {
+      pos_grad = a.pos_grad - b.pos_grad;
+      neg_grad = a.neg_grad - b.neg_grad;
+      sum_hess = a.sum_hess - b.sum_hess;
+    }
+    // calculate leaf weight from the net gradient pos_grad - neg_grad
+    inline double CalcWeight(const TrainParam &param) const {
+      return param.CalcWeight(pos_grad - neg_grad, sum_hess);
+    }
+    /*! \brief add statistics to the data */
+    inline void Add(const SKStats &b) {
+      pos_grad += b.pos_grad;
+      neg_grad += b.neg_grad;
+      sum_hess += b.sum_hess;
+    }
+    /*! \brief same as add, reduce is used in All Reduce */
+    inline void Reduce(const SKStats &b) {
+      this->Add(b);
+    }
+    /*! \brief set leaf vector value based on statistics; intentionally a no-op for this statistics type */
+    inline void SetLeafVec(const TrainParam &param, bst_float *vec) const {
+    }
+  };
+  inline void BuildSketch(const std::vector<bst_gpair> &gpair,  // builds and all-reduces 3 sketches per (work node, feature)
+                          IFMatrix *p_fmat,
+                          const BoosterInfo &info,
+                          const RegTree &tree) {
+    sketchs.resize(this->qexpand.size() * tree.param.num_feature * 3);  // slot k: 0 = positive grad, 1 = negative grad, 2 = hessian
+    for (size_t i = 0; i < sketchs.size(); ++i) {
+      sketchs[i].Init(info.num_row, this->param.sketch_eps);
+    }
+    thread_sketch.resize(this->get_nthread());
+    // number of rows in the buffered row set, used to detect columns with no missing entry
+    const size_t nrows = p_fmat->buffered_rowset().size();
+    // start accumulating statistics
+    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
+    iter->BeforeFirst();
+    while (iter->Next()) {
+      const ColBatch &batch = iter->Value();
+      // start enumeration, one column per loop iteration
+      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+      #pragma omp parallel for schedule(dynamic, 1)
+      for (bst_omp_uint i = 0; i < nsize; ++i) {
+        this->UpdateSketchCol(gpair, batch[i], tree,
+                              node_stats,
+                              batch.col_index[i],
+                              batch[i].length == nrows,
+                              &thread_sketch[omp_get_thread_num()]);
+      }
+    }
+    // setup maximum size
+    unsigned max_size = param.max_sketch_size();
+    // synchronize sketch: copy every local summary into summary_array, then Allreduce
+    summary_array.Init(sketchs.size(), max_size);
+    for (size_t i = 0; i < sketchs.size(); ++i) {
+      utils::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
+      sketchs[i].GetSummary(&out);
+      summary_array.Set(i, out);
+    }
+    size_t nbytes = summary_array.MemSize();
+    sketch_reducer.Allreduce(&summary_array, nbytes);
+  }
+  // update sketch information in column fid
+  inline void UpdateSketchCol(const std::vector<bst_gpair> &gpair,
+                              const ColBatch::Inst &c,
+                              const RegTree &tree,
+                              const std::vector<SKStats> &nstats,
+                              bst_uint fid,
+                              bool col_full,
+                              std::vector<SketchEntry> *p_temp) {
+    if (c.length == 0) return;
+    // initialize sbuilder for use: 3 entries per node id (0 = positive grad, 1 = negative grad, 2 = hessian)
+    std::vector<SketchEntry> &sbuilder = *p_temp;
+    sbuilder.resize(tree.param.num_nodes * 3);
+    for (size_t i = 0; i < this->qexpand.size(); ++i) {
+      const unsigned nid = this->qexpand[i];
+      const unsigned wid = this->node2workindex[nid];
+      for (int k = 0; k < 3; ++k) {
+        sbuilder[3 * nid + k].sum_total = 0.0f;
+        sbuilder[3 * nid + k].sketch = &sketchs[(wid * tree.param.num_feature + fid) * 3 + k];       
+      }
+    }
+    if (!col_full) {  // first pass: total weights over the rows present in this column
+      for (bst_uint j = 0; j < c.length; ++j) {
+        const bst_uint ridx = c[j].index;
+        const int nid = this->position[ridx];
+        if (nid >= 0) {
+          const bst_gpair &e = gpair[ridx];
+          if (e.grad >= 0.0f) {
+            sbuilder[3 * nid + 0].sum_total += e.grad;
+          } else {
+            sbuilder[3 * nid + 1].sum_total -= e.grad;
+          }
+          sbuilder[3 * nid + 2].sum_total += e.hess;
+        }
+      }
+    } else {  // column marked full by the caller: reuse the node statistics as totals
+      for (size_t i = 0; i < this->qexpand.size(); ++i) {
+        const unsigned nid = this->qexpand[i];
+        sbuilder[3 * nid + 0].sum_total = nstats[nid].pos_grad;
+        sbuilder[3 * nid + 1].sum_total = nstats[nid].neg_grad;
+        sbuilder[3 * nid + 2].sum_total = nstats[nid].sum_hess;        
+      }
+    }
+    // if only one value (first and last entries equal; presumably the column is sorted by fvalue - confirm), no need to do second pass
+    if (c[0].fvalue  == c[c.length-1].fvalue) {
+      for (size_t i = 0; i < this->qexpand.size(); ++i) {
+        const int nid = this->qexpand[i];
+        for (int k = 0; k < 3; ++k) {
+          sbuilder[3 * nid + k].sketch->Push(c[0].fvalue, sbuilder[3 * nid + k].sum_total);
+        }
+      }
+      return;
+    }
+    // two pass scan
+    unsigned max_size = param.max_sketch_size();
+    for (size_t i = 0; i < this->qexpand.size(); ++i) {
+      const int nid = this->qexpand[i];
+      for (int k = 0; k < 3; ++k) {
+        sbuilder[3 * nid + k].Init(max_size);
+      }
+    }
+    // second pass, build the sketch; the gradient goes to slot 0 or 1 by sign, the hessian always to slot 2
+    for (bst_uint j = 0; j < c.length; ++j) {
+      const bst_uint ridx = c[j].index;
+      const int nid = this->position[ridx];
+      if (nid >= 0) {
+        const bst_gpair &e = gpair[ridx];
+        if (e.grad >= 0.0f) {
+          sbuilder[3 * nid + 0].Push(c[j].fvalue, e.grad, max_size);
+        } else {
+          sbuilder[3 * nid + 1].Push(c[j].fvalue, -e.grad, max_size);
+        }
+        sbuilder[3 * nid + 2].Push(c[j].fvalue, e.hess, max_size);
+      }
+    }
+    for (size_t i = 0; i < this->qexpand.size(); ++i) {
+      const int nid = this->qexpand[i];
+      for (int k = 0; k < 3; ++k) {
+        sbuilder[3 * nid + k].Finalize(max_size);
+      }
+    }
+  }  
+  inline void SyncNodeStats(void) {  // all-reduces node_stats of the nodes in qexpand across workers
+    utils::Assert(qexpand.size() != 0, "qexpand must not be empty");
+    std::vector<SKStats> tmp(qexpand.size());
+    for (size_t i = 0; i < qexpand.size(); ++i) {
+      tmp[i] = node_stats[qexpand[i]];  // pack the expanding nodes into a dense buffer
+    }
+    stats_reducer.Allreduce(BeginPtr(tmp), tmp.size());
+    for (size_t i = 0; i < qexpand.size(); ++i) {
+      node_stats[qexpand[i]] = tmp[i];  // write the reduced statistics back
+    }
+  }
+  inline void FindSplit(int depth,  // picks the best split per node in qexpand from summary_array and applies it to the tree
+                        const std::vector<bst_gpair> &gpair,
+                        IFMatrix *p_fmat,
+                        const BoosterInfo &info,
+                        RegTree *p_tree) {
+    const bst_uint num_feature = p_tree->param.num_feature;
+    // get the best split condition for each node
+    std::vector<SplitEntry> sol(qexpand.size());
+    bst_omp_uint nexpand = static_cast<bst_omp_uint>(qexpand.size());
+    #pragma omp parallel for schedule(dynamic, 1)
+    for (bst_omp_uint wid = 0; wid < nexpand; ++ wid) {
+      const int nid = qexpand[wid];
+      utils::Assert(node2workindex[nid] == static_cast<int>(wid),
+                    "node2workindex inconsistent");
+      SplitEntry &best = sol[wid];
+      for (bst_uint fid = 0; fid < num_feature; ++ fid) {
+        unsigned base = (wid * p_tree->param.num_feature + fid) * 3;  // 3 consecutive summaries per (work node, feature)
+        EnumerateSplit(summary_array[base + 0],
+                       summary_array[base + 1],
+                       summary_array[base + 2],
+                       node_stats[nid], fid, &best);
+      }
+    }
+    // get the best result, we can synchronize the solution
+    for (bst_omp_uint wid = 0; wid < nexpand; ++ wid) {
+      const int nid = qexpand[wid];
+      const SplitEntry &best = sol[wid];
+      // set up the values
+      p_tree->stat(nid).loss_chg = best.loss_chg;
+      this->SetStats(nid, node_stats[nid], p_tree);
+      // now we know the solution in sol[wid], set split
+      if (best.loss_chg > rt_eps) {
+        p_tree->AddChilds(nid);
+        (*p_tree)[nid].set_split(best.split_index(),
+                                 best.split_value, best.default_left());
+        // mark both children as leaves with value 0, to indicate fresh leaf
+        (*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0);
+        (*p_tree)[(*p_tree)[nid].cright()].set_leaf(0.0f, 0);
+      } else {
+        (*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);  // no worthwhile split: node becomes a leaf
+      }
+    }
+  }
+  // set statistics on ptree
+  inline void SetStats(int nid, const SKStats &node_sum, RegTree *p_tree) {  // copies weight, hessian sum and leaf vector of node_sum into node nid
+    p_tree->stat(nid).base_weight = node_sum.CalcWeight(param);
+    p_tree->stat(nid).sum_hess = static_cast<float>(node_sum.sum_hess);
+    node_sum.SetLeafVec(param, p_tree->leafvec(nid));
+  }
+  inline void EnumerateSplit(const WXQSketch::Summary &pos_grad,  // scores candidate splits of feature fid for one node; improvements go into *best
+                             const WXQSketch::Summary &neg_grad,
+                             const WXQSketch::Summary &sum_hess,
+                             const SKStats &node_sum,
+                             bst_uint fid,
+                             SplitEntry *best) {
+    if (sum_hess.size == 0) return;
+    double root_gain = node_sum.CalcGain(param);
+    std::vector<bst_float> fsplits;  // candidate thresholds: the distinct values found in the three summaries
+    for (size_t i = 0; i < pos_grad.size; ++i) {
+      fsplits.push_back(pos_grad.data[i].value);
+    }
+    for (size_t i = 0; i < neg_grad.size; ++i) {
+      fsplits.push_back(neg_grad.data[i].value);
+    }
+    for (size_t i = 0; i < sum_hess.size; ++i) {
+      fsplits.push_back(sum_hess.data[i].value);
+    }
+    std::sort(fsplits.begin(), fsplits.end());
+    fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
+    // sum feature: totals read from rmax of each summary's last entry; NOTE(review): only sum_hess.size is checked above - confirm pos_grad/neg_grad cannot be empty here
+    SKStats feat_sum;
+    feat_sum.pos_grad = pos_grad.data[pos_grad.size - 1].rmax;
+    feat_sum.neg_grad = neg_grad.data[neg_grad.size - 1].rmax;
+    feat_sum.sum_hess = sum_hess.data[sum_hess.size - 1].rmax;
+    size_t ipos = 0, ineg = 0, ihess = 0;
+    for (size_t i = 1; i < fsplits.size(); ++i) {      
+      WXQSketch::Entry pos = pos_grad.Query(fsplits[i], ipos);
+      WXQSketch::Entry neg = neg_grad.Query(fsplits[i], ineg);
+      WXQSketch::Entry hess = sum_hess.Query(fsplits[i], ihess);
+      SKStats s, c;
+      s.pos_grad = 0.5f * (pos.rmin + pos.rmax - pos.wmin);  // s: statistics estimated from the queried entries
+      s.neg_grad = 0.5f * (neg.rmin + neg.rmax - neg.wmin);
+      s.sum_hess = 0.5f * (hess.rmin + hess.rmax - hess.wmin);
+      c.SetSubstract(node_sum, s);      
+      // forward: s against the rest of the node (c = node_sum - s); last Update argument false (presumably the default-left flag - confirm)
+      if (s.sum_hess >= param.min_child_weight &&
+          c.sum_hess >= param.min_child_weight) {
+        double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;        
+        best->Update(loss_chg, fid, fsplits[i], false);
+      }
+      // backward: c = feat_sum - s, and s becomes everything else in the node (node_sum - c); last Update argument true
+      c.SetSubstract(feat_sum, s);
+      s.SetSubstract(node_sum, c);
+      if (s.sum_hess >= param.min_child_weight &&
+          c.sum_hess >= param.min_child_weight) {
+        double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;        
+        best->Update(loss_chg, fid, fsplits[i], true);
+      }      
+    }
+    {// all including: feat_sum on one side, the remainder of the node on the other
+      SKStats s = feat_sum, c;
+      c.SetSubstract(node_sum, s);
+      if (s.sum_hess >= param.min_child_weight &&
+          c.sum_hess >= param.min_child_weight) {
+        bst_float cpt = fsplits.back();
+        double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;        
+        best->Update(loss_chg, fid, cpt + fabsf(cpt) + 1.0f, true);  // threshold strictly above the largest candidate value
+      }
+    }
+  }
+   
+  // thread temp data
+  // used to hold temporal sketch
+  std::vector< std::vector<SketchEntry> > thread_sketch;
+  // used to hold statistics
+  std::vector< std::vector<SKStats> > thread_stats;
+  // node statistics
+  std::vector<SKStats> node_stats;
+  // summary array
+  WXQSketch::SummaryArray summary_array;
+  // reducer for summary
+  rabit::Reducer<SKStats> stats_reducer;
+  // reducer for summary
+  rabit::SerializeReducer<WXQSketch::SummaryArray> sketch_reducer;
+  // per node, per feature sketch
+  std::vector< utils::WXQuantileSketch<bst_float, bst_float> > sketchs;
+};
+}  // tree
+}  // xgboost
+#endif
--- a/src/tree/updater_sync-inl.hpp
+++ b/src/tree/updater_sync-inl.hpp
@@ -0,0 +1,53 @@
+#ifndef XGBOOST_TREE_UPDATER_SYNC_INL_HPP_
+#define XGBOOST_TREE_UPDATER_SYNC_INL_HPP_
+/*!
+ * \file updater_sync-inl.hpp
+ * \brief synchronize the tree in all distributed nodes
+ * \author Tianqi Chen
+ */
+#include <vector>
+#include <limits>
+#include <rabit.h>
+#include "./updater.h"
+
+namespace xgboost {
+namespace tree {
+/*! 
+ * \brief syncher that synchronize the tree in all distributed nodes
+ * can implement various strategies, so far it is always set to node 0's tree
+ */
+class TreeSyncher: public IUpdater {
+ public:
+  virtual ~TreeSyncher(void) {}
+  virtual void SetParam(const char *name, const char *val) {  // no parameters are consumed
+  }
+  // synchronize the trees across workers; gpair, p_fmat and info are ignored
+  virtual void Update(const std::vector<bst_gpair> &gpair,
+                      IFMatrix *p_fmat,
+                      const BoosterInfo &info,
+                      const std::vector<RegTree*> &trees) {
+    this->SyncTrees(trees);
+  }
+  
+ private:
+  // synchronize the trees in different nodes, take tree from rank 0
+  inline void SyncTrees(const std::vector<RegTree *> &trees) {
+    if (rabit::GetWorldSize() == 1) return;  // single worker: nothing to synchronize
+    std::string s_model;
+    utils::MemoryBufferStream fs(&s_model);
+    int rank = rabit::GetRank();
+    if (rank == 0) {  // only rank 0 serializes its trees into s_model
+      for (size_t i = 0; i < trees.size(); ++i) {
+        trees[i]->SaveModel(fs);
+      }
+    }
+    fs.Seek(0);  // rewind so the loads below read from the start of the buffer
+    rabit::Broadcast(&s_model, 0);
+    for (size_t i = 0; i < trees.size(); ++i) {      
+      trees[i]->LoadModel(fs);  // every rank, including rank 0, reloads the trees from the broadcast buffer
+    }
+  }
+};
+}  // namespace tree
+}  // namespace xgboost
+#endif  // XGBOOST_TREE_UPDATER_SYNC_INL_HPP_