[REFACTOR] cleanup structure

2015-11-24 14:25:56 -08:00
parent 5ed4dc4f60
commit d530e0c14f
60 changed files with 42 additions and 51 deletions
--- a/old_src/tree/model.h
+++ b/old_src/tree/model.h
@@ -0,0 +1,573 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file model.h
+ * \brief model structure for tree
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_TREE_MODEL_H_
+#define XGBOOST_TREE_MODEL_H_
+
+#include <string>
+#include <cstring>
+#include <sstream>
+#include <limits>
+#include <algorithm>
+#include <vector>
+#include <cmath>
+#include "../utils/io.h"
+#include "../utils/fmap.h"
+#include "../utils/utils.h"
+
+namespace xgboost {
+namespace tree {
+/*!
+ * \brief template class of TreeModel
+ * \tparam TSplitCond data type to indicate split condition
+ * \tparam TNodeStat auxiliary statistics of node to help tree building
+ */
+template<typename TSplitCond, typename TNodeStat>
+class TreeModel {
+ public:
+  /*! \brief auxiliary statistics of node to help tree building */
+  typedef TNodeStat  NodeStat;
+  /*! \brief data type to indicate split condition */
+  typedef TSplitCond SplitCond;
+  /*! \brief parameters of the tree; the whole struct is read and written as raw bytes by the model */
+  struct Param{
+    /*! \brief number of start root */
+    int num_roots;
+    /*! \brief total number of nodes */
+    int num_nodes;
+    /*!\brief number of deleted nodes */
+    int num_deleted;
+    /*! \brief maximum depth, this is a statistics of the tree */
+    int max_depth;
+    /*! \brief  number of features used for tree construction */
+    int num_feature;
+    /*!
+     * \brief leaf vector size, used for vector tree
+     * used to store more than one dimensional information in tree
+     */
+    int size_leaf_vector;
+    /*! \brief reserved part */
+    int reserved[31];
+    /*! \brief constructor: gives every field a defined value so no indeterminate bytes get serialized */
+    Param(void) {
+      num_roots = num_nodes = 1;
+      num_deleted = max_depth = num_feature = size_leaf_vector = 0;
+      std::memset(reserved, 0, sizeof(reserved));
+    }
+    /*!
+     * \brief set parameters from outside; names other than the three handled below are ignored
+     * \param name name of the parameter
+     * \param val  value of the parameter
+     */
+    inline void SetParam(const char *name, const char *val) {
+      using namespace std;
+      if (!strcmp("num_roots", name)) num_roots = atoi(val);
+      if (!strcmp("num_feature", name)) num_feature = atoi(val);
+      if (!strcmp("size_leaf_vector", name)) size_leaf_vector = atoi(val);
+    }
+  };
+  /*! \brief tree node: child/parent indices, packed split index and a leaf-value/split-condition union */
+  class Node {
+   public:
+    Node(void) : sindex_(0) {}
+    /*! \brief index of left child */
+    inline int cleft(void) const {
+      return this->cleft_;
+    }
+    /*! \brief index of right child */
+    inline int cright(void) const {
+      return this->cright_;
+    }
+    /*! \brief index of default child when feature is missing */
+    inline int cdefault(void) const {
+      return this->default_left() ? this->cleft() : this->cright();
+    }
+    /*! \brief feature index of split condition (low 31 bits of sindex_) */
+    inline unsigned split_index(void) const {
+      return sindex_ & ((1U << 31) - 1U);
+    }
+    /*! \brief when feature is unknown, whether goes to left child (highest bit of sindex_) */
+    inline bool default_left(void) const {
+      return (sindex_ >> 31) != 0;
+    }
+    /*! \brief whether current node is leaf node (cleft_ == -1) */
+    inline bool is_leaf(void) const {
+      return cleft_ == -1;
+    }
+    /*! \brief get leaf value of leaf node */
+    inline float leaf_value(void) const {
+      return (this->info_).leaf_value;
+    }
+    /*! \brief get split condition of the node */
+    inline TSplitCond split_cond(void) const {
+      return (this->info_).split_cond;
+    }
+    /*! \brief get parent of the node (low 31 bits of parent_) */
+    inline int parent(void) const {
+      return parent_ & ((1U << 31) - 1);
+    }
+    /*! \brief whether current node is left child (highest bit of parent_) */
+    inline bool is_left_child(void) const {
+      return (parent_ & (1U << 31)) != 0;
+    }
+    /*! \brief whether this node is deleted (sindex_ holds the maximum unsigned value) */
+    inline bool is_deleted(void) const {
+      return sindex_ == std::numeric_limits<unsigned>::max();
+    }
+    /*! \brief whether current node is root (parent_ == -1) */
+    inline bool is_root(void) const {
+      return parent_ == -1;
+    }
+    /*!
+     * \brief set the right child
+     * \param nid node id to right child
+     */
+    inline void set_right_child(int nid) {
+      this->cright_ = nid;
+    }
+    /*!
+     * \brief set split condition of current node
+     * \param split_index feature index to split
+     * \param split_cond  split condition
+     * \param default_left the default direction when feature is unknown (stored in the highest bit)
+     */
+    inline void set_split(unsigned split_index, TSplitCond split_cond,
+                          bool default_left = false) {
+      if (default_left) split_index |= (1U << 31);
+      this->sindex_ = split_index;
+      (this->info_).split_cond = split_cond;
+    }
+    /*!
+     * \brief set the leaf value of the node and mark it as a leaf (cleft_ = -1)
+     * \param value leaf value
+     * \param right right index, could be used to store
+     *        additional information
+     */
+    inline void set_leaf(float value, int right = -1) {
+      (this->info_).leaf_value = value;
+      this->cleft_ = -1;
+      this->cright_ = right;
+    }
+    /*! \brief mark that this node is deleted (sets sindex_ to the maximum unsigned value) */
+    inline void mark_delete(void) {
+      this->sindex_ = std::numeric_limits<unsigned>::max();
+    }
+
+   private:
+    friend class TreeModel<TSplitCond, TNodeStat>;
+    /*!
+     * \brief in leaf node, we have weights, in non-leaf nodes,
+     *        we have split condition
+     */
+    union Info{
+      float leaf_value;
+      TSplitCond split_cond;
+    };
+    // pointer to parent, highest bit is used to
+    // indicate whether it's a left child or not
+    int parent_;
+    // pointer to left, right
+    int cleft_, cright_;
+    // split feature index, left split or right split depends on the highest bit
+    unsigned sindex_;
+    // extra info
+    Info info_;
+    // set parent index; the highest bit records whether this node is the left child
+    inline void set_parent(int pidx, bool is_left_child = true) {
+      if (is_left_child) pidx |= (1U << 31);
+      this->parent_ = pidx;
+    }
+  };
+
+ protected:
+  // vector of nodes
+  std::vector<Node> nodes;
+  // free node space, used during training process
+  std::vector<int>  deleted_nodes;
+  // stats of nodes
+  std::vector<TNodeStat> stats;
+  // leaf vector, that is used to store additional information
+  std::vector<bst_float> leaf_vector;
+  // allocate a node id: recycle a slot from deleted_nodes when one exists, otherwise grow the arrays
+  // NOTE: nodes.resize below may reallocate, so Node references obtained earlier can dangle
+  inline int AllocNode(void) {
+    if (param.num_deleted != 0) {
+      int nd = deleted_nodes.back();
+      deleted_nodes.pop_back(); --param.num_deleted;
+      nodes[nd].sindex_ = 0;  // clear the deleted marker left by mark_delete()
+      return nd;
+    }
+    int nd = param.num_nodes++;
+    utils::Check(param.num_nodes < std::numeric_limits<int>::max(),
+                 "number of nodes in the tree exceed 2^31");
+    nodes.resize(param.num_nodes);
+    stats.resize(param.num_nodes);
+    leaf_vector.resize(param.num_nodes * param.size_leaf_vector);
+    return nd;
+  }
+  // delete a non-root node: queue nid in deleted_nodes and mark it; the parent field is kept for trace back
+  inline void DeleteNode(int nid) {
+    utils::Assert(nid >= param.num_roots, "can not delete root");
+    deleted_nodes.push_back(nid);
+    nodes[nid].mark_delete();
+    ++param.num_deleted;
+  }
+
+ public:
+  /*!
+   * \brief change a non leaf node to a leaf node, delete its children (both must already be leaves)
+   * \param rid node id of the node
+   * \param value new leaf value
+   */
+  inline void ChangeToLeaf(int rid, float value) {
+    utils::Assert(nodes[nodes[rid].cleft() ].is_leaf(),
+                  "can not delete a non termial child");
+    utils::Assert(nodes[nodes[rid].cright()].is_leaf(),
+                  "can not delete a non termial child");
+    this->DeleteNode(nodes[rid].cleft());
+    this->DeleteNode(nodes[rid].cright());
+    nodes[rid].set_leaf(value);
+  }
+  /*!
+   * \brief collapse a non leaf node to a leaf node, recursively deleting its subtree; no-op on a leaf
+   * \param rid node id of the node
+   * \param value new leaf value
+   */
+  inline void CollapseToLeaf(int rid, float value) {
+    if (nodes[rid].is_leaf()) return;
+    if (!nodes[nodes[rid].cleft() ].is_leaf()) {
+      CollapseToLeaf(nodes[rid].cleft(), 0.0f);
+    }
+    if (!nodes[nodes[rid].cright() ].is_leaf()) {
+      CollapseToLeaf(nodes[rid].cright(), 0.0f);
+    }
+    this->ChangeToLeaf(rid, value);
+  }
+
+ public:
+  /*! \brief model parameter */
+  Param param;
+  /*! \brief constructor: one root node, nothing deleted; stats and leaf_vector are not sized here */
+  TreeModel(void) {
+    param.num_nodes = 1;
+    param.num_roots = 1;
+    param.num_deleted = 0;
+    nodes.resize(1);
+  }
+  /*! \brief get mutable node given nid (index is not range-checked) */
+  inline Node &operator[](int nid) {
+    return nodes[nid];
+  }
+  /*! \brief get read-only node given nid (index is not range-checked) */
+  inline const Node &operator[](int nid) const {
+    return nodes[nid];
+  }
+  /*! \brief get mutable node statistics given nid (index is not range-checked) */
+  inline NodeStat &stat(int nid) {
+    return stats[nid];
+  }
+  /*! \brief get pointer to the first leaf-vector entry of node nid, NULL when leaf_vector is empty */
+  inline bst_float* leafvec(int nid) {
+    if (leaf_vector.size() == 0) return NULL;
+    return &leaf_vector[nid * param.size_leaf_vector];
+  }
+  /*! \brief const overload: pointer to the first leaf-vector entry of node nid, NULL when empty */
+  inline const bst_float* leafvec(int nid) const {
+    if (leaf_vector.size() == 0) return NULL;
+    return &leaf_vector[nid * param.size_leaf_vector];
+  }
+  /*! \brief initialize the model: param.num_roots nodes, each a leaf of value 0 marked as root */
+  inline void InitModel(void) {
+    param.num_nodes = param.num_roots;
+    nodes.resize(param.num_nodes);
+    stats.resize(param.num_nodes);
+    leaf_vector.resize(param.num_nodes * param.size_leaf_vector, 0.0f);
+    for (int i = 0; i < param.num_nodes; i ++) {
+      nodes[i].set_leaf(0.0f);
+      nodes[i].set_parent(-1);
+    }
+  }
+  /*!
+   * \brief load model from stream; param.num_nodes is validated before it sizes any buffer
+   * \param fi input stream
+   */
+  inline void LoadModel(utils::IStream &fi) { // NOLINT(*)
+    utils::Check(fi.Read(&param, sizeof(Param)) > 0,
+                 "TreeModel: wrong format");
+    utils::Assert(param.num_nodes > 0, "invalid model");
+    nodes.resize(param.num_nodes); stats.resize(param.num_nodes);
+    utils::Check(fi.Read(BeginPtr(nodes), sizeof(Node) * nodes.size()) > 0,
+                 "TreeModel: wrong format");
+    utils::Check(fi.Read(BeginPtr(stats), sizeof(NodeStat) * stats.size()) > 0,
+                 "TreeModel: wrong format");
+    if (param.size_leaf_vector != 0) {
+      utils::Check(fi.Read(&leaf_vector), "TreeModel: wrong format");
+    }
+    // rebuild the free list from the non-root nodes that are marked deleted
+    deleted_nodes.resize(0);
+    for (int i = param.num_roots; i < param.num_nodes; ++i) {
+      if (nodes[i].is_deleted()) deleted_nodes.push_back(i);
+    }
+    utils::Assert(static_cast<int>(deleted_nodes.size()) == param.num_deleted,
+                  "number of deleted nodes do not match, num_deleted=%d, dnsize=%lu, num_nodes=%d",
+                  param.num_deleted, deleted_nodes.size(), param.num_nodes);
+  }
+  /*!
+   * \brief save model to stream: param, then nodes, then stats, then leaf_vector when it is in use
+   * \param fo output stream
+   */
+  inline void SaveModel(utils::IStream &fo) const { // NOLINT(*)
+    utils::Assert(param.num_nodes == static_cast<int>(nodes.size()),
+                  "TreeModel::SaveModel");
+    utils::Assert(param.num_nodes == static_cast<int>(stats.size()),
+                  "TreeModel::SaveModel");
+    utils::Assert(param.num_nodes != 0, "invalid model");
+    fo.Write(&param, sizeof(Param));
+    fo.Write(BeginPtr(nodes), sizeof(Node) * nodes.size());
+    fo.Write(BeginPtr(stats), sizeof(NodeStat) * stats.size());
+    if (param.size_leaf_vector != 0) fo.Write(leaf_vector);
+  }
+  /*!
+   * \brief allocate two nodes and attach them as the left and right children of a node
+   * \param nid node id to add childs
+   */
+  inline void AddChilds(int nid) {
+    int pleft  = this->AllocNode();
+    int pright = this->AllocNode();
+    nodes[nid].cleft_  = pleft;
+    nodes[nid].cright_ = pright;
+    nodes[nodes[nid].cleft() ].set_parent(nid, true);
+    nodes[nodes[nid].cright()].set_parent(nid, false);
+  }
+  /*!
+   * \brief only add a right child to a leaf node; the left link is left untouched
+   * \param nid node id to add right child
+   */
+  inline void AddRightChild(int nid) {
+    int pright = this->AllocNode();
+    nodes[nid].cright_ = pright;
+    nodes[nodes[nid].cright()].set_parent(nid, false);
+  }
+  /*!
+   * \brief get current depth: number of counted parent links between nid and its root
+   * \param nid node id
+   * \param pass_rchild when true, links from a right child are skipped and only left-child links count
+   */
+  inline int GetDepth(int nid, bool pass_rchild = false) const {
+    int depth = 0;
+    while (!nodes[nid].is_root()) {
+      if (!pass_rchild || nodes[nid].is_left_child()) ++depth;
+      nid = nodes[nid].parent();
+    }
+    return depth;
+  }
+  /*!
+   * \brief get maximum depth of the subtree rooted at nid (0 when nid is a leaf), computed recursively
+   * \param nid node id
+   */
+  inline int MaxDepth(int nid) const {
+    if (nodes[nid].is_leaf()) return 0;
+    return std::max(MaxDepth(nodes[nid].cleft())+1,
+                     MaxDepth(nodes[nid].cright())+1);
+  }
+  /*!
+   * \brief get maximum depth over all param.num_roots roots
+   */
+  inline int MaxDepth(void) {
+    int maxd = 0;
+    for (int i = 0; i < param.num_roots; ++i) {
+      maxd = std::max(maxd, MaxDepth(i));
+    }
+    return maxd;
+  }
+  /*! \brief number of extra nodes besides the root: num_nodes - num_roots - num_deleted */
+  inline int num_extra_nodes(void) const {
+    return param.num_nodes - param.num_roots - param.num_deleted;
+  }
+  /*!
+   * \brief dump model to text string, one root after another
+   * \param fmap feature map of feature types
+   * \param with_stats whether dump out statistics as well
+   * \return the string of dumped model
+   */
+  inline std::string DumpModel(const utils::FeatMap& fmap, bool with_stats) {
+    std::stringstream fo("");
+    for (int i = 0; i < param.num_roots; ++i) {
+      this->Dump(i, fo, fmap, 0, with_stats);
+    }
+    return fo.str();
+  }
+
+ private:
+  void Dump(int nid, std::stringstream &fo, // NOLINT(*)
+            const utils::FeatMap& fmap, int depth, bool with_stats) {
+    for (int i = 0;  i < depth; ++i) {
+      fo << '\t';
+    }
+    if (nodes[nid].is_leaf()) {
+      fo << nid << ":leaf=" << nodes[nid].leaf_value();
+      if (with_stats) {
+        stat(nid).Print(fo, true);
+      }
+      fo << '\n';
+    } else {
+      // print this split first; the left subtree and then the right subtree are dumped below
+      TSplitCond cond = nodes[nid].split_cond();
+      const unsigned split_index = nodes[nid].split_index();
+      if (split_index < fmap.size()) {
+        switch (fmap.type(split_index)) {
+          case utils::FeatMap::kIndicator: {
+            int nyes = nodes[nid].default_left() ?
+                nodes[nid].cright() : nodes[nid].cleft();
+            fo << nid << ":[" << fmap.name(split_index) << "] yes=" << nyes
+               << ",no=" << nodes[nid].cdefault();
+            break;
+          }
+          case utils::FeatMap::kInteger: {
+            fo << nid << ":[" << fmap.name(split_index) << "<"
+               << int(float(cond)+1.0f)
+               << "] yes=" << nodes[nid].cleft()
+               << ",no=" << nodes[nid].cright()
+               << ",missing=" << nodes[nid].cdefault();
+            break;
+          }
+          case utils::FeatMap::kFloat:
+          case utils::FeatMap::kQuantitive: {
+            fo << nid << ":[" << fmap.name(split_index) << "<"<< float(cond)
+               << "] yes=" << nodes[nid].cleft()
+               << ",no=" << nodes[nid].cright()
+               << ",missing=" << nodes[nid].cdefault();
+            break;
+          }
+          default: utils::Error("unknown fmap type");
+        }
+      } else {
+        fo << nid << ":[f" << split_index << "<"<< float(cond)
+           << "] yes=" << nodes[nid].cleft()
+           << ",no=" << nodes[nid].cright()
+           << ",missing=" << nodes[nid].cdefault();
+      }
+      if (with_stats) {
+        stat(nid).Print(fo, false);
+      }
+      fo << '\n';
+      this->Dump(nodes[nid].cleft(), fo, fmap, depth+1, with_stats);
+      this->Dump(nodes[nid].cright(), fo, fmap, depth+1, with_stats);
+    }
+  }
+};
+
+/*! \brief node statistics used in regression tree */
+struct RTreeNodeStat {
+  /*! \brief loss change caused by current split */
+  float loss_chg;
+  /*! \brief sum of hessian values, used to measure coverage of data */
+  float sum_hess;
+  /*! \brief weight of current node */
+  float base_weight;
+  /*! \brief number of child that is leaf node known up to now */
+  int   leaf_child_cnt;
+  /*! \brief print stats to fo: ",gain=..,cover=.." for a non-leaf node, ",cover=.." only for a leaf */
+  inline void Print(std::stringstream &fo, bool is_leaf) const { // NOLINT(*)
+    if (!is_leaf) {
+      fo << ",gain=" << loss_chg << ",cover=" << sum_hess;
+    } else {
+      fo << ",cover=" << sum_hess;
+    }
+  }
+};
+
+/*! \brief define regression tree to be the most common tree model */
+class RegTree: public TreeModel<bst_float, RTreeNodeStat>{
+ public:
+  /*!
+   * \brief dense feature vector that can be taken by RegTree
+   * to do traverse efficiently
+   * and can be construct from sparse feature vector
+   */
+  struct FVec {
+    /*!
+     * \brief a union value of value and flag
+     * when flag == -1, this indicate the value is missing
+     */
+    union Entry{
+      float fvalue;
+      int flag;
+    };
+    std::vector<Entry> data;
+    /*! \brief resize the vector to the given size and mark every entry as missing (flag = -1) */
+    inline void Init(size_t size) {
+      Entry e; e.flag = -1;
+      data.resize(size);
+      std::fill(data.begin(), data.end(), e);
+    }
+    /*! \brief fill the vector with sparse vector; entries whose index is out of range are skipped */
+    inline void Fill(const RowBatch::Inst &inst) {
+      for (bst_uint i = 0; i < inst.length; ++i) {
+        if (inst[i].index >= data.size()) continue;
+        data[inst[i].index].fvalue = inst[i].fvalue;
+      }
+    }
+    /*! \brief drop the trace after fill, must be called after fill; resets the touched entries to missing */
+    inline void Drop(const RowBatch::Inst &inst) {
+      for (bst_uint i = 0; i < inst.length; ++i) {
+        if (inst[i].index >= data.size()) continue;
+        data[inst[i].index].flag = -1;
+      }
+    }
+    /*! \brief get ith value */
+    inline float fvalue(size_t i) const {
+      return data[i].fvalue;
+    }
+    /*! \brief check whether i-th entry is missing */
+    inline bool is_missing(size_t i) const {
+      return data[i].flag == -1;
+    }
+  };
+  /*!
+   * \brief get the leaf index
+   * \param feat dense feature vector, a missing feature has its entry flag set to -1 (see FVec)
+   * \param root_id starting root index of the instance
+   * \return the index of the leaf node reached by the given feature vector
+   */
+  inline int GetLeafIndex(const FVec &feat, unsigned root_id = 0) const {
+    // start from groups that belongs to current data
+    int pid = static_cast<int>(root_id);
+    // traverse tree
+    while (!(*this)[ pid ].is_leaf()) {
+      unsigned split_index = (*this)[pid].split_index();
+      pid = this->GetNext(pid, feat.fvalue(split_index), feat.is_missing(split_index));
+    }
+    return pid;
+  }
+  /*!
+   * \brief get the prediction of regression tree, only accepts dense feature vector
+   * \param feat dense feature vector, a missing feature has its entry flag set to -1 (see FVec)
+   * \param root_id starting root index of the instance
+   * \return the leaf value of the leaf node reached by the given feature vector
+   */
+  inline float Predict(const FVec &feat, unsigned root_id = 0) const {
+    int pid = this->GetLeafIndex(feat, root_id);
+    return (*this)[pid].leaf_value();
+  }
+  /*! \brief get next position of the tree given current pid: default child if unknown, else left when fvalue < split */
+  inline int GetNext(int pid, float fvalue, bool is_unknown) const {
+    float split_value = (*this)[pid].split_cond();
+    if (is_unknown) {
+      return (*this)[pid].cdefault();
+    } else {
+      if (fvalue < split_value) {
+        return (*this)[pid].cleft();
+      } else {
+        return (*this)[pid].cright();
+      }
+    }
+  }
+};
+
+}  // namespace tree
+}  // namespace xgboost
+#endif  // XGBOOST_TREE_MODEL_H_
--- a/old_src/tree/param.h
+++ b/old_src/tree/param.h
@@ -0,0 +1,429 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file param.h
+ * \brief training parameters, statistics used to support tree construction
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_TREE_PARAM_H_
+#define XGBOOST_TREE_PARAM_H_
+
+#include <vector>
+#include <cstring>
+#include "../data.h"
+
+namespace xgboost {
+namespace tree {
+
+/*! \brief training parameters for regression tree */
+struct TrainParam{
+  // learning step size for a time (set by "eta" or "learning_rate")
+  float learning_rate;
+  // minimum loss change required for a split (set by "gamma" or "min_split_loss")
+  float min_split_loss;
+  // maximum depth of a tree
+  int max_depth;
+  //----- the rest parameters are less important ----
+  // minimum amount of hessian(weight) allowed in a child
+  float min_child_weight;
+  // L2 regularization factor (set by "lambda" or "reg_lambda")
+  float reg_lambda;
+  // L1 regularization factor (set by "alpha" or "reg_alpha")
+  float reg_alpha;
+  // default direction choice: 0 = "learn", 1 = "left", 2 = "right"
+  int default_direction;
+  // maximum delta update we can add in weight estimation
+  // this parameter can be used to stabilize update
+  // default=0 means no constraint on weight delta
+  float max_delta_step;
+  // whether we want to do subsample
+  float subsample;
+  // whether to subsample columns each split, in each level
+  float colsample_bylevel;
+  // whether to subsample columns during tree construction
+  float colsample_bytree;
+  // speed optimization for dense column
+  float opt_dense_col;
+  // accuracy of sketch
+  float sketch_eps;
+  // ratio divided by sketch_eps to obtain the maximum sketch size (see max_sketch_size)
+  float sketch_ratio;
+  // leaf vector size
+  int size_leaf_vector;
+  // option for parallelization
+  int parallel_option;
+  // option to open cacheline optimization
+  int cache_opt;
+  // number of threads to be used for tree construction,
+  // if OpenMP is enabled, if equals 0, use system default
+  int nthread;
+  /*! \brief constructor: assigns the default value of every parameter */
+  TrainParam(void) {
+    learning_rate = 0.3f;
+    min_split_loss = 0.0f;
+    min_child_weight = 1.0f;
+    max_delta_step = 0.0f;
+    max_depth = 6;
+    reg_lambda = 1.0f;
+    reg_alpha = 0.0f;
+    default_direction = 0;
+    subsample = 1.0f;
+    colsample_bytree = 1.0f;
+    colsample_bylevel = 1.0f;
+    opt_dense_col = 1.0f;
+    nthread = 0;
+    size_leaf_vector = 0;
+    // enforce parallel option to 0 for now, investigate the other strategy
+    parallel_option = 0;
+    sketch_eps = 0.1f;
+    sketch_ratio = 2.0f;
+    cache_opt = 1;
+  }
+  /*!
+   * \brief set parameters from outside; unrecognized names are ignored
+   * \param name name of the parameter
+   * \param val  value of the parameter
+   */
+  inline void SetParam(const char *name, const char *val) {
+    using namespace std;
+    // sync-names
+    if (!strcmp(name, "gamma")) min_split_loss = static_cast<float>(atof(val));
+    if (!strcmp(name, "eta")) learning_rate = static_cast<float>(atof(val));
+    if (!strcmp(name, "lambda")) reg_lambda = static_cast<float>(atof(val));
+    if (!strcmp(name, "alpha")) reg_alpha = static_cast<float>(atof(val));
+    if (!strcmp(name, "learning_rate")) learning_rate = static_cast<float>(atof(val));
+    if (!strcmp(name, "min_child_weight")) min_child_weight = static_cast<float>(atof(val));
+    if (!strcmp(name, "min_split_loss")) min_split_loss = static_cast<float>(atof(val));
+    if (!strcmp(name, "max_delta_step")) max_delta_step = static_cast<float>(atof(val));
+    if (!strcmp(name, "reg_lambda")) reg_lambda = static_cast<float>(atof(val));
+    if (!strcmp(name, "reg_alpha")) reg_alpha = static_cast<float>(atof(val));
+    if (!strcmp(name, "subsample")) subsample = static_cast<float>(atof(val));
+    if (!strcmp(name, "colsample_bylevel")) colsample_bylevel = static_cast<float>(atof(val));
+    if (!strcmp(name, "colsample_bytree")) colsample_bytree  = static_cast<float>(atof(val));
+    if (!strcmp(name, "sketch_eps")) sketch_eps  = static_cast<float>(atof(val));
+    if (!strcmp(name, "sketch_ratio")) sketch_ratio  = static_cast<float>(atof(val));
+    if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val));
+    if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val);
+    if (!strcmp(name, "cache_opt")) cache_opt = atoi(val);
+    if (!strcmp(name, "max_depth")) max_depth = atoi(val);
+    if (!strcmp(name, "nthread")) nthread = atoi(val);
+    if (!strcmp(name, "parallel_option")) parallel_option = atoi(val);
+    if (!strcmp(name, "default_direction")) {
+      if (!strcmp(val, "learn")) default_direction = 0;
+      if (!strcmp(val, "left")) default_direction = 1;
+      if (!strcmp(val, "right")) default_direction = 2;
+    }
+  }
+  // calculate the cost of loss function; returns 0 when sum_hess is below min_child_weight
+  inline double CalcGain(double sum_grad, double sum_hess) const {
+    if (sum_hess < min_child_weight) return 0.0;
+    if (max_delta_step == 0.0f) {
+      if (reg_alpha == 0.0f) {
+        return Sqr(sum_grad) / (sum_hess + reg_lambda);
+      } else {
+        return Sqr(ThresholdL1(sum_grad, reg_alpha)) / (sum_hess + reg_lambda);
+      }
+    } else {
+      double w = CalcWeight(sum_grad, sum_hess);
+      double ret = sum_grad * w + 0.5 * (sum_hess + reg_lambda) * Sqr(w);
+      if (reg_alpha == 0.0f) {
+        return - 2.0 * ret;
+      } else {
+        return - 2.0 * (ret + reg_alpha * std::abs(w));
+      }
+    }
+  }
+  // calculate cost of loss function with four statistics: weight from sum_*, cost evaluated on test_*
+  inline double CalcGain(double sum_grad, double sum_hess,
+                         double test_grad, double test_hess) const {
+    double w = CalcWeight(sum_grad, sum_hess);
+    double ret = test_grad * w  + 0.5 * (test_hess + reg_lambda) * Sqr(w);
+    if (reg_alpha == 0.0f) {
+      return - 2.0 * ret;
+    } else {
+      return - 2.0 * (ret + reg_alpha * std::abs(w));
+    }
+  }
+  // calculate weight given the statistics; 0 below min_child_weight, clamped to +-max_delta_step when set
+  inline double CalcWeight(double sum_grad, double sum_hess) const {
+    if (sum_hess < min_child_weight) return 0.0;
+    double dw;
+    if (reg_alpha == 0.0f) {
+      dw = -sum_grad / (sum_hess + reg_lambda);
+    } else {
+      dw = -ThresholdL1(sum_grad, reg_alpha) / (sum_hess + reg_lambda);
+    }
+    if (max_delta_step != 0.0f) {
+      if (dw > max_delta_step) dw = max_delta_step;
+      if (dw < -max_delta_step) dw = -max_delta_step;
+    }
+    return dw;
+  }
+  /*! \brief whether need forward small to big search: default right */
+  inline bool need_forward_search(float col_density, bool indicator) const {
+    return this->default_direction == 2 ||
+        (default_direction == 0 && (col_density < opt_dense_col) && !indicator);
+  }
+  /*! \brief whether need backward big to small search: default left (both arguments are unused) */
+  inline bool need_backward_search(float col_density, bool indicator) const {
+    return this->default_direction != 2;
+  }
+  /*! \brief given the loss change, whether we need to invoke pruning (depth is unused) */
+  inline bool need_prune(double loss_chg, int depth) const {
+    return loss_chg < this->min_split_loss;
+  }
+  /*! \brief whether we can split with current hessian (depth is unused) */
+  inline bool cannot_split(double sum_hess, int depth) const {
+    return sum_hess < this->min_child_weight * 2.0;
+  }
+  /*! \brief maximum sketch size: sketch_ratio / sketch_eps converted to unsigned, must be positive */
+  inline unsigned max_sketch_size(void) const {
+    unsigned ret = static_cast<unsigned>(sketch_ratio / sketch_eps);
+    utils::Check(ret > 0, "sketch_ratio/sketch_eps must be bigger than 1");
+    return ret;
+  }
+
+ protected:
+  // functions for L1 cost: shrink w toward zero by lambda, giving 0 when |w| <= lambda
+  inline static double ThresholdL1(double w, double lambda) {
+    if (w > +lambda) return w - lambda;
+    if (w < -lambda) return w + lambda;
+    return 0.0;
+  }
+  inline static double Sqr(double a) {
+    return a * a;
+  }
+};
+
+/*! \brief core statistics used for tree construction */
+struct GradStats {
+  /*! \brief sum gradient statistics */
+  double sum_grad;
+  /*! \brief sum hessian statistics */
+  double sum_hess;
+  /*!
+   * \brief whether this is simply statistics and we only need to call
+   *   Add(gpair), instead of Add(gpair, info, ridx)
+   */
+  static const int kSimpleStats = 1;
+  /*! \brief constructor, the object must be cleared during construction (param is unused) */
+  explicit GradStats(const TrainParam &param) {
+    this->Clear();
+  }
+  /*! \brief clear the statistics */
+  inline void Clear(void) {
+    sum_grad = sum_hess = 0.0f;
+  }
+  /*! \brief check if necessary information is ready (nothing to check for GradStats) */
+  inline static void CheckInfo(const BoosterInfo &info) {
+  }
+  /*!
+   * \brief accumulate statistics
+   * \param p the gradient pair
+   */
+  inline void Add(bst_gpair p) {
+    this->Add(p.grad, p.hess);
+  }
+  /*!
+   * \brief accumulate statistics, more complicated version
+   * \param gpair the vector storing the gradient statistics
+   * \param info the additional information (unused here)
+   * \param ridx instance index of this instance
+   */
+  inline void Add(const std::vector<bst_gpair> &gpair,
+                  const BoosterInfo &info,
+                  bst_uint ridx) {
+    const bst_gpair &b = gpair[ridx];
+    this->Add(b.grad, b.hess);
+  }
+  /*! \brief calculate leaf weight */
+  inline double CalcWeight(const TrainParam &param) const {
+    return param.CalcWeight(sum_grad, sum_hess);
+  }
+  /*! \brief calculate gain of the solution */
+  inline double CalcGain(const TrainParam &param) const {
+    return param.CalcGain(sum_grad, sum_hess);
+  }
+  /*! \brief add statistics to the data */
+  inline void Add(const GradStats &b) {
+    this->Add(b.sum_grad, b.sum_hess);
+  }
+  /*! \brief same as add, reduce is used in All Reduce */
+  inline static void Reduce(GradStats &a, const GradStats &b) { // NOLINT(*)
+    a.Add(b);
+  }
+  /*! \brief set current value to a - b */
+  inline void SetSubstract(const GradStats &a, const GradStats &b) {
+    sum_grad = a.sum_grad - b.sum_grad;
+    sum_hess = a.sum_hess - b.sum_hess;
+  }
+  /*! \return whether the statistics is not used yet */
+  inline bool Empty(void) const {
+    return sum_hess == 0.0;
+  }
+  /*! \brief set leaf vector value based on statistics (does nothing for GradStats) */
+  inline void SetLeafVec(const TrainParam &param, bst_float *vec) const {
+  }
+  // constructor to allow inheritance; leaves sum_grad and sum_hess unset
+  GradStats(void) {}
+  /*! \brief add statistics to the data */
+  inline void Add(double grad, double hess) {
+    sum_grad += grad; sum_hess += hess;
+  }
+};
+
+/*! \brief vectorized cv statistics: overall sums plus per-fold train/valid sums for vsize folds */
+template<unsigned vsize>
+struct CVGradStats : public GradStats {
+  // additional statistics
+  GradStats train[vsize], valid[vsize];
+  // constructor: requires param.size_leaf_vector == vsize, then clears all sums
+  explicit CVGradStats(const TrainParam &param) {
+    utils::Check(param.size_leaf_vector == vsize,
+                 "CVGradStats: vsize must match size_leaf_vector");
+    this->Clear();
+  }
+  /*! \brief check if necessary information is ready */
+  inline static void CheckInfo(const BoosterInfo &info) {
+    utils::Check(info.fold_index.size() != 0,
+                 "CVGradStats: require fold_index");
+  }
+  /*! \brief clear the statistics */
+  inline void Clear(void) {
+    GradStats::Clear();
+    for (unsigned i = 0; i < vsize; ++i) {
+      train[i].Clear(); valid[i].Clear();
+    }
+  }
+  inline void Add(const std::vector<bst_gpair> &gpair,
+                  const BoosterInfo &info,
+                  bst_uint ridx) {
+    GradStats::Add(gpair[ridx].grad, gpair[ridx].hess);
+    const size_t step = info.fold_index.size();
+    for (unsigned i = 0; i < vsize; ++i) {
+      const bst_gpair &b = gpair[(i + 1) * step + ridx];
+      if (info.fold_index[ridx] == i) {
+        valid[i].Add(b.grad, b.hess);
+      } else {
+        train[i].Add(b.grad, b.hess);
+      }
+    }
+  }
+  /*! \brief calculate gain of the solution: average over folds of the train-weight gain on scaled valid sums */
+  inline double CalcGain(const TrainParam &param) const {
+    double ret = 0.0;
+    for (unsigned i = 0; i < vsize; ++i) {
+      ret += param.CalcGain(train[i].sum_grad,
+                            train[i].sum_hess,
+                            vsize * valid[i].sum_grad,
+                            vsize * valid[i].sum_hess);
+    }
+    return ret / vsize;
+  }
+  /*! \brief add statistics to the data */
+  inline void Add(const CVGradStats &b) {
+    GradStats::Add(b);
+    for (unsigned i = 0; i < vsize; ++i) {
+      train[i].Add(b.train[i]);
+      valid[i].Add(b.valid[i]);
+    }
+  }
+  /*! \brief same as add, reduce is used in All Reduce */
+  inline static void Reduce(CVGradStats &a, const CVGradStats &b) { // NOLINT(*)
+    a.Add(b);
+  }
+  /*! \brief set current value to a - b */
+  inline void SetSubstract(const CVGradStats &a, const CVGradStats &b) {
+    GradStats::SetSubstract(a, b);
+    for (unsigned i = 0; i < vsize; ++i) {
+      train[i].SetSubstract(a.train[i], b.train[i]);
+      valid[i].SetSubstract(a.valid[i], b.valid[i]);
+    }
+  }
+  /*! \brief set leaf vector value based on statistics: vec[i] = learning_rate * weight of train fold i */
+  inline void SetLeafVec(const TrainParam &param, bst_float *vec) const{
+    for (unsigned i = 0; i < vsize; ++i) {
+      vec[i] = param.learning_rate *
+          param.CalcWeight(train[i].sum_grad, train[i].sum_hess);
+    }
+  }
+};
+
+/*!
+ * \brief statistics that is helpful to store
+ *   and represent a split solution for the tree
+ */
+struct SplitEntry{
+  /*! \brief loss change after split this node */
+  bst_float loss_chg;
+  /*! \brief split index; the highest bit stores the default_left flag */
+  unsigned sindex;
+  /*! \brief split value */
+  float split_value;
+  /*! \brief constructor */
+  SplitEntry(void) : loss_chg(0.0f), sindex(0), split_value(0.0f) {}
+  /*!
+   * \brief decides whether we can replace current entry with the given statistics
+   *   This function gives better priority to lower index when loss_chg == new_loss_chg.
+   *   Not the best way, but helps to give consistent result during multi-thread execution.
+   * \param new_loss_chg the loss reduction get through the split
+   * \param split_index the feature index where the split is on
+   */
+  inline bool NeedReplace(bst_float new_loss_chg, unsigned split_index) const {
+    if (this->split_index() <= split_index) {
+      return new_loss_chg > this->loss_chg;
+    } else {
+      return !(this->loss_chg > new_loss_chg);
+    }
+  }
+  /*!
+   * \brief update the split entry, replace it if e is better
+   * \param e candidate split solution
+   * \return whether the proposed split is better and can replace current split
+   */
+  inline bool Update(const SplitEntry &e) {
+    if (this->NeedReplace(e.loss_chg, e.split_index())) {
+      this->loss_chg = e.loss_chg;
+      this->sindex = e.sindex;
+      this->split_value = e.split_value;
+      return true;
+    } else {
+      return false;
+    }
+  }
+  /*!
+   * \brief update the split entry, replace it if e is better
+   * \param new_loss_chg loss reduction of new candidate
+   * \param split_index feature index to split on
+   * \param new_split_value the split point
+   * \param default_left whether the missing value goes to left (packed into the highest bit of sindex)
+   * \return whether the proposed split is better and can replace current split
+   */
+  inline bool Update(bst_float new_loss_chg, unsigned split_index,
+                     float new_split_value, bool default_left) {
+    if (this->NeedReplace(new_loss_chg, split_index)) {
+      this->loss_chg = new_loss_chg;
+      if (default_left) split_index |= (1U << 31);
+      this->sindex = split_index;
+      this->split_value = new_split_value;
+      return true;
+    } else {
+      return false;
+    }
+  }
+  /*! \brief same as update, used by AllReduce*/
+  inline static void Reduce(SplitEntry &dst, const SplitEntry &src) { // NOLINT(*)
+    dst.Update(src);
+  }
+  /*!\return feature index to split on (low 31 bits of sindex) */
+  inline unsigned split_index(void) const {
+    return sindex & ((1U << 31) - 1U);
+  }
+  /*!\return whether missing value goes to left branch (highest bit of sindex) */
+  inline bool default_left(void) const {
+    return (sindex >> 31) != 0;
+  }
+};
+
+}  // namespace tree
+}  // namespace xgboost
+#endif  // XGBOOST_TREE_PARAM_H_
--- a/old_src/tree/updater.cpp
+++ b/old_src/tree/updater.cpp
@@ -0,0 +1,35 @@
+// Copyright 2014 by Contributors
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE
+#define NOMINMAX
+#include <cstring>
+#include "./updater.h"
+#include "./updater_prune-inl.hpp"
+#include "./updater_refresh-inl.hpp"
+#include "./updater_colmaker-inl.hpp"
+#ifndef XGBOOST_STRICT_CXX98_
+#include "./updater_sync-inl.hpp"
+#include "./updater_distcol-inl.hpp"
+#include "./updater_histmaker-inl.hpp"
+#include "./updater_skmaker-inl.hpp"
+#endif
+
+namespace xgboost {
+namespace tree {
+IUpdater* CreateUpdater(const char *name) {
+  // updaters that are available in every build
+  if (std::strcmp(name, "prune") == 0) return new TreePruner();
+  if (std::strcmp(name, "refresh") == 0) return new TreeRefresher<GradStats>();
+  if (std::strcmp(name, "grow_colmaker") == 0) return new ColMaker<GradStats>();
+#ifndef XGBOOST_STRICT_CXX98_
+  // updaters that are compiled out when XGBOOST_STRICT_CXX98_ is defined
+  if (std::strcmp(name, "sync") == 0) return new TreeSyncher();
+  if (std::strcmp(name, "grow_histmaker") == 0) return new CQHistMaker<GradStats>();
+  if (std::strcmp(name, "grow_skmaker") == 0) return new SketchMaker();
+  if (std::strcmp(name, "distcol") == 0) return new DistColMaker<GradStats>();
+#endif
+  // no registered updater matched the requested name
+  utils::Error("unknown updater:%s", name);
+  return NULL;
+}
+
+}  // namespace tree
+}  // namespace xgboost
--- a/old_src/tree/updater.h
+++ b/old_src/tree/updater.h
@@ -0,0 +1,63 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file updater.h
+ * \brief interface to update the tree
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_TREE_UPDATER_H_
+#define XGBOOST_TREE_UPDATER_H_
+
+#include <vector>
+
+#include "../data.h"
+#include "./model.h"
+
+namespace xgboost {
+namespace tree {
+/*!
+ * \brief interface of tree update module, that performs update of a tree
+ */
+class IUpdater {
+ public:
+  /*!
+   * \brief set parameters from outside
+   * \param name name of the parameter
+   * \param val  value of the parameter
+   */
+  virtual void SetParam(const char *name, const char *val) = 0;
+  /*!
+   * \brief perform update to the tree models
+   * \param gpair the gradient pair statistics of the data
+   * \param p_fmat feature matrix that provides access to features
+   * \param info extra side information that may be needed, such as root index
+   * \param trees references the trees to be updated, updater will change the content of trees
+   *   note: all the trees in the vector are updated, with the same statistics,
+   *         but maybe different random seeds, usually one tree is passed in at a time,
+   *         there can be multiple trees when we train random forest style model
+   */
+  virtual void Update(const std::vector<bst_gpair> &gpair,
+                      IFMatrix *p_fmat,
+                      const BoosterInfo &info,
+                      const std::vector<RegTree*> &trees) = 0;
+
+  /*!
+   * \brief this is simply a function for optimizing performance
+   * this function asks the updater to return the leaf position of each instance in the p_fmat,
+   * if it is cached in the updater, if it is not available, return NULL
+   * the default implementation always returns NULL
+   * \return array of leaf position of each instance in the last updated tree
+   */
+  virtual const int* GetLeafPosition(void) const {
+    return NULL;
+  }
+  // virtual destructor, so that an updater can be destroyed through an IUpdater pointer
+  virtual ~IUpdater(void) {}
+};
+/*!
+ * \brief create an updater based on name
+ * \param name name of updater
+ * \return return the updater instance
+ */
+IUpdater* CreateUpdater(const char *name);
+}  // namespace tree
+}  // namespace xgboost
+#endif  // XGBOOST_TREE_UPDATER_H_
--- a/old_src/tree/updater_basemaker-inl.hpp
+++ b/old_src/tree/updater_basemaker-inl.hpp
@@ -0,0 +1,427 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file updater_basemaker-inl.hpp
+ * \brief implement a common tree constructor
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
+#define XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
+#include <vector>
+#include <algorithm>
+#include <string>
+#include <limits>
+#include "../sync/sync.h"
+#include "../utils/random.h"
+#include "../utils/quantile.h"
+
+namespace xgboost {
+namespace tree {
+/*!
+ * \brief base tree maker class that defines common operation
+ *  needed in tree making
+ */
+class BaseMaker: public IUpdater {
+ public:
+  // destructor
+  virtual ~BaseMaker(void) {}
+  // set training parameter: the name/value pair is forwarded unchanged to
+  // the TrainParam member, which does the actual parsing
+  virtual void SetParam(const char *name, const char *val) {
+    param.SetParam(name, val);
+  }
+
+ protected:
+  // helper to collect and query feature meta information
+  struct FMetaHelper {
+   public:
+    /*! \brief find type of each feature, use column format */
+    inline void InitByCol(IFMatrix *p_fmat,
+                          const RegTree &tree) {
+      // two slots per feature: [2 * fid] keeps the negated minimum and
+      // [2 * fid + 1] keeps the maximum, so both can be merged with one Max reduction
+      fminmax.resize(tree.param.num_feature * 2);
+      // the lowest finite float marks a slot for which no value was seen, see Type()
+      std::fill(fminmax.begin(), fminmax.end(),
+                -std::numeric_limits<bst_float>::max());
+      // start accumulating statistics
+      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
+      iter->BeforeFirst();
+      while (iter->Next()) {
+        const ColBatch &batch = iter->Value();
+        for (bst_uint i = 0; i < batch.size; ++i) {
+          const bst_uint fid = batch.col_index[i];
+          const ColBatch::Inst &c = batch[i];
+          if (c.length != 0) {
+            // NOTE(review): only the first and last entries are inspected, which assumes
+            // the column entries are sorted by ascending fvalue -- confirm
+            fminmax[fid * 2 + 0] = std::max(-c[0].fvalue, fminmax[fid * 2 + 0]);
+            fminmax[fid * 2 + 1] = std::max(c[c.length - 1].fvalue, fminmax[fid * 2 + 1]);
+          }
+        }
+      }
+      // combine the element-wise maxima across all workers
+      rabit::Allreduce<rabit::op::Max>(BeginPtr(fminmax), fminmax.size());
+    }
+    // get feature type, 0:empty 1:binary 2:real
+    inline int Type(bst_uint fid) const {
+      utils::Assert(fid * 2 + 1 < fminmax.size(),
+                    "FeatHelper fid exceed query bound ");
+      // even slot stores the negated minimum, odd slot stores the maximum
+      const bst_float neg_min = fminmax[fid * 2];
+      const bst_float max_val = fminmax[fid * 2 + 1];
+      // slot still holds the initial marker: no value was recorded for this feature
+      if (neg_min == -std::numeric_limits<bst_float>::max()) return 0;
+      // minimum equals maximum -> type 1, otherwise type 2
+      return (-neg_min == max_val) ? 1 : 2;
+    }
+    // return the recorded maximum of feature fid (the odd slot of its pair);
+    // unlike Type(), no bound check is performed on fid
+    inline bst_float MaxValue(bst_uint fid) const {
+      return fminmax[fid *2 + 1];
+    }
+    // sample a fraction p of the non-empty features into *p_findex
+    inline void SampleCol(float p, std::vector<bst_uint> *p_findex) const {
+      std::vector<bst_uint> &findex = *p_findex;
+      findex.clear();
+      // collect every feature whose type is not 0 (empty)
+      for (size_t i = 0; i < fminmax.size(); i += 2) {
+        const bst_uint fid = static_cast<bst_uint>(i / 2);
+        if (this->Type(fid) != 0) findex.push_back(fid);
+      }
+      // number of features to keep, truncated towards zero (can be 0 for small p)
+      unsigned n = static_cast<unsigned>(p * findex.size());
+      random::Shuffle(findex);
+      findex.resize(n);
+      // sync the findex if it is subsample
+      // rank 0 serializes its selection, the buffer is broadcast from rank 0,
+      // and the selection is then read back from the buffer
+      // NOTE(review): presumably this makes all workers use rank 0's selection -- confirm
+      std::string s_cache;
+      utils::MemoryBufferStream fc(&s_cache);
+      utils::IStream &fs = fc;
+      if (rabit::GetRank() == 0) {
+        fs.Write(findex);
+      }
+      rabit::Broadcast(&s_cache, 0);
+      fs.Read(&findex);
+    }
+
+   private:
+    std::vector<bst_float> fminmax;
+  };
+  // ------static helper functions ------
+  // helper function to get to next level of the tree
+  /*! \brief this is  helper function for row based data*/
+  inline static int NextLevel(const RowBatch::Inst &inst, const RegTree &tree, int nid) {
+    const RegTree::Node &node = tree[nid];
+    const bst_uint split_feature = node.split_index();
+    // look for the split feature among the entries present in this row
+    for (unsigned k = 0; k < inst.length; ++k) {
+      if (inst[k].index != split_feature) continue;
+      // values strictly below the split condition go left, all others go right
+      return inst[k].fvalue < node.split_cond() ? node.cleft() : node.cright();
+    }
+    // the row has no entry for the split feature: follow the default branch
+    return node.cdefault();
+  }
+  /*! \brief get number of omp thread in current context */
+  inline static int get_nthread(void) {
+    // initialised so that the returned value is never indeterminate
+    int nthread = 1;
+    #pragma omp parallel
+    {
+      // only the master thread stores the team size; letting every thread
+      // write the shared variable would be an unsynchronised concurrent write
+      #pragma omp master
+      {
+        nthread = omp_get_num_threads();
+      }
+    }
+    return nthread;
+  }
+  //  ------class member helpers---------
+  /*! \brief initialize temp data structure */
+  inline void InitData(const std::vector<bst_gpair> &gpair,
+                       const IFMatrix &fmat,
+                       const std::vector<unsigned> &root_index,
+                       const RegTree &tree) {
+    // the tree must contain nothing but its roots
+    utils::Assert(tree.param.num_nodes == tree.param.num_roots,
+                  "TreeMaker: can only grow new tree");
+    {
+      // setup position
+      // one entry per gradient pair; an empty root_index puts every row on root 0
+      position.resize(gpair.size());
+      if (root_index.size() == 0) {
+        std::fill(position.begin(), position.end(), 0);
+      } else {
+        // NOTE(review): root_index is indexed up to position.size(), so it is
+        // assumed to have at least gpair.size() entries -- confirm against callers
+        for (size_t i = 0; i < position.size(); ++i) {
+          position[i] = root_index[i];
+          utils::Assert(root_index[i] < (unsigned)tree.param.num_roots,
+                        "root index exceed setting");
+        }
+      }
+      // mark delete for the deleted datas
+      // a negative hessian flags a deleted row; its position is stored complemented
+      for (size_t i = 0; i < position.size(); ++i) {
+        if (gpair[i].hess < 0.0f) position[i] = ~position[i];
+      }
+      // mark subsample
+      // rows that are not drawn are complemented as well; deleted rows are skipped
+      // so they are not flipped back and do not consume a random draw
+      if (param.subsample < 1.0f) {
+        for (size_t i = 0; i < position.size(); ++i) {
+          if (gpair[i].hess < 0.0f) continue;
+          if (random::SampleBinary(param.subsample) == 0) position[i] = ~position[i];
+        }
+      }
+    }
+    {
+      // expand query
+      // the initial expansion queue consists of all root nodes
+      qexpand.reserve(256); qexpand.clear();
+      for (int i = 0; i < tree.param.num_roots; ++i) {
+        qexpand.push_back(i);
+      }
+      this->UpdateNode2WorkIndex(tree);
+    }
+  }
+  /*! \brief update queue expand add in new leaves */
+  inline void UpdateQueueExpand(const RegTree &tree) {
+    // gather both children of every queued node that is no longer a leaf
+    std::vector<int> next_level;
+    for (size_t k = 0; k < qexpand.size(); ++k) {
+      const int nid = qexpand[k];
+      if (tree[nid].is_leaf()) continue;
+      next_level.push_back(tree[nid].cleft());
+      next_level.push_back(tree[nid].cright());
+    }
+    // the children become the new expansion queue
+    qexpand = next_level;
+    this->UpdateNode2WorkIndex(tree);
+  }
+  // return decoded position
+  inline int DecodePosition(bst_uint ridx) const {
+    const int encoded = position[ridx];
+    // negative entries hold the bitwise complement of the node id
+    if (encoded < 0) return ~encoded;
+    return encoded;
+  }
+  // encode the encoded position value for ridx
+  inline void SetEncodePosition(bst_uint ridx, int nid) {
+    // preserve the encoding of the stored entry: a complemented (negative)
+    // entry stays complemented, a plain entry stays plain
+    const bool was_negative = position[ridx] < 0;
+    position[ridx] = was_negative ? ~nid : nid;
+  }
+  /*!
+   * \brief this is helper function uses column based data structure,
+   *        reset the positions to the latest one
+   * \param nodes the set of nodes that contains the split to be used
+   * \param p_fmat feature matrix needed for tree construction
+   * \param tree the regression tree structure
+   */
+  inline void ResetPositionCol(const std::vector<int> &nodes,
+                               IFMatrix *p_fmat, const RegTree &tree) {
+    // first place the rows that take a non-default branch
+    this->SetNonDefaultPositionCol(nodes, p_fmat, tree);
+    // then push rows still sitting on a split node down its default branch;
+    // rows on leaves that are not fresh are stored as ~nid,
+    // so that they are ignored in future statistics collection
+    const std::vector<bst_uint> &rows = p_fmat->buffered_rowset();
+    const bst_omp_uint nrows = static_cast<bst_omp_uint>(rows.size());
+
+    #pragma omp parallel for schedule(static)
+    for (bst_omp_uint k = 0; k < nrows; ++k) {
+      const bst_uint ridx = rows[k];
+      const int nid = this->DecodePosition(ridx);
+      if (!tree[nid].is_leaf()) {
+        // split node: descend along the default direction
+        const int child = tree[nid].default_left() ? tree[nid].cleft() : tree[nid].cright();
+        this->SetEncodePosition(ridx, child);
+      } else if (tree[nid].cright() == -1) {
+        // mark finish when it is not a fresh leaf
+        position[ridx] = ~nid;
+      }
+    }
+  }
+  /*!
+   * \brief this is helper function uses column based data structure,
+   *        update all positions into nondefault branch, if any, ignore the default branch
+   * \param nodes the set of nodes that contains the split to be used
+   * \param p_fmat feature matrix needed for tree construction
+   * \param tree the regression tree structure
+   */
+  virtual void SetNonDefaultPositionCol(const std::vector<int> &nodes,
+                                        IFMatrix *p_fmat, const RegTree &tree) {
+    // step 1, classify the non-default data into right places
+    // collect the distinct split features used by the non-leaf nodes in `nodes`
+    std::vector<unsigned> fsplits;
+    for (size_t i = 0; i < nodes.size(); ++i) {
+      const int nid = nodes[i];
+      if (!tree[nid].is_leaf()) {
+        fsplits.push_back(tree[nid].split_index());
+      }
+    }
+    // sort + unique removes duplicated feature ids
+    std::sort(fsplits.begin(), fsplits.end());
+    fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
+
+    // only the columns of the collected split features are requested
+    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
+    while (iter->Next()) {
+      const ColBatch &batch = iter->Value();
+      for (size_t i = 0; i < batch.size; ++i) {
+        ColBatch::Inst col = batch[i];
+        const bst_uint fid = batch.col_index[i];
+        const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
+        // entries of one column are processed in parallel
+        // NOTE(review): assumes a row index occurs at most once per column,
+        // so no two iterations write the same position entry -- confirm
+        #pragma omp parallel for schedule(static)
+        for (bst_omp_uint j = 0; j < ndata; ++j) {
+          const bst_uint ridx = col[j].index;
+          const float fvalue = col[j].fvalue;
+          const int nid = this->DecodePosition(ridx);
+          // go back to parent, correct those who are not default
+          // only rows whose current node splits on this very feature are moved
+          if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
+            // values strictly below the split condition go left, all others right
+            if (fvalue < tree[nid].split_cond()) {
+              this->SetEncodePosition(ridx, tree[nid].cleft());
+            } else {
+              this->SetEncodePosition(ridx, tree[nid].cright());
+            }
+          }
+        }
+      }
+    }
+  }
+  /*! \brief helper function to get statistics from a tree */
+  // \tparam TStats statistics type; this function uses its constructor from
+  //         TrainParam, Clear(), Add(gpair, info, ridx) and Add(const TStats&)
+  // \param p_thread_temp per-thread scratch space, resized by this function
+  // \param p_node_stats output, resized to the number of nodes; only the
+  //        entries of nodes in qexpand are cleared and filled here
+  template<typename TStats>
+  inline void GetNodeStats(const std::vector<bst_gpair> &gpair,
+                           const IFMatrix &fmat,
+                           const RegTree &tree,
+                           const BoosterInfo &info,
+                           std::vector< std::vector<TStats> > *p_thread_temp,
+                           std::vector<TStats> *p_node_stats) {
+    std::vector< std::vector<TStats> > &thread_temp = *p_thread_temp;
+    // one scratch vector per thread
+    thread_temp.resize(this->get_nthread());
+    p_node_stats->resize(tree.param.num_nodes);
+    #pragma omp parallel
+    {
+      // each thread sizes its own scratch vector and clears the slots of the
+      // nodes currently in the expansion queue
+      const int tid = omp_get_thread_num();
+      thread_temp[tid].resize(tree.param.num_nodes, TStats(param));
+      for (size_t i = 0; i < qexpand.size(); ++i) {
+        const unsigned nid = qexpand[i];
+        thread_temp[tid][nid].Clear();
+      }
+    }
+    const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
+    // setup position
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+    #pragma omp parallel for schedule(static)
+    for (bst_omp_uint i = 0; i < ndata; ++i) {
+      const bst_uint ridx = rowset[i];
+      const int nid = position[ridx];
+      const int tid = omp_get_thread_num();
+      // rows with a negative (complemented) position are skipped
+      if (nid >= 0) {
+        thread_temp[tid][nid].Add(gpair, info, ridx);
+      }
+    }
+    // sum the per thread statistics together
+    for (size_t j = 0; j < qexpand.size(); ++j) {
+      const int nid = qexpand[j];
+      TStats &s = (*p_node_stats)[nid];
+      s.Clear();
+      for (size_t tid = 0; tid < thread_temp.size(); ++tid) {
+        s.Add(thread_temp[tid][nid]);
+      }
+    }
+  }
+  /*! \brief common helper data structure to build sketch */
+  struct SketchEntry {
+    /*! \brief total sum of amount to be met */
+    double sum_total;
+    /*! \brief statistics used in the sketch */
+    double rmin, wmin;
+    /*! \brief last seen feature value */
+    bst_float last_fvalue;
+    /*! \brief rank threshold that rmax must reach before the next entry is pushed */
+    double next_goal;
+    // pointer to the sketch to put things in
+    utils::WXQuantileSketch<bst_float, bst_float> *sketch;
+    // initialize the space
+    inline void Init(unsigned max_size) {
+      // -1 tells Push that no value has been seen yet
+      next_goal = -1.0f;
+      wmin = 0.0f;
+      rmin = wmin;
+      // start from an empty temp buffer with room for max_size + 1 entries
+      sketch->temp.Reserve(max_size + 1);
+      sketch->temp.size = 0;
+    }
+    /*!
+     * \brief push a new element to sketch
+     * \param fvalue feature value, comes in sorted ascending order
+     * \param w weight
+     * \param max_size
+     */
+    inline void Push(bst_float fvalue, bst_float w, unsigned max_size) {
+      // first call after Init: only remember the value and its weight
+      if (next_goal == -1.0f) {
+        next_goal = 0.0f;
+        last_fvalue = fvalue;
+        wmin = w;
+        return;
+      }
+      if (last_fvalue != fvalue) {
+        // a new distinct value arrived: the previous value covers ranks [rmin, rmax]
+        double rmax = rmin + wmin;
+        if (rmax >= next_goal && sketch->temp.size != max_size) {
+          // emit the previous value when the buffer is empty or the value is
+          // strictly larger than the last stored one
+          if (sketch->temp.size == 0 ||
+              last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
+            // push to sketch
+            sketch->temp.data[sketch->temp.size] =
+                utils::WXQuantileSketch<bst_float, bst_float>::
+                Entry(static_cast<bst_float>(rmin),
+                      static_cast<bst_float>(rmax),
+                      static_cast<bst_float>(wmin), last_fvalue);
+            utils::Assert(sketch->temp.size < max_size,
+                          "invalid maximum size max_size=%u, stemp.size=%lu\n",
+                          max_size, sketch->temp.size);
+            ++sketch->temp.size;
+          }
+          if (sketch->temp.size == max_size) {
+            // buffer is full: set the goal above twice the total so that
+            // the rank test above no longer passes for ranks within that bound
+            next_goal = sum_total * 2.0f + 1e-5f;
+          } else {
+            // next goal is the (size / max_size) fraction of the total weight
+            next_goal = static_cast<bst_float>(sketch->temp.size * sum_total / max_size);
+          }
+        } else {
+          // goal reached although the buffer is already full: only report it
+          if (rmax >= next_goal) {
+            rabit::TrackerPrintf("INFO: rmax=%g, sum_total=%g, next_goal=%g, size=%lu\n",
+                                 rmax, sum_total, next_goal, sketch->temp.size);
+          }
+        }
+        // start accumulating the new value
+        rmin = rmax;
+        wmin = w;
+        last_fvalue = fvalue;
+      } else {
+        // same value as before: just add up its weight
+        wmin += w;
+      }
+    }
+    /*! \brief push final unfinished value to the sketch */
+    inline void Finalize(unsigned max_size) {
+      // upper rank of the last value, which Push has not emitted yet
+      double rmax = rmin + wmin;
+      // emit it when the buffer is empty or the value is strictly larger
+      // than the last stored one
+      if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
+        // arguments follow the order of the format string: max_size, then the size
+        utils::Assert(sketch->temp.size <= max_size,
+                      "Finalize: invalid maximum size, max_size=%u, stemp.size=%lu",
+                      max_size, sketch->temp.size);
+        // push to sketch
+        sketch->temp.data[sketch->temp.size] =
+            utils::WXQuantileSketch<bst_float, bst_float>::
+            Entry(static_cast<bst_float>(rmin),
+                  static_cast<bst_float>(rmax),
+                  static_cast<bst_float>(wmin), last_fvalue);
+        ++sketch->temp.size;
+      }
+      // hand the collected temp entries over to the sketch
+      sketch->PushTemp();
+    }
+  };
+  /*! \brief training parameter of tree grower */
+  TrainParam param;
+  /*! \brief queue of nodes to be expanded */
+  std::vector<int> qexpand;
+  /*!
+   * \brief map active node to its working index offset in qexpand,
+   *   can be -1, which means the node is not actively expanding
+   */
+  std::vector<int> node2workindex;
+  /*!
+   * \brief position of each instance in the tree
+   *   can be negative, which means this position is no longer expanding
+   *   see also DecodePosition/SetEncodePosition
+   */
+  std::vector<int> position;
+
+ private:
+  inline void UpdateNode2WorkIndex(const RegTree &tree) {
+    // update the node2workindex
+    // size the map to the current node count and reset every slot to -1 in one
+    // step; filling before resizing would leave the slots of newly added nodes
+    // at 0 instead of the "not expanding" marker -1
+    node2workindex.assign(static_cast<size_t>(tree.param.num_nodes), -1);
+    // nodes in the expansion queue map to their offset in qexpand
+    for (size_t i = 0; i < qexpand.size(); ++i) {
+      node2workindex[qexpand[i]] = static_cast<int>(i);
+    }
+  }
+};
+}  // namespace tree
+}  // namespace xgboost
+#endif  // XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
--- a/old_src/tree/updater_colmaker-inl.hpp
+++ b/old_src/tree/updater_colmaker-inl.hpp
@@ -0,0 +1,732 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file updater_colmaker-inl.hpp
+ * \brief use columnwise update to construct a tree
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_TREE_UPDATER_COLMAKER_INL_HPP_
+#define XGBOOST_TREE_UPDATER_COLMAKER_INL_HPP_
+
+#include <vector>
+#include <cmath>
+#include <algorithm>
+#include "./param.h"
+#include "./updater.h"
+#include "../utils/omp.h"
+#include "../utils/random.h"
+
+namespace xgboost {
+namespace tree {
+/*! \brief column-wise update to construct a tree */
+template<typename TStats>
+class ColMaker: public IUpdater {
+ public:
+  virtual ~ColMaker(void) {}
+  // set training parameter: the name/value pair is forwarded unchanged to
+  // the TrainParam member, which does the actual parsing
+  virtual void SetParam(const char *name, const char *val) {
+    param.SetParam(name, val);
+  }
+  virtual void Update(const std::vector<bst_gpair> &gpair,
+                      IFMatrix *p_fmat,
+                      const BoosterInfo &info,
+                      const std::vector<RegTree*> &trees) {
+    TStats::CheckInfo(info);
+    // every tree of this round gets an equal share of the learning rate
+    const float full_rate = param.learning_rate;
+    param.learning_rate = full_rate / trees.size();
+    // grow each tree with its own builder, created from the rescaled parameters
+    for (size_t t = 0; t < trees.size(); ++t) {
+      Builder tree_builder(param);
+      tree_builder.Update(gpair, p_fmat, info, trees[t]);
+    }
+
+    // put the configured learning rate back for subsequent calls
+    param.learning_rate = full_rate;
+  }
+
+ protected:
+  // training parameter
+  TrainParam param;
+  // data structure
+  /*! \brief per thread x per node entry to store tmp data */
+  struct ThreadEntry {
+    /*! \brief statistics of data */
+    TStats stats;
+    /*! \brief extra statistics of data */
+    TStats stats_extra;
+    /*! \brief last feature value scanned */
+    float  last_fvalue;
+    /*! \brief first feature value scanned */
+    float  first_fvalue;
+    /*! \brief current best solution */
+    SplitEntry best;
+    // constructor: only the two statistics members are built from param,
+    // first_fvalue and last_fvalue are left unset until they are assigned
+    explicit ThreadEntry(const TrainParam &param)
+        : stats(param), stats_extra(param) {
+    }
+  };
+  /*! \brief per node entry holding the statistics and the best split of a tree node */
+  struct NodeEntry {
+    /*! \brief statistics for node entry */
+    TStats stats;
+    /*! \brief loss of this node, without split */
+    bst_float root_gain;
+    /*! \brief weight calculated related to current data */
+    float weight;
+    /*! \brief current best solution */
+    SplitEntry best;
+    // constructor: statistics are built from param, gain and weight start at zero
+    explicit NodeEntry(const TrainParam &param)
+        : stats(param), root_gain(0.0f), weight(0.0f){
+    }
+  };
+  // actual builder that runs the algorithm
+  struct Builder{
+   public:
+    // constructor
+    // the builder's param member is initialised from the given training parameters
+    explicit Builder(const TrainParam &param) : param(param) {}
+    // update one tree, growing
+    virtual void Update(const std::vector<bst_gpair> &gpair,
+                        IFMatrix *p_fmat,
+                        const BoosterInfo &info,
+                        RegTree *p_tree) {
+      // set up positions, feature sampling and temp space, then the root statistics
+      this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
+      this->InitNewNode(qexpand_, gpair, *p_fmat, info, *p_tree);
+      // grow level by level, at most param.max_depth levels
+      for (int depth = 0; depth < param.max_depth; ++depth) {
+        this->FindSplit(depth, qexpand_, gpair, p_fmat, info, p_tree);
+        this->ResetPosition(qexpand_, p_fmat, *p_tree);
+        this->UpdateQueueExpand(*p_tree, &qexpand_);
+        this->InitNewNode(qexpand_, gpair, *p_fmat, info, *p_tree);
+        // if nothing left to be expand, break
+        if (qexpand_.size() == 0) break;
+      }
+      // set all the rest expanding nodes to leaf
+      // the leaf value is the node weight scaled by the learning rate
+      for (size_t i = 0; i < qexpand_.size(); ++i) {
+        const int nid = qexpand_[i];
+        (*p_tree)[nid].set_leaf(snode[nid].weight * param.learning_rate);
+      }
+      // remember auxiliary statistics in the tree node
+      for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) {
+        p_tree->stat(nid).loss_chg = snode[nid].best.loss_chg;
+        p_tree->stat(nid).base_weight = snode[nid].weight;
+        p_tree->stat(nid).sum_hess = static_cast<float>(snode[nid].stats.sum_hess);
+        snode[nid].stats.SetLeafVec(param, p_tree->leafvec(nid));
+      }
+    }
+
+   protected:
+    // initialize temp data structure
+    inline void InitData(const std::vector<bst_gpair> &gpair,
+                         const IFMatrix &fmat,
+                         const std::vector<unsigned> &root_index,
+                         const RegTree &tree) {
+      // the tree must contain nothing but its roots
+      utils::Assert(tree.param.num_nodes == tree.param.num_roots,
+                    "ColMaker: can only grow new tree");
+      const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
+      {
+        // setup position
+        // one entry per gradient pair, but only rows listed in rowset are initialised
+        position.resize(gpair.size());
+        if (root_index.size() == 0) {
+          for (size_t i = 0; i < rowset.size(); ++i) {
+            position[rowset[i]] = 0;
+          }
+        } else {
+          for (size_t i = 0; i < rowset.size(); ++i) {
+            const bst_uint ridx = rowset[i];
+            position[ridx] = root_index[ridx];
+            utils::Assert(root_index[ridx] < (unsigned)tree.param.num_roots,
+                          "root index exceed setting");
+          }
+        }
+        // mark delete for the deleted datas
+        // a negative hessian flags a deleted row; its position is stored complemented
+        for (size_t i = 0; i < rowset.size(); ++i) {
+          const bst_uint ridx = rowset[i];
+          if (gpair[ridx].hess < 0.0f) position[ridx] = ~position[ridx];
+        }
+        // mark subsample
+        // rows that are not drawn are complemented as well; deleted rows are skipped
+        if (param.subsample < 1.0f) {
+          for (size_t i = 0; i < rowset.size(); ++i) {
+            const bst_uint ridx = rowset[i];
+            if (gpair[ridx].hess < 0.0f) continue;
+            if (random::SampleBinary(param.subsample) == 0) position[ridx] = ~position[ridx];
+          }
+        }
+      }
+      {
+        // initialize feature index
+        // only columns with at least one entry are candidates
+        unsigned ncol = static_cast<unsigned>(fmat.NumCol());
+        for (unsigned i = 0; i < ncol; ++i) {
+          if (fmat.GetColSize(i) != 0) {
+            feat_index.push_back(i);
+          }
+        }
+        // keep a colsample_bytree fraction of the shuffled candidates
+        unsigned n = static_cast<unsigned>(param.colsample_bytree * feat_index.size());
+        random::Shuffle(feat_index);
+        utils::Check(n > 0, "colsample_bytree=%g is too small that no feature can be included",
+                     param.colsample_bytree);
+        feat_index.resize(n);
+      }
+      {
+        // setup temp space for each thread
+        // query the number of threads of a parallel region
+        #pragma omp parallel
+        {
+          this->nthread = omp_get_num_threads();
+        }
+        // reserve a small space
+        stemp.clear();
+        stemp.resize(this->nthread, std::vector<ThreadEntry>());
+        for (size_t i = 0; i < stemp.size(); ++i) {
+          stemp[i].clear(); stemp[i].reserve(256);
+        }
+        snode.reserve(256);
+      }
+      {
+        // expand query
+        // the initial expansion queue consists of all root nodes
+        qexpand_.reserve(256); qexpand_.clear();
+        for (int i = 0; i < tree.param.num_roots; ++i) {
+          qexpand_.push_back(i);
+        }
+      }
+    }
+    /*!
+     * \brief initialize the base_weight, root_gain,
+     *  and NodeEntry for all the new nodes in qexpand
+     */
+    inline void InitNewNode(const std::vector<int> &qexpand,
+                            const std::vector<bst_gpair> &gpair,
+                            const IFMatrix &fmat,
+                            const BoosterInfo &info,
+                            const RegTree &tree) {
+      {
+        // setup statistics space for each tree node
+        // resize only appends entries for nodes added since the last call
+        for (size_t i = 0; i < stemp.size(); ++i) {
+          stemp[i].resize(tree.param.num_nodes, ThreadEntry(param));
+        }
+        snode.resize(tree.param.num_nodes, NodeEntry(param));
+      }
+      const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
+      // setup position
+      // each thread accumulates the rows it processes into its own stemp slot
+      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < ndata; ++i) {
+        const bst_uint ridx = rowset[i];
+        const int tid = omp_get_thread_num();
+        // rows with a negative (complemented) position are skipped
+        if (position[ridx] < 0) continue;
+        stemp[tid][position[ridx]].stats.Add(gpair, info, ridx);
+      }
+      // sum the per thread statistics together
+      for (size_t j = 0; j < qexpand.size(); ++j) {
+        const int nid = qexpand[j];
+        TStats stats(param);
+        for (size_t tid = 0; tid < stemp.size(); ++tid) {
+          stats.Add(stemp[tid][nid].stats);
+        }
+        // update node statistics
+        snode[nid].stats = stats;
+        snode[nid].root_gain = static_cast<float>(stats.CalcGain(param));
+        snode[nid].weight = static_cast<float>(stats.CalcWeight(param));
+      }
+    }
+    /*! \brief update queue expand add in new leaves */
+    inline void UpdateQueueExpand(const RegTree &tree, std::vector<int> *p_qexpand) {
+      // gather both children of every queued node that is no longer a leaf
+      std::vector<int> next_level;
+      for (size_t k = 0; k < p_qexpand->size(); ++k) {
+        const int nid = (*p_qexpand)[k];
+        if (tree[nid].is_leaf()) continue;
+        next_level.push_back(tree[nid].cleft());
+        next_level.push_back(tree[nid].cright());
+      }
+      // the children become the new expansion queue
+      *p_qexpand = next_level;
+    }
+    // parallel find the best split of current fid
+    // this function does not support nested functions
+    inline void ParallelFindSplit(const ColBatch::Inst &col,
+                                  bst_uint fid,
+                                  const IFMatrix &fmat,
+                                  const std::vector<bst_gpair> &gpair,
+                                  const BoosterInfo &info) {
+      const bool ind = col.length != 0 && col.data[0].fvalue == col.data[col.length - 1].fvalue;
+      bool need_forward = param.need_forward_search(fmat.GetColDensity(fid), ind);
+      bool need_backward = param.need_backward_search(fmat.GetColDensity(fid), ind);
+      const std::vector<int> &qexpand = qexpand_;
+      #pragma omp parallel
+      {
+        const int tid = omp_get_thread_num();
+        std::vector<ThreadEntry> &temp = stemp[tid];
+        // cleanup temp statistics
+        for (size_t j = 0; j < qexpand.size(); ++j) {
+          temp[qexpand[j]].stats.Clear();
+        }
+        nthread = omp_get_num_threads();
+        bst_uint step = (col.length + nthread - 1) / nthread;
+        bst_uint end = std::min(col.length, step * (tid + 1));
+        for (bst_uint i = tid * step; i < end; ++i) {
+          const bst_uint ridx = col[i].index;
+          const int nid = position[ridx];
+          if (nid < 0) continue;
+          const float fvalue = col[i].fvalue;
+          if (temp[nid].stats.Empty()) {
+            temp[nid].first_fvalue = fvalue;
+          }
+          temp[nid].stats.Add(gpair, info, ridx);
+          temp[nid].last_fvalue = fvalue;
+        }
+      }
+      // start collecting the partial sum statistics
+      bst_omp_uint nnode = static_cast<bst_omp_uint>(qexpand.size());
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint j = 0; j < nnode; ++j) {
+        const int nid = qexpand[j];
+        TStats sum(param), tmp(param), c(param);
+        for (int tid = 0; tid < nthread; ++tid) {
+          tmp = stemp[tid][nid].stats;
+          stemp[tid][nid].stats = sum;
+          sum.Add(tmp);
+          if (tid != 0) {
+            std::swap(stemp[tid - 1][nid].last_fvalue, stemp[tid][nid].first_fvalue);
+          }
+        }
+        for (int tid = 0; tid < nthread; ++tid) {
+          stemp[tid][nid].stats_extra = sum;
+          ThreadEntry &e = stemp[tid][nid];
+          float fsplit;
+          if (tid != 0) {
+            if (std::abs(stemp[tid - 1][nid].last_fvalue - e.first_fvalue) > rt_2eps) {
+              fsplit = (stemp[tid - 1][nid].last_fvalue - e.first_fvalue) * 0.5f;
+            } else {
+              continue;
+            }
+          } else {
+            fsplit = e.first_fvalue - rt_eps;
+          }
+          if (need_forward && tid != 0) {
+            c.SetSubstract(snode[nid].stats, e.stats);
+            if (c.sum_hess >= param.min_child_weight &&
+                e.stats.sum_hess >= param.min_child_weight) {
+              bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) +
+                                                          c.CalcGain(param) - snode[nid].root_gain);
+              e.best.Update(loss_chg, fid, fsplit, false);
+            }
+          }
+          if (need_backward) {
+            tmp.SetSubstract(sum, e.stats);
+            c.SetSubstract(snode[nid].stats, tmp);
+            if (c.sum_hess >= param.min_child_weight &&
+                tmp.sum_hess >= param.min_child_weight) {
+              bst_float loss_chg = static_cast<bst_float>(tmp.CalcGain(param) +
+                                                          c.CalcGain(param) - snode[nid].root_gain);
+              e.best.Update(loss_chg, fid, fsplit, true);
+            }
+          }
+        }
+        if (need_backward) {
+          tmp = sum;
+          ThreadEntry &e = stemp[nthread-1][nid];
+          c.SetSubstract(snode[nid].stats, tmp);
+          if (c.sum_hess >= param.min_child_weight &&
+              tmp.sum_hess >= param.min_child_weight) {
+            bst_float loss_chg = static_cast<bst_float>(tmp.CalcGain(param) +
+                                                        c.CalcGain(param) - snode[nid].root_gain);
+            e.best.Update(loss_chg, fid, e.last_fvalue + rt_eps, true);
+          }
+        }
+      }
+      // rescan, generate candidate split
+      #pragma omp parallel
+      {
+        TStats c(param), cright(param);
+        const int tid = omp_get_thread_num();
+        std::vector<ThreadEntry> &temp = stemp[tid];
+        nthread = static_cast<bst_uint>(omp_get_num_threads());
+        bst_uint step = (col.length + nthread - 1) / nthread;
+        bst_uint end = std::min(col.length, step * (tid + 1));
+        for (bst_uint i = tid * step; i < end; ++i) {
+          const bst_uint ridx = col[i].index;
+          const int nid = position[ridx];
+          if (nid < 0) continue;
+          const float fvalue = col[i].fvalue;
+          // get the statistics of nid
+          ThreadEntry &e = temp[nid];
+          if (e.stats.Empty()) {
+            e.stats.Add(gpair, info, ridx);
+            e.first_fvalue = fvalue;
+          } else {
+            // forward default right
+            if (std::abs(fvalue - e.first_fvalue) > rt_2eps) {
+              if (need_forward) {
+                c.SetSubstract(snode[nid].stats, e.stats);
+                if (c.sum_hess >= param.min_child_weight &&
+                    e.stats.sum_hess >= param.min_child_weight) {
+                  bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) +
+                                                              c.CalcGain(param) -
+                                                              snode[nid].root_gain);
+                  e.best.Update(loss_chg, fid, (fvalue + e.first_fvalue) * 0.5f, false);
+                }
+              }
+              if (need_backward) {
+                cright.SetSubstract(e.stats_extra, e.stats);
+                c.SetSubstract(snode[nid].stats, cright);
+                if (c.sum_hess >= param.min_child_weight &&
+                    cright.sum_hess >= param.min_child_weight) {
+                  bst_float loss_chg = static_cast<bst_float>(cright.CalcGain(param) +
+                                                              c.CalcGain(param) -
+                                                              snode[nid].root_gain);
+                  e.best.Update(loss_chg, fid, (fvalue + e.first_fvalue) * 0.5f, true);
+                }
+              }
+            }
+            e.stats.Add(gpair, info, ridx);
+            e.first_fvalue = fvalue;
+          }
+        }
+      }
+    }
+    // update enumeration solution
+    // Feed one observation of node nid into the running scan state kept in temp and,
+    // when its feature value is far enough from the previously seen one, propose the
+    // midpoint between the two values as a split candidate.
+    //   nid:    node the observation belongs to
+    //   gstats: gradient pair added to the scan statistics of the node
+    //   fvalue: feature value of the observation
+    //   d_step: scan direction; d_step == -1 is forwarded as the last argument of
+    //           best.Update (presumably the default-left flag)
+    //   fid:    feature index stored with the candidate
+    //   c:      scratch statistics, overwritten with (node total - scanned part)
+    //   temp:   per-node scan entries, indexed by nid
+    inline void UpdateEnumeration(int nid, bst_gpair gstats,
+                                  float fvalue, int d_step, bst_uint fid,
+                                  TStats &c, std::vector<ThreadEntry> &temp) { // NOLINT(*)
+      ThreadEntry &entry = temp[nid];
+      // an entry whose statistics are still empty has no previous value to split against
+      const bool has_previous = !entry.stats.Empty();
+      if (has_previous &&
+          std::abs(fvalue - entry.last_fvalue) > rt_2eps &&
+          entry.stats.sum_hess >= param.min_child_weight) {
+        c.SetSubstract(snode[nid].stats, entry.stats);
+        if (c.sum_hess >= param.min_child_weight) {
+          const bst_float loss_chg = static_cast<bst_float>(entry.stats.CalcGain(param) +
+                                                            c.CalcGain(param) -
+                                                            snode[nid].root_gain);
+          entry.best.Update(loss_chg, fid, (fvalue + entry.last_fvalue) * 0.5f, d_step == -1);
+        }
+      }
+      // whether or not a candidate was proposed, the observation joins the scan
+      entry.stats.Add(gstats);
+      entry.last_fvalue = fvalue;
+    }
+    // same as EnumerateSplit, with cacheline prefetch optimization
+    // Scan the entries in [begin, end) of one feature column in steps of d_step and
+    // propose split candidates for every node in qexpand_. Row positions and gradient
+    // pairs are first gathered into small local buffers, then consumed through
+    // UpdateEnumeration.
+    //   begin, end: first entry and one-past-last entry in scan direction
+    //   d_step:     +1 or -1 pointer increment; d_step == -1 is forwarded to best.Update
+    //   fid:        feature index stored with each candidate
+    //   gpair:      gradient pairs indexed by row index
+    //   temp:       per-node scan entries of the calling thread, indexed by nid
+    inline void EnumerateSplitCacheOpt(const ColBatch::Entry *begin,
+                                       const ColBatch::Entry *end,
+                                       int d_step,
+                                       bst_uint fid,
+                                       const std::vector<bst_gpair> &gpair,
+                                       std::vector<ThreadEntry> &temp) { // NOLINT(*)
+      const std::vector<int> &qexpand = qexpand_;
+      // clear all the temp statistics
+      for (size_t j = 0; j < qexpand.size(); ++j) {
+        temp[qexpand[j]].stats.Clear();
+      }
+      // scratch statistics: receives (node total - scanned part) via SetSubstract
+      TStats c(param);
+      // local cache buffer for position and gradient pair
+      const int kBuffer = 32;
+      int buf_position[kBuffer];
+      bst_gpair buf_gpair[kBuffer];
+      // aligned ending position: begin advanced by the largest multiple of kBuffer
+      // entries that still fits in the range, in scan direction
+      const ColBatch::Entry *align_end;
+      if (d_step > 0) {
+        align_end = begin + (end - begin) / kBuffer * kBuffer;
+      } else {
+        align_end = begin - (begin - end) / kBuffer * kBuffer;
+      }
+      int i;
+      const ColBatch::Entry *it;
+      const int align_step = d_step * kBuffer;
+      // internal cached loop: handle kBuffer entries per iteration
+      for (it = begin; it != align_end; it += align_step) {
+        const ColBatch::Entry *p;
+        // pass 1: gather position and gradient pair of the next kBuffer rows
+        for (i = 0, p = it; i < kBuffer; ++i, p += d_step) {
+          buf_position[i] = position[p->index];
+          buf_gpair[i] = gpair[p->index];
+        }
+        // pass 2: consume the buffered values; rows with negative position are skipped
+        for (i = 0, p = it; i < kBuffer; ++i, p += d_step) {
+          const int nid = buf_position[i];
+          if (nid < 0) continue;
+          this->UpdateEnumeration(nid, buf_gpair[i],
+                                  p->fvalue, d_step,
+                                  fid, c, temp);
+        }
+      }
+      // finish up the ending piece: fewer than kBuffer entries remain, so the
+      // same buffers are reused with the same gather-then-consume scheme
+      for (it = align_end, i = 0; it != end; ++i, it += d_step) {
+        buf_position[i] = position[it->index];
+        buf_gpair[i] = gpair[it->index];
+      }
+      for (it = align_end, i = 0; it != end; ++i, it += d_step) {
+        const int nid = buf_position[i];
+        if (nid < 0) continue;
+        this->UpdateEnumeration(nid, buf_gpair[i],
+                                it->fvalue, d_step,
+                                fid, c, temp);
+      }
+      // finish updating all statistics, check if it is possible to include all sum statistics
+      for (size_t i = 0; i < qexpand.size(); ++i) {
+        const int nid = qexpand[i];
+        ThreadEntry &e = temp[nid];
+        c.SetSubstract(snode[nid].stats, e.stats);
+        if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) {
+          bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) +
+                                                      c.CalcGain(param) - snode[nid].root_gain);
+          // place the candidate beyond the last scanned value, in scan direction,
+          // at a distance of |last_fvalue| + rt_eps
+          const float gap = std::abs(e.last_fvalue) + rt_eps;
+          const float delta = d_step == +1 ? gap: -gap;
+          e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1);
+        }
+      }
+    }
+
+    // enumerate the split values of specific feature
+    // Scan the entries in [begin, end) of one feature column in steps of d_step and
+    // record, per node in qexpand_, the best split candidate found in temp.
+    //   begin, end: first entry and one-past-last entry in scan direction
+    //   d_step:     +1 or -1 pointer increment; d_step == -1 is forwarded to best.Update
+    //   fid:        feature index stored with each candidate
+    //   gpair:      gradient pairs indexed by row index
+    //   info:       extra information handed to TStats::Add
+    //   temp:       per-node scan entries of the calling thread, indexed by nid
+    inline void EnumerateSplit(const ColBatch::Entry *begin,
+                               const ColBatch::Entry *end,
+                               int d_step,
+                               bst_uint fid,
+                               const std::vector<bst_gpair> &gpair,
+                               const BoosterInfo &info,
+                               std::vector<ThreadEntry> &temp) { // NOLINT(*)
+      // simple statistics with cache_opt enabled take the buffered variant
+      if (TStats::kSimpleStats != 0 && param.cache_opt != 0) {
+        EnumerateSplitCacheOpt(begin, end, d_step, fid, gpair, temp);
+        return;
+      }
+      const std::vector<int> &nodes = qexpand_;
+      const size_t num_nodes = nodes.size();
+      // start every node of this level from empty scan statistics
+      for (size_t k = 0; k < num_nodes; ++k) {
+        temp[nodes[k]].stats.Clear();
+      }
+      // scratch statistics: receives (node total - scanned part)
+      TStats rest(param);
+      for (const ColBatch::Entry *cur = begin; cur != end; cur += d_step) {
+        const bst_uint ridx = cur->index;
+        const int nid = position[ridx];
+        // rows with negative position take no part in the scan
+        if (nid < 0) continue;
+        const float fvalue = cur->fvalue;
+        ThreadEntry &entry = temp[nid];
+        // a candidate needs a previously seen value that is far enough away
+        if (!entry.stats.Empty() &&
+            std::abs(fvalue - entry.last_fvalue) > rt_2eps &&
+            entry.stats.sum_hess >= param.min_child_weight) {
+          rest.SetSubstract(snode[nid].stats, entry.stats);
+          if (rest.sum_hess >= param.min_child_weight) {
+            const bst_float loss_chg = static_cast<bst_float>(entry.stats.CalcGain(param) +
+                                                              rest.CalcGain(param) -
+                                                              snode[nid].root_gain);
+            entry.best.Update(loss_chg, fid, (fvalue + entry.last_fvalue) * 0.5f, d_step == -1);
+          }
+        }
+        // the row always joins the scan statistics afterwards
+        entry.stats.Add(gpair, info, ridx);
+        entry.last_fvalue = fvalue;
+      }
+      // after the scan, try the candidate that keeps everything scanned on one side
+      for (size_t k = 0; k < num_nodes; ++k) {
+        const int nid = nodes[k];
+        ThreadEntry &entry = temp[nid];
+        rest.SetSubstract(snode[nid].stats, entry.stats);
+        if (entry.stats.sum_hess >= param.min_child_weight &&
+            rest.sum_hess >= param.min_child_weight) {
+          const bst_float loss_chg = static_cast<bst_float>(entry.stats.CalcGain(param) +
+                                                            rest.CalcGain(param) -
+                                                            snode[nid].root_gain);
+          // step past the last scanned value, in scan direction
+          const float gap = std::abs(entry.last_fvalue) + rt_eps;
+          float delta = -gap;
+          if (d_step == +1) {
+            delta = gap;
+          }
+          entry.best.Update(loss_chg, fid, entry.last_fvalue + delta, d_step == -1);
+        }
+      }
+    }
+
+    // update the solution candidate
+    // Search split candidates in every column of one column batch.
+    //   batch: columns to enumerate
+    //   gpair: gradient pairs indexed by row index
+    //   fmat:  feature matrix, queried for the density of each column
+    //   info:  extra information handed on to the enumeration routines
+    // param.parallel_option selects the strategy: 0 spreads whole columns over the
+    // threads, any other value calls ParallelFindSplit column by column, and 2
+    // chooses between the two from the column count and the thread count.
+    virtual void UpdateSolution(const ColBatch &batch,
+                                const std::vector<bst_gpair> &gpair,
+                                const IFMatrix &fmat,
+                                const BoosterInfo &info) {
+      const bst_omp_uint ncol = static_cast<bst_omp_uint>(batch.size);
+      #if defined(_OPENMP)
+      const int batch_size = std::max(static_cast<int>(ncol / this->nthread / 32), 1);
+      #endif
+      int mode = param.parallel_option;
+      if (mode == 2) {
+        if (static_cast<int>(ncol) * 2 < nthread) {
+          mode = 1;
+        } else {
+          mode = 0;
+        }
+      }
+      if (mode != 0) {
+        // one column at a time, each handled by ParallelFindSplit
+        for (bst_omp_uint k = 0; k < ncol; ++k) {
+          this->ParallelFindSplit(batch[k], batch.col_index[k],
+                                  fmat, gpair, info);
+        }
+        return;
+      }
+      // one column per loop iteration, each thread working in its own stemp slot
+      #pragma omp parallel for schedule(dynamic, batch_size)
+      for (bst_omp_uint k = 0; k < ncol; ++k) {
+        const bst_uint fid = batch.col_index[k];
+        const int tid = omp_get_thread_num();
+        const ColBatch::Inst col = batch[k];
+        // true when the column is non-empty and its first and last values are equal
+        const bool same_ends = col.length != 0 &&
+                               col.data[0].fvalue == col.data[col.length - 1].fvalue;
+        if (param.need_forward_search(fmat.GetColDensity(fid), same_ends)) {
+          this->EnumerateSplit(col.data, col.data + col.length, +1,
+                               fid, gpair, info, stemp[tid]);
+        }
+        if (param.need_backward_search(fmat.GetColDensity(fid), same_ends)) {
+          this->EnumerateSplit(col.data + col.length - 1, col.data - 1, -1,
+                               fid, gpair, info, stemp[tid]);
+        }
+      }
+    }
+    // find splits at current level, do split per level
+    // Find the best split of every node in qexpand and write the outcome into the tree.
+    //   depth:   current depth (not read by this function)
+    //   qexpand: nodes to be expanded at this level
+    //   gpair:   gradient pairs indexed by row index
+    //   p_fmat:  feature matrix providing the column iterator
+    //   info:    extra information handed on to UpdateSolution
+    //   p_tree:  tree receiving the new splits and leaves
+    inline void FindSplit(int depth,
+                          const std::vector<int> &qexpand,
+                          const std::vector<bst_gpair> &gpair,
+                          IFMatrix *p_fmat,
+                          const BoosterInfo &info,
+                          RegTree *p_tree) {
+      // features searched at this level; optionally a random subset of feat_index
+      std::vector<bst_uint> feat_set = feat_index;
+      if (param.colsample_bylevel != 1.0f) {
+        random::Shuffle(feat_set);
+        const unsigned nkeep = static_cast<unsigned>(param.colsample_bylevel * feat_index.size());
+        utils::Check(nkeep > 0, "colsample_bylevel is too small that no feature can be included");
+        feat_set.resize(nkeep);
+      }
+      for (utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(feat_set); iter->Next();) {
+        this->UpdateSolution(iter->Value(), gpair, *p_fmat, info);
+      }
+      // each thread's stemp now holds its own best candidates; merge them into snode
+      this->SyncBestSolution(qexpand);
+      RegTree &tree = *p_tree;
+      const size_t nexpand = qexpand.size();
+      for (size_t k = 0; k < nexpand; ++k) {
+        const int nid = qexpand[k];
+        NodeEntry &e = snode[nid];
+        if (!(e.best.loss_chg > rt_eps)) {
+          // no candidate improves the loss enough: the node becomes a leaf
+          tree[nid].set_leaf(e.weight * param.learning_rate);
+          continue;
+        }
+        tree.AddChilds(nid);
+        tree[nid].set_split(e.best.split_index(), e.best.split_value, e.best.default_left());
+        // both children start as leaves whose right-child field is 0, which is how
+        // ResetPosition tells a fresh leaf from a finished one (cright() == -1)
+        tree[tree[nid].cleft()].set_leaf(0.0f, 0);
+        tree[tree[nid].cright()].set_leaf(0.0f, 0);
+      }
+    }
+    // reset position of each data points after split is created in the tree
+    // Reassign every buffered row to its node after the splits of this level have
+    // been written into the tree.
+    //   qexpand: nodes expanded at this level
+    //   p_fmat:  feature matrix; supplies the buffered row set
+    //   tree:    tree holding the newly created splits
+    // Rows are first routed by SetNonDefaultPosition; a row still sitting on a split
+    // node afterwards is pushed to that node's default child.
+    inline void ResetPosition(const std::vector<int> &qexpand,
+                              IFMatrix *p_fmat, const RegTree &tree) {
+      // set the positions in the nondefault
+      this->SetNonDefaultPosition(qexpand, p_fmat, tree);
+      // set rest of instances to default position
+      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
+      // set default direct nodes to default
+      // for leaf nodes that are not fresh, mark them to ~nid,
+      // so that they are ignored in future statistics collection
+      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < ndata; ++i) {
+        const bst_uint ridx = rowset[i];
+        if (ridx >= position.size()) {
+          utils::Printf("ridx exceed bound\n");
+          // position has no slot for this row: report it and skip the row
+          // instead of reading and writing past the end of the vector
+          continue;
+        }
+        const int nid = this->DecodePosition(ridx);
+        if (tree[nid].is_leaf()) {
+          // mark finish when it is not a fresh leaf
+          if (tree[nid].cright() == -1) {
+            position[ridx] = ~nid;
+          }
+        } else {
+          // push to default branch
+          if (tree[nid].default_left()) {
+            this->SetEncodePosition(ridx, tree[nid].cleft());
+          } else {
+            this->SetEncodePosition(ridx, tree[nid].cright());
+          }
+        }
+      }
+    }
+    // customization part
+    // synchronize the best solution of each node
+    // Merge the per-thread best candidates of every node in qexpand into snode.
+    // Threads are visited in increasing tid order for each node.
+    virtual void SyncBestSolution(const std::vector<int> &qexpand) {
+      const size_t nexpand = qexpand.size();
+      for (size_t k = 0; k < nexpand; ++k) {
+        const int nid = qexpand[k];
+        NodeEntry &node = snode[nid];
+        int tid = 0;
+        while (tid < this->nthread) {
+          node.best.Update(stemp[tid][nid].best);
+          ++tid;
+        }
+      }
+    }
+    // Route every row that has a value for the split feature of its node to the
+    // left or right child of that node.
+    //   qexpand: nodes expanded at this level
+    //   p_fmat:  feature matrix providing column access
+    //   tree:    tree holding the newly created splits
+    virtual void SetNonDefaultPosition(const std::vector<int> &qexpand,
+                                       IFMatrix *p_fmat, const RegTree &tree) {
+      // collect the split features of all nodes that were actually split
+      std::vector<unsigned> split_feats;
+      for (std::vector<int>::const_iterator it = qexpand.begin(); it != qexpand.end(); ++it) {
+        if (tree[*it].is_leaf()) continue;
+        split_feats.push_back(tree[*it].split_index());
+      }
+      // keep each feature once, in ascending order
+      std::sort(split_feats.begin(), split_feats.end());
+      split_feats.erase(std::unique(split_feats.begin(), split_feats.end()), split_feats.end());
+
+      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(split_feats);
+      while (iter->Next()) {
+        const ColBatch &batch = iter->Value();
+        for (size_t c = 0; c < batch.size; ++c) {
+          ColBatch::Inst col = batch[c];
+          const bst_uint fid = batch.col_index[c];
+          const bst_omp_uint nentry = static_cast<bst_omp_uint>(col.length);
+          #pragma omp parallel for schedule(static)
+          for (bst_omp_uint j = 0; j < nentry; ++j) {
+            const bst_uint ridx = col[j].index;
+            const int nid = this->DecodePosition(ridx);
+            const float fvalue = col[j].fvalue;
+            // only rows whose node splits on this very feature are moved
+            if (tree[nid].is_leaf() || tree[nid].split_index() != fid) continue;
+            const int child = fvalue < tree[nid].split_cond() ? tree[nid].cleft()
+                                                              : tree[nid].cright();
+            this->SetEncodePosition(ridx, child);
+          }
+        }
+      }
+    }
+    // utils to get/set position, with encoded format
+    // return decoded position
+    // Return the node id stored for row ridx; a negative entry holds ~nid and is
+    // flipped back before being returned.
+    inline int DecodePosition(bst_uint ridx) const {
+      const int stored = position[ridx];
+      if (stored >= 0) {
+        return stored;
+      }
+      return ~stored;
+    }
+    // encode the encoded position value for ridx
+    // Store node id nid for row ridx, keeping the sign of the current entry:
+    // a negative entry is replaced by ~nid, a non-negative one by nid.
+    inline void SetEncodePosition(bst_uint ridx, int nid) {
+      int &slot = position[ridx];
+      slot = (slot < 0) ? ~nid : nid;
+    }
+    //  --data fields--
+    // training parameters; held by reference, so the referenced object
+    // must stay alive as long as this builder is used
+    const TrainParam &param;
+    // number of omp thread used during training
+    int nthread;
+    // Per feature: shuffle index of each feature index
+    std::vector<bst_uint> feat_index;
+    // Instance Data: current node position in the tree of each instance;
+    // a negative entry stores ~nid (see DecodePosition / SetEncodePosition)
+    std::vector<int> position;
+    // PerThread x PerTreeNode: statistics for per thread construction,
+    // indexed as stemp[tid][nid]
+    std::vector< std::vector<ThreadEntry> > stemp;
+    /*! \brief TreeNode Data: statistics for each constructed node, indexed by nid */
+    std::vector<NodeEntry> snode;
+    /*! \brief queue of nodes to be expanded */
+    std::vector<int> qexpand_;
+  };
+};
+
+}  // namespace tree
+}  // namespace xgboost
+#endif  // XGBOOST_TREE_UPDATER_COLMAKER_INL_HPP_
--- a/old_src/tree/updater_distcol-inl.hpp
+++ b/old_src/tree/updater_distcol-inl.hpp
@@ -0,0 +1,175 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file updater_distcol-inl.hpp
+ * \brief beta distributed version that takes a sub-column
+ *        and construct a tree
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_
+#define XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_
+
+#include <vector>
+#include <algorithm>
+#include "../sync/sync.h"
+#include "../utils/bitmap.h"
+#include "../utils/io.h"
+#include "./updater_colmaker-inl.hpp"
+#include "./updater_prune-inl.hpp"
+
+namespace xgboost {
+namespace tree {
+/*!
+ * \brief column-wise tree updater for the distributed setting
+ *  builds one tree with a ColMaker-style builder, prunes it with TreePruner,
+ *  and exchanges row routing and best splits through rabit Allreduce
+ * \tparam TStats statistics type used by the underlying ColMaker builder
+ */
+template<typename TStats>
+class DistColMaker : public ColMaker<TStats> {
+ public:
+  // param is declared before builder, so the reference handed to builder is
+  // bound to an already existing member
+  DistColMaker(void) : builder(param) {}
+  virtual ~DistColMaker(void) {}
+  // set training parameter; forwarded to both the training parameters and the pruner
+  virtual void SetParam(const char *name, const char *val) {
+    param.SetParam(name, val);
+    pruner.SetParam(name, val);
+  }
+  /*!
+   * \brief build, prune and finalize a single tree
+   * \param gpair gradient pairs indexed by row index
+   * \param p_fmat feature matrix
+   * \param info extra information handed to the builder and the pruner
+   * \param trees trees to update; must contain exactly one tree
+   */
+  virtual void Update(const std::vector<bst_gpair> &gpair,
+                      IFMatrix *p_fmat,
+                      const BoosterInfo &info,
+                      const std::vector<RegTree*> &trees) {
+    TStats::CheckInfo(info);
+    utils::Check(trees.size() == 1, "DistColMaker: only support one tree at a time");
+    // build the tree
+    builder.Update(gpair, p_fmat, info, trees[0]);
+    // prune the tree, note that pruner will sync the tree
+    pruner.Update(gpair, p_fmat, info, trees);
+    // update position after the tree is pruned
+    builder.UpdatePosition(p_fmat, *trees[0]);
+  }
+  // pointer to the first element of the builder's position array
+  virtual const int* GetLeafPosition(void) const {
+    return builder.GetLeafPosition();
+  }
+
+ private:
+  // builder that replaces the two synchronization hooks of ColMaker's builder
+  // with versions that communicate through rabit
+  struct Builder : public ColMaker<TStats>::Builder {
+   public:
+    explicit Builder(const TrainParam &param)
+        : ColMaker<TStats>::Builder(param) {
+    }
+    // For every buffered row, walk up from its decoded node through the parents
+    // while the node is marked deleted, and store the resulting node id
+    // (plain, not encoded) in position.
+    inline void UpdatePosition(IFMatrix *p_fmat, const RegTree &tree) {
+      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
+      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < ndata; ++i) {
+        const bst_uint ridx = rowset[i];
+        int nid = this->DecodePosition(ridx);
+        while (tree[nid].is_deleted()) {
+          nid = tree[nid].parent();
+          utils::Assert(nid >=0, "distributed learning error");
+        }
+        this->position[ridx] = nid;
+      }
+    }
+    // pointer to the first element of position, obtained through BeginPtr
+    virtual const int* GetLeafPosition(void) const {
+      return BeginPtr(this->position);
+    }
+
+   protected:
+    // Route rows to the non-default child of their node. The routing decision is
+    // computed from the locally visible columns, merged over all workers with a
+    // bitwise-OR Allreduce, and then applied to every buffered row.
+    virtual void SetNonDefaultPosition(const std::vector<int> &qexpand,
+                                       IFMatrix *p_fmat, const RegTree &tree) {
+      // step 2, classify the non-default data into right places
+      std::vector<unsigned> fsplits;
+      for (size_t i = 0; i < qexpand.size(); ++i) {
+        const int nid = qexpand[i];
+        if (!tree[nid].is_leaf()) {
+          fsplits.push_back(tree[nid].split_index());
+        }
+      }
+      // get the candidate split index
+      std::sort(fsplits.begin(), fsplits.end());
+      fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
+      // fsplits is sorted, so indices that are not below NumCol() sit at the back
+      while (fsplits.size() != 0 && fsplits.back() >= p_fmat->NumCol()) {
+        fsplits.pop_back();
+      }
+      // bitmap is only word concurrent, set to bool first
+      {
+        bst_omp_uint ndata = static_cast<bst_omp_uint>(this->position.size());
+        boolmap.resize(ndata);
+        #pragma omp parallel for schedule(static)
+        for (bst_omp_uint j = 0; j < ndata; ++j) {
+            boolmap[j] = 0;
+        }
+      }
+      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
+      while (iter->Next()) {
+        const ColBatch &batch = iter->Value();
+        for (size_t i = 0; i < batch.size; ++i) {
+          ColBatch::Inst col = batch[i];
+          const bst_uint fid = batch.col_index[i];
+          const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
+          #pragma omp parallel for schedule(static)
+          for (bst_omp_uint j = 0; j < ndata; ++j) {
+            const bst_uint ridx = col[j].index;
+            const float fvalue = col[j].fvalue;
+            const int nid = this->DecodePosition(ridx);
+            if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
+              // flag the row only when its value sends it to the child that is
+              // not the default direction of the node
+              if (fvalue < tree[nid].split_cond()) {
+                if (!tree[nid].default_left()) boolmap[ridx] = 1;
+              } else {
+                if (tree[nid].default_left()) boolmap[ridx] = 1;
+              }
+            }
+          }
+        }
+      }
+
+      bitmap.InitFromBool(boolmap);
+      // communicate bitmap
+      rabit::Allreduce<rabit::op::BitOR>(BeginPtr(bitmap.data), bitmap.data.size());
+      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
+      // get the new position
+      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < ndata; ++i) {
+        const bst_uint ridx = rowset[i];
+        const int nid = this->DecodePosition(ridx);
+        if (bitmap.Get(ridx)) {
+          utils::Assert(!tree[nid].is_leaf(), "inconsistent reduce information");
+          // a flagged row goes to the child opposite to the default direction
+          if (tree[nid].default_left()) {
+            this->SetEncodePosition(ridx, tree[nid].cright());
+          } else {
+            this->SetEncodePosition(ridx, tree[nid].cleft());
+          }
+        }
+      }
+    }
+    // synchronize the best solution of each node:
+    // merge the per-thread candidates locally, reduce them with reducer.Allreduce,
+    // then write the reduced entries back into snode
+    virtual void SyncBestSolution(const std::vector<int> &qexpand) {
+      std::vector<SplitEntry> vec;
+      for (size_t i = 0; i < qexpand.size(); ++i) {
+        const int nid = qexpand[i];
+        for (int tid = 0; tid < this->nthread; ++tid) {
+          this->snode[nid].best.Update(this->stemp[tid][nid].best);
+        }
+        vec.push_back(this->snode[nid].best);
+      }
+      // TODO(tqchen) lazy version
+      // communicate best solution
+      reducer.Allreduce(BeginPtr(vec), vec.size());
+      // assign solution back; vec[i] belongs to qexpand[i]
+      for (size_t i = 0; i < qexpand.size(); ++i) {
+        const int nid = qexpand[i];
+        this->snode[nid].best = vec[i];
+      }
+    }
+
+   private:
+    // packed form of boolmap, the buffer that is actually all-reduced
+    utils::BitMap bitmap;
+    // one flag per entry of position: nonzero when the row takes the non-default branch
+    std::vector<int> boolmap;
+    // reducer used to combine SplitEntry values in SyncBestSolution
+    rabit::Reducer<SplitEntry, SplitEntry::Reduce> reducer;
+  };
+  // we directly introduce pruner here
+  TreePruner pruner;
+  // training parameter
+  TrainParam param;
+  // the builder; it keeps a reference to param, which is declared above it
+  Builder builder;
+};
+}  // namespace tree
+}  // namespace xgboost
+#endif  // XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_
--- a/old_src/tree/updater_histmaker-inl.hpp
+++ b/old_src/tree/updater_histmaker-inl.hpp
@@ -0,0 +1,769 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file updater_histmaker-inl.hpp
+ * \brief use histogram counting to construct a tree
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_
+#define XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_
+
+#include <vector>
+#include <algorithm>
+#include "../sync/sync.h"
+#include "../utils/quantile.h"
+#include "../utils/group_data.h"
+#include "./updater_basemaker-inl.hpp"
+
+namespace xgboost {
+namespace tree {
+template<typename TStats>
+class HistMaker: public BaseMaker {
+ public:
+  virtual ~HistMaker(void) {}
+  /*!
+   * \brief update all trees, one after another
+   *  the learning rate is divided by the number of trees while they are built
+   *  and set back to its previous value afterwards
+   * \param gpair gradient pairs indexed by row index
+   * \param p_fmat feature matrix
+   * \param info extra information, checked by TStats::CheckInfo
+   * \param trees trees to be updated
+   */
+  virtual void Update(const std::vector<bst_gpair> &gpair,
+                      IFMatrix *p_fmat,
+                      const BoosterInfo &info,
+                      const std::vector<RegTree*> &trees) {
+    TStats::CheckInfo(info);
+    // remember the configured rate so it can be put back after the loop
+    const float saved_rate = param.learning_rate;
+    param.learning_rate = saved_rate / trees.size();
+    for (std::vector<RegTree*>::const_iterator it = trees.begin(); it != trees.end(); ++it) {
+      this->Update(gpair, p_fmat, info, *it);
+    }
+    param.learning_rate = saved_rate;
+  }
+
+ protected:
+  /*! \brief a single histogram */
+  struct HistUnit {
+    /*! \brief cutting point of histogram, contains maximum point */
+    const bst_float *cut;
+    /*! \brief content of statistics data */
+    TStats *data;
+    /*! \brief size of histogram */
+    unsigned size;
+    /*! \brief default constructor: an empty unit, null pointers and size 0 */
+    HistUnit(void) : cut(), data(), size() {}
+    /*!
+     * \brief constructor over existing storage; the unit does not own it
+     * \param cut pointer to the first of size cut points
+     * \param data pointer to the first of size statistics entries
+     * \param size number of bins
+     */
+    HistUnit(const bst_float *cut, TStats *data, unsigned size)
+        : cut(cut), data(data), size(size) {}
+    /*!
+     * \brief add the statistics of one row to the bin its feature value falls into
+     * \param fv feature value of the row
+     * \param gpair gradient pairs indexed by row index
+     * \param info extra information handed to TStats::Add
+     * \param ridx index of the row
+     */
+    inline void Add(bst_float fv,
+                    const std::vector<bst_gpair> &gpair,
+                    const BoosterInfo &info,
+                    const bst_uint ridx) {
+      // reject an empty unit before any pointer is touched
+      utils::Assert(size != 0, "try insert into size=0");
+      // index of the first cut point that is strictly greater than fv
+      const unsigned i = static_cast<unsigned>(std::upper_bound(cut, cut + size, fv) - cut);
+      utils::Assert(i < size,
+                    "maximum value must be in cut, fv = %g, cutmax=%g", fv, cut[size-1]);
+      data[i].Add(gpair, info, ridx);
+    }
+  };
+  /*! \brief a set of histograms from different index */
+  struct HistSet {
+    /*! \brief the index pointer of each histunit */
+    const unsigned *rptr;
+    /*! \brief cutting points in each histunit */
+    const bst_float *cut;
+    /*! \brief data in different hist unit */
+    std::vector<TStats> data;
+    /*!
+     * \brief get the histogram unit at position fid
+     * \param fid index into rptr; the unit covers entries [rptr[fid], rptr[fid+1])
+     * \return a HistUnit viewing the matching slices of cut and data
+     */
+    inline HistUnit operator[](size_t fid) {
+      // BeginPtr is the helper this file already uses to obtain the start of a
+      // vector; unlike &data[0] it does not index into an empty vector
+      return HistUnit(cut + rptr[fid],
+                      BeginPtr(data) + rptr[fid],
+                      rptr[fid+1] - rptr[fid]);
+    }
+  };
+  // thread workspace
+  // workspace holding the cut layout shared by all threads and one HistSet per thread
+  struct ThreadWSpace {
+    /*! \brief actual unit pointer */
+    std::vector<unsigned> rptr;
+    /*! \brief cut field */
+    std::vector<bst_float> cut;
+    // per thread histset
+    std::vector<HistSet> hset;
+    /*!
+     * \brief prepare one histogram set per thread for the current cut layout
+     * \param param training parameters used to construct new statistics entries
+     * \param nthread number of per-thread histogram sets
+     */
+    inline void Init(const TrainParam &param, int nthread) {
+      hset.resize(nthread);
+      for (int tid = 0; tid < nthread; ++tid) {
+        HistSet &hs = hset[tid];
+        // wipe whatever the set accumulated before
+        const size_t nstat = hs.data.size();
+        for (size_t k = 0; k < nstat; ++k) {
+          hs.data[k].Clear();
+        }
+        // every set views the shared rptr / cut arrays
+        hs.rptr = BeginPtr(rptr);
+        hs.cut = BeginPtr(cut);
+        // one statistics entry per cut point
+        hs.data.resize(cut.size(), TStats(param));
+      }
+    }
+    /*! \brief add the statistics of every other thread into hset[0] */
+    inline void Aggregate(void) {
+      const bst_omp_uint nsize = static_cast<bst_omp_uint>(cut.size());
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < nsize; ++i) {
+        size_t tid = 1;
+        while (tid < hset.size()) {
+          hset[0].data[i].Add(hset[tid].data[i]);
+          ++tid;
+        }
+      }
+    }
+    /*! \brief clear the workspace: no cut points, rptr reduced to a single 0 */
+    inline void Clear(void) {
+      cut.clear();
+      rptr.resize(1);
+      rptr.front() = 0;
+    }
+    /*! \brief total size, that is the number of units described by rptr */
+    inline size_t Size(void) const {
+      return rptr.size() - 1;
+    }
+  };
+  // workspace of thread
+  ThreadWSpace wspace;
+  // reducer for histogram
+  rabit::Reducer<TStats, TStats::Reduce> histred;
+  // set of working features
+  std::vector<bst_uint> fwork_set;
+  // update function implementation
+  /*!
+   * \brief grow a single tree level by level, for at most param.max_depth levels
+   * \param gpair gradient pairs indexed by row index
+   * \param p_fmat feature matrix
+   * \param info extra information; its root_index is passed to InitData
+   * \param p_tree tree to be grown
+   */
+  virtual void Update(const std::vector<bst_gpair> &gpair,
+                      IFMatrix *p_fmat,
+                      const BoosterInfo &info,
+                      RegTree *p_tree) {
+    RegTree &tree = *p_tree;
+    this->InitData(gpair, *p_fmat, info.root_index, tree);
+    this->InitWorkSet(p_fmat, tree, &fwork_set);
+    int depth = 0;
+    while (depth < param.max_depth) {
+      // refresh positions and propose the candidate cuts of this level
+      this->ResetPosAndPropose(gpair, p_fmat, info, fwork_set, tree);
+      // accumulate the histograms over the proposed cuts
+      this->CreateHist(gpair, p_fmat, info, fwork_set, tree);
+      // choose the splits from the histogram statistics
+      this->FindSplit(depth, gpair, p_fmat, info, fwork_set, p_tree);
+      // let the implementation adjust positions, then refresh the expand queue
+      this->ResetPositionAfterSplit(p_fmat, tree);
+      this->UpdateQueueExpand(tree);
+      // stop as soon as no node is left to expand
+      if (qexpand.size() == 0) break;
+      ++depth;
+    }
+    // nodes still queued when the loop ends are turned into leaves
+    const size_t nleft = qexpand.size();
+    for (size_t k = 0; k < nleft; ++k) {
+      const int nid = qexpand[k];
+      tree[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
+    }
+  }
+  // this function does two jobs
+  // (1) reset the position in array position, to be the latest leaf id
+  // (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly
+  // pure virtual hook, called by Update at the start of every level, before CreateHist
+  //   gpair:  gradient pairs indexed by row index
+  //   p_fmat: feature matrix
+  //   info:   extra information passed through from Update
+  //   fset:   working set of feature indices
+  //   tree:   tree in its current state
+  virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
+                                  IFMatrix *p_fmat,
+                                  const BoosterInfo &info,
+                                  const std::vector <bst_uint> &fset,
+                                  const RegTree &tree) = 0;
+  // initialize the current working set of features in this round
+  // Fill *p_fset with the indices 0 .. tree.param.num_feature - 1, so that every
+  // feature is part of the working set. p_fmat is not used by this implementation.
+  virtual void InitWorkSet(IFMatrix *p_fmat,
+                           const RegTree &tree,
+                           std::vector<bst_uint> *p_fset) {
+    std::vector<bst_uint> &fset = *p_fset;
+    fset.resize(tree.param.num_feature);
+    const size_t nfeat = fset.size();
+    for (size_t fid = 0; fid < nfeat; ++fid) {
+      fset[fid] = static_cast<unsigned>(fid);
+    }
+  }
+  // reset position after split, this is not a must, depending on implementation
+  // called by Update after FindSplit at every level; the default does nothing
+  virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
+                                       const RegTree &tree) {
+    // intentionally empty: subclasses override this when they need it
+  }
+  // pure virtual hook that creates the histograms of the current level;
+  // Update calls it after ResetPosAndPropose and before FindSplit
+  //   gpair:  gradient pairs indexed by row index
+  //   p_fmat: feature matrix
+  //   info:   extra information passed through from Update
+  //   fset:   working set of feature indices
+  //   tree:   tree in its current state
+  virtual void CreateHist(const std::vector<bst_gpair> &gpair,
+                          IFMatrix *p_fmat,
+                          const BoosterInfo &info,
+                          const std::vector <bst_uint> &fset,
+                          const RegTree &tree)  = 0;
+
+ private:
+  /*!
+   * \brief scan the histogram of one feature in both directions and
+   *  update the best split candidate of the node
+   * \param hist histogram of the feature (cut points and per-bin statistics)
+   * \param node_sum total statistics of the node
+   * \param fid feature id recorded in the split candidate
+   * \param best best split so far, updated in place
+   * \param left_sum overwritten whenever best is improved
+   */
+  inline void EnumerateSplit(const HistUnit &hist,
+                             const TStats &node_sum,
+                             bst_uint fid,
+                             SplitEntry *best,
+                             TStats *left_sum) {
+    const bst_uint nbin = hist.size;
+    if (nbin == 0) return;
+
+    const double root_gain = node_sum.CalcGain(param);
+    TStats acc(param), rest(param);
+    // forward scan: acc accumulates bins [0, i], candidates use default_left = false
+    for (bst_uint i = 0; i < nbin; ++i) {
+      acc.Add(hist.data[i]);
+      if (!(acc.sum_hess >= param.min_child_weight)) continue;
+      rest.SetSubstract(node_sum, acc);
+      if (!(rest.sum_hess >= param.min_child_weight)) continue;
+      const double loss_chg = acc.CalcGain(param) + rest.CalcGain(param) - root_gain;
+      if (best->Update(static_cast<float>(loss_chg), fid, hist.cut[i], false)) {
+        *left_sum = acc;
+      }
+    }
+    acc.Clear();
+    // backward scan: acc accumulates bins [i, nbin), candidates use default_left = true;
+    // bin 0 is never added, so the loop stops before i reaches 0
+    for (bst_uint i = nbin - 1; i != 0; --i) {
+      acc.Add(hist.data[i]);
+      if (!(acc.sum_hess >= param.min_child_weight)) continue;
+      rest.SetSubstract(node_sum, acc);
+      if (!(rest.sum_hess >= param.min_child_weight)) continue;
+      const double loss_chg = acc.CalcGain(param) + rest.CalcGain(param) - root_gain;
+      if (best->Update(static_cast<float>(loss_chg), fid, hist.cut[i-1], true)) {
+        // here the complement of the scanned bins is the left side
+        *left_sum = rest;
+      }
+    }
+  }
+  /*!
+   * \brief find the best split of every node in qexpand from the histograms
+   *  in wspace.hset[0] and apply it to the tree
+   *
+   *  Each node owns (fset.size() + 1) consecutive histogram slots; the last
+   *  slot of a node stores the total statistics of the node in data[0].
+   *  Nodes whose best loss change does not exceed rt_eps become leaves.
+   * \param depth current depth (not used in this function)
+   * \param gpair gradient pairs (not used in this function)
+   * \param p_fmat feature matrix (not used in this function)
+   * \param info booster information (not used in this function)
+   * \param fset working set of feature ids
+   * \param p_tree tree to be modified
+   */
+  inline void FindSplit(int depth,
+                        const std::vector<bst_gpair> &gpair,
+                        IFMatrix *p_fmat,
+                        const BoosterInfo &info,
+                        const std::vector <bst_uint> &fset,
+                        RegTree *p_tree) {
+    const size_t num_feature = fset.size();
+    // get the best split condition for each node
+    std::vector<SplitEntry> sol(qexpand.size());
+    std::vector<TStats> left_sum(qexpand.size());
+    bst_omp_uint nexpand = static_cast<bst_omp_uint>(qexpand.size());
+    // each iteration only writes sol[wid] and left_sum[wid]
+    #pragma omp parallel for schedule(dynamic, 1)
+    for (bst_omp_uint wid = 0; wid < nexpand; ++wid) {
+      const int nid = qexpand[wid];
+      utils::Assert(node2workindex[nid] == static_cast<int>(wid),
+                    "node2workindex inconsistent");
+      SplitEntry &best = sol[wid];
+      // total statistics of the node, stored in the last slot of this node
+      TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0];
+      for (size_t i = 0; i < fset.size(); ++i) {
+        EnumerateSplit(this->wspace.hset[0][i + wid * (num_feature+1)],
+                       node_sum, fset[i], &best, &left_sum[wid]);
+      }
+    }
+    // get the best result, we can synchronize the solution
+    for (bst_omp_uint wid = 0; wid < nexpand; ++wid) {
+      const int nid = qexpand[wid];
+      const SplitEntry &best = sol[wid];
+      const TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0];
+      this->SetStats(p_tree, nid, node_sum);
+      // set up the values
+      p_tree->stat(nid).loss_chg = best.loss_chg;
+      // now we know the best solution in sol[wid], set split
+      if (best.loss_chg > rt_eps) {
+        p_tree->AddChilds(nid);
+        (*p_tree)[nid].set_split(best.split_index(),
+                                 best.split_value, best.default_left());
+        // mark both children as leaves with value 0, to indicate fresh leaf
+        (*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0);
+        (*p_tree)[(*p_tree)[nid].cright()].set_leaf(0.0f, 0);
+        // right side sum
+        TStats right_sum;
+        right_sum.SetSubstract(node_sum, left_sum[wid]);
+        this->SetStats(p_tree, (*p_tree)[nid].cleft(), left_sum[wid]);
+        this->SetStats(p_tree, (*p_tree)[nid].cright(), right_sum);
+      } else {
+        // no useful split, finalize the node as a leaf
+        (*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
+      }
+    }
+  }
+
+  /*!
+   * \brief copy the statistics of a node into the tree
+   * \param p_tree tree that stores the node statistics
+   * \param nid node id
+   * \param node_sum accumulated statistics of the node
+   */
+  inline void SetStats(RegTree *p_tree, int nid, const TStats &node_sum) {
+    RegTree::NodeStat &node_stat = p_tree->stat(nid);
+    node_stat.base_weight = static_cast<float>(node_sum.CalcWeight(param));
+    node_stat.sum_hess = static_cast<float>(node_sum.sum_hess);
+    node_sum.SetLeafVec(param, p_tree->leafvec(nid));
+  }
+};
+
+template<typename TStats>
+class CQHistMaker: public HistMaker<TStats> {
+ protected:
+  /*! \brief histogram of one node together with a scan cursor into its bins */
+  struct HistEntry {
+    /*! \brief the histogram being filled */
+    typename HistMaker<TStats>::HistUnit hist;
+    /*! \brief index of the bin where the linear scan currently stands */
+    unsigned istart;
+    /*!
+     * \brief advance istart to the first bin whose cut is greater than fv,
+     *  linear scan that starts from the current istart
+     */
+    inline void SeekBin(bst_float fv) {
+      for (; istart < hist.size; ++istart) {
+        if (fv < hist.cut[istart]) break;
+      }
+      utils::Assert(istart != hist.size, "the bound variable must be max");
+    }
+    /*!
+     * \brief add the statistics of row ridx to the bin that holds fv,
+     * do linear scan, start from istart
+     */
+    inline void Add(bst_float fv,
+                    const std::vector<bst_gpair> &gpair,
+                    const BoosterInfo &info,
+                    const bst_uint ridx) {
+      this->SeekBin(fv);
+      hist.data[istart].Add(gpair, info, ridx);
+    }
+    /*!
+     * \brief add one gradient pair to the bin that holds fv,
+     * do linear scan, start from istart
+     */
+    inline void Add(bst_float fv,
+                    bst_gpair gstats) {
+      this->SeekBin(fv);
+      hist.data[istart].Add(gstats);
+    }
+  };
+  // sketch type used for this
+  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
+  // initialize the work set of tree:
+  // (re)initialize the feature helper from the columns of p_fmat, then let it
+  // sample the working features into *p_fset using param.colsample_bytree
+  virtual void InitWorkSet(IFMatrix *p_fmat,
+                           const RegTree &tree,
+                           std::vector<bst_uint> *p_fset) {
+    feat_helper.InitByCol(p_fmat, tree);
+    feat_helper.SampleCol(this->param.colsample_bytree, p_fset);
+  }
+  // code to create histogram:
+  // accumulate the per-node, per-feature histograms in wspace.hset[0] by
+  // scanning the columns in fset, store the total statistics of each node in
+  // the last slot of the node, then Allreduce the histogram data
+  virtual void CreateHist(const std::vector<bst_gpair> &gpair,
+                          IFMatrix *p_fmat,
+                          const BoosterInfo &info,
+                          const std::vector<bst_uint> &fset,
+                          const RegTree &tree) {
+    // fill in reverse map: feature id -> index in fset, -1 if not in fset
+    feat2workindex.resize(tree.param.num_feature);
+    std::fill(feat2workindex.begin(), feat2workindex.end(), -1);
+    for (size_t i = 0; i < fset.size(); ++i) {
+      feat2workindex[fset[i]] = static_cast<int>(i);
+    }
+    // start to work
+    this->wspace.Init(this->param, 1);
+    // if it is C++11, use lazy evaluation for Allreduce,
+    // to gain speedup in recovery
+    // without C++11 the braces below are a plain block that runs right here,
+    // and the trailing ';' is an empty statement
+#if __cplusplus >= 201103L
+    auto lazy_get_hist = [&]()
+#endif
+    {
+      // one temp histogram builder array per thread
+      thread_hist.resize(this->get_nthread());
+      // start accumulating statistics
+      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fset);
+      iter->BeforeFirst();
+      while (iter->Next()) {
+        const ColBatch &batch = iter->Value();
+        // start enumeration
+        const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+        #pragma omp parallel for schedule(dynamic, 1)
+        for (bst_omp_uint i = 0; i < nsize; ++i) {
+          int offset = feat2workindex[batch.col_index[i]];
+          // skip columns that are not in the working set
+          if (offset >= 0) {
+            this->UpdateHistCol(gpair, batch[i], info, tree,
+                                fset, offset,
+                                &thread_hist[omp_get_thread_num()]);
+          }
+        }
+      }
+      // the last slot of each node holds the total statistics of the node
+      for (size_t i = 0; i < this->qexpand.size(); ++i) {
+        const int nid = this->qexpand[i];
+        const int wid = this->node2workindex[nid];
+        this->wspace.hset[0][fset.size() + wid * (fset.size()+1)]
+            .data[0] = node_stats[nid];
+      }
+    };
+    // sync the histogram
+    // if it is C++11, use lazy evaluation for Allreduce
+#if __cplusplus >= 201103L
+    this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data),
+                            this->wspace.hset[0].data.size(), lazy_get_hist);
+#else
+    this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data), this->wspace.hset[0].data.size());
+#endif
+  }
+  // after a split, update the row positions of the nodes in qexpand
+  // by delegating to ResetPositionCol
+  virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
+                                       const RegTree &tree) {
+    this->ResetPositionCol(this->qexpand, p_fmat, tree);
+  }
+  /*!
+   * \brief propose the candidate cuts of every (node, feature) pair
+   *
+   *  Features for which feat_helper.Type() == 2 get a quantile sketch; their
+   *  summaries are Allreduced and turned into cut points. Every other feature
+   *  in fset gets a single cut that is larger than its maximum value.
+   *  On return wspace.cut / wspace.rptr hold, for each node in qexpand,
+   *  fset.size() cut ranges plus one reserved slot for the node statistics.
+   * \param gpair gradient pairs, indexed by row
+   * \param p_fmat feature matrix
+   * \param info booster information
+   * \param fset working set of feature ids for this round
+   * \param tree the tree built so far
+   */
+  virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
+                                  IFMatrix *p_fmat,
+                                  const BoosterInfo &info,
+                                  const std::vector<bst_uint> &fset,
+                                  const RegTree &tree) {
+    // fill in reverse map:
+    //   >= 0 : index of the feature in freal_set (feature is sketched)
+    //   -2   : feature is in fset but not sketched
+    //   -1   : feature is not in fset
+    feat2workindex.resize(tree.param.num_feature);
+    std::fill(feat2workindex.begin(), feat2workindex.end(), -1);
+    freal_set.clear();
+    for (size_t i = 0; i < fset.size(); ++i) {
+      if (feat_helper.Type(fset[i]) == 2) {
+        feat2workindex[fset[i]] = static_cast<int>(freal_set.size());
+        freal_set.push_back(fset[i]);
+      } else {
+        feat2workindex[fset[i]] = -2;
+      }
+    }
+    this->GetNodeStats(gpair, *p_fmat, tree, info,
+                       &thread_stats, &node_stats);
+    // one sketch per (node, sketched feature)
+    sketchs.resize(this->qexpand.size() * freal_set.size());
+    for (size_t i = 0; i < sketchs.size(); ++i) {
+      sketchs[i].Init(info.num_row, this->param.sketch_eps);
+    }
+    // initialize the summary array
+    summary_array.resize(sketchs.size());
+    // setup maximum size
+    unsigned max_size = this->param.max_sketch_size();
+    for (size_t i = 0; i < sketchs.size(); ++i) {
+      summary_array[i].Reserve(max_size);
+    }
+    // if it is C++11, use lazy evaluation for Allreduce;
+    // without C++11 the braces below are a plain block that runs right here
+#if __cplusplus >= 201103L
+    auto lazy_get_summary = [&]()
+#endif
+    {
+      // get summary
+      thread_sketch.resize(this->get_nthread());
+      // number of rows in the buffered row set, used to detect full columns
+      const size_t nrows = p_fmat->buffered_rowset().size();
+      // start accumulating statistics
+      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(freal_set);
+      iter->BeforeFirst();
+      while (iter->Next()) {
+        const ColBatch &batch = iter->Value();
+        // start enumeration
+        const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+        #pragma omp parallel for schedule(dynamic, 1)
+        for (bst_omp_uint i = 0; i < nsize; ++i) {
+          int offset = feat2workindex[batch.col_index[i]];
+          if (offset >= 0) {
+            this->UpdateSketchCol(gpair, batch[i], tree,
+                                  node_stats,
+                                  freal_set, offset,
+                                  batch[i].length == nrows,
+                                  &thread_sketch[omp_get_thread_num()]);
+          }
+        }
+      }
+      for (size_t i = 0; i < sketchs.size(); ++i) {
+        WXQSketch::SummaryContainer out;
+        sketchs[i].GetSummary(&out);
+        summary_array[i].SetPrune(out, max_size);
+      }
+      utils::Assert(summary_array.size() == sketchs.size(), "shape mismatch");
+    };
+    if (summary_array.size() != 0) {
+      size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
+#if __cplusplus >= 201103L
+      sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size(), lazy_get_summary);
+#else
+      sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size());
+#endif
+    }
+    // now we get the final result of sketch, setup the cut
+    this->wspace.cut.clear();
+    this->wspace.rptr.clear();
+    this->wspace.rptr.push_back(0);
+    for (size_t wid = 0; wid < this->qexpand.size(); ++wid) {
+      for (size_t i = 0; i < fset.size(); ++i) {
+        int offset = feat2workindex[fset[i]];
+        if (offset >= 0) {
+          const WXQSketch::Summary &a = summary_array[wid * freal_set.size() + offset];
+          // k walks the summary entries; it must not reuse the name of the
+          // feature index i, which is still needed by the else branch below
+          for (size_t k = 1; k < a.size; ++k) {
+            bst_float cpt = a.data[k].value - rt_eps;
+            if (k == 1 || cpt > this->wspace.cut.back()) {
+              this->wspace.cut.push_back(cpt);
+            }
+          }
+          // push a value that is greater than anything
+          if (a.size != 0) {
+            bst_float cpt = a.data[a.size - 1].value;
+            // this must be bigger than last value in a scale
+            bst_float last = cpt + fabs(cpt) + rt_eps;
+            this->wspace.cut.push_back(last);
+          }
+          this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
+        } else {
+          // feature is not sketched: a single cut above its maximum value
+          utils::Assert(offset == -2, "BUG in mark");
+          bst_float cpt = feat_helper.MaxValue(fset[i]);
+          this->wspace.cut.push_back(cpt + fabs(cpt) + rt_eps);
+          this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
+        }
+      }
+      // reserve last value for global statistics
+      this->wspace.cut.push_back(0.0f);
+      this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
+    }
+    utils::Assert(this->wspace.rptr.size() ==
+                  (fset.size() + 1) * this->qexpand.size() + 1,
+                  "cut space inconsistent");
+  }
+
+ private:
+  /*!
+   * \brief add the entries of one column to the histograms of the nodes
+   *  that are being expanded
+   * \param gpair gradient pairs, indexed by row
+   * \param c the column, scanned from front to back
+   * \param info booster information
+   * \param tree the tree built so far
+   * \param fset working set of feature ids
+   * \param fid_offset index of this column's feature inside fset
+   * \param p_temp per-thread scratch space, one HistEntry per tree node
+   */
+  inline void UpdateHistCol(const std::vector<bst_gpair> &gpair,
+                            const ColBatch::Inst &c,
+                            const BoosterInfo &info,
+                            const RegTree &tree,
+                            const std::vector<bst_uint> &fset,
+                            bst_uint fid_offset,
+                            std::vector<HistEntry> *p_temp) {
+    // nothing to accumulate for an empty column
+    if (c.length == 0) return;
+    // point the builder of every expanding node at its histogram slot
+    std::vector<HistEntry> &builders = *p_temp;
+    builders.resize(tree.param.num_nodes);
+    const size_t stride = fset.size() + 1;
+    for (size_t k = 0; k < this->qexpand.size(); ++k) {
+      const unsigned nid = this->qexpand[k];
+      const unsigned wid = this->node2workindex[nid];
+      HistEntry &entry = builders[nid];
+      entry.istart = 0;
+      entry.hist = this->wspace.hset[0][fid_offset + wid * stride];
+    }
+    const bool buffered = TStats::kSimpleStats != 0 && this->param.cache_opt != 0;
+    if (!buffered) {
+      // plain path: look everything up row by row
+      for (bst_uint j = 0; j < c.length; ++j) {
+        const bst_uint ridx = c[j].index;
+        const int nid = this->position[ridx];
+        if (nid < 0) continue;
+        builders[nid].Add(c[j].fvalue, gpair, info, ridx);
+      }
+      return;
+    }
+    // buffered path: gather positions and gradient pairs of kBuffer rows
+    // first, then feed them to the histograms
+    const bst_uint kBuffer = 32;
+    const bst_uint align_length = c.length / kBuffer * kBuffer;
+    int buf_position[kBuffer];
+    bst_gpair buf_gpair[kBuffer];
+    for (bst_uint base = 0; base < align_length; base += kBuffer) {
+      for (bst_uint i = 0; i < kBuffer; ++i) {
+        const bst_uint ridx = c[base + i].index;
+        buf_position[i] = this->position[ridx];
+        buf_gpair[i] = gpair[ridx];
+      }
+      for (bst_uint i = 0; i < kBuffer; ++i) {
+        const int nid = buf_position[i];
+        if (nid < 0) continue;
+        builders[nid].Add(c[base + i].fvalue, buf_gpair[i]);
+      }
+    }
+    // remaining entries that do not fill a whole buffer
+    for (bst_uint j = align_length; j < c.length; ++j) {
+      const bst_uint ridx = c[j].index;
+      const int nid = this->position[ridx];
+      if (nid < 0) continue;
+      builders[nid].Add(c[j].fvalue, gpair[ridx]);
+    }
+  }
+  /*!
+   * \brief push the entries of one column into the quantile sketches of the
+   *  nodes that are being expanded, weighted by the hessian of each row
+   * \param gpair gradient pairs, indexed by row
+   * \param c the column to scan
+   * \param tree the tree built so far
+   * \param nstats per-node statistics, used for the total weight when col_full
+   * \param frealset set of sketched feature ids
+   * \param offset index of this column's feature inside frealset
+   * \param col_full whether the column has an entry for every row, in which
+   *  case the first pass over the column is skipped
+   * \param p_temp per-thread scratch space, one SketchEntry per tree node
+   */
+  inline void UpdateSketchCol(const std::vector<bst_gpair> &gpair,
+                              const ColBatch::Inst &c,
+                              const RegTree &tree,
+                              const std::vector<TStats> &nstats,
+                              const std::vector<bst_uint> &frealset,
+                              bst_uint offset,
+                              bool col_full,
+                              std::vector<BaseMaker::SketchEntry> *p_temp) {
+    if (c.length == 0) return;
+    // initialize sbuilder for use
+    std::vector<BaseMaker::SketchEntry> &sbuilder = *p_temp;
+    sbuilder.resize(tree.param.num_nodes);
+    for (size_t i = 0; i < this->qexpand.size(); ++i) {
+      const unsigned nid = this->qexpand[i];
+      const unsigned wid = this->node2workindex[nid];
+      sbuilder[nid].sum_total = 0.0f;
+      sbuilder[nid].sketch = &sketchs[wid * frealset.size() + offset];
+    }
+
+    if (!col_full) {
+      // first pass, get sum of weight, TODO, optimization to skip first pass
+      for (bst_uint j = 0; j < c.length; ++j) {
+        const bst_uint ridx = c[j].index;
+        const int nid = this->position[ridx];
+        if (nid >= 0) {
+          sbuilder[nid].sum_total += gpair[ridx].hess;
+        }
+      }
+    } else {
+      // full column: the total weight of each node is already known
+      for (size_t i = 0; i < this->qexpand.size(); ++i) {
+        const unsigned nid = this->qexpand[i];
+        sbuilder[nid].sum_total = static_cast<bst_float>(nstats[nid].sum_hess);
+      }
+    }
+    // if only one value, no need to do second pass
+    // NOTE(review): comparing only the first and last entry presumably relies
+    // on the column being sorted by fvalue - confirm
+    if (c[0].fvalue  == c[c.length-1].fvalue) {
+      for (size_t i = 0; i < this->qexpand.size(); ++i) {
+        const int nid = this->qexpand[i];
+        sbuilder[nid].sketch->Push(c[0].fvalue, static_cast<bst_float>(sbuilder[nid].sum_total));
+      }
+      return;
+    }
+    // two pass scan
+    unsigned max_size = this->param.max_sketch_size();
+    for (size_t i = 0; i < this->qexpand.size(); ++i) {
+      const int nid = this->qexpand[i];
+      sbuilder[nid].Init(max_size);
+    }
+    // second pass, build the sketch
+    if (TStats::kSimpleStats != 0 && this->param.cache_opt != 0) {
+      // buffered path: gather positions and hessians of kBuffer rows first
+      const bst_uint kBuffer = 32;
+      bst_uint align_length = c.length / kBuffer * kBuffer;
+      int buf_position[kBuffer];
+      bst_float buf_hess[kBuffer];
+      for (bst_uint j = 0; j < align_length; j += kBuffer) {
+        for (bst_uint i = 0; i < kBuffer; ++i) {
+          bst_uint ridx = c[j + i].index;
+          buf_position[i] = this->position[ridx];
+          buf_hess[i] = gpair[ridx].hess;
+        }
+        for (bst_uint i = 0; i < kBuffer; ++i) {
+          const int nid = buf_position[i];
+          if (nid >= 0) {
+            sbuilder[nid].Push(c[j + i].fvalue, buf_hess[i], max_size);
+          }
+        }
+      }
+      // remaining entries that do not fill a whole buffer
+      for (bst_uint j = align_length; j < c.length; ++j) {
+        const bst_uint ridx = c[j].index;
+        const int nid = this->position[ridx];
+        if (nid >= 0) {
+          sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size);
+        }
+      }
+    } else {
+      for (bst_uint j = 0; j < c.length; ++j) {
+        const bst_uint ridx = c[j].index;
+        const int nid = this->position[ridx];
+        if (nid >= 0) {
+          sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size);
+        }
+      }
+    }
+    for (size_t i = 0; i < this->qexpand.size(); ++i) {
+      const int nid = this->qexpand[i];
+      sbuilder[nid].Finalize(max_size);
+    }
+  }
+  // feature helper
+  BaseMaker::FMetaHelper feat_helper;
+  // temp space to map feature id to working index
+  std::vector<int> feat2workindex;
+  // set of index from fset that are real
+  std::vector<bst_uint> freal_set;
+  // thread temp data
+  std::vector< std::vector<BaseMaker::SketchEntry> > thread_sketch;
+  // used to hold statistics
+  std::vector< std::vector<TStats> > thread_stats;
+  // used to hold start pointer
+  std::vector< std::vector<HistEntry> > thread_hist;
+  // node statistics
+  std::vector<TStats> node_stats;
+  // summary array
+  std::vector<WXQSketch::SummaryContainer> summary_array;
+  // reducer for summary
+  rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
+  // per node, per feature sketch
+  std::vector< utils::WXQuantileSketch<bst_float, bst_float> > sketchs;
+};
+
+template<typename TStats>
+class QuantileHistMaker: public HistMaker<TStats> {
+ protected:
+  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
+  /*!
+   * \brief update the row positions and propose the candidate cuts of every
+   *  (node, feature) pair from row-major data
+   *
+   *  Each row batch is converted to a temporary column-major layout, whose
+   *  entries are pushed into per (node, feature) quantile sketches weighted by
+   *  the hessian of the row they come from. The pruned summaries are
+   *  Allreduced and turned into wspace.cut / wspace.rptr.
+   * \param gpair gradient pairs, indexed by row
+   * \param p_fmat feature matrix
+   * \param info booster information
+   * \param fset working set of feature ids (not used, all features are sketched)
+   * \param tree the tree built so far
+   */
+  virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
+                                  IFMatrix *p_fmat,
+                                  const BoosterInfo &info,
+                                  const std::vector <bst_uint> &fset,
+                                  const RegTree &tree) {
+    // initialize the data structure
+    int nthread = BaseMaker::get_nthread();
+    sketchs.resize(this->qexpand.size() * tree.param.num_feature);
+    for (size_t i = 0; i < sketchs.size(); ++i) {
+      sketchs[i].Init(info.num_row, this->param.sketch_eps);
+    }
+    // start accumulating statistics
+    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
+    iter->BeforeFirst();
+    while (iter->Next()) {
+      const RowBatch &batch = iter->Value();
+      // parallel convert to column major format
+      utils::ParallelGroupBuilder<SparseBatch::Entry> builder(&col_ptr, &col_data, &thread_col_ptr);
+      builder.InitBudget(tree.param.num_feature, nthread);
+
+      const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
+      // pass 1: move rows down to their latest node and count the entries
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < nbatch; ++i) {
+        RowBatch::Inst inst = batch[i];
+        const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
+        int nid = this->position[ridx];
+        if (nid >= 0) {
+          if (!tree[nid].is_leaf()) {
+            this->position[ridx] = nid = HistMaker<TStats>::NextLevel(inst, tree, nid);
+          }
+          if (this->node2workindex[nid] < 0) {
+            // the node is not being expanded, encode it as a negative position
+            this->position[ridx] = ~nid;
+          } else {
+            for (bst_uint j = 0; j < inst.length; ++j) {
+              builder.AddBudget(inst[j].index, omp_get_thread_num());
+            }
+          }
+        }
+      }
+      builder.InitStorage();
+      // pass 2: fill the column-major storage; only rows with a non-negative
+      // position (i.e. rows counted in pass 1) are pushed
+      #pragma omp parallel for schedule(static)
+      for (bst_omp_uint i = 0; i < nbatch; ++i) {
+        RowBatch::Inst inst = batch[i];
+        const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
+        const int nid = this->position[ridx];
+        if (nid >= 0) {
+          for (bst_uint j = 0; j < inst.length; ++j) {
+            // keep the row index with the value: the sketch needs the
+            // hessian of this row, the node id is recovered from position
+            builder.Push(inst[j].index,
+                         SparseBatch::Entry(ridx, inst[j].fvalue),
+                         omp_get_thread_num());
+          }
+        }
+      }
+      // start putting things into sketch
+      const bst_omp_uint nfeat = static_cast<bst_omp_uint>(col_ptr.size() - 1);
+      #pragma omp parallel for schedule(dynamic, 1)
+      for (bst_omp_uint k = 0; k < nfeat; ++k) {
+        for (size_t i = col_ptr[k]; i < col_ptr[k+1]; ++i) {
+          const SparseBatch::Entry &e = col_data[i];
+          // e.index is a row index; position of a pushed row is its node id
+          const int nid = this->position[e.index];
+          const int wid = this->node2workindex[nid];
+          sketchs[wid * tree.param.num_feature + k].Push(e.fvalue, gpair[e.index].hess);
+        }
+      }
+    }
+    // setup maximum size
+    unsigned max_size = this->param.max_sketch_size();
+    // synchronize sketch
+    summary_array.resize(sketchs.size());
+    for (size_t i = 0; i < sketchs.size(); ++i) {
+      utils::WQuantileSketch<bst_float, bst_float>::SummaryContainer out;
+      sketchs[i].GetSummary(&out);
+      summary_array[i].Reserve(max_size);
+      summary_array[i].SetPrune(out, max_size);
+    }
+
+    size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
+    sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size());
+    // now we get the final result of sketch, setup the cut
+    this->wspace.cut.clear();
+    this->wspace.rptr.clear();
+    this->wspace.rptr.push_back(0);
+    for (size_t wid = 0; wid < this->qexpand.size(); ++wid) {
+      for (int fid = 0; fid < tree.param.num_feature; ++fid) {
+        const WXQSketch::Summary &a = summary_array[wid * tree.param.num_feature + fid];
+        for (size_t i = 1; i < a.size; ++i) {
+          bst_float cpt = a.data[i].value - rt_eps;
+          if (i == 1 || cpt > this->wspace.cut.back()) {
+            this->wspace.cut.push_back(cpt);
+          }
+        }
+        // push a value that is greater than anything
+        if (a.size != 0) {
+          bst_float cpt = a.data[a.size - 1].value;
+          // this must be bigger than last value in a scale
+          bst_float last = cpt + fabs(cpt) + rt_eps;
+          this->wspace.cut.push_back(last);
+        }
+        this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
+      }
+      // reserve last value for global statistics
+      this->wspace.cut.push_back(0.0f);
+      this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
+    }
+    utils::Assert(this->wspace.rptr.size() ==
+                  (tree.param.num_feature + 1) * this->qexpand.size() + 1,
+                  "cut space inconsistent");
+  }
+
+ private:
+  // summary array
+  std::vector<WXQSketch::SummaryContainer> summary_array;
+  // reducer for summary
+  rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
+  // local temp column data structure
+  std::vector<size_t> col_ptr;
+  // local storage of column data
+  std::vector<SparseBatch::Entry> col_data;
+  std::vector< std::vector<size_t> > thread_col_ptr;
+  // per node, per feature sketch
+  std::vector< utils::WQuantileSketch<bst_float, bst_float> > sketchs;
+};
+
+}  // namespace tree
+}  // namespace xgboost
+#endif  // XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_
--- a/old_src/tree/updater_prune-inl.hpp
+++ b/old_src/tree/updater_prune-inl.hpp
@@ -0,0 +1,87 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file updater_prune-inl.hpp
+ * \brief prune a tree given the statistics
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_
+#define XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_
+
+#include <vector>
+#include "./param.h"
+#include "./updater.h"
+#include "./updater_sync-inl.hpp"
+
+namespace xgboost {
+namespace tree {
+/*! \brief pruner that prunes a tree after growing finishes */
+class TreePruner: public IUpdater {
+ public:
+  /*!
+   * \brief constructor, silent starts at 0 so that it has a defined value
+   *  even when the "silent" parameter is never set
+   */
+  TreePruner(void) : silent(0) {}
+  virtual ~TreePruner(void) {}
+  /*!
+   * \brief set training parameter, forwarded to the training parameters and
+   *  to the synchronizer; "silent" is additionally handled here
+   * \param name name of the parameter
+   * \param val value of the parameter
+   */
+  virtual void SetParam(const char *name, const char *val) {
+    using namespace std;
+    param.SetParam(name, val);
+    syncher.SetParam(name, val);
+    if (!strcmp(name, "silent")) silent = atoi(val);
+  }
+  /*!
+   * \brief prune every tree, then run the synchronizer on the trees
+   * \param gpair gradient pairs, only passed on to the synchronizer
+   * \param p_fmat feature matrix, only passed on to the synchronizer
+   * \param info booster information, only passed on to the synchronizer
+   * \param trees the trees to be pruned in place
+   */
+  virtual void Update(const std::vector<bst_gpair> &gpair,
+                      IFMatrix *p_fmat,
+                      const BoosterInfo &info,
+                      const std::vector<RegTree*> &trees) {
+    // rescale learning rate according to size of trees
+    float lr = param.learning_rate;
+    param.learning_rate = lr / trees.size();
+    for (size_t i = 0; i < trees.size(); ++i) {
+      this->DoPrune(*trees[i]);
+    }
+    // set learning rate back
+    param.learning_rate = lr;
+    syncher.Update(gpair, p_fmat, info, trees);
+  }
+
+ private:
+  /*!
+   * \brief register leaf nid at its parent and collapse the parent into a
+   *  leaf when both of its children are leaves and the parameters ask for
+   *  pruning; continues upward as long as parents keep collapsing
+   * \param tree the tree being pruned
+   * \param nid id of a leaf node
+   * \param depth depth of nid
+   * \param npruned number of nodes pruned so far
+   * \return updated number of pruned nodes
+   */
+  inline int TryPruneLeaf(RegTree &tree, int nid, int depth, int npruned) { // NOLINT(*)
+    if (tree[nid].is_root()) return npruned;
+    int pid = tree[nid].parent();
+    RegTree::NodeStat &s = tree.stat(pid);
+    ++s.leaf_child_cnt;
+    if (s.leaf_child_cnt >= 2 && param.need_prune(s.loss_chg, depth - 1)) {
+      // need to be pruned
+      tree.ChangeToLeaf(pid, param.learning_rate * s.base_weight);
+      // tail recursion, the parent is a leaf now and two children are gone
+      return this->TryPruneLeaf(tree, pid, depth - 1, npruned+2);
+    } else {
+      return npruned;
+    }
+  }
+  /*! \brief do pruning of a tree */
+  inline void DoPrune(RegTree &tree) { // NOLINT(*)
+    int npruned = 0;
+    // initialize auxiliary statistics
+    for (int nid = 0; nid < tree.param.num_nodes; ++nid) {
+      tree.stat(nid).leaf_child_cnt = 0;
+    }
+    for (int nid = 0; nid < tree.param.num_nodes; ++nid) {
+      if (tree[nid].is_leaf()) {
+        npruned = this->TryPruneLeaf(tree, nid, tree.GetDepth(nid), npruned);
+      }
+    }
+    if (silent == 0) {
+      utils::Printf("tree pruning end, %d roots, %d extra nodes, %d pruned nodes, max_depth=%d\n",
+                    tree.param.num_roots, tree.num_extra_nodes(), npruned, tree.MaxDepth());
+    }
+  }
+
+ private:
+  // synchronizer
+  TreeSyncher syncher;
+  // non-zero suppresses the pruning summary message
+  int silent;
+  // training parameter
+  TrainParam param;
+};
+}  // namespace tree
+}  // namespace xgboost
+#endif  // XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_
--- a/old_src/tree/updater_refresh-inl.hpp
+++ b/old_src/tree/updater_refresh-inl.hpp
@@ -0,0 +1,157 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file updater_refresh-inl.hpp
+ * \brief refresh the statistics and leaf value on the tree on the dataset
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_TREE_UPDATER_REFRESH_INL_HPP_
+#define XGBOOST_TREE_UPDATER_REFRESH_INL_HPP_
+
+#include <vector>
+#include <limits>
+#include "../sync/sync.h"
+#include "./param.h"
+#include "./updater.h"
+#include "../utils/omp.h"
+
+namespace xgboost {
+namespace tree {
+/*! \brief updater that refreshes the statistics and leaf values of a tree on the dataset */
+template<typename TStats>
+class TreeRefresher: public IUpdater {
+ public:
+  virtual ~TreeRefresher(void) {}
+  // set training parameter, simply forwarded to the training parameters
+  virtual void SetParam(const char *name, const char *val) {
+    param.SetParam(name, val);
+  }
+  // update the trees: recompute the statistics of every node from the data,
+  // Allreduce them, then refresh node statistics and leaf values in place;
+  // the structure of the trees is not changed here
+  virtual void Update(const std::vector<bst_gpair> &gpair,
+                      IFMatrix *p_fmat,
+                      const BoosterInfo &info,
+                      const std::vector<RegTree*> &trees) {
+    if (trees.size() == 0) return;
+    // number of threads
+    // thread temporal space
+    std::vector< std::vector<TStats> > stemp;
+    std::vector<RegTree::FVec> fvec_temp;
+    // setup temp space for each thread
+    // NOTE(review): every thread of the region stores the same value into
+    // the shared nthread
+    int nthread;
+    #pragma omp parallel
+    {
+      nthread = omp_get_num_threads();
+    }
+    fvec_temp.resize(nthread, RegTree::FVec());
+    stemp.resize(nthread, std::vector<TStats>());
+    #pragma omp parallel
+    {
+      int tid = omp_get_thread_num();
+      // statistics of all trees are laid out back to back, one slot per node
+      int num_nodes = 0;
+      for (size_t i = 0; i < trees.size(); ++i) {
+        num_nodes += trees[i]->param.num_nodes;
+      }
+      stemp[tid].resize(num_nodes, TStats(param));
+      // NOTE(review): stemp[tid] is empty before the resize, so this fill
+      // looks redundant
+      std::fill(stemp[tid].begin(), stemp[tid].end(), TStats(param));
+      fvec_temp[tid].Init(trees[0]->param.num_feature);
+    }
+    // if it is C++11, use lazy evaluation for Allreduce,
+    // to gain speedup in recovery
+    // without C++11 the braces below are a plain block that runs right here
+#if __cplusplus >= 201103L
+    auto lazy_get_stats = [&]()
+#endif
+    {
+      // start accumulating statistics
+      utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
+      iter->BeforeFirst();
+      while (iter->Next()) {
+        const RowBatch &batch = iter->Value();
+        utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
+                     "too large batch size ");
+        const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
+        #pragma omp parallel for schedule(static)
+        for (bst_omp_uint i = 0; i < nbatch; ++i) {
+          RowBatch::Inst inst = batch[i];
+          const int tid = omp_get_thread_num();
+          const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
+          RegTree::FVec &feats = fvec_temp[tid];
+          feats.Fill(inst);
+          // offset of the current tree inside the per-thread statistics
+          int offset = 0;
+          for (size_t j = 0; j < trees.size(); ++j) {
+            AddStats(*trees[j], feats, gpair, info, ridx,
+                     BeginPtr(stemp[tid]) + offset);
+            offset += trees[j]->param.num_nodes;
+          }
+          feats.Drop(inst);
+        }
+      }
+      // aggregate the statistics of all threads into stemp[0]
+      int num_nodes = static_cast<int>(stemp[0].size());
+      #pragma omp parallel for schedule(static)
+      for (int nid = 0; nid < num_nodes; ++nid) {
+        for (int tid = 1; tid < nthread; ++tid) {
+          stemp[0][nid].Add(stemp[tid][nid]);
+        }
+      }
+    };
+#if __cplusplus >= 201103L
+    reducer.Allreduce(BeginPtr(stemp[0]), stemp[0].size(), lazy_get_stats);
+#else
+    reducer.Allreduce(BeginPtr(stemp[0]), stemp[0].size());
+#endif
+    // rescale learning rate according to size of trees
+    float lr = param.learning_rate;
+    param.learning_rate = lr / trees.size();
+    int offset = 0;
+    for (size_t i = 0; i < trees.size(); ++i) {
+      // refresh recursively, starting from each root of the tree
+      for (int rid = 0; rid < trees[i]->param.num_roots; ++rid) {
+        this->Refresh(BeginPtr(stemp[0]) + offset, rid, trees[i]);
+      }
+      offset += trees[i]->param.num_nodes;
+    }
+    // set learning rate back
+    param.learning_rate = lr;
+  }
+
+ private:
+  /*!
+   * \brief add the statistics of one row to every node on its path,
+   *  from the root of the row down to the leaf it falls into
+   * \param tree the tree to traverse
+   * \param feat dense feature vector of the row
+   * \param gpair gradient pairs, indexed by row
+   * \param info booster information, also provides the root of the row
+   * \param ridx row index
+   * \param gstats statistics of the tree, indexed by node id
+   */
+  inline static void AddStats(const RegTree &tree,
+                              const RegTree::FVec &feat,
+                              const std::vector<bst_gpair> &gpair,
+                              const BoosterInfo &info,
+                              const bst_uint ridx,
+                              TStats *gstats) {
+    // start from the root that the row belongs to
+    int nid = static_cast<int>(info.GetRoot(ridx));
+    for (;;) {
+      gstats[nid].Add(gpair, info, ridx);
+      if (tree[nid].is_leaf()) break;
+      // step down to the child selected by the split of the current node
+      const unsigned fidx = tree[nid].split_index();
+      nid = tree.GetNext(nid, feat.fvalue(fidx), feat.is_missing(fidx));
+    }
+  }
+  /*!
+   * \brief recursively refresh the statistics of the subtree rooted at nid;
+   *  leaves get a new leaf value, internal nodes a new loss change
+   * \param gstats statistics of the tree, indexed by node id
+   * \param nid root of the subtree to refresh
+   * \param p_tree the tree to be updated
+   */
+  inline void Refresh(const TStats *gstats,
+                      int nid, RegTree *p_tree) {
+    RegTree &tree = *p_tree;
+    const TStats &cur = gstats[nid];
+    tree.stat(nid).base_weight = static_cast<float>(cur.CalcWeight(param));
+    tree.stat(nid).sum_hess = static_cast<float>(cur.sum_hess);
+    cur.SetLeafVec(param, tree.leafvec(nid));
+    if (tree[nid].is_leaf()) {
+      tree[nid].set_leaf(tree.stat(nid).base_weight * param.learning_rate);
+      return;
+    }
+    // internal node: gain of the two children minus gain of the node itself
+    const int left = tree[nid].cleft();
+    const int right = tree[nid].cright();
+    tree.stat(nid).loss_chg = static_cast<float>(
+        gstats[left].CalcGain(param) +
+        gstats[right].CalcGain(param) -
+        cur.CalcGain(param));
+    this->Refresh(gstats, left, p_tree);
+    this->Refresh(gstats, right, p_tree);
+  }
+  // training parameter
+  TrainParam param;
+  // reducer
+  rabit::Reducer<TStats, TStats::Reduce> reducer;
+};
+
+}  // namespace tree
+}  // namespace xgboost
+#endif  // XGBOOST_TREE_UPDATER_REFRESH_INL_HPP_
--- a/old_src/tree/updater_skmaker-inl.hpp
+++ b/old_src/tree/updater_skmaker-inl.hpp
@@ -0,0 +1,399 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file updater_skmaker-inl.hpp
+ * \brief use approximation sketch to construct a tree,
+          a refresh is needed to make the statistics exactly correct
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_TREE_UPDATER_SKMAKER_INL_HPP_
+#define XGBOOST_TREE_UPDATER_SKMAKER_INL_HPP_
+
+#include <vector>
+#include <algorithm>
+#include "../sync/sync.h"
+#include "../utils/quantile.h"
+#include "./updater_basemaker-inl.hpp"
+
+namespace xgboost {
+namespace tree {
+class SketchMaker: public BaseMaker {
+ public:
+  virtual ~SketchMaker(void) {}
+  /*!
+   * \brief grow every tree in the given list
+   *  the learning rate is divided by the number of trees while they are
+   *  built and is set back to its original value afterwards
+   * \param gpair gradient pairs, forwarded to the single-tree Update
+   * \param p_fmat feature matrix, forwarded to the single-tree Update
+   * \param info booster information, forwarded to the single-tree Update
+   * \param trees the trees to be built, one after another
+   */
+  virtual void Update(const std::vector<bst_gpair> &gpair,
+                      IFMatrix *p_fmat,
+                      const BoosterInfo &info,
+                      const std::vector<RegTree*> &trees) {
+    const float saved_lr = param.learning_rate;
+    // every tree gets an equal share of the step size
+    param.learning_rate = saved_lr / trees.size();
+    typedef std::vector<RegTree*>::const_iterator TreeIter;
+    for (TreeIter it = trees.begin(); it != trees.end(); ++it) {
+      this->Update(gpair, p_fmat, info, *it);
+    }
+    // set learning rate back
+    param.learning_rate = saved_lr;
+  }
+
+ protected:
+  /*!
+   * \brief grow a single tree, one depth level per iteration
+   * \param gpair gradient pairs of the instances
+   * \param p_fmat feature matrix
+   * \param info booster information; its root_index is handed to InitData
+   * \param p_tree the tree to be grown, modified in place
+   */
+  inline void Update(const std::vector<bst_gpair> &gpair,
+                      IFMatrix *p_fmat,
+                      const BoosterInfo &info,
+                      RegTree *p_tree) {
+    RegTree &tree = *p_tree;
+    this->InitData(gpair, *p_fmat, info.root_index, tree);
+    for (int depth = 0; depth < param.max_depth; ++depth) {
+      this->GetNodeStats(gpair, *p_fmat, tree, info,
+                         &thread_stats, &node_stats);
+      this->BuildSketch(gpair, p_fmat, info, tree);
+      this->SyncNodeStats();
+      this->FindSplit(depth, gpair, p_fmat, info, p_tree);
+      this->ResetPositionCol(qexpand, p_fmat, tree);
+      this->UpdateQueueExpand(tree);
+      // stop as soon as no node is waiting for expansion
+      if (qexpand.empty()) break;
+    }
+    // nodes still queued once the loop has ended need statistics as well
+    if (!qexpand.empty()) {
+      this->GetNodeStats(gpair, *p_fmat, tree, info,
+                         &thread_stats, &node_stats);
+      this->SyncNodeStats();
+    }
+    // write the per-node statistics into the tree
+    const int num_nodes = tree.param.num_nodes;
+    for (int nid = 0; nid < num_nodes; ++nid) {
+      this->SetStats(nid, node_stats[nid], p_tree);
+      if (tree[nid].is_leaf()) continue;
+      const int left = tree[nid].cleft();
+      const int right = tree[nid].cright();
+      // gain of the two children minus the gain of the node itself
+      tree.stat(nid).loss_chg = static_cast<float>(
+          node_stats[left].CalcGain(param) +
+          node_stats[right].CalcGain(param) -
+          node_stats[nid].CalcGain(param));
+    }
+    // whatever remains in the queue is turned into a leaf
+    for (size_t i = 0; i < qexpand.size(); ++i) {
+      const int nid = qexpand[i];
+      tree[nid].set_leaf(tree.stat(nid).base_weight * param.learning_rate);
+    }
+  }
+  // define the sketch we want to use
+  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
+
+ private:
+  /*!
+   * \brief statistics needed in the gradient calculation
+   *  positive and negative gradients are accumulated separately,
+   *  the net gradient used for gain and weight is pos_grad - neg_grad
+   */
+  struct SKStats {
+    /*! \brief sum of all positive gradient */
+    double pos_grad;
+    /*! \brief sum of the magnitudes of all negative gradient */
+    double neg_grad;
+    /*! \brief sum of hessian statistics */
+    double sum_hess;
+    /*! \brief default constructor, starts from zeroed statistics */
+    SKStats(void) : pos_grad(0.0), neg_grad(0.0), sum_hess(0.0) {}
+    /*!
+     * \brief constructor taking the training parameter
+     * \param param training parameter, not used by this statistics type
+     */
+    explicit SKStats(const TrainParam &param)
+        : pos_grad(0.0), neg_grad(0.0), sum_hess(0.0) {
+    }
+    /*! \brief clear the statistics */
+    inline void Clear(void) {
+      neg_grad = pos_grad = sum_hess = 0.0;
+    }
+    /*!
+     * \brief accumulate the statistics of one instance
+     * \param gpair gradient pairs of the instances
+     * \param info booster information, not used by this statistics type
+     * \param ridx row index of the instance to be added
+     */
+    inline void Add(const std::vector<bst_gpair> &gpair,
+                    const BoosterInfo &info,
+                    bst_uint ridx) {
+      const bst_gpair &b = gpair[ridx];
+      if (b.grad >= 0.0f) {
+        pos_grad += b.grad;
+      } else {
+        // negative gradients are stored by magnitude
+        neg_grad -= b.grad;
+      }
+      sum_hess += b.hess;
+    }
+    /*! \brief calculate gain of the solution */
+    inline double CalcGain(const TrainParam &param) const {
+      return param.CalcGain(pos_grad - neg_grad, sum_hess);
+    }
+    /*! \brief set current value to a - b */
+    inline void SetSubstract(const SKStats &a, const SKStats &b) {
+      pos_grad = a.pos_grad - b.pos_grad;
+      neg_grad = a.neg_grad - b.neg_grad;
+      sum_hess = a.sum_hess - b.sum_hess;
+    }
+    /*! \brief calculate leaf weight */
+    inline double CalcWeight(const TrainParam &param) const {
+      return param.CalcWeight(pos_grad - neg_grad, sum_hess);
+    }
+    /*! \brief add statistics to the data */
+    inline void Add(const SKStats &b) {
+      pos_grad += b.pos_grad;
+      neg_grad += b.neg_grad;
+      sum_hess += b.sum_hess;
+    }
+    /*! \brief same as add, reduce is used in All Reduce */
+    inline static void Reduce(SKStats &a, const SKStats &b) { // NOLINT(*)
+      a.Add(b);
+    }
+    /*! \brief set leaf vector value based on statistics, does nothing here */
+    inline void SetLeafVec(const TrainParam &param, bst_float *vec) const {
+    }
+  };
+  /*!
+   * \brief build the sketches of all expanding nodes and reduce their summaries
+   * \param gpair gradient pairs of the instances
+   * \param p_fmat feature matrix, visited column batch by column batch
+   * \param info booster information, num_row is used to initialize the sketches
+   * \param tree the tree being grown
+   */
+  inline void BuildSketch(const std::vector<bst_gpair> &gpair,
+                          IFMatrix *p_fmat,
+                          const BoosterInfo &info,
+                          const RegTree &tree) {
+    // three sketches for every (expanding node, feature) pair
+    sketchs.resize(this->qexpand.size() * tree.param.num_feature * 3);
+    for (size_t sid = 0; sid < sketchs.size(); ++sid) {
+      sketchs[sid].Init(info.num_row, this->param.sketch_eps);
+    }
+    thread_sketch.resize(this->get_nthread());
+    // a column of this length is handed over as a full column
+    const size_t num_buffered = p_fmat->buffered_rowset().size();
+    // start accumulating statistics
+    utils::IIterator<ColBatch> *col_iter = p_fmat->ColIterator();
+    for (col_iter->BeforeFirst(); col_iter->Next();) {
+      const ColBatch &batch = col_iter->Value();
+      const bst_omp_uint ncol = static_cast<bst_omp_uint>(batch.size);
+      #pragma omp parallel for schedule(dynamic, 1)
+      for (bst_omp_uint i = 0; i < ncol; ++i) {
+        const ColBatch::Inst &col = batch[i];
+        this->UpdateSketchCol(gpair, col, tree,
+                              node_stats,
+                              batch.col_index[i],
+                              col.length == num_buffered,
+                              &thread_sketch[omp_get_thread_num()]);
+      }
+    }
+    // prune every sketch to the maximum size before synchronizing
+    const unsigned max_size = param.max_sketch_size();
+    summary_array.resize(sketchs.size());
+    for (size_t sid = 0; sid < sketchs.size(); ++sid) {
+      WXQSketch::SummaryContainer full;
+      sketchs[sid].GetSummary(&full);
+      summary_array[sid].Reserve(max_size);
+      summary_array[sid].SetPrune(full, max_size);
+    }
+    // synchronize sketch
+    const size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
+    sketch_reducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size());
+  }
+  /*!
+   * \brief update sketch information in column fid
+   * \param gpair gradient pairs of the instances
+   * \param c entries of the column; the first and the last value are compared
+   *          to detect a constant column (presumably sorted by value, to be confirmed)
+   * \param tree the tree being grown
+   * \param nstats per-node statistics, copied as totals when col_full is true
+   * \param fid index of the feature stored in the column
+   * \param col_full when true the totals come from nstats instead of
+   *                 being summed over the column
+   * \param p_temp scratch space, resized to three SketchEntry per tree node
+   */
+  inline void UpdateSketchCol(const std::vector<bst_gpair> &gpair,
+                              const ColBatch::Inst &c,
+                              const RegTree &tree,
+                              const std::vector<SKStats> &nstats,
+                              bst_uint fid,
+                              bool col_full,
+                              std::vector<SketchEntry> *p_temp) {
+    if (c.length == 0) return;
+    std::vector<SketchEntry> &sbuilder = *p_temp;
+    sbuilder.resize(tree.param.num_nodes * 3);
+    const size_t nexpand = this->qexpand.size();
+    // attach the three builders of each expanding node to their sketches
+    for (size_t i = 0; i < nexpand; ++i) {
+      const unsigned nid = this->qexpand[i];
+      const unsigned wid = this->node2workindex[nid];
+      for (int k = 0; k < 3; ++k) {
+        SketchEntry &slot = sbuilder[3 * nid + k];
+        slot.sum_total = 0.0f;
+        slot.sketch = &sketchs[(wid * tree.param.num_feature + fid) * 3 + k];
+      }
+    }
+    // first pass: totals of slot 0 (non-negative gradient),
+    // slot 1 (negated negative gradient) and slot 2 (hessian)
+    if (col_full) {
+      for (size_t i = 0; i < nexpand; ++i) {
+        const unsigned nid = this->qexpand[i];
+        sbuilder[3 * nid + 0].sum_total = static_cast<bst_float>(nstats[nid].pos_grad);
+        sbuilder[3 * nid + 1].sum_total = static_cast<bst_float>(nstats[nid].neg_grad);
+        sbuilder[3 * nid + 2].sum_total = static_cast<bst_float>(nstats[nid].sum_hess);
+      }
+    } else {
+      for (bst_uint j = 0; j < c.length; ++j) {
+        const bst_uint ridx = c[j].index;
+        const int nid = this->position[ridx];
+        // rows with a negative position are skipped
+        if (nid < 0) continue;
+        const bst_gpair &e = gpair[ridx];
+        const int base = 3 * nid;
+        if (e.grad >= 0.0f) {
+          sbuilder[base + 0].sum_total += e.grad;
+        } else {
+          sbuilder[base + 1].sum_total -= e.grad;
+        }
+        sbuilder[base + 2].sum_total += e.hess;
+      }
+    }
+    // if only one value, no need to do second pass
+    if (c[0].fvalue  == c[c.length-1].fvalue) {
+      for (size_t i = 0; i < nexpand; ++i) {
+        const int nid = this->qexpand[i];
+        for (int k = 0; k < 3; ++k) {
+          SketchEntry &slot = sbuilder[3 * nid + k];
+          slot.sketch->Push(c[0].fvalue,
+                            static_cast<bst_float>(slot.sum_total));
+        }
+      }
+      return;
+    }
+    // two pass scan
+    const unsigned max_size = param.max_sketch_size();
+    for (size_t i = 0; i < nexpand; ++i) {
+      const int nid = this->qexpand[i];
+      for (int k = 0; k < 3; ++k) {
+        sbuilder[3 * nid + k].Init(max_size);
+      }
+    }
+    // second pass, build the sketch
+    for (bst_uint j = 0; j < c.length; ++j) {
+      const bst_uint ridx = c[j].index;
+      const int nid = this->position[ridx];
+      if (nid < 0) continue;
+      const bst_gpair &e = gpair[ridx];
+      const int base = 3 * nid;
+      if (e.grad >= 0.0f) {
+        sbuilder[base + 0].Push(c[j].fvalue, e.grad, max_size);
+      } else {
+        sbuilder[base + 1].Push(c[j].fvalue, -e.grad, max_size);
+      }
+      sbuilder[base + 2].Push(c[j].fvalue, e.hess, max_size);
+    }
+    for (size_t i = 0; i < nexpand; ++i) {
+      const int nid = this->qexpand[i];
+      for (int k = 0; k < 3; ++k) {
+        sbuilder[3 * nid + k].Finalize(max_size);
+      }
+    }
+  }
+  /*!
+   * \brief all-reduce the statistics of the nodes in the expand queue
+   *  the statistics are packed in queue order, reduced, and written back
+   */
+  inline void SyncNodeStats(void) {
+    utils::Assert(!qexpand.empty(), "qexpand must not be empty");
+    const size_t nexpand = qexpand.size();
+    // pack the statistics of the queued nodes into one contiguous buffer
+    std::vector<SKStats> packed;
+    packed.reserve(nexpand);
+    for (size_t i = 0; i < nexpand; ++i) {
+      packed.push_back(node_stats[qexpand[i]]);
+    }
+    stats_reducer.Allreduce(BeginPtr(packed), packed.size());
+    // scatter the reduced values back to their nodes
+    for (size_t i = 0; i < nexpand; ++i) {
+      node_stats[qexpand[i]] = packed[i];
+    }
+  }
+  /*!
+   * \brief pick the best split of every node in the expand queue and apply it
+   * \param depth current depth, not used in this function
+   * \param gpair gradient pairs, not used in this function
+   * \param p_fmat feature matrix, not used in this function
+   * \param info booster information, not used in this function
+   * \param p_tree the tree to be modified
+   */
+  inline void FindSplit(int depth,
+                        const std::vector<bst_gpair> &gpair,
+                        IFMatrix *p_fmat,
+                        const BoosterInfo &info,
+                        RegTree *p_tree) {
+    const bst_uint num_feature = p_tree->param.num_feature;
+    const bst_omp_uint nexpand = static_cast<bst_omp_uint>(qexpand.size());
+    // best candidate found so far for every node in the expand queue
+    std::vector<SplitEntry> sol(qexpand.size());
+    #pragma omp parallel for schedule(dynamic, 1)
+    for (bst_omp_uint wid = 0; wid < nexpand; ++wid) {
+      const int nid = qexpand[wid];
+      utils::Assert(node2workindex[nid] == static_cast<int>(wid),
+                    "node2workindex inconsistent");
+      for (bst_uint fid = 0; fid < num_feature; ++fid) {
+        // three consecutive summaries per (work index, feature) pair
+        const unsigned base = (wid * p_tree->param.num_feature + fid) * 3;
+        EnumerateSplit(summary_array[base + 0],
+                       summary_array[base + 1],
+                       summary_array[base + 2],
+                       node_stats[nid], fid, &sol[wid]);
+      }
+    }
+    // apply the chosen candidates one node after another
+    RegTree &tree = *p_tree;
+    for (bst_omp_uint wid = 0; wid < nexpand; ++wid) {
+      const int nid = qexpand[wid];
+      const SplitEntry &best = sol[wid];
+      tree.stat(nid).loss_chg = best.loss_chg;
+      this->SetStats(nid, node_stats[nid], p_tree);
+      if (!(best.loss_chg > rt_eps)) {
+        // gain too small, the node stays a leaf
+        tree[nid].set_leaf(tree.stat(nid).base_weight * param.learning_rate);
+        continue;
+      }
+      // nodes are looked up again after AddChilds instead of keeping references
+      tree.AddChilds(nid);
+      tree[nid].set_split(best.split_index(),
+                          best.split_value, best.default_left());
+      // both children start as leaves with value 0, to indicate fresh leaf
+      tree[tree[nid].cleft()].set_leaf(0.0f, 0);
+      tree[tree[nid].cright()].set_leaf(0.0f, 0);
+    }
+  }
+  /*!
+   * \brief set statistics on ptree
+   * \param nid id of the node to be updated
+   * \param node_sum statistics of the node
+   * \param p_tree the tree that owns the node
+   */
+  inline void SetStats(int nid, const SKStats &node_sum, RegTree *p_tree) {
+    RegTree &tree = *p_tree;
+    const double weight = node_sum.CalcWeight(param);
+    tree.stat(nid).base_weight = static_cast<float>(weight);
+    tree.stat(nid).sum_hess = static_cast<float>(node_sum.sum_hess);
+    node_sum.SetLeafVec(param, tree.leafvec(nid));
+  }
+  /*!
+   * \brief enumerate split candidates of one feature for one node
+   * \param pos_grad summary built from the non-negative gradients
+   * \param neg_grad summary built from the negated negative gradients
+   * \param sum_hess summary built from the hessians
+   * \param node_sum statistics of the node to be split
+   * \param fid index of the feature
+   * \param best the best split found so far, updated in place
+   */
+  inline void EnumerateSplit(const WXQSketch::Summary &pos_grad,
+                             const WXQSketch::Summary &neg_grad,
+                             const WXQSketch::Summary &sum_hess,
+                             const SKStats &node_sum,
+                             bst_uint fid,
+                             SplitEntry *best) {
+    if (sum_hess.size == 0) return;
+    const double root_gain = node_sum.CalcGain(param);
+    // candidate thresholds: every value kept by any of the three summaries
+    const WXQSketch::Summary *parts[3] = {&pos_grad, &neg_grad, &sum_hess};
+    std::vector<bst_float> fsplits;
+    for (int k = 0; k < 3; ++k) {
+      for (size_t i = 0; i < parts[k]->size; ++i) {
+        fsplits.push_back(parts[k]->data[i].value);
+      }
+    }
+    std::sort(fsplits.begin(), fsplits.end());
+    fsplits.erase(std::unique(fsplits.begin(), fsplits.end()), fsplits.end());
+    // totals of the feature, taken from the last entry of each summary
+    SKStats feat_sum;
+    feat_sum.pos_grad = pos_grad.data[pos_grad.size - 1].rmax;
+    feat_sum.neg_grad = neg_grad.data[neg_grad.size - 1].rmax;
+    feat_sum.sum_hess = sum_hess.data[sum_hess.size - 1].rmax;
+    size_t ipos = 0, ineg = 0, ihess = 0;
+    // the smallest candidate is skipped, the scan starts at the second one
+    for (size_t i = 1; i < fsplits.size(); ++i) {
+      const bst_float cut = fsplits[i];
+      const WXQSketch::Entry pos = pos_grad.Query(cut, ipos);
+      const WXQSketch::Entry neg = neg_grad.Query(cut, ineg);
+      const WXQSketch::Entry hess = sum_hess.Query(cut, ihess);
+      SKStats part, rest;
+      part.pos_grad = 0.5f * (pos.rmin + pos.rmax - pos.wmin);
+      part.neg_grad = 0.5f * (neg.rmin + neg.rmax - neg.wmin);
+      part.sum_hess = 0.5f * (hess.rmin + hess.rmax - hess.wmin);
+      // forward: everything of the node outside part goes to rest
+      rest.SetSubstract(node_sum, part);
+      if (part.sum_hess >= param.min_child_weight &&
+          rest.sum_hess >= param.min_child_weight) {
+        const double loss_chg = part.CalcGain(param) + rest.CalcGain(param) - root_gain;
+        best->Update(static_cast<bst_float>(loss_chg), fid, cut, false);
+      }
+      // backward: rest is limited to the feature totals, part takes the remainder
+      rest.SetSubstract(feat_sum, part);
+      part.SetSubstract(node_sum, rest);
+      if (part.sum_hess >= param.min_child_weight &&
+          rest.sum_hess >= param.min_child_weight) {
+        const double loss_chg = part.CalcGain(param) + rest.CalcGain(param) - root_gain;
+        best->Update(static_cast<bst_float>(loss_chg), fid, cut, true);
+      }
+    }
+    // all including: the feature totals on one side, the remainder on the other
+    SKStats all_part = feat_sum, all_rest;
+    all_rest.SetSubstract(node_sum, all_part);
+    if (all_part.sum_hess >= param.min_child_weight &&
+        all_rest.sum_hess >= param.min_child_weight) {
+      const bst_float last = fsplits.back();
+      const double loss_chg = all_part.CalcGain(param) + all_rest.CalcGain(param) - root_gain;
+      // threshold placed strictly above the largest candidate
+      best->Update(static_cast<bst_float>(loss_chg), fid, last + fabsf(last) + 1.0f, false);
+    }
+  }
+
+  // thread temp data
+  // used to hold temporal sketch
+  std::vector< std::vector<SketchEntry> > thread_sketch;
+  // used to hold statistics
+  std::vector< std::vector<SKStats> > thread_stats;
+  // node statistics
+  std::vector<SKStats> node_stats;
+  // summary array
+  std::vector<WXQSketch::SummaryContainer> summary_array;
+  // reducer for node statistics
+  rabit::Reducer<SKStats, SKStats::Reduce> stats_reducer;
+  // reducer for summary
+  rabit::SerializeReducer<WXQSketch::SummaryContainer> sketch_reducer;
+  // per node, per feature sketch
+  std::vector< utils::WXQuantileSketch<bst_float, bst_float> > sketchs;
+};
+}  // namespace tree
+}  // namespace xgboost
+#endif  // XGBOOST_TREE_UPDATER_SKMAKER_INL_HPP_
--- a/old_src/tree/updater_sync-inl.hpp
+++ b/old_src/tree/updater_sync-inl.hpp
@@ -0,0 +1,56 @@
+/*!
+ * Copyright 2014 by Contributors
+ * \file updater_sync-inl.hpp
+ * \brief synchronize the tree in all distributed nodes
+ * \author Tianqi Chen
+ */
+#ifndef XGBOOST_TREE_UPDATER_SYNC_INL_HPP_
+#define XGBOOST_TREE_UPDATER_SYNC_INL_HPP_
+
+#include <vector>
+#include <string>
+#include <limits>
+#include "../sync/sync.h"
+#include "./updater.h"
+
+namespace xgboost {
+namespace tree {
+/*!
+ * \brief syncher that synchronizes the trees across all distributed nodes
+ *  can implement various strategies, so far every node takes node 0's trees
+ */
+class TreeSyncher: public IUpdater {
+ public:
+  virtual ~TreeSyncher(void) {}
+  /*!
+   * \brief set parameters from outside, this updater ignores all of them
+   * \param name name of the parameter
+   * \param val value of the parameter
+   */
+  virtual void SetParam(const char *name, const char *val) {
+  }
+  /*!
+   * \brief synchronize the trees, the training data is not touched
+   * \param gpair gradient pairs, not used
+   * \param p_fmat feature matrix, not used
+   * \param info booster information, not used
+   * \param trees the trees to be synchronized
+   */
+  virtual void Update(const std::vector<bst_gpair> &gpair,
+                      IFMatrix *p_fmat,
+                      const BoosterInfo &info,
+                      const std::vector<RegTree*> &trees) {
+    this->SyncTrees(trees);
+  }
+
+ private:
+  /*!
+   * \brief synchronize the trees in different nodes, take tree from rank 0
+   * \param trees the trees to be overwritten by the broadcast model
+   */
+  inline void SyncTrees(const std::vector<RegTree *> &trees) {
+    // a single worker has nothing to synchronize with
+    if (rabit::GetWorldSize() == 1) return;
+    typedef std::vector<RegTree *>::const_iterator TreeIter;
+    std::string s_model;
+    utils::MemoryBufferStream fs(&s_model);
+    const bool is_source = (rabit::GetRank() == 0);
+    if (is_source) {
+      // only rank 0 serializes its trees into the buffer
+      for (TreeIter it = trees.begin(); it != trees.end(); ++it) {
+        (*it)->SaveModel(fs);
+      }
+    }
+    fs.Seek(0);
+    rabit::Broadcast(&s_model, 0);
+    // every rank loads its trees from the buffer
+    for (TreeIter it = trees.begin(); it != trees.end(); ++it) {
+      (*it)->LoadModel(fs);
+    }
+  }
+};
+}  // namespace tree
+}  // namespace xgboost
+#endif  // XGBOOST_TREE_UPDATER_SYNC_INL_HPP_