[TREE] Move colmaker

2016-01-01 04:51:55 -08:00
parent c8ccb61b9e
commit 20043f63a6
10 changed files with 95 additions and 349 deletions
--- a/old_src/tree/param.h
+++ b/old_src/tree/param.h
@@ -1,429 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file param.h
- * \brief training parameters, statistics used to support tree construction
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_TREE_PARAM_H_
-#define XGBOOST_TREE_PARAM_H_
-
-#include <vector>
-#include <cstring>
-#include "../data.h"
-
-namespace xgboost {
-namespace tree {
-
-/*! \brief training parameters for regression tree */
-struct TrainParam{
-  // step size applied to the leaf weights of each tree; set through "eta" or "learning_rate"
-  float learning_rate;
-  // minimum loss change required for a split; set through "gamma" or "min_split_loss"
-  float min_split_loss;
-  // maximum depth of a tree
-  int max_depth;
-  //----- the remaining parameters are less important ----
-  // minimum amount of hessian(weight) allowed in a child
-  float min_child_weight;
-  // L2 regularization factor; set through "lambda" or "reg_lambda"
-  float reg_lambda;
-  // L1 regularization factor; set through "alpha" or "reg_alpha"
-  float reg_alpha;
-  // default direction choice: 0 = "learn", 1 = "left", 2 = "right" (see SetParam)
-  int default_direction;
-  // maximum delta update we can add in weight estimation;
-  // this parameter can be used to stabilize the update,
-  // default=0 means no constraint on weight delta
-  float max_delta_step;
-  // row subsample ratio (a fraction, not a flag); defaults to 1.0
-  float subsample;
-  // column subsample ratio for each level (a fraction, not a flag); defaults to 1.0
-  float colsample_bylevel;
-  // column subsample ratio for each tree (a fraction, not a flag); defaults to 1.0
-  float colsample_bytree;
-  // column density threshold consulted by need_forward_search (speed optimization for dense columns)
-  float opt_dense_col;
-  // accuracy of sketch
-  float sketch_eps;
-  // sketch size factor: max_sketch_size() returns sketch_ratio / sketch_eps
-  float sketch_ratio;
-  // leaf vector size
-  int size_leaf_vector;
-  // option for parallelization
-  int parallel_option;
-  // option to turn on cacheline optimization
-  int cache_opt;
-  // number of threads to be used for tree construction,
-  // if OpenMP is enabled, if equals 0, use system default
-  int nthread;
-  /*! \brief constructor, assigns the default value of every parameter */
-  TrainParam(void) {
-    learning_rate = 0.3f;
-    min_split_loss = 0.0f;
-    min_child_weight = 1.0f;
-    max_delta_step = 0.0f;
-    max_depth = 6;
-    reg_lambda = 1.0f;
-    reg_alpha = 0.0f;
-    default_direction = 0;
-    subsample = 1.0f;
-    colsample_bytree = 1.0f;
-    colsample_bylevel = 1.0f;
-    opt_dense_col = 1.0f;
-    nthread = 0;
-    size_leaf_vector = 0;
-    // enforce parallel option to 0 for now, the other strategy still needs investigation
-    parallel_option = 0;
-    sketch_eps = 0.1f;
-    sketch_ratio = 2.0f;
-    cache_opt = 1;
-  }
-  /*!
-   * \brief set parameters from outside
-   *   Names that match none of the comparisons below are ignored.
-   *   Numeric values are parsed with atof / atoi.
-   *   default_direction accepts "learn", "left" or "right"; any other
-   *   value leaves the field unchanged.
-   * \param name name of the parameter
-   * \param val  value of the parameter
-   */
-  inline void SetParam(const char *name, const char *val) {
-    using namespace std;
-    // short aliases: gamma, eta, lambda and alpha write the same fields as the long names below
-    if (!strcmp(name, "gamma")) min_split_loss = static_cast<float>(atof(val));
-    if (!strcmp(name, "eta")) learning_rate = static_cast<float>(atof(val));
-    if (!strcmp(name, "lambda")) reg_lambda = static_cast<float>(atof(val));
-    if (!strcmp(name, "alpha")) reg_alpha = static_cast<float>(atof(val));
-    if (!strcmp(name, "learning_rate")) learning_rate = static_cast<float>(atof(val));
-    if (!strcmp(name, "min_child_weight")) min_child_weight = static_cast<float>(atof(val));
-    if (!strcmp(name, "min_split_loss")) min_split_loss = static_cast<float>(atof(val));
-    if (!strcmp(name, "max_delta_step")) max_delta_step = static_cast<float>(atof(val));
-    if (!strcmp(name, "reg_lambda")) reg_lambda = static_cast<float>(atof(val));
-    if (!strcmp(name, "reg_alpha")) reg_alpha = static_cast<float>(atof(val));
-    if (!strcmp(name, "subsample")) subsample = static_cast<float>(atof(val));
-    if (!strcmp(name, "colsample_bylevel")) colsample_bylevel = static_cast<float>(atof(val));
-    if (!strcmp(name, "colsample_bytree")) colsample_bytree  = static_cast<float>(atof(val));
-    if (!strcmp(name, "sketch_eps")) sketch_eps  = static_cast<float>(atof(val));
-    if (!strcmp(name, "sketch_ratio")) sketch_ratio  = static_cast<float>(atof(val));
-    if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val));
-    if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val);
-    if (!strcmp(name, "cache_opt")) cache_opt = atoi(val);
-    if (!strcmp(name, "max_depth")) max_depth = atoi(val);
-    if (!strcmp(name, "nthread")) nthread = atoi(val);
-    if (!strcmp(name, "parallel_option")) parallel_option = atoi(val);
-    if (!strcmp(name, "default_direction")) {
-      if (!strcmp(val, "learn")) default_direction = 0;
-      if (!strcmp(val, "left")) default_direction = 1;
-      if (!strcmp(val, "right")) default_direction = 2;
-    }
-  }
-  /*! \brief calculate the gain of a node from its summed gradient and hessian.
-   *   Returns 0 when sum_hess is below min_child_weight.
-   *   With max_delta_step == 0 the closed form Sqr(G) / (sum_hess + reg_lambda) is used,
-   *   where G is sum_grad, passed through ThresholdL1 when reg_alpha is non-zero.
-   *   Otherwise the objective is evaluated at the (clipped) weight from CalcWeight
-   *   and multiplied by -2. */
-  inline double CalcGain(double sum_grad, double sum_hess) const {
-    if (sum_hess < min_child_weight) return 0.0;
-    if (max_delta_step == 0.0f) {
-      if (reg_alpha == 0.0f) {
-        return Sqr(sum_grad) / (sum_hess + reg_lambda);
-      } else {
-        return Sqr(ThresholdL1(sum_grad, reg_alpha)) / (sum_hess + reg_lambda);
-      }
-    } else {
-      double w = CalcWeight(sum_grad, sum_hess);
-      double ret = sum_grad * w + 0.5 * (sum_hess + reg_lambda) * Sqr(w);
-      if (reg_alpha == 0.0f) {
-        return - 2.0 * ret;
-      } else {
-        return - 2.0 * (ret + reg_alpha * std::abs(w));
-      }
-    }
-  }
-  /*! \brief calculate gain with four statistics: the weight is derived from
-   *   (sum_grad, sum_hess) and the objective is then evaluated on (test_grad, test_hess) */
-  inline double CalcGain(double sum_grad, double sum_hess,
-                         double test_grad, double test_hess) const {
-    double w = CalcWeight(sum_grad, sum_hess);
-    double ret = test_grad * w  + 0.5 * (test_hess + reg_lambda) * Sqr(w);
-    if (reg_alpha == 0.0f) {
-      return - 2.0 * ret;
-    } else {
-      return - 2.0 * (ret + reg_alpha * std::abs(w));
-    }
-  }
-  /*! \brief calculate the leaf weight given the statistics.
-   *   Returns 0 when sum_hess is below min_child_weight; otherwise
-   *   -G / (sum_hess + reg_lambda), with G passed through ThresholdL1 when reg_alpha
-   *   is non-zero, clipped to [-max_delta_step, max_delta_step] when max_delta_step != 0. */
-  inline double CalcWeight(double sum_grad, double sum_hess) const {
-    if (sum_hess < min_child_weight) return 0.0;
-    double dw;
-    if (reg_alpha == 0.0f) {
-      dw = -sum_grad / (sum_hess + reg_lambda);
-    } else {
-      dw = -ThresholdL1(sum_grad, reg_alpha) / (sum_hess + reg_lambda);
-    }
-    if (max_delta_step != 0.0f) {
-      if (dw > max_delta_step) dw = max_delta_step;
-      if (dw < -max_delta_step) dw = -max_delta_step;
-    }
-    return dw;
-  }
-  /*! \brief whether a forward (small to big) search is needed, i.e. default right:
-   *   always when default_direction is 2; when it is 0 only for a column whose density
-   *   is below opt_dense_col and that is not flagged as an indicator */
-  inline bool need_forward_search(float col_density, bool indicator) const {
-    return this->default_direction == 2 ||
-        (default_direction == 0 && (col_density < opt_dense_col) && !indicator);
-  }
-  /*! \brief whether a backward (big to small) search is needed, i.e. default left:
-   *   true unless default_direction is 2; both arguments are currently unused */
-  inline bool need_backward_search(float col_density, bool indicator) const {
-    return this->default_direction != 2;
-  }
-  /*! \brief given the loss change, whether we need to invoke pruning; depth is currently unused */
-  inline bool need_prune(double loss_chg, int depth) const {
-    return loss_chg < this->min_split_loss;
-  }
-  /*! \brief whether the hessian sum is too small to give both children min_child_weight;
-   *   depth is currently unused */
-  inline bool cannot_split(double sum_hess, int depth) const {
-    return sum_hess < this->min_child_weight * 2.0;
-  }
-  /*! \brief maximum sketch size: sketch_ratio / sketch_eps converted to unsigned,
-   *   checked to be positive */
-  inline unsigned max_sketch_size(void) const {
-    unsigned ret = static_cast<unsigned>(sketch_ratio / sketch_eps);
-    utils::Check(ret > 0, "sketch_ratio/sketch_eps must be bigger than 1");
-    return ret;
-  }
-
- protected:
-  // soft threshold for the L1 cost: moves w toward zero by lambda, giving 0 inside [-lambda, lambda]
-  inline static double ThresholdL1(double w, double lambda) {
-    if (w > +lambda) return w - lambda;
-    if (w < -lambda) return w + lambda;
-    return 0.0;
-  }
-  inline static double Sqr(double a) {  // square of a
-    return a * a;
-  }
-};
-
-/*! \brief gradient / hessian accumulator, the core statistics used for tree construction */
-struct GradStats {
-  /*! \brief running sum of gradients */
-  double sum_grad;
-  /*! \brief running sum of hessians */
-  double sum_hess;
-  /*!
-   * \brief marks these as simple statistics, for which calling
-   *   Add(gpair) is enough and Add(gpair, info, ridx) is not needed
-   */
-  static const int kSimpleStats = 1;
-  /*! \brief default constructor that exists to allow inheritance; the sums are left unset */
-  GradStats(void) {}
-  /*! \brief constructor that starts from cleared sums; param is not consulted */
-  explicit GradStats(const TrainParam &param) {
-    sum_grad = 0.0;
-    sum_hess = 0.0;
-  }
-  /*! \brief reset both sums to zero */
-  inline void Clear(void) {
-    sum_grad = 0.0;
-    sum_hess = 0.0;
-  }
-  /*! \brief check if necessary information is ready; nothing is required here */
-  inline static void CheckInfo(const BoosterInfo &info) {
-  }
-  /*! \brief add one (gradient, hessian) observation to the sums */
-  inline void Add(double grad, double hess) {
-    sum_grad += grad;
-    sum_hess += hess;
-  }
-  /*!
-   * \brief accumulate one gradient pair
-   * \param p the gradient pair
-   */
-  inline void Add(bst_gpair p) {
-    const double grad = p.grad;
-    const double hess = p.hess;
-    sum_grad += grad;
-    sum_hess += hess;
-  }
-  /*!
-   * \brief accumulate the gradient pair of one instance, general signature
-   * \param gpair the vector storing the gradient statistics
-   * \param info the additional information, not used by this type
-   * \param ridx instance index of this instance
-   */
-  inline void Add(const std::vector<bst_gpair> &gpair,
-                  const BoosterInfo &info,
-                  bst_uint ridx) {
-    const bst_gpair &entry = gpair[ridx];
-    const double grad = entry.grad;
-    const double hess = entry.hess;
-    sum_grad += grad;
-    sum_hess += hess;
-  }
-  /*! \brief merge another accumulator into this one */
-  inline void Add(const GradStats &b) {
-    sum_grad += b.sum_grad;
-    sum_hess += b.sum_hess;
-  }
-  /*! \brief leaf weight implied by the accumulated sums */
-  inline double CalcWeight(const TrainParam &param) const {
-    return param.CalcWeight(this->sum_grad, this->sum_hess);
-  }
-  /*! \brief gain implied by the accumulated sums */
-  inline double CalcGain(const TrainParam &param) const {
-    return param.CalcGain(this->sum_grad, this->sum_hess);
-  }
-  /*! \brief reduction operator used in All Reduce, equivalent to a.Add(b) */
-  inline static void Reduce(GradStats &a, const GradStats &b) { // NOLINT(*)
-    a.sum_grad += b.sum_grad;
-    a.sum_hess += b.sum_hess;
-  }
-  /*! \brief set current value to a - b */
-  inline void SetSubstract(const GradStats &a, const GradStats &b) {
-    const double grad_diff = a.sum_grad - b.sum_grad;
-    const double hess_diff = a.sum_hess - b.sum_hess;
-    sum_grad = grad_diff;
-    sum_hess = hess_diff;
-  }
-  /*! \return whether the statistics is not used yet (hessian sum is exactly zero) */
-  inline bool Empty(void) const {
-    return 0.0 == sum_hess;
-  }
-  /*! \brief set leaf vector value based on statistics; this type writes nothing */
-  inline void SetLeafVec(const TrainParam &param, bst_float *vec) const {
-  }
-};
-
-/*!
- * \brief vectorized cv statistics: on top of the plain sums it keeps, for each
- *   of the vsize folds, one accumulator for training rows and one for validation rows
- * \tparam vsize number of folds, must equal TrainParam::size_leaf_vector
- */
-template<unsigned vsize>
-struct CVGradStats : public GradStats {
-  // per-fold statistics: train[i] / valid[i] belong to fold i
-  GradStats train[vsize], valid[vsize];
-  /*! \brief constructor, checks vsize against param.size_leaf_vector and clears all sums */
-  explicit CVGradStats(const TrainParam &param) {
-    utils::Check(param.size_leaf_vector == vsize,
-                 "CVGradStats: vsize must match size_leaf_vector");
-    this->Clear();
-  }
-  /*! \brief check if necessary information is ready: fold_index must be non-empty */
-  inline static void CheckInfo(const BoosterInfo &info) {
-    utils::Check(info.fold_index.size() != 0,
-                 "CVGradStats: require fold_index");
-  }
-  /*! \brief clear the overall statistics and every per-fold accumulator */
-  inline void Clear(void) {
-    GradStats::Clear();
-    for (unsigned i = 0; i < vsize; ++i) {
-      train[i].Clear(); valid[i].Clear();
-    }
-  }
-  /*!
-   * \brief accumulate the statistics of one instance
-   *   gpair[ridx] goes into the overall sums; the pair for fold i is read from
-   *   gpair[(i + 1) * info.fold_index.size() + ridx] and goes to valid[i] when
-   *   info.fold_index[ridx] == i, to train[i] otherwise
-   * \param gpair the vector storing the gradient statistics
-   * \param info the additional information, supplies fold_index
-   * \param ridx instance index of this instance
-   */
-  inline void Add(const std::vector<bst_gpair> &gpair,
-                  const BoosterInfo &info,
-                  bst_uint ridx) {
-    GradStats::Add(gpair[ridx].grad, gpair[ridx].hess);
-    const size_t step = info.fold_index.size();
-    for (unsigned i = 0; i < vsize; ++i) {
-      const bst_gpair &b = gpair[(i + 1) * step + ridx];
-      if (info.fold_index[ridx] == i) {
-        valid[i].Add(b.grad, b.hess);
-      } else {
-        train[i].Add(b.grad, b.hess);
-      }
-    }
-  }
-  /*!
-   * \brief calculate gain of the solution: the average over folds of the
-   *   four-statistics gain, where the weight comes from train[i] and is
-   *   evaluated on valid[i] scaled by vsize
-   */
-  inline double CalcGain(const TrainParam &param) const {
-    double ret = 0.0;
-    for (unsigned i = 0; i < vsize; ++i) {
-      ret += param.CalcGain(train[i].sum_grad,
-                            train[i].sum_hess,
-                            vsize * valid[i].sum_grad,
-                            vsize * valid[i].sum_hess);
-    }
-    return ret / vsize;
-  }
-  /*! \brief add statistics to the data, fold by fold */
-  inline void Add(const CVGradStats &b) {
-    GradStats::Add(b);
-    for (unsigned i = 0; i < vsize; ++i) {
-      train[i].Add(b.train[i]);
-      valid[i].Add(b.valid[i]);
-    }
-  }
-  /*! \brief same as add, reduce is used in All Reduce */
-  inline static void Reduce(CVGradStats &a, const CVGradStats &b) { // NOLINT(*)
-    a.Add(b);
-  }
-  /*! \brief set current value to a - b, fold by fold */
-  inline void SetSubstract(const CVGradStats &a, const CVGradStats &b) {
-    GradStats::SetSubstract(a, b);
-    // unsigned index: vsize is unsigned, matching the other loops of this struct
-    for (unsigned i = 0; i < vsize; ++i) {
-      train[i].SetSubstract(a.train[i], b.train[i]);
-      valid[i].SetSubstract(a.valid[i], b.valid[i]);
-    }
-  }
-  /*!
-   * \brief set leaf vector value based on statistics:
-   *   vec[i] = learning_rate * weight computed from train[i]
-   * \param param training parameters
-   * \param vec output buffer, written at indices 0 .. vsize - 1
-   */
-  inline void SetLeafVec(const TrainParam &param, bst_float *vec) const{
-    // unsigned index: vsize is unsigned, matching the other loops of this struct
-    for (unsigned i = 0; i < vsize; ++i) {
-      vec[i] = param.learning_rate *
-          param.CalcWeight(train[i].sum_grad, train[i].sum_hess);
-    }
-  }
-};
-
-/*!
- * \brief record of a candidate split of a tree node:
- *   the loss change it achieves, the feature it splits on and the threshold
- */
-struct SplitEntry{
-  /*! \brief loss change obtained by splitting this node */
-  bst_float loss_chg;
-  /*! \brief split index; the top bit (bit 31) stores the default-left flag */
-  unsigned sindex;
-  /*! \brief split value */
-  float split_value;
-  /*! \brief constructor, starts from an all-zero entry */
-  SplitEntry(void) : loss_chg(0.0f), sindex(0), split_value(0.0f) {}
-  /*!
-   * \brief decides whether the given candidate should replace the current entry.
-   *   On equal loss change the lower feature index wins. Not the best rule,
-   *   but it keeps the result consistent during multi-thread execution.
-   * \param new_loss_chg the loss reduction obtained through the candidate split
-   * \param split_index the feature index the candidate splits on
-   */
-  inline bool NeedReplace(bst_float new_loss_chg, unsigned split_index) const {
-    const bool candidate_has_lower_index = split_index < this->split_index();
-    // a candidate with a lower index also wins ties, otherwise it must be strictly better
-    return candidate_has_lower_index ? !(this->loss_chg > new_loss_chg)
-                                     : (new_loss_chg > this->loss_chg);
-  }
-  /*!
-   * \brief update the split entry, replace it if e is better
-   * \param e candidate split solution
-   * \return whether the proposed split is better and can replace current split
-   */
-  inline bool Update(const SplitEntry &e) {
-    if (!this->NeedReplace(e.loss_chg, e.split_index())) {
-      return false;
-    }
-    this->loss_chg = e.loss_chg;
-    this->sindex = e.sindex;
-    this->split_value = e.split_value;
-    return true;
-  }
-  /*!
-   * \brief update the split entry, replace it if the candidate is better
-   * \param new_loss_chg loss reduction of new candidate
-   * \param split_index feature index to split on
-   * \param new_split_value the split point
-   * \param default_left whether the missing value goes to left
-   * \return whether the proposed split is better and can replace current split
-   */
-  inline bool Update(bst_float new_loss_chg, unsigned split_index,
-                     float new_split_value, bool default_left) {
-    if (!this->NeedReplace(new_loss_chg, split_index)) {
-      return false;
-    }
-    this->loss_chg = new_loss_chg;
-    // pack the default direction into the top bit of the stored index
-    this->sindex = default_left ? (split_index | (1U << 31)) : split_index;
-    this->split_value = new_split_value;
-    return true;
-  }
-  /*! \brief same as update, used by AllReduce*/
-  inline static void Reduce(SplitEntry &dst, const SplitEntry &src) { // NOLINT(*)
-    dst.Update(src);
-  }
-  /*!\return feature index to split on, i.e. sindex without the top bit */
-  inline unsigned split_index(void) const {
-    const unsigned index_mask = (1U << 31) - 1U;
-    return sindex & index_mask;
-  }
-  /*!\return whether missing value goes to left branch, i.e. the top bit of sindex */
-  inline bool default_left(void) const {
-    const unsigned flag = sindex >> 31;
-    return flag != 0;
-  }
-};
-
-}  // namespace tree
-}  // namespace xgboost
-#endif  // XGBOOST_TREE_PARAM_H_
--- a/old_src/tree/updater_colmaker-inl.hpp
+++ b/old_src/tree/updater_colmaker-inl.hpp
@@ -1,732 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file updater_colmaker-inl.hpp
- * \brief use columnwise update to construct a tree
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_TREE_UPDATER_COLMAKER_INL_HPP_
-#define XGBOOST_TREE_UPDATER_COLMAKER_INL_HPP_
-
-#include <vector>
-#include <cmath>
-#include <algorithm>
-#include "./param.h"
-#include "./updater.h"
-#include "../utils/omp.h"
-#include "../utils/random.h"
-
-namespace xgboost {
-namespace tree {
-/*! \brief column-wise update to construct a tree */
-template<typename TStats>
-class ColMaker: public IUpdater {
- public:
-  virtual ~ColMaker(void) {}  // nothing to release here
-  // forward a (name, value) pair to the training parameters
-  virtual void SetParam(const char *name, const char *val) {
-    this->param.SetParam(name, val);
-  }
-  // grow every tree in trees with a fresh Builder; while doing so the
-  // learning rate is divided by the number of trees, and restored afterwards
-  virtual void Update(const std::vector<bst_gpair> &gpair,
-                      IFMatrix *p_fmat,
-                      const BoosterInfo &info,
-                      const std::vector<RegTree*> &trees) {
-    TStats::CheckInfo(info);
-    // remember the configured rate so it can be put back once all trees are built
-    const float saved_rate = param.learning_rate;
-    param.learning_rate = saved_rate / trees.size();
-    for (typename std::vector<RegTree*>::const_iterator it = trees.begin();
-         it != trees.end(); ++it) {
-      Builder builder(param);
-      builder.Update(gpair, p_fmat, info, *it);
-    }
-    param.learning_rate = saved_rate;
-  }
-
- protected:
-  // training parameter
-  TrainParam param;
-  // data structure
-  /*! \brief per thread x per node entry to store tmp data */
-  struct ThreadEntry {
-    /*! \brief statistics of data */
-    TStats stats;
-    /*! \brief extra statistics of data */
-    TStats stats_extra;
-    /*! \brief last feature value scanned */
-    float  last_fvalue;
-    /*! \brief first feature value scanned */
-    float  first_fvalue;
-    /*! \brief current best solution */
-    SplitEntry best;
-    /*!
-     * \brief constructor
-     *   The two feature values start at zero so that code which swaps or
-     *   compares them before any value was scanned never reads an
-     *   indeterminate float.
-     * \param param training parameters handed to both statistics members
-     */
-    explicit ThreadEntry(const TrainParam &param)
-        : stats(param), stats_extra(param),
-          last_fvalue(0.0f), first_fvalue(0.0f) {
-    }
-  };
-  /*! \brief per node entry holding the statistics and best split of a tree node */
-  struct NodeEntry {
-    /*! \brief statistics of the data in this node */
-    TStats stats;
-    /*! \brief gain of this node when it is not split */
-    bst_float root_gain;
-    /*! \brief weight calculated from the current data */
-    float weight;
-    /*! \brief best split found so far */
-    SplitEntry best;
-    /*! \brief constructor, gain and weight start at zero */
-    explicit NodeEntry(const TrainParam &param)
-        : stats(param),
-          root_gain(0.0f),
-          weight(0.0f) {}
-  };
-  // actual builder that runs the algorithm
-  struct Builder{
-   public:
-    // constructor, stores the training parameters used while growing the tree
-    explicit Builder(const TrainParam &param)
-        : param(param) {
-    }
-    // grow one tree level by level, then turn the remaining frontier into
-    // leaves and copy the per-node statistics into the tree
-    virtual void Update(const std::vector<bst_gpair> &gpair,
-                        IFMatrix *p_fmat,
-                        const BoosterInfo &info,
-                        RegTree *p_tree) {
-      RegTree &tree = *p_tree;
-      this->InitData(gpair, *p_fmat, info.root_index, tree);
-      this->InitNewNode(qexpand_, gpair, *p_fmat, info, tree);
-      int depth = 0;
-      while (depth < param.max_depth) {
-        this->FindSplit(depth, qexpand_, gpair, p_fmat, info, p_tree);
-        this->ResetPosition(qexpand_, p_fmat, tree);
-        this->UpdateQueueExpand(tree, &qexpand_);
-        this->InitNewNode(qexpand_, gpair, *p_fmat, info, tree);
-        // stop as soon as there is no node left to expand
-        if (qexpand_.size() == 0) break;
-        ++depth;
-      }
-      // whatever is still queued for expansion becomes a leaf
-      for (size_t k = 0; k < qexpand_.size(); ++k) {
-        const int leaf_id = qexpand_[k];
-        tree[leaf_id].set_leaf(snode[leaf_id].weight * param.learning_rate);
-      }
-      // store the auxiliary statistics of every node in the tree
-      for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) {
-        NodeEntry &entry = snode[nid];
-        p_tree->stat(nid).loss_chg = entry.best.loss_chg;
-        p_tree->stat(nid).base_weight = entry.weight;
-        p_tree->stat(nid).sum_hess = static_cast<float>(entry.stats.sum_hess);
-        entry.stats.SetLeafVec(param, p_tree->leafvec(nid));
-      }
-    }
-
-   protected:
-    /* initialize the temporary data structures for growing one tree:
-     * row positions, the sampled feature list, per-thread scratch space
-     * and the expansion queue (seeded with the root nodes) */
-    inline void InitData(const std::vector<bst_gpair> &gpair,
-                         const IFMatrix &fmat,
-                         const std::vector<unsigned> &root_index,
-                         const RegTree &tree) {
-      utils::Assert(tree.param.num_nodes == tree.param.num_roots,
-                    "ColMaker: can only grow new tree");
-      const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
-      {
-        // setup position: every buffered row starts in root 0, or in its root_index entry when given
-        position.resize(gpair.size());
-        if (root_index.size() == 0) {
-          for (size_t i = 0; i < rowset.size(); ++i) {
-            position[rowset[i]] = 0;
-          }
-        } else {
-          for (size_t i = 0; i < rowset.size(); ++i) {
-            const bst_uint ridx = rowset[i];
-            position[ridx] = root_index[ridx];
-            utils::Assert(root_index[ridx] < (unsigned)tree.param.num_roots,
-                          "root index exceed setting");
-          }
-        }
-        // rows with a negative hessian count as deleted: their position is bit-flipped (~), making it negative
-        for (size_t i = 0; i < rowset.size(); ++i) {
-          const bst_uint ridx = rowset[i];
-          if (gpair[ridx].hess < 0.0f) position[ridx] = ~position[ridx];
-        }
-        // row subsampling: rows not drawn by SampleBinary are flipped the same way; deleted rows are skipped
-        if (param.subsample < 1.0f) {
-          for (size_t i = 0; i < rowset.size(); ++i) {
-            const bst_uint ridx = rowset[i];
-            if (gpair[ridx].hess < 0.0f) continue;
-            if (random::SampleBinary(param.subsample) == 0) position[ridx] = ~position[ridx];
-          }
-        }
-      }
-      {
-        // initialize feature index with the non-empty columns, then keep a shuffled colsample_bytree share
-        unsigned ncol = static_cast<unsigned>(fmat.NumCol());
-        for (unsigned i = 0; i < ncol; ++i) {
-          if (fmat.GetColSize(i) != 0) {
-            feat_index.push_back(i);
-          }
-        }
-        unsigned n = static_cast<unsigned>(param.colsample_bytree * feat_index.size());
-        random::Shuffle(feat_index);
-        utils::Check(n > 0, "colsample_bytree=%g is too small that no feature can be included",
-                     param.colsample_bytree);
-        feat_index.resize(n);
-      }
-      {
-        // setup temp space for each thread; the thread count is read inside a parallel region
-        #pragma omp parallel
-        {
-          this->nthread = omp_get_num_threads();
-        }
-        // reserve a small space in every per-thread buffer and in the node buffer
-        stemp.clear();
-        stemp.resize(this->nthread, std::vector<ThreadEntry>());
-        for (size_t i = 0; i < stemp.size(); ++i) {
-          stemp[i].clear(); stemp[i].reserve(256);
-        }
-        snode.reserve(256);
-      }
-      {
-        // expansion queue: start from all root nodes
-        qexpand_.reserve(256); qexpand_.clear();
-        for (int i = 0; i < tree.param.num_roots; ++i) {
-          qexpand_.push_back(i);
-        }
-      }
-    }
-    /*!
-     * \brief initialize the base_weight, root_gain,
-     *  and NodeEntry for all the new nodes in qexpand
-     *  The statistics of each row are first accumulated per thread,
-     *  then the per-thread sums of every node in qexpand are added up.
-     */
-    inline void InitNewNode(const std::vector<int> &qexpand,
-                            const std::vector<bst_gpair> &gpair,
-                            const IFMatrix &fmat,
-                            const BoosterInfo &info,
-                            const RegTree &tree) {
-      {
-        // grow the per-thread and per-node buffers so that every tree node has an entry
-        for (size_t i = 0; i < stemp.size(); ++i) {
-          stemp[i].resize(tree.param.num_nodes, ThreadEntry(param));
-        }
-        snode.resize(tree.param.num_nodes, NodeEntry(param));
-      }
-      const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
-      // add each active row to the statistics of the node it currently sits in
-      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < ndata; ++i) {
-        const bst_uint ridx = rowset[i];
-        const int tid = omp_get_thread_num();
-        if (position[ridx] < 0) continue;  // negative position: row is not used
-        stemp[tid][position[ridx]].stats.Add(gpair, info, ridx);
-      }
-      // sum the per thread statistics together
-      for (size_t j = 0; j < qexpand.size(); ++j) {
-        const int nid = qexpand[j];
-        TStats stats(param);
-        for (size_t tid = 0; tid < stemp.size(); ++tid) {
-          stats.Add(stemp[tid][nid].stats);
-        }
-        // update node statistics, together with the gain and weight derived from them
-        snode[nid].stats = stats;
-        snode[nid].root_gain = static_cast<float>(stats.CalcGain(param));
-        snode[nid].weight = static_cast<float>(stats.CalcWeight(param));
-      }
-    }
-    /*! \brief replace the expand queue by the children of its nodes that were split */
-    inline void UpdateQueueExpand(const RegTree &tree, std::vector<int> *p_qexpand) {
-      std::vector<int> next_level;
-      for (std::vector<int>::const_iterator it = p_qexpand->begin();
-           it != p_qexpand->end(); ++it) {
-        const int nid = *it;
-        // a node that stayed a leaf contributes nothing to the next level
-        if (tree[nid].is_leaf()) continue;
-        next_level.push_back(tree[nid].cleft());
-        next_level.push_back(tree[nid].cright());
-      }
-      // the children become the new queue
-      *p_qexpand = next_level;
-    }
-    /* parallel find the best split of current fid
-     * this function does not support nested functions
-     *
-     * The column is cut into one contiguous slice per thread and processed in three phases:
-     *  1. every thread sums the statistics of its slice per node and records the
-     *     first and last feature value it saw for that node;
-     *  2. per node, the per-thread sums are turned into running sums (each thread
-     *     receives the sum of all earlier threads, the total goes to stats_extra),
-     *     the boundary feature values are exchanged between neighbouring threads,
-     *     and the splits lying on the slice boundaries are evaluated;
-     *  3. every thread rescans its slice, continuing from its running sum, and
-     *     evaluates a split between each pair of distinct neighbouring values.
-     * Candidates are recorded in the per-thread `best` entries of stemp. */
-    inline void ParallelFindSplit(const ColBatch::Inst &col,
-                                  bst_uint fid,
-                                  const IFMatrix &fmat,
-                                  const std::vector<bst_gpair> &gpair,
-                                  const BoosterInfo &info) {
-      // ind: the column is non-empty and its first and last stored values are equal
-      const bool ind = col.length != 0 && col.data[0].fvalue == col.data[col.length - 1].fvalue;
-      bool need_forward = param.need_forward_search(fmat.GetColDensity(fid), ind);
-      bool need_backward = param.need_backward_search(fmat.GetColDensity(fid), ind);
-      const std::vector<int> &qexpand = qexpand_;
-      // phase 1: per-thread partial sums over contiguous slices of the column
-      #pragma omp parallel
-      {
-        const int tid = omp_get_thread_num();
-        std::vector<ThreadEntry> &temp = stemp[tid];
-        // cleanup temp statistics
-        for (size_t j = 0; j < qexpand.size(); ++j) {
-          temp[qexpand[j]].stats.Clear();
-        }
-        nthread = omp_get_num_threads();
-        bst_uint step = (col.length + nthread - 1) / nthread;
-        bst_uint end = std::min(col.length, step * (tid + 1));
-        for (bst_uint i = tid * step; i < end; ++i) {
-          const bst_uint ridx = col[i].index;
-          const int nid = position[ridx];
-          if (nid < 0) continue;
-          const float fvalue = col[i].fvalue;
-          if (temp[nid].stats.Empty()) {
-            temp[nid].first_fvalue = fvalue;
-          }
-          temp[nid].stats.Add(gpair, info, ridx);
-          temp[nid].last_fvalue = fvalue;
-        }
-      }
-      // phase 2: start collecting the partial sum statistics
-      bst_omp_uint nnode = static_cast<bst_omp_uint>(qexpand.size());
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint j = 0; j < nnode; ++j) {
-        const int nid = qexpand[j];
-        TStats sum(param), tmp(param), c(param);
-        for (int tid = 0; tid < nthread; ++tid) {
-          // replace the slice sum by the sum of all earlier slices
-          tmp = stemp[tid][nid].stats;
-          stemp[tid][nid].stats = sum;
-          sum.Add(tmp);
-          if (tid != 0) {
-            // after the swap, first_fvalue of this thread holds the last value of the previous slice
-            std::swap(stemp[tid - 1][nid].last_fvalue, stemp[tid][nid].first_fvalue);
-          }
-        }
-        for (int tid = 0; tid < nthread; ++tid) {
-          stemp[tid][nid].stats_extra = sum;
-          ThreadEntry &e = stemp[tid][nid];
-          float fsplit;
-          if (tid != 0) {
-            if (std::abs(stemp[tid - 1][nid].last_fvalue - e.first_fvalue) > rt_2eps) {
-              // midpoint of the two values on either side of the slice boundary,
-              // the same rule the rescan below applies inside a slice
-              fsplit = (stemp[tid - 1][nid].last_fvalue + e.first_fvalue) * 0.5f;
-            } else {
-              continue;
-            }
-          } else {
-            fsplit = e.first_fvalue - rt_eps;
-          }
-          if (need_forward && tid != 0) {
-            c.SetSubstract(snode[nid].stats, e.stats);
-            if (c.sum_hess >= param.min_child_weight &&
-                e.stats.sum_hess >= param.min_child_weight) {
-              bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) +
-                                                          c.CalcGain(param) - snode[nid].root_gain);
-              e.best.Update(loss_chg, fid, fsplit, false);
-            }
-          }
-          if (need_backward) {
-            tmp.SetSubstract(sum, e.stats);
-            c.SetSubstract(snode[nid].stats, tmp);
-            if (c.sum_hess >= param.min_child_weight &&
-                tmp.sum_hess >= param.min_child_weight) {
-              bst_float loss_chg = static_cast<bst_float>(tmp.CalcGain(param) +
-                                                          c.CalcGain(param) - snode[nid].root_gain);
-              e.best.Update(loss_chg, fid, fsplit, true);
-            }
-          }
-        }
-        if (need_backward) {
-          // backward candidate placed just above the last value of the last slice
-          tmp = sum;
-          ThreadEntry &e = stemp[nthread-1][nid];
-          c.SetSubstract(snode[nid].stats, tmp);
-          if (c.sum_hess >= param.min_child_weight &&
-              tmp.sum_hess >= param.min_child_weight) {
-            bst_float loss_chg = static_cast<bst_float>(tmp.CalcGain(param) +
-                                                        c.CalcGain(param) - snode[nid].root_gain);
-            e.best.Update(loss_chg, fid, e.last_fvalue + rt_eps, true);
-          }
-        }
-      }
-      // phase 3: rescan, generate candidate split
-      #pragma omp parallel
-      {
-        TStats c(param), cright(param);
-        const int tid = omp_get_thread_num();
-        std::vector<ThreadEntry> &temp = stemp[tid];
-        nthread = static_cast<bst_uint>(omp_get_num_threads());
-        bst_uint step = (col.length + nthread - 1) / nthread;
-        bst_uint end = std::min(col.length, step * (tid + 1));
-        for (bst_uint i = tid * step; i < end; ++i) {
-          const bst_uint ridx = col[i].index;
-          const int nid = position[ridx];
-          if (nid < 0) continue;
-          const float fvalue = col[i].fvalue;
-          // get the statistics of nid
-          ThreadEntry &e = temp[nid];
-          if (e.stats.Empty()) {
-            e.stats.Add(gpair, info, ridx);
-            e.first_fvalue = fvalue;
-          } else {
-            // in this phase first_fvalue holds the previously scanned value
-            if (std::abs(fvalue - e.first_fvalue) > rt_2eps) {
-              // forward default right
-              if (need_forward) {
-                c.SetSubstract(snode[nid].stats, e.stats);
-                if (c.sum_hess >= param.min_child_weight &&
-                    e.stats.sum_hess >= param.min_child_weight) {
-                  bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) +
-                                                              c.CalcGain(param) -
-                                                              snode[nid].root_gain);
-                  e.best.Update(loss_chg, fid, (fvalue + e.first_fvalue) * 0.5f, false);
-                }
-              }
-              // backward default left
-              if (need_backward) {
-                cright.SetSubstract(e.stats_extra, e.stats);
-                c.SetSubstract(snode[nid].stats, cright);
-                if (c.sum_hess >= param.min_child_weight &&
-                    cright.sum_hess >= param.min_child_weight) {
-                  bst_float loss_chg = static_cast<bst_float>(cright.CalcGain(param) +
-                                                              c.CalcGain(param) -
-                                                              snode[nid].root_gain);
-                  e.best.Update(loss_chg, fid, (fvalue + e.first_fvalue) * 0.5f, true);
-                }
-              }
-            }
-            e.stats.Add(gpair, info, ridx);
-            e.first_fvalue = fvalue;
-          }
-        }
-      }
-    }
-    // feed one (row statistics, feature value) observation of node nid into the
-    // enumeration state, proposing a split between the previous value and this one
-    inline void UpdateEnumeration(int nid, bst_gpair gstats,
-                                  float fvalue, int d_step, bst_uint fid,
-                                  TStats &c, std::vector<ThreadEntry> &temp) { // NOLINT(*)
-      ThreadEntry &entry = temp[nid];
-      // an empty entry means first hit (statistics are zeroed beforehand): nothing to split yet
-      if (!entry.stats.Empty()) {
-        const bool value_changed = std::abs(fvalue - entry.last_fvalue) > rt_2eps;
-        const bool scanned_side_ok = entry.stats.sum_hess >= param.min_child_weight;
-        if (value_changed && scanned_side_ok) {
-          // c receives the statistics of the rows not scanned so far
-          c.SetSubstract(snode[nid].stats, entry.stats);
-          if (c.sum_hess >= param.min_child_weight) {
-            const double gain_sum = entry.stats.CalcGain(param) + c.CalcGain(param);
-            bst_float loss_chg = static_cast<bst_float>(gain_sum - snode[nid].root_gain);
-            const float midpoint = (fvalue + entry.last_fvalue) * 0.5f;
-            entry.best.Update(loss_chg, fid, midpoint, d_step == -1);
-          }
-        }
-      }
-      // in every case the observation joins the scanned statistics
-      entry.stats.Add(gstats);
-      entry.last_fvalue = fvalue;
-    }
-    // same as EnumerateSplit, with cacheline prefetch optimization
-    inline void EnumerateSplitCacheOpt(const ColBatch::Entry *begin,
-                                       const ColBatch::Entry *end,
-                                       int d_step,
-                                       bst_uint fid,
-                                       const std::vector<bst_gpair> &gpair,
-                                       std::vector<ThreadEntry> &temp) { // NOLINT(*)
-      const std::vector<int> &qexpand = qexpand_;
-      // clear all the temp statistics
-      for (size_t j = 0; j < qexpand.size(); ++j) {
-        temp[qexpand[j]].stats.Clear();
-      }
-      // left statistics
-      TStats c(param);
-      // local cache buffer for position and gradient pair
-      const int kBuffer = 32;
-      int buf_position[kBuffer];
-      bst_gpair buf_gpair[kBuffer];
-      // aligned ending position
-      const ColBatch::Entry *align_end;
-      if (d_step > 0) {
-        align_end = begin + (end - begin) / kBuffer * kBuffer;
-      } else {
-        align_end = begin - (begin - end) / kBuffer * kBuffer;
-      }
-      int i;
-      const ColBatch::Entry *it;
-      const int align_step = d_step * kBuffer;
-      // internal cached loop
-      for (it = begin; it != align_end; it += align_step) {
-        const ColBatch::Entry *p;
-        for (i = 0, p = it; i < kBuffer; ++i, p += d_step) {
-          buf_position[i] = position[p->index];
-          buf_gpair[i] = gpair[p->index];
-        }
-        for (i = 0, p = it; i < kBuffer; ++i, p += d_step) {
-          const int nid = buf_position[i];
-          if (nid < 0) continue;
-          this->UpdateEnumeration(nid, buf_gpair[i],
-                                  p->fvalue, d_step,
-                                  fid, c, temp);
-        }
-      }
-      // finish up the ending piece
-      for (it = align_end, i = 0; it != end; ++i, it += d_step) {
-        buf_position[i] = position[it->index];
-        buf_gpair[i] = gpair[it->index];
-      }
-      for (it = align_end, i = 0; it != end; ++i, it += d_step) {
-        const int nid = buf_position[i];
-        if (nid < 0) continue;
-        this->UpdateEnumeration(nid, buf_gpair[i],
-                                it->fvalue, d_step,
-                                fid, c, temp);
-      }
-      // finish updating all statistics, check if it is possible to include all sum statistics
-      for (size_t i = 0; i < qexpand.size(); ++i) {
-        const int nid = qexpand[i];
-        ThreadEntry &e = temp[nid];
-        c.SetSubstract(snode[nid].stats, e.stats);
-        if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) {
-          bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) +
-                                                      c.CalcGain(param) - snode[nid].root_gain);
-          const float gap = std::abs(e.last_fvalue) + rt_eps;
-          const float delta = d_step == +1 ? gap: -gap;
-          e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1);
-        }
-      }
-    }
-
-    // enumerate the split values of specific feature
-    inline void EnumerateSplit(const ColBatch::Entry *begin,
-                               const ColBatch::Entry *end,
-                               int d_step,
-                               bst_uint fid,
-                               const std::vector<bst_gpair> &gpair,
-                               const BoosterInfo &info,
-                               std::vector<ThreadEntry> &temp) { // NOLINT(*)
-      // use cacheline aware optimization
-      if (TStats::kSimpleStats != 0 && param.cache_opt != 0) {
-        EnumerateSplitCacheOpt(begin, end, d_step, fid, gpair, temp);
-        return;
-      }
-      const std::vector<int> &qexpand = qexpand_;
-      // clear all the temp statistics
-      for (size_t j = 0; j < qexpand.size(); ++j) {
-        temp[qexpand[j]].stats.Clear();
-      }
-      // left statistics
-      TStats c(param);
-      for (const ColBatch::Entry *it = begin; it != end; it += d_step) {
-        const bst_uint ridx = it->index;
-        const int nid = position[ridx];
-        if (nid < 0) continue;
-        // start working
-        const float fvalue = it->fvalue;
-        // get the statistics of nid
-        ThreadEntry &e = temp[nid];
-        // test if first hit, this is fine, because we set 0 during init
-        if (e.stats.Empty()) {
-          e.stats.Add(gpair, info, ridx);
-          e.last_fvalue = fvalue;
-        } else {
-          // try to find a split
-          if (std::abs(fvalue - e.last_fvalue) > rt_2eps &&
-              e.stats.sum_hess >= param.min_child_weight) {
-            c.SetSubstract(snode[nid].stats, e.stats);
-            if (c.sum_hess >= param.min_child_weight) {
-              bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) +
-                                                          c.CalcGain(param) - snode[nid].root_gain);
-              e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, d_step == -1);
-            }
-          }
-          // update the statistics
-          e.stats.Add(gpair, info, ridx);
-          e.last_fvalue = fvalue;
-        }
-      }
-      // finish updating all statistics, check if it is possible to include all sum statistics
-      for (size_t i = 0; i < qexpand.size(); ++i) {
-        const int nid = qexpand[i];
-        ThreadEntry &e = temp[nid];
-        c.SetSubstract(snode[nid].stats, e.stats);
-        if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) {
-          bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) +
-                                                      c.CalcGain(param) - snode[nid].root_gain);
-          const float gap = std::abs(e.last_fvalue) + rt_eps;
-          const float delta = d_step == +1 ? gap: -gap;
-          e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1);
-        }
-      }
-    }
-
-    // update the solution candidate
-    virtual void UpdateSolution(const ColBatch &batch,
-                                const std::vector<bst_gpair> &gpair,
-                                const IFMatrix &fmat,
-                                const BoosterInfo &info) {
-      // start enumeration
-      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
-      #if defined(_OPENMP)
-      const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
-      #endif
-      int poption = param.parallel_option;
-      if (poption == 2) {
-        poption = static_cast<int>(nsize) * 2 < nthread ? 1 : 0;
-      }
-      if (poption == 0) {
-        #pragma omp parallel for schedule(dynamic, batch_size)
-        for (bst_omp_uint i = 0; i < nsize; ++i) {
-          const bst_uint fid = batch.col_index[i];
-          const int tid = omp_get_thread_num();
-          const ColBatch::Inst c = batch[i];
-          const bool ind = c.length != 0 && c.data[0].fvalue == c.data[c.length - 1].fvalue;
-          if (param.need_forward_search(fmat.GetColDensity(fid), ind)) {
-            this->EnumerateSplit(c.data, c.data + c.length, +1,
-                                 fid, gpair, info, stemp[tid]);
-          }
-          if (param.need_backward_search(fmat.GetColDensity(fid), ind)) {
-            this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1,
-                                 fid, gpair, info, stemp[tid]);
-          }
-        }
-      } else {
-        for (bst_omp_uint i = 0; i < nsize; ++i) {
-          this->ParallelFindSplit(batch[i], batch.col_index[i],
-                                  fmat, gpair, info);
-        }
-      }
-    }
-    // find splits at current level, do split per level
-    inline void FindSplit(int depth,
-                          const std::vector<int> &qexpand,
-                          const std::vector<bst_gpair> &gpair,
-                          IFMatrix *p_fmat,
-                          const BoosterInfo &info,
-                          RegTree *p_tree) {
-      std::vector<bst_uint> feat_set = feat_index;
-      if (param.colsample_bylevel != 1.0f) {
-        random::Shuffle(feat_set);
-        unsigned n = static_cast<unsigned>(param.colsample_bylevel * feat_index.size());
-        utils::Check(n > 0, "colsample_bylevel is too small that no feature can be included");
-        feat_set.resize(n);
-      }
-      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(feat_set);
-      while (iter->Next()) {
-        this->UpdateSolution(iter->Value(), gpair, *p_fmat, info);
-      }
-      // after this each thread's stemp will get the best candidates, aggregate results
-      this->SyncBestSolution(qexpand);
-      // get the best result, we can synchronize the solution
-      for (size_t i = 0; i < qexpand.size(); ++i) {
-        const int nid = qexpand[i];
-        NodeEntry &e = snode[nid];
-        // now we know the solution in snode[nid], set split
-        if (e.best.loss_chg > rt_eps) {
-          p_tree->AddChilds(nid);
-          (*p_tree)[nid].set_split(e.best.split_index(), e.best.split_value, e.best.default_left());
-          // mark right child as 0, to indicate fresh leaf
-          (*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0);
-          (*p_tree)[(*p_tree)[nid].cright()].set_leaf(0.0f, 0);
-        } else {
-          (*p_tree)[nid].set_leaf(e.weight * param.learning_rate);
-        }
-      }
-    }
-    // reset position of each data points after split is created in the tree
-    inline void ResetPosition(const std::vector<int> &qexpand,
-                              IFMatrix *p_fmat, const RegTree &tree) {
-      // set the positions in the nondefault
-      this->SetNonDefaultPosition(qexpand, p_fmat, tree);
-      // set rest of instances to default position
-      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
-      // set default direct nodes to default
-      // for leaf nodes that are not fresh, mark then to ~nid,
-      // so that they are ignored in future statistics collection
-      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
-
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < ndata; ++i) {
-        const bst_uint ridx = rowset[i];
-        if (ridx >= position.size()) {
-          utils::Printf("ridx exceed bound\n");
-        }
-        const int nid = this->DecodePosition(ridx);
-        if (tree[nid].is_leaf()) {
-          // mark finish when it is not a fresh leaf
-          if (tree[nid].cright() == -1) {
-            position[ridx] = ~nid;
-          }
-        } else {
-          // push to default branch
-          if (tree[nid].default_left()) {
-            this->SetEncodePosition(ridx, tree[nid].cleft());
-          } else {
-            this->SetEncodePosition(ridx, tree[nid].cright());
-          }
-        }
-      }
-    }
-    // customization part
-    // synchronize the best solution of each node
-    virtual void SyncBestSolution(const std::vector<int> &qexpand) {
-      for (size_t i = 0; i < qexpand.size(); ++i) {
-        const int nid = qexpand[i];
-        NodeEntry &e = snode[nid];
-        for (int tid = 0; tid < this->nthread; ++tid) {
-          e.best.Update(stemp[tid][nid].best);
-        }
-      }
-    }
-    virtual void SetNonDefaultPosition(const std::vector<int> &qexpand,
-                                       IFMatrix *p_fmat, const RegTree &tree) {
-      // step 1, classify the non-default data into right places
-      std::vector<unsigned> fsplits;
-      for (size_t i = 0; i < qexpand.size(); ++i) {
-        const int nid = qexpand[i];
-        if (!tree[nid].is_leaf()) {
-          fsplits.push_back(tree[nid].split_index());
-        }
-      }
-      std::sort(fsplits.begin(), fsplits.end());
-      fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
-
-      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
-      while (iter->Next()) {
-        const ColBatch &batch = iter->Value();
-        for (size_t i = 0; i < batch.size; ++i) {
-          ColBatch::Inst col = batch[i];
-          const bst_uint fid = batch.col_index[i];
-          const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
-          #pragma omp parallel for schedule(static)
-          for (bst_omp_uint j = 0; j < ndata; ++j) {
-            const bst_uint ridx = col[j].index;
-            const int nid = this->DecodePosition(ridx);
-            const float fvalue = col[j].fvalue;
-            // go back to parent, correct those who are not default
-            if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
-              if (fvalue < tree[nid].split_cond()) {
-                this->SetEncodePosition(ridx, tree[nid].cleft());
-              } else {
-                this->SetEncodePosition(ridx, tree[nid].cright());
-              }
-            }
-          }
-        }
-      }
-    }
-    // utils to get/set position, with encoded format
-    // return decoded position
-    inline int DecodePosition(bst_uint ridx) const {
-      const int pid = position[ridx];
-      return pid < 0 ? ~pid : pid;
-    }
-    // encode the encoded position value for ridx
-    inline void SetEncodePosition(bst_uint ridx, int nid) {
-      if (position[ridx] < 0) {
-        position[ridx] = ~nid;
-      } else {
-        position[ridx] = nid;
-      }
-    }
-    //  --data fields--
-    const TrainParam &param;
-    // number of omp thread used during training
-    int nthread;
-    // Per feature: shuffle index of each feature index
-    std::vector<bst_uint> feat_index;
-    // Instance Data: current node position in the tree of each instance
-    std::vector<int> position;
-    // PerThread x PerTreeNode: statistics for per thread construction
-    std::vector< std::vector<ThreadEntry> > stemp;
-    /*! \brief TreeNode Data: statistics for each constructed node */
-    std::vector<NodeEntry> snode;
-    /*! \brief queue of nodes to be expanded */
-    std::vector<int> qexpand_;
-  };
-};
-
-}  // namespace tree
-}  // namespace xgboost
-#endif  // XGBOOST_TREE_UPDATER_COLMAKER_INL_HPP_
--- a/old_src/tree/updater_distcol-inl.hpp
+++ b/old_src/tree/updater_distcol-inl.hpp
@@ -1,175 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file updater_distcol-inl.hpp
- * \brief beta distributed version that takes a sub-column
- *        and construct a tree
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_
-#define XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_
-
-#include <vector>
-#include <algorithm>
-#include "../sync/sync.h"
-#include "../utils/bitmap.h"
-#include "../utils/io.h"
-#include "./updater_colmaker-inl.hpp"
-#include "./updater_prune-inl.hpp"
-
-namespace xgboost {
-namespace tree {
-template<typename TStats>
-class DistColMaker : public ColMaker<TStats> {
- public:
-  DistColMaker(void) : builder(param) {}
-  virtual ~DistColMaker(void) {}
-  // set training parameter
-  virtual void SetParam(const char *name, const char *val) {
-    param.SetParam(name, val);
-    pruner.SetParam(name, val);
-  }
-  virtual void Update(const std::vector<bst_gpair> &gpair,
-                      IFMatrix *p_fmat,
-                      const BoosterInfo &info,
-                      const std::vector<RegTree*> &trees) {
-    TStats::CheckInfo(info);
-    utils::Check(trees.size() == 1, "DistColMaker: only support one tree at a time");
-    // build the tree
-    builder.Update(gpair, p_fmat, info, trees[0]);
-    //// prune the tree, note that pruner will sync the tree
-    pruner.Update(gpair, p_fmat, info, trees);
-    // update position after the tree is pruned
-    builder.UpdatePosition(p_fmat, *trees[0]);
-  }
-  virtual const int* GetLeafPosition(void) const {
-    return builder.GetLeafPosition();
-  }
-
- private:
-  struct Builder : public ColMaker<TStats>::Builder {
-   public:
-    explicit Builder(const TrainParam &param)
-        : ColMaker<TStats>::Builder(param) {
-    }
-    inline void UpdatePosition(IFMatrix *p_fmat, const RegTree &tree) {
-      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
-      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < ndata; ++i) {
-        const bst_uint ridx = rowset[i];
-        int nid = this->DecodePosition(ridx);
-        while (tree[nid].is_deleted()) {
-          nid = tree[nid].parent();
-          utils::Assert(nid >=0, "distributed learning error");
-        }
-        this->position[ridx] = nid;
-      }
-    }
-    virtual const int* GetLeafPosition(void) const {
-      return BeginPtr(this->position);
-    }
-
-   protected:
-    virtual void SetNonDefaultPosition(const std::vector<int> &qexpand,
-                                       IFMatrix *p_fmat, const RegTree &tree) {
-      // step 2, classify the non-default data into right places
-      std::vector<unsigned> fsplits;
-      for (size_t i = 0; i < qexpand.size(); ++i) {
-        const int nid = qexpand[i];
-        if (!tree[nid].is_leaf()) {
-          fsplits.push_back(tree[nid].split_index());
-        }
-      }
-      // get the candidate split index
-      std::sort(fsplits.begin(), fsplits.end());
-      fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
-      while (fsplits.size() != 0 && fsplits.back() >= p_fmat->NumCol()) {
-        fsplits.pop_back();
-      }
-      // bitmap is only word concurrent, set to bool first
-      {
-        bst_omp_uint ndata = static_cast<bst_omp_uint>(this->position.size());
-        boolmap.resize(ndata);
-        #pragma omp parallel for schedule(static)
-        for (bst_omp_uint j = 0; j < ndata; ++j) {
-            boolmap[j] = 0;
-        }
-      }
-      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
-      while (iter->Next()) {
-        const ColBatch &batch = iter->Value();
-        for (size_t i = 0; i < batch.size; ++i) {
-          ColBatch::Inst col = batch[i];
-          const bst_uint fid = batch.col_index[i];
-          const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
-          #pragma omp parallel for schedule(static)
-          for (bst_omp_uint j = 0; j < ndata; ++j) {
-            const bst_uint ridx = col[j].index;
-            const float fvalue = col[j].fvalue;
-            const int nid = this->DecodePosition(ridx);
-            if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
-              if (fvalue < tree[nid].split_cond()) {
-                if (!tree[nid].default_left()) boolmap[ridx] = 1;
-              } else {
-                if (tree[nid].default_left()) boolmap[ridx] = 1;
-              }
-            }
-          }
-        }
-      }
-
-      bitmap.InitFromBool(boolmap);
-      // communicate bitmap
-      rabit::Allreduce<rabit::op::BitOR>(BeginPtr(bitmap.data), bitmap.data.size());
-      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
-      // get the new position
-      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < ndata; ++i) {
-        const bst_uint ridx = rowset[i];
-        const int nid = this->DecodePosition(ridx);
-        if (bitmap.Get(ridx)) {
-          utils::Assert(!tree[nid].is_leaf(), "inconsistent reduce information");
-          if (tree[nid].default_left()) {
-            this->SetEncodePosition(ridx, tree[nid].cright());
-          } else {
-            this->SetEncodePosition(ridx, tree[nid].cleft());
-          }
-        }
-      }
-    }
-    // synchronize the best solution of each node
-    virtual void SyncBestSolution(const std::vector<int> &qexpand) {
-      std::vector<SplitEntry> vec;
-      for (size_t i = 0; i < qexpand.size(); ++i) {
-        const int nid = qexpand[i];
-        for (int tid = 0; tid < this->nthread; ++tid) {
-          this->snode[nid].best.Update(this->stemp[tid][nid].best);
-        }
-        vec.push_back(this->snode[nid].best);
-      }
-      // TODO(tqchen) lazy version
-      // communicate best solution
-      reducer.Allreduce(BeginPtr(vec), vec.size());
-      // assign solution back
-      for (size_t i = 0; i < qexpand.size(); ++i) {
-        const int nid = qexpand[i];
-        this->snode[nid].best = vec[i];
-      }
-    }
-
-   private:
-    utils::BitMap bitmap;
-    std::vector<int> boolmap;
-    rabit::Reducer<SplitEntry, SplitEntry::Reduce> reducer;
-  };
-  // we directly introduce pruner here
-  TreePruner pruner;
-  // training parameter
-  TrainParam param;
-  // pointer to the builder
-  Builder builder;
-};
-}  // namespace tree
-}  // namespace xgboost
-#endif  // XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_