[REFACTOR] cleanup structure

2015-11-24 14:25:56 -08:00
parent 5ed4dc4f60
commit d530e0c14f
60 changed files with 42 additions and 51 deletions
--- a/src/README.md
+++ b/src/README.md
@@ -1,26 +0,0 @@
-Coding Guide
-======
-This file is intended to be notes about code structure in xgboost
-
-Project Logical Layout
-=======
-* Dependency order: io->learner->gbm->tree
-  - All modules depend on data.h
-* tree contains the implementations of tree construction algorithms.
-* gbm is the gradient boosting interface, which takes trees and other base learners to do boosting.
-  - gbm only takes gradients as sufficient statistics; it does not compute the gradients.
-* learner is the learning module that computes gradients for a specific objective and passes them to gbm
-
-File Naming Convention
-======= 
-* .h files contain the data structures and interfaces needed to use the functions in that layer.
-* -inl.hpp files are implementations of the interfaces, like .cpp files in most projects.
-  - You only need to understand the interface file to understand the usage of that layer
-* In each folder, there can be a .cpp file that compiles the module of that layer
-
-How to Hack the Code
-======
-* Add objective function: add to learner/objective-inl.hpp and register it in learner/objective.h ```CreateObjFunction``` 
-  - You can also directly do it in python
-* Add new evaluation metric: add to learner/evaluation-inl.hpp and register it in learner/evaluation.h ```CreateEvaluator``` 
-* Add a wrapper for a new language: most likely you can do it by taking the functions in python/xgboost_wrapper.h, which is purely C based, and calling these C functions to use xgboost
--- a/src/c_api.cc
+++ b/src/c_api.cc
@@ -0,0 +1,2 @@
+#include <xgboost/c_api.h>
+
--- a/src/data.h
+++ b/src/data.h
@@ -1,166 +0,0 @@
-/*!
- * Copyright (c) 2014 by Contributors
- * \file data.h
- * \brief the input data structure for gradient boosting
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_DATA_H_
-#define XGBOOST_DATA_H_
-
-#include <cstdio>
-#include <vector>
-#include "utils/utils.h"
-#include "utils/iterator.h"
-
-namespace xgboost {
-/*!
- * \brief unsigned integer type used in boost,
- *        used for feature index and row index
- */
-typedef unsigned bst_uint;
-/*! \brief float type, used for storing statistics */
-typedef float bst_float;
-const float rt_eps = 1e-5f;
-// min gap between feature values to allow a split happen
-const float rt_2eps = rt_eps * 2.0f;
-
-/*! \brief gradient statistics pair usually needed in gradient boosting */
-struct bst_gpair {
-  /*! \brief first order gradient statistics */
-  bst_float grad;
-  /*! \brief second order gradient statistics */
-  bst_float hess;
-  bst_gpair(void) {}  // default constructor: grad and hess are left uninitialized
-  bst_gpair(bst_float grad, bst_float hess) : grad(grad), hess(hess) {}
-};
-
-/*!
- * \brief extra information that might be needed by gbm and tree module
- * this information is not necessarily present, and can be empty
- */
-struct BoosterInfo {
-  /*! \brief number of rows in the data */
-  size_t num_row;
-  /*! \brief number of columns in the data */
-  size_t num_col;
-  /*!
-   * \brief specified root index of each instance,
-   *  can be used for multi task setting
-   */
-  std::vector<unsigned> root_index;
-  /*! \brief set fold indicator */
-  std::vector<unsigned> fold_index;
-  /*! \brief default constructor, starts with zero rows and zero columns */
-  BoosterInfo(void) : num_row(0), num_col(0) {
-  }
-  /*! \brief get root of i-th instance, returns 0 when root_index is empty */
-  inline unsigned GetRoot(size_t i) const {
-    return root_index.size() == 0 ? 0 : root_index[i];
-  }
-};
-
-/*! \brief read-only sparse instance batch in CSR format */
-struct SparseBatch {
-  /*! \brief an entry of sparse vector */
-  struct Entry {
-    /*! \brief feature index */
-    bst_uint index;
-    /*! \brief feature value */
-    bst_float fvalue;
-    // default constructor, leaves index and fvalue uninitialized
-    Entry(void) {}
-    Entry(bst_uint index, bst_float fvalue) : index(index), fvalue(fvalue) {}
-    /*! \brief compare two entries by feature value, true when a.fvalue < b.fvalue */
-    inline static bool CmpValue(const Entry &a, const Entry &b) {
-      return a.fvalue < b.fvalue;
-    }
-  };
-  /*! \brief an instance of sparse vector in the batch */
-  struct Inst {
-    /*! \brief pointer to the elements, not owned by the Inst */
-    const Entry *data;
-    /*! \brief length of the instance */
-    bst_uint length;
-    /*! \brief constructor */
-    Inst(const Entry *data, bst_uint length) : data(data), length(length) {}
-    /*! \brief get i-th pair in the sparse vector, i is not checked against length */
-    inline const Entry& operator[](size_t i) const {
-      return data[i];
-    }
-  };
-  /*! \brief batch size */
-  size_t size;
-};
-/*! \brief read-only row batch, used to access row continuously */
-struct RowBatch : public SparseBatch {
-  /*! \brief the offset of rowid of this batch */
-  size_t base_rowid;
-  /*! \brief array[size+1], row i spans entries ind_ptr[i] .. ind_ptr[i+1]-1 of data_ptr */
-  const size_t *ind_ptr;
-  /*! \brief array[ind_ptr[size]], content of the sparse elements of all rows */
-  const Entry *data_ptr;
-  /*! \brief get i-th row from the batch, i is batch-local (base_rowid is not applied) */
-  inline Inst operator[](size_t i) const {
-    return Inst(data_ptr + ind_ptr[i], static_cast<bst_uint>(ind_ptr[i+1] - ind_ptr[i]));
-  }
-};
-/*!
- * \brief read-only column batch, used to access columns,
- * the columns are not required to be continuous
- */
-struct ColBatch : public SparseBatch {
-  /*! \brief column index of each columns in the data */
-  const bst_uint *col_index;
-  /*! \brief pointer to the column data */
-  const Inst *col_data;
-  /*! \brief get i-th column from the batch */
-  inline Inst operator[](size_t i) const {
-    return col_data[i];
-  }
-};
-/**
- * \brief interface of feature matrix, needed for tree construction
- *  this interface defines two ways to access features:
- *   row access is defined by iterator of RowBatch
- *   col access is optional, checked by HaveColAccess, and defined by iterator of ColBatch
- */
-class IFMatrix {
- public:
-  // the interface only need to guarantee row iter
-  // column iter is active, when ColIterator is called, row_iter can be disabled
-  /*! \brief get the row iterator associated with FMatrix */
-  virtual utils::IIterator<RowBatch> *RowIterator(void) = 0;
-  /*!\brief get column iterator */
-  virtual utils::IIterator<ColBatch> *ColIterator(void) = 0;
-  /*!
-   * \brief get the column iterator associated with FMatrix with subset of column features
-   * \param fset is the list of column index set that must be contained in the returning Column iterator
-   * \return the column iterator, initialized so that it reads the elements in fset
-   */
-  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) = 0;
-  /*!
-   * \brief check if column access is supported, if not, initialize column access
-   * \param enabled whether certain feature should be included in column access
-   * \param subsample subsample ratio when generating column access
-   * \param max_row_perbatch auxiliary information, maximum row used in each column batch
-   *         this is a hint information that can be ignored by the implementation
-   */
-  virtual void InitColAccess(const std::vector<bool> &enabled,
-                             float subsample,
-                             size_t max_row_perbatch) = 0;
-  // the following are column meta data, should be able to answer them fast
-  /*! \return whether column access is enabled */
-  virtual bool HaveColAccess(void) const = 0;
-  /*! \return number of columns in the FMatrix */
-  virtual size_t NumCol(void) const = 0;
-  /*! \brief get number of non-missing entries in column */
-  virtual size_t GetColSize(size_t cidx) const = 0;
-  /*! \brief get column density */
-  virtual float GetColDensity(size_t cidx) const = 0;
-  /*! \brief reference of buffered rowset */
-  virtual const std::vector<bst_uint> &buffered_rowset(void) const = 0;
-  // virtual destructor
-  virtual ~IFMatrix(void){}
-};
-}  // namespace xgboost
-#endif  // XGBOOST_DATA_H_
--- a/src/gbm/gblinear-inl.hpp
+++ b/src/gbm/gblinear-inl.hpp
@@ -1,292 +0,0 @@
-/*!
- * Copyright by Contributors
- * \file gblinear-inl.hpp
- * \brief Implementation of Linear booster, with L1/L2 regularization: Elastic Net
- *        the update rule is parallel coordinate descent (shotgun)
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_GBM_GBLINEAR_INL_HPP_
-#define XGBOOST_GBM_GBLINEAR_INL_HPP_
-
-#include <vector>
-#include <string>
-#include <sstream>
-#include <algorithm>
-#include "./gbm.h"
-#include "../tree/updater.h"
-
-namespace xgboost {
-namespace gbm {
-/*!
- * \brief gradient boosted linear model
- * \tparam FMatrix the data type updater taking
- */
-class GBLinear : public IGradBooster {
- public:
-  virtual ~GBLinear(void) {
-  }
-  // set model parameters
-  virtual void SetParam(const char *name, const char *val) {
-    using namespace std;
-    if (!strncmp(name, "bst:", 4)) {
-      param.SetParam(name + 4, val);
-    }
-    if (model.weight.size() == 0) {
-      model.param.SetParam(name, val);
-    }
-  }
-  virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) { // NOLINT(*)
-    model.LoadModel(fi);
-  }
-  virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const { // NOLINT(*)
-    model.SaveModel(fo);
-  }
-  virtual void InitModel(void) {
-    model.InitModel();
-  }
-  virtual void DoBoost(IFMatrix *p_fmat,
-                       int64_t buffer_offset,
-                       const BoosterInfo &info,
-                       std::vector<bst_gpair> *in_gpair) {
-    std::vector<bst_gpair> &gpair = *in_gpair;
-    const int ngroup = model.param.num_output_group;
-    const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
-    // for all the output group
-    for (int gid = 0; gid < ngroup; ++gid) {
-      double sum_grad = 0.0, sum_hess = 0.0;
-      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
-      #pragma omp parallel for schedule(static) reduction(+: sum_grad, sum_hess)
-      for (bst_omp_uint i = 0; i < ndata; ++i) {
-        bst_gpair &p = gpair[rowset[i] * ngroup + gid];
-        if (p.hess >= 0.0f) {
-          sum_grad += p.grad; sum_hess += p.hess;
-        }
-      }
-      // remove bias effect
-      bst_float dw = static_cast<bst_float>(
-          param.learning_rate * param.CalcDeltaBias(sum_grad, sum_hess, model.bias()[gid]));
-      model.bias()[gid] += dw;
-      // update grad value
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < ndata; ++i) {
-        bst_gpair &p = gpair[rowset[i] * ngroup + gid];
-        if (p.hess >= 0.0f) {
-          p.grad += p.hess * dw;
-        }
-      }
-    }
-    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
-    while (iter->Next()) {
-      // number of features
-      const ColBatch &batch = iter->Value();
-      const bst_omp_uint nfeat = static_cast<bst_omp_uint>(batch.size);
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < nfeat; ++i) {
-        const bst_uint fid = batch.col_index[i];
-        ColBatch::Inst col = batch[i];
-        for (int gid = 0; gid < ngroup; ++gid) {
-          double sum_grad = 0.0, sum_hess = 0.0;
-          for (bst_uint j = 0; j < col.length; ++j) {
-            const float v = col[j].fvalue;
-            bst_gpair &p = gpair[col[j].index * ngroup + gid];
-            if (p.hess < 0.0f) continue;
-            sum_grad += p.grad * v;
-            sum_hess += p.hess * v * v;
-          }
-          float &w = model[fid][gid];
-          bst_float dw = static_cast<bst_float>(param.learning_rate *
-                                                param.CalcDelta(sum_grad, sum_hess, w));
-          w += dw;
-          // update grad value
-          for (bst_uint j = 0; j < col.length; ++j) {
-            bst_gpair &p = gpair[col[j].index * ngroup + gid];
-            if (p.hess < 0.0f) continue;
-            p.grad += p.hess * col[j].fvalue * dw;
-          }
-        }
-      }
-    }
-  }
-
-  /*! \brief batch prediction: *out_preds receives one score per (row, group),
-   *  row-major; ntree_limit must be 0, buffer_offset and info are unused */
-  virtual void Predict(IFMatrix *p_fmat,
-                       int64_t buffer_offset,
-                       const BoosterInfo &info,
-                       std::vector<float> *out_preds,
-                       unsigned ntree_limit = 0) {
-    utils::Check(ntree_limit == 0,
-                 "GBLinear::Predict ntrees is only valid for gbtree predictor");
-    std::vector<float> &preds = *out_preds;
-    preds.resize(0);
-    // start collecting the prediction
-    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
-    const int ngroup = model.param.num_output_group;
-    while (iter->Next()) {
-      const RowBatch &batch = iter->Value();
-      utils::Assert(batch.base_rowid * ngroup == preds.size(),
-                    "base_rowid is not set correctly");
-      // output convention: nrow * k, where nrow is number of rows
-      // k is number of group
-      preds.resize(preds.size() + batch.size * ngroup);
-      // parallel over local batch
-      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < nsize; ++i) {
-        const size_t ridx = batch.base_rowid + i;
-        // Pred() fills every output group of the row, so call it once per row
-        if (ngroup > 0) this->Pred(batch[i], &preds[ridx * ngroup]);
-      }
-    }
-  }
-  /*! \brief single-instance prediction; *out_preds is not resized here, so it
-   *  must already hold num_output_group entries (ntree_limit/root_index unused) */
-  virtual void Predict(const SparseBatch::Inst &inst,
-                       std::vector<float> *out_preds,
-                       unsigned ntree_limit,
-                       unsigned root_index) {
-    // Pred() already writes every output group, so a single call is enough
-    this->Pred(inst, BeginPtr(*out_preds));
-  }
-  virtual void PredictLeaf(IFMatrix *p_fmat,
-                           const BoosterInfo &info,
-                           std::vector<float> *out_preds,
-                           unsigned ntree_limit = 0) {
-    utils::Error("gblinear does not support predict leaf index");
-  }
-  /*! \brief dump bias and weights as one text blob; fmap and option are unused */
-  virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
-    const int ngroup = model.param.num_output_group;
-    std::stringstream fo("");
-    fo << "bias:\n";
-    for (int gid = 0; gid < ngroup; ++gid) {
-      fo << model.bias()[gid] << std::endl;
-    }
-    fo << "weight:\n";
-    for (int gid = 0; gid < ngroup; ++gid) {
-      for (unsigned fid = 0; fid < model.param.num_feature; ++fid) {
-        fo << model[fid][gid] << std::endl;  // model[] is indexed by feature, then group
-      }
-    }
-    return std::vector<std::string>(1, fo.str());
-  }
-
- protected:
-  inline void Pred(const RowBatch::Inst &inst, float *preds) {  // fills preds[gid] for every group
-    for (int gid = 0; gid < model.param.num_output_group; ++gid) {
-      float psum = model.bias()[gid];  // start from the bias of this group
-      for (bst_uint i = 0; i < inst.length; ++i) {
-        if (inst[i].index >= model.param.num_feature) continue;  // no weight for this feature
-        psum += inst[i].fvalue * model[inst[i].index][gid];
-      }
-      preds[gid] = psum;
-    }
-  }
-  // training parameter
-  struct ParamTrain {
-    /*! \brief learning_rate */
-    float learning_rate;
-    /*! \brief regularization weight for L2 norm */
-    float reg_lambda;
-    /*! \brief regularization weight for L1 norm */
-    float reg_alpha;
-    /*! \brief regularization weight for L2 norm in bias */
-    float reg_lambda_bias;
-    // parameter
-    ParamTrain(void) {
-      reg_alpha = 0.0f;
-      reg_lambda = 0.0f;
-      reg_lambda_bias = 0.0f;
-      learning_rate = 1.0f;
-    }
-    inline void SetParam(const char *name, const char *val) {
-      using namespace std;
-      // sync-names
-      if (!strcmp("eta", name)) learning_rate = static_cast<float>(atof(val));
-      if (!strcmp("lambda", name)) reg_lambda = static_cast<float>(atof(val));
-      if (!strcmp( "alpha", name)) reg_alpha = static_cast<float>(atof(val));
-      if (!strcmp( "lambda_bias", name)) reg_lambda_bias = static_cast<float>(atof(val));
-      // real names
-      if (!strcmp( "learning_rate", name)) learning_rate = static_cast<float>(atof(val));
-      if (!strcmp( "reg_lambda", name)) reg_lambda = static_cast<float>(atof(val));
-      if (!strcmp( "reg_alpha", name)) reg_alpha = static_cast<float>(atof(val));
-      if (!strcmp( "reg_lambda_bias", name)) reg_lambda_bias = static_cast<float>(atof(val));
-    }
-    // given original weight calculate delta
-    inline double CalcDelta(double sum_grad, double sum_hess, double w) {
-      if (sum_hess < 1e-5f) return 0.0f;
-      double tmp = w - (sum_grad + reg_lambda * w) / (sum_hess + reg_lambda);
-      if (tmp >=0) {
-        return std::max(-(sum_grad + reg_lambda * w + reg_alpha) / (sum_hess + reg_lambda), -w);
-      } else {
-        return std::min(-(sum_grad + reg_lambda * w - reg_alpha) / (sum_hess + reg_lambda), -w);
-      }
-    }
-    // given original weight calculate delta bias
-    inline double CalcDeltaBias(double sum_grad, double sum_hess, double w) {
-      return - (sum_grad + reg_lambda_bias * w) / (sum_hess + reg_lambda_bias);
-    }
-  };
-  // model for linear booster
-  class Model {
-   public:
-    // model parameter
-    struct Param {
-      // number of feature dimension
-      unsigned num_feature;
-      // number of output group
-      int num_output_group;
-      // reserved field
-      int reserved[32];
-      // constructor
-      Param(void) {
-        num_feature = 0;
-        num_output_group = 1;
-        std::memset(reserved, 0, sizeof(reserved));
-      }
-      inline void SetParam(const char *name, const char *val) {
-        using namespace std;
-        if (!strcmp(name, "bst:num_feature")) num_feature = static_cast<unsigned>(atoi(val));
-        if (!strcmp(name, "num_output_group")) num_output_group = atoi(val);
-      }
-    };
-    // parameter
-    Param param;
-    // weight for each of feature, bias is the last one
-    std::vector<float> weight;
-    // initialize the model parameter
-    inline void InitModel(void) {
-      // bias is the last weight
-      weight.resize((param.num_feature + 1) * param.num_output_group);
-      std::fill(weight.begin(), weight.end(), 0.0f);
-    }
-    // save the model to file
-    inline void SaveModel(utils::IStream &fo) const { // NOLINT(*)
-      fo.Write(&param, sizeof(Param));
-      fo.Write(weight);
-    }
-    // load model from file
-    inline void LoadModel(utils::IStream &fi) { // NOLINT(*)
-      utils::Assert(fi.Read(&param, sizeof(Param)) != 0, "Load LinearBooster");
-      fi.Read(&weight);
-    }
-    // model bias
-    inline float* bias(void) {
-      return &weight[param.num_feature * param.num_output_group];
-    }
-    // get i-th weight
-    inline float* operator[](size_t i) {
-      return &weight[i * param.num_output_group];
-    }
-  };
-  // model field
-  Model model;
-  // training parameter
-  ParamTrain param;
-  // Per feature: shuffle index of each feature index
-  std::vector<bst_uint> feat_index;
-};
-
-}  // namespace gbm
-}  // namespace xgboost
-#endif  // XGBOOST_GBM_GBLINEAR_INL_HPP_
--- a/src/gbm/gbm.cpp
+++ b/src/gbm/gbm.cpp
@@ -1,21 +0,0 @@
-// Copyright by Contributors
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#define NOMINMAX
-#include <cstring>
-#include "./gbm.h"
-#include "./gbtree-inl.hpp"
-#include "./gblinear-inl.hpp"
-
-namespace xgboost {
-namespace gbm {
-IGradBooster* CreateGradBooster(const char *name) {
-  using namespace std;
-  if (!strcmp("gbtree", name)) return new GBTree();
-  if (!strcmp("gblinear", name)) return new GBLinear();
-  utils::Error("unknown booster type: %s", name);
-  return NULL;
-}
-}  // namespace gbm
-}  // namespace xgboost
-
--- a/src/gbm/gbm.h
+++ b/src/gbm/gbm.h
@@ -1,136 +0,0 @@
-/*!
- * Copyright by Contributors
- * \file gbm.h
- * \brief interface of gradient booster, that learns through gradient statistics
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_GBM_GBM_H_
-#define XGBOOST_GBM_GBM_H_
-
-#include <vector>
-#include <string>
-#include "../data.h"
-#include "../utils/io.h"
-#include "../utils/fmap.h"
-
-namespace xgboost {
-/*! \brief namespace for gradient booster */
-namespace gbm {
-/*!
- * \brief interface of gradient boosting model
- */
-class IGradBooster {
- public:
-  /*!
-   * \brief set parameters from outside
-   * \param name name of the parameter
-   * \param val  value of the parameter
-   */
-  virtual void SetParam(const char *name, const char *val) = 0;
-  /*!
-   * \brief load model from stream
-   * \param fi input stream
-   * \param with_pbuffer whether the incoming data contains pbuffer
-   */
-  virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) = 0; // NOLINT(*)
-  /*!
-   * \brief save model to stream
-   * \param fo output stream
-   * \param with_pbuffer whether save out pbuffer
-   */
-  virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const = 0; // NOLINT(*)
-  /*!
-   * \brief initialize the model
-   */
-  virtual void InitModel(void) = 0;
-  /*!
-   * \brief reset the predict buffer
-   * this will invalidate all the previous cached results
-   * and recalculate from scratch
-   */
-  virtual void ResetPredBuffer(size_t num_pbuffer) {}
-  /*!
-   * \brief whether the model allow lazy checkpoint
-   * return true if model is only updated in DoBoost
-   * after all Allreduce calls
-   */
-  virtual bool AllowLazyCheckPoint(void) const {
-    return false;
-  }
-  /*!
-   * \brief perform update to the model(boosting)
-   * \param p_fmat feature matrix that provide access to features
-   * \param buffer_offset buffer index offset of these instances, if equals -1
-   *        this means we do not have buffer index allocated to the gbm
-   * \param info meta information about training
-   * \param in_gpair address of the gradient pair statistics of the data
-   * the booster may change content of gpair
-   */
-  virtual void DoBoost(IFMatrix *p_fmat,
-                       int64_t buffer_offset,
-                       const BoosterInfo &info,
-                       std::vector<bst_gpair> *in_gpair) = 0;
-  /*!
-   * \brief generate predictions for given feature matrix
-   * \param p_fmat feature matrix
-   * \param buffer_offset buffer index offset of these instances, if equals -1
-   *        this means we do not have buffer index allocated to the gbm
-   *  a buffer index is assigned to each instance that requires repeated prediction
-   *  the size of buffer is set by convention using IGradBooster.SetParam("num_pbuffer","size")
-   * \param info extra side information that may be needed for prediction
-   * \param out_preds output vector to hold the predictions
-   * \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means
-   *    we do not limit number of trees, this parameter is only valid for gbtree, but not for gblinear
-   */
-  virtual void Predict(IFMatrix *p_fmat,
-                       int64_t buffer_offset,
-                       const BoosterInfo &info,
-                       std::vector<float> *out_preds,
-                       unsigned ntree_limit = 0) = 0;
-  /*!
-   * \brief online prediction function, predict score for one instance at a time
-   *  NOTE: use the batch prediction interface if possible, batch prediction is usually
-   *        more efficient than online prediction
-   *        This function is NOT threadsafe, make sure you only call from one thread
-   *
-   * \param inst the instance you want to predict
-   * \param out_preds output vector to hold the predictions
-   * \param ntree_limit limit the number of trees used in prediction
-   * \param root_index the root index
-   * \sa Predict
-   */
-  virtual void Predict(const SparseBatch::Inst &inst,
-                       std::vector<float> *out_preds,
-                       unsigned ntree_limit = 0,
-                       unsigned root_index = 0)  = 0;
-  /*!
-   * \brief predict the leaf index of each tree, the output will be nsample * ntree vector
-   *        this is only valid in gbtree predictor
-   * \param p_fmat feature matrix
-   * \param info extra side information that may be needed for prediction
-   * \param out_preds output vector to hold the predictions
-   * \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means
-   *    we do not limit number of trees, this parameter is only valid for gbtree, but not for gblinear
-   */
-  virtual void PredictLeaf(IFMatrix *p_fmat,
-                           const BoosterInfo &info,
-                           std::vector<float> *out_preds,
-                           unsigned ntree_limit = 0) = 0;
-  /*!
-   * \brief dump the model in text format
-   * \param fmap feature map that may help give interpretations of feature
-   * \param option extra option of the dump model
-   * \return a vector of dump for boosters
-   */
-  virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) = 0;
-  // destructor
-  virtual ~IGradBooster(void){}
-};
-/*!
- * \brief create a gradient booster from given name
- * \param name name of gradient booster
- */
-IGradBooster* CreateGradBooster(const char *name);
-}  // namespace gbm
-}  // namespace xgboost
-#endif  // XGBOOST_GBM_GBM_H_
--- a/src/gbm/gbtree-inl.hpp
+++ b/src/gbm/gbtree-inl.hpp
@@ -1,520 +0,0 @@
-/*!
- * Copyright by Contributors
- * \file gbtree-inl.hpp
- * \brief gradient boosted tree implementation
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_GBM_GBTREE_INL_HPP_
-#define XGBOOST_GBM_GBTREE_INL_HPP_
-
-#include <vector>
-#include <utility>
-#include <string>
-#include <limits>
-#include "./gbm.h"
-#include "../utils/omp.h"
-#include "../tree/updater.h"
-
-namespace xgboost {
-namespace gbm {
-/*!
- * \brief gradient boosted tree
- */
-class GBTree : public IGradBooster {
- public:
-  GBTree(void) {
-  }
-  virtual ~GBTree(void) {
-    this->Clear();
-  }
-  virtual void SetParam(const char *name, const char *val) {
-    using namespace std;
-    if (!strncmp(name, "bst:", 4)) {
-      cfg.push_back(std::make_pair(std::string(name+4), std::string(val)));
-      // set into updaters, if already initialized
-      for (size_t i = 0; i < updaters.size(); ++i) {
-        updaters[i]->SetParam(name+4, val);
-      }
-    }
-    if (!strcmp(name, "silent")) {
-      this->SetParam("bst:silent", val);
-    }
-    tparam.SetParam(name, val);
-    if (trees.size() == 0) mparam.SetParam(name, val);
-  }
-  virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) { // NOLINT(*)
-    this->Clear();
-    utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
-                 "GBTree: invalid model file");
-    trees.resize(mparam.num_trees);
-    for (size_t i = 0; i < trees.size(); ++i) {
-      trees[i] = new tree::RegTree();
-      trees[i]->LoadModel(fi);
-    }
-    tree_info.resize(mparam.num_trees);
-    if (mparam.num_trees != 0) {
-      utils::Check(fi.Read(&tree_info[0], sizeof(int) * mparam.num_trees) != 0,
-                   "GBTree: invalid model file");
-    }
-    if (mparam.num_pbuffer != 0 && with_pbuffer) {
-      pred_buffer.resize(mparam.PredBufferSize());
-      pred_counter.resize(mparam.PredBufferSize());
-      utils::Check(fi.Read(&pred_buffer[0], pred_buffer.size() * sizeof(float)) != 0,
-                   "GBTree: invalid model file");
-      utils::Check(fi.Read(&pred_counter[0], pred_counter.size() * sizeof(unsigned)) != 0,
-                   "GBTree: invalid model file");
-    }
-  }
-  virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const { // NOLINT(*)
-    utils::Assert(mparam.num_trees == static_cast<int>(trees.size()), "GBTree");
-    if (with_pbuffer) {
-      fo.Write(&mparam, sizeof(ModelParam));
-    } else {
-      ModelParam p = mparam;
-      p.num_pbuffer = 0;
-      fo.Write(&p, sizeof(ModelParam));
-    }
-    for (size_t i = 0; i < trees.size(); ++i) {
-      trees[i]->SaveModel(fo);
-    }
-    if (tree_info.size() != 0) {
-      fo.Write(BeginPtr(tree_info), sizeof(int) * tree_info.size());
-    }
-    if (mparam.num_pbuffer != 0 && with_pbuffer) {
-      fo.Write(BeginPtr(pred_buffer), pred_buffer.size() * sizeof(float));
-      fo.Write(BeginPtr(pred_counter), pred_counter.size() * sizeof(unsigned));
-    }
-  }
-  // initialize the predict buffer
-  virtual void InitModel(void) {
-    pred_buffer.clear(); pred_counter.clear();
-    pred_buffer.resize(mparam.PredBufferSize(), 0.0f);
-    pred_counter.resize(mparam.PredBufferSize(), 0);
-    utils::Assert(mparam.num_trees == 0, "GBTree: model already initialized");
-    utils::Assert(trees.size() == 0, "GBTree: model already initialized");
-  }
-  virtual void ResetPredBuffer(size_t num_pbuffer) {
-    mparam.num_pbuffer = static_cast<int64_t>(num_pbuffer);
-    pred_buffer.clear(); pred_counter.clear();
-    pred_buffer.resize(mparam.PredBufferSize(), 0.0f);
-    pred_counter.resize(mparam.PredBufferSize(), 0);
-  }
-  virtual bool AllowLazyCheckPoint(void) const {
-    return !(tparam.distcol_mode != 0  && mparam.num_output_group != 1);
-  }
-  virtual void DoBoost(IFMatrix *p_fmat,
-                       int64_t buffer_offset,
-                       const BoosterInfo &info,
-                       std::vector<bst_gpair> *in_gpair) {
-    const std::vector<bst_gpair> &gpair = *in_gpair;
-    std::vector<std::vector<tree::RegTree*> > new_trees;
-    if (mparam.num_output_group == 1) {
-      new_trees.push_back(BoostNewTrees(gpair, p_fmat, buffer_offset, info, 0));
-    } else {
-      const int ngroup = mparam.num_output_group;
-      utils::Check(gpair.size() % ngroup == 0,
-                   "must have exactly ngroup*nrow gpairs");
-      std::vector<bst_gpair> tmp(gpair.size()/ngroup);
-      for (int gid = 0; gid < ngroup; ++gid) {
-        bst_omp_uint nsize = static_cast<bst_omp_uint>(tmp.size());
-        #pragma omp parallel for schedule(static)
-        for (bst_omp_uint i = 0; i < nsize; ++i) {
-          tmp[i] = gpair[i * ngroup + gid];
-        }
-        new_trees.push_back(BoostNewTrees(tmp, p_fmat, buffer_offset, info, gid));
-      }
-    }
-    for (int gid = 0; gid < mparam.num_output_group; ++gid) {
-      this->CommitModel(new_trees[gid], gid);
-    }
-  }
-  virtual void Predict(IFMatrix *p_fmat,
-                       int64_t buffer_offset,
-                       const BoosterInfo &info,
-                       std::vector<float> *out_preds,
-                       unsigned ntree_limit = 0) {
-    int nthread;
-    #pragma omp parallel
-    {
-      nthread = omp_get_num_threads();
-    }
-    InitThreadTemp(nthread);
-    std::vector<float> &preds = *out_preds;
-    const size_t stride = info.num_row * mparam.num_output_group;
-    preds.resize(stride * (mparam.size_leaf_vector+1));
-    // start collecting the prediction
-    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
-    iter->BeforeFirst();
-    while (iter->Next()) {
-      const RowBatch &batch = iter->Value();
-      // parallel over local batch
-      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < nsize; ++i) {
-        const int tid = omp_get_thread_num();
-        tree::RegTree::FVec &feats = thread_temp[tid];
-        int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
-        utils::Assert(static_cast<size_t>(ridx) < info.num_row, "data row index exceed bound");
-        // loop over output groups
-        for (int gid = 0; gid < mparam.num_output_group; ++gid) {
-          this->Pred(batch[i],
-                     buffer_offset < 0 ? -1 : buffer_offset + ridx,
-                     gid, info.GetRoot(ridx), &feats,
-                     &preds[ridx * mparam.num_output_group + gid], stride,
-                     ntree_limit);
-        }
-      }
-    }
-  }
-  virtual void Predict(const SparseBatch::Inst &inst,
-                       std::vector<float> *out_preds,
-                       unsigned ntree_limit,
-                       unsigned root_index) {
-    if (thread_temp.size() == 0) {
-      thread_temp.resize(1, tree::RegTree::FVec());
-      thread_temp[0].Init(mparam.num_feature);
-    }
-    out_preds->resize(mparam.num_output_group * (mparam.size_leaf_vector+1));
-    // loop over output groups
-    for (int gid = 0; gid < mparam.num_output_group; ++gid) {
-      this->Pred(inst, -1, gid, root_index, &thread_temp[0],
-                 &(*out_preds)[gid], mparam.num_output_group,
-                 ntree_limit);
-    }
-  }
-  virtual void PredictLeaf(IFMatrix *p_fmat,
-                           const BoosterInfo &info,
-                           std::vector<float> *out_preds,
-                           unsigned ntree_limit) {
-    int nthread;
-    #pragma omp parallel
-    {
-      nthread = omp_get_num_threads();
-    }
-    InitThreadTemp(nthread);
-    this->PredPath(p_fmat, info, out_preds, ntree_limit);
-  }
-  virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
-    std::vector<std::string> dump;
-    for (size_t i = 0; i < trees.size(); i++) {
-      dump.push_back(trees[i]->DumpModel(fmap, option&1));
-    }
-    return dump;
-  }
-
- protected:
-  // clear the model
-  inline void Clear(void) {
-    for (size_t i = 0; i < trees.size(); ++i) {
-      delete trees[i];
-    }
-    for (size_t i = 0; i < updaters.size(); ++i) {
-      delete updaters[i];
-    }
-    updaters.clear();
-    trees.clear();
-    pred_buffer.clear();
-    pred_counter.clear();
-  }
-  // initialize updater before using them
-  inline void InitUpdater(void) {
-    if (tparam.updater_initialized != 0) return;
-    for (size_t i = 0; i < updaters.size(); ++i) {
-      delete updaters[i];
-    }
-    updaters.clear();
-    std::string tval = tparam.updater_seq;
-    char *pstr;
-    pstr = std::strtok(&tval[0], ",");
-    while (pstr != NULL) {
-      updaters.push_back(tree::CreateUpdater(pstr));
-      for (size_t j = 0; j < cfg.size(); ++j) {
-        // set parameters
-        updaters.back()->SetParam(cfg[j].first.c_str(), cfg[j].second.c_str());
-      }
-      pstr = std::strtok(NULL, ",");
-    }
-    tparam.updater_initialized = 1;
-  }
-  // do group specific group
-  inline std::vector<tree::RegTree*>
-  BoostNewTrees(const std::vector<bst_gpair> &gpair,
-                IFMatrix *p_fmat,
-                int64_t buffer_offset,
-                const BoosterInfo &info,
-                int bst_group) {
-    std::vector<tree::RegTree *> new_trees;
-    this->InitUpdater();
-    // create the trees
-    for (int i = 0; i < tparam.num_parallel_tree; ++i) {
-      new_trees.push_back(new tree::RegTree());
-      for (size_t j = 0; j < cfg.size(); ++j) {
-        new_trees.back()->param.SetParam(cfg[j].first.c_str(), cfg[j].second.c_str());
-      }
-      new_trees.back()->InitModel();
-    }
-    // update the trees
-    for (size_t i = 0; i < updaters.size(); ++i) {
-      updaters[i]->Update(gpair, p_fmat, info, new_trees);
-    }
-    // optimization, update buffer, if possible
-    // this is only under distributed column mode
-    // for safety check of lazy checkpoint
-    if (
-        buffer_offset >= 0 &&
-        new_trees.size() == 1 && updaters.size() > 0 &&
-        updaters.back()->GetLeafPosition() != NULL) {
-      utils::Check(info.num_row == p_fmat->buffered_rowset().size(),
-                   "distributed mode is not compatible with prob_buffer_row");
-      this->UpdateBufferByPosition(p_fmat,
-                                   buffer_offset, bst_group,
-                                   *new_trees[0],
-                                   updaters.back()->GetLeafPosition());
-    }
-    return new_trees;
-  }
-  // commit new trees all at once
-  inline void CommitModel(const std::vector<tree::RegTree*> &new_trees, int bst_group) {
-    for (size_t i = 0; i < new_trees.size(); ++i) {
-      trees.push_back(new_trees[i]);
-      tree_info.push_back(bst_group);
-    }
-    mparam.num_trees += static_cast<int>(new_trees.size());
-  }
-  // update buffer by pre-cached position
-  inline void UpdateBufferByPosition(IFMatrix *p_fmat,
-                                     int64_t buffer_offset,
-                                     int bst_group,
-                                     const tree::RegTree &new_tree,
-                                     const int* leaf_position) {
-    const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
-    #pragma omp parallel for schedule(static)
-    for (bst_omp_uint i = 0; i < ndata; ++i) {
-      const bst_uint ridx = rowset[i];
-      const int64_t bid = mparam.BufferOffset(buffer_offset + ridx, bst_group);
-      const int tid = leaf_position[ridx];
-      utils::Assert(pred_counter[bid] == trees.size(), "cached buffer not up to date");
-      utils::Assert(tid >= 0, "invalid leaf position");
-      pred_buffer[bid] += new_tree[tid].leaf_value();
-      for (int i = 0; i < mparam.size_leaf_vector; ++i) {
-        pred_buffer[bid + i + 1] += new_tree.leafvec(tid)[i];
-      }
-      pred_counter[bid] += tparam.num_parallel_tree;
-    }
-  }
-  // make a prediction for a single instance
-  inline void Pred(const RowBatch::Inst &inst,
-                   int64_t buffer_index,
-                   int bst_group,
-                   unsigned root_index,
-                   tree::RegTree::FVec *p_feats,
-                   float *out_pred, size_t stride,
-                   unsigned ntree_limit) {
-    size_t itop = 0;
-    float  psum = 0.0f;
-    // sum of leaf vector
-    std::vector<float> vec_psum(mparam.size_leaf_vector, 0.0f);
-    const int64_t bid = mparam.BufferOffset(buffer_index, bst_group);
-    // number of valid trees
-    unsigned treeleft = ntree_limit == 0 ? std::numeric_limits<unsigned>::max() : ntree_limit;
-    // load buffered results if any
-    if (bid >= 0 && ntree_limit == 0) {
-      itop = pred_counter[bid];
-      psum = pred_buffer[bid];
-      for (int i = 0; i < mparam.size_leaf_vector; ++i) {
-        vec_psum[i] = pred_buffer[bid + i + 1];
-      }
-    }
-    if (itop != trees.size()) {
-      p_feats->Fill(inst);
-      for (size_t i = itop; i < trees.size(); ++i) {
-        if (tree_info[i] == bst_group) {
-          int tid = trees[i]->GetLeafIndex(*p_feats, root_index);
-          psum += (*trees[i])[tid].leaf_value();
-          for (int j = 0; j < mparam.size_leaf_vector; ++j) {
-            vec_psum[j] += trees[i]->leafvec(tid)[j];
-          }
-          if (--treeleft == 0) break;
-        }
-      }
-      p_feats->Drop(inst);
-    }
-    // updated the buffered results
-    if (bid >= 0 && ntree_limit == 0) {
-      pred_counter[bid] = static_cast<unsigned>(trees.size());
-      pred_buffer[bid] = psum;
-      for (int i = 0; i < mparam.size_leaf_vector; ++i) {
-        pred_buffer[bid + i + 1] = vec_psum[i];
-      }
-    }
-    out_pred[0] = psum;
-    for (int i = 0; i < mparam.size_leaf_vector; ++i) {
-      out_pred[stride * (i + 1)] = vec_psum[i];
-    }
-  }
-  // predict independent leaf index
-  inline void PredPath(IFMatrix *p_fmat,
-                       const BoosterInfo &info,
-                       std::vector<float> *out_preds,
-                       unsigned ntree_limit) {
-    // number of valid trees
-    if (ntree_limit == 0 || ntree_limit > trees.size()) {
-      ntree_limit = static_cast<unsigned>(trees.size());
-    }
-    std::vector<float> &preds = *out_preds;
-    preds.resize(info.num_row * ntree_limit);
-    // start collecting the prediction
-    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
-    iter->BeforeFirst();
-    while (iter->Next()) {
-      const RowBatch &batch = iter->Value();
-      // parallel over local batch
-      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < nsize; ++i) {
-        const int tid = omp_get_thread_num();
-        size_t ridx = static_cast<size_t>(batch.base_rowid + i);
-        tree::RegTree::FVec &feats = thread_temp[tid];
-        feats.Fill(batch[i]);
-        for (unsigned j = 0; j < ntree_limit; ++j) {
-          int tid = trees[j]->GetLeafIndex(feats, info.GetRoot(ridx));
-          preds[ridx * ntree_limit + j] = static_cast<float>(tid);
-        }
-        feats.Drop(batch[i]);
-      }
-    }
-  }
-  // init thread buffers
-  inline void InitThreadTemp(int nthread) {
-    int prev_thread_temp_size = thread_temp.size();
-    if (prev_thread_temp_size < nthread) {
-      thread_temp.resize(nthread, tree::RegTree::FVec());
-      for (int i = prev_thread_temp_size; i < nthread; ++i) {
-        thread_temp[i].Init(mparam.num_feature);
-      }
-    }
-  }
-
-  // --- data structure ---
-  /*! \brief training parameters */
-  struct TrainParam {
-    /*! \brief number of threads */
-    int nthread;
-    /*!
-     * \brief number of parallel trees constructed each iteration
-     *  use this option to support boosted random forest
-     */
-    int num_parallel_tree;
-    /*! \brief whether updater is already initialized */
-    int updater_initialized;
-    /*! \brief distributed column mode */
-    int distcol_mode;
-    /*! \brief tree updater sequence */
-    std::string updater_seq;
-    // construction
-    TrainParam(void) {
-      nthread = 0;
-      updater_seq = "grow_colmaker,prune";
-      num_parallel_tree = 1;
-      updater_initialized = 0;
-      distcol_mode = 0;
-    }
-    inline void SetParam(const char *name, const char *val){
-      using namespace std;
-      if (!strcmp(name, "updater") &&
-          strcmp(updater_seq.c_str(), val) != 0) {
-        updater_seq = val;
-        updater_initialized = 0;
-      }
-      if (!strcmp(name, "dsplit") && !strcmp(val, "col")) {
-        distcol_mode = 1;
-      }
-      if (!strcmp(name, "nthread")) {
-        omp_set_num_threads(nthread = atoi(val));
-      }
-      if (!strcmp(name, "num_parallel_tree")) {
-        num_parallel_tree = atoi(val);
-      }
-    }
-  };
-  /*! \brief model parameters */
-  struct ModelParam {
-    /*! \brief number of trees */
-    int num_trees;
-    /*! \brief number of root: default 0, means single tree */
-    int num_roots;
-    /*! \brief number of features to be used by trees */
-    int num_feature;
-    /*! \brief size of prediction buffer allocated used for buffering */
-    int64_t num_pbuffer;
-    /*!
-     * \brief how many output group a single instance can produce
-     *  this affects the behavior of number of output we have:
-     *    suppose we have n instance and k group, output will be k*n
-     */
-    int num_output_group;
-    /*! \brief size of leaf vector needed in tree */
-    int size_leaf_vector;
-    /*! \brief reserved parameters */
-    int reserved[31];
-    /*! \brief constructor */
-    ModelParam(void) {
-      std::memset(this, 0, sizeof(ModelParam));
-      num_trees = 0;
-      num_roots = num_feature = 0;
-      num_pbuffer = 0;
-      num_output_group = 1;
-      size_leaf_vector = 0;
-    }
-    /*!
-     * \brief set parameters from outside
-     * \param name name of the parameter
-     * \param val  value of the parameter
-     */
-    inline void SetParam(const char *name, const char *val) {
-      using namespace std;
-      if (!strcmp("num_pbuffer", name)) num_pbuffer = atol(val);
-      if (!strcmp("num_output_group", name)) num_output_group = atol(val);
-      if (!strcmp("bst:num_roots", name)) num_roots = atoi(val);
-      if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
-      if (!strcmp("bst:size_leaf_vector", name)) size_leaf_vector = atoi(val);
-    }
-    /*! \return size of prediction buffer actually needed */
-    inline size_t PredBufferSize(void) const {
-      return num_output_group * num_pbuffer * (size_leaf_vector + 1);
-    }
-    /*!
-     * \brief get the buffer offset given a buffer index and group id
-     * \return calculated buffer offset
-     */
-    inline int64_t BufferOffset(int64_t buffer_index, int bst_group) const {
-      if (buffer_index < 0) return -1;
-      utils::Check(buffer_index < num_pbuffer, "buffer_index exceed num_pbuffer");
-      return (buffer_index + num_pbuffer * bst_group) * (size_leaf_vector + 1);
-    }
-  };
-  // training parameter
-  TrainParam tparam;
-  // model parameter
-  ModelParam mparam;
-  /*! \brief vector of trees stored in the model */
-  std::vector<tree::RegTree*> trees;
-  /*! \brief some information indicator of the tree, reserved */
-  std::vector<int> tree_info;
-  /*! \brief prediction buffer */
-  std::vector<float>  pred_buffer;
-  /*! \brief prediction buffer counter, remember the prediction */
-  std::vector<unsigned> pred_counter;
-  // ----training fields----
-  // configurations for tree
-  std::vector< std::pair<std::string, std::string> > cfg;
-  // temporal storage for per thread
-  std::vector<tree::RegTree::FVec> thread_temp;
-  // the updaters that can be applied to each of tree
-  std::vector<tree::IUpdater*> updaters;
-};
-
-}  // namespace gbm
-}  // namespace xgboost
-#endif  // XGBOOST_GBM_GBTREE_INL_HPP_
--- a/src/io/dmlc_simple.cpp
+++ b/src/io/dmlc_simple.cpp
@@ -1,229 +0,0 @@
-// Copyright by Contributors
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#define NOMINMAX
-#include <string>
-#include "../utils/io.h"
-
-// implements a single no split version of DMLC
-// in case we want to avoid dependency on dmlc-core
-
-namespace xgboost {
-namespace utils {
-/*!
- * \brief line split implementation from single FILE
- * simply returns lines of files, used for stdin
- */
-class SingleFileSplit : public dmlc::InputSplit {
- public:
-  explicit SingleFileSplit(const char *fname)
-      : use_stdin_(false),
-        chunk_begin_(NULL), chunk_end_(NULL) {
-    if (!std::strcmp(fname, "stdin")) {
-#ifndef XGBOOST_STRICT_CXX98_
-      use_stdin_ = true; fp_ = stdin;
-#endif
-    }
-    if (!use_stdin_) {
-      fp_ = utils::FopenCheck(fname, "rb");
-    }
-    buffer_.resize(kBufferSize);
-  }
-  virtual ~SingleFileSplit(void) {
-    if (!use_stdin_) std::fclose(fp_);
-  }
-  virtual size_t Read(void *ptr, size_t size) {
-    return std::fread(ptr, 1, size, fp_);
-  }
-  virtual void Write(const void *ptr, size_t size) {
-    utils::Error("cannot do write in inputsplit");
-  }
-  virtual void BeforeFirst(void) {
-    std::fseek(fp_, 0, SEEK_SET);
-  }
-  virtual bool NextRecord(Blob *out_rec) {
-    if (chunk_begin_ == chunk_end_) {
-      if (!LoadChunk()) return false;
-    }
-    char *next = FindNextRecord(chunk_begin_,
-                                chunk_end_);
-    out_rec->dptr = chunk_begin_;
-    out_rec->size = next - chunk_begin_;
-    chunk_begin_ = next;
-    return true;
-  }
-  virtual bool NextChunk(Blob *out_chunk) {
-    if (chunk_begin_ == chunk_end_) {
-      if (!LoadChunk()) return false;
-    }
-    out_chunk->dptr = chunk_begin_;
-    out_chunk->size = chunk_end_ - chunk_begin_;
-    chunk_begin_ = chunk_end_;
-    return true;
-  }
-  inline bool ReadChunk(void *buf, size_t *size) {
-    size_t max_size = *size;
-    if (max_size <= overflow_.length()) {
-      *size = 0; return true;
-    }
-    if (overflow_.length() != 0) {
-      std::memcpy(buf, BeginPtr(overflow_), overflow_.length());
-    }
-    size_t olen = overflow_.length();
-    overflow_.resize(0);
-    size_t nread = this->Read(reinterpret_cast<char*>(buf) + olen,
-                              max_size - olen);
-    nread += olen;
-    if (nread == 0) return false;
-    if (nread != max_size) {
-      *size = nread;
-      return true;
-    } else {
-      const char *bptr = reinterpret_cast<const char*>(buf);
-      // return the last position where a record starts
-      const char *bend = this->FindLastRecordBegin(bptr, bptr + max_size);
-      *size = bend - bptr;
-      overflow_.resize(max_size - *size);
-      if (overflow_.length() != 0) {
-        std::memcpy(BeginPtr(overflow_), bend, overflow_.length());
-      }
-      return true;
-    }
-  }
-
- protected:
-  inline const char* FindLastRecordBegin(const char *begin,
-                                         const char *end) {
-    if (begin == end) return begin;
-    for (const char *p = end - 1; p != begin; --p) {
-      if (*p == '\n' || *p == '\r') return p + 1;
-    }
-    return begin;
-  }
-  inline char* FindNextRecord(char *begin, char *end) {
-    char *p;
-    for (p = begin; p != end; ++p) {
-      if (*p == '\n' || *p == '\r') break;
-    }
-    for (; p != end; ++p) {
-      if (*p != '\n' && *p != '\r') return p;
-    }
-    return end;
-  }
-  inline bool LoadChunk(void) {
-    while (true) {
-      size_t size = buffer_.length();
-      if (!ReadChunk(BeginPtr(buffer_), &size)) return false;
-      if (size == 0) {
-        buffer_.resize(buffer_.length() * 2);
-      } else {
-        chunk_begin_ = reinterpret_cast<char *>(BeginPtr(buffer_));
-        chunk_end_ = chunk_begin_ + size;
-        break;
-      }
-    }
-    return true;
-  }
-
- private:
-  // buffer size
-  static const size_t kBufferSize = 1 << 18UL;
-  // file
-  std::FILE *fp_;
-  bool use_stdin_;
-  // internal overflow
-  std::string overflow_;
-  // internal buffer
-  std::string buffer_;
-  // beginning of chunk
-  char *chunk_begin_;
-  // end of chunk
-  char *chunk_end_;
-};
-
-class StdFile : public dmlc::Stream {
- public:
-  explicit StdFile(std::FILE *fp, bool use_stdio)
-      : fp(fp), use_stdio(use_stdio) {
-  }
-  virtual ~StdFile(void) {
-    this->Close();
-  }
-  virtual size_t Read(void *ptr, size_t size) {
-    return std::fread(ptr, 1, size, fp);
-  }
-  virtual void Write(const void *ptr, size_t size) {
-    Check(std::fwrite(ptr, size, 1, fp) == 1, "StdFile::Write: fwrite error!");
-  }
-  virtual void Seek(size_t pos) {
-    std::fseek(fp, static_cast<long>(pos), SEEK_SET);  // NOLINT(*)
-  }
-  virtual size_t Tell(void) {
-    return std::ftell(fp);
-  }
-  virtual bool AtEnd(void) const {
-    return std::feof(fp) != 0;
-  }
-  inline void Close(void) {
-    if (fp != NULL && !use_stdio) {
-      std::fclose(fp); fp = NULL;
-    }
-  }
-
- private:
-  std::FILE *fp;
-  bool use_stdio;
-};
-}  // namespace utils
-}  // namespace xgboost
-
-namespace dmlc {
-InputSplit* InputSplit::Create(const char *uri,
-                               unsigned part,
-                               unsigned nsplit,
-                               const char *type) {
-  using namespace std;
-  using namespace xgboost;
-  const char *msg = "xgboost is compiled in local mode\n"\
-      "to use hdfs, s3 or distributed version, compile with make dmlc=1";
-  utils::Check(strncmp(uri, "s3://", 5) != 0, msg);
-  utils::Check(strncmp(uri, "hdfs://", 7) != 0, msg);
-  utils::Check(nsplit == 1, msg);
-  return new utils::SingleFileSplit(uri);
-}
-
-Stream *Stream::Create(const char *fname, const char * const mode, bool allow_null) {
-  using namespace std;
-  using namespace xgboost;
-  const char *msg = "xgboost is compiled in local mode\n"\
-      "to use hdfs, s3 or distributed version, compile with make dmlc=1";
-  utils::Check(strncmp(fname, "s3://", 5) != 0, msg);
-  utils::Check(strncmp(fname, "hdfs://", 7) != 0, msg);
-
-  std::FILE *fp = NULL;
-  bool use_stdio = false;
-  using namespace std;
-#ifndef XGBOOST_STRICT_CXX98_
-  if (!strcmp(fname, "stdin")) {
-    use_stdio = true; fp = stdin;
-  }
-  if (!strcmp(fname, "stdout")) {
-    use_stdio = true; fp = stdout;
-  }
-#endif
-  if (!strncmp(fname, "file://", 7)) fname += 7;
-  if (!use_stdio) {
-    std::string flag = mode;
-    if (flag == "w") flag = "wb";
-    if (flag == "r") flag = "rb";
-    fp = fopen64(fname, flag.c_str());
-  }
-  if (fp != NULL) {
-    return new utils::StdFile(fp, use_stdio);
-  } else {
-    utils::Check(allow_null, "fail to open file %s", fname);
-    return NULL;
-  }
-}
-}  // namespace dmlc
-
--- a/src/io/io.cpp
+++ b/src/io/io.cpp
@@ -1,97 +0,0 @@
-// Copyright 2014 by Contributors
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#define NOMINMAX
-#include <string>
-#include "./io.h"
-#include "../utils/io.h"
-#include "../utils/utils.h"
-#include "simple_dmatrix-inl.hpp"
-#include "page_dmatrix-inl.hpp"
-
-namespace xgboost {
-namespace io {
-DataMatrix* LoadDataMatrix(const char *fname,
-                           bool silent,
-                           bool savebuffer,
-                           bool loadsplit,
-                           const char *cache_file) {
-  using namespace std;
-  std::string fname_ = fname;
-
-  const char *dlm = strchr(fname, '#');
-  if (dlm != NULL) {
-    utils::Check(strchr(dlm + 1, '#') == NULL,
-                 "only one `#` is allowed in file path for cachefile specification");
-    utils::Check(cache_file == NULL,
-                 "can only specify the cachefile with `#` or argument, not both");
-    fname_ = std::string(fname, dlm - fname);
-    fname = fname_.c_str();
-    cache_file = dlm +1;
-  }
-
-  if (cache_file == NULL) {
-    if (!std::strcmp(fname, "stdin") ||
-        !std::strncmp(fname, "s3://", 5) ||
-        !std::strncmp(fname, "hdfs://", 7) ||
-        loadsplit) {
-      DMatrixSimple *dmat = new DMatrixSimple();
-      dmat->LoadText(fname, silent, loadsplit);
-      return dmat;
-    }
-    int magic;
-    utils::FileStream fs(utils::FopenCheck(fname, "rb"));
-    utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
-    fs.Seek(0);
-    if (magic == DMatrixSimple::kMagic) {
-      DMatrixSimple *dmat = new DMatrixSimple();
-      dmat->LoadBinary(fs, silent, fname);
-      fs.Close();
-      return dmat;
-    }
-    fs.Close();
-    DMatrixSimple *dmat = new DMatrixSimple();
-    dmat->CacheLoad(fname, silent, savebuffer);
-    return dmat;
-  } else {
-    std::string cache_fname = cache_file;
-    if (loadsplit) {
-      std::ostringstream os;
-      os << cache_file << ".r" << rabit::GetRank();
-      cache_fname = os.str();
-      cache_file = cache_fname.c_str();
-    }
-    FILE *fi = fopen64(cache_file, "rb");
-    if (fi != NULL) {
-      DMatrixPage *dmat = new DMatrixPage();
-      utils::FileStream fs(fi);
-      dmat->LoadBinary(fs, silent, cache_file);
-      fs.Close();
-      return dmat;
-    } else {
-      if (fname[0] == '!') {
-        DMatrixHalfRAM *dmat = new DMatrixHalfRAM();
-        dmat->LoadText(fname + 1, cache_file, false, loadsplit);
-        return dmat;
-      } else {
-        DMatrixPage *dmat = new DMatrixPage();
-        dmat->LoadText(fname, cache_file, false, loadsplit);
-        return dmat;
-      }
-    }
-  }
-}
-
-void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) {
-  if (dmat.magic == DMatrixSimple::kMagic) {
-    const DMatrixSimple *p_dmat = static_cast<const DMatrixSimple*>(&dmat);
-    p_dmat->SaveBinary(fname, silent);
-  } else {
-    DMatrixSimple smat;
-    smat.CopyFrom(dmat);
-    smat.SaveBinary(fname, silent);
-  }
-}
-
-}  // namespace io
-}  // namespace xgboost
--- a/src/io/io.h
+++ b/src/io/io.h
@@ -1,47 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file io.h
- * \brief handles input data format of xgboost
- *    I/O module handles a specific DMatrix format
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_IO_IO_H_
-#define XGBOOST_IO_IO_H_
-
-#include "../data.h"
-#include "../learner/dmatrix.h"
-
-namespace xgboost {
-/*! \brief namespace related to data format */
-namespace io {
-/*! \brief DMatrix object that I/O module support save/load */
-typedef learner::DMatrix DataMatrix;
-/*!
- * \brief load DataMatrix from stream
- * \param fname file name to be loaded
- * \param silent whether print message during loading
- * \param savebuffer whether temporal buffer the file if the file is in text format
- * \param loadsplit whether we only load a split of input files
- *        such that each worker node get a split of the data
- * \param cache_file name of cache_file, used by external memory version
- *        can be NULL, if cache_file is specified, this will be the temporal
- *        space that can be re-used to store intermediate data
- * \return a loaded DMatrix
- */
-DataMatrix* LoadDataMatrix(const char *fname,
-                           bool silent,
-                           bool savebuffer,
-                           bool loadsplit,
-                           const char *cache_file = NULL);
-/*!
- * \brief save DataMatrix into stream,
- *  note: the saved dmatrix format may not be in exactly same as input
- *  SaveDMatrix will choose the best way to materialize the dmatrix.
- * \param dmat the dmatrix to be saved
- * \param fname file name to be saved
- * \param silent whether print message during saving
- */
-void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent = false);
-}  // namespace io
-}  // namespace xgboost
-#endif  // XGBOOST_IO_IO_H_
--- a/src/io/libsvm_parser.h
+++ b/src/io/libsvm_parser.h
@@ -1,212 +0,0 @@
-/*!
- *  Copyright (c) 2015 by Contributors
- * \file libsvm_parser.h
- * \brief iterator parser to parse libsvm format
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_IO_LIBSVM_PARSER_H_
-#define XGBOOST_IO_LIBSVM_PARSER_H_
-#define NOMINMAX
-#include <vector>
-#include <cstring>
-#include <cctype>
-#include <algorithm>
-#include "../utils/omp.h"
-#include "../utils/utils.h"
-#include "../sync/sync.h"
-#include "../utils/thread_buffer.h"
-#include "./sparse_batch_page.h"
-
-namespace xgboost {
-namespace io {
-/*! \brief page returned by libsvm parser */
-struct LibSVMPage : public SparsePage {
-  std::vector<float> label;
-  // overload clear
-  inline void Clear() {
-    SparsePage::Clear();
-    label.clear();
-  }
-};
-/*!
- * \brief libsvm parser that parses the input lines
- * and returns rows in input data
- * factory that was used by threadbuffer template
- */
-class LibSVMPageFactory  {
- public:
-  LibSVMPageFactory()
-      : bytes_read_(0), at_head_(true) {
-  }
-  inline bool Init(void) {
-    return true;
-  }
-  inline void Setup(dmlc::InputSplit *source,
-                    int nthread) {
-    source_ = source;
-    int maxthread;
-    #pragma omp parallel
-    {
-      maxthread = omp_get_num_procs();
-    }
-    maxthread = std::max(maxthread / 2, 1);
-    nthread_ = std::min(maxthread, nthread);
-  }
-  inline void SetParam(const char *name, const char *val) {}
-  inline bool LoadNext(std::vector<LibSVMPage> *data) {
-    return FillData(data);
-  }
-  inline void FreeSpace(std::vector<LibSVMPage> *a) {
-    delete a;
-  }
-  inline std::vector<LibSVMPage> *Create(void) {
-    return new std::vector<LibSVMPage>();
-  }
-  inline void BeforeFirst(void) {
-    utils::Assert(at_head_, "cannot call beforefirst");
-  }
-  inline void Destroy(void) {
-    delete source_;
-  }
-  inline size_t bytes_read(void) const {
-    return bytes_read_;
-  }
-
- protected:
-  inline bool FillData(std::vector<LibSVMPage> *data) {
-    dmlc::InputSplit::Blob chunk;
-    if (!source_->NextChunk(&chunk)) return false;
-    int nthread;
-    #pragma omp parallel num_threads(nthread_)
-    {
-      nthread = omp_get_num_threads();
-    }
-    // reserve space for data
-    data->resize(nthread);
-    bytes_read_ += chunk.size;
-    utils::Assert(chunk.size != 0, "LibSVMParser.FileData");
-    char *head = reinterpret_cast<char*>(chunk.dptr);
-    #pragma omp parallel num_threads(nthread_)
-    {
-      // threadid
-      int tid = omp_get_thread_num();
-      size_t nstep = (chunk.size + nthread - 1) / nthread;
-      size_t sbegin = std::min(tid * nstep, chunk.size);
-      size_t send = std::min((tid + 1) * nstep, chunk.size);
-      char *pbegin = BackFindEndLine(head + sbegin, head);
-      char *pend;
-      if (tid + 1 == nthread) {
-        pend = head + send;
-      } else {
-        pend = BackFindEndLine(head + send, head);
-      }
-      ParseBlock(pbegin, pend, &(*data)[tid]);
-    }
-    return true;
-  }
-  /*!
-   * \brief parse data into out
-   * \param begin beginning of buffer
-   * \param end end of buffer
-   */
-  inline void ParseBlock(char *begin,
-                         char *end,
-                         LibSVMPage *out) {
-    using namespace std;
-    out->Clear();
-    char *p = begin;
-    while (p != end) {
-      while (isspace(*p) && p != end) ++p;
-      if (p == end) break;
-      char *head = p;
-      while (isdigit(*p) && p != end) ++p;
-      if (*p == ':') {
-        out->data.push_back(SparseBatch::Entry(atol(head),
-                                               static_cast<bst_float>(atof(p + 1))));
-      } else {
-        if (out->label.size() != 0) {
-          out->offset.push_back(out->data.size());
-        }
-        out->label.push_back(static_cast<float>(atof(head)));
-      }
-      while (!isspace(*p) && p != end) ++p;
-    }
-    if (out->label.size() != 0) {
-      out->offset.push_back(out->data.size());
-    }
-    utils::Check(out->label.size() + 1 == out->offset.size(),
-                 "LibSVMParser inconsistent");
-  }
-  /*!
-   * \brief start from bptr, go backward and find first endof line
-   * \param bptr end position to go backward
-   * \param begin the beginning position of buffer
-   * \return position of first endof line going backward
-   */
-  inline char* BackFindEndLine(char *bptr,
-                               char *begin) {
-    for (; bptr != begin; --bptr) {
-      if (*bptr == '\n' || *bptr == '\r') return bptr;
-    }
-    return begin;
-  }
-
- private:
-  // nthread
-  int nthread_;
-  // number of bytes readed
-  size_t bytes_read_;
-  // at beginning, at end of stream
-  bool at_head_;
-  // source split that provides the data
-  dmlc::InputSplit *source_;
-};
-
-class LibSVMParser : public utils::IIterator<LibSVMPage> {
- public:
-  explicit LibSVMParser(dmlc::InputSplit *source,
-                        int nthread)
-      : at_end_(false), data_ptr_(0), data_(NULL) {
-    itr.SetParam("buffer_size", "2");
-    itr.get_factory().Setup(source, nthread);
-    itr.Init();
-  }
-  virtual void BeforeFirst(void) {
-    itr.BeforeFirst();
-  }
-  virtual bool Next(void) {
-    if (at_end_) return false;
-    while (true) {
-      if (data_ == NULL || data_ptr_ >= data_->size()) {
-        if (!itr.Next(data_)) {
-          at_end_ = true; return false;
-        } else {
-          data_ptr_ = 0;
-        }
-      }
-      while (data_ptr_ < data_->size()) {
-        data_ptr_ += 1;
-        if ((*data_)[data_ptr_ - 1].Size() != 0) {
-          return true;
-        }
-      }
-    }
-    return true;
-  }
-  virtual const LibSVMPage &Value(void) const {
-    return (*data_)[data_ptr_ - 1];
-  }
-  inline size_t bytes_read(void) const {
-    return itr.get_factory().bytes_read();
-  }
-
- private:
-  bool at_end_;
-  size_t data_ptr_;
-  std::vector<LibSVMPage> *data_;
-  utils::ThreadBuffer<std::vector<LibSVMPage>*, LibSVMPageFactory> itr;
-};
-
-}  // namespace io
-}  // namespace xgboost
-#endif  // XGBOOST_IO_LIBSVM_PARSER_H_
--- a/src/io/page_dmatrix-inl.hpp
+++ b/src/io/page_dmatrix-inl.hpp
@@ -1,260 +0,0 @@
-/*!
- *  Copyright (c) 2014 by Contributors
- * \file page_dmatrix-inl.hpp
- *   row iterator based on sparse page
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_IO_PAGE_DMATRIX_INL_HPP_
-#define XGBOOST_IO_PAGE_DMATRIX_INL_HPP_
-
-#include <vector>
-#include <string>
-#include <algorithm>
-#include "../data.h"
-#include "../utils/iterator.h"
-#include "../utils/thread_buffer.h"
-#include "./simple_fmatrix-inl.hpp"
-#include "./sparse_batch_page.h"
-#include "./page_fmatrix-inl.hpp"
-#include "./libsvm_parser.h"
-
-namespace xgboost {
-namespace io {
-/*! \brief thread buffer iterator */
-class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
- public:
-  ThreadRowPageIterator(void) {
-    itr.SetParam("buffer_size", "4");
-    page_ = NULL;
-    base_rowid_ = 0;
-  }
-  virtual ~ThreadRowPageIterator(void) {}
-  virtual void Init(void) {
-  }
-  virtual void BeforeFirst(void) {
-    itr.BeforeFirst();
-    base_rowid_ = 0;
-  }
-  virtual bool Next(void) {
-    if (!itr.Next(page_)) return false;
-    out_ = page_->GetRowBatch(base_rowid_);
-    base_rowid_ += out_.size;
-    return true;
-  }
-  virtual const RowBatch &Value(void) const {
-    return out_;
-  }
-  /*! \brief load and initialize the iterator with fi */
-  inline void Load(const utils::FileStream &fi) {
-    itr.get_factory().SetFile(fi, 0);
-    itr.Init();
-    this->BeforeFirst();
-  }
-
- private:
-  // base row id
-  size_t base_rowid_;
-  // output data
-  RowBatch out_;
-  SparsePage *page_;
-  utils::ThreadBuffer<SparsePage*, SparsePageFactory> itr;
-};
-
-/*! \brief data matrix using page */
-template<int TKMagic>
-class DMatrixPageBase : public DataMatrix {
- public:
-  DMatrixPageBase(void) : DataMatrix(kMagic) {
-    iter_ = new ThreadRowPageIterator();
-  }
-  // virtual destructor
-  virtual ~DMatrixPageBase(void) {
-    // do not delete row iterator, since it is owned by fmat
-    // to be cleaned up in a more clear way
-  }
-  /*! \brief save a DataMatrix as DMatrixPage */
-  inline static void Save(const char *fname_, const DataMatrix &mat, bool silent) {
-    std::string fname = fname_;
-    utils::FileStream fs(utils::FopenCheck(fname.c_str(), "wb"));
-    int magic = kMagic;
-    fs.Write(&magic, sizeof(magic));
-    mat.info.SaveBinary(fs);
-    fs.Close();
-    fname += ".row.blob";
-    utils::IIterator<RowBatch> *iter = mat.fmat()->RowIterator();
-    utils::FileStream fbin(utils::FopenCheck(fname.c_str(), "wb"));
-    SparsePage page;
-    iter->BeforeFirst();
-    while (iter->Next()) {
-      const RowBatch &batch = iter->Value();
-      for (size_t i = 0; i < batch.size; ++i) {
-        page.Push(batch[i]);
-        if (page.MemCostBytes() >= kPageSize) {
-          page.Save(&fbin); page.Clear();
-        }
-      }
-    }
-    if (page.data.size() != 0) page.Save(&fbin);
-    fbin.Close();
-    if (!silent) {
-      utils::Printf("DMatrixPage: %lux%lu is saved to %s\n",
-                    static_cast<unsigned long>(mat.info.num_row()), // NOLINT(*)
-                    static_cast<unsigned long>(mat.info.num_col()), fname_); // NOLINT(*)
-    }
-  }
-  /*! \brief load and initialize the iterator with fi */
-  inline void LoadBinary(utils::FileStream &fi,  // NOLINT(*)
-                         bool silent,
-                         const char *fname_) {
-    this->set_cache_file(fname_);
-    std::string fname = fname_;
-    int tmagic;
-    utils::Check(fi.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
-    this->CheckMagic(tmagic);
-    this->info.LoadBinary(fi);
-    // load in the row data file
-    fname += ".row.blob";
-    utils::FileStream fs(utils::FopenCheck(fname.c_str(), "rb"));
-    iter_->Load(fs);
-    if (!silent) {
-      utils::Printf("DMatrixPage: %lux%lu matrix is loaded",
-                    static_cast<unsigned long>(info.num_row()),  // NOLINT(*)
-                    static_cast<unsigned long>(info.num_col()));  // NOLINT(*)
-      if (fname_ != NULL) {
-        utils::Printf(" from %s\n", fname_);
-      } else {
-        utils::Printf("\n");
-      }
-      if (info.group_ptr.size() != 0) {
-        utils::Printf("data contains %u groups\n", (unsigned)info.group_ptr.size() - 1);
-      }
-    }
-  }
-  /*! \brief save a LibSVM format file as DMatrixPage */
-  inline void LoadText(const char *uri,
-                       const char* cache_file,
-                       bool silent,
-                       bool loadsplit) {
-    if (!silent) {
-      utils::Printf("start generate text file from %s\n", uri);
-    }
-    int rank = 0, npart = 1;
-    if (loadsplit) {
-      rank = rabit::GetRank();
-      npart = rabit::GetWorldSize();
-    }
-    this->set_cache_file(cache_file);
-    std::string fname_row = std::string(cache_file) + ".row.blob";
-    utils::FileStream fo(utils::FopenCheck(fname_row.c_str(), "wb"));
-    SparsePage page;
-    size_t bytes_write = 0;
-    double tstart = rabit::utils::GetTime();
-    LibSVMParser parser(
-        dmlc::InputSplit::Create(uri, rank, npart, "text"), 16);
-    info.Clear();
-    while (parser.Next()) {
-      const LibSVMPage &batch = parser.Value();
-      size_t nlabel = info.labels.size();
-      info.labels.resize(nlabel + batch.label.size());
-      if (batch.label.size() != 0) {
-        std::memcpy(BeginPtr(info.labels) + nlabel,
-                    BeginPtr(batch.label),
-                    batch.label.size() * sizeof(float));
-      }
-      page.Push(batch);
-      for (size_t i = 0; i < batch.data.size(); ++i) {
-        info.info.num_col = std::max(info.info.num_col,
-                                     static_cast<size_t>(batch.data[i].index+1));
-      }
-      if (page.MemCostBytes() >= kPageSize) {
-        bytes_write += page.MemCostBytes();
-        page.Save(&fo);
-        page.Clear();
-        double tdiff = rabit::utils::GetTime() - tstart;
-        if (!silent) {
-          utils::Printf("Writting to %s in %g MB/s, %lu MB written\n",
-                        cache_file, (bytes_write >> 20UL) / tdiff,
-                        (bytes_write >> 20UL));
-        }
-      }
-      info.info.num_row += batch.label.size();
-    }
-    if (page.data.size() != 0) {
-      page.Save(&fo);
-    }
-    fo.Close();
-    iter_->Load(utils::FileStream(utils::FopenCheck(fname_row.c_str(), "rb")));
-    // save data matrix
-    utils::FileStream fs(utils::FopenCheck(cache_file, "wb"));
-    int tmagic = kMagic;
-    fs.Write(&tmagic, sizeof(tmagic));
-    this->info.SaveBinary(fs);
-    fs.Close();
-    if (!silent) {
-      utils::Printf("DMatrixPage: %lux%lu is parsed from %s\n",
-                    static_cast<unsigned long>(info.num_row()),  // NOLINT(*)
-                    static_cast<unsigned long>(info.num_col()),  // NOLINT(*)
-                    uri);
-    }
-  }
-  /*! \brief magic number used to identify DMatrix */
-  static const int kMagic = TKMagic;
-  /*! \brief page size 32 MB */
-  static const size_t kPageSize = 32UL << 20UL;
-
- protected:
-  virtual void set_cache_file(const std::string &cache_file)  = 0;
-  virtual void CheckMagic(int tmagic)  = 0;
-  /*! \brief row iterator */
-  ThreadRowPageIterator *iter_;
-};
-
-class DMatrixPage : public DMatrixPageBase<0xffffab02> {
- public:
-  DMatrixPage(void) {
-    fmat_ = new FMatrixPage(iter_, this->info);
-  }
-  virtual ~DMatrixPage(void) {
-    delete fmat_;
-  }
-  virtual IFMatrix *fmat(void) const {
-    return fmat_;
-  }
-  virtual void set_cache_file(const std::string &cache_file) {
-    fmat_->set_cache_file(cache_file);
-  }
-  virtual void CheckMagic(int tmagic) {
-    utils::Check(tmagic == DMatrixPageBase<0xffffab02>::kMagic ||
-                 tmagic == DMatrixPageBase<0xffffab03>::kMagic,
-                 "invalid format,magic number mismatch");
-  }
-  /*! \brief the real fmatrix */
-  FMatrixPage *fmat_;
-};
-
-// mix of FMatrix S and DMatrix
-// cost half of ram usually as DMatrixSimple
-class DMatrixHalfRAM : public DMatrixPageBase<0xffffab03> {
- public:
-  DMatrixHalfRAM(void) {
-    fmat_ = new FMatrixS(iter_, this->info);
-  }
-  virtual ~DMatrixHalfRAM(void) {
-    delete fmat_;
-  }
-  virtual IFMatrix *fmat(void) const {
-    return fmat_;
-  }
-  virtual void set_cache_file(const std::string &cache_file) {
-  }
-  virtual void CheckMagic(int tmagic) {
-    utils::Check(tmagic == DMatrixPageBase<0xffffab02>::kMagic ||
-                 tmagic == DMatrixPageBase<0xffffab03>::kMagic,
-                 "invalid format,magic number mismatch");
-  }
-  /*! \brief the real fmatrix */
-  IFMatrix *fmat_;
-};
-}  // namespace io
-}  // namespace xgboost
-#endif  // XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
--- a/src/io/page_fmatrix-inl.hpp
+++ b/src/io/page_fmatrix-inl.hpp
@@ -1,360 +0,0 @@
-/*!
- * Copyright (c) 2014 by Contributors
- * \file page_fmatrix-inl.hpp
- *   col iterator based on sparse page
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
-#define XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
-
-#include <vector>
-#include <string>
-#include <algorithm>
-
-namespace xgboost {
-namespace io {
-/*! \brief thread buffer iterator */
-class ThreadColPageIterator: public utils::IIterator<ColBatch> {
- public:
-  ThreadColPageIterator(void) {
-    itr.SetParam("buffer_size", "2");
-    page_ = NULL;
-  }
-  virtual ~ThreadColPageIterator(void) {}
-  virtual void Init(void) {}
-  virtual void BeforeFirst(void) {
-    itr.BeforeFirst();
-  }
-  virtual bool Next(void) {
-    if (!itr.Next(page_)) return false;
-    out_.col_index = BeginPtr(itr.get_factory().index_set());
-    col_data_.resize(page_->offset.size() - 1, SparseBatch::Inst(NULL, 0));
-    for (size_t i = 0; i < col_data_.size(); ++i) {
-      col_data_[i] = SparseBatch::Inst
-          (BeginPtr(page_->data) + page_->offset[i],
-           static_cast<bst_uint>(page_->offset[i + 1] - page_->offset[i]));
-    }
-    out_.col_data = BeginPtr(col_data_);
-    out_.size = col_data_.size();
-    return true;
-  }
-  virtual const ColBatch &Value(void) const {
-    return out_;
-  }
-  /*! \brief load and initialize the iterator with fi */
-  inline void SetFile(const utils::FileStream &fi) {
-    itr.get_factory().SetFile(fi);
-    itr.Init();
-  }
-  // set index set
-  inline void SetIndexSet(const std::vector<bst_uint> &fset, bool load_all) {
-    itr.get_factory().SetIndexSet(fset, load_all);
-  }
-
- private:
-  // output data
-  ColBatch out_;
-  SparsePage *page_;
-  std::vector<SparseBatch::Inst> col_data_;
-  utils::ThreadBuffer<SparsePage*, SparsePageFactory> itr;
-};
-
-struct ColConvertFactory {
-  inline bool Init(void) {
-    return true;
-  }
-  inline void Setup(float pkeep,
-                    size_t max_row_perbatch,
-                    size_t num_col,
-                    utils::IIterator<RowBatch> *iter,
-                    std::vector<bst_uint> *buffered_rowset,
-                    const std::vector<bool> *enabled) {
-    pkeep_ = pkeep;
-    max_row_perbatch_ = max_row_perbatch;
-    num_col_ = num_col;
-    iter_ = iter;
-    buffered_rowset_ = buffered_rowset;
-    enabled_ = enabled;
-  }
-  inline SparsePage *Create(void) {
-    return new SparsePage();
-  }
-  inline void FreeSpace(SparsePage *a) {
-    delete a;
-  }
-  inline void SetParam(const char *name, const char *val) {}
-  inline bool LoadNext(SparsePage *val) {
-    tmp_.Clear();
-    size_t btop = buffered_rowset_->size();
-    while (iter_->Next()) {
-      const RowBatch &batch = iter_->Value();
-      for (size_t i = 0; i < batch.size; ++i) {
-        bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
-        if (pkeep_ == 1.0f || random::SampleBinary(pkeep_)) {
-          buffered_rowset_->push_back(ridx);
-          tmp_.Push(batch[i]);
-        }
-      }
-      if (tmp_.MemCostBytes() >= kPageSize ||
-          tmp_.Size() >= max_row_perbatch_) {
-        this->MakeColPage(tmp_, BeginPtr(*buffered_rowset_) + btop,
-                          *enabled_, val);
-        return true;
-      }
-    }
-    if (tmp_.Size() != 0) {
-        this->MakeColPage(tmp_, BeginPtr(*buffered_rowset_) + btop,
-                          *enabled_, val);
-        return true;
-    } else {
-      return false;
-    }
-  }
-  inline void Destroy(void) {}
-  inline void BeforeFirst(void) {}
-  inline void MakeColPage(const SparsePage &prow,
-                          const bst_uint *ridx,
-                          const std::vector<bool> &enabled,
-                          SparsePage *pcol) {
-    pcol->Clear();
-    int nthread;
-    #pragma omp parallel
-    {
-      nthread = omp_get_num_threads();
-      int max_nthread = std::max(omp_get_num_procs() / 2 - 4, 1);
-      if (nthread > max_nthread) {
-        nthread = max_nthread;
-      }
-    }
-    pcol->Clear();
-    utils::ParallelGroupBuilder<SparseBatch::Entry>
-        builder(&pcol->offset, &pcol->data);
-    builder.InitBudget(num_col_, nthread);
-    bst_omp_uint ndata = static_cast<bst_uint>(prow.Size());
-    #pragma omp parallel for schedule(static) num_threads(nthread)
-    for (bst_omp_uint i = 0; i < ndata; ++i) {
-      int tid = omp_get_thread_num();
-      for (size_t j = prow.offset[i]; j < prow.offset[i+1]; ++j) {
-        const SparseBatch::Entry &e = prow.data[j];
-        if (enabled[e.index]) {
-          builder.AddBudget(e.index, tid);
-        }
-      }
-    }
-    builder.InitStorage();
-    #pragma omp parallel for schedule(static) num_threads(nthread)
-    for (bst_omp_uint i = 0; i < ndata; ++i) {
-      int tid = omp_get_thread_num();
-      for (size_t j = prow.offset[i]; j < prow.offset[i+1]; ++j) {
-        const SparseBatch::Entry &e = prow.data[j];
-        builder.Push(e.index,
-                     SparseBatch::Entry(ridx[i], e.fvalue),
-                     tid);
-      }
-    }
-    utils::Assert(pcol->Size() == num_col_, "inconsistent col data");
-    // sort columns
-    bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
-    #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
-    for (bst_omp_uint i = 0; i < ncol; ++i) {
-      if (pcol->offset[i] < pcol->offset[i + 1]) {
-        std::sort(BeginPtr(pcol->data) + pcol->offset[i],
-                  BeginPtr(pcol->data) + pcol->offset[i + 1],
-                  SparseBatch::Entry::CmpValue);
-      }
-    }
-  }
-  // probability of keep
-  float pkeep_;
-  // maximum number of rows per batch
-  size_t max_row_perbatch_;
-  // number of columns
-  size_t num_col_;
-  // row batch iterator
-  utils::IIterator<RowBatch> *iter_;
-  // buffered rowset
-  std::vector<bst_uint> *buffered_rowset_;
-  // enabled marks
-  const std::vector<bool> *enabled_;
-  // internal temp cache
-  SparsePage tmp_;
-  /*! \brief page size 256 M */
-  static const size_t kPageSize = 256 << 20UL;
-};
-/*!
- * \brief sparse matrix that support column access, CSC
- */
-class FMatrixPage : public IFMatrix {
- public:
-  typedef SparseBatch::Entry Entry;
-  /*! \brief constructor */
-  FMatrixPage(utils::IIterator<RowBatch> *iter,
-              const learner::MetaInfo &info) : info(info) {
-    this->iter_ = iter;
-  }
-  // destructor
-  virtual ~FMatrixPage(void) {
-    if (iter_ != NULL) delete iter_;
-  }
-  /*! \return whether column access is enabled */
-  virtual bool HaveColAccess(void) const {
-    return col_size_.size() != 0;
-  }
-  /*! \brief get number of columns */
-  virtual size_t NumCol(void) const {
-    utils::Check(this->HaveColAccess(), "NumCol:need column access");
-    return col_size_.size();
-  }
-  /*! \brief get number of buffered rows */
-  virtual const std::vector<bst_uint> &buffered_rowset(void) const {
-    return buffered_rowset_;
-  }
-  /*! \brief get column size */
-  virtual size_t GetColSize(size_t cidx) const {
-    return col_size_[cidx];
-  }
-  /*! \brief get column density */
-  virtual float GetColDensity(size_t cidx) const {
-    size_t nmiss = num_buffered_row_ - (col_size_[cidx]);
-    return 1.0f - (static_cast<float>(nmiss)) / num_buffered_row_;
-  }
-  virtual void InitColAccess(const std::vector<bool> &enabled,
-                             float pkeep, size_t max_row_perbatch) {
-    if (this->HaveColAccess()) return;
-    if (TryLoadColData()) return;
-    this->InitColData(enabled, pkeep, max_row_perbatch);
-    utils::Check(TryLoadColData(), "failed on creating col.blob");
-  }
-  /*!
-   * \brief get the row iterator associated with FMatrix
-   */
-  virtual utils::IIterator<RowBatch>* RowIterator(void) {
-    iter_->BeforeFirst();
-    return iter_;
-  }
-  /*!
-   * \brief get the column based  iterator
-   */
-  virtual utils::IIterator<ColBatch>* ColIterator(void) {
-    size_t ncol = this->NumCol();
-    col_index_.resize(0);
-    for (size_t i = 0; i < ncol; ++i) {
-      col_index_.push_back(static_cast<bst_uint>(i));
-    }
-    col_iter_.SetIndexSet(col_index_, false);
-    col_iter_.BeforeFirst();
-    return &col_iter_;
-  }
-  /*!
-   * \brief column based iterator
-   */
-  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
-    size_t ncol = this->NumCol();
-    col_index_.resize(0);
-    for (size_t i = 0; i < fset.size(); ++i) {
-      if (fset[i] < ncol) col_index_.push_back(fset[i]);
-    }
-    col_iter_.SetIndexSet(col_index_, false);
-    col_iter_.BeforeFirst();
-    return &col_iter_;
-  }
-  // set the cache file name
-  inline void set_cache_file(const std::string &cache_file) {
-    col_data_name_ = std::string(cache_file) + ".col.blob";
-    col_meta_name_ = std::string(cache_file) + ".col.meta";
-  }
-
- protected:
-  inline bool TryLoadColData(void) {
-    std::FILE *fi = fopen64(col_meta_name_.c_str(), "rb");
-    if (fi == NULL) return false;
-    utils::FileStream fs(fi);
-    LoadMeta(&fs);
-    fs.Close();
-    fi = utils::FopenCheck(col_data_name_.c_str(), "rb");
-    if (fi == NULL) return false;
-    col_iter_.SetFile(utils::FileStream(fi));
-    return true;
-  }
-  inline void LoadMeta(utils::IStream *fi) {
-    utils::Check(fi->Read(&num_buffered_row_, sizeof(num_buffered_row_)) != 0,
-                 "invalid col.blob file");
-    utils::Check(fi->Read(&buffered_rowset_),
-                 "invalid col.blob file");
-    utils::Check(fi->Read(&col_size_),
-                 "invalid col.blob file");
-  }
-  inline void SaveMeta(utils::IStream *fo) {
-    fo->Write(&num_buffered_row_, sizeof(num_buffered_row_));
-    fo->Write(buffered_rowset_);
-    fo->Write(col_size_);
-  }
-  /*!
-   * \brief initialize column data
-   * \param enabled the list of enabled columns
-   * \param pkeep probability to keep a row
-   * \param max_row_perbatch maximum row per batch
-   */
-  inline void InitColData(const std::vector<bool> &enabled,
-                          float pkeep, size_t max_row_perbatch) {
-    // clear rowset
-    buffered_rowset_.clear();
-    col_size_.resize(info.num_col());
-    std::fill(col_size_.begin(), col_size_.end(), 0);
-    utils::FileStream fo;
-    fo = utils::FileStream(utils::FopenCheck(col_data_name_.c_str(), "wb"));
-    iter_->BeforeFirst();
-    double tstart = rabit::utils::GetTime();
-    size_t bytes_write = 0;
-    utils::ThreadBuffer<SparsePage*, ColConvertFactory> citer;
-    citer.SetParam("buffer_size", "2");
-    citer.get_factory().Setup(pkeep, max_row_perbatch, info.num_col(),
-                              iter_, &buffered_rowset_, &enabled);
-    citer.Init();
-    SparsePage *pcol;
-    while (citer.Next(pcol)) {
-      for (size_t i = 0; i < pcol->Size(); ++i) {
-        col_size_[i] += pcol->offset[i + 1] - pcol->offset[i];
-      }
-      pcol->Save(&fo);
-      size_t spage = pcol->MemCostBytes();
-      bytes_write += spage;
-      double tnow = rabit::utils::GetTime();
-      double tdiff = tnow - tstart;
-      utils::Printf("Writing to %s in %g MB/s, %lu MB written\n",
-                    col_data_name_.c_str(),
-                    (bytes_write >> 20UL) / tdiff,
-                    (bytes_write >> 20UL));
-    }
-    fo.Close();
-    num_buffered_row_ = buffered_rowset_.size();
-    fo = utils::FileStream(utils::FopenCheck(col_meta_name_.c_str(), "wb"));
-    this->SaveMeta(&fo);
-    fo.Close();
-  }
-
- private:
-  /*! \brief page size 256 M */
-  static const size_t kPageSize = 256 << 20UL;
-  // shared meta info with DMatrix
-  const learner::MetaInfo &info;
-  // row iterator
-  utils::IIterator<RowBatch> *iter_;
-  /*! \brief column based data file name */
-  std::string col_data_name_;
-  /*! \brief column based data file name */
-  std::string col_meta_name_;
-  /*! \brief list of row index that are buffered */
-  std::vector<bst_uint> buffered_rowset_;
-  // number of buffered rows
-  size_t num_buffered_row_;
-  // count for column data
-  std::vector<size_t> col_size_;
-  // internal column index for output
-  std::vector<bst_uint> col_index_;
-  // internal thread backed col iterator
-  ThreadColPageIterator col_iter_;
-};
-}  // namespace io
-}  // namespace xgboost
-#endif  // XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
--- a/src/io/simple_dmatrix-inl.hpp
+++ b/src/io/simple_dmatrix-inl.hpp
@@ -1,324 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file simple_dmatrix-inl.hpp
- * \brief simple implementation of DMatrixS that can be used
- *  the data format of xgboost is templatized, which means it can accept
- *  any data structure that implements the function defined by FMatrix
- *  this file is a specific implementation of input data structure that can be used by BoostLearner
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_
-#define XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_
-
-#include <string>
-#include <cstring>
-#include <vector>
-#include <sstream>
-#include <algorithm>
-#include "../data.h"
-#include "../utils/utils.h"
-#include "../learner/dmatrix.h"
-#include "./io.h"
-#include "./simple_fmatrix-inl.hpp"
-#include "../sync/sync.h"
-#include "./libsvm_parser.h"
-
-namespace xgboost {
-namespace io {
-/*! \brief implementation of DataMatrix, in CSR format */
-class DMatrixSimple : public DataMatrix {
- public:
-  // constructor
-  DMatrixSimple(void) : DataMatrix(kMagic) {
-    fmat_ = new FMatrixS(new OneBatchIter(this), this->info);
-    this->Clear();
-  }
-  // virtual destructor
-  virtual ~DMatrixSimple(void) {
-    delete fmat_;
-  }
-  virtual IFMatrix *fmat(void) const {
-    return fmat_;
-  }
-  /*! \brief clear the storage */
-  inline void Clear(void) {
-    row_ptr_.clear();
-    row_ptr_.push_back(0);
-    row_data_.clear();
-    info.Clear();
-  }
-  /*! \brief copy content data from source matrix */
-  inline void CopyFrom(const DataMatrix &src) {
-    this->Clear();
-    this->info = src.info;
-    // clone data contents from src matrix
-    utils::IIterator<RowBatch> *iter = src.fmat()->RowIterator();
-    iter->BeforeFirst();
-    while (iter->Next()) {
-      const RowBatch &batch = iter->Value();
-      for (size_t i = 0; i < batch.size; ++i) {
-        RowBatch::Inst inst = batch[i];
-        row_data_.resize(row_data_.size() + inst.length);
-        if (inst.length != 0) {
-          std::memcpy(&row_data_[row_ptr_.back()], inst.data,
-                      sizeof(RowBatch::Entry) * inst.length);
-        }
-        row_ptr_.push_back(row_ptr_.back() + inst.length);
-      }
-    }
-  }
-  /*!
-   * \brief add a row to the matrix
-   * \param feats features
-   * \return the index of added row
-   */
-  inline size_t AddRow(const std::vector<RowBatch::Entry> &feats) {
-    for (size_t i = 0; i < feats.size(); ++i) {
-      row_data_.push_back(feats[i]);
-      info.info.num_col = std::max(info.info.num_col,
-                                   static_cast<size_t>(feats[i].index+1));
-    }
-    row_ptr_.push_back(row_ptr_.back() + feats.size());
-    info.info.num_row += 1;
-    return row_ptr_.size() - 2;
-  }
-  /*!
-   * \brief load split of input, used in distributed mode
-   * \param uri the uri of input
-   * \param loadsplit whether loadsplit of data or all the data
-   * \param silent whether print information or not
-   */
-  inline void LoadText(const char *uri, bool silent = false, bool loadsplit = false) {
-    int rank = 0, npart = 1;
-    if (loadsplit) {
-      rank = rabit::GetRank();
-      npart = rabit::GetWorldSize();
-    }
-    LibSVMParser parser(
-        dmlc::InputSplit::Create(uri, rank, npart, "text"), 16);
-    this->Clear();
-    while (parser.Next()) {
-      const LibSVMPage &batch = parser.Value();
-      size_t nlabel = info.labels.size();
-      info.labels.resize(nlabel + batch.label.size());
-      if (batch.label.size() != 0) {
-        std::memcpy(BeginPtr(info.labels) + nlabel,
-                    BeginPtr(batch.label),
-                    batch.label.size() * sizeof(float));
-      }
-      size_t ndata = row_data_.size();
-      row_data_.resize(ndata + batch.data.size());
-      if (batch.data.size() != 0) {
-        std::memcpy(BeginPtr(row_data_) + ndata,
-                    BeginPtr(batch.data),
-                    batch.data.size() * sizeof(RowBatch::Entry));
-      }
-      row_ptr_.resize(row_ptr_.size() + batch.label.size());
-      for (size_t i = 0; i < batch.label.size(); ++i) {
-        row_ptr_[nlabel + i + 1] = row_ptr_[nlabel] + batch.offset[i + 1];
-      }
-      info.info.num_row += batch.Size();
-      for (size_t i = 0; i < batch.data.size(); ++i) {
-        info.info.num_col = std::max(info.info.num_col,
-                                     static_cast<size_t>(batch.data[i].index+1));
-      }
-    }
-    if (!silent) {
-      utils::Printf("%lux%lu matrix with %lu entries is loaded from %s\n",
-                    static_cast<unsigned long>(info.num_row()),  // NOLINT(*)
-                    static_cast<unsigned long>(info.num_col()),  // NOLINT(*)
-                    static_cast<unsigned long>(row_data_.size()), uri);  // NOLINT(*)
-    }
-    // try to load in additional file
-    if (!loadsplit) {
-      std::string name = uri;
-      std::string gname = name + ".group";
-      if (info.TryLoadGroup(gname.c_str(), silent)) {
-        utils::Check(info.group_ptr.back() == info.num_row(),
-                     "DMatrix: group data does not match the number of rows in features");
-      }
-      std::string wname = name + ".weight";
-      if (info.TryLoadFloatInfo("weight", wname.c_str(), silent)) {
-        utils::Check(info.weights.size() == info.num_row(),
-                     "DMatrix: weight data does not match the number of rows in features");
-      }
-      std::string mname = name + ".base_margin";
-      if (info.TryLoadFloatInfo("base_margin", mname.c_str(), silent)) {
-      }
-    }
-  }
-  /*!
-   * \brief load from binary file
-   * \param fname name of binary data
-   * \param silent whether print information or not
-   * \return whether loading is success
-   */
-  inline bool LoadBinary(const char* fname, bool silent = false) {
-    std::FILE *fp = fopen64(fname, "rb");
-    if (fp == NULL) return false;
-    utils::FileStream fs(fp);
-    this->LoadBinary(fs, silent, fname);
-    fs.Close();
-    return true;
-  }
-  /*!
-   * \brief load from binary stream
-   * \param fs input file stream
-   * \param silent whether print information during loading
-   * \param fname file name, used to print message
-   */
-  inline void LoadBinary(utils::IStream &fs, bool silent = false, const char *fname = NULL) {  // NOLINT(*)
-    int tmagic;
-    utils::Check(fs.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
-    utils::Check(tmagic == kMagic, "\"%s\" invalid format, magic number mismatch",
-                 fname == NULL ? "" : fname);
-
-    info.LoadBinary(fs);
-    LoadBinary(fs, &row_ptr_, &row_data_);
-    fmat_->LoadColAccess(fs);
-
-    if (!silent) {
-      utils::Printf("%lux%lu matrix with %lu entries is loaded",
-                    static_cast<unsigned long>(info.num_row()),  // NOLINT(*)
-                    static_cast<unsigned long>(info.num_col()),  // NOLINT(*)
-                    static_cast<unsigned long>(row_data_.size()));  // NOLINT(*)
-      if (fname != NULL) {
-        utils::Printf(" from %s\n", fname);
-      } else {
-        utils::Printf("\n");
-      }
-      if (info.group_ptr.size() != 0) {
-        utils::Printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1);
-      }
-    }
-  }
-  /*!
-   * \brief save to binary file
-   * \param fname name of binary data
-   * \param silent whether print information or not
-   */
-  inline void SaveBinary(const char* fname, bool silent = false) const {
-    utils::FileStream fs(utils::FopenCheck(fname, "wb"));
-    int tmagic = kMagic;
-    fs.Write(&tmagic, sizeof(tmagic));
-    info.SaveBinary(fs);
-    SaveBinary(fs, row_ptr_, row_data_);
-    fmat_->SaveColAccess(fs);
-    fs.Close();
-
-    if (!silent) {
-      utils::Printf("%lux%lu matrix with %lu entries is saved to %s\n",
-                    static_cast<unsigned long>(info.num_row()),  // NOLINT(*)
-                    static_cast<unsigned long>(info.num_col()),  // NOLINT(*)
-                    static_cast<unsigned long>(row_data_.size()), fname);  // NOLINT(*)
-      if (info.group_ptr.size() != 0) {
-        utils::Printf("data contains %u groups\n",
-                      static_cast<unsigned>(info.group_ptr.size()-1));
-      }
-    }
-  }
-  /*!
-   * \brief cache load data given a file name, if filename ends with .buffer, direct load binary
-   *        otherwise the function will first check if fname + '.buffer' exists,
-   *        if binary buffer exists, it will reads from binary buffer, otherwise, it will load from text file,
-   *        and try to create a buffer file
-   * \param fname name of binary data
-   * \param silent whether print information or not
-   * \param savebuffer whether do save binary buffer if it is text
-   */
-  inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true) {
-    using namespace std;
-    size_t len = strlen(fname);
-    if (len > 8 && !strcmp(fname + len - 7, ".buffer")) {
-      if (!this->LoadBinary(fname, silent)) {
-        utils::Error("can not open file \"%s\"", fname);
-      }
-      return;
-    }
-    char bname[1024];
-    utils::SPrintf(bname, sizeof(bname), "%s.buffer", fname);
-    if (!this->LoadBinary(bname, silent)) {
-      this->LoadText(fname, silent);
-      if (savebuffer) this->SaveBinary(bname, silent);
-    }
-  }
-  // data fields
-  /*! \brief row pointer of CSR sparse storage */
-  std::vector<size_t> row_ptr_;
-  /*! \brief data in the row */
-  std::vector<RowBatch::Entry> row_data_;
-  /*! \brief the real fmatrix */
-  FMatrixS *fmat_;
-  /*! \brief magic number used to identify DMatrix */
-  static const int kMagic = 0xffffab01;
-
- protected:
-  /*!
-   * \brief save data to binary stream
-   * \param fo output stream
-   * \param ptr pointer data
-   * \param data data content
-   */
-  inline static void SaveBinary(utils::IStream &fo,  // NOLINT(*)
-                                const std::vector<size_t> &ptr,
-                                const std::vector<RowBatch::Entry> &data) {
-    size_t nrow = ptr.size() - 1;
-    fo.Write(&nrow, sizeof(size_t));
-    fo.Write(BeginPtr(ptr), ptr.size() * sizeof(size_t));
-    if (data.size() != 0) {
-      fo.Write(BeginPtr(data), data.size() * sizeof(RowBatch::Entry));
-    }
-  }
-  /*!
-   * \brief load data from binary stream
-   * \param fi input stream
-   * \param out_ptr pointer data
-   * \param out_data data content
-   */
-  inline static void LoadBinary(utils::IStream &fi,  // NOLINT(*)
-                                std::vector<size_t> *out_ptr,
-                                std::vector<RowBatch::Entry> *out_data) {
-    size_t nrow;
-    utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format");
-    out_ptr->resize(nrow + 1);
-    utils::Check(fi.Read(BeginPtr(*out_ptr), out_ptr->size() * sizeof(size_t)) != 0,
-                  "invalid input file format");
-    out_data->resize(out_ptr->back());
-    if (out_data->size() != 0) {
-      utils::Assert(fi.Read(BeginPtr(*out_data), out_data->size() * sizeof(RowBatch::Entry)) != 0,
-                    "invalid input file format");
-    }
-  }
-  // one batch iterator that return content in the matrix
-  struct OneBatchIter: utils::IIterator<RowBatch> {
-    explicit OneBatchIter(DMatrixSimple *parent)
-        : at_first_(true), parent_(parent) {}
-    virtual ~OneBatchIter(void) {}
-    virtual void BeforeFirst(void) {
-      at_first_ = true;
-    }
-    virtual bool Next(void) {
-      if (!at_first_) return false;
-      at_first_ = false;
-      batch_.size = parent_->row_ptr_.size() - 1;
-      batch_.base_rowid = 0;
-      batch_.ind_ptr = BeginPtr(parent_->row_ptr_);
-      batch_.data_ptr = BeginPtr(parent_->row_data_);
-      return true;
-    }
-    virtual const RowBatch &Value(void) const {
-      return batch_;
-    }
-
-   private:
-    // whether is at first
-    bool at_first_;
-    // pointer to parent
-    DMatrixSimple *parent_;
-    // temporal space for batch
-    RowBatch batch_;
-  };
-};
-}  // namespace io
-}  // namespace xgboost
-#endif  // namespace XGBOOST_IO_SIMPLE_DMATRIX_INL_HPP_
--- a/src/io/simple_fmatrix-inl.hpp
+++ b/src/io/simple_fmatrix-inl.hpp
@@ -1,374 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file simple_fmatrix-inl.hpp
- * \brief the input data structure for gradient boosting
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_
-#define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_
-
-#include <limits>
-#include <algorithm>
-#include <vector>
-#include "../data.h"
-#include "../utils/utils.h"
-#include "../utils/random.h"
-#include "../utils/omp.h"
-#include "../learner/dmatrix.h"
-#include "../utils/group_data.h"
-#include "./sparse_batch_page.h"
-
-namespace xgboost {
-namespace io {
-/*!
- * \brief sparse matrix that support column access, CSC
- */
-class FMatrixS : public IFMatrix {
- public:
-  typedef SparseBatch::Entry Entry;
-  /*! \brief constructor */
-  FMatrixS(utils::IIterator<RowBatch> *iter,
-               const learner::MetaInfo &info)
-      : info_(info) {
-    this->iter_ = iter;
-  }
-  // destructor
-  virtual ~FMatrixS(void) {
-    if (iter_ != NULL) delete iter_;
-  }
-  /*! \return whether column access is enabled */
-  virtual bool HaveColAccess(void) const {
-    return col_size_.size() != 0;
-  }
-  /*! \brief get number of columns */
-  virtual size_t NumCol(void) const {
-    utils::Check(this->HaveColAccess(), "NumCol:need column access");
-    return col_size_.size();
-  }
-  /*! \brief get number of buffered rows */
-  virtual const std::vector<bst_uint> &buffered_rowset(void) const {
-    return buffered_rowset_;
-  }
-  /*! \brief get column size */
-  virtual size_t GetColSize(size_t cidx) const {
-    return col_size_[cidx];
-  }
-  /*! \brief get column density */
-  virtual float GetColDensity(size_t cidx) const {
-    size_t nmiss = buffered_rowset_.size() - col_size_[cidx];
-    return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
-  }
-  virtual void InitColAccess(const std::vector<bool> &enabled,
-                             float pkeep, size_t max_row_perbatch) {
-    if (this->HaveColAccess()) return;
-    this->InitColData(enabled, pkeep, max_row_perbatch);
-  }
-  /*!
-   * \brief get the row iterator associated with FMatrix
-   */
-  virtual utils::IIterator<RowBatch>* RowIterator(void) {
-    iter_->BeforeFirst();
-    return iter_;
-  }
-  /*!
-   * \brief get the column based  iterator
-   */
-  virtual utils::IIterator<ColBatch>* ColIterator(void) {
-    size_t ncol = this->NumCol();
-    col_iter_.col_index_.resize(ncol);
-    for (size_t i = 0; i < ncol; ++i) {
-      col_iter_.col_index_[i] = static_cast<bst_uint>(i);
-    }
-    col_iter_.BeforeFirst();
-    return &col_iter_;
-  }
-  /*!
-   * \brief column based iterator
-   */
-  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
-    size_t ncol = this->NumCol();
-    col_iter_.col_index_.resize(0);
-    for (size_t i = 0; i < fset.size(); ++i) {
-      if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]);
-    }
-    col_iter_.BeforeFirst();
-    return &col_iter_;
-  }
-  /*!
-   * \brief save column access data into stream
-   * \param fo output stream to save to
-   */
-  inline void SaveColAccess(utils::IStream &fo) const { // NOLINT(*)
-    size_t n = 0;
-    fo.Write(&n, sizeof(n));
-  }
-  /*!
-   * \brief load column access data from stream
-   * \param fo output stream to load from
-   */
-  inline void LoadColAccess(utils::IStream &fi) { // NOLINT(*)
-    // do nothing in load col access
-  }
-
- protected:
-  /*!
-   * \brief initialize column data
-   * \param enabled the list of enabled columns
-   * \param pkeep probability to keep a row
-   * \param max_row_perbatch maximum row per batch
-   */
-  inline void InitColData(const std::vector<bool> &enabled,
-                          float pkeep, size_t max_row_perbatch) {
-    col_iter_.Clear();
-    if (info_.num_row() < max_row_perbatch) {
-      SparsePage *page = new SparsePage();
-      this->MakeOneBatch(enabled, pkeep, page);
-      col_iter_.cpages_.push_back(page);
-    } else {
-      this->MakeManyBatch(enabled, pkeep, max_row_perbatch);
-    }
-    // setup col-size
-    col_size_.resize(info_.num_col());
-    std::fill(col_size_.begin(), col_size_.end(), 0);
-    for (size_t i = 0; i < col_iter_.cpages_.size(); ++i) {
-      SparsePage *pcol = col_iter_.cpages_[i];
-      for (size_t j = 0; j < pcol->Size(); ++j) {
-        col_size_[j] += pcol->offset[j + 1] - pcol->offset[j];
-      }
-    }
-  }
-  /*!
-   * \brief make column page from iterator
-   * \param pkeep probability to keep a row
-   * \param pcol the target column
-   */
-  inline void MakeOneBatch(const std::vector<bool> &enabled,
-                           float pkeep,
-                           SparsePage *pcol) {
-    // clear rowset
-    buffered_rowset_.clear();
-    // bit map
-    int nthread;
-    std::vector<bool> bmap;
-    #pragma omp parallel
-    {
-      nthread = omp_get_num_threads();
-    }
-    pcol->Clear();
-    utils::ParallelGroupBuilder<SparseBatch::Entry>
-        builder(&pcol->offset, &pcol->data);
-    builder.InitBudget(info_.num_col(), nthread);
-    // start working
-    iter_->BeforeFirst();
-    while (iter_->Next()) {
-      const RowBatch &batch = iter_->Value();
-      bmap.resize(bmap.size() + batch.size, true);
-      long batch_size = static_cast<long>(batch.size); // NOLINT(*)
-      for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
-        bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
-        if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
-          buffered_rowset_.push_back(ridx);
-        } else {
-          bmap[i] = false;
-        }
-      }
-      #pragma omp parallel for schedule(static)
-      for (long i = 0; i < batch_size; ++i) { // NOLINT(*)
-        int tid = omp_get_thread_num();
-        bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
-        if (bmap[ridx]) {
-          RowBatch::Inst inst = batch[i];
-          for (bst_uint j = 0; j < inst.length; ++j) {
-            if (enabled[inst[j].index]) {
-              builder.AddBudget(inst[j].index, tid);
-            }
-          }
-        }
-      }
-    }
-    builder.InitStorage();
-
-    iter_->BeforeFirst();
-    while (iter_->Next()) {
-      const RowBatch &batch = iter_->Value();
-      #pragma omp parallel for schedule(static)
-      for (long i = 0; i < static_cast<long>(batch.size); ++i) { // NOLINT(*)
-        int tid = omp_get_thread_num();
-        bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
-        if (bmap[ridx]) {
-          RowBatch::Inst inst = batch[i];
-          for (bst_uint j = 0; j < inst.length; ++j) {
-            if (enabled[inst[j].index]) {
-              builder.Push(inst[j].index,
-                           Entry((bst_uint)(batch.base_rowid+i),
-                                 inst[j].fvalue), tid);
-            }
-          }
-        }
-      }
-    }
-
-    utils::Assert(pcol->Size() == info_.num_col(),
-                  "inconsistent col data");
-    // sort columns
-    bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
-    #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
-    for (bst_omp_uint i = 0; i < ncol; ++i) {
-      if (pcol->offset[i] < pcol->offset[i + 1]) {
-        std::sort(BeginPtr(pcol->data) + pcol->offset[i],
-                  BeginPtr(pcol->data) + pcol->offset[i + 1],
-                  SparseBatch::Entry::CmpValue);
-      }
-    }
-  }
-
-  inline void MakeManyBatch(const std::vector<bool> &enabled,
-                            float pkeep, size_t max_row_perbatch) {
-    size_t btop = 0;
-    buffered_rowset_.clear();
-    // internal temp cache
-    SparsePage tmp; tmp.Clear();
-    iter_->BeforeFirst();
-    while (iter_->Next()) {
-      const RowBatch &batch = iter_->Value();
-      for (size_t i = 0; i < batch.size; ++i) {
-        bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
-        if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
-          buffered_rowset_.push_back(ridx);
-          tmp.Push(batch[i]);
-        }
-        if (tmp.Size() >= max_row_perbatch) {
-          SparsePage *page = new SparsePage();
-          this->MakeColPage(tmp.GetRowBatch(0),
-                            BeginPtr(buffered_rowset_) + btop,
-                            enabled, page);
-          col_iter_.cpages_.push_back(page);
-          btop = buffered_rowset_.size();
-          tmp.Clear();
-        }
-      }
-    }
-    if (tmp.Size() != 0) {
-      SparsePage *page = new SparsePage();
-      this->MakeColPage(tmp.GetRowBatch(0),
-                        BeginPtr(buffered_rowset_) + btop,
-                        enabled, page);
-      col_iter_.cpages_.push_back(page);
-    }
-  }
-  // make column page from subset of rowbatchs
-  inline void MakeColPage(const RowBatch &batch,
-                          const bst_uint *ridx,
-                          const std::vector<bool> &enabled,
-                          SparsePage *pcol) {
-    int nthread;
-    #pragma omp parallel
-    {
-      nthread = omp_get_num_threads();
-      int max_nthread = std::max(omp_get_num_procs() / 2 - 2, 1);
-      if (nthread > max_nthread) {
-        nthread = max_nthread;
-      }
-    }
-    pcol->Clear();
-    utils::ParallelGroupBuilder<SparseBatch::Entry>
-        builder(&pcol->offset, &pcol->data);
-    builder.InitBudget(info_.num_col(), nthread);
-    bst_omp_uint ndata = static_cast<bst_uint>(batch.size);
-    #pragma omp parallel for schedule(static) num_threads(nthread)
-    for (bst_omp_uint i = 0; i < ndata; ++i) {
-      int tid = omp_get_thread_num();
-      RowBatch::Inst inst = batch[i];
-      for (bst_uint j = 0; j < inst.length; ++j) {
-        const SparseBatch::Entry &e = inst[j];
-        if (enabled[e.index]) {
-          builder.AddBudget(e.index, tid);
-        }
-      }
-    }
-    builder.InitStorage();
-    #pragma omp parallel for schedule(static) num_threads(nthread)
-    for (bst_omp_uint i = 0; i < ndata; ++i) {
-      int tid = omp_get_thread_num();
-      RowBatch::Inst inst = batch[i];
-      for (bst_uint j = 0; j < inst.length; ++j) {
-        const SparseBatch::Entry &e = inst[j];
-        builder.Push(e.index,
-                     SparseBatch::Entry(ridx[i], e.fvalue),
-                     tid);
-      }
-    }
-    utils::Assert(pcol->Size() == info_.num_col(), "inconsistent col data");
-    // sort columns
-    bst_omp_uint ncol = static_cast<bst_omp_uint>(pcol->Size());
-    #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread)
-    for (bst_omp_uint i = 0; i < ncol; ++i) {
-      if (pcol->offset[i] < pcol->offset[i + 1]) {
-        std::sort(BeginPtr(pcol->data) + pcol->offset[i],
-                  BeginPtr(pcol->data) + pcol->offset[i + 1],
-                  SparseBatch::Entry::CmpValue);
-      }
-    }
-  }
-
- private:
-  // one batch iterator that return content in the matrix
-  struct ColBatchIter: utils::IIterator<ColBatch> {
-    ColBatchIter(void) : data_ptr_(0) {}
-    virtual ~ColBatchIter(void) {
-      this->Clear();
-    }
-    virtual void BeforeFirst(void) {
-      data_ptr_ = 0;
-    }
-    virtual bool Next(void) {
-      if (data_ptr_ >= cpages_.size()) return false;
-      data_ptr_ += 1;
-      SparsePage *pcol = cpages_[data_ptr_ - 1];
-      batch_.size = col_index_.size();
-      col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL, 0));
-      for (size_t i = 0; i < col_data_.size(); ++i) {
-        const bst_uint ridx = col_index_[i];
-        col_data_[i] = SparseBatch::Inst
-            (BeginPtr(pcol->data) + pcol->offset[ridx],
-             static_cast<bst_uint>(pcol->offset[ridx + 1] - pcol->offset[ridx]));
-      }
-      batch_.col_index = BeginPtr(col_index_);
-      batch_.col_data = BeginPtr(col_data_);
-      return true;
-    }
-    virtual const ColBatch &Value(void) const {
-      return batch_;
-    }
-    inline void Clear(void) {
-      for (size_t i = 0; i < cpages_.size(); ++i) {
-        delete cpages_[i];
-      }
-      cpages_.clear();
-    }
-    // data content
-    std::vector<bst_uint> col_index_;
-    // column content
-    std::vector<ColBatch::Inst> col_data_;
-    // column sparse pages
-    std::vector<SparsePage*> cpages_;
-    // data pointer
-    size_t data_ptr_;
-    // temporal space for batch
-    ColBatch batch_;
-  };
-  // --- data structure used to support InitColAccess --
-  // column iterator
-  ColBatchIter col_iter_;
-  // shared meta info with DMatrix
-  const learner::MetaInfo &info_;
-  // row iterator
-  utils::IIterator<RowBatch> *iter_;
-  /*! \brief list of row index that are buffered */
-  std::vector<bst_uint> buffered_rowset_;
-  // count for column data
-  std::vector<size_t> col_size_;
-};
-}  // namespace io
-}  // namespace xgboost
-#endif  // XGBOOST_IO_SLICE_FMATRIX_INL_HPP_
--- a/src/io/sparse_batch_page.h
+++ b/src/io/sparse_batch_page.h
@@ -1,272 +0,0 @@
-/*!
- * Copyright (c) 2014 by Contributors
- * \file sparse_batch_page.h
- *   content holder of sparse batch that can be saved to disk
- *   the representation can be effectively
- *   use in external memory computation
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_IO_SPARSE_BATCH_PAGE_H_
-#define XGBOOST_IO_SPARSE_BATCH_PAGE_H_
-
-#include <vector>
-#include <algorithm>
-#include "../data.h"
-
-namespace xgboost {
-namespace io {
-/*!
- * \brief storage unit of sparse batch
- */
-class SparsePage {
- public:
-  /*! \brief offset of the segments */
-  std::vector<size_t> offset;
-  /*! \brief the data of the segments */
-  std::vector<SparseBatch::Entry> data;
-  /*! \brief constructor */
-  SparsePage() {
-    this->Clear();
-  }
-  /*! \return number of instance in the page */
-  inline size_t Size() const {
-    return offset.size() - 1;
-  }
-  /*!
-   * \brief load only the segments we are interested in
-   * \param fi the input stream of the file
-   * \param sorted_index_set sorted index of segments we are interested in
-   * \return true of the loading as successful, false if end of file was reached
-   */
-  inline bool Load(utils::ISeekStream *fi,
-                   const std::vector<bst_uint> &sorted_index_set) {
-    if (!fi->Read(&disk_offset_)) return false;
-    // setup the offset
-    offset.clear(); offset.push_back(0);
-    for (size_t i = 0; i < sorted_index_set.size(); ++i) {
-      bst_uint fid = sorted_index_set[i];
-      utils::Check(fid + 1 < disk_offset_.size(), "bad col.blob format");
-      size_t size = disk_offset_[fid + 1] - disk_offset_[fid];
-      offset.push_back(offset.back() + size);
-    }
-    data.resize(offset.back());
-    // read in the data
-    size_t begin = fi->Tell();
-    size_t curr_offset = 0;
-    for (size_t i = 0; i < sorted_index_set.size();) {
-      bst_uint fid = sorted_index_set[i];
-      if (disk_offset_[fid] != curr_offset) {
-        utils::Assert(disk_offset_[fid] > curr_offset, "fset index was not sorted");
-        fi->Seek(begin + disk_offset_[fid] * sizeof(SparseBatch::Entry));
-        curr_offset = disk_offset_[fid];
-      }
-      size_t j, size_to_read = 0;
-      for (j = i; j < sorted_index_set.size(); ++j) {
-        if (disk_offset_[sorted_index_set[j]] == disk_offset_[fid] + size_to_read) {
-          size_to_read += offset[j + 1] - offset[j];
-        } else {
-          break;
-        }
-      }
-      if (size_to_read != 0) {
-        utils::Check(fi->Read(BeginPtr(data) + offset[i],
-                              size_to_read * sizeof(SparseBatch::Entry)) != 0,
-                     "Invalid SparsePage file");
-        curr_offset += size_to_read;
-      }
-      i = j;
-    }
-    // seek to end of record
-    if (curr_offset != disk_offset_.back()) {
-      fi->Seek(begin + disk_offset_.back() * sizeof(SparseBatch::Entry));
-    }
-    return true;
-  }
-  /*!
-   * \brief load all the segments
-   * \param fi the input stream of the file
-   * \return true of the loading as successful, false if end of file was reached
-   */
-  inline bool Load(utils::IStream *fi) {
-    if (!fi->Read(&offset)) return false;
-    utils::Check(offset.size() != 0, "Invalid SparsePage file");
-    data.resize(offset.back());
-    if (data.size() != 0) {
-      utils::Check(fi->Read(BeginPtr(data), data.size() * sizeof(SparseBatch::Entry)) != 0,
-                   "Invalid SparsePage file");
-    }
-    return true;
-  }
-  /*!
-   * \brief save the data to fo, when a page was written
-   *    to disk it must contain all the elements in the
-   * \param fo output stream
-   */
-  inline void Save(utils::IStream *fo) const {
-    utils::Assert(offset.size() != 0 && offset[0] == 0, "bad offset");
-    utils::Assert(offset.back() == data.size(), "in consistent SparsePage");
-    fo->Write(offset);
-    if (data.size() != 0) {
-      fo->Write(BeginPtr(data), data.size() * sizeof(SparseBatch::Entry));
-    }
-  }
-  /*! \return estimation of memory cost of this page */
-  inline size_t MemCostBytes(void) const {
-    return offset.size() * sizeof(size_t) + data.size() * sizeof(SparseBatch::Entry);
-  }
-  /*! \brief clear the page */
-  inline void Clear(void) {
-    offset.clear();
-    offset.push_back(0);
-    data.clear();
-  }
-  /*!
-   * \brief load all the segments and add it to existing batch
-   * \param fi the input stream of the file
-   * \return true of the loading as successful, false if end of file was reached
-   */
-  inline bool PushLoad(utils::IStream *fi) {
-    if (!fi->Read(&disk_offset_)) return false;
-    data.resize(offset.back() + disk_offset_.back());
-    if (disk_offset_.back() != 0) {
-      utils::Check(fi->Read(BeginPtr(data) + offset.back(),
-                            disk_offset_.back() * sizeof(SparseBatch::Entry)) != 0,
-                   "Invalid SparsePage file");
-    }
-    size_t top = offset.back();
-    size_t begin = offset.size();
-    offset.resize(offset.size() + disk_offset_.size());
-    for (size_t i = 0; i < disk_offset_.size(); ++i) {
-      offset[i + begin] = top + disk_offset_[i];
-    }
-    return true;
-  }
-  /*!
-   * \brief Push row batch into the page
-   * \param batch the row batch
-   */
-  inline void Push(const RowBatch &batch) {
-    data.resize(offset.back() + batch.ind_ptr[batch.size]);
-    std::memcpy(BeginPtr(data) + offset.back(),
-                batch.data_ptr + batch.ind_ptr[0],
-                sizeof(SparseBatch::Entry) * batch.ind_ptr[batch.size]);
-    size_t top = offset.back();
-    size_t begin = offset.size();
-    offset.resize(offset.size() + batch.size);
-    for (size_t i = 0; i < batch.size; ++i) {
-      offset[i + begin] = top + batch.ind_ptr[i + 1] - batch.ind_ptr[0];
-    }
-  }
-  /*!
-   * \brief Push a sparse page
-   * \param batch the row page
-   */
-  inline void Push(const SparsePage &batch) {
-    size_t top = offset.back();
-    data.resize(top + batch.data.size());
-    std::memcpy(BeginPtr(data) + top,
-                BeginPtr(batch.data),
-                sizeof(SparseBatch::Entry) * batch.data.size());
-    size_t begin = offset.size();
-    offset.resize(begin + batch.Size());
-    for (size_t i = 0; i < batch.Size(); ++i) {
-      offset[i + begin] = top + batch.offset[i + 1];
-    }
-  }
-  /*!
-   * \brief Push one instance into page
-   *  \param row an instance row
-   */
-  inline void Push(const SparseBatch::Inst &inst) {
-    offset.push_back(offset.back() + inst.length);
-    size_t begin = data.size();
-    data.resize(begin + inst.length);
-    if (inst.length != 0) {
-      std::memcpy(BeginPtr(data) + begin, inst.data,
-                  sizeof(SparseBatch::Entry) * inst.length);
-    }
-  }
-  /*!
-   * \param base_rowid base_rowid of the data
-   * \return row batch representation of the page
-   */
-  inline RowBatch GetRowBatch(size_t base_rowid) const {
-    RowBatch out;
-    out.base_rowid  = base_rowid;
-    out.ind_ptr = BeginPtr(offset);
-    out.data_ptr = BeginPtr(data);
-    out.size = offset.size() - 1;
-    return out;
-  }
-
- private:
-  /*! \brief external memory column offset */
-  std::vector<size_t> disk_offset_;
-};
-/*!
- * \brief factory class for SparsePage,
- *        used in threadbuffer template
- */
-class SparsePageFactory {
- public:
-  SparsePageFactory(void)
-      : action_load_all_(true), set_load_all_(true) {}
-  inline void SetFile(const utils::FileStream &fi,
-                      size_t file_begin = 0) {
-    fi_ = fi;
-    file_begin_ = file_begin;
-  }
-  inline const std::vector<bst_uint> &index_set(void) const {
-    return action_index_set_;
-  }
-  // set index set, will be used after next before first
-  inline void SetIndexSet(const std::vector<bst_uint> &index_set,
-                          bool load_all) {
-    set_load_all_ = load_all;
-    if (!set_load_all_) {
-      set_index_set_ = index_set;
-      std::sort(set_index_set_.begin(), set_index_set_.end());
-    }
-  }
-  inline bool Init(void) {
-    return true;
-  }
-  inline void SetParam(const char *name, const char *val) {}
-  inline bool LoadNext(SparsePage *val) {
-    if (!action_load_all_) {
-      if (action_index_set_.size() == 0) {
-        return false;
-      } else {
-        return val->Load(&fi_, action_index_set_);
-      }
-    } else {
-      return val->Load(&fi_);
-    }
-  }
-  inline SparsePage *Create(void) {
-    return new SparsePage();
-  }
-  inline void FreeSpace(SparsePage *a) {
-    delete a;
-  }
-  inline void Destroy(void) {
-    fi_.Close();
-  }
-  inline void BeforeFirst(void) {
-    fi_.Seek(file_begin_);
-    action_load_all_ = set_load_all_;
-    if (!set_load_all_) {
-      action_index_set_ = set_index_set_;
-    }
-  }
-
- private:
-  bool action_load_all_, set_load_all_;
-  size_t file_begin_;
-  utils::FileStream fi_;
-  std::vector<bst_uint> action_index_set_;
-  std::vector<bst_uint> set_index_set_;
-};
-}  // namespace io
-}  // namespace xgboost
-#endif  // XGBOOST_IO_SPARSE_BATCH_PAGE_H_
--- a/src/learner/dmatrix.h
+++ b/src/learner/dmatrix.h
@@ -1,176 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file dmatrix.h
- * \brief meta data and template data structure
- *        used for regression/classification/ranking
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_LEARNER_DMATRIX_H_
-#define XGBOOST_LEARNER_DMATRIX_H_
-
-#include <vector>
-#include <cstring>
-#include "../data.h"
-#include "../utils/io.h"
-namespace xgboost {
-namespace learner {
-/*!
- * \brief meta information needed in training, including label, weight
- */
-struct MetaInfo {
-  /*!
-   * \brief information needed by booster
-   * BoosterInfo does not implement save and load,
-   * all serialization is done in MetaInfo
-   */
-  BoosterInfo info;
-  /*! \brief label of each instance */
-  std::vector<float> labels;
-  /*!
-   * \brief the index of begin and end of a group
-   * needed when the learning task is ranking
-   */
-  std::vector<bst_uint> group_ptr;
-  /*! \brief weights of each instance, optional */
-  std::vector<float> weights;
-  /*!
-   * \brief initialized margins,
-   * if specified, xgboost will start from this initial margin
-   * can be used to specify initial prediction to boost from
-   */
-  std::vector<float> base_margin;
-  /*! \brief version flag, used to check version of this info */
-  static const int kVersion = 0;
-  // constructor
-  MetaInfo(void) {}
-  /*! \return number of rows in dataset */
-  inline size_t num_row(void) const {
-    return info.num_row;
-  }
-  /*! \return number of columns in dataset */
-  inline size_t num_col(void) const {
-    return info.num_col;
-  }
-  /*! \brief clear all the information */
-  inline void Clear(void) {
-    labels.clear();
-    group_ptr.clear();
-    weights.clear();
-    info.root_index.clear();
-    base_margin.clear();
-    info.num_row = info.num_col = 0;
-  }
-  /*! \brief get weight of each instances */
-  inline float GetWeight(size_t i) const {
-    if (weights.size() != 0) {
-      return weights[i];
-    } else {
-      return 1.0f;
-    }
-  }
-  inline void SaveBinary(utils::IStream &fo) const { // NOLINT(*)
-    int version = kVersion;
-    fo.Write(&version, sizeof(version));
-    fo.Write(&info.num_row, sizeof(info.num_row));
-    fo.Write(&info.num_col, sizeof(info.num_col));
-    fo.Write(labels);
-    fo.Write(group_ptr);
-    fo.Write(weights);
-    fo.Write(info.root_index);
-    fo.Write(base_margin);
-  }
-  inline void LoadBinary(utils::IStream &fi) { // NOLINT(*)
-    int version;
-    utils::Check(fi.Read(&version, sizeof(version)) != 0, "MetaInfo: invalid format");
-    utils::Check(fi.Read(&info.num_row, sizeof(info.num_row)) != 0, "MetaInfo: invalid format");
-    utils::Check(fi.Read(&info.num_col, sizeof(info.num_col)) != 0, "MetaInfo: invalid format");
-    utils::Check(fi.Read(&labels), "MetaInfo: invalid format");
-    utils::Check(fi.Read(&group_ptr), "MetaInfo: invalid format");
-    utils::Check(fi.Read(&weights), "MetaInfo: invalid format");
-    utils::Check(fi.Read(&info.root_index), "MetaInfo: invalid format");
-    utils::Check(fi.Read(&base_margin), "MetaInfo: invalid format");
-  }
-  // try to load group information from file, if exists
-  inline bool TryLoadGroup(const char* fname, bool silent = false) {
-    using namespace std;
-    FILE *fi = fopen64(fname, "r");
-    if (fi == NULL) return false;
-    group_ptr.push_back(0);
-    unsigned nline;
-    while (fscanf(fi, "%u", &nline) == 1) {
-      group_ptr.push_back(group_ptr.back()+nline);
-    }
-    if (!silent) {
-      utils::Printf("%u groups are loaded from %s\n",
-                    static_cast<unsigned>(group_ptr.size()-1), fname);
-    }
-    fclose(fi);
-    return true;
-  }
-  inline std::vector<float>& GetFloatInfo(const char *field) {
-    using namespace std;
-    if (!strcmp(field, "label")) return labels;
-    if (!strcmp(field, "weight")) return weights;
-    if (!strcmp(field, "base_margin")) return base_margin;
-    utils::Error("unknown field %s", field);
-    return labels;
-  }
-  inline const std::vector<float>& GetFloatInfo(const char *field) const {
-    return ((MetaInfo*)this)->GetFloatInfo(field); // NOLINT(*)
-  }
-  inline std::vector<unsigned> &GetUIntInfo(const char *field) {
-    using namespace std;
-    if (!strcmp(field, "root_index")) return info.root_index;
-    if (!strcmp(field, "fold_index")) return info.fold_index;
-    utils::Error("unknown field %s", field);
-    return info.root_index;
-  }
-  inline const std::vector<unsigned> &GetUIntInfo(const char *field) const {
-    return ((MetaInfo*)this)->GetUIntInfo(field);  // NOLINT(*)
-  }
-  // try to load weight information from file, if exists
-  inline bool TryLoadFloatInfo(const char *field, const char* fname, bool silent = false) {
-    using namespace std;
-    std::vector<float> &data = this->GetFloatInfo(field);
-    FILE *fi = fopen64(fname, "r");
-    if (fi == NULL) return false;
-    float wt;
-    while (fscanf(fi, "%f", &wt) == 1) {
-      data.push_back(wt);
-    }
-    if (!silent) {
-      utils::Printf("loading %s from %s\n", field, fname);
-    }
-    fclose(fi);
-    return true;
-  }
-};
-
-/*!
- * \brief data object used for learning,
- * \tparam FMatrix type of feature data source
- */
-struct DMatrix {
-  /*!
-   * \brief magic number associated with this object
-   *    used to check if it is specific instance
-   */
-  const int magic;
-  /*! \brief meta information about the dataset */
-  MetaInfo info;
-  /*!
-   * \brief cache pointer to verify if the data structure is cached in some learner
-   *  used to verify if DMatrix is cached
-   */
-  void *cache_learner_ptr_;
-  /*! \brief default constructor */
-  explicit DMatrix(int magic) : magic(magic), cache_learner_ptr_(NULL) {}
-  /*! \brief get feature matrix about data content */
-  virtual IFMatrix *fmat(void) const = 0;
-  // virtual destructor
-  virtual ~DMatrix(void){}
-};
-
-}  // namespace learner
-}  // namespace xgboost
-#endif  // XGBOOST_LEARNER_DMATRIX_H_
--- a/src/learner/evaluation-inl.hpp
+++ b/src/learner/evaluation-inl.hpp
@@ -1,589 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file xgboost_evaluation-inl.hpp
- * \brief evaluation metrics for regression and classification and rank
- * \author Kailong Chen, Tianqi Chen
- */
-#ifndef XGBOOST_LEARNER_EVALUATION_INL_HPP_
-#define XGBOOST_LEARNER_EVALUATION_INL_HPP_
-
-#include <vector>
-#include <utility>
-#include <string>
-#include <cmath>
-#include <climits>
-#include <algorithm>
-#include "../sync/sync.h"
-#include "../utils/math.h"
-#include "./evaluation.h"
-#include "./helper_utils.h"
-
-namespace xgboost {
-namespace learner {
-/*!
- * \brief base class of element-wise evaluation
- * \tparam Derived the name of subclass
- */
-template<typename Derived>
-struct EvalEWiseBase : public IEvaluator {
-  virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info,
-                     bool distributed) const {
-    utils::Check(info.labels.size() != 0, "label set cannot be empty");
-    utils::Check(preds.size() == info.labels.size(),
-                 "label and prediction size not match"\
-                 "hint: use merror or mlogloss for multi-class classification");
-
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
-
-    float sum = 0.0, wsum = 0.0;
-    #pragma omp parallel for reduction(+: sum, wsum) schedule(static)
-    for (bst_omp_uint i = 0; i < ndata; ++i) {
-      const float wt = info.GetWeight(i);
-      sum += Derived::EvalRow(info.labels[i], preds[i]) * wt;
-      wsum += wt;
-    }
-    float dat[2]; dat[0] = sum, dat[1] = wsum;
-    if (distributed) {
-      rabit::Allreduce<rabit::op::Sum>(dat, 2);
-    }
-    return Derived::GetFinal(dat[0], dat[1]);
-  }
-  /*!
-   * \brief to be implemented by subclass,
-   *   get evaluation result from one row
-   * \param label label of current instance
-   * \param pred prediction value of current instance
-   */
-  inline static float EvalRow(float label, float pred);
-  /*!
-   * \brief to be overridden by subclass, final transformation
-   * \param esum the sum statistics returned by EvalRow
-   * \param wsum sum of weight
-   */
-  inline static float GetFinal(float esum, float wsum) {
-    return esum / wsum;
-  }
-};
-
-/*! \brief RMSE */
-struct EvalRMSE : public EvalEWiseBase<EvalRMSE> {
-  virtual const char *Name(void) const {
-    return "rmse";
-  }
-  inline static float EvalRow(float label, float pred) {
-    float diff = label - pred;
-    return diff * diff;
-  }
-  inline static float GetFinal(float esum, float wsum) {
-    return std::sqrt(esum / wsum);
-  }
-};
-
-/*! \brief logloss */
-struct EvalLogLoss : public EvalEWiseBase<EvalLogLoss> {
-  virtual const char *Name(void) const {
-    return "logloss";
-  }
-  inline static float EvalRow(float y, float py) {
-    const float eps = 1e-16f;
-    const float pneg = 1.0f - py;
-    if (py < eps) {
-      return -y * std::log(eps) - (1.0f - y)  * std::log(1.0f - eps);
-    } else if (pneg < eps) {
-      return -y * std::log(1.0f - eps) - (1.0f - y)  * std::log(eps);
-    } else {
-      return -y * std::log(py) - (1.0f - y) * std::log(pneg);
-    }
-  }
-};
-
-/*! \brief error */
-struct EvalError : public EvalEWiseBase<EvalError> {
-  virtual const char *Name(void) const {
-    return "error";
-  }
-  inline static float EvalRow(float label, float pred) {
-    // assume label is in [0,1]
-    return pred > 0.5f ? 1.0f - label : label;
-  }
-};
-
-/*! \brief log-likelihood of Poission distribution */
-struct EvalPoissionNegLogLik : public EvalEWiseBase<EvalPoissionNegLogLik> {
-  virtual const char *Name(void) const {
-    return "poisson-nloglik";
-  }
-  inline static float EvalRow(float y, float py) {
-    const float eps = 1e-16f;
-    if (py < eps) py = eps;
-    return utils::LogGamma(y + 1.0f) + py - std::log(py) * y;
-  }
-};
-
-/*!
- * \brief base class of multi-class evaluation
- * \tparam Derived the name of subclass
- */
-template<typename Derived>
-struct EvalMClassBase : public IEvaluator {
-  virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info,
-                     bool distributed) const {
-    utils::Check(info.labels.size() != 0, "label set cannot be empty");
-    utils::Check(preds.size() % info.labels.size() == 0,
-                 "label and prediction size not match");
-    const size_t nclass = preds.size() / info.labels.size();
-    utils::Check(nclass > 1,
-                 "mlogloss and merror are only used for multi-class classification,"\
-                 " use logloss for binary classification");
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
-    float sum = 0.0, wsum = 0.0;
-    int label_error = 0;
-    #pragma omp parallel for reduction(+: sum, wsum) schedule(static)
-    for (bst_omp_uint i = 0; i < ndata; ++i) {
-      const float wt = info.GetWeight(i);
-      int label =  static_cast<int>(info.labels[i]);
-      if (label >= 0 && label < static_cast<int>(nclass)) {
-        sum += Derived::EvalRow(label,
-                                BeginPtr(preds) + i * nclass,
-                                nclass) * wt;
-        wsum += wt;
-      } else {
-        label_error = label;
-      }
-    }
-    utils::Check(label_error >= 0 && label_error < static_cast<int>(nclass),
-                 "MultiClassEvaluation: label must be in [0, num_class)," \
-                 " num_class=%d but found %d in label",
-                 static_cast<int>(nclass), label_error);
-    float dat[2]; dat[0] = sum, dat[1] = wsum;
-    if (distributed) {
-      rabit::Allreduce<rabit::op::Sum>(dat, 2);
-    }
-    return Derived::GetFinal(dat[0], dat[1]);
-  }
-  /*!
-   * \brief to be implemented by subclass,
-   *   get evaluation result from one row
-   * \param label label of current instance
-   * \param pred prediction value of current instance
-   * \param nclass number of class in the prediction
-   */
-  inline static float EvalRow(int label,
-                              const float *pred,
-                              size_t nclass);
-  /*!
-   * \brief to be overridden by subclass, final transformation
-   * \param esum the sum statistics returned by EvalRow
-   * \param wsum sum of weight
-   */
-  inline static float GetFinal(float esum, float wsum) {
-    return esum / wsum;
-  }
-  // used to store error message
-  const char *error_msg_;
-};
-/*! \brief match error */
-struct EvalMatchError : public EvalMClassBase<EvalMatchError> {
-  virtual const char *Name(void) const {
-    return "merror";
-  }
-  inline static float EvalRow(int label,
-                              const float *pred,
-                              size_t nclass) {
-    return FindMaxIndex(pred, nclass) != static_cast<int>(label);
-  }
-};
-/*! \brief match error */
-struct EvalMultiLogLoss : public EvalMClassBase<EvalMultiLogLoss> {
-  virtual const char *Name(void) const {
-    return "mlogloss";
-  }
-  inline static float EvalRow(int label,
-                              const float *pred,
-                              size_t nclass) {
-    const float eps = 1e-16f;
-    size_t k = static_cast<size_t>(label);
-    if (pred[k] > eps) {
-      return -std::log(pred[k]);
-    } else {
-      return -std::log(eps);
-    }
-  }
-};
-
-/*! \brief ctest */
-struct EvalCTest: public IEvaluator {
-  EvalCTest(IEvaluator *base, const char *name)
-      : base_(base), name_(name) {}
-  virtual ~EvalCTest(void) {
-    delete base_;
-  }
-  virtual const char *Name(void) const {
-    return name_.c_str();
-  }
-  virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info,
-                     bool distributed) const {
-    utils::Check(!distributed, "metric %s do not support distributed evaluation", name_.c_str());
-    utils::Check(preds.size() % info.labels.size() == 0,
-                 "label and prediction size not match");
-    size_t ngroup = preds.size() / info.labels.size() - 1;
-    const unsigned ndata = static_cast<unsigned>(info.labels.size());
-    utils::Check(ngroup > 1, "pred size does not meet requirement");
-    utils::Check(ndata == info.info.fold_index.size(), "need fold index");
-    double wsum = 0.0;
-    for (size_t k = 0; k < ngroup; ++k) {
-      std::vector<float> tpred;
-      MetaInfo tinfo;
-      for (unsigned i = 0; i < ndata; ++i) {
-        if (info.info.fold_index[i] == k) {
-          tpred.push_back(preds[i + (k + 1) * ndata]);
-          tinfo.labels.push_back(info.labels[i]);
-          tinfo.weights.push_back(info.GetWeight(i));
-        }
-      }
-      wsum += base_->Eval(tpred, tinfo);
-    }
-    return static_cast<float>(wsum / ngroup);
-  }
-
- private:
-  IEvaluator *base_;
-  std::string name_;
-};
-
-/*! \brief AMS: also records best threshold */
-struct EvalAMS : public IEvaluator {
- public:
-  explicit EvalAMS(const char *name) {
-    name_ = name;
-    // note: ams@0 will automatically select which ratio to go
-    utils::Check(std::sscanf(name, "ams@%f", &ratio_) == 1, "invalid ams format");
-  }
-  virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info,
-                     bool distributed) const {
-    utils::Check(!distributed, "metric AMS do not support distributed evaluation");
-    using namespace std;
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
-
-    utils::Check(info.weights.size() == ndata, "we need weight to evaluate ams");
-    std::vector< std::pair<float, unsigned> > rec(ndata);
-
-    #pragma omp parallel for schedule(static)
-    for (bst_omp_uint i = 0; i < ndata; ++i) {
-      rec[i] = std::make_pair(preds[i], i);
-    }
-    std::sort(rec.begin(), rec.end(), CmpFirst);
-    unsigned ntop = static_cast<unsigned>(ratio_ * ndata);
-    if (ntop == 0) ntop = ndata;
-    const double br = 10.0;
-    unsigned thresindex = 0;
-    double s_tp = 0.0, b_fp = 0.0, tams = 0.0;
-    for (unsigned i = 0; i < static_cast<unsigned>(ndata-1) && i < ntop; ++i) {
-      const unsigned ridx = rec[i].second;
-      const float wt = info.weights[ridx];
-      if (info.labels[ridx] > 0.5f) {
-        s_tp += wt;
-      } else {
-        b_fp += wt;
-      }
-      if (rec[i].first != rec[i+1].first) {
-        double ams = sqrt(2*((s_tp+b_fp+br) * log(1.0 + s_tp/(b_fp+br)) - s_tp));
-        if (tams < ams) {
-          thresindex = i;
-          tams = ams;
-        }
-      }
-    }
-    if (ntop == ndata) {
-      utils::Printf("\tams-ratio=%g", static_cast<float>(thresindex) / ndata);
-      return static_cast<float>(tams);
-    } else {
-      return static_cast<float>(sqrt(2*((s_tp+b_fp+br) * log(1.0 + s_tp/(b_fp+br)) - s_tp)));
-    }
-  }
-  virtual const char *Name(void) const {
-    return name_.c_str();
-  }
-
- private:
-  std::string name_;
-  float ratio_;
-};
-
-/*! \brief precision with cut off at top percentile */
-struct EvalPrecisionRatio : public IEvaluator{
- public:
-  explicit EvalPrecisionRatio(const char *name) : name_(name) {
-    using namespace std;
-    if (sscanf(name, "apratio@%f", &ratio_) == 1) {
-      use_ap = 1;
-    } else {
-      utils::Assert(sscanf(name, "pratio@%f", &ratio_) == 1, "BUG");
-      use_ap = 0;
-    }
-  }
-  virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info,
-                     bool distributed) const {
-    utils::Check(!distributed, "metric %s do not support distributed evaluation", Name());
-    utils::Check(info.labels.size() != 0, "label set cannot be empty");
-    utils::Assert(preds.size() % info.labels.size() == 0,
-                  "label size predict size not match");
-    std::vector< std::pair<float, unsigned> > rec;
-    for (size_t j = 0; j < info.labels.size(); ++j) {
-      rec.push_back(std::make_pair(preds[j], static_cast<unsigned>(j)));
-    }
-    std::sort(rec.begin(), rec.end(), CmpFirst);
-    double pratio = CalcPRatio(rec, info);
-    return static_cast<float>(pratio);
-  }
-  virtual const char *Name(void) const {
-    return name_.c_str();
-  }
-
- protected:
-  inline double CalcPRatio(const std::vector< std::pair<float, unsigned> >& rec,
-                           const MetaInfo &info) const {
-    size_t cutoff = static_cast<size_t>(ratio_ * rec.size());
-    double wt_hit = 0.0, wsum = 0.0, wt_sum = 0.0;
-    for (size_t j = 0; j < cutoff; ++j) {
-      const float wt = info.GetWeight(j);
-      wt_hit += info.labels[rec[j].second] * wt;
-      wt_sum += wt;
-      wsum += wt_hit / wt_sum;
-    }
-    if (use_ap != 0) {
-      return wsum / cutoff;
-    } else {
-      return wt_hit / wt_sum;
-    }
-  }
-  int use_ap;
-  float ratio_;
-  std::string name_;
-};
-
-/*! \brief Area Under Curve, for both classification and rank */
-struct EvalAuc : public IEvaluator {
-  virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info,
-                     bool distributed) const {
-    utils::Check(info.labels.size() != 0, "label set cannot be empty");
-    utils::Check(preds.size() % info.labels.size() == 0,
-                 "label size predict size not match");
-    std::vector<unsigned> tgptr(2, 0);
-    tgptr[1] = static_cast<unsigned>(info.labels.size());
-
-    const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
-    utils::Check(gptr.back() == info.labels.size(),
-                 "EvalAuc: group structure must match number of prediction");
-    const bst_omp_uint ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
-    // sum statistics
-    double sum_auc = 0.0f;
-    #pragma omp parallel reduction(+:sum_auc)
-    {
-      // each thread takes a local rec
-      std::vector< std::pair<float, unsigned> > rec;
-      #pragma omp for schedule(static)
-      for (bst_omp_uint k = 0; k < ngroup; ++k) {
-        rec.clear();
-        for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
-          rec.push_back(std::make_pair(preds[j], j));
-        }
-        std::sort(rec.begin(), rec.end(), CmpFirst);
-        // calculate AUC
-        double sum_pospair = 0.0;
-        double sum_npos = 0.0, sum_nneg = 0.0, buf_pos = 0.0, buf_neg = 0.0;
-        for (size_t j = 0; j < rec.size(); ++j) {
-          const float wt = info.GetWeight(rec[j].second);
-          const float ctr = info.labels[rec[j].second];
-          // keep bucketing predictions in same bucket
-          if (j != 0 && rec[j].first != rec[j - 1].first) {
-            sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
-            sum_npos += buf_pos;
-            sum_nneg += buf_neg;
-            buf_neg = buf_pos = 0.0f;
-          }
-          buf_pos += ctr * wt;
-          buf_neg += (1.0f - ctr) * wt;
-        }
-        sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
-        sum_npos += buf_pos;
-        sum_nneg += buf_neg;
-        // check weird conditions
-        utils::Check(sum_npos > 0.0 && sum_nneg > 0.0,
-                     "AUC: the dataset only contains pos or neg samples");
-        // this is the AUC
-        sum_auc += sum_pospair / (sum_npos*sum_nneg);
-      }
-    }
-    if (distributed) {
-      float dat[2];
-      dat[0] = static_cast<float>(sum_auc);
-      dat[1] = static_cast<float>(ngroup);
-      // approximately estimate auc using mean
-      rabit::Allreduce<rabit::op::Sum>(dat, 2);
-      return dat[0] / dat[1];
-    } else {
-      return static_cast<float>(sum_auc) / ngroup;
-    }
-  }
-  virtual const char *Name(void) const {
-    return "auc";
-  }
-};
-
-/*! \brief Evaluate rank list */
-struct EvalRankList : public IEvaluator {
- public:
-  virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info,
-                     bool distributed) const {
-    utils::Check(preds.size() == info.labels.size(),
-                  "label size predict size not match");
-    // quick consistency when group is not available
-    std::vector<unsigned> tgptr(2, 0);
-    tgptr[1] = static_cast<unsigned>(preds.size());
-    const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
-    utils::Assert(gptr.size() != 0, "must specify group when constructing rank file");
-    utils::Assert(gptr.back() == preds.size(),
-                   "EvalRanklist: group structure must match number of prediction");
-    const bst_omp_uint ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
-    // sum statistics
-    double sum_metric = 0.0f;
-    #pragma omp parallel reduction(+:sum_metric)
-    {
-      // each thread takes a local rec
-      std::vector< std::pair<float, unsigned> > rec;
-      #pragma omp for schedule(static)
-      for (bst_omp_uint k = 0; k < ngroup; ++k) {
-        rec.clear();
-        for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
-          rec.push_back(std::make_pair(preds[j], static_cast<int>(info.labels[j])));
-        }
-        sum_metric += this->EvalMetric(rec);
-      }
-    }
-    if (distributed) {
-      float dat[2];
-      dat[0] = static_cast<float>(sum_metric);
-      dat[1] = static_cast<float>(ngroup);
-      // approximately estimate the metric using mean
-      rabit::Allreduce<rabit::op::Sum>(dat, 2);
-      return dat[0] / dat[1];
-    } else {
-      return static_cast<float>(sum_metric) / ngroup;
-    }
-  }
-  virtual const char *Name(void) const {
-    return name_.c_str();
-  }
-
- protected:
-  explicit EvalRankList(const char *name) {
-    using namespace std;
-    name_ = name;
-    minus_ = false;
-    if (sscanf(name, "%*[^@]@%u[-]?", &topn_) != 1) {
-      topn_ = UINT_MAX;
-    }
-    if (name[strlen(name) - 1] == '-') {
-      minus_ = true;
-    }
-  }
-  /*! \return evaluation metric, given the pair_sort record, (pred,label) */
-  virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &pair_sort) const = 0; // NOLINT(*)
-
- protected:
-  unsigned topn_;
-  std::string name_;
-  bool minus_;
-};
-
-/*! \brief Precision at N, for both classification and rank */
-struct EvalPrecision : public EvalRankList{
- public:
-  explicit EvalPrecision(const char *name) : EvalRankList(name) {}
-
- protected:
-  virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &rec) const {
-    // calculate Precision
-    std::sort(rec.begin(), rec.end(), CmpFirst);
-    unsigned nhit = 0;
-    for (size_t j = 0; j < rec.size() && j < this->topn_; ++j) {
-      nhit += (rec[j].second != 0);
-    }
-    return static_cast<float>(nhit) / topn_;
-  }
-};
-
-/*! \brief NDCG: Normalized Discounted Cumulative Gain at N */
-struct EvalNDCG : public EvalRankList{
- public:
-  explicit EvalNDCG(const char *name) : EvalRankList(name) {}
-
- protected:
-  inline float CalcDCG(const std::vector< std::pair<float, unsigned> > &rec) const {
-    double sumdcg = 0.0;
-    for (size_t i = 0; i < rec.size() && i < this->topn_; ++i) {
-      const unsigned rel = rec[i].second;
-      if (rel != 0) {
-        sumdcg += ((1 << rel) - 1) / std::log(i + 2.0);
-      }
-    }
-    return static_cast<float>(sumdcg);
-  }
-  virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &rec) const { // NOLINT(*)
-    std::stable_sort(rec.begin(), rec.end(), CmpFirst);
-    float dcg = this->CalcDCG(rec);
-    std::stable_sort(rec.begin(), rec.end(), CmpSecond);
-    float idcg = this->CalcDCG(rec);
-    if (idcg == 0.0f) {
-      if (minus_) {
-        return 0.0f;
-      } else {
-        return 1.0f;
-      }
-    }
-    return dcg/idcg;
-  }
-};
-
-/*! \brief Mean Average Precision at N, for both classification and rank */
-struct EvalMAP : public EvalRankList {
- public:
-  explicit EvalMAP(const char *name) : EvalRankList(name) {}
-
- protected:
-  virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &rec) const {
-    std::sort(rec.begin(), rec.end(), CmpFirst);
-    unsigned nhits = 0;
-    double sumap = 0.0;
-    for (size_t i = 0; i < rec.size(); ++i) {
-      if (rec[i].second != 0) {
-        nhits += 1;
-        if (i < this->topn_) {
-          sumap += static_cast<float>(nhits) / (i+1);
-        }
-      }
-    }
-    if (nhits != 0) {
-      sumap /= nhits;
-      return static_cast<float>(sumap);
-    } else {
-      if (minus_) {
-        return 0.0f;
-      } else {
-        return 1.0f;
-      }
-    }
-  }
-};
-
-}  // namespace learner
-}  // namespace xgboost
-#endif  // XGBOOST_LEARNER_EVALUATION_INL_HPP_
--- a/src/learner/evaluation.h
+++ b/src/learner/evaluation.h
@@ -1,101 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file evaluation.h
- * \brief interface of evaluation function supported in xgboost
- * \author Tianqi Chen, Kailong Chen
- */
-#ifndef XGBOOST_LEARNER_EVALUATION_H_
-#define XGBOOST_LEARNER_EVALUATION_H_
-
-#include <string>
-#include <vector>
-#include <cstdio>
-#include "../utils/utils.h"
-#include "./dmatrix.h"
-
-namespace xgboost {
-namespace learner {
-/*! \brief evaluator that evaluates the loss metrics */
-struct IEvaluator{
-  /*!
-   * \brief evaluate a specific metric
-   * \param preds prediction
-   * \param info information, including label etc.
-   * \param distributed whether a call to Allreduce is needed to gather
-   *        the average statistics across all the node,
-   *        this is only supported by some metrics
-   */
-  virtual float Eval(const std::vector<float> &preds,
-                     const MetaInfo &info,
-                     bool distributed = false) const = 0;
-  /*! \return name of metric */
-  virtual const char *Name(void) const = 0;
-  /*! \brief virtual destructor */
-  virtual ~IEvaluator(void) {}
-};
-}  // namespace learner
-}  // namespace xgboost
-
-// include implementations of evaluation functions
-#include "evaluation-inl.hpp"
-// factory function
-namespace xgboost {
-namespace learner {
-inline IEvaluator* CreateEvaluator(const char *name) {
-  using namespace std;
-  if (!strcmp(name, "rmse")) return new EvalRMSE();
-  if (!strcmp(name, "error")) return new EvalError();
-  if (!strcmp(name, "merror")) return new EvalMatchError();
-  if (!strcmp(name, "logloss")) return new EvalLogLoss();
-  if (!strcmp(name, "mlogloss")) return new EvalMultiLogLoss();
-  if (!strcmp(name, "poisson-nloglik")) return new EvalPoissionNegLogLik();
-  if (!strcmp(name, "auc")) return new EvalAuc();
-  if (!strncmp(name, "ams@", 4)) return new EvalAMS(name);
-  if (!strncmp(name, "pre@", 4)) return new EvalPrecision(name);
-  if (!strncmp(name, "pratio@", 7)) return new EvalPrecisionRatio(name);
-  if (!strncmp(name, "map", 3)) return new EvalMAP(name);
-  if (!strncmp(name, "ndcg", 4)) return new EvalNDCG(name);
-  if (!strncmp(name, "ct-", 3)) return new EvalCTest(CreateEvaluator(name+3), name);
-
-  utils::Error("unknown evaluation metric type: %s", name);
-  return NULL;
-}
-
-/*! \brief a set of evaluators */
-class EvalSet{
- public:
-  inline void AddEval(const char *name) {
-    using namespace std;
-    for (size_t i = 0; i < evals_.size(); ++i) {
-      if (!strcmp(name, evals_[i]->Name())) return;
-    }
-    evals_.push_back(CreateEvaluator(name));
-  }
-  ~EvalSet(void) {
-    for (size_t i = 0; i < evals_.size(); ++i) {
-      delete evals_[i];
-    }
-  }
-  inline std::string Eval(const char *evname,
-                          const std::vector<float> &preds,
-                          const MetaInfo &info,
-                          bool distributed = false) {
-    std::string result = "";
-    for (size_t i = 0; i < evals_.size(); ++i) {
-      float res = evals_[i]->Eval(preds, info, distributed);
-      char tmp[1024];
-      utils::SPrintf(tmp, sizeof(tmp), "\t%s-%s:%f", evname, evals_[i]->Name(), res);
-      result += tmp;
-    }
-    return result;
-  }
-  inline size_t Size(void) const {
-    return evals_.size();
-  }
-
- private:
-  std::vector<const IEvaluator*> evals_;
-};
-}  // namespace learner
-}  // namespace xgboost
-#endif  // XGBOOST_LEARNER_EVALUATION_H_
--- a/src/learner/helper_utils.h
+++ b/src/learner/helper_utils.h
@@ -1,80 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file helper_utils.h
- * \brief useful helper functions
- * \author Tianqi Chen, Kailong Chen
- */
-#ifndef XGBOOST_LEARNER_HELPER_UTILS_H_
-#define XGBOOST_LEARNER_HELPER_UTILS_H_
-
-#include <utility>
-#include <vector>
-#include <cmath>
-#include <algorithm>
-namespace xgboost {
-namespace learner {
-// simple helper function to do softmax
-inline static void Softmax(std::vector<float>* p_rec) {
-  std::vector<float> &rec = *p_rec;
-  float wmax = rec[0];
-  for (size_t i = 1; i < rec.size(); ++i) {
-    wmax = std::max(rec[i], wmax);
-  }
-  double wsum = 0.0f;
-  for (size_t i = 0; i < rec.size(); ++i) {
-    rec[i] = std::exp(rec[i]-wmax);
-    wsum += rec[i];
-  }
-  for (size_t i = 0; i < rec.size(); ++i) {
-    rec[i] /= static_cast<float>(wsum);
-  }
-}
-
-inline static int FindMaxIndex(const float  *rec, size_t size) {
-  size_t mxid = 0;
-  for (size_t i = 1; i < size; ++i) {
-    if (rec[i] > rec[mxid]) {
-      mxid = i;
-    }
-  }
-  return static_cast<int>(mxid);
-}
-
-// simple helper function to do softmax
-inline static int FindMaxIndex(const std::vector<float>& rec) {
-  return FindMaxIndex(BeginPtr(rec), rec.size());
-}
-
-// perform numerically safe logsum
-inline float LogSum(float x, float y) {
-  if (x < y) {
-    return y + std::log(std::exp(x - y) + 1.0f);
-  } else {
-    return x + std::log(std::exp(y - x) + 1.0f);
-  }
-}
-// numerically safe logsum
-inline float LogSum(const float *rec, size_t size) {
-  float mx = rec[0];
-  for (size_t i = 1; i < size; ++i) {
-    mx = std::max(mx, rec[i]);
-  }
-  float sum = 0.0f;
-  for (size_t i = 0; i < size; ++i) {
-    sum += std::exp(rec[i] - mx);
-  }
-  return mx + std::log(sum);
-}
-
-// comparator functions for sorting pairs in descending order
-inline static bool CmpFirst(const std::pair<float, unsigned> &a,
-                            const std::pair<float, unsigned> &b) {
-  return a.first > b.first;
-}
-inline static bool CmpSecond(const std::pair<float, unsigned> &a,
-                             const std::pair<float, unsigned> &b) {
-  return a.second > b.second;
-}
-}  // namespace learner
-}  // namespace xgboost
-#endif  // XGBOOST_LEARNER_HELPER_UTILS_H_
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -1,547 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file learner-inl.hpp
- * \brief learning algorithm
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_LEARNER_LEARNER_INL_HPP_
-#define XGBOOST_LEARNER_LEARNER_INL_HPP_
-
-#include <algorithm>
-#include <vector>
-#include <utility>
-#include <string>
-#include <limits>
-#include "../sync/sync.h"
-#include "../utils/io.h"
-#include "./objective.h"
-#include "./evaluation.h"
-#include "../gbm/gbm.h"
-
-namespace xgboost {
-/*! \brief namespace for learning algorithm */
-namespace learner {
-/*!
- * \brief learner that performs gradient boosting for a specific objective function.
- *  It does training and prediction.
- */
-class BoostLearner : public rabit::Serializable {
- public:
-  BoostLearner(void) {
-    obj_ = NULL;
-    gbm_ = NULL;
-    name_obj_ = "reg:linear";
-    name_gbm_ = "gbtree";
-    silent = 0;
-    prob_buffer_row = 1.0f;
-    distributed_mode = 0;
-    updater_mode = 0;
-    pred_buffer_size = 0;
-    seed_per_iteration = 0;
-    seed = 0;
-    save_base64 = 0;
-  }
-  virtual ~BoostLearner(void) {
-    if (obj_ != NULL) delete obj_;
-    if (gbm_ != NULL) delete gbm_;
-  }
-  /*!
-   * \brief add internal cache space for mat, this can speedup prediction for matrix,
-   *        please cache prediction for training and eval data
-   *    warning: if the model is loaded from file from some previous training history
-   *             set cache data must be called with exactly SAME
-   *             data matrices to continue training otherwise it will cause error
-   * \param mats array of pointers to matrix whose prediction result need to be cached
-   */
-  inline void SetCacheData(const std::vector<DMatrix*>& mats) {
-    utils::Assert(cache_.size() == 0, "can only call cache data once");
-    // assign buffer index
-    size_t buffer_size = 0;
-    for (size_t i = 0; i < mats.size(); ++i) {
-      bool dupilicate = false;
-      for (size_t j = 0; j < i; ++j) {
-        if (mats[i] == mats[j]) dupilicate = true;
-      }
-      if (dupilicate) continue;
-      // set mats[i]'s cache learner pointer to this
-      mats[i]->cache_learner_ptr_ = this;
-      cache_.push_back(CacheEntry(mats[i], buffer_size, mats[i]->info.num_row()));
-      buffer_size += mats[i]->info.num_row();
-    }
-    char str_temp[25];
-    utils::SPrintf(str_temp, sizeof(str_temp), "%lu",
-                   static_cast<unsigned long>(buffer_size)); // NOLINT(*)
-    this->SetParam("num_pbuffer", str_temp);
-    this->pred_buffer_size = buffer_size;
-  }
-  /*!
-   * \brief set parameters from outside
-   * \param name name of the parameter
-   * \param val  value of the parameter
-   */
-  inline void SetParam(const char *name, const char *val) {
-    using namespace std;
-    // in this version, bst: prefix is no longer required
-    if (strncmp(name, "bst:", 4) != 0) {
-      std::string n = "bst:"; n += name;
-      this->SetParam(n.c_str(), val);
-    }
-    if (!strcmp(name, "silent")) silent = atoi(val);
-    if (!strcmp(name, "dsplit")) {
-      if (!strcmp(val, "col")) {
-        this->SetParam("updater", "distcol");
-        distributed_mode = 1;
-      } else if (!strcmp(val, "row")) {
-        this->SetParam("updater", "grow_histmaker,prune");
-        distributed_mode = 2;
-      } else {
-        utils::Error("%s is invalid value for dsplit, should be row or col", val);
-      }
-    }
-    if (!strcmp(name, "updater_mode")) updater_mode = atoi(val);
-    if (!strcmp(name, "prob_buffer_row")) {
-      prob_buffer_row = static_cast<float>(atof(val));
-      utils::Check(distributed_mode == 0,
-                   "prob_buffer_row can only be used in single node mode so far");
-      this->SetParam("updater", "grow_colmaker,refresh,prune");
-    }
-    if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
-    if (!strcmp("seed", name)) {
-      seed = atoi(val); random::Seed(seed);
-    }
-    if (!strcmp("seed_per_iter", name)) seed_per_iteration = atoi(val);
-    if (!strcmp("save_base64", name)) save_base64 = atoi(val);
-    if (!strcmp(name, "num_class")) {
-      this->SetParam("num_output_group", val);
-    }
-    if (!strcmp(name, "nthread")) {
-      omp_set_num_threads(atoi(val));
-    }
-    if (gbm_ == NULL) {
-      if (!strcmp(name, "objective")) name_obj_ = val;
-      if (!strcmp(name, "booster")) name_gbm_ = val;
-      mparam.SetParam(name, val);
-    }
-    if (gbm_ != NULL) gbm_->SetParam(name, val);
-    if (obj_ != NULL) obj_->SetParam(name, val);
-    if (gbm_ == NULL || obj_ == NULL) {
-      cfg_.push_back(std::make_pair(std::string(name), std::string(val)));
-    }
-  }
-  // this is an internal function
-  // initialize the trainer, called at InitModel and LoadModel
-  inline void InitTrainer(bool calc_num_feature = true) {
-    if (calc_num_feature) {
-      // estimate feature bound
-      unsigned num_feature = 0;
-      for (size_t i = 0; i < cache_.size(); ++i) {
-        num_feature = std::max(num_feature,
-                               static_cast<unsigned>(cache_[i].mat_->info.num_col()));
-      }
-      // run allreduce on num_feature to find the maximum value
-      rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
-      if (num_feature > mparam.num_feature) mparam.num_feature = num_feature;
-    }
-    char str_temp[25];
-    utils::SPrintf(str_temp, sizeof(str_temp), "%d", mparam.num_feature);
-    this->SetParam("bst:num_feature", str_temp);
-  }
-  /*!
-   * \brief initialize the model
-   */
-  inline void InitModel(void) {
-    this->InitTrainer();
-    // initialize model
-    this->InitObjGBM();
-    // reset the base score
-    mparam.base_score = obj_->ProbToMargin(mparam.base_score);
-    // initialize GBM model
-    gbm_->InitModel();
-  }
-  /*!
-   * \brief load model from stream
-   * \param fi input stream
-   * \param calc_num_feature whether call InitTrainer with calc_num_feature
-   */
-  inline void LoadModel(utils::IStream &fi,  // NOLINT(*)
-                        bool calc_num_feature = true) {
-    utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
-                 "BoostLearner: wrong model format");
-    {
-      // backward compatibility code for compatible with old model type
-      // for new model, Read(&name_obj_) is suffice
-      uint64_t len;
-      utils::Check(fi.Read(&len, sizeof(len)) != 0, "BoostLearner: wrong model format");
-      if (len >= std::numeric_limits<unsigned>::max()) {
-        int gap;
-        utils::Check(fi.Read(&gap, sizeof(gap)) != 0, "BoostLearner: wrong model format");
-        len = len >> static_cast<uint64_t>(32UL);
-      }
-      if (len != 0) {
-        name_obj_.resize(len);
-        utils::Check(fi.Read(&name_obj_[0], len) != 0, "BoostLearner: wrong model format");
-      }
-    }
-    utils::Check(fi.Read(&name_gbm_), "BoostLearner: wrong model format");
-    // delete existing gbm if any
-    if (obj_ != NULL) delete obj_;
-    if (gbm_ != NULL) delete gbm_;
-    this->InitTrainer(calc_num_feature);
-    this->InitObjGBM();
-    char tmp[32];
-    utils::SPrintf(tmp, sizeof(tmp), "%u", mparam.num_class);
-    obj_->SetParam("num_class", tmp);
-    gbm_->LoadModel(fi, mparam.saved_with_pbuffer != 0);
-    if (mparam.saved_with_pbuffer == 0) {
-      gbm_->ResetPredBuffer(pred_buffer_size);
-    }
-  }
-  // rabit load model from rabit checkpoint
-  virtual void Load(rabit::Stream *fi) {
-    // for row split, we should not keep pbuffer
-    this->LoadModel(*fi, false);
-  }
-  // rabit save model to rabit checkpoint
-  virtual void Save(rabit::Stream *fo) const {
-    // for row split, we should not keep pbuffer
-    this->SaveModel(*fo, distributed_mode != 2);
-  }
-  /*!
-   * \brief load model from file
-   * \param fname file name
-   */
-  inline void LoadModel(const char *fname) {
-    utils::IStream *fi = utils::IStream::Create(fname, "r");
-    std::string header; header.resize(4);
-    // check header for different binary encode
-    // can be base64 or binary
-    utils::Check(fi->Read(&header[0], 4) != 0, "invalid model");
-    // base64 format
-    if (header == "bs64") {
-      utils::Base64InStream bsin(fi);
-      bsin.InitPosition();
-      this->LoadModel(bsin, true);
-    } else if (header == "binf") {
-      this->LoadModel(*fi, true);
-    } else {
-      delete fi;
-      fi = utils::IStream::Create(fname, "r");
-      this->LoadModel(*fi, true);
-    }
-    delete fi;
-  }
-  inline void SaveModel(utils::IStream &fo, bool with_pbuffer) const { // NOLINT(*)
-    ModelParam p = mparam;
-    p.saved_with_pbuffer = static_cast<int>(with_pbuffer);
-    fo.Write(&p, sizeof(ModelParam));
-    fo.Write(name_obj_);
-    fo.Write(name_gbm_);
-    gbm_->SaveModel(fo, with_pbuffer);
-  }
-  /*!
-   * \brief save model into file
-   * \param fname file name
-   * \param with_pbuffer whether save pbuffer together
-   */
-  inline void SaveModel(const char *fname, bool with_pbuffer) const {
-    utils::IStream *fo = utils::IStream::Create(fname, "w");
-    if (save_base64 != 0 || !strcmp(fname, "stdout")) {
-      fo->Write("bs64\t", 5);
-      utils::Base64OutStream bout(fo);
-      this->SaveModel(bout, with_pbuffer);
-      bout.Finish('\n');
-    } else {
-      fo->Write("binf", 4);
-      this->SaveModel(*fo, with_pbuffer);
-    }
-    delete fo;
-  }
-  /*!
-   * \brief check if data matrix is ready to be used by training,
-   *  if not initialize it
-   * \param p_train pointer to the matrix used by training
-   */
-  inline void CheckInit(DMatrix *p_train) {
-    int ncol = static_cast<int>(p_train->info.info.num_col);
-    std::vector<bool> enabled(ncol, true);
-    // set max row per batch to limited value
-    // in distributed mode, use safe choice otherwise
-    size_t max_row_perbatch = std::numeric_limits<size_t>::max();
-    if (updater_mode != 0 || distributed_mode == 2) {
-      max_row_perbatch = 32UL << 10UL;
-    }
-    // initialize column access
-    p_train->fmat()->InitColAccess(enabled,
-                                   prob_buffer_row,
-                                   max_row_perbatch);
-    const int kMagicPage = 0xffffab02;
-    // check, if it is DMatrixPage, then use hist maker
-    if (p_train->magic == kMagicPage) {
-      this->SetParam("updater", "grow_histmaker,prune");
-    }
-  }
-  /*!
-   * \brief update the model for one iteration
-   * \param iter current iteration number
-   * \param train reference to the data matrix
-   */
-  inline void UpdateOneIter(int iter, const DMatrix &train) {
-    if (seed_per_iteration != 0 || rabit::IsDistributed()) {
-      random::Seed(this->seed * kRandSeedMagic + iter);
-    }
-    this->PredictRaw(train, &preds_);
-    obj_->GetGradient(preds_, train.info, iter, &gpair_);
-    gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_);
-  }
-  /*!
-   * \brief whether model allow lazy checkpoint
-   */
-  inline bool AllowLazyCheckPoint(void) const {
-    return gbm_->AllowLazyCheckPoint();
-  }
-  /*!
-   * \brief evaluate the model for specific iteration
-   * \param iter iteration number
-   * \param evals datas i want to evaluate
-   * \param evname name of each dataset
-   * \return a string corresponding to the evaluation result
-   */
-  inline std::string EvalOneIter(int iter,
-                                 const std::vector<const DMatrix*> &evals,
-                                 const std::vector<std::string> &evname) {
-    std::string res;
-    char tmp[256];
-    utils::SPrintf(tmp, sizeof(tmp), "[%d]", iter);
-    res = tmp;
-    for (size_t i = 0; i < evals.size(); ++i) {
-      this->PredictRaw(*evals[i], &preds_);
-      obj_->EvalTransform(&preds_);
-      res += evaluator_.Eval(evname[i].c_str(), preds_, evals[i]->info, distributed_mode == 2);
-    }
-    return res;
-  }
-  /*!
-   * \brief simple evaluation function, with a specified metric
-   * \param data input data
-   * \param metric name of metric
-   * \return a pair of <evaluation name, result>
-   */
-  std::pair<std::string, float> Evaluate(const DMatrix &data, std::string metric) {
-    if (metric == "auto") metric = obj_->DefaultEvalMetric();
-    IEvaluator *ev = CreateEvaluator(metric.c_str());
-    this->PredictRaw(data, &preds_);
-    obj_->EvalTransform(&preds_);
-    float res = ev->Eval(preds_, data.info);
-    delete ev;
-    return std::make_pair(metric, res);
-  }
-  /*!
-   * \brief get prediction
-   * \param data input data
-   * \param output_margin whether to only predict margin value instead of transformed prediction
-   * \param out_preds output vector that stores the prediction
-   * \param ntree_limit limit number of trees used for boosted tree
-   *   predictor, when it equals 0, this means we are using all the trees
-   * \param pred_leaf whether to only predict the leaf index of each tree in a boosted tree predictor
-   */
-  inline void Predict(const DMatrix &data,
-                      bool output_margin,
-                      std::vector<float> *out_preds,
-                      unsigned ntree_limit = 0,
-                      bool pred_leaf = false) const {
-    if (pred_leaf) {
-      gbm_->PredictLeaf(data.fmat(), data.info.info, out_preds, ntree_limit);
-    } else {
-      this->PredictRaw(data, out_preds, ntree_limit);
-      if (!output_margin) {
-        obj_->PredTransform(out_preds);
-      }
-    }
-  }
-  /*!
-   * \brief online prediction function, predict score for one instance at a time
-   *  NOTE: use the batch prediction interface if possible, batch prediction is usually
-   *        more efficient than online prediction
-   *        This function is NOT threadsafe, make sure you only call from one thread
-   *
-   * \param inst the instance you want to predict
-   * \param output_margin whether to only predict margin value instead of transformed prediction
-   * \param out_preds output vector to hold the predictions
-   * \param ntree_limit limit the number of trees used in prediction
-   * \sa Predict
-   */
-  inline void Predict(const SparseBatch::Inst &inst,
-                      bool output_margin,
-                      std::vector<float> *out_preds,
-                      unsigned ntree_limit = 0) const {
-    gbm_->Predict(inst, out_preds, ntree_limit);
-    if (out_preds->size() == 1) {
-      (*out_preds)[0] += mparam.base_score;
-    }
-    if (!output_margin) {
-      obj_->PredTransform(out_preds);
-    }
-  }
-  /*! \brief dump model out */
-  inline std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
-    return gbm_->DumpModel(fmap, option);
-  }
-
- protected:
-  /*!
-   * \brief initialize the objective function and GBM,
-   * if not yet done
-   */
-  inline void InitObjGBM(void) {
-    if (obj_ != NULL) return;
-    utils::Assert(gbm_ == NULL, "GBM and obj should be NULL");
-    obj_ = CreateObjFunction(name_obj_.c_str());
-    gbm_ = gbm::CreateGradBooster(name_gbm_.c_str());
-    this->InitAdditionDefaultParam();
-    // set parameters
-    for (size_t i = 0; i < cfg_.size(); ++i) {
-      obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
-      gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
-    }
-    if (evaluator_.Size() == 0) {
-      evaluator_.AddEval(obj_->DefaultEvalMetric());
-    }
-  }
-  /*!
-   * \brief additional default value for specific objs
-   */
-  inline void InitAdditionDefaultParam(void) {
-    if (name_obj_ == "count:poisson") {
-      obj_->SetParam("max_delta_step", "0.7");
-      gbm_->SetParam("max_delta_step", "0.7");
-    }
-  }
-  /*!
-   * \brief get un-transformed prediction
-   * \param data training data matrix
-   * \param out_preds output vector that stores the prediction
-   * \param ntree_limit limit number of trees used for boosted tree
-   *   predictor, when it equals 0, this means we are using all the trees
-   */
-  inline void PredictRaw(const DMatrix &data,
-                         std::vector<float> *out_preds,
-                         unsigned ntree_limit = 0) const {
-    gbm_->Predict(data.fmat(), this->FindBufferOffset(data),
-                  data.info.info, out_preds, ntree_limit);
-    // add base margin
-    std::vector<float> &preds = *out_preds;
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
-    if (data.info.base_margin.size() != 0) {
-      utils::Check(preds.size() == data.info.base_margin.size(),
-                   "base_margin.size does not match with prediction size");
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint j = 0; j < ndata; ++j) {
-        preds[j] += data.info.base_margin[j];
-      }
-    } else {
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint j = 0; j < ndata; ++j) {
-        preds[j] += mparam.base_score;
-      }
-    }
-  }
-
-  /*! \brief training parameter for regression */
-  struct ModelParam{
-    /* \brief global bias */
-    float base_score;
-    /* \brief number of features  */
-    unsigned num_feature;
-    /* \brief number of classes, if it is multi-class classification  */
-    int num_class;
-    /*! \brief whether the model itself is saved with pbuffer */
-    int saved_with_pbuffer;
-    /*! \brief reserved field */
-    int reserved[30];
-    /*! \brief constructor */
-    ModelParam(void) {
-      std::memset(this, 0, sizeof(ModelParam));
-      base_score = 0.5f;
-      num_feature = 0;
-      num_class = 0;
-      saved_with_pbuffer = 0;
-    }
-    /*!
-     * \brief set parameters from outside
-     * \param name name of the parameter
-     * \param val value of the parameter
-     */
-    inline void SetParam(const char *name, const char *val) {
-      using namespace std;
-      if (!strcmp("base_score", name)) base_score = static_cast<float>(atof(val));
-      if (!strcmp("num_class", name)) num_class = atoi(val);
-      if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
-    }
-  };
-  // data fields
-  // stored random seed
-  int seed;
-  // whether seed the PRNG each iteration
-  // this is important for restart from existing iterations
-  // default set to no, but will auto switch on in distributed mode
-  int seed_per_iteration;
-  // save model in base64 encoding
-  int save_base64;
-  // silent during training
-  int silent;
-  // distributed learning mode, if any, 0:none, 1:col, 2:row
-  int distributed_mode;
-  // updater mode, 0:normal, reserved for internal test
-  int updater_mode;
-  // cached size of predict buffer
-  size_t pred_buffer_size;
-  // maximum buffered row value
-  float prob_buffer_row;
-  // evaluation set
-  EvalSet evaluator_;
-  // model parameter
-  ModelParam  mparam;
-  // gbm model that back everything
-  gbm::IGradBooster *gbm_;
-  // name of gbm model used for training
-  std::string name_gbm_;
-  // objective function
-  IObjFunction *obj_;
-  // name of objective function
-  std::string name_obj_;
-  // configurations
-  std::vector< std::pair<std::string, std::string> > cfg_;
-  // temporal storages for prediction
-  std::vector<float> preds_;
-  // gradient pairs
-  std::vector<bst_gpair> gpair_;
-
- protected:
-  // magic number to transform random seed
-  static const int kRandSeedMagic = 127;
-  // cache entry object that helps handle feature caching
-  struct CacheEntry {
-    const DMatrix *mat_;
-    size_t buffer_offset_;
-    size_t num_row_;
-    CacheEntry(const DMatrix *mat, size_t buffer_offset, size_t num_row)
-        :mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row) {}
-  };
-  // find internal buffer offset for certain matrix, if not exist, return -1
-  inline int64_t FindBufferOffset(const DMatrix &mat) const {
-    for (size_t i = 0; i < cache_.size(); ++i) {
-      if (cache_[i].mat_ == &mat && mat.cache_learner_ptr_ == this) {
-        if (cache_[i].num_row_ == mat.info.num_row()) {
-          return static_cast<int64_t>(cache_[i].buffer_offset_);
-        }
-      }
-    }
-    return -1;
-  }
-  // data structure field
-  /*! \brief the entries indicates that we have internal prediction cache */
-  std::vector<CacheEntry> cache_;
-};
-}  // namespace learner
-}  // namespace xgboost
-#endif  // XGBOOST_LEARNER_LEARNER_INL_HPP_
--- a/src/learner/objective-inl.hpp
+++ b/src/learner/objective-inl.hpp
@@ -1,642 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file objective-inl.hpp
- * \brief objective function implementations
- * \author Tianqi Chen, Kailong Chen
- */
-#ifndef XGBOOST_LEARNER_OBJECTIVE_INL_HPP_
-#define XGBOOST_LEARNER_OBJECTIVE_INL_HPP_
-
-#include <vector>
-#include <algorithm>
-#include <utility>
-#include <cmath>
-#include <functional>
-#include "../data.h"
-#include "./objective.h"
-#include "./helper_utils.h"
-#include "../utils/random.h"
-#include "../utils/omp.h"
-
-namespace xgboost {
-namespace learner {
-/*! \brief defines functions to calculate some commonly used functions */
-struct LossType {
-  /*! \brief indicate which type we are using */
-  int loss_type;
-  // list of constants
-  static const int kLinearSquare = 0;
-  static const int kLogisticNeglik = 1;
-  static const int kLogisticClassify = 2;
-  static const int kLogisticRaw = 3;
-  /*!
-   * \brief transform the linear sum to prediction
-   * \param x linear sum of boosting ensemble
-   * \return transformed prediction
-   */
-  inline float PredTransform(float x) const {
-    switch (loss_type) {
-      case kLogisticRaw:
-      case kLinearSquare: return x;
-      case kLogisticClassify:
-      case kLogisticNeglik: return 1.0f / (1.0f + std::exp(-x));
-      default: utils::Error("unknown loss_type"); return 0.0f;
-    }
-  }
-  /*!
-   * \brief check if label range is valid
-   */
-  inline bool CheckLabel(float x) const {
-    if (loss_type != kLinearSquare) {
-      return x >= 0.0f && x <= 1.0f;
-    }
-    return true;
-  }
-  /*!
-   * \brief error message displayed when check label fail
-   */
-  inline const char * CheckLabelErrorMsg(void) const {
-    if (loss_type != kLinearSquare) {
-      return "label must be in [0,1] for logistic regression";
-    } else {
-      return "";
-    }
-  }
-  /*!
-   * \brief calculate first order gradient of loss, given transformed prediction
-   * \param predt transformed prediction
-   * \param label true label
-   * \return first order gradient
-   */
-  inline float FirstOrderGradient(float predt, float label) const {
-    switch (loss_type) {
-      case kLinearSquare: return predt - label;
-      case kLogisticRaw: predt = 1.0f / (1.0f + std::exp(-predt));
-      case kLogisticClassify:
-      case kLogisticNeglik: return predt - label;
-      default: utils::Error("unknown loss_type"); return 0.0f;
-    }
-  }
-  /*!
-   * \brief calculate second order gradient of loss, given transformed prediction
-   * \param predt transformed prediction
-   * \param label true label
-   * \return second order gradient
-   */
-  inline float SecondOrderGradient(float predt, float label) const {
-    // cap second order gradient to positive value
-    const float eps = 1e-16f;
-    switch (loss_type) {
-      case kLinearSquare: return 1.0f;
-      case kLogisticRaw: predt = 1.0f / (1.0f + std::exp(-predt));
-      case kLogisticClassify:
-      case kLogisticNeglik: return std::max(predt * (1.0f - predt), eps);
-      default: utils::Error("unknown loss_type"); return 0.0f;
-    }
-  }
-  /*!
-   * \brief transform probability value back to margin
-   */
-  inline float ProbToMargin(float base_score) const {
-    if (loss_type == kLogisticRaw ||
-        loss_type == kLogisticClassify ||
-        loss_type == kLogisticNeglik ) {
-      utils::Check(base_score > 0.0f && base_score < 1.0f,
-                   "base_score must be in (0,1) for logistic loss");
-      base_score = -std::log(1.0f / base_score - 1.0f);
-    }
-    return base_score;
-  }
-  /*! \brief get default evaluation metric for the objective */
-  inline const char *DefaultEvalMetric(void) const {
-    if (loss_type == kLogisticClassify) return "error";
-    if (loss_type == kLogisticRaw) return "auc";
-    return "rmse";
-  }
-};
-
-/*! \brief objective function that only need to */
-class RegLossObj : public IObjFunction {
- public:
-  explicit RegLossObj(int loss_type) {
-    loss.loss_type = loss_type;
-    scale_pos_weight = 1.0f;
-  }
-  virtual ~RegLossObj(void) {}
-  virtual void SetParam(const char *name, const char *val) {
-    using namespace std;
-    if (!strcmp("scale_pos_weight", name)) {
-      scale_pos_weight = static_cast<float>(atof(val));
-    }
-  }
-  virtual void GetGradient(const std::vector<float> &preds,
-                           const MetaInfo &info,
-                           int iter,
-                           std::vector<bst_gpair> *out_gpair) {
-    utils::Check(info.labels.size() != 0, "label set cannot be empty");
-    utils::Check(preds.size() % info.labels.size() == 0,
-                 "labels are not correctly provided");
-    std::vector<bst_gpair> &gpair = *out_gpair;
-    gpair.resize(preds.size());
-    // check if label in range
-    bool label_correct = true;
-    // start calculating gradient
-    const unsigned nstep = static_cast<unsigned>(info.labels.size());
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
-    #pragma omp parallel for schedule(static)
-    for (bst_omp_uint i = 0; i < ndata; ++i) {
-      const unsigned j = i % nstep;
-      float p = loss.PredTransform(preds[i]);
-      float w = info.GetWeight(j);
-      if (info.labels[j] == 1.0f) w *= scale_pos_weight;
-      if (!loss.CheckLabel(info.labels[j])) label_correct = false;
-      gpair[i] = bst_gpair(loss.FirstOrderGradient(p, info.labels[j]) * w,
-                           loss.SecondOrderGradient(p, info.labels[j]) * w);
-    }
-    utils::Check(label_correct, loss.CheckLabelErrorMsg());
-  }
-  virtual const char* DefaultEvalMetric(void) const {
-    return loss.DefaultEvalMetric();
-  }
-  virtual void PredTransform(std::vector<float> *io_preds) {
-    std::vector<float> &preds = *io_preds;
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
-    #pragma omp parallel for schedule(static)
-    for (bst_omp_uint j = 0; j < ndata; ++j) {
-      preds[j] = loss.PredTransform(preds[j]);
-    }
-  }
-  virtual float ProbToMargin(float base_score) const {
-    return loss.ProbToMargin(base_score);
-  }
-
- protected:
-  float scale_pos_weight;
-  LossType loss;
-};
-
-// poisson regression for count
-class PoissonRegression : public IObjFunction {
- public:
-  PoissonRegression(void) {
-    max_delta_step = 0.0f;
-  }
-  virtual ~PoissonRegression(void) {}
-
-  virtual void SetParam(const char *name, const char *val) {
-    using namespace std;
-    if (!strcmp("max_delta_step", name)) {
-      max_delta_step = static_cast<float>(atof(val));
-    }
-  }
-  virtual void GetGradient(const std::vector<float> &preds,
-                           const MetaInfo &info,
-                           int iter,
-                           std::vector<bst_gpair> *out_gpair) {
-    utils::Check(max_delta_step != 0.0f,
-                 "PoissonRegression: need to set max_delta_step");
-    utils::Check(info.labels.size() != 0, "label set cannot be empty");
-    utils::Check(preds.size() == info.labels.size(),
-                 "labels are not correctly provided");
-    std::vector<bst_gpair> &gpair = *out_gpair;
-    gpair.resize(preds.size());
-    // check if label in range
-    bool label_correct = true;
-    // start calculating gradient
-    const long ndata = static_cast<bst_omp_uint>(preds.size()); // NOLINT(*)
-    #pragma omp parallel for schedule(static)
-    for (long i = 0; i < ndata; ++i) { // NOLINT(*)
-      float p = preds[i];
-      float w = info.GetWeight(i);
-      float y = info.labels[i];
-      if (y >= 0.0f) {
-        gpair[i] = bst_gpair((std::exp(p) - y) * w,
-                             std::exp(p + max_delta_step) * w);
-      } else {
-        label_correct = false;
-      }
-    }
-    utils::Check(label_correct,
-                 "PoissonRegression: label must be nonnegative");
-  }
-  virtual void PredTransform(std::vector<float> *io_preds) {
-    std::vector<float> &preds = *io_preds;
-    const long ndata = static_cast<long>(preds.size()); // NOLINT(*)
-    #pragma omp parallel for schedule(static)
-    for (long j = 0; j < ndata; ++j) {  // NOLINT(*)
-      preds[j] = std::exp(preds[j]);
-    }
-  }
-  virtual void EvalTransform(std::vector<float> *io_preds) {
-    PredTransform(io_preds);
-  }
-  virtual float ProbToMargin(float base_score) const {
-    return std::log(base_score);
-  }
-  virtual const char* DefaultEvalMetric(void) const {
-    return "poisson-nloglik";
-  }
-
- private:
-  float max_delta_step;
-};
-
-// softmax multi-class classification
-class SoftmaxMultiClassObj : public IObjFunction {
- public:
-  explicit SoftmaxMultiClassObj(int output_prob)
-      : output_prob(output_prob) {
-    nclass = 0;
-  }
-  virtual ~SoftmaxMultiClassObj(void) {}
-  virtual void SetParam(const char *name, const char *val) {
-    using namespace std;
-    if (!strcmp( "num_class", name )) nclass = atoi(val);
-  }
-  virtual void GetGradient(const std::vector<float> &preds,
-                           const MetaInfo &info,
-                           int iter,
-                           std::vector<bst_gpair> *out_gpair) {
-    utils::Check(nclass != 0, "must set num_class to use softmax");
-    utils::Check(info.labels.size() != 0, "label set cannot be empty");
-    utils::Check(preds.size() % (static_cast<size_t>(nclass) * info.labels.size()) == 0,
-                 "SoftmaxMultiClassObj: label size and pred size does not match");
-    std::vector<bst_gpair> &gpair = *out_gpair;
-    gpair.resize(preds.size());
-    const unsigned nstep = static_cast<unsigned>(info.labels.size() * nclass);
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size() / nclass);
-    int label_error = 0;
-    #pragma omp parallel
-    {
-      std::vector<float> rec(nclass);
-      #pragma omp for schedule(static)
-      for (bst_omp_uint i = 0; i < ndata; ++i) {
-        for (int k = 0; k < nclass; ++k) {
-          rec[k] = preds[i * nclass + k];
-        }
-        Softmax(&rec);
-        const unsigned j = i % nstep;
-        int label = static_cast<int>(info.labels[j]);
-        if (label < 0 || label >= nclass)  {
-          label_error = label; label = 0;
-        }
-        const float wt = info.GetWeight(j);
-        for (int k = 0; k < nclass; ++k) {
-          float p = rec[k];
-          const float h = 2.0f * p * (1.0f - p) * wt;
-          if (label == k) {
-            gpair[i * nclass + k] = bst_gpair((p - 1.0f) * wt, h);
-          } else {
-            gpair[i * nclass + k] = bst_gpair(p* wt, h);
-          }
-        }
-      }
-    }
-    utils::Check(label_error >= 0 && label_error < nclass,
-                 "SoftmaxMultiClassObj: label must be in [0, num_class),"\
-                 " num_class=%d but found %d in label", nclass, label_error);
-  }
-  virtual void PredTransform(std::vector<float> *io_preds) {
-    this->Transform(io_preds, output_prob);
-  }
-  virtual void EvalTransform(std::vector<float> *io_preds) {
-    this->Transform(io_preds, 1);
-  }
-  virtual const char* DefaultEvalMetric(void) const {
-    return "merror";
-  }
-
- private:
-  inline void Transform(std::vector<float> *io_preds, int prob) {  // shared body of PredTransform / EvalTransform
-    utils::Check(nclass != 0, "must set num_class to use softmax");
-    std::vector<float> &preds = *io_preds;
-    std::vector<float> tmp;  // only used when prob == 0: one value per row
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size()/nclass);  // preds holds nclass consecutive scores per row
-    if (prob == 0) tmp.resize(ndata);
-    #pragma omp parallel
-    {
-      std::vector<float> rec(nclass);  // scratch buffer declared inside the parallel region, so each thread has its own
-      #pragma omp for schedule(static)
-      for (bst_omp_uint j = 0; j < ndata; ++j) {
-        for (int k = 0; k < nclass; ++k) {
-          rec[k] = preds[j * nclass + k];  // copy the scores of row j
-        }
-        if (prob == 0) {
-          tmp[j] = static_cast<float>(FindMaxIndex(rec));  // store the index returned by FindMaxIndex as a float
-        } else {
-          Softmax(&rec);  // in-place on the scratch buffer
-          for (int k = 0; k < nclass; ++k) {
-            preds[j * nclass + k] = rec[k];  // write the transformed scores back over the inputs
-          }
-        }
-      }
-    }
-    if (prob == 0) preds = tmp;  // output shrinks from ndata * nclass to ndata entries
-  }
-  // data field
-  int nclass;
-  int output_prob;
-};
-
-/*! \brief base objective for lambda rank: samples pairs within each group and accumulates pairwise gradients */
-class LambdaRankObj : public IObjFunction {
- public:
-  LambdaRankObj(void) {  // defaults: kLogisticRaw loss, list re-weighting off, one sampled pair per instance
-    loss.loss_type = LossType::kLogisticRaw;
-    fix_list_weight = 0.0f;
-    num_pairsample = 1;
-  }
-  virtual ~LambdaRankObj(void) {}
-  virtual void SetParam(const char *name, const char *val) {  // handles loss_type, fix_list_weight, num_pairsample; other names are ignored
-    using namespace std;
-    if (!strcmp( "loss_type", name )) loss.loss_type = atoi(val);
-    if (!strcmp( "fix_list_weight", name)) fix_list_weight = static_cast<float>(atof(val));
-    if (!strcmp( "num_pairsample", name)) num_pairsample = atoi(val);
-  }
-  virtual void GetGradient(const std::vector<float> &preds,  // fills *out_gpair with one (grad, hess) entry per prediction
-                           const MetaInfo &info,
-                           int iter,
-                           std::vector<bst_gpair> *out_gpair) {
-    utils::Check(preds.size() == info.labels.size(), "label size predict size not match");
-    std::vector<bst_gpair> &gpair = *out_gpair;
-    gpair.resize(preds.size());
-    // fallback when no group information is given: treat all rows as one single group
-    std::vector<unsigned> tgptr(2, 0); tgptr[1] = static_cast<unsigned>(info.labels.size());
-    const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
-    utils::Check(gptr.size() != 0 && gptr.back() == info.labels.size(),
-                 "group structure not consistent with #rows");
-    const bst_omp_uint ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);  // group k covers rows [gptr[k], gptr[k+1])
-    #pragma omp parallel
-    {
-      // parallel construct: declare the random number generator here, so that each
-      // thread uses its own generator, seeded by thread id and current iteration
-      random::Random rnd; rnd.Seed(iter* 1111 + omp_get_thread_num());
-      std::vector<LambdaPair> pairs;
-      std::vector<ListEntry>  lst;
-      std::vector< std::pair<float, unsigned> > rec;
-      #pragma omp for schedule(static)
-      for (bst_omp_uint k = 0; k < ngroup; ++k) {
-        lst.clear(); pairs.clear();
-        for (unsigned j = gptr[k]; j < gptr[k+1]; ++j) {
-          lst.push_back(ListEntry(preds[j], info.labels[j], j));
-          gpair[j] = bst_gpair(0.0f, 0.0f);  // reset before accumulation below
-        }
-        std::sort(lst.begin(), lst.end(), ListEntry::CmpPred);  // descending prediction
-        rec.resize(lst.size());
-        for (unsigned i = 0; i < lst.size(); ++i) {
-          rec[i] = std::make_pair(lst[i].label, i);  // (label, position in lst)
-        }
-        std::sort(rec.begin(), rec.end(), CmpFirst);  // CmpFirst is defined outside this class; presumably descending by label - confirm
-        // enumerate buckets of equal label; for each item in a bucket, randomly draw a partner outside the bucket
-        for (unsigned i = 0; i < rec.size(); ) {
-          unsigned j = i + 1;
-          while (j < rec.size() && rec[j].first == rec[i].first) ++j;
-          // bucket is [i,j); nleft / nright count the entries before / after it in rec
-          unsigned nleft = i, nright = static_cast<unsigned>(rec.size() - j);
-          if (nleft + nright != 0) {
-            int nsample = num_pairsample;
-            while (nsample --) {
-              for (unsigned pid = i; pid < j; ++pid) {
-                unsigned ridx = static_cast<unsigned>(rnd.RandDouble() * (nleft+nright));
-                if (ridx < nleft) {
-                  pairs.push_back(LambdaPair(rec[ridx].second, rec[pid].second));  // partner precedes the bucket: partner is the positive side
-                } else {
-                  pairs.push_back(LambdaPair(rec[pid].second, rec[ridx+j-i].second));  // skip over the bucket: partner follows it, pid is the positive side
-                }
-              }
-            }
-          }
-          i = j;
-        }
-        // let the subclass fill in the weight of each sampled pair
-        this->GetLambdaWeight(lst, &pairs);
-        // rescale gradient and hessian by 1/num_pairsample, and by fix_list_weight / group size when that option is set
-        float scale = 1.0f / num_pairsample;
-        if (fix_list_weight != 0.0f) {
-          scale *= fix_list_weight / (gptr[k+1] - gptr[k]);
-        }
-        for (size_t i = 0; i < pairs.size(); ++i) {
-          const ListEntry &pos = lst[pairs[i].pos_index];
-          const ListEntry &neg = lst[pairs[i].neg_index];
-          const float w = pairs[i].weight * scale;
-          float p = loss.PredTransform(pos.pred - neg.pred);  // loss is evaluated on the score difference with target 1.0f
-          float g = loss.FirstOrderGradient(p, 1.0f);
-          float h = loss.SecondOrderGradient(p, 1.0f);
-          // accumulate gradient and hessian on both rows of the pair (opposite gradient sign, same hessian)
-          gpair[pos.rindex].grad += g * w;
-          gpair[pos.rindex].hess += 2.0f * w * h;
-          gpair[neg.rindex].grad -= g * w;
-          gpair[neg.rindex].hess += 2.0f * w * h;
-        }
-      }
-    }
-  }
-  virtual const char* DefaultEvalMetric(void) const {  // name of the default evaluation metric for this objective
-    return "map";
-  }
-
- protected:
-  /*! \brief helper information in a list */
-  struct ListEntry {
-    /*! \brief the predicted score of the entry */
-    float pred;
-    /*! \brief the actual label of the entry */
-    float label;
-    /*! \brief row index in the data matrix */
-    unsigned rindex;
-    // constructor
-    ListEntry(float pred, float label, unsigned rindex)
-        : pred(pred), label(label), rindex(rindex) {}
-    // comparator by prediction, larger prediction first
-    inline static bool CmpPred(const ListEntry &a, const ListEntry &b) {
-      return a.pred > b.pred;
-    }
-    // comparator by label, larger label first
-    inline static bool CmpLabel(const ListEntry &a, const ListEntry &b) {
-      return a.label > b.label;
-    }
-  };
-  /*! \brief a pair in the lambda rank */
-  struct LambdaPair {
-    /*! \brief positive index: this is a position in the list */
-    unsigned pos_index;
-    /*! \brief negative index: this is a position in the list */
-    unsigned neg_index;
-    /*! \brief weight to be filled in */
-    float weight;
-    // constructor, weight starts at 1.0f
-    LambdaPair(unsigned pos_index, unsigned neg_index)
-        : pos_index(pos_index), neg_index(neg_index), weight(1.0f) {}
-  };
-  /*!
-   * \brief get lambda weight for existing pairs
-   * \param sorted_list a list that is sorted by pred score
-   * \param io_pairs record of pairs, containing the pairs to fill in weights
-   */
-  virtual void GetLambdaWeight(const std::vector<ListEntry> &sorted_list,
-                               std::vector<LambdaPair> *io_pairs) = 0;
-
- private:
-  // loss function
-  LossType loss;
-  // number of samples performed for each instance
-  int num_pairsample;
-  // fixed total weight of each list; 0 disables the per-list rescaling
-  float fix_list_weight;
-};
-
-class PairwiseRankObj: public LambdaRankObj{  // pairwise ranking: every sampled pair keeps its initial weight
- public:
-  virtual ~PairwiseRankObj(void) {}
-
- protected:
-  virtual void GetLambdaWeight(const std::vector<ListEntry> &sorted_list,  // intentionally empty: LambdaPair weights stay at 1.0f
-                               std::vector<LambdaPair> *io_pairs) {}
-};
-
-// beta version: NDCG lambda rank, weights each pair by the normalized DCG change of swapping its two positions
-class LambdaRankObjNDCG : public LambdaRankObj {
- public:
-  virtual ~LambdaRankObjNDCG(void) {}
-
- protected:
-  virtual void GetLambdaWeight(const std::vector<ListEntry> &sorted_list,  // sets pairs[i].weight = |delta DCG| / IDCG
-                               std::vector<LambdaPair> *io_pairs) {
-    std::vector<LambdaPair> &pairs = *io_pairs;
-    float IDCG;  // DCG of the labels in descending order
-    {
-      std::vector<float> labels(sorted_list.size());
-      for (size_t i = 0; i < sorted_list.size(); ++i) {
-        labels[i] = sorted_list[i].label;
-      }
-      std::sort(labels.begin(), labels.end(), std::greater<float>());
-      IDCG = CalcDCG(labels);
-    }
-    if (IDCG == 0.0) {  // CalcDCG added nothing: give every pair zero weight instead of dividing by zero
-      for (size_t i = 0; i < pairs.size(); ++i) {
-        pairs[i].weight = 0.0f;
-      }
-    } else {
-      IDCG = 1.0f / IDCG;  // from here on IDCG holds the reciprocal
-      for (size_t i = 0; i < pairs.size(); ++i) {
-        unsigned pos_idx = pairs[i].pos_index;
-        unsigned neg_idx = pairs[i].neg_index;
-        float pos_loginv = 1.0f / std::log(pos_idx + 2.0f);  // natural log, same as CalcDCG, so the base cancels after normalization
-        float neg_loginv = 1.0f / std::log(neg_idx + 2.0f);
-        int pos_label = static_cast<int>(sorted_list[pos_idx].label);  // NOTE(review): a negative or >= 31 label makes the shifts below undefined
-        int neg_label = static_cast<int>(sorted_list[neg_idx].label);
-        float original =
-            ((1 << pos_label) - 1) * pos_loginv + ((1 << neg_label) - 1) * neg_loginv;  // DCG share of the two positions as ranked
-        float changed  =
-            ((1 << neg_label) - 1) * pos_loginv + ((1 << pos_label) - 1) * neg_loginv;  // the same share with the two labels swapped
-        float delta = (original - changed) * IDCG;
-        if (delta < 0.0f) delta = - delta;
-        pairs[i].weight = delta;
-      }
-    }
-  }
-  inline static float CalcDCG(const std::vector<float> &labels) {  // sum over positions i of (2^rel_i - 1) / ln(i + 2)
-    double sumdcg = 0.0;
-    for (size_t i = 0; i < labels.size(); ++i) {
-      const unsigned rel = static_cast<unsigned>(labels[i]);  // label truncated to an unsigned relevance grade
-      if (rel != 0) {
-        sumdcg += ((1 << rel) - 1) / std::log(static_cast<float>(i + 2));
-      }
-    }
-    return static_cast<float>(sumdcg);
-  }
-};
-
-class LambdaRankObjMAP : public LambdaRankObj {  // lambda rank objective whose pair weights come from GetLambdaMAP
- public:
-  virtual ~LambdaRankObjMAP(void) {}
-
- protected:
-  struct MAPStats {
-    /*! \brief the accumulated precision */
-    float ap_acc;
-    /*!
-     * \brief the accumulated precision,
-     *   assuming a positive instance is missing
-     */
-    float ap_acc_miss;
-    /*!
-     * \brief the accumulated precision,
-     * assuming that one more positive instance is inserted ahead
-     */
-    float ap_acc_add;
-    /*! \brief the accumulated positive instance count */
-    float hits;
-    MAPStats(void) {}  // leaves all four fields uninitialized
-    MAPStats(float ap_acc, float ap_acc_miss, float ap_acc_add, float hits)
-        : ap_acc(ap_acc), ap_acc_miss(ap_acc_miss), ap_acc_add(ap_acc_add), hits(hits) {}
-  };
-  /*!
-   * \brief Obtain the absolute delta MAP of switching the positions of the instances at index1 and index2
-   *        in the sorted list; 0 when the indices are equal, both labels fall on the same side of 0, or there are no hits
-   * \param sorted_list the list containing entry information
-   * \param index1,index2 the instances switched
-   * \param p_map_stats a vector containing the accumulated precisions for each position in a list
-   */
-  inline float GetLambdaMAP(const std::vector<ListEntry> &sorted_list,
-                            int index1, int index2,
-                            std::vector<MAPStats> *p_map_stats) {
-    std::vector<MAPStats> &map_stats = *p_map_stats;
-    if (index1 == index2 || map_stats[map_stats.size() - 1].hits == 0) {  // the last entry holds the total hit count
-      return 0.0f;
-    }
-    if (index1 > index2) std::swap(index1, index2);  // from here on index1 < index2
-    float original = map_stats[index2].ap_acc;  // accumulated precision over positions [index1, index2]
-    if (index1 != 0) original -= map_stats[index1 - 1].ap_acc;
-    float changed = 0;
-    float label1 = sorted_list[index1].label > 0.0f ? 1.0f : 0.0f;  // labels are binarized: positive iff > 0
-    float label2 = sorted_list[index2].label > 0.0f ? 1.0f : 0.0f;
-    if (label1 == label2) {
-      return 0.0;
-    } else if (label1 < label2) {  // the positive instance sits at index2 and would move up to index1
-      changed += map_stats[index2 - 1].ap_acc_add - map_stats[index1].ap_acc_add;
-      changed += (map_stats[index1].hits + 1.0f) / (index1 + 1);
-    } else {  // the positive instance sits at index1 and would move down to index2
-      changed += map_stats[index2 - 1].ap_acc_miss - map_stats[index1].ap_acc_miss;
-      changed += map_stats[index2].hits / (index2 + 1);
-    }
-    float ans = (changed - original) / (map_stats[map_stats.size() - 1].hits);  // normalize by the total hit count
-    if (ans < 0) ans = -ans;
-    return ans;
-  }
-  /*!
-   * \brief obtain preprocessing results for calculating delta MAP
-   * \param sorted_list the list containing entry information
-   * \param p_map_acc output, resized to sorted_list.size(); holds the accumulated precisions for each position in a list
-   */
-  inline void GetMAPStats(const std::vector<ListEntry> &sorted_list,
-                          std::vector<MAPStats> *p_map_acc) {
-    std::vector<MAPStats> &map_acc = *p_map_acc;
-    map_acc.resize(sorted_list.size());
-    float hit = 0, acc1 = 0, acc2 = 0, acc3 = 0;  // running hits, ap_acc, ap_acc_miss, ap_acc_add
-    for (size_t i = 1; i <= sorted_list.size(); ++i) {  // i is the 1-based position
-      if (sorted_list[i - 1].label > 0.0f) {
-        hit++;
-        acc1 += hit / i;
-        acc2 += (hit - 1) / i;
-        acc3 += (hit + 1) / i;
-      }
-      map_acc[i - 1] = MAPStats(acc1, acc2, acc3, hit);
-    }
-  }
-  virtual void GetLambdaWeight(const std::vector<ListEntry> &sorted_list,  // sets each pair weight to the value returned by GetLambdaMAP
-                               std::vector<LambdaPair> *io_pairs) {
-    std::vector<LambdaPair> &pairs = *io_pairs;
-    std::vector<MAPStats> map_stats;
-    GetMAPStats(sorted_list, &map_stats);  // computed once per list, shared by all pairs
-    for (size_t i = 0; i < pairs.size(); ++i) {
-      pairs[i].weight =
-          GetLambdaMAP(sorted_list, pairs[i].pos_index,
-                       pairs[i].neg_index, &map_stats);
-    }
-  }
-};
-
-}  // namespace learner
-}  // namespace xgboost
-#endif  // XGBOOST_LEARNER_OBJECTIVE_INL_HPP_
--- a/src/learner/objective.h
+++ b/src/learner/objective.h
@@ -1,89 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file objective.h
- * \brief interface of objective function used for gradient boosting
- * \author Tianqi Chen, Kailong Chen
- */
-#ifndef XGBOOST_LEARNER_OBJECTIVE_H_
-#define XGBOOST_LEARNER_OBJECTIVE_H_
-
-#include <vector>
-#include "./dmatrix.h"
-
-namespace xgboost {
-namespace learner {
-/*! \brief interface of objective function */
-class IObjFunction{
- public:
-  /*! \brief virtual destructor */
-  virtual ~IObjFunction(void) {}
-  /*!
-   * \brief set parameters from outside
-   * \param name name of the parameter
-   * \param val value of the parameter
-   */
-  virtual void SetParam(const char *name, const char *val) = 0;
-  /*!
-   * \brief get gradient over each of predictions, given existing information
-   * \param preds prediction of current round
-   * \param info information about labels, weights, groups in rank
-   * \param iter current iteration number
-   * \param out_gpair output vector that receives the gradient and second order gradient of each prediction
-   */
-  virtual void GetGradient(const std::vector<float> &preds,
-                           const MetaInfo &info,
-                           int iter,
-                           std::vector<bst_gpair> *out_gpair) = 0;
-  /*! \return the default evaluation metric for the objective */
-  virtual const char* DefaultEvalMetric(void) const = 0;
-  // the following functions are optional; most of the time the default implementation is good enough
-  /*!
-   * \brief transform prediction values, this is only called when Prediction is called; the default does nothing
-   * \param io_preds prediction values, saves to this vector as well
-   */
-  virtual void PredTransform(std::vector<float> *io_preds) {}
-  /*!
-   * \brief transform prediction values, this is only called when Eval is called,
-   *  the default implementation redirects to PredTransform
-   * \param io_preds prediction values, saves to this vector as well
-   */
-  virtual void EvalTransform(std::vector<float> *io_preds) {
-    this->PredTransform(io_preds);
-  }
-  /*!
-   * \brief transform probability value back to margin; this is used to transform
-   *        the user-set base_score back to the margin used by gradient boosting
-   * \param base_score the user-set base score
-   * \return transformed value; the default implementation returns base_score unchanged
-   */
-  virtual float ProbToMargin(float base_score) const {
-    return base_score;
-  }
-};
-}  // namespace learner
-}  // namespace xgboost
-
-// these are the implementations of the objective functions
-#include "objective-inl.hpp"
-// factory function
-namespace xgboost {
-namespace learner {
-/*! \brief factory function to create objective function by name; the object is allocated with new */
-inline IObjFunction* CreateObjFunction(const char *name) {
-  using namespace std;
-  if (!strcmp("reg:linear", name)) return new RegLossObj(LossType::kLinearSquare);
-  if (!strcmp("reg:logistic", name)) return new RegLossObj(LossType::kLogisticNeglik);
-  if (!strcmp("binary:logistic", name)) return new RegLossObj(LossType::kLogisticClassify);
-  if (!strcmp("binary:logitraw", name)) return new RegLossObj(LossType::kLogisticRaw);
-  if (!strcmp("count:poisson", name)) return new PoissonRegression();
-  if (!strcmp("multi:softmax", name)) return new SoftmaxMultiClassObj(0);  // constructor argument differs from multi:softprob; presumably output_prob - confirm
-  if (!strcmp("multi:softprob", name)) return new SoftmaxMultiClassObj(1);
-  if (!strcmp("rank:pairwise", name )) return new PairwiseRankObj();
-  if (!strcmp("rank:ndcg", name)) return new LambdaRankObjNDCG();
-  if (!strcmp("rank:map", name)) return new LambdaRankObjMAP();
-  utils::Error("unknown objective function type: %s", name);  // no name matched
-  return NULL;  // only reached if utils::Error returns - TODO confirm whether it does
-}
-}  // namespace learner
-}  // namespace xgboost
-#endif  // XGBOOST_LEARNER_OBJECTIVE_H_
--- a/src/sync/sync.h
+++ b/src/sync/sync.h
@@ -1,13 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file sync.h
- * \brief the synchronization module of rabit
- *        redirects to subtree rabit header
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_SYNC_SYNC_H_
-#define XGBOOST_SYNC_SYNC_H_
-
-#include "../../subtree/rabit/include/rabit.h"
-#include "../../subtree/rabit/include/rabit/timer.h"
-#endif  // XGBOOST_SYNC_SYNC_H_
--- a/src/tree/model.h
+++ b/src/tree/model.h
@@ -1,573 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file model.h
- * \brief model structure for tree
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_TREE_MODEL_H_
-#define XGBOOST_TREE_MODEL_H_
-
-#include <string>
-#include <cstring>
-#include <sstream>
-#include <limits>
-#include <algorithm>
-#include <vector>
-#include <cmath>
-#include "../utils/io.h"
-#include "../utils/fmap.h"
-#include "../utils/utils.h"
-
-namespace xgboost {
-namespace tree {
-/*!
- * \brief template class of TreeModel
- * \tparam TSplitCond data type to indicate split condition
- * \tparam TNodeStat auxiliary statistics of node to help tree building
- */
-template<typename TSplitCond, typename TNodeStat>
-class TreeModel {
- public:
-  /*! \brief auxiliary statistics of node to help tree building */
-  typedef TNodeStat  NodeStat;
-  /*! \brief data type to indicate split condition */
-  typedef TSplitCond SplitCond;
-  /*! \brief parameters of the tree */
-  struct Param{
-    /*! \brief number of start root */
-    int num_roots;
-    /*! \brief total number of nodes */
-    int num_nodes;
-    /*!\brief number of deleted nodes */
-    int num_deleted;
-    /*! \brief maximum depth, this is a statistics of the tree */
-    int max_depth;
-    /*! \brief  number of features used for tree construction */
-    int num_feature;
-    /*!
-     * \brief leaf vector size, used for vector tree
-     * used to store more than one dimensional information in tree
-     */
-    int size_leaf_vector;
-    /*! \brief reserved part */
-    int reserved[31];
-    /*! \brief constructor, zeroes max_depth, size_leaf_vector and reserved; the other fields are not set here */
-    Param(void) {
-      max_depth = 0;
-      size_leaf_vector = 0;
-      std::memset(reserved, 0, sizeof(reserved));
-    }
-    /*!
-     * \brief set parameters from outside; only num_roots, num_feature and size_leaf_vector are recognised
-     * \param name name of the parameter
-     * \param val  value of the parameter
-     */
-    inline void SetParam(const char *name, const char *val) {
-      using namespace std;
-      if (!strcmp("num_roots", name)) num_roots = atoi(val);
-      if (!strcmp("num_feature", name)) num_feature = atoi(val);
-      if (!strcmp("size_leaf_vector", name)) size_leaf_vector = atoi(val);
-    }
-  };
-  /*! \brief tree node */
-  class Node {
-   public:
-    Node(void) : sindex_(0) {}  // only sindex_ is initialized; the remaining fields are set later
-    /*! \brief index of left child */
-    inline int cleft(void) const {
-      return this->cleft_;
-    }
-    /*! \brief index of right child */
-    inline int cright(void) const {
-      return this->cright_;
-    }
-    /*! \brief index of default child when feature is missing */
-    inline int cdefault(void) const {
-      return this->default_left() ? this->cleft() : this->cright();
-    }
-    /*! \brief feature index of split condition (sindex_ without its highest bit) */
-    inline unsigned split_index(void) const {
-      return sindex_ & ((1U << 31) - 1U);
-    }
-    /*! \brief when feature is unknown, whether goes to left child (highest bit of sindex_) */
-    inline bool default_left(void) const {
-      return (sindex_ >> 31) != 0;
-    }
-    /*! \brief whether current node is leaf node, encoded as cleft_ == -1 */
-    inline bool is_leaf(void) const {
-      return cleft_ == -1;
-    }
-    /*! \brief get leaf value of leaf node */
-    inline float leaf_value(void) const {
-      return (this->info_).leaf_value;
-    }
-    /*! \brief get split condition of the node */
-    inline TSplitCond split_cond(void) const {
-      return (this->info_).split_cond;
-    }
-    /*! \brief get parent of the node (parent_ without its highest bit) */
-    inline int parent(void) const {
-      return parent_ & ((1U << 31) - 1);
-    }
-    /*! \brief whether current node is left child (highest bit of parent_) */
-    inline bool is_left_child(void) const {
-      return (parent_ & (1U << 31)) != 0;
-    }
-    /*! \brief whether this node is deleted, encoded as sindex_ == max unsigned */
-    inline bool is_deleted(void) const {
-      return sindex_ == std::numeric_limits<unsigned>::max();
-    }
-    /*! \brief whether current node is root, encoded as parent_ == -1 */
-    inline bool is_root(void) const {
-      return parent_ == -1;
-    }
-    /*!
-     * \brief set the right child
-     * \param nid node id to right child
-     */
-    inline void set_right_child(int nid) {
-      this->cright_ = nid;
-    }
-    /*!
-     * \brief set split condition of current node
-     * \param split_index feature index to split
-     * \param split_cond  split condition
-     * \param default_left the default direction when feature is unknown
-     */
-    inline void set_split(unsigned split_index, TSplitCond split_cond,
-                          bool default_left = false) {
-      if (default_left) split_index |= (1U << 31);  // pack the default direction into the highest bit
-      this->sindex_ = split_index;
-      (this->info_).split_cond = split_cond;
-    }
-    /*!
-     * \brief set the leaf value of the node
-     * \param value leaf value
-     * \param right right index, could be used to store
-     *        additional information
-     */
-    inline void set_leaf(float value, int right = -1) {
-      (this->info_).leaf_value = value;
-      this->cleft_ = -1;  // this is what is_leaf() tests
-      this->cright_ = right;
-    }
-    /*! \brief mark that this node is deleted */
-    inline void mark_delete(void) {
-      this->sindex_ = std::numeric_limits<unsigned>::max();
-    }
-
-   private:
-    friend class TreeModel<TSplitCond, TNodeStat>;
-    /*!
-     * \brief in leaf node, we have weights, in non-leaf nodes,
-     *        we have split condition
-     */
-    union Info{
-      float leaf_value;
-      TSplitCond split_cond;
-    };
-    // pointer to parent, highest bit is used to
-    // indicate whether it's a left child or not
-    int parent_;
-    // pointer to left, right
-    int cleft_, cright_;
-    // split feature index, left split or right split depends on the highest bit
-    unsigned sindex_;
-    // extra info
-    Info info_;
-    // set parent, packing the left-child flag into the highest bit
-    inline void set_parent(int pidx, bool is_left_child = true) {
-      if (is_left_child) pidx |= (1U << 31);
-      this->parent_ = pidx;
-    }
-  };
-
- protected:
-  // vector of nodes
-  std::vector<Node> nodes;
-  // free node space, used during training process
-  std::vector<int>  deleted_nodes;
-  // stats of nodes
-  std::vector<TNodeStat> stats;
-  // leaf vector, that is used to store additional information
-  std::vector<bst_float> leaf_vector;
-  // allocate a new node, reusing a deleted slot when one is available
-  // !!!!!! NOTE: may cause BUG here, nodes.resize can reallocate and invalidate references into nodes
-  inline int AllocNode(void) {
-    if (param.num_deleted != 0) {
-      int nd = deleted_nodes.back();
-      deleted_nodes.pop_back();
-      --param.num_deleted;
-      return nd;  // the reused node is returned without being reset here
-    }
-    int nd = param.num_nodes++;
-    utils::Check(param.num_nodes < std::numeric_limits<int>::max(),
-                 "number of nodes in the tree exceed 2^31");
-    nodes.resize(param.num_nodes);
-    stats.resize(param.num_nodes);
-    leaf_vector.resize(param.num_nodes * param.size_leaf_vector);
-    return nd;
-  }
-  // delete a tree node, keep the parent field to allow trace back
-  inline void DeleteNode(int nid) {
-    utils::Assert(nid >= param.num_roots, "can not delete root");
-    deleted_nodes.push_back(nid);
-    nodes[nid].mark_delete();
-    ++param.num_deleted;
-  }
-
- public:
-  /*!
-   * \brief change a non leaf node to a leaf node, delete its children (both must already be leaves)
-   * \param rid node id of the node
-   * \param value new leaf value
-   */
-  inline void ChangeToLeaf(int rid, float value) {
-    utils::Assert(nodes[nodes[rid].cleft() ].is_leaf(),
-                  "can not delete a non termial child");
-    utils::Assert(nodes[nodes[rid].cright()].is_leaf(),
-                  "can not delete a non termial child");
-    this->DeleteNode(nodes[rid].cleft());
-    this->DeleteNode(nodes[rid].cright());
-    nodes[rid].set_leaf(value);
-  }
-  /*!
-   * \brief collapse a non leaf node to a leaf node, recursively deleting its whole subtree
-   * \param rid node id of the node
-   * \param value new leaf value
-   */
-  inline void CollapseToLeaf(int rid, float value) {
-    if (nodes[rid].is_leaf()) return;
-    if (!nodes[nodes[rid].cleft() ].is_leaf()) {
-      CollapseToLeaf(nodes[rid].cleft(), 0.0f);  // the child is deleted right after, so its leaf value is a placeholder
-    }
-    if (!nodes[nodes[rid].cright() ].is_leaf()) {
-      CollapseToLeaf(nodes[rid].cright(), 0.0f);
-    }
-    this->ChangeToLeaf(rid, value);
-  }
-
- public:
-  /*! \brief model parameter */
-  Param param;
-  /*! \brief constructor, starts with a single root and one node slot; stats is not resized here */
-  TreeModel(void) {
-    param.num_nodes = 1;
-    param.num_roots = 1;
-    param.num_deleted = 0;
-    nodes.resize(1);
-  }
-  /*! \brief get node given nid */
-  inline Node &operator[](int nid) {
-    return nodes[nid];
-  }
-  /*! \brief get node given nid */
-  inline const Node &operator[](int nid) const {
-    return nodes[nid];
-  }
-  /*! \brief get node statistics given nid */
-  inline NodeStat &stat(int nid) {
-    return stats[nid];
-  }
-  /*! \brief get leaf vector given nid, NULL when no leaf vector is stored */
-  inline bst_float* leafvec(int nid) {
-    if (leaf_vector.size() == 0) return NULL;
-    return &leaf_vector[nid * param.size_leaf_vector];
-  }
-  /*! \brief get leaf vector given nid, NULL when no leaf vector is stored */
-  inline const bst_float* leafvec(int nid) const {
-    if (leaf_vector.size() == 0) return NULL;
-    return &leaf_vector[nid * param.size_leaf_vector];
-  }
-  /*! \brief initialize the model: one leaf node with value 0 per root */
-  inline void InitModel(void) {
-    param.num_nodes = param.num_roots;
-    nodes.resize(param.num_nodes);
-    stats.resize(param.num_nodes);
-    leaf_vector.resize(param.num_nodes * param.size_leaf_vector, 0.0f);
-    for (int i = 0; i < param.num_nodes; i ++) {
-      nodes[i].set_leaf(0.0f);
-      nodes[i].set_parent(-1);  // -1 already has the highest bit set, so parent_ stays -1 and is_root() holds
-    }
-  }
-  /*!
-   * \brief load model from stream: Param, then nodes, then stats, then the leaf vector if size_leaf_vector != 0
-   * \param fi input stream
-   */
-  inline void LoadModel(utils::IStream &fi) { // NOLINT(*)
-    utils::Check(fi.Read(&param, sizeof(Param)) > 0,
-                 "TreeModel: wrong format");
-    nodes.resize(param.num_nodes); stats.resize(param.num_nodes);
-    utils::Assert(param.num_nodes != 0, "invalid model");
-    utils::Check(fi.Read(BeginPtr(nodes), sizeof(Node) * nodes.size()) > 0,
-                 "TreeModel: wrong format");
-    utils::Check(fi.Read(BeginPtr(stats), sizeof(NodeStat) * stats.size()) > 0,
-                 "TreeModel: wrong format");
-    if (param.size_leaf_vector != 0) {
-      utils::Check(fi.Read(&leaf_vector), "TreeModel: wrong format");
-    }
-    // rebuild the list of deleted nodes from the per-node deleted marker
-    deleted_nodes.resize(0);
-    for (int i = param.num_roots; i < param.num_nodes; ++i) {
-      if (nodes[i].is_deleted()) deleted_nodes.push_back(i);
-    }
-    utils::Assert(static_cast<int>(deleted_nodes.size()) == param.num_deleted,
-                  "number of deleted nodes do not match, num_deleted=%d, dnsize=%lu, num_nodes=%d",
-                  param.num_deleted, deleted_nodes.size(), param.num_nodes);
-  }
-  /*!
-   * \brief save model to stream, in the same order that LoadModel reads
-   * \param fo output stream
-   */
-  inline void SaveModel(utils::IStream &fo) const { // NOLINT(*)
-    utils::Assert(param.num_nodes == static_cast<int>(nodes.size()),
-                  "TreeModel::SaveModel");
-    utils::Assert(param.num_nodes == static_cast<int>(stats.size()),
-                  "TreeModel::SaveModel");
-    fo.Write(&param, sizeof(Param));
-    utils::Assert(param.num_nodes != 0, "invalid model");
-    fo.Write(BeginPtr(nodes), sizeof(Node) * nodes.size());
-    fo.Write(BeginPtr(stats), sizeof(NodeStat) * nodes.size());  // nodes.size() equals stats.size() per the asserts above
-    if (param.size_leaf_vector != 0) fo.Write(leaf_vector);
-  }
-  /*!
-   * \brief add child nodes to node
-   * \param nid node id to add childs
-   */
-  inline void AddChilds(int nid) {
-    int pleft  = this->AllocNode();  // both allocations finish before nodes is indexed, since AllocNode may resize it
-    int pright = this->AllocNode();
-    nodes[nid].cleft_  = pleft;
-    nodes[nid].cright_ = pright;
-    nodes[nodes[nid].cleft() ].set_parent(nid, true);
-    nodes[nodes[nid].cright()].set_parent(nid, false);
-  }
-  /*!
-   * \brief only add a right child to a leaf node
-   * \param nid node id to add right child
-   */
-  inline void AddRightChild(int nid) {
-    int pright = this->AllocNode();
-    nodes[nid].right  = pright;  // NOTE(review): Node declares cright_, no member named right - confirm this ever gets instantiated
-    nodes[nodes[nid].right].set_parent(nid, false);
-  }
-  /*!
-   * \brief get current depth
-   * \param nid node id
-   * \param pass_rchild when true, steps taken from a right child are not counted in depth
-   */
-  inline int GetDepth(int nid, bool pass_rchild = false) const {
-    int depth = 0;
-    while (!nodes[nid].is_root()) {
-      if (!pass_rchild || nodes[nid].is_left_child()) ++depth;
-      nid = nodes[nid].parent();
-    }
-    return depth;
-  }
-  /*!
-   * \brief get maximum depth of the subtree below nid, a leaf has depth 0
-   * \param nid node id
-   */
-  inline int MaxDepth(int nid) const {
-    if (nodes[nid].is_leaf()) return 0;
-    return std::max(MaxDepth(nodes[nid].cleft())+1,
-                     MaxDepth(nodes[nid].cright())+1);
-  }
-  /*!
-   * \brief get maximum depth over all roots
-   */
-  inline int MaxDepth(void) {
-    int maxd = 0;
-    for (int i = 0; i < param.num_roots; ++i) {
-      maxd = std::max(maxd, MaxDepth(i));
-    }
-    return maxd;
-  }
-  /*! \brief number of extra nodes besides the root, deleted nodes excluded */
-  inline int num_extra_nodes(void) const {
-    return param.num_nodes - param.num_roots - param.num_deleted;
-  }
-  /*!
-   * \brief dump model to text string
-   * \param fmap feature map of feature types
-   * \param with_stats whether dump out statistics as well
-   * \return the string of dumped model
-   */
-  inline std::string DumpModel(const utils::FeatMap& fmap, bool with_stats) {
-    std::stringstream fo("");
-    for (int i = 0; i < param.num_roots; ++i) {
-      this->Dump(i, fo, fmap, 0, with_stats);
-    }
-    return fo.str();
-  }
-
- private:
-  void Dump(int nid, std::stringstream &fo, // NOLINT(*)
-            const utils::FeatMap& fmap, int depth, bool with_stats) {
-    for (int i = 0;  i < depth; ++i) {
-      fo << '\t';  // one tab per depth level
-    }
-    if (nodes[nid].is_leaf()) {
-      fo << nid << ":leaf=" << nodes[nid].leaf_value();
-      if (with_stats) {
-        stat(nid).Print(fo, true);
-      }
-      fo << '\n';
-    } else {
-      // NOTE(review): the recursive calls at the end of this branch dump the left child first, then the right
-      TSplitCond cond = nodes[nid].split_cond();
-      const unsigned split_index = nodes[nid].split_index();
-      if (split_index < fmap.size()) {  // the feature map has an entry for this feature
-        switch (fmap.type(split_index)) {
-          case utils::FeatMap::kIndicator: {
-            int nyes = nodes[nid].default_left() ?
-                nodes[nid].cright() : nodes[nid].cleft();  // yes is the child opposite to the default one
-            fo << nid << ":[" << fmap.name(split_index) << "] yes=" << nyes
-               << ",no=" << nodes[nid].cdefault();
-            break;
-          }
-          case utils::FeatMap::kInteger: {
-            fo << nid << ":[" << fmap.name(split_index) << "<"
-               << int(float(cond)+1.0f)
-               << "] yes=" << nodes[nid].cleft()
-               << ",no=" << nodes[nid].cright()
-               << ",missing=" << nodes[nid].cdefault();
-            break;
-          }
-          case utils::FeatMap::kFloat:
-          case utils::FeatMap::kQuantitive: {
-            fo << nid << ":[" << fmap.name(split_index) << "<"<< float(cond)
-               << "] yes=" << nodes[nid].cleft()
-               << ",no=" << nodes[nid].cright()
-               << ",missing=" << nodes[nid].cdefault();
-            break;
-          }
-          default: utils::Error("unknown fmap type");
-        }
-      } else {  // no feature map entry: fall back to the generic name f<index>
-        fo << nid << ":[f" << split_index << "<"<< float(cond)
-           << "] yes=" << nodes[nid].cleft()
-           << ",no=" << nodes[nid].cright()
-           << ",missing=" << nodes[nid].cdefault();
-      }
-      if (with_stats) {
-        stat(nid).Print(fo, false);
-      }
-      fo << '\n';
-      this->Dump(nodes[nid].cleft(), fo, fmap, depth+1, with_stats);
-      this->Dump(nodes[nid].cright(), fo, fmap, depth+1, with_stats);
-    }
-  }
-};
-
-/*! \brief node statistics used in regression tree */
-struct RTreeNodeStat {
-  /*! \brief loss change caused by current split */
-  float loss_chg;
-  /*! \brief sum of hessian values, used to measure coverage of data */
-  float sum_hess;
-  /*! \brief weight of current node */
-  float base_weight;
-  /*! \brief number of child that is leaf node known up to now */
-  int   leaf_child_cnt;
-  /*! \brief print information of current stats to fo */
-  inline void Print(std::stringstream &fo, bool is_leaf) const { // NOLINT(*)
-    if (!is_leaf) {
-      fo << ",gain=" << loss_chg << ",cover=" << sum_hess;
-    } else {
-      fo << ",cover=" << sum_hess;
-    }
-  }
-};
-
-/*! \brief define regression tree to be the most common tree model */
-class RegTree: public TreeModel<bst_float, RTreeNodeStat>{
- public:
-  /*!
-   * \brief dense feature vector that can be taken by RegTree
-   * to do traverse efficiently
-   * and can be construct from sparse feature vector
-   */
-  struct FVec {
-    /*!
-     * \brief a union value of value and flag
-     * when flag == -1, this indicate the value is missing
-     */
-    union Entry{
-      float fvalue;
-      int flag;
-    };
-    std::vector<Entry> data;
-    /*! \brief initialize the vector with size vector */
-    inline void Init(size_t size) {
-      Entry e; e.flag = -1;
-      data.resize(size);
-      std::fill(data.begin(), data.end(), e);
-    }
-    /*! \brief fill the vector with sparse vector */
-    inline void Fill(const RowBatch::Inst &inst) {
-      for (bst_uint i = 0; i < inst.length; ++i) {
-        if (inst[i].index >= data.size()) continue;
-        data[inst[i].index].fvalue = inst[i].fvalue;
-      }
-    }
-    /*! \brief drop the trace after fill, must be called after fill */
-    inline void Drop(const RowBatch::Inst &inst) {
-      for (bst_uint i = 0; i < inst.length; ++i) {
-        if (inst[i].index >= data.size()) continue;
-        data[inst[i].index].flag = -1;
-      }
-    }
-    /*! \brief get ith value */
-    inline float fvalue(size_t i) const {
-      return data[i].fvalue;
-    }
-    /*! \brief check whether i-th entry is missing */
-    inline bool is_missing(size_t i) const {
-      return data[i].flag == -1;
-    }
-  };
-  /*!
-   * \brief get the leaf index
-   * \param feat dense feature vector, if the feature is missing the field is set to NaN
-   * \param root_id starting root index of the instance
-   * \return the leaf index of the given feature
-   */
-  inline int GetLeafIndex(const FVec &feat, unsigned root_id = 0) const {
-    // start from groups that belongs to current data
-    int pid = static_cast<int>(root_id);
-    // traverse tree
-    while (!(*this)[ pid ].is_leaf()) {
-      unsigned split_index = (*this)[pid].split_index();
-      pid = this->GetNext(pid, feat.fvalue(split_index), feat.is_missing(split_index));
-    }
-    return pid;
-  }
-  /*!
-   * \brief get the prediction of regression tree, only accepts dense feature vector
-   * \param feats dense feature vector, if the feature is missing the field is set to NaN
-   * \param root_id starting root index of the instance
-   * \return the leaf index of the given feature
-   */
-  inline float Predict(const FVec &feat, unsigned root_id = 0) const {
-    int pid = this->GetLeafIndex(feat, root_id);
-    return (*this)[pid].leaf_value();
-  }
-  /*! \brief get next position of the tree given current pid */
-  inline int GetNext(int pid, float fvalue, bool is_unknown) const {
-    float split_value = (*this)[pid].split_cond();
-    if (is_unknown) {
-      return (*this)[pid].cdefault();
-    } else {
-      if (fvalue < split_value) {
-        return (*this)[pid].cleft();
-      } else {
-        return (*this)[pid].cright();
-      }
-    }
-  }
-};
-
-}  // namespace tree
-}  // namespace xgboost
-#endif  // XGBOOST_TREE_MODEL_H_
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -1,429 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file param.h
- * \brief training parameters, statistics used to support tree construction
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_TREE_PARAM_H_
-#define XGBOOST_TREE_PARAM_H_
-
-#include <vector>
-#include <cstring>
-#include "../data.h"
-
-namespace xgboost {
-namespace tree {
-
-/*! \brief training parameters for regression tree */
-struct TrainParam{
-  // learning step size for a time
-  float learning_rate;
-  // minimum loss change required for a split
-  float min_split_loss;
-  // maximum depth of a tree
-  int max_depth;
-  //----- the rest parameters are less important ----
-  // minimum amount of hessian(weight) allowed in a child
-  float min_child_weight;
-  // L2 regularization factor
-  float reg_lambda;
-  // L1 regularization factor
-  float reg_alpha;
-  // default direction choice
-  int default_direction;
-  // maximum delta update we can add in weight estimation
-  // this parameter can be used to stabilize update
-  // default=0 means no constraint on weight delta
-  float max_delta_step;
-  // whether we want to do subsample
-  float subsample;
-  // whether to subsample columns each split, in each level
-  float colsample_bylevel;
-  // whether to subsample columns during tree construction
-  float colsample_bytree;
-  // speed optimization for dense column
-  float opt_dense_col;
-  // accuracy of sketch
-  float sketch_eps;
-  // accuracy of sketch
-  float sketch_ratio;
-  // leaf vector size
-  int size_leaf_vector;
-  // option for parallelization
-  int parallel_option;
-  // option to open cacheline optimization
-  int cache_opt;
-  // number of threads to be used for tree construction,
-  // if OpenMP is enabled, if equals 0, use system default
-  int nthread;
-  /*! \brief constructor */
-  TrainParam(void) {
-    learning_rate = 0.3f;
-    min_split_loss = 0.0f;
-    min_child_weight = 1.0f;
-    max_delta_step = 0.0f;
-    max_depth = 6;
-    reg_lambda = 1.0f;
-    reg_alpha = 0.0f;
-    default_direction = 0;
-    subsample = 1.0f;
-    colsample_bytree = 1.0f;
-    colsample_bylevel = 1.0f;
-    opt_dense_col = 1.0f;
-    nthread = 0;
-    size_leaf_vector = 0;
-    // enforce parallel option to 0 for now, investigate the other strategy
-    parallel_option = 0;
-    sketch_eps = 0.1f;
-    sketch_ratio = 2.0f;
-    cache_opt = 1;
-  }
-  /*!
-   * \brief set parameters from outside
-   * \param name name of the parameter
-   * \param val  value of the parameter
-   */
-  inline void SetParam(const char *name, const char *val) {
-    using namespace std;
-    // sync-names
-    if (!strcmp(name, "gamma")) min_split_loss = static_cast<float>(atof(val));
-    if (!strcmp(name, "eta")) learning_rate = static_cast<float>(atof(val));
-    if (!strcmp(name, "lambda")) reg_lambda = static_cast<float>(atof(val));
-    if (!strcmp(name, "alpha")) reg_alpha = static_cast<float>(atof(val));
-    if (!strcmp(name, "learning_rate")) learning_rate = static_cast<float>(atof(val));
-    if (!strcmp(name, "min_child_weight")) min_child_weight = static_cast<float>(atof(val));
-    if (!strcmp(name, "min_split_loss")) min_split_loss = static_cast<float>(atof(val));
-    if (!strcmp(name, "max_delta_step")) max_delta_step = static_cast<float>(atof(val));
-    if (!strcmp(name, "reg_lambda")) reg_lambda = static_cast<float>(atof(val));
-    if (!strcmp(name, "reg_alpha")) reg_alpha = static_cast<float>(atof(val));
-    if (!strcmp(name, "subsample")) subsample = static_cast<float>(atof(val));
-    if (!strcmp(name, "colsample_bylevel")) colsample_bylevel = static_cast<float>(atof(val));
-    if (!strcmp(name, "colsample_bytree")) colsample_bytree  = static_cast<float>(atof(val));
-    if (!strcmp(name, "sketch_eps")) sketch_eps  = static_cast<float>(atof(val));
-    if (!strcmp(name, "sketch_ratio")) sketch_ratio  = static_cast<float>(atof(val));
-    if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val));
-    if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val);
-    if (!strcmp(name, "cache_opt")) cache_opt = atoi(val);
-    if (!strcmp(name, "max_depth")) max_depth = atoi(val);
-    if (!strcmp(name, "nthread")) nthread = atoi(val);
-    if (!strcmp(name, "parallel_option")) parallel_option = atoi(val);
-    if (!strcmp(name, "default_direction")) {
-      if (!strcmp(val, "learn")) default_direction = 0;
-      if (!strcmp(val, "left")) default_direction = 1;
-      if (!strcmp(val, "right")) default_direction = 2;
-    }
-  }
-  // calculate the cost of loss function
-  inline double CalcGain(double sum_grad, double sum_hess) const {
-    if (sum_hess < min_child_weight) return 0.0;
-    if (max_delta_step == 0.0f) {
-      if (reg_alpha == 0.0f) {
-        return Sqr(sum_grad) / (sum_hess + reg_lambda);
-      } else {
-        return Sqr(ThresholdL1(sum_grad, reg_alpha)) / (sum_hess + reg_lambda);
-      }
-    } else {
-      double w = CalcWeight(sum_grad, sum_hess);
-      double ret = sum_grad * w + 0.5 * (sum_hess + reg_lambda) * Sqr(w);
-      if (reg_alpha == 0.0f) {
-        return - 2.0 * ret;
-      } else {
-        return - 2.0 * (ret + reg_alpha * std::abs(w));
-      }
-    }
-  }
-  // calculate cost of loss function with four statistics
-  inline double CalcGain(double sum_grad, double sum_hess,
-                         double test_grad, double test_hess) const {
-    double w = CalcWeight(sum_grad, sum_hess);
-    double ret = test_grad * w  + 0.5 * (test_hess + reg_lambda) * Sqr(w);
-    if (reg_alpha == 0.0f) {
-      return - 2.0 * ret;
-    } else {
-      return - 2.0 * (ret + reg_alpha * std::abs(w));
-    }
-  }
-  // calculate weight given the statistics
-  inline double CalcWeight(double sum_grad, double sum_hess) const {
-    if (sum_hess < min_child_weight) return 0.0;
-    double dw;
-    if (reg_alpha == 0.0f) {
-      dw = -sum_grad / (sum_hess + reg_lambda);
-    } else {
-      dw = -ThresholdL1(sum_grad, reg_alpha) / (sum_hess + reg_lambda);
-    }
-    if (max_delta_step != 0.0f) {
-      if (dw > max_delta_step) dw = max_delta_step;
-      if (dw < -max_delta_step) dw = -max_delta_step;
-    }
-    return dw;
-  }
-  /*! \brief whether need forward small to big search: default right */
-  inline bool need_forward_search(float col_density, bool indicator) const {
-    return this->default_direction == 2 ||
-        (default_direction == 0 && (col_density < opt_dense_col) && !indicator);
-  }
-  /*! \brief whether need backward big to small search: default left */
-  inline bool need_backward_search(float col_density, bool indicator) const {
-    return this->default_direction != 2;
-  }
-  /*! \brief given the loss change, whether we need to invoke pruning */
-  inline bool need_prune(double loss_chg, int depth) const {
-    return loss_chg < this->min_split_loss;
-  }
-  /*! \brief whether we can split with current hessian */
-  inline bool cannot_split(double sum_hess, int depth) const {
-    return sum_hess < this->min_child_weight * 2.0;
-  }
-  /*! \brief maximum sketch size */
-  inline unsigned max_sketch_size(void) const {
-    unsigned ret = static_cast<unsigned>(sketch_ratio / sketch_eps);
-    utils::Check(ret > 0, "sketch_ratio/sketch_eps must be bigger than 1");
-    return ret;
-  }
-
- protected:
-  // functions for L1 cost
-  inline static double ThresholdL1(double w, double lambda) {
-    if (w > +lambda) return w - lambda;
-    if (w < -lambda) return w + lambda;
-    return 0.0;
-  }
-  inline static double Sqr(double a) {
-    return a * a;
-  }
-};
-
-/*! \brief core statistics used for tree construction */
-struct GradStats {
-  /*! \brief sum gradient statistics */
-  double sum_grad;
-  /*! \brief sum hessian statistics */
-  double sum_hess;
-  /*!
-   * \brief whether this is simply statistics and we only need to call
-   *   Add(gpair), instead of Add(gpair, info, ridx)
-   */
-  static const int kSimpleStats = 1;
-  /*! \brief constructor, the object must be cleared during construction */
-  explicit GradStats(const TrainParam &param) {
-    this->Clear();
-  }
-  /*! \brief clear the statistics */
-  inline void Clear(void) {
-    sum_grad = sum_hess = 0.0f;
-  }
-  /*! \brief check if necessary information is ready */
-  inline static void CheckInfo(const BoosterInfo &info) {
-  }
-  /*!
-   * \brief accumulate statistics
-   * \param p the gradient pair
-   */
-  inline void Add(bst_gpair p) {
-    this->Add(p.grad, p.hess);
-  }
-  /*!
-   * \brief accumulate statistics, more complicated version
-   * \param gpair the vector storing the gradient statistics
-   * \param info the additional information
-   * \param ridx instance index of this instance
-   */
-  inline void Add(const std::vector<bst_gpair> &gpair,
-                  const BoosterInfo &info,
-                  bst_uint ridx) {
-    const bst_gpair &b = gpair[ridx];
-    this->Add(b.grad, b.hess);
-  }
-  /*! \brief calculate leaf weight */
-  inline double CalcWeight(const TrainParam &param) const {
-    return param.CalcWeight(sum_grad, sum_hess);
-  }
-  /*! \brief calculate gain of the solution */
-  inline double CalcGain(const TrainParam &param) const {
-    return param.CalcGain(sum_grad, sum_hess);
-  }
-  /*! \brief add statistics to the data */
-  inline void Add(const GradStats &b) {
-    this->Add(b.sum_grad, b.sum_hess);
-  }
-  /*! \brief same as add, reduce is used in All Reduce */
-  inline static void Reduce(GradStats &a, const GradStats &b) { // NOLINT(*)
-    a.Add(b);
-  }
-  /*! \brief set current value to a - b */
-  inline void SetSubstract(const GradStats &a, const GradStats &b) {
-    sum_grad = a.sum_grad - b.sum_grad;
-    sum_hess = a.sum_hess - b.sum_hess;
-  }
-  /*! \return whether the statistics is not used yet */
-  inline bool Empty(void) const {
-    return sum_hess == 0.0;
-  }
-  /*! \brief set leaf vector value based on statistics */
-  inline void SetLeafVec(const TrainParam &param, bst_float *vec) const {
-  }
-  // constructor to allow inheritance
-  GradStats(void) {}
-  /*! \brief add statistics to the data */
-  inline void Add(double grad, double hess) {
-    sum_grad += grad; sum_hess += hess;
-  }
-};
-
-/*! \brief vectorized cv statistics */
-template<unsigned vsize>
-struct CVGradStats : public GradStats {
-  // additional statistics
-  GradStats train[vsize], valid[vsize];
-  // constructor
-  explicit CVGradStats(const TrainParam &param) {
-    utils::Check(param.size_leaf_vector == vsize,
-                 "CVGradStats: vsize must match size_leaf_vector");
-    this->Clear();
-  }
-  /*! \brief check if necessary information is ready */
-  inline static void CheckInfo(const BoosterInfo &info) {
-    utils::Check(info.fold_index.size() != 0,
-                 "CVGradStats: require fold_index");
-  }
-  /*! \brief clear the statistics */
-  inline void Clear(void) {
-    GradStats::Clear();
-    for (unsigned i = 0; i < vsize; ++i) {
-      train[i].Clear(); valid[i].Clear();
-    }
-  }
-  inline void Add(const std::vector<bst_gpair> &gpair,
-                  const BoosterInfo &info,
-                  bst_uint ridx) {
-    GradStats::Add(gpair[ridx].grad, gpair[ridx].hess);
-    const size_t step = info.fold_index.size();
-    for (unsigned i = 0; i < vsize; ++i) {
-      const bst_gpair &b = gpair[(i + 1) * step + ridx];
-      if (info.fold_index[ridx] == i) {
-        valid[i].Add(b.grad, b.hess);
-      } else {
-        train[i].Add(b.grad, b.hess);
-      }
-    }
-  }
-  /*! \brief calculate gain of the solution */
-  inline double CalcGain(const TrainParam &param) const {
-    double ret = 0.0;
-    for (unsigned i = 0; i < vsize; ++i) {
-      ret += param.CalcGain(train[i].sum_grad,
-                            train[i].sum_hess,
-                            vsize * valid[i].sum_grad,
-                            vsize * valid[i].sum_hess);
-    }
-    return ret / vsize;
-  }
-  /*! \brief add statistics to the data */
-  inline void Add(const CVGradStats &b) {
-    GradStats::Add(b);
-    for (unsigned i = 0; i < vsize; ++i) {
-      train[i].Add(b.train[i]);
-      valid[i].Add(b.valid[i]);
-    }
-  }
-  /*! \brief same as add, reduce is used in All Reduce */
-  inline static void Reduce(CVGradStats &a, const CVGradStats &b) { // NOLINT(*)
-    a.Add(b);
-  }
-  /*! \brief set current value to a - b */
-  inline void SetSubstract(const CVGradStats &a, const CVGradStats &b) {
-    GradStats::SetSubstract(a, b);
-    for (int i = 0; i < vsize; ++i) {
-      train[i].SetSubstract(a.train[i], b.train[i]);
-      valid[i].SetSubstract(a.valid[i], b.valid[i]);
-    }
-  }
-  /*! \brief set leaf vector value based on statistics */
-  inline void SetLeafVec(const TrainParam &param, bst_float *vec) const{
-    for (int i = 0; i < vsize; ++i) {
-      vec[i] = param.learning_rate *
-          param.CalcWeight(train[i].sum_grad, train[i].sum_hess);
-    }
-  }
-};
-
-/*!
- * \brief statistics that is helpful to store
- *   and represent a split solution for the tree
- */
-struct SplitEntry{
-  /*! \brief loss change after split this node */
-  bst_float loss_chg;
-  /*! \brief split index */
-  unsigned sindex;
-  /*! \brief split value */
-  float split_value;
-  /*! \brief constructor */
-  SplitEntry(void) : loss_chg(0.0f), sindex(0), split_value(0.0f) {}
-  /*!
-   * \brief decides whether we can replace current entry with the given statistics
-   *   This function gives better priority to lower index when loss_chg == new_loss_chg.
-   *   Not the best way, but helps to give consistent result during multi-thread execution.
-   * \param new_loss_chg the loss reduction get through the split
-   * \param split_index the feature index where the split is on
-   */
-  inline bool NeedReplace(bst_float new_loss_chg, unsigned split_index) const {
-    if (this->split_index() <= split_index) {
-      return new_loss_chg > this->loss_chg;
-    } else {
-      return !(this->loss_chg > new_loss_chg);
-    }
-  }
-  /*!
-   * \brief update the split entry, replace it if e is better
-   * \param e candidate split solution
-   * \return whether the proposed split is better and can replace current split
-   */
-  inline bool Update(const SplitEntry &e) {
-    if (this->NeedReplace(e.loss_chg, e.split_index())) {
-      this->loss_chg = e.loss_chg;
-      this->sindex = e.sindex;
-      this->split_value = e.split_value;
-      return true;
-    } else {
-      return false;
-    }
-  }
-  /*!
-   * \brief update the split entry, replace it if e is better
-   * \param new_loss_chg loss reduction of new candidate
-   * \param split_index feature index to split on
-   * \param new_split_value the split point
-   * \param default_left whether the missing value goes to left
-   * \return whether the proposed split is better and can replace current split
-   */
-  inline bool Update(bst_float new_loss_chg, unsigned split_index,
-                     float new_split_value, bool default_left) {
-    if (this->NeedReplace(new_loss_chg, split_index)) {
-      this->loss_chg = new_loss_chg;
-      if (default_left) split_index |= (1U << 31);
-      this->sindex = split_index;
-      this->split_value = new_split_value;
-      return true;
-    } else {
-      return false;
-    }
-  }
-  /*! \brief same as update, used by AllReduce*/
-  inline static void Reduce(SplitEntry &dst, const SplitEntry &src) { // NOLINT(*)
-    dst.Update(src);
-  }
-  /*!\return feature index to split on */
-  inline unsigned split_index(void) const {
-    return sindex & ((1U << 31) - 1U);
-  }
-  /*!\return whether missing value goes to left branch */
-  inline bool default_left(void) const {
-    return (sindex >> 31) != 0;
-  }
-};
-
-}  // namespace tree
-}  // namespace xgboost
-#endif  // XGBOOST_TREE_PARAM_H_
--- a/src/tree/updater.cpp
+++ b/src/tree/updater.cpp
@@ -1,35 +0,0 @@
-// Copyright 2014 by Contributors
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#define NOMINMAX
-#include <cstring>
-#include "./updater.h"
-#include "./updater_prune-inl.hpp"
-#include "./updater_refresh-inl.hpp"
-#include "./updater_colmaker-inl.hpp"
-#ifndef XGBOOST_STRICT_CXX98_
-#include "./updater_sync-inl.hpp"
-#include "./updater_distcol-inl.hpp"
-#include "./updater_histmaker-inl.hpp"
-#include "./updater_skmaker-inl.hpp"
-#endif
-
-namespace xgboost {
-namespace tree {
-IUpdater* CreateUpdater(const char *name) {
-  using namespace std;
-  if (!strcmp(name, "prune")) return new TreePruner();
-  if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
-  if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
-#ifndef XGBOOST_STRICT_CXX98_
-  if (!strcmp(name, "sync")) return new TreeSyncher();
-  if (!strcmp(name, "grow_histmaker")) return new CQHistMaker<GradStats>();
-  if (!strcmp(name, "grow_skmaker")) return new SketchMaker();
-  if (!strcmp(name, "distcol")) return new DistColMaker<GradStats>();
-#endif
-  utils::Error("unknown updater:%s", name);
-  return NULL;
-}
-
-}  // namespace tree
-}  // namespace xgboost
--- a/src/tree/updater.h
+++ b/src/tree/updater.h
@@ -1,63 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file updater.h
- * \brief interface to update the tree
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_TREE_UPDATER_H_
-#define XGBOOST_TREE_UPDATER_H_
-
-#include <vector>
-
-#include "../data.h"
-#include "./model.h"
-
-namespace xgboost {
-namespace tree {
-/*!
- * \brief interface of tree update module, that performs update of a tree
- */
-class IUpdater {
- public:
-  /*!
-   * \brief set parameters from outside
-   * \param name name of the parameter
-   * \param val  value of the parameter
-   */
-  virtual void SetParam(const char *name, const char *val) = 0;
-  /*!
-   * \brief perform update to the tree models
-   * \param gpair the gradient pair statistics of the data
-   * \param p_fmat feature matrix that provide access to features
-   * \param info extra side information that may be need, such as root index
-   * \param trees references the trees to be updated, updater will change the content of trees
-   *   note: all the trees in the vector are updated, with the same statistics,
-   *         but maybe different random seeds, usually one tree is passed in at a time,
-   *         there can be multiple trees when we train random forest style model
-   */
-  virtual void Update(const std::vector<bst_gpair> &gpair,
-                      IFMatrix *p_fmat,
-                      const BoosterInfo &info,
-                      const std::vector<RegTree*> &trees) = 0;
-
-  /*!
-   * \brief this is simply a function for optimizing performance
-   * this function asks the updater to return the leaf position of each instance in the p_fmat,
-   * if it is cached in the updater, if it is not available, return NULL
-   * \return array of leaf position of each instance in the last updated tree
-   */
-  virtual const int* GetLeafPosition(void) const {
-    return NULL;
-  }
-  // destructor
-  virtual ~IUpdater(void) {}
-};
-/*!
- * \brief create an updater based on name
- * \param name name of updater
- * \return return the updater instance
- */
-IUpdater* CreateUpdater(const char *name);
-}  // namespace tree
-}  // namespace xgboost
-#endif  // XGBOOST_TREE_UPDATER_H_
--- a/src/tree/updater_basemaker-inl.hpp
+++ b/src/tree/updater_basemaker-inl.hpp
@@ -1,427 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file updater_basemaker-inl.hpp
- * \brief implement a common tree constructor
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
-#define XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
-#include <vector>
-#include <algorithm>
-#include <string>
-#include <limits>
-#include "../sync/sync.h"
-#include "../utils/random.h"
-#include "../utils/quantile.h"
-
-namespace xgboost {
-namespace tree {
-/*!
- * \brief base tree maker class that defines common operation
- *  needed in tree making
- */
-class BaseMaker: public IUpdater {
- public:
-  // destructor
-  virtual ~BaseMaker(void) {}
-  // set training parameter
-  virtual void SetParam(const char *name, const char *val) {
-    param.SetParam(name, val);
-  }
-
- protected:
-  // helper to collect and query feature meta information
-  struct FMetaHelper {
-   public:
-    /*! \brief find type of each feature, use column format */
-    inline void InitByCol(IFMatrix *p_fmat,
-                          const RegTree &tree) {
-      fminmax.resize(tree.param.num_feature * 2);
-      std::fill(fminmax.begin(), fminmax.end(),
-                -std::numeric_limits<bst_float>::max());
-      // start accumulating statistics
-      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
-      iter->BeforeFirst();
-      while (iter->Next()) {
-        const ColBatch &batch = iter->Value();
-        for (bst_uint i = 0; i < batch.size; ++i) {
-          const bst_uint fid = batch.col_index[i];
-          const ColBatch::Inst &c = batch[i];
-          if (c.length != 0) {
-            fminmax[fid * 2 + 0] = std::max(-c[0].fvalue, fminmax[fid * 2 + 0]);
-            fminmax[fid * 2 + 1] = std::max(c[c.length - 1].fvalue, fminmax[fid * 2 + 1]);
-          }
-        }
-      }
-      rabit::Allreduce<rabit::op::Max>(BeginPtr(fminmax), fminmax.size());
-    }
-    // get feature type, 0:empty 1:binary 2:real
-    inline int Type(bst_uint fid) const {
-      utils::Assert(fid * 2 + 1 < fminmax.size(),
-                    "FeatHelper fid exceed query bound ");
-      bst_float a = fminmax[fid * 2];
-      bst_float b = fminmax[fid * 2 + 1];
-      if (a == -std::numeric_limits<bst_float>::max()) return 0;
-      if (-a == b) {
-        return 1;
-      } else {
-        return 2;
-      }
-    }
-    inline bst_float MaxValue(bst_uint fid) const {
-      return fminmax[fid *2 + 1];
-    }
-    inline void SampleCol(float p, std::vector<bst_uint> *p_findex) const {
-      std::vector<bst_uint> &findex = *p_findex;
-      findex.clear();
-      for (size_t i = 0; i < fminmax.size(); i += 2) {
-        const bst_uint fid = static_cast<bst_uint>(i / 2);
-        if (this->Type(fid) != 0) findex.push_back(fid);
-      }
-      unsigned n = static_cast<unsigned>(p * findex.size());
-      random::Shuffle(findex);
-      findex.resize(n);
-      // sync the findex if it is subsample
-      std::string s_cache;
-      utils::MemoryBufferStream fc(&s_cache);
-      utils::IStream &fs = fc;
-      if (rabit::GetRank() == 0) {
-        fs.Write(findex);
-      }
-      rabit::Broadcast(&s_cache, 0);
-      fs.Read(&findex);
-    }
-
-   private:
-    std::vector<bst_float> fminmax;
-  };
-  // ------static helper functions ------
-  // helper function to get to next level of the tree
-  /*! \brief this is  helper function for row based data*/
-  inline static int NextLevel(const RowBatch::Inst &inst, const RegTree &tree, int nid) {
-    const RegTree::Node &n = tree[nid];
-    bst_uint findex = n.split_index();
-    for (unsigned i = 0; i < inst.length; ++i) {
-      if (findex == inst[i].index) {
-        if (inst[i].fvalue < n.split_cond()) {
-          return n.cleft();
-        } else {
-          return n.cright();
-        }
-      }
-    }
-    return n.cdefault();
-  }
-  /*! \brief get number of omp thread in current context */
-  inline static int get_nthread(void) {
-    int nthread;
-    #pragma omp parallel
-    {
-      nthread = omp_get_num_threads();
-    }
-    return nthread;
-  }
-  //  ------class member helpers---------
-  /*! \brief initialize temp data structure */
-  inline void InitData(const std::vector<bst_gpair> &gpair,
-                       const IFMatrix &fmat,
-                       const std::vector<unsigned> &root_index,
-                       const RegTree &tree) {
-    utils::Assert(tree.param.num_nodes == tree.param.num_roots,
-                  "TreeMaker: can only grow new tree");
-    {
-      // setup position
-      position.resize(gpair.size());
-      if (root_index.size() == 0) {
-        std::fill(position.begin(), position.end(), 0);
-      } else {
-        for (size_t i = 0; i < position.size(); ++i) {
-          position[i] = root_index[i];
-          utils::Assert(root_index[i] < (unsigned)tree.param.num_roots,
-                        "root index exceed setting");
-        }
-      }
-      // mark delete for the deleted datas
-      for (size_t i = 0; i < position.size(); ++i) {
-        if (gpair[i].hess < 0.0f) position[i] = ~position[i];
-      }
-      // mark subsample
-      if (param.subsample < 1.0f) {
-        for (size_t i = 0; i < position.size(); ++i) {
-          if (gpair[i].hess < 0.0f) continue;
-          if (random::SampleBinary(param.subsample) == 0) position[i] = ~position[i];
-        }
-      }
-    }
-    {
-      // expand query
-      qexpand.reserve(256); qexpand.clear();
-      for (int i = 0; i < tree.param.num_roots; ++i) {
-        qexpand.push_back(i);
-      }
-      this->UpdateNode2WorkIndex(tree);
-    }
-  }
-  /*! \brief update queue expand add in new leaves */
-  inline void UpdateQueueExpand(const RegTree &tree) {
-    std::vector<int> newnodes;
-    for (size_t i = 0; i < qexpand.size(); ++i) {
-      const int nid = qexpand[i];
-      if (!tree[nid].is_leaf()) {
-        newnodes.push_back(tree[nid].cleft());
-        newnodes.push_back(tree[nid].cright());
-      }
-    }
-    // use new nodes for qexpand
-    qexpand = newnodes;
-    this->UpdateNode2WorkIndex(tree);
-  }
-  // return decoded position
-  inline int DecodePosition(bst_uint ridx) const {
-    const int pid = position[ridx];
-    return pid < 0 ? ~pid : pid;
-  }
-  // encode the encoded position value for ridx
-  inline void SetEncodePosition(bst_uint ridx, int nid) {
-    if (position[ridx] < 0) {
-      position[ridx] = ~nid;
-    } else {
-      position[ridx] = nid;
-    }
-  }
-  /*!
-   * \brief this is helper function uses column based data structure,
-   *        reset the positions to the lastest one
-   * \param nodes the set of nodes that contains the split to be used
-   * \param p_fmat feature matrix needed for tree construction
-   * \param tree the regression tree structure
-   */
-  inline void ResetPositionCol(const std::vector<int> &nodes,
-                               IFMatrix *p_fmat, const RegTree &tree) {
-    // set the positions in the nondefault
-    this->SetNonDefaultPositionCol(nodes, p_fmat, tree);
-    // set rest of instances to default position
-    const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
-    // set default direct nodes to default
-    // for leaf nodes that are not fresh, mark then to ~nid,
-    // so that they are ignored in future statistics collection
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
-
-    #pragma omp parallel for schedule(static)
-    for (bst_omp_uint i = 0; i < ndata; ++i) {
-      const bst_uint ridx = rowset[i];
-      const int nid = this->DecodePosition(ridx);
-      if (tree[nid].is_leaf()) {
-        // mark finish when it is not a fresh leaf
-        if (tree[nid].cright() == -1) {
-          position[ridx] = ~nid;
-        }
-        } else {
-        // push to default branch
-        if (tree[nid].default_left()) {
-          this->SetEncodePosition(ridx, tree[nid].cleft());
-        } else {
-          this->SetEncodePosition(ridx, tree[nid].cright());
-        }
-      }
-    }
-  }
-  /*!
-   * \brief this is helper function uses column based data structure,
-   *        update all positions into nondefault branch, if any, ignore the default branch
-   * \param nodes the set of nodes that contains the split to be used
-   * \param p_fmat feature matrix needed for tree construction
-   * \param tree the regression tree structure
-   */
-  virtual void SetNonDefaultPositionCol(const std::vector<int> &nodes,
-                                        IFMatrix *p_fmat, const RegTree &tree) {
-    // step 1, classify the non-default data into right places
-    std::vector<unsigned> fsplits;
-    for (size_t i = 0; i < nodes.size(); ++i) {
-      const int nid = nodes[i];
-      if (!tree[nid].is_leaf()) {
-        fsplits.push_back(tree[nid].split_index());
-      }
-    }
-    std::sort(fsplits.begin(), fsplits.end());
-    fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
-
-    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
-    while (iter->Next()) {
-      const ColBatch &batch = iter->Value();
-      for (size_t i = 0; i < batch.size; ++i) {
-        ColBatch::Inst col = batch[i];
-        const bst_uint fid = batch.col_index[i];
-        const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
-        #pragma omp parallel for schedule(static)
-        for (bst_omp_uint j = 0; j < ndata; ++j) {
-          const bst_uint ridx = col[j].index;
-          const float fvalue = col[j].fvalue;
-          const int nid = this->DecodePosition(ridx);
-          // go back to parent, correct those who are not default
-          if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
-            if (fvalue < tree[nid].split_cond()) {
-              this->SetEncodePosition(ridx, tree[nid].cleft());
-            } else {
-              this->SetEncodePosition(ridx, tree[nid].cright());
-            }
-          }
-        }
-      }
-    }
-  }
-  /*! \brief helper function to get statistics from a tree */
-  template<typename TStats>
-  inline void GetNodeStats(const std::vector<bst_gpair> &gpair,
-                           const IFMatrix &fmat,
-                           const RegTree &tree,
-                           const BoosterInfo &info,
-                           std::vector< std::vector<TStats> > *p_thread_temp,
-                           std::vector<TStats> *p_node_stats) {
-    std::vector< std::vector<TStats> > &thread_temp = *p_thread_temp;
-    thread_temp.resize(this->get_nthread());
-    p_node_stats->resize(tree.param.num_nodes);
-    #pragma omp parallel
-    {
-      const int tid = omp_get_thread_num();
-      thread_temp[tid].resize(tree.param.num_nodes, TStats(param));
-      for (size_t i = 0; i < qexpand.size(); ++i) {
-        const unsigned nid = qexpand[i];
-        thread_temp[tid][nid].Clear();
-      }
-    }
-    const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
-    // setup position
-    const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
-    #pragma omp parallel for schedule(static)
-    for (bst_omp_uint i = 0; i < ndata; ++i) {
-      const bst_uint ridx = rowset[i];
-      const int nid = position[ridx];
-      const int tid = omp_get_thread_num();
-      if (nid >= 0) {
-        thread_temp[tid][nid].Add(gpair, info, ridx);
-      }
-    }
-    // sum the per thread statistics together
-    for (size_t j = 0; j < qexpand.size(); ++j) {
-      const int nid = qexpand[j];
-      TStats &s = (*p_node_stats)[nid];
-      s.Clear();
-      for (size_t tid = 0; tid < thread_temp.size(); ++tid) {
-        s.Add(thread_temp[tid][nid]);
-      }
-    }
-  }
-  /*! \brief common helper data structure to build sketch */
-  struct SketchEntry {
-    /*! \brief total sum of amount to be met */
-    double sum_total;
-    /*! \brief statistics used in the sketch */
-    double rmin, wmin;
-    /*! \brief last seen feature value */
-    bst_float last_fvalue;
-    /*! \brief current size of sketch */
-    double next_goal;
-    // pointer to the sketch to put things in
-    utils::WXQuantileSketch<bst_float, bst_float> *sketch;
-    // initialize the space
-    inline void Init(unsigned max_size) {
-      next_goal = -1.0f;
-      rmin = wmin = 0.0f;
-      sketch->temp.Reserve(max_size + 1);
-      sketch->temp.size = 0;
-    }
-    /*!
-     * \brief push a new element to sketch
-     * \param fvalue feature value, comes in sorted ascending order
-     * \param w weight
-     * \param max_size
-     */
-    inline void Push(bst_float fvalue, bst_float w, unsigned max_size) {
-      if (next_goal == -1.0f) {
-        next_goal = 0.0f;
-        last_fvalue = fvalue;
-        wmin = w;
-        return;
-      }
-      if (last_fvalue != fvalue) {
-        double rmax = rmin + wmin;
-        if (rmax >= next_goal && sketch->temp.size != max_size) {
-          if (sketch->temp.size == 0 ||
-              last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
-            // push to sketch
-            sketch->temp.data[sketch->temp.size] =
-                utils::WXQuantileSketch<bst_float, bst_float>::
-                Entry(static_cast<bst_float>(rmin),
-                      static_cast<bst_float>(rmax),
-                      static_cast<bst_float>(wmin), last_fvalue);
-            utils::Assert(sketch->temp.size < max_size,
-                          "invalid maximum size max_size=%u, stemp.size=%lu\n",
-                          max_size, sketch->temp.size);
-            ++sketch->temp.size;
-          }
-          if (sketch->temp.size == max_size) {
-            next_goal = sum_total * 2.0f + 1e-5f;
-          } else {
-            next_goal = static_cast<bst_float>(sketch->temp.size * sum_total / max_size);
-          }
-        } else {
-          if (rmax >= next_goal) {
-            rabit::TrackerPrintf("INFO: rmax=%g, sum_total=%g, next_goal=%g, size=%lu\n",
-                                 rmax, sum_total, next_goal, sketch->temp.size);
-          }
-        }
-        rmin = rmax;
-        wmin = w;
-        last_fvalue = fvalue;
-      } else {
-        wmin += w;
-      }
-    }
-    /*! \brief push final unfinished value to the sketch */
-    inline void Finalize(unsigned max_size) {
-      double rmax = rmin + wmin;
-      if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
-        utils::Assert(sketch->temp.size <= max_size,
-                      "Finalize: invalid maximum size, max_size=%u, stemp.size=%lu",
-                      sketch->temp.size, max_size);
-        // push to sketch
-        sketch->temp.data[sketch->temp.size] =
-            utils::WXQuantileSketch<bst_float, bst_float>::
-            Entry(static_cast<bst_float>(rmin),
-                  static_cast<bst_float>(rmax),
-                  static_cast<bst_float>(wmin), last_fvalue);
-        ++sketch->temp.size;
-      }
-      sketch->PushTemp();
-    }
-  };
-  /*! \brief training parameter of tree grower */
-  TrainParam param;
-  /*! \brief queue of nodes to be expanded */
-  std::vector<int> qexpand;
-  /*!
-   * \brief map active node to is working index offset in qexpand,
-   *   can be -1, which means the node is node actively expanding
-   */
-  std::vector<int> node2workindex;
-  /*!
-   * \brief position of each instance in the tree
-   *   can be negative, which means this position is no longer expanding
-   *   see also Decode/EncodePosition
-   */
-  std::vector<int> position;
-
- private:
-  inline void UpdateNode2WorkIndex(const RegTree &tree) {
-    // update the node2workindex
-    std::fill(node2workindex.begin(), node2workindex.end(), -1);
-    node2workindex.resize(tree.param.num_nodes);
-    for (size_t i = 0; i < qexpand.size(); ++i) {
-      node2workindex[qexpand[i]] = static_cast<int>(i);
-    }
-  }
-};
-}  // namespace tree
-}  // namespace xgboost
-#endif  // XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
--- a/src/tree/updater_colmaker-inl.hpp
+++ b/src/tree/updater_colmaker-inl.hpp
@@ -1,732 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file updater_colmaker-inl.hpp
- * \brief use columnwise update to construct a tree
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_TREE_UPDATER_COLMAKER_INL_HPP_
-#define XGBOOST_TREE_UPDATER_COLMAKER_INL_HPP_
-
-#include <vector>
-#include <cmath>
-#include <algorithm>
-#include "./param.h"
-#include "./updater.h"
-#include "../utils/omp.h"
-#include "../utils/random.h"
-
-namespace xgboost {
-namespace tree {
-/*! \brief column-wise update to construct a tree */
-template<typename TStats>
-class ColMaker: public IUpdater {
- public:
-  virtual ~ColMaker(void) {}
-  // set training parameter
-  virtual void SetParam(const char *name, const char *val) {
-    param.SetParam(name, val);
-  }
-  virtual void Update(const std::vector<bst_gpair> &gpair,
-                      IFMatrix *p_fmat,
-                      const BoosterInfo &info,
-                      const std::vector<RegTree*> &trees) {
-    TStats::CheckInfo(info);
-    // rescale learning rate according to size of trees
-    float lr = param.learning_rate;
-    param.learning_rate = lr / trees.size();
-    // build tree
-    for (size_t i = 0; i < trees.size(); ++i) {
-      Builder builder(param);
-      builder.Update(gpair, p_fmat, info, trees[i]);
-    }
-
-    param.learning_rate = lr;
-  }
-
- protected:
-  // training parameter
-  TrainParam param;
-  // data structure
-  /*! \brief per thread x per node entry to store tmp data */
-  struct ThreadEntry {
-    /*! \brief statistics of data */
-    TStats stats;
-    /*! \brief extra statistics of data */
-    TStats stats_extra;
-    /*! \brief last feature value scanned */
-    float  last_fvalue;
-    /*! \brief first feature value scanned */
-    float  first_fvalue;
-    /*! \brief current best solution */
-    SplitEntry best;
-    // constructor
-    explicit ThreadEntry(const TrainParam &param)
-        : stats(param), stats_extra(param) {
-    }
-  };
-  struct NodeEntry {
-    /*! \brief statics for node entry */
-    TStats stats;
-    /*! \brief loss of this node, without split */
-    bst_float root_gain;
-    /*! \brief weight calculated related to current data */
-    float weight;
-    /*! \brief current best solution */
-    SplitEntry best;
-    // constructor
-    explicit NodeEntry(const TrainParam &param)
-        : stats(param), root_gain(0.0f), weight(0.0f){
-    }
-  };
-  // actual builder that runs the algorithm
-  struct Builder{
-   public:
-    // constructor
-    explicit Builder(const TrainParam &param) : param(param) {}
-    // update one tree, growing
-    virtual void Update(const std::vector<bst_gpair> &gpair,
-                        IFMatrix *p_fmat,
-                        const BoosterInfo &info,
-                        RegTree *p_tree) {
-      this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
-      this->InitNewNode(qexpand_, gpair, *p_fmat, info, *p_tree);
-      for (int depth = 0; depth < param.max_depth; ++depth) {
-        this->FindSplit(depth, qexpand_, gpair, p_fmat, info, p_tree);
-        this->ResetPosition(qexpand_, p_fmat, *p_tree);
-        this->UpdateQueueExpand(*p_tree, &qexpand_);
-        this->InitNewNode(qexpand_, gpair, *p_fmat, info, *p_tree);
-        // if nothing left to be expand, break
-        if (qexpand_.size() == 0) break;
-      }
-      // set all the rest expanding nodes to leaf
-      for (size_t i = 0; i < qexpand_.size(); ++i) {
-        const int nid = qexpand_[i];
-        (*p_tree)[nid].set_leaf(snode[nid].weight * param.learning_rate);
-      }
-      // remember auxiliary statistics in the tree node
-      for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) {
-        p_tree->stat(nid).loss_chg = snode[nid].best.loss_chg;
-        p_tree->stat(nid).base_weight = snode[nid].weight;
-        p_tree->stat(nid).sum_hess = static_cast<float>(snode[nid].stats.sum_hess);
-        snode[nid].stats.SetLeafVec(param, p_tree->leafvec(nid));
-      }
-    }
-
-   protected:
-    // initialize temp data structure
-    inline void InitData(const std::vector<bst_gpair> &gpair,
-                         const IFMatrix &fmat,
-                         const std::vector<unsigned> &root_index,
-                         const RegTree &tree) {
-      utils::Assert(tree.param.num_nodes == tree.param.num_roots,
-                    "ColMaker: can only grow new tree");
-      const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
-      {
-        // setup position
-        position.resize(gpair.size());
-        if (root_index.size() == 0) {
-          for (size_t i = 0; i < rowset.size(); ++i) {
-            position[rowset[i]] = 0;
-          }
-        } else {
-          for (size_t i = 0; i < rowset.size(); ++i) {
-            const bst_uint ridx = rowset[i];
-            position[ridx] = root_index[ridx];
-            utils::Assert(root_index[ridx] < (unsigned)tree.param.num_roots,
-                          "root index exceed setting");
-          }
-        }
-        // mark delete for the deleted datas
-        for (size_t i = 0; i < rowset.size(); ++i) {
-          const bst_uint ridx = rowset[i];
-          if (gpair[ridx].hess < 0.0f) position[ridx] = ~position[ridx];
-        }
-        // mark subsample
-        if (param.subsample < 1.0f) {
-          for (size_t i = 0; i < rowset.size(); ++i) {
-            const bst_uint ridx = rowset[i];
-            if (gpair[ridx].hess < 0.0f) continue;
-            if (random::SampleBinary(param.subsample) == 0) position[ridx] = ~position[ridx];
-          }
-        }
-      }
-      {
-        // initialize feature index
-        unsigned ncol = static_cast<unsigned>(fmat.NumCol());
-        for (unsigned i = 0; i < ncol; ++i) {
-          if (fmat.GetColSize(i) != 0) {
-            feat_index.push_back(i);
-          }
-        }
-        unsigned n = static_cast<unsigned>(param.colsample_bytree * feat_index.size());
-        random::Shuffle(feat_index);
-        utils::Check(n > 0, "colsample_bytree=%g is too small that no feature can be included",
-                     param.colsample_bytree);
-        feat_index.resize(n);
-      }
-      {
-        // setup temp space for each thread
-        #pragma omp parallel
-        {
-          this->nthread = omp_get_num_threads();
-        }
-        // reserve a small space
-        stemp.clear();
-        stemp.resize(this->nthread, std::vector<ThreadEntry>());
-        for (size_t i = 0; i < stemp.size(); ++i) {
-          stemp[i].clear(); stemp[i].reserve(256);
-        }
-        snode.reserve(256);
-      }
-      {
-        // expand query
-        qexpand_.reserve(256); qexpand_.clear();
-        for (int i = 0; i < tree.param.num_roots; ++i) {
-          qexpand_.push_back(i);
-        }
-      }
-    }
-    /*!
-     * \brief initialize the base_weight, root_gain,
-     *  and NodeEntry for all the new nodes in qexpand
-     */
-    inline void InitNewNode(const std::vector<int> &qexpand,
-                            const std::vector<bst_gpair> &gpair,
-                            const IFMatrix &fmat,
-                            const BoosterInfo &info,
-                            const RegTree &tree) {
-      {
-        // setup statistics space for each tree node
-        for (size_t i = 0; i < stemp.size(); ++i) {
-          stemp[i].resize(tree.param.num_nodes, ThreadEntry(param));
-        }
-        snode.resize(tree.param.num_nodes, NodeEntry(param));
-      }
-      const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
-      // setup position
-      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < ndata; ++i) {
-        const bst_uint ridx = rowset[i];
-        const int tid = omp_get_thread_num();
-        if (position[ridx] < 0) continue;
-        stemp[tid][position[ridx]].stats.Add(gpair, info, ridx);
-      }
-      // sum the per thread statistics together
-      for (size_t j = 0; j < qexpand.size(); ++j) {
-        const int nid = qexpand[j];
-        TStats stats(param);
-        for (size_t tid = 0; tid < stemp.size(); ++tid) {
-          stats.Add(stemp[tid][nid].stats);
-        }
-        // update node statistics
-        snode[nid].stats = stats;
-        snode[nid].root_gain = static_cast<float>(stats.CalcGain(param));
-        snode[nid].weight = static_cast<float>(stats.CalcWeight(param));
-      }
-    }
-    /*! \brief update queue expand add in new leaves */
-    inline void UpdateQueueExpand(const RegTree &tree, std::vector<int> *p_qexpand) {
-      std::vector<int> &qexpand = *p_qexpand;
-      std::vector<int> newnodes;
-      for (size_t i = 0; i < qexpand.size(); ++i) {
-        const int nid = qexpand[i];
-        if (!tree[ nid ].is_leaf()) {
-          newnodes.push_back(tree[nid].cleft());
-          newnodes.push_back(tree[nid].cright());
-        }
-      }
-      // use new nodes for qexpand
-      qexpand = newnodes;
-    }
-    // parallel find the best split of current fid
-    // this function does not support nested functions
-    inline void ParallelFindSplit(const ColBatch::Inst &col,
-                                  bst_uint fid,
-                                  const IFMatrix &fmat,
-                                  const std::vector<bst_gpair> &gpair,
-                                  const BoosterInfo &info) {
-      const bool ind = col.length != 0 && col.data[0].fvalue == col.data[col.length - 1].fvalue;
-      bool need_forward = param.need_forward_search(fmat.GetColDensity(fid), ind);
-      bool need_backward = param.need_backward_search(fmat.GetColDensity(fid), ind);
-      const std::vector<int> &qexpand = qexpand_;
-      #pragma omp parallel
-      {
-        const int tid = omp_get_thread_num();
-        std::vector<ThreadEntry> &temp = stemp[tid];
-        // cleanup temp statistics
-        for (size_t j = 0; j < qexpand.size(); ++j) {
-          temp[qexpand[j]].stats.Clear();
-        }
-        nthread = omp_get_num_threads();
-        bst_uint step = (col.length + nthread - 1) / nthread;
-        bst_uint end = std::min(col.length, step * (tid + 1));
-        for (bst_uint i = tid * step; i < end; ++i) {
-          const bst_uint ridx = col[i].index;
-          const int nid = position[ridx];
-          if (nid < 0) continue;
-          const float fvalue = col[i].fvalue;
-          if (temp[nid].stats.Empty()) {
-            temp[nid].first_fvalue = fvalue;
-          }
-          temp[nid].stats.Add(gpair, info, ridx);
-          temp[nid].last_fvalue = fvalue;
-        }
-      }
-      // start collecting the partial sum statistics
-      bst_omp_uint nnode = static_cast<bst_omp_uint>(qexpand.size());
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint j = 0; j < nnode; ++j) {
-        const int nid = qexpand[j];
-        TStats sum(param), tmp(param), c(param);
-        for (int tid = 0; tid < nthread; ++tid) {
-          tmp = stemp[tid][nid].stats;
-          stemp[tid][nid].stats = sum;
-          sum.Add(tmp);
-          if (tid != 0) {
-            std::swap(stemp[tid - 1][nid].last_fvalue, stemp[tid][nid].first_fvalue);
-          }
-        }
-        for (int tid = 0; tid < nthread; ++tid) {
-          stemp[tid][nid].stats_extra = sum;
-          ThreadEntry &e = stemp[tid][nid];
-          float fsplit;
-          if (tid != 0) {
-            if (std::abs(stemp[tid - 1][nid].last_fvalue - e.first_fvalue) > rt_2eps) {
-              fsplit = (stemp[tid - 1][nid].last_fvalue - e.first_fvalue) * 0.5f;
-            } else {
-              continue;
-            }
-          } else {
-            fsplit = e.first_fvalue - rt_eps;
-          }
-          if (need_forward && tid != 0) {
-            c.SetSubstract(snode[nid].stats, e.stats);
-            if (c.sum_hess >= param.min_child_weight &&
-                e.stats.sum_hess >= param.min_child_weight) {
-              bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) +
-                                                          c.CalcGain(param) - snode[nid].root_gain);
-              e.best.Update(loss_chg, fid, fsplit, false);
-            }
-          }
-          if (need_backward) {
-            tmp.SetSubstract(sum, e.stats);
-            c.SetSubstract(snode[nid].stats, tmp);
-            if (c.sum_hess >= param.min_child_weight &&
-                tmp.sum_hess >= param.min_child_weight) {
-              bst_float loss_chg = static_cast<bst_float>(tmp.CalcGain(param) +
-                                                          c.CalcGain(param) - snode[nid].root_gain);
-              e.best.Update(loss_chg, fid, fsplit, true);
-            }
-          }
-        }
-        if (need_backward) {
-          tmp = sum;
-          ThreadEntry &e = stemp[nthread-1][nid];
-          c.SetSubstract(snode[nid].stats, tmp);
-          if (c.sum_hess >= param.min_child_weight &&
-              tmp.sum_hess >= param.min_child_weight) {
-            bst_float loss_chg = static_cast<bst_float>(tmp.CalcGain(param) +
-                                                        c.CalcGain(param) - snode[nid].root_gain);
-            e.best.Update(loss_chg, fid, e.last_fvalue + rt_eps, true);
-          }
-        }
-      }
-      // rescan, generate candidate split
-      #pragma omp parallel
-      {
-        TStats c(param), cright(param);
-        const int tid = omp_get_thread_num();
-        std::vector<ThreadEntry> &temp = stemp[tid];
-        nthread = static_cast<bst_uint>(omp_get_num_threads());
-        bst_uint step = (col.length + nthread - 1) / nthread;
-        bst_uint end = std::min(col.length, step * (tid + 1));
-        for (bst_uint i = tid * step; i < end; ++i) {
-          const bst_uint ridx = col[i].index;
-          const int nid = position[ridx];
-          if (nid < 0) continue;
-          const float fvalue = col[i].fvalue;
-          // get the statistics of nid
-          ThreadEntry &e = temp[nid];
-          if (e.stats.Empty()) {
-            e.stats.Add(gpair, info, ridx);
-            e.first_fvalue = fvalue;
-          } else {
-            // forward default right
-            if (std::abs(fvalue - e.first_fvalue) > rt_2eps) {
-              if (need_forward) {
-                c.SetSubstract(snode[nid].stats, e.stats);
-                if (c.sum_hess >= param.min_child_weight &&
-                    e.stats.sum_hess >= param.min_child_weight) {
-                  bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) +
-                                                              c.CalcGain(param) -
-                                                              snode[nid].root_gain);
-                  e.best.Update(loss_chg, fid, (fvalue + e.first_fvalue) * 0.5f, false);
-                }
-              }
-              if (need_backward) {
-                cright.SetSubstract(e.stats_extra, e.stats);
-                c.SetSubstract(snode[nid].stats, cright);
-                if (c.sum_hess >= param.min_child_weight &&
-                    cright.sum_hess >= param.min_child_weight) {
-                  bst_float loss_chg = static_cast<bst_float>(cright.CalcGain(param) +
-                                                              c.CalcGain(param) -
-                                                              snode[nid].root_gain);
-                  e.best.Update(loss_chg, fid, (fvalue + e.first_fvalue) * 0.5f, true);
-                }
-              }
-            }
-            e.stats.Add(gpair, info, ridx);
-            e.first_fvalue = fvalue;
-          }
-        }
-      }
-    }
-    // update enumeration solution
-    inline void UpdateEnumeration(int nid, bst_gpair gstats,
-                                  float fvalue, int d_step, bst_uint fid,
-                                  TStats &c, std::vector<ThreadEntry> &temp) { // NOLINT(*)
-      // get the statistics of nid
-      ThreadEntry &e = temp[nid];
-      // test if first hit, this is fine, because we set 0 during init
-      if (e.stats.Empty()) {
-        e.stats.Add(gstats);
-        e.last_fvalue = fvalue;
-      } else {
-        // try to find a split
-        if (std::abs(fvalue - e.last_fvalue) > rt_2eps &&
-            e.stats.sum_hess >= param.min_child_weight) {
-          c.SetSubstract(snode[nid].stats, e.stats);
-          if (c.sum_hess >= param.min_child_weight) {
-            bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) +
-                                                        c.CalcGain(param) - snode[nid].root_gain);
-            e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, d_step == -1);
-          }
-        }
-        // update the statistics
-        e.stats.Add(gstats);
-        e.last_fvalue = fvalue;
-      }
-    }
-    // same as EnumerateSplit, with cacheline prefetch optimization
-    inline void EnumerateSplitCacheOpt(const ColBatch::Entry *begin,
-                                       const ColBatch::Entry *end,
-                                       int d_step,
-                                       bst_uint fid,
-                                       const std::vector<bst_gpair> &gpair,
-                                       std::vector<ThreadEntry> &temp) { // NOLINT(*)
-      const std::vector<int> &qexpand = qexpand_;
-      // clear all the temp statistics
-      for (size_t j = 0; j < qexpand.size(); ++j) {
-        temp[qexpand[j]].stats.Clear();
-      }
-      // left statistics
-      TStats c(param);
-      // local cache buffer for position and gradient pair
-      const int kBuffer = 32;
-      int buf_position[kBuffer];
-      bst_gpair buf_gpair[kBuffer];
-      // aligned ending position
-      const ColBatch::Entry *align_end;
-      if (d_step > 0) {
-        align_end = begin + (end - begin) / kBuffer * kBuffer;
-      } else {
-        align_end = begin - (begin - end) / kBuffer * kBuffer;
-      }
-      int i;
-      const ColBatch::Entry *it;
-      const int align_step = d_step * kBuffer;
-      // internal cached loop
-      for (it = begin; it != align_end; it += align_step) {
-        const ColBatch::Entry *p;
-        for (i = 0, p = it; i < kBuffer; ++i, p += d_step) {
-          buf_position[i] = position[p->index];
-          buf_gpair[i] = gpair[p->index];
-        }
-        for (i = 0, p = it; i < kBuffer; ++i, p += d_step) {
-          const int nid = buf_position[i];
-          if (nid < 0) continue;
-          this->UpdateEnumeration(nid, buf_gpair[i],
-                                  p->fvalue, d_step,
-                                  fid, c, temp);
-        }
-      }
-      // finish up the ending piece
-      for (it = align_end, i = 0; it != end; ++i, it += d_step) {
-        buf_position[i] = position[it->index];
-        buf_gpair[i] = gpair[it->index];
-      }
-      for (it = align_end, i = 0; it != end; ++i, it += d_step) {
-        const int nid = buf_position[i];
-        if (nid < 0) continue;
-        this->UpdateEnumeration(nid, buf_gpair[i],
-                                it->fvalue, d_step,
-                                fid, c, temp);
-      }
-      // finish updating all statistics, check if it is possible to include all sum statistics
-      for (size_t i = 0; i < qexpand.size(); ++i) {
-        const int nid = qexpand[i];
-        ThreadEntry &e = temp[nid];
-        c.SetSubstract(snode[nid].stats, e.stats);
-        if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) {
-          bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) +
-                                                      c.CalcGain(param) - snode[nid].root_gain);
-          const float gap = std::abs(e.last_fvalue) + rt_eps;
-          const float delta = d_step == +1 ? gap: -gap;
-          e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1);
-        }
-      }
-    }
-
-    // enumerate the split values of specific feature
-    inline void EnumerateSplit(const ColBatch::Entry *begin,
-                               const ColBatch::Entry *end,
-                               int d_step,
-                               bst_uint fid,
-                               const std::vector<bst_gpair> &gpair,
-                               const BoosterInfo &info,
-                               std::vector<ThreadEntry> &temp) { // NOLINT(*)
-      // use cacheline aware optimization
-      if (TStats::kSimpleStats != 0 && param.cache_opt != 0) {
-        EnumerateSplitCacheOpt(begin, end, d_step, fid, gpair, temp);
-        return;
-      }
-      const std::vector<int> &qexpand = qexpand_;
-      // clear all the temp statistics
-      for (size_t j = 0; j < qexpand.size(); ++j) {
-        temp[qexpand[j]].stats.Clear();
-      }
-      // left statistics
-      TStats c(param);
-      for (const ColBatch::Entry *it = begin; it != end; it += d_step) {
-        const bst_uint ridx = it->index;
-        const int nid = position[ridx];
-        if (nid < 0) continue;
-        // start working
-        const float fvalue = it->fvalue;
-        // get the statistics of nid
-        ThreadEntry &e = temp[nid];
-        // test if first hit, this is fine, because we set 0 during init
-        if (e.stats.Empty()) {
-          e.stats.Add(gpair, info, ridx);
-          e.last_fvalue = fvalue;
-        } else {
-          // try to find a split
-          if (std::abs(fvalue - e.last_fvalue) > rt_2eps &&
-              e.stats.sum_hess >= param.min_child_weight) {
-            c.SetSubstract(snode[nid].stats, e.stats);
-            if (c.sum_hess >= param.min_child_weight) {
-              bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) +
-                                                          c.CalcGain(param) - snode[nid].root_gain);
-              e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, d_step == -1);
-            }
-          }
-          // update the statistics
-          e.stats.Add(gpair, info, ridx);
-          e.last_fvalue = fvalue;
-        }
-      }
-      // finish updating all statistics, check if it is possible to include all sum statistics
-      for (size_t i = 0; i < qexpand.size(); ++i) {
-        const int nid = qexpand[i];
-        ThreadEntry &e = temp[nid];
-        c.SetSubstract(snode[nid].stats, e.stats);
-        if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) {
-          bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) +
-                                                      c.CalcGain(param) - snode[nid].root_gain);
-          const float gap = std::abs(e.last_fvalue) + rt_eps;
-          const float delta = d_step == +1 ? gap: -gap;
-          e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1);
-        }
-      }
-    }
-
-    // update the solution candidate
-    virtual void UpdateSolution(const ColBatch &batch,
-                                const std::vector<bst_gpair> &gpair,
-                                const IFMatrix &fmat,
-                                const BoosterInfo &info) {
-      // start enumeration
-      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
-      #if defined(_OPENMP)
-      const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
-      #endif
-      int poption = param.parallel_option;
-      if (poption == 2) {
-        poption = static_cast<int>(nsize) * 2 < nthread ? 1 : 0;
-      }
-      if (poption == 0) {
-        #pragma omp parallel for schedule(dynamic, batch_size)
-        for (bst_omp_uint i = 0; i < nsize; ++i) {
-          const bst_uint fid = batch.col_index[i];
-          const int tid = omp_get_thread_num();
-          const ColBatch::Inst c = batch[i];
-          const bool ind = c.length != 0 && c.data[0].fvalue == c.data[c.length - 1].fvalue;
-          if (param.need_forward_search(fmat.GetColDensity(fid), ind)) {
-            this->EnumerateSplit(c.data, c.data + c.length, +1,
-                                 fid, gpair, info, stemp[tid]);
-          }
-          if (param.need_backward_search(fmat.GetColDensity(fid), ind)) {
-            this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1,
-                                 fid, gpair, info, stemp[tid]);
-          }
-        }
-      } else {
-        for (bst_omp_uint i = 0; i < nsize; ++i) {
-          this->ParallelFindSplit(batch[i], batch.col_index[i],
-                                  fmat, gpair, info);
-        }
-      }
-    }
-    // find splits at current level, do split per level
-    inline void FindSplit(int depth,
-                          const std::vector<int> &qexpand,
-                          const std::vector<bst_gpair> &gpair,
-                          IFMatrix *p_fmat,
-                          const BoosterInfo &info,
-                          RegTree *p_tree) {
-      std::vector<bst_uint> feat_set = feat_index;
-      if (param.colsample_bylevel != 1.0f) {
-        random::Shuffle(feat_set);
-        unsigned n = static_cast<unsigned>(param.colsample_bylevel * feat_index.size());
-        utils::Check(n > 0, "colsample_bylevel is too small that no feature can be included");
-        feat_set.resize(n);
-      }
-      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(feat_set);
-      while (iter->Next()) {
-        this->UpdateSolution(iter->Value(), gpair, *p_fmat, info);
-      }
-      // after this each thread's stemp will get the best candidates, aggregate results
-      this->SyncBestSolution(qexpand);
-      // get the best result, we can synchronize the solution
-      for (size_t i = 0; i < qexpand.size(); ++i) {
-        const int nid = qexpand[i];
-        NodeEntry &e = snode[nid];
-        // now we know the solution in snode[nid], set split
-        if (e.best.loss_chg > rt_eps) {
-          p_tree->AddChilds(nid);
-          (*p_tree)[nid].set_split(e.best.split_index(), e.best.split_value, e.best.default_left());
-          // mark right child as 0, to indicate fresh leaf
-          (*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0);
-          (*p_tree)[(*p_tree)[nid].cright()].set_leaf(0.0f, 0);
-        } else {
-          (*p_tree)[nid].set_leaf(e.weight * param.learning_rate);
-        }
-      }
-    }
-    // reset position of each data points after split is created in the tree
-    inline void ResetPosition(const std::vector<int> &qexpand,
-                              IFMatrix *p_fmat, const RegTree &tree) {
-      // set the positions in the nondefault
-      this->SetNonDefaultPosition(qexpand, p_fmat, tree);
-      // set rest of instances to default position
-      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
-      // set default direct nodes to default
-      // for leaf nodes that are not fresh, mark then to ~nid,
-      // so that they are ignored in future statistics collection
-      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
-
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < ndata; ++i) {
-        const bst_uint ridx = rowset[i];
-        if (ridx >= position.size()) {
-          utils::Printf("ridx exceed bound\n");
-        }
-        const int nid = this->DecodePosition(ridx);
-        if (tree[nid].is_leaf()) {
-          // mark finish when it is not a fresh leaf
-          if (tree[nid].cright() == -1) {
-            position[ridx] = ~nid;
-          }
-        } else {
-          // push to default branch
-          if (tree[nid].default_left()) {
-            this->SetEncodePosition(ridx, tree[nid].cleft());
-          } else {
-            this->SetEncodePosition(ridx, tree[nid].cright());
-          }
-        }
-      }
-    }
-    // customization part
-    // synchronize the best solution of each node
-    virtual void SyncBestSolution(const std::vector<int> &qexpand) {
-      for (size_t i = 0; i < qexpand.size(); ++i) {
-        const int nid = qexpand[i];
-        NodeEntry &e = snode[nid];
-        for (int tid = 0; tid < this->nthread; ++tid) {
-          e.best.Update(stemp[tid][nid].best);
-        }
-      }
-    }
-    virtual void SetNonDefaultPosition(const std::vector<int> &qexpand,
-                                       IFMatrix *p_fmat, const RegTree &tree) {
-      // step 1, classify the non-default data into right places
-      std::vector<unsigned> fsplits;
-      for (size_t i = 0; i < qexpand.size(); ++i) {
-        const int nid = qexpand[i];
-        if (!tree[nid].is_leaf()) {
-          fsplits.push_back(tree[nid].split_index());
-        }
-      }
-      std::sort(fsplits.begin(), fsplits.end());
-      fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
-
-      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
-      while (iter->Next()) {
-        const ColBatch &batch = iter->Value();
-        for (size_t i = 0; i < batch.size; ++i) {
-          ColBatch::Inst col = batch[i];
-          const bst_uint fid = batch.col_index[i];
-          const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
-          #pragma omp parallel for schedule(static)
-          for (bst_omp_uint j = 0; j < ndata; ++j) {
-            const bst_uint ridx = col[j].index;
-            const int nid = this->DecodePosition(ridx);
-            const float fvalue = col[j].fvalue;
-            // go back to parent, correct those who are not default
-            if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
-              if (fvalue < tree[nid].split_cond()) {
-                this->SetEncodePosition(ridx, tree[nid].cleft());
-              } else {
-                this->SetEncodePosition(ridx, tree[nid].cright());
-              }
-            }
-          }
-        }
-      }
-    }
-    // utils to get/set position, with encoded format
-    // return decoded position
-    inline int DecodePosition(bst_uint ridx) const {
-      const int pid = position[ridx];
-      return pid < 0 ? ~pid : pid;
-    }
-    // encode the encoded position value for ridx
-    inline void SetEncodePosition(bst_uint ridx, int nid) {
-      if (position[ridx] < 0) {
-        position[ridx] = ~nid;
-      } else {
-        position[ridx] = nid;
-      }
-    }
-    //  --data fields--
-    const TrainParam &param;
-    // number of omp thread used during training
-    int nthread;
-    // Per feature: shuffle index of each feature index
-    std::vector<bst_uint> feat_index;
-    // Instance Data: current node position in the tree of each instance
-    std::vector<int> position;
-    // PerThread x PerTreeNode: statistics for per thread construction
-    std::vector< std::vector<ThreadEntry> > stemp;
-    /*! \brief TreeNode Data: statistics for each constructed node */
-    std::vector<NodeEntry> snode;
-    /*! \brief queue of nodes to be expanded */
-    std::vector<int> qexpand_;
-  };
-};
-
-}  // namespace tree
-}  // namespace xgboost
-#endif  // XGBOOST_TREE_UPDATER_COLMAKER_INL_HPP_
--- a/src/tree/updater_distcol-inl.hpp
+++ b/src/tree/updater_distcol-inl.hpp
@@ -1,175 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file updater_distcol-inl.hpp
- * \brief beta distributed version that takes a sub-column
- *        and construct a tree
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_
-#define XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_
-
-#include <vector>
-#include <algorithm>
-#include "../sync/sync.h"
-#include "../utils/bitmap.h"
-#include "../utils/io.h"
-#include "./updater_colmaker-inl.hpp"
-#include "./updater_prune-inl.hpp"
-
-namespace xgboost {
-namespace tree {
-template<typename TStats>
-class DistColMaker : public ColMaker<TStats> {
- public:
-  DistColMaker(void) : builder(param) {}
-  virtual ~DistColMaker(void) {}
-  // set training parameter
-  virtual void SetParam(const char *name, const char *val) {
-    param.SetParam(name, val);
-    pruner.SetParam(name, val);
-  }
-  virtual void Update(const std::vector<bst_gpair> &gpair,
-                      IFMatrix *p_fmat,
-                      const BoosterInfo &info,
-                      const std::vector<RegTree*> &trees) {
-    TStats::CheckInfo(info);
-    utils::Check(trees.size() == 1, "DistColMaker: only support one tree at a time");
-    // build the tree
-    builder.Update(gpair, p_fmat, info, trees[0]);
-    //// prune the tree, note that pruner will sync the tree
-    pruner.Update(gpair, p_fmat, info, trees);
-    // update position after the tree is pruned
-    builder.UpdatePosition(p_fmat, *trees[0]);
-  }
-  virtual const int* GetLeafPosition(void) const {
-    return builder.GetLeafPosition();
-  }
-
- private:
-  struct Builder : public ColMaker<TStats>::Builder {
-   public:
-    explicit Builder(const TrainParam &param)
-        : ColMaker<TStats>::Builder(param) {
-    }
-    inline void UpdatePosition(IFMatrix *p_fmat, const RegTree &tree) {
-      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
-      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < ndata; ++i) {
-        const bst_uint ridx = rowset[i];
-        int nid = this->DecodePosition(ridx);
-        while (tree[nid].is_deleted()) {
-          nid = tree[nid].parent();
-          utils::Assert(nid >=0, "distributed learning error");
-        }
-        this->position[ridx] = nid;
-      }
-    }
-    virtual const int* GetLeafPosition(void) const {
-      return BeginPtr(this->position);
-    }
-
-   protected:
-    virtual void SetNonDefaultPosition(const std::vector<int> &qexpand,
-                                       IFMatrix *p_fmat, const RegTree &tree) {
-      // step 2, classify the non-default data into right places
-      std::vector<unsigned> fsplits;
-      for (size_t i = 0; i < qexpand.size(); ++i) {
-        const int nid = qexpand[i];
-        if (!tree[nid].is_leaf()) {
-          fsplits.push_back(tree[nid].split_index());
-        }
-      }
-      // get the candidate split index
-      std::sort(fsplits.begin(), fsplits.end());
-      fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
-      while (fsplits.size() != 0 && fsplits.back() >= p_fmat->NumCol()) {
-        fsplits.pop_back();
-      }
-      // bitmap is only word concurrent, set to bool first
-      {
-        bst_omp_uint ndata = static_cast<bst_omp_uint>(this->position.size());
-        boolmap.resize(ndata);
-        #pragma omp parallel for schedule(static)
-        for (bst_omp_uint j = 0; j < ndata; ++j) {
-            boolmap[j] = 0;
-        }
-      }
-      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
-      while (iter->Next()) {
-        const ColBatch &batch = iter->Value();
-        for (size_t i = 0; i < batch.size; ++i) {
-          ColBatch::Inst col = batch[i];
-          const bst_uint fid = batch.col_index[i];
-          const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
-          #pragma omp parallel for schedule(static)
-          for (bst_omp_uint j = 0; j < ndata; ++j) {
-            const bst_uint ridx = col[j].index;
-            const float fvalue = col[j].fvalue;
-            const int nid = this->DecodePosition(ridx);
-            if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
-              if (fvalue < tree[nid].split_cond()) {
-                if (!tree[nid].default_left()) boolmap[ridx] = 1;
-              } else {
-                if (tree[nid].default_left()) boolmap[ridx] = 1;
-              }
-            }
-          }
-        }
-      }
-
-      bitmap.InitFromBool(boolmap);
-      // communicate bitmap
-      rabit::Allreduce<rabit::op::BitOR>(BeginPtr(bitmap.data), bitmap.data.size());
-      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
-      // get the new position
-      const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < ndata; ++i) {
-        const bst_uint ridx = rowset[i];
-        const int nid = this->DecodePosition(ridx);
-        if (bitmap.Get(ridx)) {
-          utils::Assert(!tree[nid].is_leaf(), "inconsistent reduce information");
-          if (tree[nid].default_left()) {
-            this->SetEncodePosition(ridx, tree[nid].cright());
-          } else {
-            this->SetEncodePosition(ridx, tree[nid].cleft());
-          }
-        }
-      }
-    }
-    // synchronize the best solution of each node
-    virtual void SyncBestSolution(const std::vector<int> &qexpand) {
-      std::vector<SplitEntry> vec;
-      for (size_t i = 0; i < qexpand.size(); ++i) {
-        const int nid = qexpand[i];
-        for (int tid = 0; tid < this->nthread; ++tid) {
-          this->snode[nid].best.Update(this->stemp[tid][nid].best);
-        }
-        vec.push_back(this->snode[nid].best);
-      }
-      // TODO(tqchen) lazy version
-      // communicate best solution
-      reducer.Allreduce(BeginPtr(vec), vec.size());
-      // assign solution back
-      for (size_t i = 0; i < qexpand.size(); ++i) {
-        const int nid = qexpand[i];
-        this->snode[nid].best = vec[i];
-      }
-    }
-
-   private:
-    utils::BitMap bitmap;
-    std::vector<int> boolmap;
-    rabit::Reducer<SplitEntry, SplitEntry::Reduce> reducer;
-  };
-  // we directly introduce pruner here
-  TreePruner pruner;
-  // training parameter
-  TrainParam param;
-  // pointer to the builder
-  Builder builder;
-};
-}  // namespace tree
-}  // namespace xgboost
-#endif  // XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_
--- a/src/tree/updater_histmaker-inl.hpp
+++ b/src/tree/updater_histmaker-inl.hpp
@@ -1,769 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file updater_histmaker-inl.hpp
- * \brief use histogram counting to construct a tree
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_
-#define XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_
-
-#include <vector>
-#include <algorithm>
-#include "../sync/sync.h"
-#include "../utils/quantile.h"
-#include "../utils/group_data.h"
-#include "./updater_basemaker-inl.hpp"
-
-namespace xgboost {
-namespace tree {
-template<typename TStats>
-class HistMaker: public BaseMaker {
- public:
-  virtual ~HistMaker(void) {}
-  virtual void Update(const std::vector<bst_gpair> &gpair,
-                      IFMatrix *p_fmat,
-                      const BoosterInfo &info,
-                      const std::vector<RegTree*> &trees) {
-    TStats::CheckInfo(info);
-    // rescale learning rate according to size of trees
-    float lr = param.learning_rate;
-    param.learning_rate = lr / trees.size();
-    // build tree
-    for (size_t i = 0; i < trees.size(); ++i) {
-      this->Update(gpair, p_fmat, info, trees[i]);
-    }
-    param.learning_rate = lr;
-  }
-
- protected:
-  /*! \brief a single histogram */
-  struct HistUnit {
-    /*! \brief cutting point of histogram, contains maximum point */
-    const bst_float *cut;
-    /*! \brief content of statistics data */
-    TStats *data;
-    /*! \brief size of histogram */
-    unsigned size;
-    // default constructor
-    HistUnit(void) {}
-    // constructor
-    HistUnit(const bst_float *cut, TStats *data, unsigned size)
-        : cut(cut), data(data), size(size) {}
-    /*! \brief add a histogram to data */
-    inline void Add(bst_float fv,
-                    const std::vector<bst_gpair> &gpair,
-                    const BoosterInfo &info,
-                    const bst_uint ridx) {
-      unsigned i = std::upper_bound(cut, cut + size, fv) - cut;
-      utils::Assert(size != 0, "try insert into size=0");
-      utils::Assert(i < size,
-                    "maximum value must be in cut, fv = %g, cutmax=%g", fv, cut[size-1]);
-      data[i].Add(gpair, info, ridx);
-    }
-  };
-  /*! \brief a set of histograms from different index */
-  struct HistSet {
-    /*! \brief the index pointer of each histunit */
-    const unsigned *rptr;
-    /*! \brief cutting points in each histunit */
-    const bst_float *cut;
-    /*! \brief data in different hist unit */
-    std::vector<TStats> data;
-    /*! \brief */
-    inline HistUnit operator[](size_t fid) {
-      return HistUnit(cut + rptr[fid],
-                      &data[0] + rptr[fid],
-                      rptr[fid+1] - rptr[fid]);
-    }
-  };
-  // thread workspace
-  struct ThreadWSpace {
-    /*! \brief actual unit pointer */
-    std::vector<unsigned> rptr;
-    /*! \brief cut field */
-    std::vector<bst_float> cut;
-    // per thread histset
-    std::vector<HistSet> hset;
-    // initialize the hist set
-    inline void Init(const TrainParam &param, int nthread) {
-      hset.resize(nthread);
-      // cleanup statistics
-      for (int tid = 0; tid < nthread; ++tid) {
-        for (size_t i = 0; i < hset[tid].data.size(); ++i) {
-          hset[tid].data[i].Clear();
-        }
-        hset[tid].rptr = BeginPtr(rptr);
-        hset[tid].cut = BeginPtr(cut);
-        hset[tid].data.resize(cut.size(), TStats(param));
-      }
-    }
-    // aggregate all statistics to hset[0]
-    inline void Aggregate(void) {
-      bst_omp_uint nsize = static_cast<bst_omp_uint>(cut.size());
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < nsize; ++i) {
-        for (size_t tid = 1; tid < hset.size(); ++tid) {
-          hset[0].data[i].Add(hset[tid].data[i]);
-        }
-      }
-    }
-    /*! \brief clear the workspace */
-    inline void Clear(void) {
-      cut.clear(); rptr.resize(1); rptr[0] = 0;
-    }
-    /*! \brief total size */
-    inline size_t Size(void) const {
-      return rptr.size() - 1;
-    }
-  };
-  // workspace of thread
-  ThreadWSpace wspace;
-  // reducer for histogram
-  rabit::Reducer<TStats, TStats::Reduce> histred;
-  // set of working features
-  std::vector<bst_uint> fwork_set;
-  // update function implementation
-  virtual void Update(const std::vector<bst_gpair> &gpair,
-                      IFMatrix *p_fmat,
-                      const BoosterInfo &info,
-                      RegTree *p_tree) {
-    this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
-    this->InitWorkSet(p_fmat, *p_tree, &fwork_set);
-    for (int depth = 0; depth < param.max_depth; ++depth) {
-      // reset and propose candidate split
-      this->ResetPosAndPropose(gpair, p_fmat, info, fwork_set, *p_tree);
-      // create histogram
-      this->CreateHist(gpair, p_fmat, info, fwork_set, *p_tree);
-      // find split based on histogram statistics
-      this->FindSplit(depth, gpair, p_fmat, info, fwork_set, p_tree);
-      // reset position after split
-      this->ResetPositionAfterSplit(p_fmat, *p_tree);
-      this->UpdateQueueExpand(*p_tree);
-      // if nothing left to be expand, break
-      if (qexpand.size() == 0) break;
-    }
-    for (size_t i = 0; i < qexpand.size(); ++i) {
-      const int nid = qexpand[i];
-      (*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
-    }
-  }
-  // this function does two jobs
-  // (1) reset the position in array position, to be the latest leaf id
-  // (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly
-  virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
-                                  IFMatrix *p_fmat,
-                                  const BoosterInfo &info,
-                                  const std::vector <bst_uint> &fset,
-                                  const RegTree &tree) = 0;
-  // initialize the current working set of features in this round
-  virtual void InitWorkSet(IFMatrix *p_fmat,
-                           const RegTree &tree,
-                           std::vector<bst_uint> *p_fset) {
-    p_fset->resize(tree.param.num_feature);
-    for (size_t i = 0; i < p_fset->size(); ++i) {
-      (*p_fset)[i] = static_cast<unsigned>(i);
-    }
-  }
-  // reset position after split, this is not a must, depending on implementation
-  virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
-                                       const RegTree &tree) {
-  }
-  virtual void CreateHist(const std::vector<bst_gpair> &gpair,
-                          IFMatrix *p_fmat,
-                          const BoosterInfo &info,
-                          const std::vector <bst_uint> &fset,
-                          const RegTree &tree)  = 0;
-
- private:
-  inline void EnumerateSplit(const HistUnit &hist,
-                             const TStats &node_sum,
-                             bst_uint fid,
-                             SplitEntry *best,
-                             TStats *left_sum) {
-    if (hist.size == 0) return;
-
-    double root_gain = node_sum.CalcGain(param);
-    TStats s(param), c(param);
-    for (bst_uint i = 0; i < hist.size; ++i) {
-      s.Add(hist.data[i]);
-      if (s.sum_hess >= param.min_child_weight) {
-        c.SetSubstract(node_sum, s);
-        if (c.sum_hess >= param.min_child_weight) {
-          double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
-          if (best->Update(static_cast<float>(loss_chg), fid, hist.cut[i], false)) {
-            *left_sum = s;
-          }
-        }
-      }
-    }
-    s.Clear();
-    for (bst_uint i = hist.size - 1; i != 0; --i) {
-      s.Add(hist.data[i]);
-      if (s.sum_hess >= param.min_child_weight) {
-        c.SetSubstract(node_sum, s);
-        if (c.sum_hess >= param.min_child_weight) {
-          double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
-          if (best->Update(static_cast<float>(loss_chg), fid, hist.cut[i-1], true)) {
-            *left_sum = c;
-          }
-        }
-      }
-    }
-  }
-  inline void FindSplit(int depth,
-                        const std::vector<bst_gpair> &gpair,
-                        IFMatrix *p_fmat,
-                        const BoosterInfo &info,
-                        const std::vector <bst_uint> &fset,
-                        RegTree *p_tree) {
-    const size_t num_feature = fset.size();
-    // get the best split condition for each node
-    std::vector<SplitEntry> sol(qexpand.size());
-    std::vector<TStats> left_sum(qexpand.size());
-    bst_omp_uint nexpand = static_cast<bst_omp_uint>(qexpand.size());
-    #pragma omp parallel for schedule(dynamic, 1)
-    for (bst_omp_uint wid = 0; wid < nexpand; ++wid) {
-      const int nid = qexpand[wid];
-      utils::Assert(node2workindex[nid] == static_cast<int>(wid),
-                    "node2workindex inconsistent");
-      SplitEntry &best = sol[wid];
-      TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0];
-      for (size_t i = 0; i < fset.size(); ++i) {
-        EnumerateSplit(this->wspace.hset[0][i + wid * (num_feature+1)],
-                       node_sum, fset[i], &best, &left_sum[wid]);
-      }
-    }
-    // get the best result, we can synchronize the solution
-    for (bst_omp_uint wid = 0; wid < nexpand; ++wid) {
-      const int nid = qexpand[wid];
-      const SplitEntry &best = sol[wid];
-      const TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0];
-      this->SetStats(p_tree, nid, node_sum);
-      // set up the values
-      p_tree->stat(nid).loss_chg = best.loss_chg;
-      // now we know the solution in snode[nid], set split
-      if (best.loss_chg > rt_eps) {
-        p_tree->AddChilds(nid);
-        (*p_tree)[nid].set_split(best.split_index(),
-                                 best.split_value, best.default_left());
-        // mark right child as 0, to indicate fresh leaf
-        (*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0);
-        (*p_tree)[(*p_tree)[nid].cright()].set_leaf(0.0f, 0);
-        // right side sum
-        TStats right_sum;
-        right_sum.SetSubstract(node_sum, left_sum[wid]);
-        this->SetStats(p_tree, (*p_tree)[nid].cleft(), left_sum[wid]);
-        this->SetStats(p_tree, (*p_tree)[nid].cright(), right_sum);
-      } else {
-        (*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
-      }
-    }
-  }
-
-  inline void SetStats(RegTree *p_tree, int nid, const TStats &node_sum) {
-    p_tree->stat(nid).base_weight = static_cast<float>(node_sum.CalcWeight(param));
-    p_tree->stat(nid).sum_hess = static_cast<float>(node_sum.sum_hess);
-    node_sum.SetLeafVec(param, p_tree->leafvec(nid));
-  }
-};
-
-template<typename TStats>
-class CQHistMaker: public HistMaker<TStats> {
- protected:
-  struct HistEntry {
-    typename HistMaker<TStats>::HistUnit hist;
-    unsigned istart;
-    /*!
-     * \brief add a histogram to data,
-     * do linear scan, start from istart
-     */
-    inline void Add(bst_float fv,
-                    const std::vector<bst_gpair> &gpair,
-                    const BoosterInfo &info,
-                    const bst_uint ridx) {
-      while (istart < hist.size && !(fv < hist.cut[istart])) ++istart;
-      utils::Assert(istart != hist.size, "the bound variable must be max");
-      hist.data[istart].Add(gpair, info, ridx);
-    }
-    /*!
-     * \brief add a histogram to data,
-     * do linear scan, start from istart
-     */
-    inline void Add(bst_float fv,
-                    bst_gpair gstats) {
-      while (istart < hist.size && !(fv < hist.cut[istart])) ++istart;
-      utils::Assert(istart != hist.size, "the bound variable must be max");
-      hist.data[istart].Add(gstats);
-    }
-  };
-  // sketch type used for this
-  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
-  // initialize the work set of tree
-  virtual void InitWorkSet(IFMatrix *p_fmat,
-                           const RegTree &tree,
-                           std::vector<bst_uint> *p_fset) {
-    feat_helper.InitByCol(p_fmat, tree);
-    feat_helper.SampleCol(this->param.colsample_bytree, p_fset);
-  }
-  // code to create histogram
-  virtual void CreateHist(const std::vector<bst_gpair> &gpair,
-                          IFMatrix *p_fmat,
-                          const BoosterInfo &info,
-                          const std::vector<bst_uint> &fset,
-                          const RegTree &tree) {
-    // fill in reverse map
-    feat2workindex.resize(tree.param.num_feature);
-    std::fill(feat2workindex.begin(), feat2workindex.end(), -1);
-    for (size_t i = 0; i < fset.size(); ++i) {
-      feat2workindex[fset[i]] = static_cast<int>(i);
-    }
-    // start to work
-    this->wspace.Init(this->param, 1);
-    // if it is C++11, use lazy evaluation for Allreduce,
-    // to gain speedup in recovery
-#if __cplusplus >= 201103L
-    auto lazy_get_hist = [&]()
-#endif
-    {
-      thread_hist.resize(this->get_nthread());
-      // start accumulating statistics
-      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fset);
-      iter->BeforeFirst();
-      while (iter->Next()) {
-        const ColBatch &batch = iter->Value();
-        // start enumeration
-        const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
-        #pragma omp parallel for schedule(dynamic, 1)
-        for (bst_omp_uint i = 0; i < nsize; ++i) {
-          int offset = feat2workindex[batch.col_index[i]];
-          if (offset >= 0) {
-            this->UpdateHistCol(gpair, batch[i], info, tree,
-                                fset, offset,
-                                &thread_hist[omp_get_thread_num()]);
-          }
-        }
-      }
-      for (size_t i = 0; i < this->qexpand.size(); ++i) {
-        const int nid = this->qexpand[i];
-        const int wid = this->node2workindex[nid];
-        this->wspace.hset[0][fset.size() + wid * (fset.size()+1)]
-            .data[0] = node_stats[nid];
-      }
-    };
-    // sync the histogram
-    // if it is C++11, use lazy evaluation for Allreduce
-#if __cplusplus >= 201103L
-    this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data),
-                            this->wspace.hset[0].data.size(), lazy_get_hist);
-#else
-    this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data), this->wspace.hset[0].data.size());
-#endif
-  }
-  virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
-                                       const RegTree &tree) {
-    this->ResetPositionCol(this->qexpand, p_fmat, tree);
-  }
-  virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
-                                  IFMatrix *p_fmat,
-                                  const BoosterInfo &info,
-                                  const std::vector<bst_uint> &fset,
-                                  const RegTree &tree) {
-    // fill in reverse map
-    feat2workindex.resize(tree.param.num_feature);
-    std::fill(feat2workindex.begin(), feat2workindex.end(), -1);
-    freal_set.clear();
-    for (size_t i = 0; i < fset.size(); ++i) {
-      if (feat_helper.Type(fset[i]) == 2) {
-        feat2workindex[fset[i]] = static_cast<int>(freal_set.size());
-        freal_set.push_back(fset[i]);
-      } else {
-        feat2workindex[fset[i]] = -2;
-      }
-    }
-    this->GetNodeStats(gpair, *p_fmat, tree, info,
-                       &thread_stats, &node_stats);
-    sketchs.resize(this->qexpand.size() * freal_set.size());
-    for (size_t i = 0; i < sketchs.size(); ++i) {
-      sketchs[i].Init(info.num_row, this->param.sketch_eps);
-    }
-    // intitialize the summary array
-    summary_array.resize(sketchs.size());
-    // setup maximum size
-    unsigned max_size = this->param.max_sketch_size();
-    for (size_t i = 0; i < sketchs.size(); ++i) {
-      summary_array[i].Reserve(max_size);
-    }
-    // if it is C++11, use lazy evaluation for Allreduce
-#if __cplusplus >= 201103L
-    auto lazy_get_summary = [&]()
-#endif
-        {
-      // get smmary
-      thread_sketch.resize(this->get_nthread());
-      // number of rows in
-      const size_t nrows = p_fmat->buffered_rowset().size();
-      // start accumulating statistics
-      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(freal_set);
-      iter->BeforeFirst();
-      while (iter->Next()) {
-        const ColBatch &batch = iter->Value();
-        // start enumeration
-        const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
-        #pragma omp parallel for schedule(dynamic, 1)
-        for (bst_omp_uint i = 0; i < nsize; ++i) {
-          int offset = feat2workindex[batch.col_index[i]];
-          if (offset >= 0) {
-            this->UpdateSketchCol(gpair, batch[i], tree,
-                                  node_stats,
-                                  freal_set, offset,
-                                  batch[i].length == nrows,
-                                  &thread_sketch[omp_get_thread_num()]);
-          }
-        }
-      }
-      for (size_t i = 0; i < sketchs.size(); ++i) {
-        utils::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
-        sketchs[i].GetSummary(&out);
-        summary_array[i].SetPrune(out, max_size);
-      }
-      utils::Assert(summary_array.size() == sketchs.size(), "shape mismatch");
-    };
-    if (summary_array.size() != 0) {
-      size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
-#if __cplusplus >= 201103L
-      sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size(), lazy_get_summary);
-#else
-      sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size());
-#endif
-    }
-    // now we get the final result of sketch, setup the cut
-    this->wspace.cut.clear();
-    this->wspace.rptr.clear();
-    this->wspace.rptr.push_back(0);
-    for (size_t wid = 0; wid < this->qexpand.size(); ++wid) {
-      for (size_t i = 0; i < fset.size(); ++i) {
-        int offset = feat2workindex[fset[i]];
-        if (offset >= 0) {
-          const WXQSketch::Summary &a = summary_array[wid * freal_set.size() + offset];
-          for (size_t i = 1; i < a.size; ++i) {
-            bst_float cpt = a.data[i].value - rt_eps;
-            if (i == 1 || cpt > this->wspace.cut.back()) {
-              this->wspace.cut.push_back(cpt);
-            }
-          }
-          // push a value that is greater than anything
-          if (a.size != 0) {
-            bst_float cpt = a.data[a.size - 1].value;
-            // this must be bigger than last value in a scale
-            bst_float last = cpt + fabs(cpt) + rt_eps;
-            this->wspace.cut.push_back(last);
-          }
-          this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
-        } else {
-          utils::Assert(offset == -2, "BUG in mark");
-          bst_float cpt = feat_helper.MaxValue(fset[i]);
-          this->wspace.cut.push_back(cpt + fabs(cpt) + rt_eps);
-          this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
-        }
-      }
-      // reserve last value for global statistics
-      this->wspace.cut.push_back(0.0f);
-      this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
-    }
-    utils::Assert(this->wspace.rptr.size() ==
-                  (fset.size() + 1) * this->qexpand.size() + 1,
-                  "cut space inconsistent");
-  }
-
- private:
-  /*!
-   * \brief add the gradient statistics of one feature column to the histograms
-   *        of the nodes that are currently being expanded
-   * \param gpair gradient pairs, indexed by row
-   * \param c the feature column; each entry carries a row index and a feature value
-   * \param info extra booster information, forwarded to HistEntry::Add on the generic path
-   * \param tree tree under construction; param.num_nodes sizes the scratch space
-   * \param fset feature set; fset.size() + 1 is the stride between work nodes in the histogram set
-   * \param fid_offset position of this feature inside a work node's group of histogram units
-   * \param p_temp scratch vector of histogram cursors, one slot per tree node
-   */
-  inline void UpdateHistCol(const std::vector<bst_gpair> &gpair,
-                            const ColBatch::Inst &c,
-                            const BoosterInfo &info,
-                            const RegTree &tree,
-                            const std::vector<bst_uint> &fset,
-                            bst_uint fid_offset,
-                            std::vector<HistEntry> *p_temp) {
-    // an empty column contributes nothing
-    if (c.length == 0) return;
-    std::vector<HistEntry> &node_hist = *p_temp;
-    node_hist.resize(tree.param.num_nodes);
-    // point the cursor of every expanding node at its histogram unit for this feature
-    for (size_t k = 0; k < this->qexpand.size(); ++k) {
-      const unsigned nid = this->qexpand[k];
-      const unsigned wid = this->node2workindex[nid];
-      HistEntry &cursor = node_hist[nid];
-      cursor.istart = 0;
-      cursor.hist = this->wspace.hset[0][fid_offset + wid * (fset.size()+1)];
-    }
-    const bool use_buffer = TStats::kSimpleStats != 0 && this->param.cache_opt != 0;
-    if (!use_buffer) {
-      // generic path: the statistics type pulls what it needs from gpair and info
-      for (bst_uint j = 0; j < c.length; ++j) {
-        const bst_uint ridx = c[j].index;
-        const int nid = this->position[ridx];
-        if (nid < 0) continue;
-        node_hist[nid].Add(c[j].fvalue, gpair, info, ridx);
-      }
-      return;
-    }
-    // buffered path: gather positions and gradient pairs of kBuffer rows first,
-    // then add them to the histograms in the same order
-    const bst_uint kBuffer = 32;
-    int buf_position[kBuffer];
-    bst_gpair buf_gpair[kBuffer];
-    const bst_uint align_length = c.length / kBuffer * kBuffer;
-    for (bst_uint base = 0; base < align_length; base += kBuffer) {
-      for (bst_uint k = 0; k < kBuffer; ++k) {
-        const bst_uint ridx = c[base + k].index;
-        buf_position[k] = this->position[ridx];
-        buf_gpair[k] = gpair[ridx];
-      }
-      for (bst_uint k = 0; k < kBuffer; ++k) {
-        const int nid = buf_position[k];
-        if (nid < 0) continue;
-        node_hist[nid].Add(c[base + k].fvalue, buf_gpair[k]);
-      }
-    }
-    // remaining entries that do not fill a whole buffer
-    for (bst_uint j = align_length; j < c.length; ++j) {
-      const bst_uint ridx = c[j].index;
-      const int nid = this->position[ridx];
-      if (nid < 0) continue;
-      node_hist[nid].Add(c[j].fvalue, gpair[ridx]);
-    }
-  }
-  /*!
-   * \brief feed one feature column into the quantile sketches of the nodes being expanded
-   * \param gpair gradient pairs, indexed by row; only the hess field is read here
-   * \param c the feature column; each entry carries a row index and a feature value
-   * \param tree tree under construction; only param.num_nodes is read, to size the scratch space
-   * \param nstats per-node statistics; sum_hess supplies the total weight when col_full is true
-   * \param frealset its size is the stride between work nodes inside sketchs
-   * \param offset slot of this feature inside a work node's group of sketches
-   * \param col_full when true, the per-node totals are taken from nstats instead of being
-   *        summed from the column
-   * \param p_temp scratch vector of sketch builders, resized to one slot per tree node
-   */
-  inline void UpdateSketchCol(const std::vector<bst_gpair> &gpair,
-                              const ColBatch::Inst &c,
-                              const RegTree &tree,
-                              const std::vector<TStats> &nstats,
-                              const std::vector<bst_uint> &frealset,
-                              bst_uint offset,
-                              bool col_full,
-                              std::vector<BaseMaker::SketchEntry> *p_temp) {
-    // an empty column contributes nothing
-    if (c.length == 0) return;
-    // initialize sbuilder for use
-    std::vector<BaseMaker::SketchEntry> &sbuilder = *p_temp;
-    sbuilder.resize(tree.param.num_nodes);
-    // bind the builder of every expanding node to its (work node, feature) sketch
-    for (size_t i = 0; i < this->qexpand.size(); ++i) {
-      const unsigned nid = this->qexpand[i];
-      const unsigned wid = this->node2workindex[nid];
-      sbuilder[nid].sum_total = 0.0f;
-      sbuilder[nid].sketch = &sketchs[wid * frealset.size() + offset];
-    }
-
-    if (!col_full) {
-      // first pass, get sum of weight, TODO, optimization to skip first pass
-      for (bst_uint j = 0; j < c.length; ++j) {
-        const bst_uint ridx = c[j].index;
-        const int nid = this->position[ridx];
-        // rows with a negative position are skipped
-        if (nid >= 0) {
-          sbuilder[nid].sum_total += gpair[ridx].hess;
-        }
-      }
-    } else {
-      // the total hessian per node is already known, reuse it
-      for (size_t i = 0; i < this->qexpand.size(); ++i) {
-        const unsigned nid = this->qexpand[i];
-        sbuilder[nid].sum_total = static_cast<bst_float>(nstats[nid].sum_hess);
-      }
-    }
-    // if only one value, no need to do second pass
-    // NOTE(review): comparing only the first and last entry presumes the column
-    // is sorted by fvalue -- confirm against the column iterator contract
-    if (c[0].fvalue  == c[c.length-1].fvalue) {
-      for (size_t i = 0; i < this->qexpand.size(); ++i) {
-        const int nid = this->qexpand[i];
-        sbuilder[nid].sketch->Push(c[0].fvalue, static_cast<bst_float>(sbuilder[nid].sum_total));
-      }
-      return;
-    }
-    // two pass scan
-    unsigned max_size = this->param.max_sketch_size();
-    for (size_t i = 0; i < this->qexpand.size(); ++i) {
-      const int nid = this->qexpand[i];
-      sbuilder[nid].Init(max_size);
-    }
-    // second pass, build the sketch
-    if (TStats::kSimpleStats != 0 && this->param.cache_opt != 0) {
-      // buffered variant: gather positions and hessians of kBuffer rows first,
-      // then push them to the builders in the same order
-      const bst_uint kBuffer = 32;
-      bst_uint align_length = c.length / kBuffer * kBuffer;
-      int buf_position[kBuffer];
-      bst_float buf_hess[kBuffer];
-      for (bst_uint j = 0; j < align_length; j += kBuffer) {
-        for (bst_uint i = 0; i < kBuffer; ++i) {
-          bst_uint ridx = c[j + i].index;
-          buf_position[i] = this->position[ridx];
-          buf_hess[i] = gpair[ridx].hess;
-        }
-        for (bst_uint i = 0; i < kBuffer; ++i) {
-          const int nid = buf_position[i];
-          if (nid >= 0) {
-            sbuilder[nid].Push(c[j + i].fvalue, buf_hess[i], max_size);
-          }
-        }
-      }
-      // remaining entries that do not fill a whole buffer
-      for (bst_uint j = align_length; j < c.length; ++j) {
-        const bst_uint ridx = c[j].index;
-        const int nid = this->position[ridx];
-        if (nid >= 0) {
-          sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size);
-        }
-      }
-    } else {
-      // unbuffered variant, one entry at a time
-      for (bst_uint j = 0; j < c.length; ++j) {
-        const bst_uint ridx = c[j].index;
-        const int nid = this->position[ridx];
-        if (nid >= 0) {
-          sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size);
-        }
-      }
-    }
-    // finish the builder of every expanding node
-    for (size_t i = 0; i < this->qexpand.size(); ++i) {
-      const int nid = this->qexpand[i];
-      sbuilder[nid].Finalize(max_size);
-    }
-  }
-  // feature helper
-  BaseMaker::FMetaHelper feat_helper;
-  // temp space to map feature id to working index
-  std::vector<int> feat2workindex;
-  // set of index from fset that are real
-  std::vector<bst_uint> freal_set;
-  // thread temp data
-  std::vector< std::vector<BaseMaker::SketchEntry> > thread_sketch;
-  // used to hold statistics
-  std::vector< std::vector<TStats> > thread_stats;
-  // used to hold start pointer
-  std::vector< std::vector<HistEntry> > thread_hist;
-  // node statistics
-  std::vector<TStats> node_stats;
-  // summary array
-  std::vector<WXQSketch::SummaryContainer> summary_array;
-  // reducer for summary
-  rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
-  // per node, per feature sketch
-  std::vector< utils::WXQuantileSketch<bst_float, bst_float> > sketchs;
-};
-
-/*!
- * \brief histogram maker that proposes cut points from weighted quantile
- *        sketches, built by converting each row batch to column-major form
- * \tparam TStats statistics type, forwarded to HistMaker
- */
-template<typename TStats>
-class QuantileHistMaker: public HistMaker<TStats> {
- protected:
-  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
-  /*!
-   * \brief move every row to its node on the next level, build one sketch per
-   *        (work node, feature), synchronize the summaries and turn them into
-   *        the cut points stored in wspace
-   * \param gpair gradient pairs indexed by row; the hessian is the sketch weight
-   * \param p_fmat feature matrix, traversed through its row iterator
-   * \param info booster information; num_row sizes the sketches
-   * \param fset feature set (not read by this implementation)
-   * \param tree tree under construction
-   */
-  virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
-                                  IFMatrix *p_fmat,
-                                  const BoosterInfo &info,
-                                  const std::vector <bst_uint> &fset,
-                                  const RegTree &tree) {
-    // initialize the data structure
-    int nthread = BaseMaker::get_nthread();
-    sketchs.resize(this->qexpand.size() * tree.param.num_feature);
-    for (size_t i = 0; i < sketchs.size(); ++i) {
-      sketchs[i].Init(info.num_row, this->param.sketch_eps);
-    }
-    // start accumulating statistics
-    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
-    iter->BeforeFirst();
-    while (iter->Next()) {
-      const RowBatch &batch = iter->Value();
-      // parallel convert to column major format
-      utils::ParallelGroupBuilder<SparseBatch::Entry> builder(&col_ptr, &col_data, &thread_col_ptr);
-      builder.InitBudget(tree.param.num_feature, nthread);
-
-      const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
-      // first pass: advance each row to the next level and count column sizes
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < nbatch; ++i) {
-        RowBatch::Inst inst = batch[i];
-        const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
-        int nid = this->position[ridx];
-        if (nid >= 0) {
-          if (!tree[nid].is_leaf()) {
-            this->position[ridx] = nid = HistMaker<TStats>::NextLevel(inst, tree, nid);
-          }
-          if (this->node2workindex[nid] < 0) {
-            // the row landed in a node that is not being expanded: encode it as negative
-            this->position[ridx] = ~nid;
-          } else {
-            for (bst_uint j = 0; j < inst.length; ++j) {
-              builder.AddBudget(inst[j].index, omp_get_thread_num());
-            }
-          }
-        }
-      }
-      builder.InitStorage();
-      // second pass: scatter the entries of the active rows into the columns
-      #pragma omp parallel for schedule(static)
-      for (bst_omp_uint i = 0; i < nbatch; ++i) {
-        RowBatch::Inst inst = batch[i];
-        const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
-        const int nid = this->position[ridx];
-        if (nid >= 0) {
-          for (bst_uint j = 0; j < inst.length; ++j) {
-            // keep the ROW index in the entry: the sketch pass needs it to look
-            // up this row's hessian, and the node id can be recovered from position
-            builder.Push(inst[j].index,
-                         SparseBatch::Entry(ridx, inst[j].fvalue),
-                         omp_get_thread_num());
-          }
-        }
-      }
-      // start putting things into sketch
-      const bst_omp_uint nfeat = col_ptr.size() - 1;
-      #pragma omp parallel for schedule(dynamic, 1)
-      for (bst_omp_uint k = 0; k < nfeat; ++k) {
-        for (size_t i = col_ptr[k]; i < col_ptr[k+1]; ++i) {
-          const SparseBatch::Entry &e = col_data[i];
-          const bst_uint ridx = e.index;
-          // only rows with a non-negative position, i.e. rows sitting in a node
-          // with a valid work index, were pushed in the pass above
-          const int wid = this->node2workindex[this->position[ridx]];
-          // weight the value by the hessian of the row it came from
-          sketchs[wid * tree.param.num_feature + k].Push(e.fvalue, gpair[ridx].hess);
-        }
-      }
-    }
-    // setup maximum size
-    unsigned max_size = this->param.max_sketch_size();
-    // synchronize sketch
-    summary_array.resize(sketchs.size());
-    for (size_t i = 0; i < sketchs.size(); ++i) {
-      utils::WQuantileSketch<bst_float, bst_float>::SummaryContainer out;
-      sketchs[i].GetSummary(&out);
-      summary_array[i].Reserve(max_size);
-      summary_array[i].SetPrune(out, max_size);
-    }
-
-    size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
-    sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size());
-    // now we get the final result of sketch, setup the cut
-    this->wspace.cut.clear();
-    this->wspace.rptr.clear();
-    this->wspace.rptr.push_back(0);
-    for (size_t wid = 0; wid < this->qexpand.size(); ++wid) {
-      for (int fid = 0; fid < tree.param.num_feature; ++fid) {
-        const WXQSketch::Summary &a = summary_array[wid * tree.param.num_feature + fid];
-        // every summary point after the first becomes a cut, skipping non-increasing ones
-        for (size_t i = 1; i < a.size; ++i) {
-          bst_float cpt = a.data[i].value - rt_eps;
-          if (i == 1 || cpt > this->wspace.cut.back()) {
-            this->wspace.cut.push_back(cpt);
-          }
-        }
-        // push a value that is greater than anything
-        if (a.size != 0) {
-          bst_float cpt = a.data[a.size - 1].value;
-          // this must be bigger than last value in a scale
-          bst_float last = cpt + fabs(cpt) + rt_eps;
-          this->wspace.cut.push_back(last);
-        }
-        this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
-      }
-      // reserve last value for global statistics
-      this->wspace.cut.push_back(0.0f);
-      this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
-    }
-    utils::Assert(this->wspace.rptr.size() ==
-                  (tree.param.num_feature + 1) * this->qexpand.size() + 1,
-                  "cut space inconsistent");
-  }
-
- private:
-  // summary array
-  std::vector<WXQSketch::SummaryContainer> summary_array;
-  // reducer for summary
-  rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
-  // local temp column data structure
-  std::vector<size_t> col_ptr;
-  // local storage of column data; Entry::index holds the row index of the value
-  std::vector<SparseBatch::Entry> col_data;
-  // per-thread column pointers used by the parallel group builder
-  std::vector< std::vector<size_t> > thread_col_ptr;
-  // per node, per feature sketch
-  std::vector< utils::WQuantileSketch<bst_float, bst_float> > sketchs;
-};
-
-}  // namespace tree
-}  // namespace xgboost
-#endif  // XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_
--- a/src/tree/updater_prune-inl.hpp
+++ b/src/tree/updater_prune-inl.hpp
@@ -1,87 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file updater_prune-inl.hpp
- * \brief prune a tree given the statistics
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_
-#define XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_
-
-#include <vector>
-#include "./param.h"
-#include "./updater.h"
-#include "./updater_sync-inl.hpp"
-
-namespace xgboost {
-namespace tree {
-/*!
- * \brief pruner that prunes a tree after growing finishes:
- *        a split whose two children are both leaves is collapsed back into a
- *        leaf when param.need_prune says so, repeatedly towards the root
- */
-class TreePruner: public IUpdater {
- public:
-  // silent must have a defined value even when the "silent" parameter is never
-  // set, because DoPrune reads it; default to printing the pruning summary
-  TreePruner(void) : silent(0) {}
-  virtual ~TreePruner(void) {}
-  /*!
-   * \brief set a training parameter; forwarded to the training parameters and
-   *        to the synchronizer, "silent" is additionally kept locally
-   * \param name parameter name
-   * \param val parameter value as string
-   */
-  virtual void SetParam(const char *name, const char *val) {
-    using namespace std;
-    param.SetParam(name, val);
-    syncher.SetParam(name, val);
-    if (!strcmp(name, "silent")) silent = atoi(val);
-  }
-  /*!
-   * \brief prune every tree, then hand the trees to the synchronizer
-   * \param gpair gradient pairs, only forwarded to the synchronizer
-   * \param p_fmat feature matrix, only forwarded to the synchronizer
-   * \param info booster information, only forwarded to the synchronizer
-   * \param trees the trees to be pruned in place
-   */
-  virtual void Update(const std::vector<bst_gpair> &gpair,
-                      IFMatrix *p_fmat,
-                      const BoosterInfo &info,
-                      const std::vector<RegTree*> &trees) {
-    // rescale learning rate according to size of trees
-    float lr = param.learning_rate;
-    param.learning_rate = lr / trees.size();
-    for (size_t i = 0; i < trees.size(); ++i) {
-      this->DoPrune(*trees[i]);
-    }
-    // restore the learning rate changed above
-    param.learning_rate = lr;
-    syncher.Update(gpair, p_fmat, info, trees);
-  }
-
- private:
-  /*!
-   * \brief report leaf nid to its parent and prune the parent if possible
-   * \param tree the tree being pruned
-   * \param nid id of a leaf node
-   * \param depth depth of nid
-   * \param npruned number of nodes pruned so far
-   * \return updated number of pruned nodes
-   */
-  inline int TryPruneLeaf(RegTree &tree, int nid, int depth, int npruned) { // NOLINT(*)
-    if (tree[nid].is_root()) return npruned;
-    int pid = tree[nid].parent();
-    RegTree::NodeStat &s = tree.stat(pid);
-    // count how many children of the parent are known to be leaves
-    ++s.leaf_child_cnt;
-    if (s.leaf_child_cnt >= 2 && param.need_prune(s.loss_chg, depth - 1)) {
-      // need to be pruned
-      tree.ChangeToLeaf(pid, param.learning_rate * s.base_weight);
-      // tail recursion: the parent is now a leaf itself, two children were removed
-      return this->TryPruneLeaf(tree, pid, depth - 1, npruned+2);
-    } else {
-      return npruned;
-    }
-  }
-  /*! \brief do pruning of a tree */
-  inline void DoPrune(RegTree &tree) { // NOLINT(*)
-    int npruned = 0;
-    // initialize auxiliary statistics
-    for (int nid = 0; nid < tree.param.num_nodes; ++nid) {
-      tree.stat(nid).leaf_child_cnt = 0;
-    }
-    // start a pruning attempt from every leaf
-    for (int nid = 0; nid < tree.param.num_nodes; ++nid) {
-      if (tree[nid].is_leaf()) {
-        npruned = this->TryPruneLeaf(tree, nid, tree.GetDepth(nid), npruned);
-      }
-    }
-    if (silent == 0) {
-      utils::Printf("tree pruning end, %d roots, %d extra nodes, %d pruned nodes, max_depth=%d\n",
-                    tree.param.num_roots, tree.num_extra_nodes(), npruned, tree.MaxDepth());
-    }
-  }
-
- private:
-  // synchronizer
-  TreeSyncher syncher;
-  // when non-zero, the pruning summary is not printed
-  int silent;
-  // training parameter
-  TrainParam param;
-};
-}  // namespace tree
-}  // namespace xgboost
-#endif  // XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_
--- a/src/tree/updater_refresh-inl.hpp
+++ b/src/tree/updater_refresh-inl.hpp
@@ -1,157 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file updater_refresh-inl.hpp
- * \brief refresh the statistics and leaf value on the tree on the dataset
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_TREE_UPDATER_REFRESH_INL_HPP_
-#define XGBOOST_TREE_UPDATER_REFRESH_INL_HPP_
-
-#include <vector>
-#include <limits>
-#include "../sync/sync.h"
-#include "./param.h"
-#include "./updater.h"
-#include "../utils/omp.h"
-
-namespace xgboost {
-namespace tree {
-/*!
- * \brief updater that refreshes the node statistics and leaf values of
- *        existing trees on the given data, without changing tree structure
- * \tparam TStats statistics type accumulated per node
- */
-template<typename TStats>
-class TreeRefresher: public IUpdater {
- public:
-  virtual ~TreeRefresher(void) {}
-  // set training parameter
-  virtual void SetParam(const char *name, const char *val) {
-    param.SetParam(name, val);
-  }
-  /*!
-   * \brief recompute the statistics of every node of every tree from the data,
-   *        allreduce them, and rewrite base weights, loss changes and leaf values
-   * \param gpair gradient pairs, indexed by row
-   * \param p_fmat feature matrix, traversed through its row iterator
-   * \param info booster information, used to find the root of each row
-   * \param trees trees to refresh in place; nothing is done when empty
-   */
-  virtual void Update(const std::vector<bst_gpair> &gpair,
-                      IFMatrix *p_fmat,
-                      const BoosterInfo &info,
-                      const std::vector<RegTree*> &trees) {
-    if (trees.size() == 0) return;
-    // number of threads
-    // thread temporal space
-    std::vector< std::vector<TStats> > stemp;
-    std::vector<RegTree::FVec> fvec_temp;
-    // setup temp space for each thread
-    int nthread;
-    // NOTE(review): every thread of the region stores to nthread; they all
-    // store the same value
-    #pragma omp parallel
-    {
-      nthread = omp_get_num_threads();
-    }
-    fvec_temp.resize(nthread, RegTree::FVec());
-    stemp.resize(nthread, std::vector<TStats>());
-    #pragma omp parallel
-    {
-      int tid = omp_get_thread_num();
-      // one statistics slot per node, the trees laid out back to back
-      int num_nodes = 0;
-      for (size_t i = 0; i < trees.size(); ++i) {
-        num_nodes += trees[i]->param.num_nodes;
-      }
-      stemp[tid].resize(num_nodes, TStats(param));
-      std::fill(stemp[tid].begin(), stemp[tid].end(), TStats(param));
-      fvec_temp[tid].Init(trees[0]->param.num_feature);
-    }
-    // if it is C++11, use lazy evaluation for Allreduce,
-    // to gain speedup in recovery
-    // (without C++11 the block below is a plain compound statement that runs immediately)
-#if __cplusplus >= 201103L
-    auto lazy_get_stats = [&]()
-#endif
-    {
-      // start accumulating statistics
-      utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
-      iter->BeforeFirst();
-      while (iter->Next()) {
-        const RowBatch &batch = iter->Value();
-        utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
-                     "too large batch size ");
-        const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
-        #pragma omp parallel for schedule(static)
-        for (bst_omp_uint i = 0; i < nbatch; ++i) {
-          RowBatch::Inst inst = batch[i];
-          const int tid = omp_get_thread_num();
-          const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
-          RegTree::FVec &feats = fvec_temp[tid];
-          feats.Fill(inst);
-          // offset of the current tree inside this thread's statistics array
-          int offset = 0;
-          for (size_t j = 0; j < trees.size(); ++j) {
-            AddStats(*trees[j], feats, gpair, info, ridx,
-                     BeginPtr(stemp[tid]) + offset);
-            offset += trees[j]->param.num_nodes;
-          }
-          feats.Drop(inst);
-        }
-      }
-      // aggregate the statistics
-      // (fold the arrays of threads 1..nthread-1 into the array of thread 0)
-      int num_nodes = static_cast<int>(stemp[0].size());
-      #pragma omp parallel for schedule(static)
-      for (int nid = 0; nid < num_nodes; ++nid) {
-        for (int tid = 1; tid < nthread; ++tid) {
-          stemp[0][nid].Add(stemp[tid][nid]);
-        }
-      }
-    };
-#if __cplusplus >= 201103L
-    reducer.Allreduce(BeginPtr(stemp[0]), stemp[0].size(), lazy_get_stats);
-#else
-    reducer.Allreduce(BeginPtr(stemp[0]), stemp[0].size());
-#endif
-    // rescale learning rate according to size of trees
-    float lr = param.learning_rate;
-    param.learning_rate = lr / trees.size();
-    int offset = 0;
-    for (size_t i = 0; i < trees.size(); ++i) {
-      // refresh recursively, starting from every root of the tree
-      for (int rid = 0; rid < trees[i]->param.num_roots; ++rid) {
-        this->Refresh(BeginPtr(stemp[0]) + offset, rid, trees[i]);
-      }
-      offset += trees[i]->param.num_nodes;
-    }
-    // set learning rate back
-    param.learning_rate = lr;
-  }
-
- private:
-  /*!
-   * \brief add the gradient statistics of one row to every node on the path
-   *        the row takes through the tree, from its root down to a leaf
-   * \param tree the tree to traverse
-   * \param feat dense feature vector of the row
-   * \param gpair gradient pairs, indexed by row
-   * \param info booster information, gives the root node of the row
-   * \param ridx row index
-   * \param gstats statistics array of this tree, indexed by node id
-   */
-  inline static void AddStats(const RegTree &tree,
-                              const RegTree::FVec &feat,
-                              const std::vector<bst_gpair> &gpair,
-                              const BoosterInfo &info,
-                              const bst_uint ridx,
-                              TStats *gstats) {
-    // start from groups that belongs to current data
-    int pid = static_cast<int>(info.GetRoot(ridx));
-    gstats[pid].Add(gpair, info, ridx);
-    // traverse tree
-    while (!tree[pid].is_leaf()) {
-      unsigned split_index = tree[pid].split_index();
-      pid = tree.GetNext(pid, feat.fvalue(split_index), feat.is_missing(split_index));
-      gstats[pid].Add(gpair, info, ridx);
-    }
-  }
-  /*!
-   * \brief rewrite the statistics of node nid and, recursively, of its subtree
-   * \param gstats statistics array of this tree, indexed by node id
-   * \param nid node to refresh
-   * \param p_tree the tree to modify
-   */
-  inline void Refresh(const TStats *gstats,
-                      int nid, RegTree *p_tree) {
-    RegTree &tree = *p_tree;
-    tree.stat(nid).base_weight = static_cast<float>(gstats[nid].CalcWeight(param));
-    tree.stat(nid).sum_hess = static_cast<float>(gstats[nid].sum_hess);
-    gstats[nid].SetLeafVec(param, tree.leafvec(nid));
-    if (tree[nid].is_leaf()) {
-      tree[nid].set_leaf(tree.stat(nid).base_weight * param.learning_rate);
-    } else {
-      // loss change of a split: gain of both children minus gain of the node itself
-      tree.stat(nid).loss_chg = static_cast<float>(
-          gstats[tree[nid].cleft()].CalcGain(param) +
-          gstats[tree[nid].cright()].CalcGain(param) -
-          gstats[nid].CalcGain(param));
-      this->Refresh(gstats, tree[nid].cleft(), p_tree);
-      this->Refresh(gstats, tree[nid].cright(), p_tree);
-    }
-  }
-  // training parameter
-  TrainParam param;
-  // reducer
-  rabit::Reducer<TStats, TStats::Reduce> reducer;
-};
-
-}  // namespace tree
-}  // namespace xgboost
-#endif  // XGBOOST_TREE_UPDATER_REFRESH_INL_HPP_
--- a/src/tree/updater_skmaker-inl.hpp
+++ b/src/tree/updater_skmaker-inl.hpp
@@ -1,399 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file updater_skmaker-inl.hpp
- * \brief use approximation sketch to construct a tree,
-          a refresh is needed to make the statistics exactly correct
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_TREE_UPDATER_SKMAKER_INL_HPP_
-#define XGBOOST_TREE_UPDATER_SKMAKER_INL_HPP_
-
-#include <vector>
-#include <algorithm>
-#include "../sync/sync.h"
-#include "../utils/quantile.h"
-#include "./updater_basemaker-inl.hpp"
-
-namespace xgboost {
-namespace tree {
-class SketchMaker: public BaseMaker {
- public:
-  virtual ~SketchMaker(void) {}
-  /*!
-   * \brief build every tree in trees, with the learning rate divided by the
-   *        number of trees for the duration of the call
-   * \param gpair gradient pairs, indexed by row
-   * \param p_fmat feature matrix
-   * \param info booster information
-   * \param trees trees to be constructed in place
-   */
-  virtual void Update(const std::vector<bst_gpair> &gpair,
-                      IFMatrix *p_fmat,
-                      const BoosterInfo &info,
-                      const std::vector<RegTree*> &trees) {
-    // remember the configured rate, it is restored before returning
-    const float saved_rate = param.learning_rate;
-    param.learning_rate = saved_rate / trees.size();
-    typedef std::vector<RegTree*>::const_iterator TreeIter;
-    for (TreeIter it = trees.begin(); it != trees.end(); ++it) {
-      // single-tree overload does the actual construction
-      this->Update(gpair, p_fmat, info, *it);
-    }
-    param.learning_rate = saved_rate;
-  }
-
- protected:
-  /*!
-   * \brief grow a single tree level by level, up to param.max_depth levels
-   * \param gpair gradient pairs, indexed by row
-   * \param p_fmat feature matrix
-   * \param info booster information
-   * \param p_tree the tree to be constructed in place
-   */
-  inline void Update(const std::vector<bst_gpair> &gpair,
-                      IFMatrix *p_fmat,
-                      const BoosterInfo &info,
-                      RegTree *p_tree) {
-    RegTree &tree = *p_tree;
-    this->InitData(gpair, *p_fmat, info.root_index, tree);
-    for (int depth = 0; depth < param.max_depth; ++depth) {
-      // statistics -> sketches -> synchronized statistics -> splits -> new positions
-      this->GetNodeStats(gpair, *p_fmat, tree, info,
-                         &thread_stats, &node_stats);
-      this->BuildSketch(gpair, p_fmat, info, tree);
-      this->SyncNodeStats();
-      this->FindSplit(depth, gpair, p_fmat, info, p_tree);
-      this->ResetPositionCol(qexpand, p_fmat, tree);
-      this->UpdateQueueExpand(tree);
-      // stop as soon as the expand queue runs dry
-      if (qexpand.size() == 0) break;
-    }
-    // nodes still queued after the last level need up-to-date statistics
-    const bool has_pending = qexpand.size() != 0;
-    if (has_pending) {
-      this->GetNodeStats(gpair, *p_fmat, tree, info,
-                         &thread_stats, &node_stats);
-      this->SyncNodeStats();
-    }
-    // write the final statistics of every node into the tree
-    for (int nid = 0; nid < tree.param.num_nodes; ++nid) {
-      this->SetStats(nid, node_stats[nid], p_tree);
-      if (tree[nid].is_leaf()) continue;
-      // loss change of a split: gain of both children minus gain of the node itself
-      tree.stat(nid).loss_chg = static_cast<float>(
-          node_stats[tree[nid].cleft()].CalcGain(param) +
-          node_stats[tree[nid].cright()].CalcGain(param) -
-          node_stats[nid].CalcGain(param));
-    }
-    // whatever is still queued becomes a leaf
-    for (size_t k = 0; k < qexpand.size(); ++k) {
-      const int nid = qexpand[k];
-      tree[nid].set_leaf(tree.stat(nid).base_weight * param.learning_rate);
-    }
-  }
-  // define the sketch we want to use
-  typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
-
- private:
-  /*!
-   * \brief gradient statistics kept per node: positive and negative gradients
-   *        are summed separately (both as non-negative magnitudes), together
-   *        with the sum of hessians
-   */
-  struct SKStats {
-    /*! \brief sum of all positive gradient */
-    double pos_grad;
-    /*! \brief sum of all negative gradient, stored as a magnitude */
-    double neg_grad;
-    /*! \brief sum of hessian statistics */
-    double sum_hess;
-    /*! \brief default constructor, leaves the fields unset */
-    SKStats(void) {}
-    /*! \brief construct zeroed statistics; the parameter is not consulted */
-    explicit SKStats(const TrainParam &param)
-        : pos_grad(0.0), neg_grad(0.0), sum_hess(0.0) {
-    }
-    /*! \brief reset all sums to zero */
-    inline void Clear(void) {
-      pos_grad = 0.0;
-      neg_grad = 0.0;
-      sum_hess = 0.0;
-    }
-    /*! \brief accumulate the gradient pair of row ridx */
-    inline void Add(const std::vector<bst_gpair> &gpair,
-                    const BoosterInfo &info,
-                    bst_uint ridx) {
-      const bst_gpair &g = gpair[ridx];
-      const bool non_negative = g.grad >= 0.0f;
-      if (non_negative) {
-        pos_grad += g.grad;
-      } else {
-        // subtracting a negative gradient adds its magnitude
-        neg_grad -= g.grad;
-      }
-      sum_hess += g.hess;
-    }
-    /*! \brief calculate gain of the solution */
-    inline double CalcGain(const TrainParam &param) const {
-      const double net_grad = pos_grad - neg_grad;
-      return param.CalcGain(net_grad, sum_hess);
-    }
-    /*! \brief set current value to a - b */
-    inline void SetSubstract(const SKStats &a, const SKStats &b) {
-      sum_hess = a.sum_hess - b.sum_hess;
-      neg_grad = a.neg_grad - b.neg_grad;
-      pos_grad = a.pos_grad - b.pos_grad;
-    }
-    /*! \brief calculate leaf weight */
-    inline double CalcWeight(const TrainParam &param) const {
-      const double net_grad = pos_grad - neg_grad;
-      return param.CalcWeight(net_grad, sum_hess);
-    }
-    /*! \brief add statistics to the data */
-    inline void Add(const SKStats &b) {
-      pos_grad += b.pos_grad;
-      neg_grad += b.neg_grad;
-      sum_hess += b.sum_hess;
-    }
-    /*! \brief same as add, reduce is used in All Reduce */
-    inline static void Reduce(SKStats &a, const SKStats &b) { // NOLINT(*)
-      a.Add(b);
-    }
-    /*! \brief set leaf vector value based on statistics; does nothing for this type */
-    inline void SetLeafVec(const TrainParam &param, bst_float *vec) const {
-    }
-  };
-  /*!
-   * \brief build the sketches of all expanding nodes from the column data and
-   *        synchronize their pruned summaries into summary_array
-   * \param gpair gradient pairs, indexed by row
-   * \param p_fmat feature matrix, traversed through its column iterator
-   * \param info booster information; num_row sizes the sketches
-   * \param tree tree under construction
-   */
-  inline void BuildSketch(const std::vector<bst_gpair> &gpair,
-                          IFMatrix *p_fmat,
-                          const BoosterInfo &info,
-                          const RegTree &tree) {
-    // three sketches (positive gradient, negative gradient, hessian)
-    // for every (work node, feature) pair
-    sketchs.resize(this->qexpand.size() * tree.param.num_feature * 3);
-    for (size_t sid = 0; sid < sketchs.size(); ++sid) {
-      sketchs[sid].Init(info.num_row, this->param.sketch_eps);
-    }
-    thread_sketch.resize(this->get_nthread());
-    // a column of this length is reported to UpdateSketchCol as full
-    const size_t nrows = p_fmat->buffered_rowset().size();
-    utils::IIterator<ColBatch> *col_iter = p_fmat->ColIterator();
-    for (col_iter->BeforeFirst(); col_iter->Next();) {
-      const ColBatch &batch = col_iter->Value();
-      const bst_omp_uint ncol = static_cast<bst_omp_uint>(batch.size);
-      // columns are independent, process them in parallel
-      #pragma omp parallel for schedule(dynamic, 1)
-      for (bst_omp_uint i = 0; i < ncol; ++i) {
-        this->UpdateSketchCol(gpair, batch[i], tree,
-                              node_stats,
-                              batch.col_index[i],
-                              batch[i].length == nrows,
-                              &thread_sketch[omp_get_thread_num()]);
-      }
-    }
-    // prune every sketch summary to the common maximum size
-    const unsigned max_size = param.max_sketch_size();
-    summary_array.resize(sketchs.size());
-    for (size_t sid = 0; sid < sketchs.size(); ++sid) {
-      WXQSketch::SummaryContainer full_summary;
-      sketchs[sid].GetSummary(&full_summary);
-      summary_array[sid].Reserve(max_size);
-      summary_array[sid].SetPrune(full_summary, max_size);
-    }
-    // combine the summaries across workers
-    const size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
-    sketch_reducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size());
-  }
-  /*!
-   * \brief update sketch information in column fid: every expanding node owns
-   *        three sketches per feature, for positive gradients, negative
-   *        gradients (as magnitudes) and hessians
-   * \param gpair gradient pairs, indexed by row
-   * \param c the feature column; each entry carries a row index and a feature value
-   * \param tree tree under construction; param.num_nodes sizes the scratch
-   *        space and param.num_feature is the sketch stride per work node
-   * \param nstats per-node statistics, supply the totals when col_full is true
-   * \param fid feature index of the column
-   * \param col_full when true, the per-node totals are taken from nstats
-   *        instead of being summed from the column
-   * \param p_temp scratch vector of sketch builders, three slots per tree node
-   */
-  inline void UpdateSketchCol(const std::vector<bst_gpair> &gpair,
-                              const ColBatch::Inst &c,
-                              const RegTree &tree,
-                              const std::vector<SKStats> &nstats,
-                              bst_uint fid,
-                              bool col_full,
-                              std::vector<SketchEntry> *p_temp) {
-    // an empty column contributes nothing
-    if (c.length == 0) return;
-    // initialize sbuilder for use
-    std::vector<SketchEntry> &sbuilder = *p_temp;
-    sbuilder.resize(tree.param.num_nodes * 3);
-    // bind the three builders of every expanding node to their sketches
-    for (size_t i = 0; i < this->qexpand.size(); ++i) {
-      const unsigned nid = this->qexpand[i];
-      const unsigned wid = this->node2workindex[nid];
-      for (int k = 0; k < 3; ++k) {
-        sbuilder[3 * nid + k].sum_total = 0.0f;
-        sbuilder[3 * nid + k].sketch = &sketchs[(wid * tree.param.num_feature + fid) * 3 + k];
-      }
-    }
-    if (!col_full) {
-      // first pass: sum the weights that will go into each of the three sketches
-      for (bst_uint j = 0; j < c.length; ++j) {
-        const bst_uint ridx = c[j].index;
-        const int nid = this->position[ridx];
-        // rows with a negative position are skipped
-        if (nid >= 0) {
-          const bst_gpair &e = gpair[ridx];
-          if (e.grad >= 0.0f) {
-            sbuilder[3 * nid + 0].sum_total += e.grad;
-          } else {
-            sbuilder[3 * nid + 1].sum_total -= e.grad;
-          }
-          sbuilder[3 * nid + 2].sum_total += e.hess;
-        }
-      }
-    } else {
-      // the totals per node are already known, reuse them
-      for (size_t i = 0; i < this->qexpand.size(); ++i) {
-        const unsigned nid = this->qexpand[i];
-        sbuilder[3 * nid + 0].sum_total = static_cast<bst_float>(nstats[nid].pos_grad);
-        sbuilder[3 * nid + 1].sum_total = static_cast<bst_float>(nstats[nid].neg_grad);
-        sbuilder[3 * nid + 2].sum_total = static_cast<bst_float>(nstats[nid].sum_hess);
-      }
-    }
-    // if only one value, no need to do second pass
-    // NOTE(review): comparing only the first and last entry presumes the column
-    // is sorted by fvalue -- confirm against the column iterator contract
-    if (c[0].fvalue  == c[c.length-1].fvalue) {
-      for (size_t i = 0; i < this->qexpand.size(); ++i) {
-        const int nid = this->qexpand[i];
-        for (int k = 0; k < 3; ++k) {
-          sbuilder[3 * nid + k].sketch->Push(c[0].fvalue,
-                                             static_cast<bst_float>(
-                                                 sbuilder[3 * nid + k].sum_total));
-        }
-      }
-      return;
-    }
-    // two pass scan
-    unsigned max_size = param.max_sketch_size();
-    for (size_t i = 0; i < this->qexpand.size(); ++i) {
-      const int nid = this->qexpand[i];
-      for (int k = 0; k < 3; ++k) {
-        sbuilder[3 * nid + k].Init(max_size);
-      }
-    }
-    // second pass, build the sketch
-    for (bst_uint j = 0; j < c.length; ++j) {
-      const bst_uint ridx = c[j].index;
-      const int nid = this->position[ridx];
-      if (nid >= 0) {
-        const bst_gpair &e = gpair[ridx];
-        // the gradient goes to exactly one of the two gradient sketches,
-        // the hessian always goes to the third
-        if (e.grad >= 0.0f) {
-          sbuilder[3 * nid + 0].Push(c[j].fvalue, e.grad, max_size);
-        } else {
-          sbuilder[3 * nid + 1].Push(c[j].fvalue, -e.grad, max_size);
-        }
-        sbuilder[3 * nid + 2].Push(c[j].fvalue, e.hess, max_size);
-      }
-    }
-    // finish the builders of every expanding node
-    for (size_t i = 0; i < this->qexpand.size(); ++i) {
-      const int nid = this->qexpand[i];
-      for (int k = 0; k < 3; ++k) {
-        sbuilder[3 * nid + k].Finalize(max_size);
-      }
-    }
-  }
-  /*!
-   * \brief allreduce the statistics of the nodes in the expand queue:
-   *        they are packed into a contiguous buffer, reduced, and copied back
-   */
-  inline void SyncNodeStats(void) {
-    utils::Assert(qexpand.size() != 0, "qexpand must not be empty");
-    const size_t nwork = qexpand.size();
-    // gather: slot k holds the statistics of the k-th queued node
-    std::vector<SKStats> packed(nwork);
-    for (size_t k = 0; k < nwork; ++k) {
-      packed[k] = node_stats[qexpand[k]];
-    }
-    stats_reducer.Allreduce(BeginPtr(packed), packed.size());
-    // scatter the reduced values back to their nodes
-    for (size_t k = 0; k < nwork; ++k) {
-      node_stats[qexpand[k]] = packed[k];
-    }
-  }
-  /*!
-   * \brief pick the best split of every node in the expand queue from the
-   *        synchronized sketch summaries and apply it to the tree
-   * \param depth current depth (not read in this function)
-   * \param gpair gradient pairs (not read in this function)
-   * \param p_fmat feature matrix (not read in this function)
-   * \param info booster information (not read in this function)
-   * \param p_tree the tree to modify
-   */
-  inline void FindSplit(int depth,
-                        const std::vector<bst_gpair> &gpair,
-                        IFMatrix *p_fmat,
-                        const BoosterInfo &info,
-                        RegTree *p_tree) {
-    const bst_uint num_feature = p_tree->param.num_feature;
-    // get the best split condition for each node
-    std::vector<SplitEntry> sol(qexpand.size());
-    bst_omp_uint nexpand = static_cast<bst_omp_uint>(qexpand.size());
-    // each iteration writes only its own sol[wid]
-    #pragma omp parallel for schedule(dynamic, 1)
-    for (bst_omp_uint wid = 0; wid < nexpand; ++wid) {
-      const int nid = qexpand[wid];
-      utils::Assert(node2workindex[nid] == static_cast<int>(wid),
-                    "node2workindex inconsistent");
-      SplitEntry &best = sol[wid];
-      for (bst_uint fid = 0; fid < num_feature; ++fid) {
-        // three consecutive summaries belong to each (work node, feature) pair
-        unsigned base = (wid * p_tree->param.num_feature + fid) * 3;
-        EnumerateSplit(summary_array[base + 0],
-                       summary_array[base + 1],
-                       summary_array[base + 2],
-                       node_stats[nid], fid, &best);
-      }
-    }
-    // get the best result, we can synchronize the solution
-    for (bst_omp_uint wid = 0; wid < nexpand; ++wid) {
-      const int nid = qexpand[wid];
-      const SplitEntry &best = sol[wid];
-      // set up the values
-      p_tree->stat(nid).loss_chg = best.loss_chg;
-      this->SetStats(nid, node_stats[nid], p_tree);
-      // now we know the solution in snode[nid], set split
-      if (best.loss_chg > rt_eps) {
-        p_tree->AddChilds(nid);
-        (*p_tree)[nid].set_split(best.split_index(),
-                                 best.split_value, best.default_left());
-        // mark both children as fresh leaves by passing 0 as the second argument
-        (*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0);
-        (*p_tree)[(*p_tree)[nid].cright()].set_leaf(0.0f, 0);
-      } else {
-        // no split is worth it, the node becomes a leaf
-        (*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
-      }
-    }
-  }
-  // set statistics on ptree
-  inline void SetStats(int nid, const SKStats &node_sum, RegTree *p_tree) {
-    p_tree->stat(nid).base_weight = static_cast<float>(node_sum.CalcWeight(param));
-    p_tree->stat(nid).sum_hess = static_cast<float>(node_sum.sum_hess);
-    node_sum.SetLeafVec(param, p_tree->leafvec(nid));
-  }
-  inline void EnumerateSplit(const WXQSketch::Summary &pos_grad,
-                             const WXQSketch::Summary &neg_grad,
-                             const WXQSketch::Summary &sum_hess,
-                             const SKStats &node_sum,
-                             bst_uint fid,
-                             SplitEntry *best) {
-    if (sum_hess.size == 0) return;
-    double root_gain = node_sum.CalcGain(param);
-    std::vector<bst_float> fsplits;
-    for (size_t i = 0; i < pos_grad.size; ++i) {
-      fsplits.push_back(pos_grad.data[i].value);
-    }
-    for (size_t i = 0; i < neg_grad.size; ++i) {
-      fsplits.push_back(neg_grad.data[i].value);
-    }
-    for (size_t i = 0; i < sum_hess.size; ++i) {
-      fsplits.push_back(sum_hess.data[i].value);
-    }
-    std::sort(fsplits.begin(), fsplits.end());
-    fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
-    // sum feature
-    SKStats feat_sum;
-    feat_sum.pos_grad = pos_grad.data[pos_grad.size - 1].rmax;
-    feat_sum.neg_grad = neg_grad.data[neg_grad.size - 1].rmax;
-    feat_sum.sum_hess = sum_hess.data[sum_hess.size - 1].rmax;
-    size_t ipos = 0, ineg = 0, ihess = 0;
-    for (size_t i = 1; i < fsplits.size(); ++i) {
-      WXQSketch::Entry pos = pos_grad.Query(fsplits[i], ipos);
-      WXQSketch::Entry neg = neg_grad.Query(fsplits[i], ineg);
-      WXQSketch::Entry hess = sum_hess.Query(fsplits[i], ihess);
-      SKStats s, c;
-      s.pos_grad = 0.5f * (pos.rmin + pos.rmax - pos.wmin);
-      s.neg_grad = 0.5f * (neg.rmin + neg.rmax - neg.wmin);
-      s.sum_hess = 0.5f * (hess.rmin + hess.rmax - hess.wmin);
-      c.SetSubstract(node_sum, s);
-      // forward
-      if (s.sum_hess >= param.min_child_weight &&
-          c.sum_hess >= param.min_child_weight) {
-        double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
-        best->Update(static_cast<bst_float>(loss_chg), fid, fsplits[i], false);
-      }
-      // backward
-      c.SetSubstract(feat_sum, s);
-      s.SetSubstract(node_sum, c);
-      if (s.sum_hess >= param.min_child_weight &&
-          c.sum_hess >= param.min_child_weight) {
-        double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
-        best->Update(static_cast<bst_float>(loss_chg), fid, fsplits[i], true);
-      }
-    }
-    {
-      // all including
-      SKStats s = feat_sum, c;
-      c.SetSubstract(node_sum, s);
-      if (s.sum_hess >= param.min_child_weight &&
-          c.sum_hess >= param.min_child_weight) {
-        bst_float cpt = fsplits.back();
-        double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
-        best->Update(static_cast<bst_float>(loss_chg), fid, cpt + fabsf(cpt) + 1.0f, false);
-      }
-    }
-  }
-
-  // thread temp data
-  // used to hold temporal sketch
-  std::vector< std::vector<SketchEntry> > thread_sketch;
-  // used to hold statistics
-  std::vector< std::vector<SKStats> > thread_stats;
-  // node statistics
-  std::vector<SKStats> node_stats;
-  // summary array
-  std::vector<WXQSketch::SummaryContainer> summary_array;
-  // reducer for summary
-  rabit::Reducer<SKStats, SKStats::Reduce> stats_reducer;
-  // reducer for summary
-  rabit::SerializeReducer<WXQSketch::SummaryContainer> sketch_reducer;
-  // per node, per feature sketch
-  std::vector< utils::WXQuantileSketch<bst_float, bst_float> > sketchs;
-};
-}  // namespace tree
-}  // namespace xgboost
-#endif  // XGBOOST_TREE_UPDATER_SKMAKER_INL_HPP_
--- a/src/tree/updater_sync-inl.hpp
+++ b/src/tree/updater_sync-inl.hpp
@@ -1,56 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file updater_sync-inl.hpp
- * \brief synchronize the tree in all distributed nodes
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_TREE_UPDATER_SYNC_INL_HPP_
-#define XGBOOST_TREE_UPDATER_SYNC_INL_HPP_
-
-#include <vector>
-#include <string>
-#include <limits>
-#include "../sync/sync.h"
-#include "./updater.h"
-
-namespace xgboost {
-namespace tree {
-/*!
- * \brief syncher that synchronize the tree in all distributed nodes
- * can implement various strategies, so far it is always set to node 0's tree
- */
-class TreeSyncher: public IUpdater {
- public:
-  virtual ~TreeSyncher(void) {}
-  virtual void SetParam(const char *name, const char *val) {
-  }
-  // update the tree, do pruning
-  virtual void Update(const std::vector<bst_gpair> &gpair,
-                      IFMatrix *p_fmat,
-                      const BoosterInfo &info,
-                      const std::vector<RegTree*> &trees) {
-    this->SyncTrees(trees);
-  }
-
- private:
-  // synchronize the trees in different nodes, take tree from rank 0
-  inline void SyncTrees(const std::vector<RegTree *> &trees) {
-    if (rabit::GetWorldSize() == 1) return;
-    std::string s_model;
-    utils::MemoryBufferStream fs(&s_model);
-    int rank = rabit::GetRank();
-    if (rank == 0) {
-      for (size_t i = 0; i < trees.size(); ++i) {
-        trees[i]->SaveModel(fs);
-      }
-    }
-    fs.Seek(0);
-    rabit::Broadcast(&s_model, 0);
-    for (size_t i = 0; i < trees.size(); ++i) {
-      trees[i]->LoadModel(fs);
-    }
-  }
-};
-}  // namespace tree
-}  // namespace xgboost
-#endif  // XGBOOST_TREE_UPDATER_SYNC_INL_HPP_
--- a/src/utils/base64-inl.h
+++ b/src/utils/base64-inl.h
@@ -1,267 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file base64.h
- * \brief data stream support to input and output from/to base64 stream
- * base64 is easier to store and pass as text format in mapreduce
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_UTILS_BASE64_INL_H_
-#define XGBOOST_UTILS_BASE64_INL_H_
-
-#include <cctype>
-#include <cstdio>
-#include <string>
-#include "./io.h"
-
-namespace xgboost {
-namespace utils {
-/*! \brief buffer reader of the stream that allows you to get */
-class StreamBufferReader {
- public:
-  explicit StreamBufferReader(size_t buffer_size)
-      :stream_(NULL),
-       read_len_(1), read_ptr_(1) {
-    buffer_.resize(buffer_size);
-  }
-  /*!
-   * \brief set input stream
-   */
-  inline void set_stream(IStream *stream) {
-    stream_ = stream;
-    read_len_ = read_ptr_ = 1;
-  }
-  /*!
-   * \brief allows quick read using get char
-   */
-  inline char GetChar(void) {
-    while (true) {
-      if (read_ptr_ < read_len_) {
-        return buffer_[read_ptr_++];
-      } else {
-        read_len_ = stream_->Read(&buffer_[0], buffer_.length());
-        if (read_len_ == 0) return EOF;
-        read_ptr_ = 0;
-      }
-    }
-  }
-  /*! \brief whether we are reaching the end of file */
-  inline bool AtEnd(void) const {
-    return read_len_ == 0;
-  }
-
- private:
-  /*! \brief the underlying stream */
-  IStream *stream_;
-  /*! \brief buffer to hold data */
-  std::string buffer_;
-  /*! \brief length of valid data in buffer */
-  size_t read_len_;
-  /*! \brief pointer in the buffer */
-  size_t read_ptr_;
-};
-
-/*! \brief namespace of base64 decoding and encoding table */
-namespace base64 {
-const char DecodeTable[] = {
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  62,  // '+'
-  0, 0, 0,
-  63,  // '/'
-  52, 53, 54, 55, 56, 57, 58, 59, 60, 61,  // '0'-'9'
-  0, 0, 0, 0, 0, 0, 0,
-  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
-  13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,  // 'A'-'Z'
-  0, 0, 0, 0, 0, 0,
-  26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
-  39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,  // 'a'-'z'
-};
-static const char EncodeTable[] =
-    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-}  // namespace base64
-/*! \brief the stream that reads from base64, note we take from file pointers */
-class Base64InStream: public IStream {
- public:
-  explicit Base64InStream(IStream *fs) : reader_(256) {
-    reader_.set_stream(fs);
-    num_prev = 0; tmp_ch = 0;
-  }
-  /*!
-   * \brief initialize the stream position to beginning of next base64 stream
-   * call this function before actually start read
-   */
-  inline void InitPosition(void) {
-    // get a character
-    do {
-      tmp_ch = reader_.GetChar();
-    } while (isspace(tmp_ch));
-  }
-  /*! \brief whether current position is end of a base64 stream */
-  inline bool IsEOF(void) const {
-    return num_prev == 0 && (tmp_ch == EOF || isspace(tmp_ch));
-  }
-  virtual size_t Read(void *ptr, size_t size) {
-    using base64::DecodeTable;
-    if (size == 0) return 0;
-    // use tlen to record left size
-    size_t tlen = size;
-    unsigned char *cptr = static_cast<unsigned char*>(ptr);
-    // if anything left, load from previous buffered result
-    if (num_prev != 0) {
-      if (num_prev == 2) {
-        if (tlen >= 2) {
-          *cptr++ = buf_prev[0];
-          *cptr++ = buf_prev[1];
-          tlen -= 2;
-          num_prev = 0;
-        } else {
-          // assert tlen == 1
-          *cptr++ = buf_prev[0]; --tlen;
-          buf_prev[0] = buf_prev[1];
-          num_prev = 1;
-        }
-      } else {
-        // assert num_prev == 1
-        *cptr++ = buf_prev[0]; --tlen; num_prev = 0;
-      }
-    }
-    if (tlen == 0) return size;
-    int nvalue;
-    // note: everything goes with 4 bytes in Base64
-    // so we process 4 bytes a unit
-    while (tlen && tmp_ch != EOF && !isspace(tmp_ch)) {
-      // first byte
-      nvalue = DecodeTable[tmp_ch] << 18;
-      {
-        // second byte
-        utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)),
-                     "invalid base64 format");
-        nvalue |= DecodeTable[tmp_ch] << 12;
-        *cptr++ = (nvalue >> 16) & 0xFF; --tlen;
-      }
-      {
-        // third byte
-        utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)),
-                     "invalid base64 format");
-        // handle termination
-        if (tmp_ch == '=') {
-          utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == '='), "invalid base64 format");
-          utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == EOF || isspace(tmp_ch)),
-                       "invalid base64 format");
-          break;
-        }
-        nvalue |= DecodeTable[tmp_ch] << 6;
-        if (tlen) {
-          *cptr++ = (nvalue >> 8) & 0xFF; --tlen;
-        } else {
-          buf_prev[num_prev++] = (nvalue >> 8) & 0xFF;
-        }
-      }
-      {
-        // fourth byte
-        utils::Check((tmp_ch = reader_.GetChar(), tmp_ch != EOF && !isspace(tmp_ch)),
-                     "invalid base64 format");
-        if (tmp_ch == '=') {
-          utils::Check((tmp_ch = reader_.GetChar(), tmp_ch == EOF || isspace(tmp_ch)),
-                       "invalid base64 format");
-          break;
-        }
-        nvalue |= DecodeTable[tmp_ch];
-        if (tlen) {
-          *cptr++ = nvalue & 0xFF; --tlen;
-        } else {
-          buf_prev[num_prev ++] = nvalue & 0xFF;
-        }
-      }
-      // get next char
-      tmp_ch = reader_.GetChar();
-    }
-    if (kStrictCheck) {
-      utils::Check(tlen == 0, "Base64InStream: read incomplete");
-    }
-    return size - tlen;
-  }
-  virtual void Write(const void *ptr, size_t size) {
-    utils::Error("Base64InStream do not support write");
-  }
-
- private:
-  StreamBufferReader reader_;
-  int tmp_ch;
-  int num_prev;
-  unsigned char buf_prev[2];
-  // whether we need to do strict check
-  static const bool kStrictCheck = false;
-};
-/*! \brief the stream that write to base64, note we take from file pointers */
-class Base64OutStream: public IStream {
- public:
-  explicit Base64OutStream(IStream *fp) : fp(fp) {
-    buf_top = 0;
-  }
-  virtual void Write(const void *ptr, size_t size) {
-    using base64::EncodeTable;
-    size_t tlen = size;
-    const unsigned char *cptr = static_cast<const unsigned char*>(ptr);
-    while (tlen) {
-      while (buf_top < 3  && tlen != 0) {
-        buf[++buf_top] = *cptr++; --tlen;
-      }
-      if (buf_top == 3) {
-        // flush 4 bytes out
-        PutChar(EncodeTable[buf[1] >> 2]);
-        PutChar(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F]);
-        PutChar(EncodeTable[((buf[2] << 2) | (buf[3] >> 6)) & 0x3F]);
-        PutChar(EncodeTable[buf[3] & 0x3F]);
-        buf_top = 0;
-      }
-    }
-  }
-  virtual size_t Read(void *ptr, size_t size) {
-    utils::Error("Base64OutStream do not support read");
-    return 0;
-  }
-  /*!
-   * \brief finish writing of all current base64 stream, do some post processing
-   * \param endch character to put to end of stream, if it is EOF, then nothing will be done
-   */
-  inline void Finish(char endch = EOF) {
-    using base64::EncodeTable;
-    if (buf_top == 1) {
-      PutChar(EncodeTable[buf[1] >> 2]);
-      PutChar(EncodeTable[(buf[1] << 4) & 0x3F]);
-      PutChar('=');
-      PutChar('=');
-    }
-    if (buf_top == 2) {
-      PutChar(EncodeTable[buf[1] >> 2]);
-      PutChar(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F]);
-      PutChar(EncodeTable[(buf[2] << 2) & 0x3F]);
-      PutChar('=');
-    }
-    buf_top = 0;
-    if (endch != EOF) PutChar(endch);
-    this->Flush();
-  }
-
- private:
-  IStream *fp;
-  int buf_top;
-  unsigned char buf[4];
-  std::string out_buf;
-  static const size_t kBufferSize = 256;
-
-  inline void PutChar(char ch) {
-    out_buf += ch;
-    if (out_buf.length() >= kBufferSize) Flush();
-  }
-  inline void Flush(void) {
-    if (out_buf.length() != 0) {
-      fp->Write(&out_buf[0], out_buf.length());
-      out_buf.clear();
-    }
-  }
-};
-}  // namespace utils
-}  // namespace xgboost
-#endif  // XGBOOST_UTILS_BASE64_INL_H_
--- a/src/utils/bitmap.h
+++ b/src/utils/bitmap.h
@@ -1,68 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file bitmap.h
- * \brief a simple implement of bitmap
- *  NOTE: bitmap is only threadsafe per word access, remember this when using bitmap
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_UTILS_BITMAP_H_
-#define XGBOOST_UTILS_BITMAP_H_
-
-#include <vector>
-#include "./utils.h"
-#include "./omp.h"
-
-namespace xgboost {
-namespace utils {
-/*! \brief bit map that contains set of bit indicators */
-struct BitMap {
-  /*! \brief internal data structure */
-  std::vector<uint32_t> data;
-  /*!
-   * \brief resize the bitmap to be certain size
-   * \param size the size of bitmap
-   */
-  inline void Resize(size_t size) {
-    data.resize((size + 31U) >> 5, 0);
-  }
-  /*!
-   * \brief query the i-th position of bitmap
-   * \param i the position in
-   */
-  inline bool Get(size_t i) const {
-    return (data[i >> 5] >> (i & 31U)) & 1U;
-  }
-  /*!
-   * \brief set i-th position to true
-   * \param i position index
-   */
-  inline void SetTrue(size_t i) {
-    data[i >> 5] |= (1 << (i & 31U));
-  }
-  /*! \brief initialize the value of bit map from vector of bool*/
-  inline void InitFromBool(const std::vector<int> &vec) {
-    this->Resize(vec.size());
-    // parallel over the full cases
-    bst_omp_uint nsize = static_cast<bst_omp_uint>(vec.size() / 32);
-    #pragma omp parallel for schedule(static)
-    for (bst_omp_uint i = 0; i < nsize; ++i) {
-      uint32_t res = 0;
-      for (int k = 0; k < 32; ++k) {
-        int bit = vec[(i << 5) | k];
-        res |= (bit << k);
-      }
-      data[i] = res;
-    }
-    if (nsize != vec.size()) data.back() = 0;
-    for (size_t i = nsize; i < vec.size(); ++i) {
-      if (vec[i]) this->SetTrue(i);
-    }
-  }
-  /*! \brief clear the bitmap, set all places to false */
-  inline void Clear(void) {
-    std::fill(data.begin(), data.end(), 0U);
-  }
-};
-}  // namespace utils
-}  // namespace xgboost
-#endif  // XGBOOST_UTILS_BITMAP_H_
--- a/src/utils/config.h
+++ b/src/utils/config.h
@@ -1,194 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file config.h
- * \brief helper class to load in configures from file
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_UTILS_CONFIG_H_
-#define XGBOOST_UTILS_CONFIG_H_
-
-#include <cstdio>
-#include <cstring>
-#include <string>
-#include <istream>
-#include <fstream>
-#include "./utils.h"
-
-namespace xgboost {
-namespace utils {
-/*!
- * \brief base implementation of config reader
- */
-class ConfigReaderBase {
- public:
-  /*!
-   * \brief get current name, called after Next returns true
-   * \return current parameter name
-   */
-  inline const char *name(void) const {
-    return s_name.c_str();
-  }
-  /*!
-   * \brief get current value, called after Next returns true
-   * \return current parameter value
-   */
-  inline const char *val(void) const {
-    return s_val.c_str();
-  }
-  /*!
-   * \brief move iterator to next position
-   * \return true if there is value in next position
-   */
-  inline bool Next(void) {
-    while (!this->IsEnd()) {
-      GetNextToken(&s_name);
-      if (s_name == "=") return false;
-      if (GetNextToken(&s_buf) || s_buf != "=")  return false;
-      if (GetNextToken(&s_val) || s_val == "=")  return false;
-      return true;
-    }
-    return false;
-  }
-  // called before usage
-  inline void Init(void) {
-    ch_buf = this->GetChar();
-  }
-
- protected:
-  /*!
-   * \brief to be implemented by subclass,
-   * get next token, return EOF if end of file
-   */
-  virtual char GetChar(void) = 0;
-  /*! \brief to be implemented by child, check if end of stream */
-  virtual bool IsEnd(void) = 0;
-
- private:
-  char ch_buf;
-  std::string s_name, s_val, s_buf;
-
-  inline void SkipLine(void) {
-    do {
-      ch_buf = this->GetChar();
-    } while (ch_buf != EOF && ch_buf != '\n' && ch_buf != '\r');
-  }
-
-  inline void ParseStr(std::string *tok) {
-    while ((ch_buf = this->GetChar()) != EOF) {
-      switch (ch_buf) {
-        case '\\': *tok += this->GetChar(); break;
-        case '\"': return;
-        case '\r':
-        case '\n': Error("ConfigReader: unterminated string");
-        default: *tok += ch_buf;
-      }
-    }
-    Error("ConfigReader: unterminated string");
-  }
-  inline void ParseStrML(std::string *tok) {
-    while ((ch_buf = this->GetChar()) != EOF) {
-      switch (ch_buf) {
-        case '\\': *tok += this->GetChar(); break;
-        case '\'': return;
-        default: *tok += ch_buf;
-      }
-    }
-    Error("unterminated string");
-  }
-  // return newline
-  inline bool GetNextToken(std::string *tok) {
-    tok->clear();
-    bool new_line = false;
-    while (ch_buf != EOF) {
-      switch (ch_buf) {
-        case '#' : SkipLine(); new_line = true; break;
-        case '\"':
-          if (tok->length() == 0) {
-            ParseStr(tok); ch_buf = this->GetChar(); return new_line;
-          } else {
-            Error("ConfigReader: token followed directly by string");
-          }
-        case '\'':
-          if (tok->length() == 0) {
-            ParseStrML(tok); ch_buf = this->GetChar(); return new_line;
-          } else {
-            Error("ConfigReader: token followed directly by string");
-          }
-        case '=':
-          if (tok->length() == 0) {
-            ch_buf = this->GetChar();
-            *tok = '=';
-          }
-          return new_line;
-        case '\r':
-        case '\n':
-          if (tok->length() == 0) new_line = true;
-        case '\t':
-        case ' ' :
-          ch_buf = this->GetChar();
-          if (tok->length() != 0) return new_line;
-          break;
-        default:
-          *tok += ch_buf;
-          ch_buf = this->GetChar();
-          break;
-      }
-    }
-    if (tok->length() == 0) {
-      return true;
-    } else {
-      return false;
-    }
-  }
-};
-/*!
- * \brief an iterator use stream base, allows use all types of istream
- */
-class ConfigStreamReader: public ConfigReaderBase {
- public:
-  /*!
-   * \brief constructor
-   * \param istream input stream
-   */
-  explicit ConfigStreamReader(std::istream &fin) : fin(fin) {}
-
- protected:
-  virtual char GetChar(void) {
-    return fin.get();
-  }
-  /*! \brief to be implemented by child, check if end of stream */
-  virtual bool IsEnd(void) {
-    return fin.eof();
-  }
-
- private:
-  std::istream &fin;
-};
-
-/*!
- * \brief an iterator that iterates over a configure file and gets the configures
- */
-class ConfigIterator: public ConfigStreamReader {
- public:
-  /*!
-   * \brief constructor
-   * \param fname name of configure file
-   */
-  explicit ConfigIterator(const char *fname) : ConfigStreamReader(fi) {
-    fi.open(fname);
-    if (fi.fail()) {
-      utils::Error("cannot open file %s", fname);
-    }
-    ConfigReaderBase::Init();
-  }
-  /*! \brief destructor */
-  ~ConfigIterator(void) {
-    fi.close();
-  }
-
- private:
-  std::ifstream fi;
-};
-}  // namespace utils
-}  // namespace xgboost
-#endif  // XGBOOST_UTILS_CONFIG_H_
--- a/src/utils/fmap.h
+++ b/src/utils/fmap.h
@@ -1,83 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file fmap.h
- * \brief helper class that holds the feature names and interpretations
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_UTILS_FMAP_H_
-#define XGBOOST_UTILS_FMAP_H_
-
-#include <vector>
-#include <string>
-#include <cstring>
-#include "./utils.h"
-
-namespace xgboost {
-namespace utils {
-/*! \brief helper class that holds the feature names and interpretations */
-class FeatMap {
- public:
-  enum Type {
-    kIndicator = 0,
-    kQuantitive = 1,
-    kInteger = 2,
-    kFloat = 3
-  };
-  // function definitions
-  /*! \brief load feature map from text format */
-  inline void LoadText(const char *fname) {
-    std::FILE *fi = utils::FopenCheck(fname, "r");
-    this->LoadText(fi);
-    std::fclose(fi);
-  }
-  /*! \brief load feature map from text format */
-  inline void LoadText(std::FILE *fi) {
-    int fid;
-    char fname[1256], ftype[1256];
-    while (std::fscanf(fi, "%d\t%[^\t]\t%s\n", &fid, fname, ftype) == 3) {
-      this->PushBack(fid, fname, ftype);
-    }
-  }
-  /*!\brief push back feature map */
-  inline void PushBack(int fid, const char *fname, const char *ftype) {
-    utils::Check(fid == static_cast<int>(names_.size()), "invalid fmap format");
-    names_.push_back(std::string(fname));
-    types_.push_back(GetType(ftype));
-  }
-  inline void Clear(void) {
-    names_.clear(); types_.clear();
-  }
-  /*! \brief number of known features */
-  size_t size(void) const {
-    return names_.size();
-  }
-  /*! \brief return name of specific feature */
-  const char* name(size_t idx) const {
-    utils::Assert(idx < names_.size(), "utils::FMap::name feature index exceed bound");
-    return names_[idx].c_str();
-  }
-  /*! \brief return type of specific feature */
-  const Type& type(size_t idx) const {
-    utils::Assert(idx < names_.size(), "utils::FMap::type feature index exceed bound");
-    return types_[idx];
-  }
-
- private:
-  inline static Type GetType(const char *tname) {
-    using namespace std;
-    if (!strcmp("i", tname)) return kIndicator;
-    if (!strcmp("q", tname)) return kQuantitive;
-    if (!strcmp("int", tname)) return kInteger;
-    if (!strcmp("float", tname)) return kFloat;
-    utils::Error("unknown feature type, use i for indicator and q for quantity");
-    return kIndicator;
-  }
-  /*! \brief name of the feature */
-  std::vector<std::string> names_;
-  /*! \brief type of the feature */
-  std::vector<Type> types_;
-};
-
-}  // namespace utils
-}  // namespace xgboost
-#endif  // XGBOOST_UTILS_FMAP_H_
--- a/src/utils/group_data.h
+++ b/src/utils/group_data.h
@@ -1,114 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file group_data.h
- * \brief this file defines utils to group data by integer keys
- *     Input: given input sequence (key,value), (k1,v1), (k2,v2)
- *     Ouptupt: an array of values data = [v1,v2,v3 .. vn]
- *              and a group pointer ptr,
- *              data[ptr[k]:ptr[k+1]] contains values that corresponds to key k
- *
- * This can be used to construct CSR/CSC matrix from un-ordered input
- * The major algorithm is a two pass linear scan algorithm that requires two pass scan over the data
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_UTILS_GROUP_DATA_H_
-#define XGBOOST_UTILS_GROUP_DATA_H_
-
-#include <vector>
-
-namespace xgboost {
-namespace utils {
-/*!
- * \brief multi-thread version of group builder
- * \tparam ValueType type of entries in the sparse matrix
- * \tparam SizeType type of the index range holder
- */
-template<typename ValueType, typename SizeType = size_t>
-struct ParallelGroupBuilder {
- public:
-  // parallel group builder of data
-  ParallelGroupBuilder(std::vector<SizeType> *p_rptr,
-                       std::vector<ValueType> *p_data)
-      : rptr(*p_rptr), data(*p_data), thread_rptr(tmp_thread_rptr) {
-  }
-  ParallelGroupBuilder(std::vector<SizeType> *p_rptr,
-                       std::vector<ValueType> *p_data,
-                       std::vector< std::vector<SizeType> > *p_thread_rptr)
-      : rptr(*p_rptr), data(*p_data), thread_rptr(*p_thread_rptr) {
-  }
-
- public:
-  /*!
-   * \brief step 1: initialize the helper, with hint of number keys
-   *                and thread used in the construction
-   * \param nkeys number of keys in the matrix, can be smaller than expected
-   * \param nthread number of thread that will be used in construction
-   */
-  inline void InitBudget(size_t nkeys, int nthread) {
-    thread_rptr.resize(nthread);
-    for (size_t i = 0;  i < thread_rptr.size(); ++i) {
-      thread_rptr[i].resize(nkeys);
-      std::fill(thread_rptr[i].begin(), thread_rptr[i].end(), 0);
-    }
-  }
-  /*!
-   * \brief step 2: add budget to each key
-   * \param key the key
-   * \param threadid the id of thread that calls this function
-   * \param nelem number of element budget add to this row
-   */
-  inline void AddBudget(size_t key, int threadid, SizeType nelem = 1) {
-    std::vector<SizeType> &trptr = thread_rptr[threadid];
-    if (trptr.size() < key + 1) {
-      trptr.resize(key + 1, 0);
-    }
-    trptr[key] += nelem;
-  }
-  /*! \brief step 3: initialize the necessary storage */
-  inline void InitStorage(void) {
-    // set rptr to correct size
-    for (size_t tid = 0; tid < thread_rptr.size(); ++tid) {
-      if (rptr.size() <= thread_rptr[tid].size()) {
-        rptr.resize(thread_rptr[tid].size() + 1);
-      }
-    }
-    // initialize rptr to be beginning of each segment
-    size_t start = 0;
-    for (size_t i = 0; i + 1 < rptr.size(); ++i) {
-      for (size_t tid = 0; tid < thread_rptr.size(); ++tid) {
-        std::vector<SizeType> &trptr = thread_rptr[tid];
-        if (i < trptr.size()) {
-          size_t ncnt = trptr[i];
-          trptr[i] = start;
-          start += ncnt;
-        }
-      }
-      rptr[i + 1] = start;
-    }
-    data.resize(start);
-  }
-  /*!
-   * \brief step 4: add data to the allocated space,
-   *   the calls to this function should be exactly match previous call to AddBudget
-   *
-   * \param key the key of
-   * \param threadid the id of thread that calls this function
-   */
-  inline void Push(size_t key, ValueType value, int threadid) {
-    SizeType &rp = thread_rptr[threadid][key];
-    data[rp++] = value;
-  }
-
- private:
-  /*! \brief pointer to the beginning and end of each continuous key */
-  std::vector<SizeType> &rptr;
-  /*! \brief index of nonzero entries in each row */
-  std::vector<ValueType> &data;
-  /*! \brief thread local data structure */
-  std::vector< std::vector<SizeType> > &thread_rptr;
-  /*! \brief local temp thread ptr, use this if not specified by the constructor */
-  std::vector< std::vector<SizeType> > tmp_thread_rptr;
-};
-}  // namespace utils
-}  // namespace xgboost
-#endif  // XGBOOST_UTILS_GROUP_DATA_H_
--- a/src/utils/io.h
+++ b/src/utils/io.h
@@ -1,59 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file io.h
- * \brief general stream interface for serialization, I/O
- * \author Tianqi Chen
- */
-
-#ifndef XGBOOST_UTILS_IO_H_
-#define XGBOOST_UTILS_IO_H_
-#include <cstdio>
-#include <vector>
-#include <string>
-#include <cstring>
-#include "./utils.h"
-#include "../sync/sync.h"
-
-namespace xgboost {
-namespace utils {
-// reuse the definitions of streams
-typedef rabit::Stream IStream;
-typedef rabit::utils::SeekStream ISeekStream;
-typedef rabit::utils::MemoryFixSizeBuffer MemoryFixSizeBuffer;
-typedef rabit::utils::MemoryBufferStream MemoryBufferStream;
-
-/*! \brief implementation of file i/o stream */
-class FileStream : public ISeekStream {
- public:
-  explicit FileStream(std::FILE *fp) : fp(fp) {}
-  FileStream(void) {
-    this->fp = NULL;
-  }
-  virtual size_t Read(void *ptr, size_t size) {
-    return std::fread(ptr, size, 1, fp);
-  }
-  virtual void Write(const void *ptr, size_t size) {
-    Check(std::fwrite(ptr, size, 1, fp) == 1, "FileStream::Write: fwrite error!");
-  }
-  virtual void Seek(size_t pos) {
-    std::fseek(fp, static_cast<long>(pos), SEEK_SET); // NOLINT(*)
-  }
-  virtual size_t Tell(void) {
-    return std::ftell(fp);
-  }
-  virtual bool AtEnd(void) const {
-    return std::feof(fp) != 0;
-  }
-  inline void Close(void) {
-    if (fp != NULL) {
-      std::fclose(fp); fp = NULL;
-    }
-  }
-
- private:
-  std::FILE *fp;
-};
-}  // namespace utils
-}  // namespace xgboost
-#include "./base64-inl.h"
-#endif  // XGBOOST_UTILS_IO_H_
--- a/src/utils/iterator.h
+++ b/src/utils/iterator.h
@@ -1,42 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file iterator.h
- * \brief itertator interface
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_UTILS_ITERATOR_H_
-#define XGBOOST_UTILS_ITERATOR_H_
-#include <cstdio>
-
-namespace xgboost {
-namespace utils {
-/*!
- * \brief iterator interface
- * \tparam DType data type
- */
-template<typename DType>
-class IIterator {
- public:
-  /*!
-   * \brief set the parameter
-   * \param name name of parameter
-   * \param val value of parameter
-   */
-  virtual void SetParam(const char *name, const char *val) {}
-  /*! \brief initialize the iterator so that we can use the iterator */
-  virtual void Init(void) {}
-  /*! \brief set before first of the item */
-  virtual void BeforeFirst(void) = 0;
-  /*! \brief move to next item */
-  virtual bool Next(void) = 0;
-  /*! \brief get current data */
-  virtual const DType &Value(void) const = 0;
- public:
-  /*! \brief constructor */
-  virtual ~IIterator(void) {}
-};
-
-}  // namespace utils
-}  // namespace xgboost
-#endif  // XGBOOST_UTILS_ITERATOR_H_
-
--- a/src/utils/math.h
+++ b/src/utils/math.h
@@ -1,45 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file math.h
- * \brief support additional math
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_UTILS_MATH_H_
-#define XGBOOST_UTILS_MATH_H_
-
-#include <cmath>
-
-namespace xgboost {
-namespace utils {
-#ifdef XGBOOST_STRICT_CXX98_
-// check nan
-bool CheckNAN(double v);
-double LogGamma(double v);
-#else
-template<typename T>
-inline bool CheckNAN(T v) {
-#ifdef _MSC_VER
-  return (_isnan(v) != 0);
-#else
-  return isnan(v);
-#endif
-}
-template<typename T>
-inline T LogGamma(T v) {
-#ifdef _MSC_VER
-#if _MSC_VER >= 1800
-  return lgamma(v);
-#else
-#pragma message("Warning: lgamma function was not available until VS2013"\
-                ", poisson regression will be disabled")
-  utils::Error("lgamma function was not available until VS2013");
-  return static_cast<T>(1.0);
-#endif
-#else
-  return lgamma(v);
-#endif
-}
-#endif
-}  // namespace utils
-}  // namespace xgboost
-#endif  // XGBOOST_UTILS_MATH_H_
--- a/src/utils/omp.h
+++ b/src/utils/omp.h
@@ -1,34 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file omp.h
- * \brief header to handle OpenMP compatibility issues
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_UTILS_OMP_H_
-#define XGBOOST_UTILS_OMP_H_
-
-#if defined(_OPENMP) && !defined(DISABLE_OPENMP)
-#include <omp.h>
-#else
-#if !defined(DISABLE_OPENMP) && !defined(_MSC_VER)
-// use pragma message instead of warning
-#pragma message("Warning: OpenMP is not available,"\
-                "xgboost will be compiled into single-thread code."\
-                "Use OpenMP-enabled compiler to get benefit of multi-threading")
-#endif
-inline int omp_get_thread_num() { return 0; }
-inline int omp_get_num_threads() { return 1; }
-inline void omp_set_num_threads(int nthread) {}
-inline int omp_get_num_procs() { return 1; }
-#endif
-
-// loop variable used in openmp
-namespace xgboost {
-#ifdef _MSC_VER
-typedef int bst_omp_uint;
-#else
-typedef unsigned bst_omp_uint;
-#endif
-}  // namespace xgboost
-
-#endif  // XGBOOST_UTILS_OMP_H_
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -1,820 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file quantile.h
- * \brief util to compute quantiles
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_UTILS_QUANTILE_H_
-#define XGBOOST_UTILS_QUANTILE_H_
-
-#include <cmath>
-#include <vector>
-#include <cstring>
-#include <algorithm>
-#include <iostream>
-#include "./io.h"
-#include "./utils.h"
-
-namespace xgboost {
-namespace utils {
-/*!
- * \brief experimental wsummary
- * \tparam DType type of data content
- * \tparam RType type of rank
- */
-template<typename DType, typename RType>
-struct WQSummary {
-  /*! \brief an entry in the sketch summary */
-  struct Entry {
-    /*! \brief minimum rank */
-    RType rmin;
-    /*! \brief maximum rank */
-    RType rmax;
-    /*! \brief maximum weight */
-    RType wmin;
-    /*! \brief the value of data */
-    DType value;
-    // constructor
-    Entry(void) {}
-    // constructor
-    Entry(RType rmin, RType rmax, RType wmin, DType value)
-        : rmin(rmin), rmax(rmax), wmin(wmin), value(value) {}
-    /*!
-     * \brief debug function,  check Valid
-     * \param eps the tolerate level for violating the relation
-     */
-    inline void CheckValid(RType eps = 0) const {
-      utils::Assert(rmin >= 0 && rmax >= 0 && wmin >= 0, "nonneg constraint");
-      utils::Assert(rmax- rmin - wmin > -eps, "relation constraint: min/max");
-    }
-    /*! \return rmin estimation for v strictly bigger than value */
-    inline RType rmin_next(void) const {
-      return rmin + wmin;
-    }
-    /*! \return rmax estimation for v strictly smaller than value */
-    inline RType rmax_prev(void) const {
-      return rmax - wmin;
-    }
-  };
-  /*! \brief input data queue before entering the summary */
-  struct Queue {
-    // entry in the queue
-    struct QEntry {
-      // value of the instance
-      DType value;
-      // weight of instance
-      RType weight;
-      // default constructor
-      QEntry(void) {}
-      // constructor
-      QEntry(DType value, RType weight)
-          : value(value), weight(weight) {}
-      // comparator on value
-      inline bool operator<(const QEntry &b) const {
-        return value < b.value;
-      }
-    };
-    // the input queue
-    std::vector<QEntry> queue;
-    // end of the queue
-    size_t qtail;
-    // push data to the queue
-    inline void Push(DType x, RType w) {
-      if (qtail == 0 || queue[qtail - 1].value != x) {
-        queue[qtail++] = QEntry(x, w);
-      } else {
-        queue[qtail - 1].weight += w;
-      }
-    }
-    inline void MakeSummary(WQSummary *out) {
-      std::sort(queue.begin(), queue.begin() + qtail);
-      out->size = 0;
-      // start update sketch
-      RType wsum = 0;
-      // construct data with unique weights
-      for (size_t i = 0; i < qtail;) {
-        size_t j = i + 1;
-        RType w = queue[i].weight;
-        while (j < qtail && queue[j].value == queue[i].value) {
-          w += queue[j].weight; ++j;
-        }
-        out->data[out->size++] = Entry(wsum, wsum + w, w, queue[i].value);
-        wsum += w; i = j;
-      }
-    }
-  };
-  /*! \brief data field */
-  Entry *data;
-  /*! \brief number of elements in the summary */
-  size_t size;
-  // constructor
-  WQSummary(Entry *data, size_t size)
-      : data(data), size(size) {}
-  /*!
-   * \return the maximum error of the Summary
-   */
-  inline RType MaxError(void) const {
-    RType res = data[0].rmax - data[0].rmin - data[0].wmin;
-    for (size_t i = 1; i < size; ++i) {
-      res = std::max(data[i].rmax_prev() - data[i - 1].rmin_next(), res);
-      res = std::max(data[i].rmax - data[i].rmin - data[i].wmin, res);
-    }
-    return res;
-  }
-  /*!
-   * \brief query qvalue, start from istart
-   * \param qvalue the value we query for
-   * \param istart starting position
-   */
-  inline Entry Query(DType qvalue, size_t &istart) const { // NOLINT(*)
-    while (istart < size && qvalue > data[istart].value) {
-      ++istart;
-    }
-    if (istart == size) {
-      RType rmax = data[size - 1].rmax;
-      return Entry(rmax, rmax, 0.0f, qvalue);
-    }
-    if (qvalue == data[istart].value) {
-      return data[istart];
-    } else {
-      if (istart == 0) {
-        return Entry(0.0f, 0.0f, 0.0f, qvalue);
-      } else {
-        return Entry(data[istart - 1].rmin_next(),
-                     data[istart].rmax_prev(),
-                     0.0f, qvalue);
-      }
-    }
-  }
-  /*! \return maximum rank in the summary */
-  inline RType MaxRank(void) const {
-    return data[size - 1].rmax;
-  }
-  /*!
-   * \brief copy content from src
-   * \param src source sketch
-   */
-  inline void CopyFrom(const WQSummary &src) {
-    size = src.size;
-    std::memcpy(data, src.data, sizeof(Entry) * size);
-  }
-  /*!
-   * \brief debug function, validate whether the summary
-   *  run consistency check to check if it is a valid summary
-   * \param eps the tolerate error level, used when RType is floating point and
-   *        some inconsistency could occur due to rounding error
-   */
-  inline void CheckValid(RType eps) const {
-    for (size_t i = 0; i < size; ++i) {
-      data[i].CheckValid(eps);
-      if (i != 0) {
-        utils::Assert(data[i].rmin >= data[i - 1].rmin + data[i - 1].wmin, "rmin range constraint");
-        utils::Assert(data[i].rmax >= data[i - 1].rmax + data[i].wmin, "rmax range constraint");
-      }
-    }
-  }
-  /*!
-   * \brief set current summary to be pruned summary of src
-   *        assume data field is already allocated to be at least maxsize
-   * \param src source summary
-   * \param maxsize size we can afford in the pruned sketch
-   */
-
-  inline void SetPrune(const WQSummary &src, size_t maxsize) {
-    if (src.size <= maxsize) {
-      this->CopyFrom(src); return;
-    }
-    const RType begin = src.data[0].rmax;
-    const RType range = src.data[src.size - 1].rmin - src.data[0].rmax;
-    const size_t n = maxsize - 1;
-    data[0] = src.data[0];
-    this->size = 1;
-    // lastidx is used to avoid duplicated records
-    size_t i = 1, lastidx = 0;
-    for (size_t k = 1; k < n; ++k) {
-      RType dx2 =  2 * ((k * range) / n + begin);
-      // find first i such that  d < (rmax[i+1] + rmin[i+1]) / 2
-      while (i < src.size - 1
-             && dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
-      utils::Assert(i != src.size - 1, "this cannot happen");
-      if (dx2 < src.data[i].rmin_next() + src.data[i + 1].rmax_prev()) {
-        if (i != lastidx) {
-          data[size++] = src.data[i]; lastidx = i;
-        }
-      } else {
-        if (i + 1 != lastidx) {
-          data[size++] = src.data[i + 1]; lastidx = i + 1;
-        }
-      }
-    }
-    if (lastidx != src.size - 1) {
-      data[size++] = src.data[src.size - 1];
-    }
-  }
-  /*!
-   * \brief set current summary to be merged summary of sa and sb
-   * \param sa first input summary to be merged
-   * \param sb second input summary to be merged
-   */
-  inline void SetCombine(const WQSummary &sa,
-                         const WQSummary &sb) {
-    if (sa.size == 0) {
-      this->CopyFrom(sb); return;
-    }
-    if (sb.size == 0) {
-      this->CopyFrom(sa); return;
-    }
-    utils::Assert(sa.size > 0 && sb.size > 0, "invalid input for merge");
-    const Entry *a = sa.data, *a_end = sa.data + sa.size;
-    const Entry *b = sb.data, *b_end = sb.data + sb.size;
-    // extended rmin value
-    RType aprev_rmin = 0, bprev_rmin = 0;
-    Entry *dst = this->data;
-    while (a != a_end && b != b_end) {
-      // duplicated value entry
-      if (a->value == b->value) {
-        *dst = Entry(a->rmin + b->rmin,
-                     a->rmax + b->rmax,
-                     a->wmin + b->wmin, a->value);
-        aprev_rmin = a->rmin_next();
-        bprev_rmin = b->rmin_next();
-        ++dst; ++a; ++b;
-      } else if (a->value < b->value) {
-        *dst = Entry(a->rmin + bprev_rmin,
-                     a->rmax + b->rmax_prev(),
-                     a->wmin, a->value);
-        aprev_rmin = a->rmin_next();
-        ++dst; ++a;
-      } else {
-        *dst = Entry(b->rmin + aprev_rmin,
-                     b->rmax + a->rmax_prev(),
-                     b->wmin, b->value);
-        bprev_rmin = b->rmin_next();
-        ++dst; ++b;
-      }
-    }
-    if (a != a_end) {
-      RType brmax = (b_end - 1)->rmax;
-      do {
-        *dst = Entry(a->rmin + bprev_rmin, a->rmax + brmax, a->wmin, a->value);
-        ++dst; ++a;
-      } while (a != a_end);
-    }
-    if (b != b_end) {
-      RType armax = (a_end - 1)->rmax;
-      do {
-        *dst = Entry(b->rmin + aprev_rmin, b->rmax + armax, b->wmin, b->value);
-        ++dst; ++b;
-      } while (b != b_end);
-    }
-    this->size = dst - data;
-    const RType tol = 10;
-    RType err_mingap, err_maxgap, err_wgap;
-    this->FixError(&err_mingap, &err_maxgap, &err_wgap);
-    if (err_mingap > tol || err_maxgap > tol || err_wgap > tol) {
-      utils::Printf("INFO: mingap=%g, maxgap=%g, wgap=%g\n",
-                    err_mingap, err_maxgap, err_wgap);
-    }
-
-    utils::Assert(size <= sa.size + sb.size, "bug in combine");
-  }
-  // helper function to print the current content of sketch
-  inline void Print() const {
-    for (size_t i = 0; i < this->size; ++i) {
-      utils::Printf("[%lu] rmin=%g, rmax=%g, wmin=%g, v=%g\n",
-                    i, data[i].rmin, data[i].rmax,
-                    data[i].wmin, data[i].value);
-    }
-  }
-  // try to fix rounding error
-  // and re-establish invariance
-  inline void FixError(RType *err_mingap,
-                       RType *err_maxgap,
-                       RType *err_wgap) const {
-    *err_mingap = 0;
-    *err_maxgap = 0;
-    *err_wgap = 0;
-    RType prev_rmin = 0, prev_rmax = 0;
-    for (size_t i = 0; i < this->size; ++i) {
-      if (data[i].rmin < prev_rmin) {
-        data[i].rmin = prev_rmin;
-        *err_mingap = std::max(*err_mingap, prev_rmin - data[i].rmin);
-      } else {
-        prev_rmin = data[i].rmin;
-      }
-      if (data[i].rmax < prev_rmax) {
-        data[i].rmax = prev_rmax;
-        *err_maxgap = std::max(*err_maxgap, prev_rmax - data[i].rmax);
-      }
-      RType rmin_next = data[i].rmin_next();
-      if (data[i].rmax < rmin_next) {
-        data[i].rmax = rmin_next;
-        *err_wgap = std::max(*err_wgap, data[i].rmax - rmin_next);
-      }
-      prev_rmax = data[i].rmax;
-    }
-  }
-  // check consistency of the summary
-  inline bool Check(const char *msg) const {
-    const float tol = 10.0f;
-    for (size_t i = 0; i < this->size; ++i) {
-      if (data[i].rmin + data[i].wmin > data[i].rmax + tol ||
-          data[i].rmin < -1e-6f || data[i].rmax < -1e-6f) {
-        utils::Printf("----%s: Check not Pass------\n", msg);
-        this->Print();
-        return false;
-      }
-    }
-    return true;
-  }
-};
-
-/*! \brief try to do efficient pruning */
-template<typename DType, typename RType>
-struct WXQSummary : public WQSummary<DType, RType> {
-  // redefine entry type
-  typedef typename WQSummary<DType, RType>::Entry Entry;
-  // constructor
-  WXQSummary(Entry *data, size_t size)
-      : WQSummary<DType, RType>(data, size) {}
-  // check if the block is large chunk
-  inline static bool CheckLarge(const Entry &e, RType chunk) {
-    return  e.rmin_next() > e.rmax_prev() + chunk;
-  }
-  // set prune
-  inline void SetPrune(const WQSummary<DType, RType> &src, size_t maxsize) {
-    if (src.size <= maxsize) {
-      this->CopyFrom(src); return;
-    }
-    RType begin = src.data[0].rmax;
-    size_t n = maxsize - 1, nbig = 0;
-    RType range = src.data[src.size - 1].rmin - begin;
-    // prune off zero weights
-    if (range == 0.0f) {
-      // special case, contain only two effective data pts
-      this->data[0] = src.data[0];
-      this->data[1] = src.data[src.size - 1];
-      this->size = 2;
-      return;
-    } else {
-      range = std::max(range, static_cast<RType>(1e-3f));
-    }
-    const RType chunk = 2 * range / n;
-    // minimized range
-    RType mrange = 0;
-    {
-      // first scan, grab all the big chunk
-      // moving block index
-      size_t bid = 0;
-      for (size_t i = 1; i < src.size; ++i) {
-        if (CheckLarge(src.data[i], chunk)) {
-          if (bid != i - 1) {
-            mrange += src.data[i].rmax_prev() - src.data[bid].rmin_next();
-          }
-          bid = i; ++nbig;
-        }
-      }
-      if (bid != src.size - 2) {
-        mrange += src.data[src.size-1].rmax_prev() - src.data[bid].rmin_next();
-      }
-    }
-    if (nbig >= n - 1) {
-      // see what was the case
-      utils::Printf("LOG: check quantile stats, nbig=%lu, n=%lu\n", nbig, n);
-      utils::Printf("LOG: srcsize=%lu, maxsize=%lu, range=%g, chunk=%g\n",
-                    src.size, maxsize, static_cast<double>(range),
-                    static_cast<double>(chunk));
-      src.Print();
-      utils::Assert(nbig < n - 1, "quantile: too many large chunk");
-    }
-    this->data[0] = src.data[0];
-    this->size = 1;
-    // use smaller size
-    n = n - nbig;
-    // find the rest of point
-    size_t bid = 0, k = 1, lastidx = 0;
-    for (size_t end = 1; end < src.size; ++end) {
-      if (end == src.size - 1 || CheckLarge(src.data[end], chunk)) {
-        if (bid != end - 1) {
-          size_t i = bid;
-          RType maxdx2 = src.data[end].rmax_prev() * 2;
-          for (; k < n; ++k) {
-            RType dx2 =  2 * ((k * mrange) / n + begin);
-            if (dx2 >= maxdx2) break;
-            while (i < end &&
-                   dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
-            if (i == end) break;
-            if (dx2 < src.data[i].rmin_next() + src.data[i + 1].rmax_prev()) {
-              if (i != lastidx) {
-                this->data[this->size++] = src.data[i]; lastidx = i;
-              }
-            } else {
-              if (i + 1 != lastidx) {
-                this->data[this->size++] = src.data[i + 1]; lastidx = i + 1;
-              }
-            }
-          }
-        }
-        if (lastidx != end) {
-          this->data[this->size++] = src.data[end];
-          lastidx = end;
-        }
-        bid = end;
-        // shift base by the gap
-        begin += src.data[bid].rmin_next() - src.data[bid].rmax_prev();
-      }
-    }
-  }
-};
-/*!
- * \brief traditional GK summary
- */
-template<typename DType, typename RType>
-struct GKSummary {
-  /*! \brief an entry in the sketch summary */
-  struct Entry {
-    /*! \brief minimum rank */
-    RType rmin;
-    /*! \brief maximum rank */
-    RType rmax;
-    /*! \brief the value of data */
-    DType value;
-    // constructor
-    Entry(void) {}
-    // constructor
-    Entry(RType rmin, RType rmax, DType value)
-        : rmin(rmin), rmax(rmax), value(value) {}
-  };
-  /*! \brief input data queue before entering the summary */
-  struct Queue {
-    // the input queue
-    std::vector<DType> queue;
-    // end of the queue
-    size_t qtail;
-    // push data to the queue
-    inline void Push(DType x, RType w) {
-      queue[qtail++] = x;
-    }
-    inline void MakeSummary(GKSummary *out) {
-      std::sort(queue.begin(), queue.begin() + qtail);
-      out->size = qtail;
-      for (size_t i = 0; i < qtail; ++i) {
-        out->data[i] = Entry(i + 1, i + 1, queue[i]);
-      }
-    }
-  };
-  /*! \brief data field */
-  Entry *data;
-  /*! \brief number of elements in the summary */
-  size_t size;
-  GKSummary(Entry *data, size_t size)
-      : data(data), size(size) {}
-  /*! \brief the maximum error of the summary */
-  inline RType MaxError(void) const {
-    RType res = 0;
-    for (size_t i = 1; i < size; ++i) {
-      res = std::max(data[i].rmax - data[i-1].rmin, res);
-    }
-    return res;
-  }
-  /*! \return maximum rank in the summary */
-  inline RType MaxRank(void) const {
-    return data[size - 1].rmax;
-  }
-  /*!
-   * \brief copy content from src
-   * \param src source sketch
-   */
-  inline void CopyFrom(const GKSummary &src) {
-    size = src.size;
-    std::memcpy(data, src.data, sizeof(Entry) * size);
-  }
-  inline void CheckValid(RType eps) const {
-    // assume always valid
-  }
-  /*! \brief used for debug purpose, print the summary */
-  inline void Print(void) const {
-    for (size_t i = 0; i < size; ++i) {
-      std::cout << "x=" << data[i].value << "\t"
-                << "[" << data[i].rmin << "," << data[i].rmax << "]"
-                << std::endl;
-    }
-  }
-  /*!
-   * \brief set current summary to be pruned summary of src
-   *        assume data field is already allocated to be at least maxsize
-   * \param src source summary
-   * \param maxsize size we can afford in the pruned sketch
-   */
-  inline void SetPrune(const GKSummary &src, size_t maxsize) {
-    if (src.size <= maxsize) {
-      this->CopyFrom(src); return;
-    }
-    const RType max_rank = src.MaxRank();
-    this->size = maxsize;
-    data[0] = src.data[0];
-    size_t n = maxsize - 1;
-    RType top = 1;
-    for (size_t i = 1; i < n; ++i) {
-      RType k = (i * max_rank) / n;
-      while (k > src.data[top + 1].rmax) ++top;
-      // assert src.data[top].rmin <= k
-      // because k > src.data[top].rmax >= src.data[top].rmin
-      if ((k - src.data[top].rmin) < (src.data[top+1].rmax - k)) {
-        data[i] = src.data[top];
-      } else {
-        data[i] = src.data[top + 1];
-      }
-    }
-    data[n] = src.data[src.size - 1];
-  }
-  inline void SetCombine(const GKSummary &sa,
-                         const GKSummary &sb) {
-    if (sa.size == 0) {
-      this->CopyFrom(sb); return;
-    }
-    if (sb.size == 0) {
-      this->CopyFrom(sa); return;
-    }
-    utils::Assert(sa.size > 0 && sb.size > 0, "invalid input for merge");
-    const Entry *a = sa.data, *a_end = sa.data + sa.size;
-    const Entry *b = sb.data, *b_end = sb.data + sb.size;
-    this->size = sa.size + sb.size;
-    RType aprev_rmin = 0, bprev_rmin = 0;
-    Entry *dst = this->data;
-    while (a != a_end && b != b_end) {
-      if (a->value < b->value) {
-        *dst = Entry(bprev_rmin + a->rmin,
-                     a->rmax + b->rmax - 1, a->value);
-        aprev_rmin = a->rmin;
-        ++dst; ++a;
-      } else {
-        *dst = Entry(aprev_rmin + b->rmin,
-                     b->rmax + a->rmax - 1, b->value);
-        bprev_rmin = b->rmin;
-        ++dst; ++b;
-      }
-    }
-    if (a != a_end) {
-      RType bprev_rmax = (b_end - 1)->rmax;
-      do {
-        *dst = Entry(bprev_rmin + a->rmin, bprev_rmax + a->rmax, a->value);
-        ++dst; ++a;
-      } while (a != a_end);
-    }
-    if (b != b_end) {
-      RType aprev_rmax = (a_end - 1)->rmax;
-      do {
-        *dst = Entry(aprev_rmin + b->rmin, aprev_rmax + b->rmax, b->value);
-        ++dst; ++b;
-      } while (b != b_end);
-    }
-    utils::Assert(dst == data + size, "bug in combine");
-  }
-};
-
-/*!
- * \brief template for all quantile sketch algorithm
- *        that uses merge/prune scheme
- * \tparam DType type of data content
- * \tparam RType type of rank
- * \tparam TSummary actual summary data structure it uses
- */
-template<typename DType, typename RType, class TSummary>
-class QuantileSketchTemplate {
- public:
-  /*! \brief type of summary type */
-  typedef TSummary Summary;
-  /*! \brief the entry type */
-  typedef typename Summary::Entry Entry;
-  /*! \brief same as summary, but use STL to backup the space */
-  struct SummaryContainer : public Summary {
-    std::vector<Entry> space;
-    SummaryContainer(const SummaryContainer &src) : Summary(NULL, src.size) {
-      this->space = src.space;
-      this->data = BeginPtr(this->space);
-    }
-    SummaryContainer(void) : Summary(NULL, 0) {
-    }
-    /*! \brief reserve space for summary */
-    inline void Reserve(size_t size) {
-      if (size > space.size()) {
-        space.resize(size);
-        this->data = BeginPtr(space);
-      }
-    }
-    /*!
-     * \brief set the space to be merge of all Summary arrays
-     * \param begin beginning position in the summary array
-     * \param end ending position in the Summary array
-     */
-    inline void SetMerge(const Summary *begin,
-                         const Summary *end) {
-      utils::Assert(begin < end, "can not set combine to empty instance");
-      size_t len = end - begin;
-      if (len == 1) {
-        this->Reserve(begin[0].size);
-        this->CopyFrom(begin[0]);
-      } else if (len == 2) {
-        this->Reserve(begin[0].size + begin[1].size);
-        this->SetMerge(begin[0], begin[1]);
-      } else {
-        // recursive merge
-        SummaryContainer lhs, rhs;
-        lhs.SetCombine(begin, begin + len / 2);
-        rhs.SetCombine(begin + len / 2, end);
-        this->Reserve(lhs.size + rhs.size);
-        this->SetCombine(lhs, rhs);
-      }
-    }
-    /*!
-     * \brief do elementwise combination of summary array
-     *        this[i] = combine(this[i], src[i]) for each i
-     * \param src the source summary
-     * \param max_nbyte, maximum number of byte allowed in here
-     */
-    inline void Reduce(const Summary &src, size_t max_nbyte) {
-      this->Reserve((max_nbyte - sizeof(this->size)) / sizeof(Entry));
-      SummaryContainer temp;
-      temp.Reserve(this->size + src.size);
-      temp.SetCombine(*this, src);
-      this->SetPrune(temp, space.size());
-    }
-    /*! \brief return the number of bytes this data structure cost in serialization */
-    inline static size_t CalcMemCost(size_t nentry) {
-      return sizeof(size_t) + sizeof(Entry) * nentry;
-    }
-    /*! \brief save the data structure into stream */
-    template<typename TStream>
-    inline void Save(TStream &fo) const {  // NOLINT(*)
-      fo.Write(&(this->size), sizeof(this->size));
-      if (this->size != 0) {
-        fo.Write(this->data, this->size * sizeof(Entry));
-      }
-    }
-    /*! \brief load data structure from input stream */
-    template<typename TStream>
-    inline void Load(TStream &fi) {  // NOLINT(*)
-      utils::Check(fi.Read(&this->size, sizeof(this->size)) != 0, "invalid SummaryArray 1");
-      this->Reserve(this->size);
-      if (this->size != 0) {
-        utils::Check(fi.Read(this->data, this->size * sizeof(Entry)) != 0,
-                     "invalid SummaryArray 2");
-      }
-    }
-  };
-  /*!
-   * \brief initialize the quantile sketch, given the performance specification
-   * \param maxn maximum number of data points can be feed into sketch
-   * \param eps accuracy level of summary
-   */
-  inline void Init(size_t maxn, double eps) {
-    nlevel = 1;
-    while (true) {
-      limit_size = static_cast<size_t>(ceil(nlevel / eps)) + 1;
-      size_t n = (1UL << nlevel);
-      if (n * limit_size >= maxn) break;
-      ++nlevel;
-    }
-    // check invariant
-    size_t n = (1UL << nlevel);
-    utils::Assert(n * limit_size >= maxn, "invalid init parameter");
-    utils::Assert(nlevel <= limit_size * eps, "invalid init parameter");
-    // lazy reserve the space, if there is only one value, no need to allocate space
-    inqueue.queue.resize(1);
-    inqueue.qtail = 0;
-    data.clear();
-    level.clear();
-  }
-  /*!
-   * \brief add an element to a sketch
-   * \param x the element added to the sketch
-   */
-  inline void Push(DType x, RType w = 1) {
-    if (w == static_cast<RType>(0)) return;
-    if (inqueue.qtail == inqueue.queue.size()) {
-      // jump from lazy one value to limit_size * 2
-      if (inqueue.queue.size() == 1) {
-        inqueue.queue.resize(limit_size * 2);
-      } else {
-        temp.Reserve(limit_size * 2);
-        inqueue.MakeSummary(&temp);
-        // cleanup queue
-        inqueue.qtail = 0;
-        this->PushTemp();
-      }
-    }
-    inqueue.Push(x, w);
-  }
-  /*! \brief push up temp */
-  inline void PushTemp(void) {
-    temp.Reserve(limit_size * 2);
-    for (size_t l = 1; true; ++l) {
-      this->InitLevel(l + 1);
-      // check if level l is empty
-      if (level[l].size == 0) {
-        level[l].SetPrune(temp, limit_size);
-        break;
-      } else {
-        // level 0 is actually temp space
-        level[0].SetPrune(temp, limit_size);
-        temp.SetCombine(level[0], level[l]);
-        if (temp.size > limit_size) {
-          // try next level
-          level[l].size = 0;
-        } else {
-          // if merged record is still smaller, no need to send to next level
-          level[l].CopyFrom(temp); break;
-        }
-      }
-    }
-  }
-  /*! \brief get the summary after finalize */
-  inline void GetSummary(SummaryContainer *out) {
-    if (level.size() != 0) {
-      out->Reserve(limit_size * 2);
-    } else {
-      out->Reserve(inqueue.queue.size());
-    }
-    inqueue.MakeSummary(out);
-    if (level.size() != 0) {
-      level[0].SetPrune(*out, limit_size);
-      for (size_t l = 1; l < level.size(); ++l) {
-        if (level[l].size == 0) continue;
-        if (level[0].size == 0) {
-          level[0].CopyFrom(level[l]);
-        } else {
-          out->SetCombine(level[0], level[l]);
-          level[0].SetPrune(*out, limit_size);
-        }
-      }
-      out->CopyFrom(level[0]);
-    } else {
-      if (out->size > limit_size) {
-        temp.Reserve(limit_size);
-        temp.SetPrune(*out, limit_size);
-        out->CopyFrom(temp);
-      }
-    }
-  }
-  // used for debug, check if the sketch is valid
-  inline void CheckValid(RType eps) const {
-    for (size_t l = 1; l < level.size(); ++l) {
-      level[l].CheckValid(eps);
-    }
-  }
-  // initialize level space to at least nlevel
-  inline void InitLevel(size_t nlevel) {
-    if (level.size() >= nlevel) return;
-    data.resize(limit_size * nlevel);
-    level.resize(nlevel, Summary(NULL, 0));
-    for (size_t l = 0; l < level.size(); ++l) {
-      level[l].data = BeginPtr(data) + l * limit_size;
-    }
-  }
-  // input data queue
-  typename Summary::Queue inqueue;
-  // number of levels
-  size_t nlevel;
-  // size of summary in each level
-  size_t limit_size;
-  // the level of each summaries
-  std::vector<Summary> level;
-  // content of the summary
-  std::vector<Entry> data;
-  // temporal summary, used for temp-merge
-  SummaryContainer temp;
-};
-
-/*!
- * \brief Quantile sketch use WQSummary
- * \tparam DType type of data content
- * \tparam RType type of rank
- */
-template<typename DType, typename RType = unsigned>
-class WQuantileSketch :
-      public QuantileSketchTemplate<DType, RType, WQSummary<DType, RType> >{
-};
-
-/*!
- * \brief Quantile sketch use WXQSummary
- * \tparam DType type of data content
- * \tparam RType type of rank
- */
-template<typename DType, typename RType = unsigned>
-class WXQuantileSketch :
-      public QuantileSketchTemplate<DType, RType, WXQSummary<DType, RType> >{
-};
-/*!
- * \brief Quantile sketch use WQSummary
- * \tparam DType type of data content
- * \tparam RType type of rank
- */
-template<typename DType, typename RType = unsigned>
-class GKQuantileSketch :
-      public QuantileSketchTemplate<DType, RType, GKSummary<DType, RType> >{
-};
-
-}  // namespace utils
-}  // namespace xgboost
-#endif  // XGBOOST_UTILS_QUANTILE_H_
--- a/src/utils/random.h
+++ b/src/utils/random.h
@@ -1,108 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file xgboost_random.h
- * \brief PRNG to support random number generation
- * \author Tianqi Chen: tianqi.tchen@gmail.com
- *
- * Use standard PRNG from stdlib
- */
-#ifndef XGBOOST_UTILS_RANDOM_H_
-#define XGBOOST_UTILS_RANDOM_H_
-
-#include <cmath>
-#include <cstdlib>
-#include <vector>
-#include <algorithm>
-#include "./utils.h"
-
-/*! namespace of PRNG */
-namespace xgboost {
-namespace random {
-#ifndef XGBOOST_CUSTOMIZE_PRNG_
-/*! \brief seed the PRNG */
-inline void Seed(unsigned seed) {
-  srand(seed);
-}
-/*! \brief basic function, uniform */
-inline double Uniform(void) {
-  return static_cast<double>(rand()) / (static_cast<double>(RAND_MAX)+1.0); // NOLINT(*)
-}
-/*! \brief return a real number uniform in (0,1) */
-inline double NextDouble2(void) {
-  return (static_cast<double>(rand()) + 1.0) / (static_cast<double>(RAND_MAX)+2.0); // NOLINT(*)
-}
-/*! \brief return  x~N(0,1) */
-inline double Normal(void) {
-  double x, y, s;
-  do {
-    x = 2 * NextDouble2() - 1.0;
-    y = 2 * NextDouble2() - 1.0;
-    s = x*x + y*y;
-  } while (s >= 1.0 || s == 0.0);
-
-  return x * sqrt(-2.0 * log(s) / s);
-}
-#else
-// include declarations, to be implemented
-void Seed(unsigned seed);
-double Uniform(void);
-double Normal(void);
-#endif
-
-/*! \brief return a real number uniform in [0,1) */
-inline double NextDouble(void) {
-  return Uniform();
-}
-/*! \brief return a random number in n */
-inline uint32_t NextUInt32(uint32_t n) {
-  return (uint32_t)std::floor(NextDouble() * n);
-}
-/*! \brief return  x~N(mu,sigma^2) */
-inline double SampleNormal(double mu, double sigma) {
-  return Normal() * sigma + mu;
-}
-/*! \brief  return 1 with probability p, coin flip */
-inline int SampleBinary(double p) {
-  return NextDouble() < p;
-}
-
-template<typename T>
-inline void Shuffle(T *data, size_t sz) {
-  if (sz == 0) return;
-  for (uint32_t i = (uint32_t)sz - 1; i > 0; i--) {
-    std::swap(data[i], data[NextUInt32(i + 1)]);
-  }
-}
-// random shuffle the data inside, require PRNG
-template<typename T>
-inline void Shuffle(std::vector<T> &data) { // NOLINT(*)
-  Shuffle(&data[0], data.size());
-}
-
-/*! \brief random number generator with independent random number seed*/
-struct Random{
-  /*! \brief set random number seed */
-  inline void Seed(unsigned sd) {
-    this->rseed = sd;
-#if defined(_MSC_VER) || defined(_WIN32)
-    ::xgboost::random::Seed(sd);
-#endif
-  }
-  /*! \brief return a real number uniform in [0,1) */
-  inline double RandDouble(void) {
-    // use rand instead of rand_r in windows, for MSVC it is fine since rand is threadsafe
-    // For cygwin and mingw, this can slows down parallelism,
-    // but rand_r is only used in objective-inl.hpp, won't affect speed in general
-    // todo, replace with another PRNG
-#if defined(_MSC_VER) || defined(_WIN32) || defined(XGBOOST_STRICT_CXX98_)
-    return Uniform();
-#else
-    return static_cast<double>(rand_r(&rseed)) / (static_cast<double>(RAND_MAX) + 1.0);
-#endif
-  }
-  // random number seed
-  unsigned rseed;
-};
-}  // namespace random
-}  // namespace xgboost
-#endif  // XGBOOST_UTILS_RANDOM_H_
--- a/src/utils/thread.h
+++ b/src/utils/thread.h
@@ -1,260 +0,0 @@
-/*!
- * Copyright by Contributors
- * \file thread.h
- * \brief this header include the minimum necessary resource
- * for multi-threading that can be compiled in windows, linux, mac
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_UTILS_THREAD_H_ // NOLINT(*)
-#define XGBOOST_UTILS_THREAD_H_ // NOLINT(*)
-
-#ifdef _MSC_VER
-#include <windows.h>
-#include <process.h>
-#include "./utils.h"
-namespace xgboost {
-namespace utils {
-/*! \brief simple semaphore used for synchronization */
-class Semaphore {
- public :
-  inline void Init(int init_val) {
-    sem = CreateSemaphore(NULL, init_val, 10, NULL);
-    utils::Check(sem != NULL, "create Semaphore error");
-  }
-  inline void Destroy(void) {
-    CloseHandle(sem);
-  }
-  inline void Wait(void) {
-    utils::Check(WaitForSingleObject(sem, INFINITE) == WAIT_OBJECT_0, "WaitForSingleObject error");
-  }
-  inline void Post(void) {
-    utils::Check(ReleaseSemaphore(sem, 1, NULL) != 0, "ReleaseSemaphore error");
-  }
-
- private:
-  HANDLE sem;
-};
-
-/*! \brief mutex under windows */
-class Mutex {
- public:
-  inline void Init(void) {
-    utils::Check(InitializeCriticalSectionAndSpinCount(&mutex, 0x00000400) != 0,
-                   "Mutex::Init fail");
-  }
-  inline void Lock(void) {
-    EnterCriticalSection(&mutex);
-  }
-  inline void Unlock(void) {
-    LeaveCriticalSection(&mutex);
-  }
-  inline void Destroy(void) {
-    DeleteCriticalSection(&mutex);
-  }
-
- private:
-  friend class ConditionVariable;
-  CRITICAL_SECTION mutex;
-};
-
-// conditional variable that uses pthread
-class ConditionVariable {
- public:
-  // initialize conditional variable
-  inline void Init(void) {
-    InitializeConditionVariable(&cond);
-  }
-  // destroy the thread
-  inline void Destroy(void) {
-    // DeleteConditionVariable(&cond);
-  }
-  // wait on the conditional variable
-  inline void Wait(Mutex *mutex) {
-    utils::Check(SleepConditionVariableCS(&cond, &(mutex->mutex), INFINITE) != 0,
-                 "ConditionVariable:Wait fail");
-  }
-  inline void Broadcast(void) {
-    WakeAllConditionVariable(&cond);
-  }
-  inline void Signal(void) {
-    WakeConditionVariable(&cond);
-  }
-
- private:
-  CONDITION_VARIABLE cond;
-};
-
-/*! \brief simple thread that wraps windows thread */
-class Thread {
- private:
-  HANDLE    thread_handle;
-  unsigned  thread_id;
- public:
-  inline void Start(unsigned int __stdcall entry(void*p), void *param) {
-    thread_handle = (HANDLE)_beginthreadex(NULL, 0, entry, param, 0, &thread_id);
-  }
-  inline int Join(void) {
-    WaitForSingleObject(thread_handle, INFINITE);
-    return 0;
-  }
-};
-/*! \brief exit function called from thread */
-inline void ThreadExit(void *status) {
-  _endthreadex(0);
-}
-#define XGBOOST_THREAD_PREFIX unsigned int __stdcall
-}  // namespace utils
-}  // namespace xgboost
-#else
-// thread interface using g++
-#include <semaphore.h>
-#include <pthread.h>
-#include <errno.h>
-namespace xgboost {
-namespace utils {
-/*!\brief semaphore class */
-class Semaphore {
-  #ifdef __APPLE__
-
- private:
-  sem_t* semPtr;
-  char sema_name[20];
-
- private:
-  inline void GenRandomString(char *s, const int len) {
-    static const char alphanum[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
-    for (int i = 0; i < len; ++i) {
-      s[i] = alphanum[rand() % (sizeof(alphanum) - 1)];
-    }
-    s[len] = 0;
-  }
-
- public:
-  inline void Init(int init_val) {
-    sema_name[0] = '/';
-    sema_name[1] = 's';
-    sema_name[2] = 'e';
-    sema_name[3] = '/';
-    GenRandomString(&sema_name[4], 16);
-    if ((semPtr = sem_open(sema_name, O_CREAT, 0644, init_val)) == SEM_FAILED) {
-      perror("sem_open");
-      exit(1);
-    }
-    utils::Check(semPtr != NULL, "create Semaphore error");
-  }
-  inline void Destroy(void) {
-    if (sem_close(semPtr) == -1) {
-      perror("sem_close");
-      exit(EXIT_FAILURE);
-    }
-    if (sem_unlink(sema_name) == -1) {
-      perror("sem_unlink");
-      exit(EXIT_FAILURE);
-    }
-  }
-  inline void Wait(void) {
-    sem_wait(semPtr);
-  }
-  inline void Post(void) {
-    sem_post(semPtr);
-  }
-  #else
-
- private:
-  sem_t sem;
-
- public:
-  inline void Init(int init_val) {
-    if (sem_init(&sem, 0, init_val) != 0) {
-      utils::Error("Semaphore.Init:%s", strerror(errno));
-    }
-  }
-  inline void Destroy(void) {
-    if (sem_destroy(&sem) != 0) {
-      utils::Error("Semaphore.Destroy:%s", strerror(errno));
-    }
-  }
-  inline void Wait(void) {
-    if (sem_wait(&sem) != 0) {
-      utils::Error("Semaphore.Wait:%s", strerror(errno));
-    }
-  }
-  inline void Post(void) {
-    if (sem_post(&sem) != 0) {
-      utils::Error("Semaphore.Post:%s", strerror(errno));
-    }
-  }
-  #endif
-};
-
-// mutex that works with pthread
-class Mutex {
- public:
-  inline void Init(void) {
-    pthread_mutex_init(&mutex, NULL);
-  }
-  inline void Lock(void) {
-    pthread_mutex_lock(&mutex);
-  }
-  inline void Unlock(void) {
-    pthread_mutex_unlock(&mutex);
-  }
-  inline void Destroy(void) {
-    pthread_mutex_destroy(&mutex);
-  }
-
- private:
-  friend class ConditionVariable;
-  pthread_mutex_t mutex;
-};
-
-// conditional variable that uses pthread
-class ConditionVariable {
- public:
-  // initialize conditional variable
-  inline void Init(void) {
-    pthread_cond_init(&cond, NULL);
-  }
-  // destroy the thread
-  inline void Destroy(void) {
-    pthread_cond_destroy(&cond);
-  }
-  // wait on the conditional variable
-  inline void Wait(Mutex *mutex) {
-    pthread_cond_wait(&cond, &(mutex->mutex));
-  }
-  inline void Broadcast(void) {
-    pthread_cond_broadcast(&cond);
-  }
-  inline void Signal(void) {
-    pthread_cond_signal(&cond);
-  }
-
- private:
-  pthread_cond_t cond;
-};
-
-/*!\brief simple thread class */
-class Thread {
- private:
-  pthread_t thread;
- public :
-  inline void Start(void * entry(void*), void *param) { // NOLINT(*)
-    pthread_attr_t attr;
-    pthread_attr_init(&attr);
-    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-    pthread_create(&thread, &attr, entry, param);
-  }
-  inline int Join(void) {
-    void *status;
-    return pthread_join(thread, &status);
-  }
-};
-inline void ThreadExit(void *status) {
-  pthread_exit(status);
-}
-}  // namespace utils
-}  // namespace xgboost
-#define XGBOOST_THREAD_PREFIX void *
-#endif  // Linux
-#endif  // XGBOOST_UTILS_THREAD_H_  NOLINT(*)
--- a/src/utils/thread_buffer.h
+++ b/src/utils/thread_buffer.h
@@ -1,257 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file thread_buffer.h
- * \brief  multi-thread buffer, iterator, can be used to create parallel pipeline
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_UTILS_THREAD_BUFFER_H_
-#define XGBOOST_UTILS_THREAD_BUFFER_H_
-
-#include <vector>
-#include <cstring>
-#include <cstdlib>
-#include "./utils.h"
-// threading util could not run on solaris
-#ifndef XGBOOST_STRICT_CXX98_
-#include "./thread.h"
-#endif
-
-namespace xgboost {
-namespace utils {
-#if !defined(XGBOOST_STRICT_CXX98_)
-/*!
- * \brief buffered loading iterator that uses multithread
- * this template method will assume the following parameters
- * \tparam Elem element type to be buffered
- * \tparam ElemFactory factory type to implement in order to use thread buffer
- */
-template<typename Elem, typename ElemFactory>
-class ThreadBuffer {
- public:
-  /*!\brief constructor */
-  ThreadBuffer(void) {
-    this->init_end = false;
-    this->buf_size = 30;
-  }
-  ~ThreadBuffer(void) {
-    if (init_end) this->Destroy();
-  }
-  /*!\brief set parameter, will also pass the parameter to factory */
-  inline void SetParam(const char *name, const char *val) {
-    using namespace std;
-    if (!strcmp( name, "buffer_size")) buf_size = atoi(val);
-    factory.SetParam(name, val);
-  }
-  /*!
-   * \brief initalize the buffered iterator
-   * \param param a initialize parameter that will pass to factory, ignore it if not necessary
-   * \return false if the initialization can't be done, e.g. buffer file hasn't been created
-   */
-  inline bool Init(void) {
-    if (!factory.Init()) return false;
-    for (int i = 0; i < buf_size; ++i) {
-      bufA.push_back(factory.Create());
-      bufB.push_back(factory.Create());
-    }
-    this->init_end = true;
-    this->StartLoader();
-    return true;
-  }
-  /*!\brief place the iterator before first value */
-  inline void BeforeFirst(void) {
-    // wait till last loader end
-    loading_end.Wait();
-    // critical zone
-    current_buf = 1;
-    factory.BeforeFirst();
-    // reset terminate limit
-    endA = endB = buf_size;
-    // wake up loader for first part
-    loading_need.Post();
-    // wait til first part is loaded
-    loading_end.Wait();
-    // set current buf to right value
-    current_buf = 0;
-    // wake loader for next part
-    data_loaded = false;
-    loading_need.Post();
-    // set buffer value
-    buf_index = 0;
-  }
-  /*! \brief destroy the buffer iterator, will deallocate the buffer */
-  inline void Destroy(void) {
-    // wait until the signal is consumed
-    this->destroy_signal = true;
-    loading_need.Post();
-    loader_thread.Join();
-    loading_need.Destroy();
-    loading_end.Destroy();
-    for (size_t i = 0; i < bufA.size(); ++i) {
-      factory.FreeSpace(bufA[i]);
-    }
-    for (size_t i = 0; i < bufB.size(); ++i) {
-      factory.FreeSpace(bufB[i]);
-    }
-    bufA.clear(); bufB.clear();
-    factory.Destroy();
-    this->init_end = false;
-  }
-  /*!
-   * \brief get the next element needed in buffer
-   * \param elem element to store into
-   * \return whether reaches end of data
-   */
-  inline bool Next(Elem &elem) { // NOLINT(*)
-    // end of buffer try to switch
-    if (buf_index == buf_size) {
-      this->SwitchBuffer();
-      buf_index = 0;
-    }
-    if (buf_index >= (current_buf ? endA : endB)) {
-      return false;
-    }
-    std::vector<Elem> &buf = current_buf ? bufA : bufB;
-    elem = buf[buf_index];
-    ++buf_index;
-    return true;
-  }
-  /*!
-   * \brief get the factory object
-   */
-  inline ElemFactory &get_factory(void) {
-    return factory;
-  }
-  inline const ElemFactory &get_factory(void) const {
-    return factory;
-  }
-  // size of buffer
-  int  buf_size;
-
- private:
-  // factory object used to load configures
-  ElemFactory factory;
-  // index in current buffer
-  int buf_index;
-  // indicate which one is current buffer
-  int current_buf;
-  // max limit of visit, also marks termination
-  int endA, endB;
-  // double buffer, one is accessed by loader
-  // the other is accessed by consumer
-  // buffer of the data
-  std::vector<Elem> bufA, bufB;
-  // initialization end
-  bool init_end;
-  // singal whether the data is loaded
-  bool data_loaded;
-  // signal to kill the thread
-  bool destroy_signal;
-  // thread object
-  Thread loader_thread;
-  // signal of the buffer
-  Semaphore loading_end, loading_need;
-  /*!
-   * \brief slave thread
-   * this implementation is like producer-consumer style
-   */
-  inline void RunLoader(void) {
-    while (!destroy_signal) {
-      // sleep until loading is needed
-      loading_need.Wait();
-      std::vector<Elem> &buf = current_buf ? bufB : bufA;
-      int i;
-      for (i = 0; i < buf_size ; ++i) {
-        if (!factory.LoadNext(buf[i])) {
-          int &end = current_buf ? endB : endA;
-          end = i;  // marks the termination
-          break;
-        }
-      }
-      // signal that loading is done
-      data_loaded = true;
-      loading_end.Post();
-    }
-  }
-  /*!\brief entry point of loader thread */
-  inline static XGBOOST_THREAD_PREFIX LoaderEntry(void *pthread) {
-    static_cast< ThreadBuffer<Elem, ElemFactory>* >(pthread)->RunLoader();
-    return NULL;
-  }
-  /*!\brief start loader thread */
-  inline void StartLoader(void) {
-    destroy_signal = false;
-    // set param
-    current_buf = 1;
-    loading_need.Init(1);
-    loading_end .Init(0);
-    // reset terminate limit
-    endA = endB = buf_size;
-    loader_thread.Start(LoaderEntry, this);
-    // wait until first part of data is loaded
-    loading_end.Wait();
-    // set current buf to right value
-    current_buf = 0;
-    // wake loader for next part
-    data_loaded = false;
-    loading_need.Post();
-    buf_index = 0;
-  }
-  /*!\brief switch double buffer */
-  inline void SwitchBuffer(void) {
-    loading_end.Wait();
-    // loader shall be sleep now, critcal zone!
-    current_buf = !current_buf;
-    // wake up loader
-    data_loaded = false;
-    loading_need.Post();
-  }
-};
-#else
-// a dummy single threaded ThreadBuffer
-// use this to resolve R's solaris compatibility for now
-template<typename Elem, typename ElemFactory>
-class ThreadBuffer {
- public:
-  ThreadBuffer() : init_end_(false) {}
-  ~ThreadBuffer() {
-    if (init_end_) {
-      factory_.FreeSpace(data_);
-      factory_.Destroy();
-    }
-  }
-  inline void SetParam(const char *name, const char *val) {
-  }
-  inline bool Init(void) {
-    if (!factory_.Init()) return false;
-    data_ = factory_.Create();
-    return (init_end_ = true);
-  }
-  inline void BeforeFirst(void) {
-    factory_.BeforeFirst();
-  }
-  inline bool Next(Elem &elem) { // NOLINT(*)
-    if (factory_.LoadNext(data_)) {
-      elem = data_; return true;
-    } else {
-      return false;
-    }
-  }
-  inline ElemFactory &get_factory() {
-    return factory_;
-  }
-  inline const ElemFactory &get_factory() const {
-    return factory_;
-  }
-
- private:
-  // initialized
-  bool init_end_;
-  // current data
-  Elem data_;
-  // factory object used to load configures
-  ElemFactory factory_;
-};
-#endif  // !defined(XGBOOST_STRICT_CXX98_)
-}  // namespace utils
-}  // namespace xgboost
-#endif  // XGBOOST_UTILS_THREAD_BUFFER_H_
--- a/src/utils/utils.h
+++ b/src/utils/utils.h
@@ -1,188 +0,0 @@
-/*!
- * Copyright 2014 by Contributors
- * \file utils.h
- * \brief simple utils to support the code
- * \author Tianqi Chen
- */
-#ifndef XGBOOST_UTILS_UTILS_H_
-#define XGBOOST_UTILS_UTILS_H_
-
-#define _CRT_SECURE_NO_WARNINGS
-#include <cstdio>
-#include <string>
-#include <cstdlib>
-#include <vector>
-#include <stdexcept>
-
-#ifndef XGBOOST_STRICT_CXX98_
-#include <cstdarg>
-#endif
-
-#if !defined(__GNUC__)
-#define fopen64 std::fopen
-#endif
-#ifdef _MSC_VER
-// NOTE: sprintf_s is not equivalent to snprintf,
-// they are equivalent when success, which is sufficient for our case
-#define snprintf sprintf_s
-#define vsnprintf vsprintf_s
-#else
-#ifdef _FILE_OFFSET_BITS
-#if _FILE_OFFSET_BITS == 32
-#pragma message("Warning: FILE OFFSET BITS defined to be 32 bit")
-#endif
-#endif
-
-#ifdef __APPLE__
-#define off64_t off_t
-#define fopen64 std::fopen
-#endif
-
-extern "C" {
-#include <sys/types.h>
-}
-#endif
-
-#ifdef _MSC_VER
-typedef unsigned char uint8_t;
-typedef unsigned __int16 uint16_t;
-typedef unsigned __int32 uint32_t;
-typedef unsigned __int64 uint64_t;
-typedef __int64 int64_t;
-#else
-#include <inttypes.h>
-#endif
-
-namespace xgboost {
-/*! \brief namespace for helper utils of the project */
-namespace utils {
-
-/*! \brief error message buffer length */
-const int kPrintBuffer = 1 << 12;
-
-#ifndef XGBOOST_CUSTOMIZE_MSG_
-/*!
- * \brief handling of Assert error, caused by inappropriate input
- * \param msg error message
- */
-inline void HandleAssertError(const char *msg) {
-  fprintf(stderr, "AssertError:%s\n", msg);
-  exit(-1);
-}
-/*!
- * \brief handling of Check error, caused by inappropriate input
- * \param msg error message
- */
-inline void HandleCheckError(const char *msg) {
-  throw std::runtime_error(msg);
-}
-inline void HandlePrint(const char *msg) {
-  printf("%s", msg);
-}
-#else
-#ifndef XGBOOST_STRICT_CXX98_
-// include declarations, some one must implement this
-void HandleAssertError(const char *msg);
-void HandleCheckError(const char *msg);
-void HandlePrint(const char *msg);
-#endif
-#endif
-#ifdef XGBOOST_STRICT_CXX98_
-// these function pointers are to be assigned
-extern "C" void (*Printf)(const char *fmt, ...);
-extern "C" int (*SPrintf)(char *buf, size_t size, const char *fmt, ...);
-extern "C" void (*Assert)(int exp, const char *fmt, ...);
-extern "C" void (*Check)(int exp, const char *fmt, ...);
-extern "C" void (*Error)(const char *fmt, ...);
-#else
-/*! \brief printf, print message to the console */
-inline void Printf(const char *fmt, ...) {
-  std::string msg(kPrintBuffer, '\0');
-  va_list args;
-  va_start(args, fmt);
-  vsnprintf(&msg[0], kPrintBuffer, fmt, args);
-  va_end(args);
-  HandlePrint(msg.c_str());
-}
-/*! \brief portable version of snprintf */
-inline int SPrintf(char *buf, size_t size, const char *fmt, ...) {
-  va_list args;
-  va_start(args, fmt);
-  int ret = vsnprintf(buf, size, fmt, args);
-  va_end(args);
-  return ret;
-}
-
-/*! \brief assert an condition is true, use this to handle debug information */
-inline void Assert(bool exp, const char *fmt, ...) {
-  if (!exp) {
-    std::string msg(kPrintBuffer, '\0');
-    va_list args;
-    va_start(args, fmt);
-    vsnprintf(&msg[0], kPrintBuffer, fmt, args);
-    va_end(args);
-    HandleAssertError(msg.c_str());
-  }
-}
-
-/*!\brief same as assert, but this is intended to be used as message for user*/
-inline void Check(bool exp, const char *fmt, ...) {
-  if (!exp) {
-    std::string msg(kPrintBuffer, '\0');
-    va_list args;
-    va_start(args, fmt);
-    vsnprintf(&msg[0], kPrintBuffer, fmt, args);
-    va_end(args);
-    HandleCheckError(msg.c_str());
-  }
-}
-
-/*! \brief report error message, same as check */
-inline void Error(const char *fmt, ...) {
-  {
-    std::string msg(kPrintBuffer, '\0');
-    va_list args;
-    va_start(args, fmt);
-    vsnprintf(&msg[0], kPrintBuffer, fmt, args);
-    va_end(args);
-    HandleCheckError(msg.c_str());
-  }
-}
-#endif
-
-/*! \brief replace fopen, report error when the file open fails */
-inline std::FILE *FopenCheck(const char *fname, const char *flag) {
-  std::FILE *fp = fopen64(fname, flag);
-  Check(fp != NULL, "can not open file \"%s\"\n", fname);
-  return fp;
-}
-}  // namespace utils
-// easy utils that can be directly accessed in xgboost
-/*! \brief get the beginning address of a vector */
-template<typename T>
-inline T *BeginPtr(std::vector<T> &vec) { // NOLINT(*)
-  if (vec.size() == 0) {
-    return NULL;
-  } else {
-    return &vec[0];
-  }
-}
-/*! \brief get the beginning address of a vector */
-template<typename T>
-inline const T *BeginPtr(const std::vector<T> &vec) {
-  if (vec.size() == 0) {
-    return NULL;
-  } else {
-    return &vec[0];
-  }
-}
-inline char* BeginPtr(std::string &str) { // NOLINT(*)
-  if (str.length() == 0) return NULL;
-  return &str[0];
-}
-inline const char* BeginPtr(const std::string &str) {
-  if (str.length() == 0) return NULL;
-  return &str[0];
-}
-}  // namespace xgboost
-#endif  // XGBOOST_UTILS_UTILS_H_
--- a/src/xgboost_main.cpp
+++ b/src/xgboost_main.cpp
@@ -1,335 +0,0 @@
-// Copyright 2014 by Contributors
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-#define NOMINMAX
-#include <ctime>
-#include <string>
-#include <cstring>
-#include <vector>
-#include "./sync/sync.h"
-#include "./io/io.h"
-#include "./utils/utils.h"
-#include "./utils/config.h"
-#include "./learner/learner-inl.hpp"
-
-namespace xgboost {
-/*!
- * \brief wrapping the training process
- */
-class BoostLearnTask {
- public:
-  inline int Run(int argc, char *argv[]) {
-    if (argc < 2) {
-      printf("Usage: <config>\n");
-      return 0;
-    }
-    utils::ConfigIterator itr(argv[1]);
-    while (itr.Next()) {
-      this->SetParam(itr.name(), itr.val());
-    }
-    for (int i = 2; i < argc; ++i) {
-      char name[256], val[256];
-      if (sscanf(argv[i], "%[^=]=%s", name, val) == 2) {
-        this->SetParam(name, val);
-      }
-    }
-    // do not save anything when save to stdout
-    if (model_out == "stdout" || name_pred == "stdout") {
-      this->SetParam("silent", "1");
-      save_period = 0;
-    }
-    // initialized the result
-    rabit::Init(argc, argv);
-    if (rabit::IsDistributed()) {
-      std::string pname = rabit::GetProcessorName();
-      fprintf(stderr, "start %s:%d\n", pname.c_str(), rabit::GetRank());
-    }
-    if (rabit::IsDistributed() && data_split == "NONE") {
-      this->SetParam("dsplit", "row");
-    }
-    if (rabit::GetRank() != 0) {
-      this->SetParam("silent", "2");
-    }
-    this->InitData();
-
-    if (task == "train") {
-      // if task is training, will try recover from checkpoint
-      this->TaskTrain();
-      return 0;
-    } else {
-      this->InitLearner();
-    }
-    if (task == "dump") {
-      this->TaskDump(); return 0;
-    }
-    if (task == "eval") {
-      this->TaskEval(); return 0;
-    }
-    if (task == "pred") {
-      this->TaskPred();
-    }
-    return 0;
-  }
-  inline void SetParam(const char *name, const char *val) {
-    if (!strcmp("silent", name)) silent = atoi(val);
-    if (!strcmp("use_buffer", name)) use_buffer = atoi(val);
-    if (!strcmp("num_round", name)) num_round = atoi(val);
-    if (!strcmp("pred_margin", name)) pred_margin = atoi(val);
-    if (!strcmp("ntree_limit", name)) ntree_limit = atoi(val);
-    if (!strcmp("save_period", name)) save_period = atoi(val);
-    if (!strcmp("eval_train", name)) eval_train = atoi(val);
-    if (!strcmp("task", name)) task = val;
-    if (!strcmp("data", name)) train_path = val;
-    if (!strcmp("test:data", name)) test_path = val;
-    if (!strcmp("model_in", name)) model_in = val;
-    if (!strcmp("model_out", name)) model_out = val;
-    if (!strcmp("model_dir", name)) model_dir_path = val;
-    if (!strcmp("fmap", name)) name_fmap = val;
-    if (!strcmp("name_dump", name)) name_dump = val;
-    if (!strcmp("name_pred", name)) name_pred = val;
-    if (!strcmp("dsplit", name)) data_split = val;
-    if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val);
-    if (!strcmp("save_pbuffer", name)) save_with_pbuffer = atoi(val);
-    if (!strncmp("eval[", name, 5)) {
-      char evname[256];
-      utils::Assert(sscanf(name, "eval[%[^]]", evname) == 1,
-                    "must specify evaluation name for display");
-      eval_data_names.push_back(std::string(evname));
-      eval_data_paths.push_back(std::string(val));
-    }
-    learner.SetParam(name, val);
-  }
-
- public:
-  BoostLearnTask(void) {
-    // default parameters
-    silent = 0;
-    use_buffer = 1;
-    num_round = 10;
-    save_period = 0;
-    eval_train = 0;
-    pred_margin = 0;
-    ntree_limit = 0;
-    dump_model_stats = 0;
-    task = "train";
-    model_in = "NULL";
-    model_out = "NULL";
-    name_fmap = "NULL";
-    name_pred = "pred.txt";
-    name_dump = "dump.txt";
-    model_dir_path = "./";
-    data_split = "NONE";
-    load_part = 0;
-    save_with_pbuffer = 0;
-    data = NULL;
-  }
-  ~BoostLearnTask(void) {
-    for (size_t i = 0; i < deval.size(); i++) {
-      delete deval[i];
-    }
-    if (data != NULL) delete data;
-  }
-
- private:
-  inline void InitData(void) {
-    if (strchr(train_path.c_str(), '%') != NULL) {
-      char s_tmp[256];
-      utils::SPrintf(s_tmp, sizeof(s_tmp), train_path.c_str(), rabit::GetRank());
-      train_path = s_tmp;
-      load_part = 1;
-    }
-    bool loadsplit = data_split == "row";
-    if (name_fmap != "NULL") fmap.LoadText(name_fmap.c_str());
-    if (task == "dump") return;
-    if (task == "pred") {
-      data = io::LoadDataMatrix(test_path.c_str(), silent != 0, use_buffer != 0, loadsplit);
-    } else {
-      // training
-      data = io::LoadDataMatrix(train_path.c_str(),
-                                silent != 0 && load_part == 0,
-                                use_buffer != 0, loadsplit);
-      utils::Assert(eval_data_names.size() == eval_data_paths.size(), "BUG");
-      for (size_t i = 0; i < eval_data_names.size(); ++i) {
-        deval.push_back(io::LoadDataMatrix(eval_data_paths[i].c_str(),
-                                           silent != 0,
-                                           use_buffer != 0,
-                                           loadsplit));
-        devalall.push_back(deval.back());
-      }
-
-      std::vector<io::DataMatrix *> dcache(1, data);
-      for (size_t i = 0; i < deval.size(); ++i) {
-        dcache.push_back(deval[i]);
-      }
-      // set cache data to be all training and evaluation data
-      learner.SetCacheData(dcache);
-
-      // add training set to evaluation set if needed
-      if (eval_train != 0) {
-        devalall.push_back(data);
-        eval_data_names.push_back(std::string("train"));
-      }
-    }
-  }
-  inline void InitLearner(void) {
-    if (model_in != "NULL") {
-      learner.LoadModel(model_in.c_str());
-    } else {
-      utils::Assert(task == "train", "model_in not specified");
-      learner.InitModel();
-    }
-  }
-  inline void TaskTrain(void) {
-    int version = rabit::LoadCheckPoint(&learner);
-    if (version == 0) this->InitLearner();
-    const time_t start = time(NULL);
-    unsigned long elapsed = 0;  // NOLINT(*)
-    learner.CheckInit(data);
-
-    bool allow_lazy = learner.AllowLazyCheckPoint();
-    for (int i = version / 2; i < num_round; ++i) {
-      elapsed = (unsigned long)(time(NULL) - start);  // NOLINT(*)
-      if (version % 2 == 0) {
-        if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
-        learner.UpdateOneIter(i, *data);
-        if (allow_lazy) {
-          rabit::LazyCheckPoint(&learner);
-        } else {
-          rabit::CheckPoint(&learner);
-        }
-        version += 1;
-      }
-      utils::Assert(version == rabit::VersionNumber(), "consistent check");
-      std::string res = learner.EvalOneIter(i, devalall, eval_data_names);
-      if (rabit::IsDistributed()) {
-        if (rabit::GetRank() == 0) {
-          rabit::TrackerPrintf("%s\n", res.c_str());
-        }
-      } else {
-        if (silent < 2) {
-          fprintf(stderr, "%s\n", res.c_str());
-        }
-      }
-      if (save_period != 0 && (i + 1) % save_period == 0) {
-        this->SaveModel(i);
-      }
-      if (allow_lazy) {
-        rabit::LazyCheckPoint(&learner);
-      } else {
-        rabit::CheckPoint(&learner);
-      }
-      version += 1;
-      utils::Assert(version == rabit::VersionNumber(), "consistent check");
-      elapsed = (unsigned long)(time(NULL) - start);  // NOLINT(*)
-    }
-    // always save final round
-    if ((save_period == 0 || num_round % save_period != 0) && model_out != "NONE") {
-      if (model_out == "NULL") {
-        this->SaveModel(num_round - 1);
-      } else {
-        this->SaveModel(model_out.c_str());
-      }
-    }
-    if (!silent) {
-      printf("\nupdating end, %lu sec in all\n", elapsed);
-    }
-  }
-  inline void TaskEval(void) {
-    learner.EvalOneIter(0, devalall, eval_data_names);
-  }
-  inline void TaskDump(void) {
-    FILE *fo = utils::FopenCheck(name_dump.c_str(), "w");
-    std::vector<std::string> dump = learner.DumpModel(fmap, dump_model_stats != 0);
-    for (size_t i = 0; i < dump.size(); ++i) {
-      fprintf(fo, "booster[%lu]:\n", i);
-      fprintf(fo, "%s", dump[i].c_str());
-    }
-    fclose(fo);
-  }
-  inline void SaveModel(const char *fname) const {
-    if (rabit::GetRank() != 0) return;
-    learner.SaveModel(fname, save_with_pbuffer != 0);
-  }
-  inline void SaveModel(int i) const {
-    char fname[256];
-    utils::SPrintf(fname, sizeof(fname),
-                   "%s/%04d.model", model_dir_path.c_str(), i + 1);
-    this->SaveModel(fname);
-  }
-  inline void TaskPred(void) {
-    std::vector<float> preds;
-    if (!silent) printf("start prediction...\n");
-    learner.Predict(*data, pred_margin != 0, &preds, ntree_limit);
-    if (!silent) printf("writing prediction to %s\n", name_pred.c_str());
-    FILE *fo;
-    if (name_pred != "stdout") {
-      fo = utils::FopenCheck(name_pred.c_str(), "w");
-    } else {
-      fo = stdout;
-    }
-    for (size_t i = 0; i < preds.size(); ++i) {
-      fprintf(fo, "%g\n", preds[i]);
-    }
-    if (fo != stdout) fclose(fo);
-  }
-
- private:
-  /*! \brief whether silent */
-  int silent;
-  /*! \brief special load */
-  int load_part;
-  /*! \brief whether use auto binary buffer */
-  int use_buffer;
-  /*! \brief whether evaluate training statistics */
-  int eval_train;
-  /*! \brief number of boosting iterations */
-  int num_round;
-  /*! \brief the period to save the model, 0 means only save the final round model */
-  int save_period;
-  /*! \brief the path of training/test data set */
-  std::string train_path, test_path;
-  /*! \brief the path of test model file, or file to restart training */
-  std::string model_in;
-  /*! \brief the path of final model file, to be saved */
-  std::string model_out;
-  /*! \brief the path of directory containing the saved models */
-  std::string model_dir_path;
-  /*! \brief task to perform */
-  std::string task;
-  /*! \brief name of predict file */
-  std::string name_pred;
-  /*! \brief data split mode */
-  std::string data_split;
-  /*!\brief limit number of trees in prediction */
-  int ntree_limit;
-  /*!\brief whether to directly output margin value */
-  int pred_margin;
-  /*! \brief whether dump statistics along with model */
-  int dump_model_stats;
-  /*! \brief whether save prediction buffer */
-  int save_with_pbuffer;
-  /*! \brief name of feature map */
-  std::string name_fmap;
-  /*! \brief name of dump file */
-  std::string name_dump;
-  /*! \brief the paths of validation data sets */
-  std::vector<std::string> eval_data_paths;
-  /*! \brief the names of the evaluation data used in output log */
-  std::vector<std::string> eval_data_names;
-
- private:
-  io::DataMatrix* data;
-  std::vector<io::DataMatrix*> deval;
-  std::vector<const io::DataMatrix*> devalall;
-  utils::FeatMap fmap;
-  learner::BoostLearner learner;
-};
-}  // namespace xgboost
-
-int main(int argc, char *argv[]) {
-  xgboost::BoostLearnTask tsk;
-  tsk.SetParam("seed", "0");
-  int ret = tsk.Run(argc, argv);
-  rabit::Finalize();
-  return ret;
-}