Patch to improve multithreaded performance scaling (#2493)

* Patch to improve multithreaded performance scaling Change parallel strategy for histogram construction. Instead of partitioning data rows among multiple threads, partition feature columns instead. Useful heuristics for assigning partitions have been adopted from LightGBM project. * Add missing header to satisfy MSVC * Restore max_bin and related parameters to TrainParam * Fix lint error * inline functions do not require static keyword * Feature grouping algorithm accepting FastHistParam Feature grouping algorithm accepts many parameters (3+), and it gets annoying to pass them one by one. Instead, simply pass the reference to FastHistParam. The definition of FastHistParam has been moved to a separate header file to accommodate this change.
2017-07-07 08:25:07 -07:00 · 2017-07-07 08:25:07 -07:00 · ba820847f9
commit ba820847f9
parent 6bfc472bec
6 changed files with 466 additions and 52 deletions
--- a/src/common/column_matrix.h
+++ b/src/common/column_matrix.h
@ -29,6 +29,9 @@ switch (dtype) {                \
 #include <limits>
 #include <vector>
 #include "hist_util.h"
+#include "../tree/fast_hist_param.h"
+
+using xgboost::tree::FastHistParam;

 namespace xgboost {
 namespace common {
@ -68,8 +71,9 @@ class ColumnMatrix {
  }

  // construct column matrix from GHistIndexMatrix
-  inline void Init(const GHistIndexMatrix& gmat, DataType dtype) {
-    this->dtype = dtype;
+  inline void Init(const GHistIndexMatrix& gmat,
+                   const FastHistParam& param) {
+    this->dtype = static_cast<DataType>(param.colmat_dtype);
    /* if dtype is smaller than uint32_t, multiple bin_id's will be stored in each
       slot of internal buffer. */
    packing_factor_ = sizeof(uint32_t) / static_cast<size_t>(this->dtype);
@ -93,7 +97,8 @@ class ColumnMatrix {
    gmat.GetFeatureCounts(&feature_counts_[0]);
    // classify features
    for (uint32_t fid = 0; fid < nfeature; ++fid) {
-      if (static_cast<double>(feature_counts_[fid]) < 0.5*nrow) {
+      if (static_cast<double>(feature_counts_[fid])
+                 < param.sparse_threshold * nrow) {
        type_[fid] = kSparseColumn;
      } else {
        type_[fid] = kDenseColumn;
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@ -5,10 +5,12 @@
 * \author Philip Cho, Tianqi Chen
 */
 #include <dmlc/omp.h>
+#include <numeric>
 #include <vector>
 #include "./sync.h"
-#include "./hist_util.h"
+#include "./random.h"
 #include "./column_matrix.h"
+#include "./hist_util.h"
 #include "./quantile.h"

 namespace xgboost {
@ -154,6 +156,246 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
  }
 }

+template <typename T>
+static unsigned GetConflictCount(const std::vector<bool>& mark,
+                                 const Column<T>& column,
+                                 unsigned max_cnt) {
+  unsigned ret = 0;
+  if (column.type == xgboost::common::kDenseColumn) {
+    for (size_t i = 0; i < column.len; ++i) {
+      if (column.index[i] != std::numeric_limits<T>::max() && mark[i]) {
+        ++ret;
+        if (ret > max_cnt) {
+          return max_cnt + 1;
+        }
+      }
+    }
+  } else {
+    for (size_t i = 0; i < column.len; ++i) {
+      if (mark[column.row_ind[i]]) {
+        ++ret;
+        if (ret > max_cnt) {
+          return max_cnt + 1;
+        }
+      }
+    }
+  }
+  return ret;
+}
+
+template <typename T>
+inline void
+MarkUsed(std::vector<bool>* p_mark, const Column<T>& column) {
+  std::vector<bool>& mark = *p_mark;
+  if (column.type == xgboost::common::kDenseColumn) {
+    for (size_t i = 0; i < column.len; ++i) {
+      if (column.index[i] != std::numeric_limits<T>::max()) {
+        mark[i] = true;
+      }
+    }
+  } else {
+    for (size_t i = 0; i < column.len; ++i) {
+      mark[column.row_ind[i]] = true;
+    }
+  }
+}
+
+template <typename T>
+inline std::vector<std::vector<unsigned>>
+FindGroups_(const std::vector<unsigned>& feature_list,
+            const std::vector<bst_uint>& feature_nnz,
+            const ColumnMatrix& colmat,
+            unsigned nrow,
+            const FastHistParam& param) {
+  /* Goal: Bundle features together that have little or no "overlap", i.e.
+           only a few data points should have nonzero values for
+           member features.
+           Note that one-hot encoded features will be grouped together. */
+
+  std::vector<std::vector<unsigned>> groups;
+  std::vector<std::vector<bool>> conflict_marks;
+  std::vector<unsigned> group_nnz;
+  std::vector<unsigned> group_conflict_cnt;
+  const unsigned max_conflict_cnt
+    = static_cast<unsigned>(param.max_conflict_rate * nrow);
+
+  for (auto fid : feature_list) {
+    const Column<T>& column = colmat.GetColumn<T>(fid);
+
+    const size_t cur_fid_nnz = feature_nnz[fid];
+    bool need_new_group = true;
+
+    // randomly choose some of existing groups as candidates
+    std::vector<unsigned> search_groups;
+    for (size_t gid = 0; gid < groups.size(); ++gid) {
+      if (group_nnz[gid] + cur_fid_nnz <= nrow + max_conflict_cnt) {
+        search_groups.push_back(gid);
+      }
+    }
+    std::shuffle(search_groups.begin(), search_groups.end(), common::GlobalRandom());
+    if (param.max_search_group > 0 && search_groups.size() > param.max_search_group) {
+      search_groups.resize(param.max_search_group);
+    }
+
+    // examine each candidate group: is it okay to insert fid?
+    for (auto gid : search_groups) {
+      const unsigned rest_max_cnt = max_conflict_cnt - group_conflict_cnt[gid];
+      const unsigned cnt = GetConflictCount(conflict_marks[gid], column, rest_max_cnt);
+      if (cnt <= rest_max_cnt) {
+        need_new_group = false;
+        groups[gid].push_back(fid);
+        group_conflict_cnt[gid] += cnt;
+        group_nnz[gid] += cur_fid_nnz - cnt;
+        MarkUsed(&conflict_marks[gid], column);
+        break;
+      }
+    }
+
+    // create new group if necessary
+    if (need_new_group) {
+      groups.emplace_back();
+      groups.back().push_back(fid);
+      group_conflict_cnt.push_back(0);
+      conflict_marks.emplace_back(nrow, false);
+      MarkUsed(&conflict_marks.back(), column);
+      group_nnz.emplace_back(cur_fid_nnz);
+    }
+  }
+
+  return groups;
+}
+
+inline std::vector<std::vector<unsigned>>
+FindGroups(const std::vector<unsigned>& feature_list,
+           const std::vector<bst_uint>& feature_nnz,
+           const ColumnMatrix& colmat,
+           unsigned nrow,
+           const FastHistParam& param) {
+  XGBOOST_TYPE_SWITCH(colmat.dtype, {
+    return FindGroups_<DType>(feature_list, feature_nnz, colmat, nrow, param);
+  });
+  return std::vector<std::vector<unsigned>>();  // to avoid warning message
+}
+
+inline std::vector<std::vector<unsigned>>
+FastFeatureGrouping(const GHistIndexMatrix& gmat,
+                    const ColumnMatrix& colmat,
+                    const FastHistParam& param) {
+  const size_t nrow = gmat.row_ptr.size() - 1;
+  const size_t nfeature = gmat.cut->row_ptr.size() - 1;
+
+  std::vector<unsigned> feature_list(nfeature);
+  std::iota(feature_list.begin(), feature_list.end(), 0);
+
+  // sort features by nonzero counts, descending order
+  std::vector<bst_uint> feature_nnz(nfeature);
+  std::vector<unsigned> features_by_nnz(feature_list);
+  gmat.GetFeatureCounts(&feature_nnz[0]);
+  std::sort(features_by_nnz.begin(), features_by_nnz.end(),
+            [&feature_nnz](int a, int b) {
+    return feature_nnz[a] > feature_nnz[b];
+  });
+
+  auto groups_alt1 = FindGroups(feature_list, feature_nnz, colmat, nrow, param);
+  auto groups_alt2 = FindGroups(features_by_nnz, feature_nnz, colmat, nrow, param);
+  auto& groups = (groups_alt1.size() > groups_alt2.size()) ? groups_alt2 : groups_alt1;
+
+  // take apart small, sparse groups, as it won't help speed
+  {
+    std::vector<std::vector<unsigned>> ret;
+    for (const auto& group : groups) {
+      if (group.size() <= 1 || group.size() >= 5) {
+        ret.push_back(group);  // keep singleton groups and large (5+) groups
+      } else {
+        unsigned nnz = 0;
+        for (auto fid : group) {
+          nnz += feature_nnz[fid];
+        }
+        double nnz_rate = static_cast<double>(nnz) / nrow;
+        // take apart small sparse group, since it would not yield any speedup
+        if (nnz_rate <= param.sparse_threshold) {
+          for (auto fid : group) {
+            ret.emplace_back();
+            ret.back().push_back(fid);
+          }
+        } else {
+          ret.push_back(group);
+        }
+      }
+    }
+    groups = std::move(ret);
+  }
+
+  // shuffle groups
+  std::shuffle(groups.begin(), groups.end(), common::GlobalRandom());
+
+  return groups;
+}
+
+void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
+                                 const ColumnMatrix& colmat,
+                                 const FastHistParam& param) {
+  cut = gmat.cut;
+
+  const size_t nrow = gmat.row_ptr.size() - 1;
+  const size_t nbins = gmat.cut->row_ptr.back();
+
+  /* step 1: form feature groups */
+  auto groups = FastFeatureGrouping(gmat, colmat, param);
+  const size_t nblock = groups.size();
+
+  /* step 2: build a new CSR matrix for each feature group */
+  std::vector<unsigned> bin2block(nbins);  // lookup table [bin id] => [block id]
+  for (size_t group_id = 0; group_id < nblock; ++group_id) {
+    for (auto& fid : groups[group_id]) {
+      const unsigned bin_begin = gmat.cut->row_ptr[fid];
+      const unsigned bin_end = gmat.cut->row_ptr[fid + 1];
+      for (unsigned bin_id = bin_begin; bin_id < bin_end; ++bin_id) {
+        bin2block[bin_id] = group_id;
+      }
+    }
+  }
+  std::vector<std::vector<unsigned>> index_temp(nblock);
+  std::vector<std::vector<unsigned>> row_ptr_temp(nblock);
+  for (size_t block_id = 0; block_id < nblock; ++block_id) {
+    row_ptr_temp[block_id].push_back(0);
+  }
+  for (size_t rid = 0; rid < nrow; ++rid) {
+    const size_t ibegin = static_cast<size_t>(gmat.row_ptr[rid]);
+    const size_t iend = static_cast<size_t>(gmat.row_ptr[rid + 1]);
+    for (size_t j = ibegin; j < iend; ++j) {
+      const size_t bin_id = gmat.index[j];
+      const size_t block_id = bin2block[bin_id];
+      index_temp[block_id].push_back(bin_id);
+    }
+    for (size_t block_id = 0; block_id < nblock; ++block_id) {
+      row_ptr_temp[block_id].push_back(index_temp[block_id].size());
+    }
+  }
+
+  /* step 3: concatenate CSR matrices into one (index, row_ptr) pair */
+  std::vector<size_t> index_blk_ptr;
+  std::vector<size_t> row_ptr_blk_ptr;
+  index_blk_ptr.push_back(0);
+  row_ptr_blk_ptr.push_back(0);
+  for (size_t block_id = 0; block_id < nblock; ++block_id) {
+    index.insert(index.end(), index_temp[block_id].begin(), index_temp[block_id].end());
+    row_ptr.insert(row_ptr.end(), row_ptr_temp[block_id].begin(), row_ptr_temp[block_id].end());
+    index_blk_ptr.push_back(index.size());
+    row_ptr_blk_ptr.push_back(row_ptr.size());
+  }
+
+  // save shortcut for each block
+  for (size_t block_id = 0; block_id < nblock; ++block_id) {
+    Block blk;
+    blk.index_begin = &index[index_blk_ptr[block_id]];
+    blk.row_ptr_begin = &row_ptr[row_ptr_blk_ptr[block_id]];
+    blk.index_end = &index[index_blk_ptr[block_id + 1]];
+    blk.row_ptr_end = &row_ptr[row_ptr_blk_ptr[block_id + 1]];
+    blocks.push_back(blk);
+  }
+}
+
 void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
                             const RowSetCollection::Elem row_indices,
                             const GHistIndexMatrix& gmat,
@ -161,33 +403,12 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
                             GHistRow hist) {
  data_.resize(nbins_ * nthread_, GHistEntry());
  std::fill(data_.begin(), data_.end(), GHistEntry());
-  stat_buf_.resize(row_indices.size());

  const int K = 8;  // loop unrolling factor
  const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
  const bst_omp_uint nrows = row_indices.end - row_indices.begin;
  const bst_omp_uint rest = nrows % K;

-  #pragma omp parallel for num_threads(nthread) schedule(static)
-  for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
-    bst_uint rid[K];
-    bst_gpair stat[K];
-    for (int k = 0; k < K; ++k) {
-      rid[k] = row_indices.begin[i + k];
-    }
-    for (int k = 0; k < K; ++k) {
-      stat[k] = gpair[rid[k]];
-    }
-    for (int k = 0; k < K; ++k) {
-      stat_buf_[i + k] = stat[k];
-    }
-  }
-  for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
-    const bst_uint rid = row_indices.begin[i];
-    const bst_gpair stat = gpair[rid];
-    stat_buf_[i] = stat;
-  }
-
  #pragma omp parallel for num_threads(nthread) schedule(guided)
  for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
    const bst_omp_uint tid = omp_get_thread_num();
@ -204,7 +425,7 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
      iend[k] = static_cast<size_t>(gmat.row_ptr[rid[k] + 1]);
    }
    for (int k = 0; k < K; ++k) {
-      stat[k] = stat_buf_[i + k];
+      stat[k] = gpair[rid[k]];
    }
    for (int k = 0; k < K; ++k) {
      for (size_t j = ibegin[k]; j < iend[k]; ++j) {
@ -217,7 +438,7 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
    const bst_uint rid = row_indices.begin[i];
    const size_t ibegin = static_cast<size_t>(gmat.row_ptr[rid]);
    const size_t iend = static_cast<size_t>(gmat.row_ptr[rid + 1]);
-    const bst_gpair stat = stat_buf_[i];
+    const bst_gpair stat = gpair[rid];
    for (size_t j = ibegin; j < iend; ++j) {
      const size_t bin = gmat.index[j];
      data_[bin].Add(stat);
@ -234,10 +455,60 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
  }
 }

+void GHistBuilder::BuildBlockHist(const std::vector<bst_gpair>& gpair,
+                                  const RowSetCollection::Elem row_indices,
+                                  const GHistIndexBlockMatrix& gmatb,
+                                  const std::vector<bst_uint>& feat_set,
+                                  GHistRow hist) {
+  const int K = 8;  // loop unrolling factor
+  const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
+  const bst_omp_uint nblock = gmatb.GetNumBlock();
+  const bst_omp_uint nrows = row_indices.end - row_indices.begin;
+  const bst_omp_uint rest = nrows % K;
+
+  #pragma omp parallel for num_threads(nthread) schedule(guided)
+  for (bst_omp_uint bid = 0; bid < nblock; ++bid) {
+    auto gmat = gmatb[bid];
+
+    for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
+      bst_uint rid[K];
+      size_t ibegin[K];
+      size_t iend[K];
+      bst_gpair stat[K];
+      for (int k = 0; k < K; ++k) {
+        rid[k] = row_indices.begin[i + k];
+      }
+      for (int k = 0; k < K; ++k) {
+        ibegin[k] = static_cast<size_t>(gmat.row_ptr[rid[k]]);
+        iend[k] = static_cast<size_t>(gmat.row_ptr[rid[k] + 1]);
+      }
+      for (int k = 0; k < K; ++k) {
+        stat[k] = gpair[rid[k]];
+      }
+      for (int k = 0; k < K; ++k) {
+        for (size_t j = ibegin[k]; j < iend[k]; ++j) {
+          const size_t bin = gmat.index[j];
+          hist.begin[bin].Add(stat[k]);
+        }
+      }
+    }
+    for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
+      const bst_uint rid = row_indices.begin[i];
+      const size_t ibegin = static_cast<size_t>(gmat.row_ptr[rid]);
+      const size_t iend = static_cast<size_t>(gmat.row_ptr[rid + 1]);
+      const bst_gpair stat = gpair[rid];
+      for (size_t j = ibegin; j < iend; ++j) {
+        const size_t bin = gmat.index[j];
+        hist.begin[bin].Add(stat);
+      }
+    }
+  }
+}
+
 void GHistBuilder::SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
  const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
  const bst_omp_uint nbins = static_cast<bst_omp_uint>(nbins_);
-  const int K = 8;
+  const int K = 8;  // loop unrolling factor
  const bst_omp_uint rest = nbins % K;
  #pragma omp parallel for num_threads(nthread) schedule(static)
  for (bst_omp_uint bin_id = 0; bin_id < nbins - rest; bin_id += K) {
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@ -11,6 +11,9 @@
 #include <limits>
 #include <vector>
 #include "row_set.h"
+#include "../tree/fast_hist_param.h"
+
+using xgboost::tree::FastHistParam;

 namespace xgboost {
 namespace common {
@ -24,6 +27,10 @@ struct GHistEntry {

  GHistEntry() : sum_grad(0), sum_hess(0) {}

+  inline void Clear() {
+    sum_grad = sum_hess = 0;
+  }
+
  /*! \brief add a bst_gpair to the sum */
  inline void Add(const bst_gpair& e) {
    sum_grad += e.grad;
@ -125,6 +132,48 @@ struct GHistIndexMatrix {
  std::vector<unsigned> hit_count_tloc_;
 };

+struct GHistIndexBlock {
+  const unsigned* row_ptr;
+  const unsigned* index;
+
+  inline GHistIndexBlock(const unsigned* row_ptr, const unsigned* index)
+    : row_ptr(row_ptr), index(index) {}
+
+  // get i-th row
+  inline GHistIndexRow operator[](bst_uint i) const {
+    return GHistIndexRow(&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]);
+  }
+};
+
+class ColumnMatrix;
+
+class GHistIndexBlockMatrix {
+ public:
+  void Init(const GHistIndexMatrix& gmat,
+            const ColumnMatrix& colmat,
+            const FastHistParam& param);
+
+  inline GHistIndexBlock operator[](bst_uint i) const {
+    return GHistIndexBlock(blocks[i].row_ptr_begin, blocks[i].index_begin);
+  }
+
+  inline unsigned GetNumBlock() const {
+    return blocks.size();
+  }
+
+ private:
+  std::vector<unsigned> row_ptr;
+  std::vector<unsigned> index;
+  const HistCutMatrix* cut;
+  struct Block {
+    const unsigned* row_ptr_begin;
+    const unsigned* row_ptr_end;
+    const unsigned* index_begin;
+    const unsigned* index_end;
+  };
+  std::vector<Block> blocks;
+};
+
 /*!
 * \brief histogram of graident statistics for a single node.
 *  Consists of multiple GHistEntry's, each entry showing total graident statistics 
@ -206,6 +255,12 @@ class GHistBuilder {
                 const GHistIndexMatrix& gmat,
                 const std::vector<bst_uint>& feat_set,
                 GHistRow hist);
+  // same, with feature grouping
+  void BuildBlockHist(const std::vector<bst_gpair>& gpair,
+                      const RowSetCollection::Elem row_indices,
+                      const GHistIndexBlockMatrix& gmatb,
+                      const std::vector<bst_uint>& feat_set,
+                      GHistRow hist);
  // construct a histogram via subtraction trick
  void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent);

@ -215,7 +270,6 @@ class GHistBuilder {
  /*! \brief number of all bins over all features */
  size_t nbins_;
  std::vector<GHistEntry> data_;
-  std::vector<bst_gpair> stat_buf_;
 };


--- a/src/tree/fast_hist_param.h
+++ b/src/tree/fast_hist_param.h
@ -0,0 +1,64 @@
+/*!
+ * Copyright 2017 by Contributors
+ * \file fast_hist_param.h
+ * \brief parameters for histogram-based training
+ * \author Philip Cho, Tianqi Chen
+ */
+#ifndef XGBOOST_TREE_FAST_HIST_PARAM_H_
+#define XGBOOST_TREE_FAST_HIST_PARAM_H_
+
+namespace xgboost {
+namespace tree {
+
+/*! \brief training parameters for histogram-based training */
+struct FastHistParam : public dmlc::Parameter<FastHistParam> {
+  // integral data type to be used with columnar data storage
+  enum class DataType { uint8 = 1, uint16 = 2, uint32 = 4 };
+  int colmat_dtype;
+  // percentage threshold for treating a feature as sparse
+  // e.g. 0.2 indicates a feature with fewer than 20% nonzeros is considered sparse
+  double sparse_threshold;
+  // use feature grouping? (default yes)
+  int enable_feature_grouping;
+  // when grouping features, how many "conflicts" to allow.
+  // conflict is when an instance has nonzero values for two or more features
+  // default is 0, meaning features should be strictly complementary
+  double max_conflict_rate;
+  // when grouping features, how much effort to expend to prevent singleton groups
+  // we'll try to insert each feature into existing groups before creating a new group
+  // for that feature; to save time, only up to (max_search_group) of existing groups
+  // will be considered. If set to zero, ALL existing groups will be examined
+  unsigned max_search_group;
+
+  // declare the parameters
+  DMLC_DECLARE_PARAMETER(FastHistParam) {
+    DMLC_DECLARE_FIELD(colmat_dtype)
+        .set_default(static_cast<int>(DataType::uint32))
+        .add_enum("uint8", static_cast<int>(DataType::uint8))
+        .add_enum("uint16", static_cast<int>(DataType::uint16))
+        .add_enum("uint32", static_cast<int>(DataType::uint32))
+        .describe("Integral data type to be used with columnar data storage."
+                  "May carry marginal performance implications. Reserved for "
+                  "advanced use");
+    DMLC_DECLARE_FIELD(sparse_threshold).set_range(0, 1.0).set_default(0.2)
+        .describe("percentage threshold for treating a feature as sparse");
+    DMLC_DECLARE_FIELD(enable_feature_grouping).set_lower_bound(0).set_default(1)
+        .describe("if >0, enable feature grouping to ameliorate work imbalance "
+                  "among worker threads");
+    DMLC_DECLARE_FIELD(max_conflict_rate).set_range(0, 1.0).set_default(0)
+        .describe("when grouping features, how many \"conflicts\" to allow."
+       "conflict is when an instance has nonzero values for two or more features."
+       "default is 0, meaning features should be strictly complementary.");
+    DMLC_DECLARE_FIELD(max_search_group).set_lower_bound(0).set_default(100)
+        .describe("when grouping features, how much effort to expend to prevent "
+                  "singleton groups. We'll try to insert each feature into existing "
+                  "groups before creating a new group for that feature; to save time, "
+                  "only up to (max_search_group) of existing groups will be "
+                  "considered. If set to zero, ALL existing groups will be examined.");
+  }
+};
+
+}  // namespace tree
+}  // namespace xgboost
+
+#endif  // XGBOOST_TREE_FAST_HIST_PARAM_H_
--- a/src/tree/param.h
+++ b/src/tree/param.h
@ -30,8 +30,6 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
  int max_leaves;
  // if using histogram based algorithm, maximum number of bins per feature
  int max_bin;
-  enum class DataType { uint8 = 1, uint16 = 2, uint32 = 4 };
-  int colmat_dtype;
  // growing policy
  enum TreeGrowPolicy { kDepthWise = 0, kLossGuide = 1 };
  int grow_policy;
@ -111,14 +109,6 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
            "Tree growing policy. 0: favor splitting at nodes closest to the node, "
            "i.e. grow depth-wise. 1: favor splitting at nodes with highest loss "
            "change. (cf. LightGBM)");
-    DMLC_DECLARE_FIELD(colmat_dtype)
-        .set_default(static_cast<int>(DataType::uint32))
-        .add_enum("uint8", static_cast<int>(DataType::uint8))
-        .add_enum("uint16", static_cast<int>(DataType::uint16))
-        .add_enum("uint32", static_cast<int>(DataType::uint32))
-        .describe("Integral data type to be used with columnar data storage."
-                  "May carry marginal performance implications. Reserved for "
-                  "advanced use");
    DMLC_DECLARE_FIELD(min_child_weight)
        .set_lower_bound(0.0f)
        .set_default(1.0f)
--- a/src/tree/updater_fast_hist.cc
+++ b/src/tree/updater_fast_hist.cc
@ -13,6 +13,7 @@
 #include <iomanip>
 #include <numeric>
 #include "./param.h"
+#include "./fast_hist_param.h"
 #include "../common/random.h"
 #include "../common/bitmap.h"
 #include "../common/sync.h"
@ -25,6 +26,7 @@ namespace tree {

 using xgboost::common::HistCutMatrix;
 using xgboost::common::GHistIndexMatrix;
+using xgboost::common::GHistIndexBlockMatrix;
 using xgboost::common::GHistIndexRow;
 using xgboost::common::GHistEntry;
 using xgboost::common::HistCollection;
@ -36,6 +38,8 @@ using xgboost::common::Column;

 DMLC_REGISTRY_FILE_TAG(updater_fast_hist);

+DMLC_REGISTER_PARAMETER(FastHistParam);
+
 /*! \brief construct a tree using quantized feature values */
 template<typename TStats, typename TConstraint>
 class FastHistMaker: public TreeUpdater {
@ -47,6 +51,7 @@ class FastHistMaker: public TreeUpdater {
    }
    pruner_->Init(args);
    param.InitAllowUnknown(args);
+    fhparam.InitAllowUnknown(args);
    is_gmat_initialized_ = false;
  }

@ -59,7 +64,10 @@ class FastHistMaker: public TreeUpdater {
      hmat_.Init(dmat, param.max_bin);
      gmat_.cut = &hmat_;
      gmat_.Init(dmat);
-      column_matrix_.Init(gmat_, static_cast<xgboost::common::DataType>(param.colmat_dtype));
+      column_matrix_.Init(gmat_, fhparam);
+      if (fhparam.enable_feature_grouping > 0) {
+        gmatb_.Init(gmat_, column_matrix_, fhparam);
+      }
      is_gmat_initialized_ = true;
      if (param.debug_verbose > 0) {
        LOG(INFO) << "Generating gmat: " << dmlc::GetTime() - tstart << " sec";
@ -71,10 +79,10 @@ class FastHistMaker: public TreeUpdater {
    TConstraint::Init(&param, dmat->info().num_col);
    // build tree
    if (!builder_) {
-      builder_.reset(new Builder(param, std::move(pruner_)));
+      builder_.reset(new Builder(param, fhparam, std::move(pruner_)));
    }
    for (size_t i = 0; i < trees.size(); ++i) {
-      builder_->Update(gmat_, column_matrix_, gpair, dmat, trees[i]);
+      builder_->Update(gmat_, gmatb_, column_matrix_, gpair, dmat, trees[i]);
    }
    param.learning_rate = lr;
  }
@ -91,9 +99,13 @@ class FastHistMaker: public TreeUpdater {
 protected:
  // training parameter
  TrainParam param;
+  FastHistParam fhparam;
  // data sketch
  HistCutMatrix hmat_;
+  // quantized data matrix
  GHistIndexMatrix gmat_;
+  // (optional) data matrix with feature grouping
+  GHistIndexBlockMatrix gmatb_;
  // column accessor
  ColumnMatrix column_matrix_;
  bool is_gmat_initialized_;
@ -136,11 +148,13 @@ class FastHistMaker: public TreeUpdater {
   public:
    // constructor
    explicit Builder(const TrainParam& param,
+                     const FastHistParam& fhparam,
                     std::unique_ptr<TreeUpdater> pruner)
-      : param(param), pruner_(std::move(pruner)),
+      : param(param), fhparam(fhparam), pruner_(std::move(pruner)),
        p_last_tree_(nullptr), p_last_fmat_(nullptr) {}
    // update one tree, growing
    virtual void Update(const GHistIndexMatrix& gmat,
+                        const GHistIndexBlockMatrix& gmatb,
                        const ColumnMatrix& column_matrix,
                        const std::vector<bst_gpair>& gpair,
                        DMatrix* p_fmat,
@ -168,7 +182,7 @@ class FastHistMaker: public TreeUpdater {
      for (int nid = 0; nid < p_tree->param.num_roots; ++nid) {
        tstart = dmlc::GetTime();
        hist_.AddHistRow(nid);
-        builder_.BuildHist(gpair, row_set_collection_[nid], gmat, feat_set, hist_[nid]);
+        BuildHist(gpair, row_set_collection_[nid], gmat, gmatb, feat_set, hist_[nid]);
        time_build_hist += dmlc::GetTime() - tstart;

        tstart = dmlc::GetTime();
@ -203,13 +217,11 @@ class FastHistMaker: public TreeUpdater {
          hist_.AddHistRow(cleft);
          hist_.AddHistRow(cright);
          if (row_set_collection_[cleft].size() < row_set_collection_[cright].size()) {
-            builder_.BuildHist(gpair, row_set_collection_[cleft], gmat, feat_set,
-                               hist_[cleft]);
-            builder_.SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]);
+            BuildHist(gpair, row_set_collection_[cleft], gmat, gmatb, feat_set, hist_[cleft]);
+            SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]);
          } else {
-            builder_.BuildHist(gpair, row_set_collection_[cright], gmat, feat_set,
-                               hist_[cright]);
-            builder_.SubtractionTrick(hist_[cleft], hist_[cright], hist_[nid]);
+            BuildHist(gpair, row_set_collection_[cright], gmat, gmatb, feat_set, hist_[cright]);
+            SubtractionTrick(hist_[cleft], hist_[cright], hist_[nid]);
          }
          time_build_hist += dmlc::GetTime() - tstart;

@ -280,6 +292,23 @@ class FastHistMaker: public TreeUpdater {
      }
    }

+    inline void BuildHist(const std::vector<bst_gpair>& gpair,
+                          const RowSetCollection::Elem row_indices,
+                          const GHistIndexMatrix& gmat,
+                          const GHistIndexBlockMatrix& gmatb,
+                          const std::vector<bst_uint>& feat_set,
+                          GHistRow hist) {
+      if (fhparam.enable_feature_grouping > 0) {
+        hist_builder_.BuildBlockHist(gpair, row_indices, gmatb, feat_set, hist);
+      } else {
+        hist_builder_.BuildHist(gpair, row_indices, gmat, feat_set, hist);
+      }
+    }
+
+    inline void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
+      hist_builder_.SubtractionTrick(self, sibling, parent);
+    }
+
    inline bool UpdatePredictionCache(const DMatrix* data,
                                      std::vector<bst_float>* p_out_preds) {
      std::vector<bst_float>& out_preds = *p_out_preds;
@ -351,7 +380,7 @@ class FastHistMaker: public TreeUpdater {
        {
          this->nthread = omp_get_num_threads();
        }
-        builder_.Init(this->nthread, nbins);
+        hist_builder_.Init(this->nthread, nbins);

        CHECK_EQ(info.root_index.size(), 0U);
        std::vector<bst_uint>& row_indices = row_set_collection_.row_indices_;
@ -885,6 +914,7 @@ class FastHistMaker: public TreeUpdater {

    //  --data fields--
    const TrainParam& param;
+    const FastHistParam& fhparam;
    // number of omp thread used during training
    int nthread;
    // Per feature: shuffle index of each feature index
@ -904,7 +934,7 @@ class FastHistMaker: public TreeUpdater {
    /*! \brief local prediction cache; maps node id to leaf value */
    std::vector<float> leaf_value_cache_;

-    GHistBuilder builder_;
+    GHistBuilder hist_builder_;
    std::unique_ptr<TreeUpdater> pruner_;

    // back pointers to tree and data matrix