diff --git a/src/common/column_matrix.h b/src/common/column_matrix.h
index bebc70c8c..5d3fea87e 100644
--- a/src/common/column_matrix.h
+++ b/src/common/column_matrix.h
@@ -29,6 +29,9 @@ switch (dtype) {                \
 #include <limits>
 #include <vector>
 #include "hist_util.h"
+#include "../tree/fast_hist_param.h"
+
+using xgboost::tree::FastHistParam;
 
 namespace xgboost {
 namespace common {
@@ -68,8 +71,9 @@ class ColumnMatrix {
   }
 
   // construct column matrix from GHistIndexMatrix
-  inline void Init(const GHistIndexMatrix& gmat, DataType dtype) {
-    this->dtype = dtype;
+  inline void Init(const GHistIndexMatrix& gmat,
+                   const FastHistParam& param) {
+    this->dtype = static_cast<DataType>(param.colmat_dtype);
     /* if dtype is smaller than uint32_t, multiple bin_id's will be stored in each
        slot of internal buffer. */
     packing_factor_ = sizeof(uint32_t) / static_cast<size_t>(this->dtype);
@@ -93,7 +97,8 @@ class ColumnMatrix {
     gmat.GetFeatureCounts(&feature_counts_[0]);
     // classify features
     for (uint32_t fid = 0; fid < nfeature; ++fid) {
-      if (static_cast<double>(feature_counts_[fid]) < 0.5*nrow) {
+      if (static_cast<double>(feature_counts_[fid])
+                 < param.sparse_threshold * nrow) {
         type_[fid] = kSparseColumn;
       } else {
         type_[fid] = kDenseColumn;
diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc
index 9c8228ab9..fe27ac8c5 100644
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -5,10 +5,12 @@
  * \author Philip Cho, Tianqi Chen
  */
 #include <dmlc/omp.h>
+#include <numeric>
 #include <vector>
 #include "./sync.h"
-#include "./hist_util.h"
+#include "./random.h"
 #include "./column_matrix.h"
+#include "./hist_util.h"
 #include "./quantile.h"
 
 namespace xgboost {
@@ -154,6 +156,246 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
   }
 }
 
+template <typename T>  // count rows of `column` already set in `mark`; saturates at max_cnt + 1
+static unsigned GetConflictCount(const std::vector<bool>& mark,
+                                 const Column<T>& column,
+                                 unsigned max_cnt) {
+  unsigned ret = 0;  // conflicts seen so far
+  if (column.type == xgboost::common::kDenseColumn) {  // T max presumably = missing; confirm
+    for (size_t i = 0; i < column.len; ++i) {  // dense: position i doubles as the row id
+      if (column.index[i] != std::numeric_limits<T>::max() && mark[i]) {
+        ++ret;
+        if (ret > max_cnt) {
+          return max_cnt + 1;  // early exit once the budget is exceeded
+        }
+      }
+    }
+  } else {
+    for (size_t i = 0; i < column.len; ++i) {  // sparse: row id is read from row_ind[i]
+      if (mark[column.row_ind[i]]) {
+        ++ret;
+        if (ret > max_cnt) {
+          return max_cnt + 1;  // same saturated value as the dense path
+        }
+      }
+    }
+  }
+  return ret;
+}
+
+template <typename T>  // flag in *p_mark every row for which `column` holds an entry
+inline void
+MarkUsed(std::vector<bool>* p_mark, const Column<T>& column) {
+  std::vector<bool>& mark = *p_mark;  // alias so the loops can index directly
+  if (column.type == xgboost::common::kDenseColumn) {
+    for (size_t i = 0; i < column.len; ++i) {  // dense: position i doubles as the row id
+      if (column.index[i] != std::numeric_limits<T>::max()) {  // max() presumably == missing
+        mark[i] = true;
+      }
+    }
+  } else {
+    for (size_t i = 0; i < column.len; ++i) {  // sparse: row id is read from row_ind[i]
+      mark[column.row_ind[i]] = true;
+    }
+  }
+}
+
+template <typename T>
+inline std::vector<std::vector<unsigned>>
+FindGroups_(const std::vector<unsigned>& feature_list,
+            const std::vector<bst_uint>& feature_nnz,
+            const ColumnMatrix& colmat,
+            unsigned nrow,
+            const FastHistParam& param) {
+  /* Goal: Bundle together features that have little or no "overlap", i.e.
+           only a few data points should have nonzero values for more than
+           one member feature. Features are visited in feature_list order.
+           Note that one-hot encoded features will be grouped together. */
+
+  std::vector<std::vector<unsigned>> groups;  // groups[gid] = member feature ids
+  std::vector<std::vector<bool>> conflict_marks;  // per group: rows used by any member
+  std::vector<unsigned> group_nnz;  // per group: sum of member nnz minus counted conflicts
+  std::vector<unsigned> group_conflict_cnt;  // per group: conflicts accepted so far
+  const unsigned max_conflict_cnt
+    = static_cast<unsigned>(param.max_conflict_rate * nrow);  // conflict budget per group
+
+  for (auto fid : feature_list) {
+    const Column<T>& column = colmat.GetColumn<T>(fid);
+
+    const size_t cur_fid_nnz = feature_nnz[fid];
+    bool need_new_group = true;  // stays true unless some candidate group accepts fid
+
+    // candidates: existing groups with room left for fid; shuffled, then capped below
+    std::vector<unsigned> search_groups;
+    for (size_t gid = 0; gid < groups.size(); ++gid) {
+      if (group_nnz[gid] + cur_fid_nnz <= nrow + max_conflict_cnt) {
+        search_groups.push_back(gid);
+      }
+    }
+    std::shuffle(search_groups.begin(), search_groups.end(), common::GlobalRandom());
+    if (param.max_search_group > 0 && search_groups.size() > param.max_search_group) {
+      search_groups.resize(param.max_search_group);  // keep only the first few candidates
+    }
+
+    // examine each candidate group: can fid join within the remaining conflict budget?
+    for (auto gid : search_groups) {
+      const unsigned rest_max_cnt = max_conflict_cnt - group_conflict_cnt[gid];
+      const unsigned cnt = GetConflictCount(conflict_marks[gid], column, rest_max_cnt);
+      if (cnt <= rest_max_cnt) {
+        need_new_group = false;
+        groups[gid].push_back(fid);
+        group_conflict_cnt[gid] += cnt;
+        group_nnz[gid] += cur_fid_nnz - cnt;  // overlapping rows are not counted twice
+        MarkUsed(&conflict_marks[gid], column);
+        break;  // first fitting group wins
+      }
+    }
+
+    // no candidate accepted fid: open a new group containing only fid
+    if (need_new_group) {
+      groups.emplace_back();
+      groups.back().push_back(fid);
+      group_conflict_cnt.push_back(0);
+      conflict_marks.emplace_back(nrow, false);
+      MarkUsed(&conflict_marks.back(), column);
+      group_nnz.emplace_back(cur_fid_nnz);
+    }
+  }
+
+  return groups;
+}
+
+inline std::vector<std::vector<unsigned>>  // dispatches FindGroups_<DType> on colmat.dtype
+FindGroups(const std::vector<unsigned>& feature_list,  // feature ids, in visiting order
+           const std::vector<bst_uint>& feature_nnz,  // indexed by feature id
+           const ColumnMatrix& colmat,
+           unsigned nrow,
+           const FastHistParam& param) {
+  XGBOOST_TYPE_SWITCH(colmat.dtype, {
+    return FindGroups_<DType>(feature_list, feature_nnz, colmat, nrow, param);
+  });
+  return std::vector<std::vector<unsigned>>();  // fallback after the switch; avoids a warning
+}
+
+inline std::vector<std::vector<unsigned>>  // returns feature groups (lists of feature ids)
+FastFeatureGrouping(const GHistIndexMatrix& gmat,
+                    const ColumnMatrix& colmat,
+                    const FastHistParam& param) {
+  const size_t nrow = gmat.row_ptr.size() - 1;
+  const size_t nfeature = gmat.cut->row_ptr.size() - 1;
+
+  std::vector<unsigned> feature_list(nfeature);  // first visiting order: 0, 1, 2, ...
+  std::iota(feature_list.begin(), feature_list.end(), 0);
+
+  // second visiting order: feature ids sorted by nonzero count, descending
+  std::vector<bst_uint> feature_nnz(nfeature);
+  std::vector<unsigned> features_by_nnz(feature_list);
+  gmat.GetFeatureCounts(feature_nnz.data());  // data() is well-defined even if nfeature == 0
+  std::sort(features_by_nnz.begin(), features_by_nnz.end(),
+            [&feature_nnz](unsigned a, unsigned b) {  // ids are unsigned; no int round-trip
+    return feature_nnz[a] > feature_nnz[b];
+  });
+  // run the greedy grouping in both visiting orders; keep the result with fewer groups
+  auto groups_alt1 = FindGroups(feature_list, feature_nnz, colmat, nrow, param);
+  auto groups_alt2 = FindGroups(features_by_nnz, feature_nnz, colmat, nrow, param);
+  auto& groups = (groups_alt1.size() > groups_alt2.size()) ? groups_alt2 : groups_alt1;
+
+  // take apart small, sparse groups, as it won't help speed
+  {
+    std::vector<std::vector<unsigned>> ret;
+    for (const auto& group : groups) {
+      if (group.size() <= 1 || group.size() >= 5) {
+        ret.push_back(group);  // keep singleton groups and large (5+) groups
+      } else {
+        unsigned nnz = 0;  // total nonzero count over the group's members
+        for (auto fid : group) {
+          nnz += feature_nnz[fid];
+        }
+        double nnz_rate = static_cast<double>(nnz) / nrow;
+        // too sparse for bundling to pay off: split the group back into singletons
+        if (nnz_rate <= param.sparse_threshold) {
+          for (auto fid : group) {
+            ret.emplace_back();
+            ret.back().push_back(fid);
+          }
+        } else {
+          ret.push_back(group);
+        }
+      }
+    }
+    groups = std::move(ret);
+  }
+
+  // shuffle groups
+  std::shuffle(groups.begin(), groups.end(), common::GlobalRandom());
+
+  return std::move(groups);  // groups is a reference, so a plain return would deep-copy
+}
+
+void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
+                                 const ColumnMatrix& colmat,
+                                 const FastHistParam& param) {
+  // Regroup gmat's bin indices into one CSR (index, row_ptr) block per feature group.
+  cut = gmat.cut;
+  // reset first: a repeated Init() must not append to (and point into) stale data
+  index.clear();
+  row_ptr.clear();
+  blocks.clear();
+
+  const size_t nrow = gmat.row_ptr.size() - 1;
+  const size_t nbins = gmat.cut->row_ptr.back();
+
+  /* step 1: form feature groups */
+  auto groups = FastFeatureGrouping(gmat, colmat, param);
+  const size_t nblock = groups.size();
+
+  /* step 2: build a new CSR matrix for each feature group */
+  std::vector<unsigned> bin2block(nbins);  // lookup table [bin id] => [block id]
+  for (size_t group_id = 0; group_id < nblock; ++group_id) {
+    for (auto& fid : groups[group_id]) {
+      const unsigned bin_begin = gmat.cut->row_ptr[fid];
+      const unsigned bin_end = gmat.cut->row_ptr[fid + 1];
+      for (unsigned bin_id = bin_begin; bin_id < bin_end; ++bin_id) {
+        bin2block[bin_id] = group_id;
+      }
+    }
+  }
+  std::vector<std::vector<unsigned>> index_temp(nblock);
+  std::vector<std::vector<unsigned>> row_ptr_temp(nblock, std::vector<unsigned>(1, 0));
+  for (size_t rid = 0; rid < nrow; ++rid) {
+    const size_t ibegin = static_cast<size_t>(gmat.row_ptr[rid]);
+    const size_t iend = static_cast<size_t>(gmat.row_ptr[rid + 1]);
+    for (size_t j = ibegin; j < iend; ++j) {
+      const size_t bin_id = gmat.index[j];
+      const size_t block_id = bin2block[bin_id];
+      index_temp[block_id].push_back(bin_id);
+    }
+    for (size_t block_id = 0; block_id < nblock; ++block_id) {
+      row_ptr_temp[block_id].push_back(index_temp[block_id].size());  // close row rid
+    }
+  }
+
+  /* step 3: concatenate CSR matrices into one (index, row_ptr) pair */
+  std::vector<size_t> index_blk_ptr(1, 0);    // start offset of each block inside index
+  std::vector<size_t> row_ptr_blk_ptr(1, 0);  // start offset of each block inside row_ptr
+  for (size_t block_id = 0; block_id < nblock; ++block_id) {
+    index.insert(index.end(), index_temp[block_id].begin(), index_temp[block_id].end());
+    row_ptr.insert(row_ptr.end(), row_ptr_temp[block_id].begin(), row_ptr_temp[block_id].end());
+    index_blk_ptr.push_back(index.size());
+    row_ptr_blk_ptr.push_back(row_ptr.size());
+  }
+
+  // save shortcut for each block; taken only after all inserts so the pointers stay valid.
+  // data() + offset (unlike &v[offset]) is well-defined for the one-past-the-end position.
+  for (size_t block_id = 0; block_id < nblock; ++block_id) {
+    Block blk;
+    blk.index_begin = index.data() + index_blk_ptr[block_id];
+    blk.row_ptr_begin = row_ptr.data() + row_ptr_blk_ptr[block_id];
+    blk.index_end = index.data() + index_blk_ptr[block_id + 1];
+    blk.row_ptr_end = row_ptr.data() + row_ptr_blk_ptr[block_id + 1];
+    blocks.push_back(blk);
+  }
+}
+
 void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
                              const RowSetCollection::Elem row_indices,
                              const GHistIndexMatrix& gmat,
@@ -161,33 +403,12 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
                              GHistRow hist) {
   data_.resize(nbins_ * nthread_, GHistEntry());
   std::fill(data_.begin(), data_.end(), GHistEntry());
-  stat_buf_.resize(row_indices.size());
 
   const int K = 8;  // loop unrolling factor
   const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
   const bst_omp_uint nrows = row_indices.end - row_indices.begin;
   const bst_omp_uint rest = nrows % K;
 
-  #pragma omp parallel for num_threads(nthread) schedule(static)
-  for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
-    bst_uint rid[K];
-    bst_gpair stat[K];
-    for (int k = 0; k < K; ++k) {
-      rid[k] = row_indices.begin[i + k];
-    }
-    for (int k = 0; k < K; ++k) {
-      stat[k] = gpair[rid[k]];
-    }
-    for (int k = 0; k < K; ++k) {
-      stat_buf_[i + k] = stat[k];
-    }
-  }
-  for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
-    const bst_uint rid = row_indices.begin[i];
-    const bst_gpair stat = gpair[rid];
-    stat_buf_[i] = stat;
-  }
-
   #pragma omp parallel for num_threads(nthread) schedule(guided)
   for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
     const bst_omp_uint tid = omp_get_thread_num();
@@ -204,7 +425,7 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
       iend[k] = static_cast<size_t>(gmat.row_ptr[rid[k] + 1]);
     }
     for (int k = 0; k < K; ++k) {
-      stat[k] = stat_buf_[i + k];
+      stat[k] = gpair[rid[k]];
     }
     for (int k = 0; k < K; ++k) {
       for (size_t j = ibegin[k]; j < iend[k]; ++j) {
@@ -217,7 +438,7 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
     const bst_uint rid = row_indices.begin[i];
     const size_t ibegin = static_cast<size_t>(gmat.row_ptr[rid]);
     const size_t iend = static_cast<size_t>(gmat.row_ptr[rid + 1]);
-    const bst_gpair stat = stat_buf_[i];
+    const bst_gpair stat = gpair[rid];
     for (size_t j = ibegin; j < iend; ++j) {
       const size_t bin = gmat.index[j];
       data_[bin].Add(stat);
@@ -234,10 +455,60 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
   }
 }
 
+void GHistBuilder::BuildBlockHist(const std::vector<bst_gpair>& gpair,  // indexed by row id
+                                  const RowSetCollection::Elem row_indices,
+                                  const GHistIndexBlockMatrix& gmatb,
+                                  const std::vector<bst_uint>& feat_set,  // unused here
+                                  GHistRow hist) {  // output; entries are added to, not reset
+  const int K = 8;  // loop unrolling factor
+  const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
+  const bst_omp_uint nblock = gmatb.GetNumBlock();
+  const bst_omp_uint nrows = row_indices.end - row_indices.begin;
+  const bst_omp_uint rest = nrows % K;  // rows left over after the K-wide chunks
+
+  #pragma omp parallel for num_threads(nthread) schedule(guided)
+  for (bst_omp_uint bid = 0; bid < nblock; ++bid) {  // one block per iteration
+    auto gmat = gmatb[bid];  // view holding this block's row_ptr / index pointers
+
+    for (bst_omp_uint i = 0; i < nrows - rest; i += K) {  // full chunks of K rows
+      bst_uint rid[K];
+      size_t ibegin[K];
+      size_t iend[K];
+      bst_gpair stat[K];
+      for (int k = 0; k < K; ++k) {
+        rid[k] = row_indices.begin[i + k];
+      }
+      for (int k = 0; k < K; ++k) {
+        ibegin[k] = static_cast<size_t>(gmat.row_ptr[rid[k]]);
+        iend[k] = static_cast<size_t>(gmat.row_ptr[rid[k] + 1]);
+      }
+      for (int k = 0; k < K; ++k) {
+        stat[k] = gpair[rid[k]];
+      }
+      for (int k = 0; k < K; ++k) {
+        for (size_t j = ibegin[k]; j < iend[k]; ++j) {
+          const size_t bin = gmat.index[j];
+          hist.begin[bin].Add(stat[k]);  // no lock: assumes blocks touch disjoint bins
+        }
+      }
+    }
+    for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {  // remaining (< K) rows
+      const bst_uint rid = row_indices.begin[i];
+      const size_t ibegin = static_cast<size_t>(gmat.row_ptr[rid]);
+      const size_t iend = static_cast<size_t>(gmat.row_ptr[rid + 1]);
+      const bst_gpair stat = gpair[rid];
+      for (size_t j = ibegin; j < iend; ++j) {
+        const size_t bin = gmat.index[j];
+        hist.begin[bin].Add(stat);
+      }
+    }
+  }
+}
+
 void GHistBuilder::SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
   const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
   const bst_omp_uint nbins = static_cast<bst_omp_uint>(nbins_);
-  const int K = 8;
+  const int K = 8;  // loop unrolling factor
   const bst_omp_uint rest = nbins % K;
   #pragma omp parallel for num_threads(nthread) schedule(static)
   for (bst_omp_uint bin_id = 0; bin_id < nbins - rest; bin_id += K) {
diff --git a/src/common/hist_util.h b/src/common/hist_util.h
index 0a9c74c26..9c58cca73 100644
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -11,6 +11,9 @@
 #include <limits>
 #include <vector>
 #include "row_set.h"
+#include "../tree/fast_hist_param.h"
+
+using xgboost::tree::FastHistParam;
 
 namespace xgboost {
 namespace common {
@@ -24,6 +27,10 @@ struct GHistEntry {
 
   GHistEntry() : sum_grad(0), sum_hess(0) {}
 
+  inline void Clear() {  // reset both accumulated sums to zero
+    sum_grad = sum_hess = 0;
+  }
+
   /*! \brief add a bst_gpair to the sum */
   inline void Add(const bst_gpair& e) {
     sum_grad += e.grad;
@@ -125,6 +132,48 @@ struct GHistIndexMatrix {
   std::vector<unsigned> hit_count_tloc_;
 };
 
+struct GHistIndexBlock {  // non-owning view of one block: row_ptr array + index array
+  const unsigned* row_ptr;  // row i spans index[row_ptr[i]] up to index[row_ptr[i + 1]]
+  const unsigned* index;  // entries read as bin ids by GHistBuilder::BuildBlockHist
+
+  inline GHistIndexBlock(const unsigned* row_ptr, const unsigned* index)
+    : row_ptr(row_ptr), index(index) {}
+
+  // get i-th row, built from its start pointer into index and its entry count
+  inline GHistIndexRow operator[](bst_uint i) const {
+    return GHistIndexRow(&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]);
+  }
+};
+
+class ColumnMatrix;
+
+class GHistIndexBlockMatrix {  // quantized matrix regrouped into one block per feature group
+ public:
+  void Init(const GHistIndexMatrix& gmat,
+            const ColumnMatrix& colmat,
+            const FastHistParam& param);
+
+  inline GHistIndexBlock operator[](bst_uint i) const {  // view of block i; no bounds check
+    return GHistIndexBlock(blocks[i].row_ptr_begin, blocks[i].index_begin);
+  }
+
+  inline unsigned GetNumBlock() const {
+    return static_cast<unsigned>(blocks.size());  // explicit: size_t -> unsigned
+  }
+
+ private:
+  std::vector<unsigned> row_ptr;  // row_ptr arrays of all blocks, concatenated
+  std::vector<unsigned> index;  // index arrays of all blocks, concatenated
+  const HistCutMatrix* cut = nullptr;  // set by Init(); null until then
+  struct Block {  // pointers into row_ptr / index above; a copied matrix would still alias them
+    const unsigned* row_ptr_begin;
+    const unsigned* row_ptr_end;
+    const unsigned* index_begin;
+    const unsigned* index_end;
+  };
+  std::vector<Block> blocks;
+};
+
 /*!
  * \brief histogram of graident statistics for a single node.
  *  Consists of multiple GHistEntry's, each entry showing total graident statistics 
@@ -206,6 +255,12 @@ class GHistBuilder {
                  const GHistIndexMatrix& gmat,
                  const std::vector<bst_uint>& feat_set,
                  GHistRow hist);
+  // same, with feature grouping
+  void BuildBlockHist(const std::vector<bst_gpair>& gpair,
+                      const RowSetCollection::Elem row_indices,
+                      const GHistIndexBlockMatrix& gmatb,
+                      const std::vector<bst_uint>& feat_set,
+                      GHistRow hist);
   // construct a histogram via subtraction trick
   void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent);
 
@@ -215,7 +270,6 @@ class GHistBuilder {
   /*! \brief number of all bins over all features */
   size_t nbins_;
   std::vector<GHistEntry> data_;
-  std::vector<bst_gpair> stat_buf_;
 };
 
 
diff --git a/src/tree/fast_hist_param.h b/src/tree/fast_hist_param.h
new file mode 100644
index 000000000..7bd3be9f2
--- /dev/null
+++ b/src/tree/fast_hist_param.h
@@ -0,0 +1,66 @@
+/*!
+ * Copyright 2017 by Contributors
+ * \file fast_hist_param.h
+ * \brief parameters for histogram-based training
+ * \author Philip Cho, Tianqi Chen
+ */
+#ifndef XGBOOST_TREE_FAST_HIST_PARAM_H_
+#define XGBOOST_TREE_FAST_HIST_PARAM_H_
+
+#include <dmlc/parameter.h>
+
+namespace xgboost {
+namespace tree {
+
+/*! \brief training parameters for histogram-based training */
+struct FastHistParam : public dmlc::Parameter<FastHistParam> {
+  // integral data type to be used with columnar data storage
+  // (enum values are the byte widths; colmat_dtype stores one of them as int)
+  enum class DataType { uint8 = 1, uint16 = 2, uint32 = 4 };
+  int colmat_dtype;
+  // fraction-of-rows threshold for treating a feature as sparse
+  // e.g. 0.2 indicates a feature with fewer than 20% nonzeros is considered sparse
+  double sparse_threshold;
+  // use feature grouping? (any value > 0 enables it; default is 1, i.e. yes)
+  int enable_feature_grouping;
+  // when grouping features, how many "conflicts" to allow, as a fraction of rows.
+  // conflict is when an instance has nonzero values for two or more features
+  // default is 0, meaning features should be strictly complementary
+  double max_conflict_rate;
+  // when grouping features, how much effort to expend to prevent singleton groups
+  // we'll try to insert each feature into existing groups before creating a new group
+  // for that feature; to save time, only up to (max_search_group) of existing groups
+  // will be considered. If set to zero, ALL existing groups will be examined
+  unsigned max_search_group;
+
+  // declare the parameters
+  DMLC_DECLARE_PARAMETER(FastHistParam) {
+    DMLC_DECLARE_FIELD(colmat_dtype)
+        .set_default(static_cast<int>(DataType::uint32))
+        .add_enum("uint8", static_cast<int>(DataType::uint8))
+        .add_enum("uint16", static_cast<int>(DataType::uint16))
+        .add_enum("uint32", static_cast<int>(DataType::uint32))
+        .describe("Integral data type to be used with columnar data storage. "
+                  "May carry marginal performance implications. Reserved for "
+                  "advanced use");
+    DMLC_DECLARE_FIELD(sparse_threshold).set_range(0, 1.0).set_default(0.2)
+        .describe("percentage threshold for treating a feature as sparse");
+    DMLC_DECLARE_FIELD(enable_feature_grouping).set_lower_bound(0).set_default(1)
+        .describe("if >0, enable feature grouping to ameliorate work imbalance "
+                  "among worker threads");
+    DMLC_DECLARE_FIELD(max_conflict_rate).set_range(0, 1.0).set_default(0)
+        .describe("when grouping features, how many \"conflicts\" to allow. "
+                  "conflict is when an instance has nonzero values for two or more features. "
+                  "default is 0, meaning features should be strictly complementary.");
+    DMLC_DECLARE_FIELD(max_search_group).set_lower_bound(0).set_default(100)
+        .describe("when grouping features, how much effort to expend to prevent "
+                  "singleton groups. We'll try to insert each feature into existing "
+                  "groups before creating a new group for that feature; to save time, "
+                  "only up to (max_search_group) of existing groups will be "
+                  "considered. If set to zero, ALL existing groups will be examined.");
+  }
+};
+
+}  // namespace tree
+}  // namespace xgboost
+
+#endif  // XGBOOST_TREE_FAST_HIST_PARAM_H_
diff --git a/src/tree/param.h b/src/tree/param.h
index 12baa1c1b..8995c9ee9 100644
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -30,8 +30,6 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
   int max_leaves;
   // if using histogram based algorithm, maximum number of bins per feature
   int max_bin;
-  enum class DataType { uint8 = 1, uint16 = 2, uint32 = 4 };
-  int colmat_dtype;
   // growing policy
   enum TreeGrowPolicy { kDepthWise = 0, kLossGuide = 1 };
   int grow_policy;
@@ -111,14 +109,6 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
             "Tree growing policy. 0: favor splitting at nodes closest to the node, "
             "i.e. grow depth-wise. 1: favor splitting at nodes with highest loss "
             "change. (cf. LightGBM)");
-    DMLC_DECLARE_FIELD(colmat_dtype)
-        .set_default(static_cast<int>(DataType::uint32))
-        .add_enum("uint8", static_cast<int>(DataType::uint8))
-        .add_enum("uint16", static_cast<int>(DataType::uint16))
-        .add_enum("uint32", static_cast<int>(DataType::uint32))
-        .describe("Integral data type to be used with columnar data storage."
-                  "May carry marginal performance implications. Reserved for "
-                  "advanced use");
     DMLC_DECLARE_FIELD(min_child_weight)
         .set_lower_bound(0.0f)
         .set_default(1.0f)
diff --git a/src/tree/updater_fast_hist.cc b/src/tree/updater_fast_hist.cc
index 37ba03736..95c2142a8 100644
--- a/src/tree/updater_fast_hist.cc
+++ b/src/tree/updater_fast_hist.cc
@@ -13,6 +13,7 @@
 #include <iomanip>
 #include <numeric>
 #include "./param.h"
+#include "./fast_hist_param.h"
 #include "../common/random.h"
 #include "../common/bitmap.h"
 #include "../common/sync.h"
@@ -25,6 +26,7 @@ namespace tree {
 
 using xgboost::common::HistCutMatrix;
 using xgboost::common::GHistIndexMatrix;
+using xgboost::common::GHistIndexBlockMatrix;
 using xgboost::common::GHistIndexRow;
 using xgboost::common::GHistEntry;
 using xgboost::common::HistCollection;
@@ -36,6 +38,8 @@ using xgboost::common::Column;
 
 DMLC_REGISTRY_FILE_TAG(updater_fast_hist);
 
+DMLC_REGISTER_PARAMETER(FastHistParam);
+
 /*! \brief construct a tree using quantized feature values */
 template<typename TStats, typename TConstraint>
 class FastHistMaker: public TreeUpdater {
@@ -47,6 +51,7 @@ class FastHistMaker: public TreeUpdater {
     }
     pruner_->Init(args);
     param.InitAllowUnknown(args);
+    fhparam.InitAllowUnknown(args);
     is_gmat_initialized_ = false;
   }
 
@@ -59,7 +64,10 @@ class FastHistMaker: public TreeUpdater {
       hmat_.Init(dmat, param.max_bin);
       gmat_.cut = &hmat_;
       gmat_.Init(dmat);
-      column_matrix_.Init(gmat_, static_cast<xgboost::common::DataType>(param.colmat_dtype));
+      column_matrix_.Init(gmat_, fhparam);
+      if (fhparam.enable_feature_grouping > 0) {
+        gmatb_.Init(gmat_, column_matrix_, fhparam);
+      }
       is_gmat_initialized_ = true;
       if (param.debug_verbose > 0) {
         LOG(INFO) << "Generating gmat: " << dmlc::GetTime() - tstart << " sec";
@@ -71,10 +79,10 @@ class FastHistMaker: public TreeUpdater {
     TConstraint::Init(&param, dmat->info().num_col);
     // build tree
     if (!builder_) {
-      builder_.reset(new Builder(param, std::move(pruner_)));
+      builder_.reset(new Builder(param, fhparam, std::move(pruner_)));
     }
     for (size_t i = 0; i < trees.size(); ++i) {
-      builder_->Update(gmat_, column_matrix_, gpair, dmat, trees[i]);
+      builder_->Update(gmat_, gmatb_, column_matrix_, gpair, dmat, trees[i]);
     }
     param.learning_rate = lr;
   }
@@ -91,9 +99,13 @@ class FastHistMaker: public TreeUpdater {
  protected:
   // training parameter
   TrainParam param;
+  FastHistParam fhparam;
   // data sketch
   HistCutMatrix hmat_;
+  // quantized data matrix
   GHistIndexMatrix gmat_;
+  // (optional) data matrix with feature grouping
+  GHistIndexBlockMatrix gmatb_;
   // column accessor
   ColumnMatrix column_matrix_;
   bool is_gmat_initialized_;
@@ -136,11 +148,13 @@ class FastHistMaker: public TreeUpdater {
    public:
     // constructor
     explicit Builder(const TrainParam& param,
+                     const FastHistParam& fhparam,
                      std::unique_ptr<TreeUpdater> pruner)
-      : param(param), pruner_(std::move(pruner)),
+      : param(param), fhparam(fhparam), pruner_(std::move(pruner)),
         p_last_tree_(nullptr), p_last_fmat_(nullptr) {}
     // update one tree, growing
     virtual void Update(const GHistIndexMatrix& gmat,
+                        const GHistIndexBlockMatrix& gmatb,
                         const ColumnMatrix& column_matrix,
                         const std::vector<bst_gpair>& gpair,
                         DMatrix* p_fmat,
@@ -168,7 +182,7 @@ class FastHistMaker: public TreeUpdater {
       for (int nid = 0; nid < p_tree->param.num_roots; ++nid) {
         tstart = dmlc::GetTime();
         hist_.AddHistRow(nid);
-        builder_.BuildHist(gpair, row_set_collection_[nid], gmat, feat_set, hist_[nid]);
+        BuildHist(gpair, row_set_collection_[nid], gmat, gmatb, feat_set, hist_[nid]);
         time_build_hist += dmlc::GetTime() - tstart;
 
         tstart = dmlc::GetTime();
@@ -203,13 +217,11 @@ class FastHistMaker: public TreeUpdater {
           hist_.AddHistRow(cleft);
           hist_.AddHistRow(cright);
           if (row_set_collection_[cleft].size() < row_set_collection_[cright].size()) {
-            builder_.BuildHist(gpair, row_set_collection_[cleft], gmat, feat_set,
-                               hist_[cleft]);
-            builder_.SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]);
+            BuildHist(gpair, row_set_collection_[cleft], gmat, gmatb, feat_set, hist_[cleft]);
+            SubtractionTrick(hist_[cright], hist_[cleft], hist_[nid]);
           } else {
-            builder_.BuildHist(gpair, row_set_collection_[cright], gmat, feat_set,
-                               hist_[cright]);
-            builder_.SubtractionTrick(hist_[cleft], hist_[cright], hist_[nid]);
+            BuildHist(gpair, row_set_collection_[cright], gmat, gmatb, feat_set, hist_[cright]);
+            SubtractionTrick(hist_[cleft], hist_[cright], hist_[nid]);
           }
           time_build_hist += dmlc::GetTime() - tstart;
 
@@ -280,6 +292,23 @@ class FastHistMaker: public TreeUpdater {
       }
     }
 
+    inline void BuildHist(const std::vector<bst_gpair>& gpair,  // picks the histogram builder
+                          const RowSetCollection::Elem row_indices,
+                          const GHistIndexMatrix& gmat,  // used when grouping is off
+                          const GHistIndexBlockMatrix& gmatb,  // used when grouping is on
+                          const std::vector<bst_uint>& feat_set,
+                          GHistRow hist) {
+      if (fhparam.enable_feature_grouping <= 0) {
+        hist_builder_.BuildHist(gpair, row_indices, gmat, feat_set, hist);
+        return;
+      }
+      hist_builder_.BuildBlockHist(gpair, row_indices, gmatb, feat_set, hist);
+    }
+
+    inline void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
+      hist_builder_.SubtractionTrick(self, sibling, parent);  // thin forwarder to GHistBuilder
+    }
+
     inline bool UpdatePredictionCache(const DMatrix* data,
                                       std::vector<bst_float>* p_out_preds) {
       std::vector<bst_float>& out_preds = *p_out_preds;
@@ -351,7 +380,7 @@ class FastHistMaker: public TreeUpdater {
         {
           this->nthread = omp_get_num_threads();
         }
-        builder_.Init(this->nthread, nbins);
+        hist_builder_.Init(this->nthread, nbins);
 
         CHECK_EQ(info.root_index.size(), 0U);
         std::vector<bst_uint>& row_indices = row_set_collection_.row_indices_;
@@ -885,6 +914,7 @@ class FastHistMaker: public TreeUpdater {
 
     //  --data fields--
     const TrainParam& param;
+    const FastHistParam& fhparam;
     // number of omp thread used during training
     int nthread;
     // Per feature: shuffle index of each feature index
@@ -904,7 +934,7 @@ class FastHistMaker: public TreeUpdater {
     /*! \brief local prediction cache; maps node id to leaf value */
     std::vector<float> leaf_value_cache_;
 
-    GHistBuilder builder_;
+    GHistBuilder hist_builder_;
     std::unique_ptr<TreeUpdater> pruner_;
 
     // back pointers to tree and data matrix