Reducing memory consumption for 'hist' method on CPU (#5334)

ShvetsKS 2020-03-28 04:45:52 +03:00 committed by GitHub
parent 13b10a6370
commit 27a8e36fc3
7 changed files with 849 additions and 241 deletions

View File

@ -10,13 +10,13 @@
#include <limits>
#include <vector>
#include <memory>
#include "hist_util.h"
namespace xgboost {
namespace common {
class ColumnMatrix;
/*! \brief column type */
enum ColumnType {
kDenseColumn,
@ -24,40 +24,72 @@ enum ColumnType {
};
/*! \brief a column storage, to be used with ApplySplit. Note that each
bin id is stored as index[i] + index_base. */
bin id is stored as index[i] + index_base.
Using a different integer type for each column's index allows
reducing the memory usage. */
template <typename BinIdxType>
class Column {
public:
Column(ColumnType type, const uint32_t* index, uint32_t index_base,
const size_t* row_ind, size_t len)
Column(ColumnType type, common::Span<const BinIdxType> index, const uint32_t index_base)
: type_(type),
index_(index),
index_base_(index_base),
row_ind_(row_ind),
len_(len) {}
size_t Size() const { return len_; }
uint32_t GetGlobalBinIdx(size_t idx) const { return index_base_ + index_[idx]; }
uint32_t GetFeatureBinIdx(size_t idx) const { return index_[idx]; }
common::Span<const uint32_t> GetFeatureBinIdxPtr() const { return { index_, len_ }; }
// column.GetFeatureBinIdx(idx) + column.GetBaseIdx(idx) ==
// column.GetGlobalBinIdx(idx)
uint32_t GetBaseIdx() const { return index_base_; }
index_base_(index_base) {}
uint32_t GetGlobalBinIdx(size_t idx) const {
return index_base_ + static_cast<uint32_t>(index_[idx]);
}
BinIdxType GetFeatureBinIdx(size_t idx) const { return index_[idx]; }
const uint32_t GetBaseIdx() const { return index_base_; }
common::Span<const BinIdxType> GetFeatureBinIdxPtr() const { return index_; }
ColumnType GetType() const { return type_; }
size_t GetRowIdx(size_t idx) const {
// clang-tidy worries that row_ind_ might be a nullptr, which is possible,
// but low level structure is not safe anyway.
return type_ == ColumnType::kDenseColumn ? idx : row_ind_[idx]; // NOLINT
}
bool IsMissing(size_t idx) const {
return index_[idx] == std::numeric_limits<uint32_t>::max();
}
const size_t* GetRowData() const { return row_ind_; }
/* returns number of elements in column */
size_t Size() const { return index_.size(); }
private:
/* type of column */
ColumnType type_;
const uint32_t* index_;
uint32_t index_base_;
const size_t* row_ind_;
const size_t len_;
/* bin indexes in range [0, max_bins - 1] */
common::Span<const BinIdxType> index_;
/* bin index offset for specific feature */
const uint32_t index_base_;
};
template <typename BinIdxType>
class SparseColumn: public Column<BinIdxType> {
public:
SparseColumn(ColumnType type, common::Span<const BinIdxType> index,
uint32_t index_base, common::Span<const size_t> row_ind)
: Column<BinIdxType>(type, index, index_base),
row_ind_(row_ind) {}
const size_t* GetRowData() const { return row_ind_.data(); }
size_t GetRowIdx(size_t idx) const {
return row_ind_.data()[idx];
}
private:
/* indexes of rows */
common::Span<const size_t> row_ind_;
};
template <typename BinIdxType>
class DenseColumn: public Column<BinIdxType> {
public:
DenseColumn(ColumnType type, common::Span<const BinIdxType> index,
uint32_t index_base,
const std::vector<bool>::const_iterator missing_flags)
: Column<BinIdxType>(type, index, index_base),
missing_flags_(missing_flags) {}
bool IsMissing(size_t idx) const { return missing_flags_[idx]; }
private:
/* flags for missing values in dense columns */
std::vector<bool>::const_iterator missing_flags_;
};
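As a side note, the following minimal standalone sketch (an editor's illustration, not part of this patch; all values are made up) shows the relationship encoded by GetGlobalBinIdx above: per-feature bin indices are stored in a narrow integer type and the feature's base offset is added back on access.
// Standalone sketch: recovering the global bin id from a narrow per-feature
// bin index plus the feature's base offset (illustrative values only).
#include <cstdint>
#include <iostream>
#include <vector>
int main() {
  // Suppose this feature owns global bins [512, 767], so its base is 512.
  const uint32_t index_base = 512;
  // Per-feature bin indices fit into uint8_t because the feature has <= 256 bins.
  const std::vector<uint8_t> feature_bin_idx = {0, 17, 255};
  for (uint8_t local : feature_bin_idx) {
    // Mirrors GetGlobalBinIdx(idx) == index_base_ + index_[idx] above.
    const uint32_t global = index_base + static_cast<uint32_t>(local);
    std::cout << "local " << static_cast<int>(local) << " -> global " << global << "\n";
  }
  return 0;
}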
/*! \brief a collection of columns, with support for construction from
@ -74,23 +106,22 @@ class ColumnMatrix {
double sparse_threshold) {
const int32_t nfeature = static_cast<int32_t>(gmat.cut.Ptrs().size() - 1);
const size_t nrow = gmat.row_ptr.size() - 1;
// identify type of each column
feature_counts_.resize(nfeature);
type_.resize(nfeature);
std::fill(feature_counts_.begin(), feature_counts_.end(), 0);
uint32_t max_val = std::numeric_limits<uint32_t>::max();
for (int32_t fid = 0; fid < nfeature; ++fid) {
CHECK_LE(gmat.cut.Ptrs()[fid + 1] - gmat.cut.Ptrs()[fid], max_val);
}
bool all_dense = gmat.IsDense();
gmat.GetFeatureCounts(&feature_counts_[0]);
// classify features
for (int32_t fid = 0; fid < nfeature; ++fid) {
if (static_cast<double>(feature_counts_[fid])
< sparse_threshold * nrow) {
type_[fid] = kSparseColumn;
all_dense = false;
} else {
type_[fid] = kDenseColumn;
}
@ -98,101 +129,207 @@ class ColumnMatrix {
// want to compute storage boundary for each feature
// using variants of prefix sum scan
boundary_.resize(nfeature);
feature_offsets_.resize(nfeature + 1);
size_t accum_index_ = 0;
size_t accum_row_ind_ = 0;
for (int32_t fid = 0; fid < nfeature; ++fid) {
boundary_[fid].index_begin = accum_index_;
boundary_[fid].row_ind_begin = accum_row_ind_;
if (type_[fid] == kDenseColumn) {
feature_offsets_[0] = accum_index_;
for (int32_t fid = 1; fid < nfeature + 1; ++fid) {
if (type_[fid - 1] == kDenseColumn) {
accum_index_ += static_cast<size_t>(nrow);
accum_row_ind_ += static_cast<size_t>(nrow);
} else {
accum_index_ += feature_counts_[fid];
accum_row_ind_ += feature_counts_[fid];
accum_index_ += feature_counts_[fid - 1];
}
boundary_[fid].index_end = accum_index_;
boundary_[fid].row_ind_end = accum_row_ind_;
feature_offsets_[fid] = accum_index_;
}
index_.resize(boundary_[nfeature - 1].index_end);
row_ind_.resize(boundary_[nfeature - 1].row_ind_end);
SetTypeSize(gmat.max_num_bins_);
index_.resize(feature_offsets_[nfeature] * bins_type_size_, 0);
if (!all_dense) {
row_ind_.resize(feature_offsets_[nfeature]);
}
// store least bin id for each feature
index_base_.resize(nfeature);
for (int32_t fid = 0; fid < nfeature; ++fid) {
index_base_[fid] = gmat.cut.Ptrs()[fid];
index_base_ = const_cast<uint32_t*>(gmat.cut.Ptrs().data());
const bool noMissingValues = NoMissingValues(gmat.row_ptr[nrow], nrow, nfeature);
if (noMissingValues) {
missing_flags_.resize(feature_offsets_[nfeature], false);
} else {
missing_flags_.resize(feature_offsets_[nfeature], true);
}
// pre-fill index_ for dense columns
#pragma omp parallel for
for (int32_t fid = 0; fid < nfeature; ++fid) {
if (type_[fid] == kDenseColumn) {
const size_t ibegin = boundary_[fid].index_begin;
uint32_t* begin = &index_[ibegin];
uint32_t* end = begin + nrow;
std::fill(begin, end, std::numeric_limits<uint32_t>::max());
// max() indicates missing values
if (all_dense) {
BinTypeSize gmat_bin_size = gmat.index.getBinTypeSize();
if (gmat_bin_size == UINT8_BINS_TYPE_SIZE) {
SetIndexAllDense(gmat.index.data<uint8_t>(), gmat, nrow, nfeature, noMissingValues);
} else if (gmat_bin_size == UINT16_BINS_TYPE_SIZE) {
SetIndexAllDense(gmat.index.data<uint16_t>(), gmat, nrow, nfeature, noMissingValues);
} else {
CHECK_EQ(gmat_bin_size, UINT32_BINS_TYPE_SIZE);
SetIndexAllDense(gmat.index.data<uint32_t>(), gmat, nrow, nfeature, noMissingValues);
}
/* For a sparse DMatrix gmat.index.getBinTypeSize() always returns UINT32_BINS_TYPE_SIZE,
but for ColumnMatrix we still have a chance to reduce the memory consumption */
} else {
if (bins_type_size_ == UINT8_BINS_TYPE_SIZE) {
SetIndex<uint8_t>(gmat.index.data<uint32_t>(), gmat, nrow, nfeature);
} else if (bins_type_size_ == UINT16_BINS_TYPE_SIZE) {
SetIndex<uint16_t>(gmat.index.data<uint32_t>(), gmat, nrow, nfeature);
} else {
CHECK_EQ(bins_type_size_, UINT32_BINS_TYPE_SIZE);
SetIndex<uint32_t>(gmat.index.data<uint32_t>(), gmat, nrow, nfeature);
}
}
}
// loop over all rows and fill column entries
// num_nonzeros[fid] = how many nonzeros have this feature accumulated so far?
/* Set the number of bytes per bin index based on the maximum number of bins provided by the user */
void SetTypeSize(size_t max_num_bins) {
if ( (max_num_bins - 1) <= static_cast<int>(std::numeric_limits<uint8_t>::max()) ) {
bins_type_size_ = UINT8_BINS_TYPE_SIZE;
} else if ((max_num_bins - 1) <= static_cast<int>(std::numeric_limits<uint16_t>::max())) {
bins_type_size_ = UINT16_BINS_TYPE_SIZE;
} else {
bins_type_size_ = UINT32_BINS_TYPE_SIZE;
}
}
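A small self-contained sketch (editor's illustration; BytesPerBinIndex is a hypothetical helper, not an xgboost function) of the same rule SetTypeSize applies: pick the narrowest unsigned type whose range covers max_num_bins - 1.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <limits>
// Hypothetical helper mirroring the SetTypeSize rule above.
static size_t BytesPerBinIndex(size_t max_num_bins) {
  if (max_num_bins - 1 <= std::numeric_limits<uint8_t>::max())  return 1;
  if (max_num_bins - 1 <= std::numeric_limits<uint16_t>::max()) return 2;
  return 4;
}
int main() {
  std::printf("256 bins   -> %zu byte(s) per element\n", BytesPerBinIndex(256));    // 1
  std::printf("65536 bins -> %zu byte(s) per element\n", BytesPerBinIndex(65536));  // 2
  std::printf("65537 bins -> %zu byte(s) per element\n", BytesPerBinIndex(65537));  // 4
  return 0;
}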
/* Fetch an individual column. This code should be used with a type switch
to determine the type of bin ids */
template <typename BinIdxType>
std::unique_ptr<const Column<BinIdxType> > GetColumn(unsigned fid) const {
CHECK_EQ(sizeof(BinIdxType), bins_type_size_);
const size_t feature_offset = feature_offsets_[fid]; // to get right place for certain feature
const size_t column_size = feature_offsets_[fid + 1] - feature_offset;
common::Span<const BinIdxType> bin_index = { reinterpret_cast<const BinIdxType*>(
&index_[feature_offset * bins_type_size_]),
column_size };
std::unique_ptr<const Column<BinIdxType> > res;
if (type_[fid] == ColumnType::kDenseColumn) {
std::vector<bool>::const_iterator column_iterator = missing_flags_.begin();
advance(column_iterator, feature_offset); // increment iterator to right position
res.reset(new DenseColumn<BinIdxType>(type_[fid], bin_index, index_base_[fid],
column_iterator));
} else {
res.reset(new SparseColumn<BinIdxType>(type_[fid], bin_index, index_base_[fid],
{&row_ind_[feature_offset], column_size}));
}
return res;
}
template<typename T>
inline void SetIndexAllDense(T* index, const GHistIndexMatrix& gmat, const size_t nrow,
const size_t nfeature, const bool noMissingValues) {
T* local_index = reinterpret_cast<T*>(&index_[0]);
/* missing values are relevant only for columns of type kDenseColumn;
if no missing values were observed, the filling can be done much faster. */
if (noMissingValues) {
const int32_t nthread = omp_get_max_threads();
#pragma omp parallel for num_threads(nthread)
for (omp_ulong rid = 0; rid < nrow; ++rid) {
const size_t ibegin = rid*nfeature;
const size_t iend = (rid+1)*nfeature;
size_t j = 0;
for (size_t i = ibegin; i < iend; ++i, ++j) {
const size_t idx = feature_offsets_[j];
local_index[idx + rid] = index[i];
}
}
} else {
/* to handle rows from all batches: the sum of all batch sizes equals gmat.row_ptr.size() - 1 */
size_t rbegin = 0;
for (const auto &batch : gmat.p_fmat_->GetBatches<SparsePage>()) {
const xgboost::Entry* data_ptr = batch.data.HostVector().data();
const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
const size_t batch_size = batch.Size();
CHECK_LT(batch_size, offset_vec.size());
for (size_t rid = 0; rid < batch_size; ++rid) {
const size_t size = offset_vec[rid + 1] - offset_vec[rid];
SparsePage::Inst inst = {data_ptr + offset_vec[rid], size};
const size_t ibegin = gmat.row_ptr[rbegin + rid];
const size_t iend = gmat.row_ptr[rbegin + rid + 1];
CHECK_EQ(ibegin + inst.size(), iend);
size_t j = 0;
size_t fid = 0;
for (size_t i = ibegin; i < iend; ++i, ++j) {
fid = inst[j].index;
const size_t idx = feature_offsets_[fid];
/* rbegin allows storing indices from a specific SparsePage batch */
local_index[idx + rbegin + rid] = index[i];
missing_flags_[idx + rbegin + rid] = false;
}
}
rbegin += batch.Size();
}
}
}
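The dense fast path above writes column-major: for feature fid, row rid lands at feature_offsets_[fid] + rid. A tiny standalone sketch (editor's illustration with made-up sizes):
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>
int main() {
  const size_t nrow = 4;
  // With all-dense columns every feature block holds exactly nrow elements.
  const std::vector<size_t> feature_offsets = {0, nrow, 2 * nrow, 3 * nrow};
  std::vector<uint8_t> index(feature_offsets.back(), 0);
  // Store the per-feature bin index of row 2, feature 1.
  const size_t rid = 2, fid = 1;
  index[feature_offsets[fid] + rid] = 7;
  std::cout << "stored at flat position " << feature_offsets[fid] + rid << "\n";  // 6
  return 0;
}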
template<typename T>
inline void SetIndex(uint32_t* index, const GHistIndexMatrix& gmat,
const size_t nrow, const size_t nfeature) {
std::vector<size_t> num_nonzeros;
num_nonzeros.resize(nfeature);
std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
for (size_t rid = 0; rid < nrow; ++rid) {
const size_t ibegin = gmat.row_ptr[rid];
const size_t iend = gmat.row_ptr[rid + 1];
size_t fid = 0;
for (size_t i = ibegin; i < iend; ++i) {
const uint32_t bin_id = gmat.index[i];
auto iter = std::upper_bound(gmat.cut.Ptrs().cbegin() + fid,
gmat.cut.Ptrs().cend(), bin_id);
fid = std::distance(gmat.cut.Ptrs().cbegin(), iter) - 1;
if (type_[fid] == kDenseColumn) {
uint32_t* begin = &index_[boundary_[fid].index_begin];
begin[rid] = bin_id - index_base_[fid];
} else {
uint32_t* begin = &index_[boundary_[fid].index_begin];
begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
row_ind_[boundary_[fid].row_ind_begin + num_nonzeros[fid]] = rid;
++num_nonzeros[fid];
T* local_index = reinterpret_cast<T*>(&index_[0]);
size_t rbegin = 0;
for (const auto &batch : gmat.p_fmat_->GetBatches<SparsePage>()) {
const xgboost::Entry* data_ptr = batch.data.HostVector().data();
const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
const size_t batch_size = batch.Size();
CHECK_LT(batch_size, offset_vec.size());
for (size_t rid = 0; rid < batch_size; ++rid) {
const size_t ibegin = gmat.row_ptr[rbegin + rid];
const size_t iend = gmat.row_ptr[rbegin + rid + 1];
size_t fid = 0;
const size_t size = offset_vec[rid + 1] - offset_vec[rid];
SparsePage::Inst inst = {data_ptr + offset_vec[rid], size};
CHECK_EQ(ibegin + inst.size(), iend);
size_t j = 0;
for (size_t i = ibegin; i < iend; ++i, ++j) {
const uint32_t bin_id = index[i];
fid = inst[j].index;
if (type_[fid] == kDenseColumn) {
T* begin = &local_index[feature_offsets_[fid]];
begin[rid + rbegin] = bin_id - index_base_[fid];
missing_flags_[feature_offsets_[fid] + rid + rbegin] = false;
} else {
T* begin = &local_index[feature_offsets_[fid]];
begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
row_ind_[feature_offsets_[fid] + num_nonzeros[fid]] = rid + rbegin;
++num_nonzeros[fid];
}
}
}
rbegin += batch.Size();
}
}
/* Fetch an individual column. This code should be used with XGBOOST_TYPE_SWITCH
to determine type of bin id's */
inline Column GetColumn(unsigned fid) const {
Column c(type_[fid], &index_[boundary_[fid].index_begin], index_base_[fid],
(type_[fid] == ColumnType::kSparseColumn ?
&row_ind_[boundary_[fid].row_ind_begin] : nullptr),
boundary_[fid].index_end - boundary_[fid].index_begin);
return c;
const BinTypeSize GetTypeSize() const {
return bins_type_size_;
}
const bool NoMissingValues(const size_t n_elements,
const size_t n_row, const size_t n_features) {
return n_elements == n_features * n_row;
}
private:
struct ColumnBoundary {
// indicate where each column's index and row_ind is stored.
// index_begin and index_end are logical offsets, so they should be converted to
// actual offsets by scaling with packing_factor_
size_t index_begin;
size_t index_end;
size_t row_ind_begin;
size_t row_ind_end;
};
std::vector<uint8_t> index_;
std::vector<size_t> feature_counts_;
std::vector<ColumnType> type_;
std::vector<uint32_t> index_; // index_: may store smaller integers; needs padding
std::vector<size_t> row_ind_;
std::vector<ColumnBoundary> boundary_;
/* indicate where each column's index and row_ind is stored. */
std::vector<size_t> feature_offsets_;
// index_base_[fid]: least bin id for feature fid
std::vector<uint32_t> index_base_;
uint32_t* index_base_;
std::vector<bool> missing_flags_;
BinTypeSize bins_type_size_;
};
} // namespace common

View File

@ -29,6 +29,89 @@
namespace xgboost {
namespace common {
template<typename BinIdxType>
void GHistIndexMatrix::SetIndexDataForDense(common::Span<BinIdxType> index_data_span,
size_t batch_threads, const SparsePage& batch,
size_t rbegin, common::Span<const uint32_t> offsets_span,
size_t nbins) {
const xgboost::Entry* data_ptr = batch.data.HostVector().data();
const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
const size_t batch_size = batch.Size();
CHECK_LT(batch_size, offset_vec.size());
BinIdxType* index_data = index_data_span.data();
const uint32_t* offsets = offsets_span.data();
#pragma omp parallel for num_threads(batch_threads) schedule(static)
for (omp_ulong i = 0; i < batch_size; ++i) {
const int tid = omp_get_thread_num();
size_t ibegin = row_ptr[rbegin + i];
size_t iend = row_ptr[rbegin + i + 1];
const size_t size = offset_vec[i + 1] - offset_vec[i];
SparsePage::Inst inst = {data_ptr + offset_vec[i], size};
CHECK_EQ(ibegin + inst.size(), iend);
for (bst_uint j = 0; j < inst.size(); ++j) {
uint32_t idx = cut.SearchBin(inst[j]);
index_data[ibegin + j] = static_cast<BinIdxType>(idx - offsets[j]);
++hit_count_tloc_[tid * nbins + idx];
}
}
}
template void GHistIndexMatrix::SetIndexDataForDense(common::Span<uint8_t> index_data_span,
size_t batch_threads, const SparsePage& batch,
size_t rbegin,
common::Span<const uint32_t> offsets_span,
size_t nbins);
template void GHistIndexMatrix::SetIndexDataForDense(common::Span<uint16_t> index_data_span,
size_t batch_threads, const SparsePage& batch,
size_t rbegin,
common::Span<const uint32_t> offsets_span,
size_t nbins);
template void GHistIndexMatrix::SetIndexDataForDense(common::Span<uint32_t> index_data_span,
size_t batch_threads, const SparsePage& batch,
size_t rbegin,
common::Span<const uint32_t> offsets_span,
size_t nbins);
void GHistIndexMatrix::SetIndexDataForSparse(common::Span<uint32_t> index_data_span,
size_t batch_threads,
const SparsePage& batch, size_t rbegin,
size_t nbins) {
const xgboost::Entry* data_ptr = batch.data.HostVector().data();
const std::vector<bst_row_t>& offset_vec = batch.offset.HostVector();
const size_t batch_size = batch.Size();
CHECK_LT(batch_size, offset_vec.size());
uint32_t* index_data = index_data_span.data();
#pragma omp parallel for num_threads(batch_threads) schedule(static)
for (omp_ulong i = 0; i < batch_size; ++i) {
const int tid = omp_get_thread_num();
size_t ibegin = row_ptr[rbegin + i];
size_t iend = row_ptr[rbegin + i + 1];
const size_t size = offset_vec[i + 1] - offset_vec[i];
SparsePage::Inst inst = {data_ptr + offset_vec[i], size};
CHECK_EQ(ibegin + inst.size(), iend);
for (bst_uint j = 0; j < inst.size(); ++j) {
uint32_t idx = cut.SearchBin(inst[j]);
index_data[ibegin + j] = idx;
++hit_count_tloc_[tid * nbins + idx];
}
}
}
void GHistIndexMatrix::ResizeIndex(const size_t rbegin, const SparsePage& batch,
const size_t n_offsets, const size_t n_index,
const bool isDense) {
if ((max_num_bins_ - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) && isDense) {
index.setBinTypeSize(UINT8_BINS_TYPE_SIZE);
index.resize((sizeof(uint8_t)) * n_index);
} else if ((max_num_bins_ - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
max_num_bins_ - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) && isDense) {
index.setBinTypeSize(UINT16_BINS_TYPE_SIZE);
index.resize((sizeof(uint16_t)) * n_index);
} else {
index.setBinTypeSize(UINT32_BINS_TYPE_SIZE);
index.resize((sizeof(uint32_t)) * n_index);
}
}
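Back-of-envelope illustration of why the dense case matters (editor's numbers, hypothetical, not measured in this PR): with bin ids that fit in one byte, the index array shrinks to a quarter of its uint32_t size.
#include <cstddef>
#include <cstdio>
int main() {
  const size_t n_rows = 1000000, n_features = 100;
  const size_t n_index = n_rows * n_features;  // elements in the dense index
  std::printf("uint32_t index: %zu MiB\n", n_index * 4 / (1024 * 1024));  // ~381 MiB
  std::printf("uint8_t  index: %zu MiB\n", n_index * 1 / (1024 * 1024));  // ~95 MiB
  return 0;
}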
HistogramCuts::HistogramCuts() {
monitor_.Init(__FUNCTION__);
cut_ptrs_.HostVector().emplace_back(0);
@ -260,7 +343,7 @@ void DenseCuts::Build(DMatrix* p_fmat, uint32_t max_num_bins) {
size_t const num_groups = group_ptr.size() == 0 ? 0 : group_ptr.size() - 1;
// Use group index for weights?
bool const use_group = UseGroup(p_fmat);
const bool isDense = p_fmat->IsDense();
for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
size_t group_ind = 0;
if (use_group) {
@ -285,10 +368,18 @@ void DenseCuts::Build(DMatrix* p_fmat, uint32_t max_num_bins) {
// move to next group
group_ind++;
}
for (auto const& entry : inst) {
if (entry.index >= begin && entry.index < end) {
size_t w_idx = use_group ? group_ind : ridx;
sketchs[entry.index].Push(entry.fvalue, info.GetWeight(w_idx));
size_t w_idx = use_group ? group_ind : ridx;
auto w = info.GetWeight(w_idx);
if (isDense) {
auto data = inst.data();
for (size_t ii = begin; ii < end; ii++) {
sketchs[ii].Push(data[ii].fvalue, w);
}
} else {
for (auto const& entry : inst) {
if (entry.index >= begin && entry.index < end) {
sketchs[entry.index].Push(entry.fvalue, w);
}
}
}
}
@ -360,12 +451,13 @@ void DenseCuts::Init
void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_num_bins) {
cut.Build(p_fmat, max_num_bins);
max_num_bins_ = max_num_bins;
const int32_t nthread = omp_get_max_threads();
const uint32_t nbins = cut.Ptrs().back();
hit_count.resize(nbins, 0);
hit_count_tloc_.resize(nthread * nbins, 0);
this->p_fmat_ = p_fmat;
size_t new_size = 1;
for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
new_size += batch.Size();
@ -376,6 +468,8 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_num_bins) {
size_t rbegin = 0;
size_t prev_sum = 0;
const bool isDense = p_fmat->IsDense();
this->isDense_ = isDense;
for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
// The number of threads is pegged to the batch size. If the OMP
@ -422,25 +516,41 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_num_bins) {
}
}
index.resize(row_ptr[rbegin + batch.Size()]);
const size_t n_offsets = cut.Ptrs().size() - 1;
const size_t n_index = row_ptr[rbegin + batch.Size()];
ResizeIndex(rbegin, batch, n_offsets, n_index, isDense);
CHECK_GT(cut.Values().size(), 0U);
#pragma omp parallel for num_threads(batch_threads) schedule(static)
for (omp_ulong i = 0; i < batch.Size(); ++i) { // NOLINT(*)
const int tid = omp_get_thread_num();
size_t ibegin = row_ptr[rbegin + i];
size_t iend = row_ptr[rbegin + i + 1];
SparsePage::Inst inst = batch[i];
CHECK_EQ(ibegin + inst.size(), iend);
for (bst_uint j = 0; j < inst.size(); ++j) {
uint32_t idx = cut.SearchBin(inst[j]);
index[ibegin + j] = idx;
++hit_count_tloc_[tid * nbins + idx];
uint32_t* offsets = nullptr;
if (isDense) {
index.resizeOffset(n_offsets);
offsets = index.offset();
for (size_t i = 0; i < n_offsets; ++i) {
offsets[i] = cut.Ptrs()[i];
}
std::sort(index.begin() + ibegin, index.begin() + iend);
}
if (isDense) {
BinTypeSize curent_bin_size = index.getBinTypeSize();
common::Span<const uint32_t> offsets_span = {offsets, n_offsets};
if (curent_bin_size == UINT8_BINS_TYPE_SIZE) {
common::Span<uint8_t> index_data_span = {index.data<uint8_t>(), n_index};
SetIndexDataForDense(index_data_span, batch_threads, batch, rbegin, offsets_span, nbins);
} else if (curent_bin_size == UINT16_BINS_TYPE_SIZE) {
common::Span<uint16_t> index_data_span = {index.data<uint16_t>(), n_index};
SetIndexDataForDense(index_data_span, batch_threads, batch, rbegin, offsets_span, nbins);
} else {
CHECK_EQ(curent_bin_size, UINT32_BINS_TYPE_SIZE);
common::Span<uint32_t> index_data_span = {index.data<uint32_t>(), n_index};
SetIndexDataForDense(index_data_span, batch_threads, batch, rbegin, offsets_span, nbins);
}
/* For a sparse DMatrix we have to store the feature index for each bin
in the index field to choose the right offset, so offset is nullptr and the index is not reduced */
} else {
common::Span<uint32_t> index_data_span = {index.data<uint32_t>(), n_index};
SetIndexDataForSparse(index_data_span, batch_threads, batch, rbegin, nbins);
}
#pragma omp parallel for num_threads(nthread) schedule(static)
@ -456,13 +566,16 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_num_bins) {
}
}
template <typename BinIdxType>
static size_t GetConflictCount(const std::vector<bool>& mark,
const Column& column,
const Column<BinIdxType>& column_input,
size_t max_cnt) {
size_t ret = 0;
if (column.GetType() == xgboost::common::kDenseColumn) {
if (column_input.GetType() == xgboost::common::kDenseColumn) {
const DenseColumn<BinIdxType>& column
= static_cast<const DenseColumn<BinIdxType>& >(column_input);
for (size_t i = 0; i < column.Size(); ++i) {
if (column.GetFeatureBinIdx(i) != std::numeric_limits<uint32_t>::max() && mark[i]) {
if ((!column.IsMissing(i)) && mark[i]) {
++ret;
if (ret > max_cnt) {
return max_cnt + 1;
@ -470,6 +583,8 @@ static size_t GetConflictCount(const std::vector<bool>& mark,
}
}
} else {
const SparseColumn<BinIdxType>& column
= static_cast<const SparseColumn<BinIdxType>& >(column_input);
for (size_t i = 0; i < column.Size(); ++i) {
if (mark[column.GetRowIdx(i)]) {
++ret;
@ -482,22 +597,64 @@ static size_t GetConflictCount(const std::vector<bool>& mark,
return ret;
}
template <typename BinIdxType>
inline void
MarkUsed(std::vector<bool>* p_mark, const Column& column) {
MarkUsed(std::vector<bool>* p_mark, const Column<BinIdxType>& column_input) {
std::vector<bool>& mark = *p_mark;
if (column.GetType() == xgboost::common::kDenseColumn) {
if (column_input.GetType() == xgboost::common::kDenseColumn) {
const DenseColumn<BinIdxType>& column
= static_cast<const DenseColumn<BinIdxType>& >(column_input);
for (size_t i = 0; i < column.Size(); ++i) {
if (column.GetFeatureBinIdx(i) != std::numeric_limits<uint32_t>::max()) {
if (!column.IsMissing(i)) {
mark[i] = true;
}
}
} else {
const SparseColumn<BinIdxType>& column
= static_cast<const SparseColumn<BinIdxType>& >(column_input);
for (size_t i = 0; i < column.Size(); ++i) {
mark[column.GetRowIdx(i)] = true;
}
}
}
template <typename BinIdxType>
inline void SetGroup(const unsigned fid, const Column<BinIdxType>& column,
const size_t max_conflict_cnt, const std::vector<size_t>& search_groups,
std::vector<size_t>* p_group_conflict_cnt,
std::vector<std::vector<bool>>* p_conflict_marks,
std::vector<std::vector<unsigned>>* p_groups,
std::vector<size_t>* p_group_nnz, const size_t cur_fid_nnz, const size_t nrow) {
bool need_new_group = true;
std::vector<size_t>& group_conflict_cnt = *p_group_conflict_cnt;
std::vector<std::vector<bool>>& conflict_marks = *p_conflict_marks;
std::vector<std::vector<unsigned>>& groups = *p_groups;
std::vector<size_t>& group_nnz = *p_group_nnz;
// examine each candidate group: is it okay to insert fid?
for (auto gid : search_groups) {
const size_t rest_max_cnt = max_conflict_cnt - group_conflict_cnt[gid];
const size_t cnt = GetConflictCount(conflict_marks[gid], column, rest_max_cnt);
if (cnt <= rest_max_cnt) {
need_new_group = false;
groups[gid].push_back(fid);
group_conflict_cnt[gid] += cnt;
group_nnz[gid] += cur_fid_nnz - cnt;
MarkUsed(&conflict_marks[gid], column);
break;
}
}
// create new group if necessary
if (need_new_group) {
groups.emplace_back();
groups.back().push_back(fid);
group_conflict_cnt.push_back(0);
conflict_marks.emplace_back(nrow, false);
MarkUsed(&conflict_marks.back(), column);
group_nnz.emplace_back(cur_fid_nnz);
}
}
inline std::vector<std::vector<unsigned>>
FindGroups(const std::vector<unsigned>& feature_list,
const std::vector<size_t>& feature_nnz,
@ -517,10 +674,7 @@ FindGroups(const std::vector<unsigned>& feature_list,
= static_cast<size_t>(param.max_conflict_rate * nrow);
for (auto fid : feature_list) {
const Column& column = colmat.GetColumn(fid);
const size_t cur_fid_nnz = feature_nnz[fid];
bool need_new_group = true;
// randomly choose some of existing groups as candidates
std::vector<size_t> search_groups;
@ -534,31 +688,22 @@ FindGroups(const std::vector<unsigned>& feature_list,
search_groups.resize(param.max_search_group);
}
// examine each candidate group: is it okay to insert fid?
for (auto gid : search_groups) {
const size_t rest_max_cnt = max_conflict_cnt - group_conflict_cnt[gid];
const size_t cnt = GetConflictCount(conflict_marks[gid], column, rest_max_cnt);
if (cnt <= rest_max_cnt) {
need_new_group = false;
groups[gid].push_back(fid);
group_conflict_cnt[gid] += cnt;
group_nnz[gid] += cur_fid_nnz - cnt;
MarkUsed(&conflict_marks[gid], column);
break;
}
}
// create new group if necessary
if (need_new_group) {
groups.emplace_back();
groups.back().push_back(fid);
group_conflict_cnt.push_back(0);
conflict_marks.emplace_back(nrow, false);
MarkUsed(&conflict_marks.back(), column);
group_nnz.emplace_back(cur_fid_nnz);
BinTypeSize bins_type_size = colmat.GetTypeSize();
if (bins_type_size == UINT8_BINS_TYPE_SIZE) {
const auto column = colmat.GetColumn<uint8_t>(fid);
SetGroup(fid, *(column.get()), max_conflict_cnt, search_groups,
&group_conflict_cnt, &conflict_marks, &groups, &group_nnz, cur_fid_nnz, nrow);
} else if (bins_type_size == UINT16_BINS_TYPE_SIZE) {
const auto column = colmat.GetColumn<uint16_t>(fid);
SetGroup(fid, *(column.get()), max_conflict_cnt, search_groups,
&group_conflict_cnt, &conflict_marks, &groups, &group_nnz, cur_fid_nnz, nrow);
} else {
CHECK_EQ(bins_type_size, UINT32_BINS_TYPE_SIZE);
const auto column = colmat.GetColumn<uint32_t>(fid);
SetGroup(fid, *(column.get()), max_conflict_cnt, search_groups,
&group_conflict_cnt, &conflict_marks, &groups, &group_nnz, cur_fid_nnz, nrow);
}
}
return groups;
}
@ -640,6 +785,7 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
}
}
}
std::vector<std::vector<uint32_t>> index_temp(nblock);
std::vector<std::vector<size_t>> row_ptr_temp(nblock);
for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
@ -733,8 +879,6 @@ struct Prefetch {
public:
static constexpr size_t kCacheLineSize = 64;
static constexpr size_t kPrefetchOffset = 10;
static constexpr size_t kPrefetchStep =
kCacheLineSize / sizeof(decltype(GHistIndexMatrix::index)::value_type);
private:
static constexpr size_t kNoPrefetchSize =
@ -745,11 +889,17 @@ struct Prefetch {
static size_t NoPrefetchSize(size_t rows) {
return std::min(rows, kNoPrefetchSize);
}
template <typename T>
static constexpr size_t GetPrefetchStep() {
return Prefetch::kCacheLineSize / sizeof(T);
}
};
constexpr size_t Prefetch::kNoPrefetchSize;
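GetPrefetchStep<T>() scales the prefetch stride with the element width so that roughly one prefetch is issued per cache line whichever bin type is in use; for the 64-byte line assumed above that gives (editor's illustration):
#include <cstdint>
// One prefetch per 64-byte cache line regardless of the bin index width.
static_assert(64 / sizeof(uint8_t)  == 64, "uint8_t bins:  64 elements per line");
static_assert(64 / sizeof(uint16_t) == 32, "uint16_t bins: 32 elements per line");
static_assert(64 / sizeof(uint32_t) == 16, "uint32_t bins: 16 elements per line");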
template<typename FPType, bool do_prefetch>
template<typename FPType, bool do_prefetch, typename BinIdxType>
void BuildHistDenseKernel(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
@ -758,9 +908,9 @@ void BuildHistDenseKernel(const std::vector<GradientPair>& gpair,
const size_t size = row_indices.Size();
const size_t* rid = row_indices.begin;
const float* pgh = reinterpret_cast<const float*>(gpair.data());
const uint32_t* gradient_index = gmat.index.data();
const BinIdxType* gradient_index = gmat.index.data<BinIdxType>();
const uint32_t* offsets = gmat.index.offset();
FPType* hist_data = reinterpret_cast<FPType*>(hist.data());
const uint32_t two {2}; // Each element from 'gpair' and 'hist' contains
// 2 FP values: gradient and hessian.
// So we need to multiply each row-index/bin-index by 2
@ -775,13 +925,14 @@ void BuildHistDenseKernel(const std::vector<GradientPair>& gpair,
PREFETCH_READ_T0(pgh + two * rid[i + Prefetch::kPrefetchOffset]);
for (size_t j = icol_start_prefetch; j < icol_start_prefetch + n_features;
j += Prefetch::kPrefetchStep) {
j += Prefetch::GetPrefetchStep<BinIdxType>()) {
PREFETCH_READ_T0(gradient_index + j);
}
}
for (size_t j = icol_start; j < icol_start + n_features; ++j) {
const uint32_t idx_bin = two * gradient_index[j];
const BinIdxType* gr_index_local = gradient_index + icol_start;
for (size_t j = 0; j < n_features; ++j) {
const uint32_t idx_bin = two * (static_cast<uint32_t>(gr_index_local[j]) +
offsets[j]);
hist_data[idx_bin] += pgh[idx_gh];
hist_data[idx_bin+1] += pgh[idx_gh+1];
@ -797,10 +948,9 @@ void BuildHistSparseKernel(const std::vector<GradientPair>& gpair,
const size_t size = row_indices.Size();
const size_t* rid = row_indices.begin;
const float* pgh = reinterpret_cast<const float*>(gpair.data());
const uint32_t* gradient_index = gmat.index.data();
const uint32_t* gradient_index = gmat.index.data<uint32_t>();
const size_t* row_ptr = gmat.row_ptr.data();
FPType* hist_data = reinterpret_cast<FPType*>(hist.data());
const uint32_t two {2}; // Each element from 'gpair' and 'hist' contains
// 2 FP values: gradient and hessian.
// So we need to multiply each row-index/bin-index by 2
@ -816,11 +966,11 @@ void BuildHistSparseKernel(const std::vector<GradientPair>& gpair,
const size_t icol_end_prefect = row_ptr[rid[i+Prefetch::kPrefetchOffset]+1];
PREFETCH_READ_T0(pgh + two * rid[i + Prefetch::kPrefetchOffset]);
for (size_t j = icol_start_prftch; j < icol_end_prefect; j+=Prefetch::kPrefetchStep) {
for (size_t j = icol_start_prftch; j < icol_end_prefect;
j+=Prefetch::GetPrefetchStep<uint32_t>()) {
PREFETCH_READ_T0(gradient_index + j);
}
}
for (size_t j = icol_start; j < icol_end; ++j) {
const uint32_t idx_bin = two * gradient_index[j];
hist_data[idx_bin] += pgh[idx_gh];
@ -829,16 +979,42 @@ void BuildHistSparseKernel(const std::vector<GradientPair>& gpair,
}
}
template<typename FPType, bool do_prefetch, typename BinIdxType>
void BuildHistDispatchKernel(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat, GHistRow hist, bool isDense) {
if (isDense) {
const size_t* row_ptr = gmat.row_ptr.data();
const size_t n_features = row_ptr[row_indices.begin[0]+1] - row_ptr[row_indices.begin[0]];
BuildHistDenseKernel<FPType, do_prefetch, BinIdxType>(gpair, row_indices,
gmat, n_features, hist);
} else {
BuildHistSparseKernel<FPType, do_prefetch>(gpair, row_indices,
gmat, hist);
}
}
template<typename FPType, bool do_prefetch>
void BuildHistKernel(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat, const bool isDense, GHistRow hist) {
if (row_indices.Size() && isDense) {
const size_t* row_ptr = gmat.row_ptr.data();
const size_t n_features = row_ptr[row_indices.begin[0]+1] - row_ptr[row_indices.begin[0]];
BuildHistDenseKernel<FPType, do_prefetch>(gpair, row_indices, gmat, n_features, hist);
} else {
BuildHistSparseKernel<FPType, do_prefetch>(gpair, row_indices, gmat, hist);
const bool is_dense = row_indices.Size() && isDense;
switch (gmat.index.getBinTypeSize()) {
case UINT8_BINS_TYPE_SIZE:
BuildHistDispatchKernel<FPType, do_prefetch, uint8_t>(gpair, row_indices,
gmat, hist, is_dense);
break;
case UINT16_BINS_TYPE_SIZE:
BuildHistDispatchKernel<FPType, do_prefetch, uint16_t>(gpair, row_indices,
gmat, hist, is_dense);
break;
case UINT32_BINS_TYPE_SIZE:
BuildHistDispatchKernel<FPType, do_prefetch, uint32_t>(gpair, row_indices,
gmat, hist, is_dense);
break;
default:
CHECK(false); // no default behavior
}
}
@ -875,7 +1051,6 @@ void GHistBuilder::BuildBlockHist(const std::vector<GradientPair>& gpair,
const size_t nblock = gmatb.GetNumBlock();
const size_t nrows = row_indices.end - row_indices.begin;
const size_t rest = nrows % kUnroll;
#if defined(_OPENMP)
const auto nthread = static_cast<bst_omp_uint>(this->nthread_); // NOLINT
#endif // defined(_OPENMP)

View File

@ -209,6 +209,101 @@ HistogramCuts AdapterDeviceSketch(AdapterT* adapter, int num_bins,
float missing,
size_t sketch_batch_num_elements = 0);
enum BinTypeSize {
UINT8_BINS_TYPE_SIZE = 1,
UINT16_BINS_TYPE_SIZE = 2,
UINT32_BINS_TYPE_SIZE = 4
};
struct Index {
Index(): binTypeSize_(UINT8_BINS_TYPE_SIZE), p_(1), offset_ptr_(nullptr) {
setBinTypeSize(binTypeSize_);
}
Index(const Index& i) = delete;
Index& operator=(Index i) = delete;
Index(Index&& i) = delete;
Index& operator=(Index&& i) = delete;
uint32_t operator[](size_t i) const {
if (offset_ptr_ != nullptr) {
return func_(data_ptr_, i) + offset_ptr_[i%p_];
} else {
return func_(data_ptr_, i);
}
}
void setBinTypeSize(BinTypeSize binTypeSize) {
binTypeSize_ = binTypeSize;
switch (binTypeSize) {
case UINT8_BINS_TYPE_SIZE:
func_ = &getValueFromUint8;
break;
case UINT16_BINS_TYPE_SIZE:
func_ = &getValueFromUint16;
break;
case UINT32_BINS_TYPE_SIZE:
func_ = &getValueFromUint32;
break;
default:
CHECK(binTypeSize == UINT8_BINS_TYPE_SIZE ||
binTypeSize == UINT16_BINS_TYPE_SIZE ||
binTypeSize == UINT32_BINS_TYPE_SIZE);
}
}
BinTypeSize getBinTypeSize() const {
return binTypeSize_;
}
template<typename T>
T* data() const {
return static_cast<T*>(data_ptr_);
}
uint32_t* offset() const {
return offset_ptr_;
}
size_t offsetSize() const {
return offset_.size();
}
size_t size() const {
return data_.size() / (binTypeSize_);
}
void resize(const size_t nBytesData) {
data_.resize(nBytesData);
data_ptr_ = reinterpret_cast<void*>(data_.data());
}
void resizeOffset(const size_t nDisps) {
offset_.resize(nDisps);
offset_ptr_ = offset_.data();
p_ = nDisps;
}
std::vector<uint8_t>::const_iterator begin() const {
return data_.begin();
}
std::vector<uint8_t>::const_iterator end() const {
return data_.end();
}
private:
static uint32_t getValueFromUint8(void *t, size_t i) {
return reinterpret_cast<uint8_t*>(t)[i];
}
static uint32_t getValueFromUint16(void* t, size_t i) {
return reinterpret_cast<uint16_t*>(t)[i];
}
static uint32_t getValueFromUint32(void* t, size_t i) {
return reinterpret_cast<uint32_t*>(t)[i];
}
typedef uint32_t (*Func)(void*, size_t);
std::vector<uint8_t> data_;
std::vector<uint32_t> offset_; // size of this field is equal to number of features
void* data_ptr_;
uint32_t* offset_ptr_;
size_t p_;
BinTypeSize binTypeSize_;
Func func_;
};
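A standalone sketch (editor's illustration, not xgboost code; all values are made up) of what Index::operator[] above does for a dense matrix: data_ keeps narrow per-feature bin ids in row-major order, offset_ keeps one base per feature, and the global bin id is rebuilt on access as data[i] + offset[i % n_features].
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>
int main() {
  const size_t n_features = 3;
  // Per-feature bases, i.e. cut.Ptrs()[fid] in the real code (illustrative values).
  const std::vector<uint32_t> offset = {0, 256, 512};
  // Row-major dense storage of per-feature bin ids, one byte per element.
  const std::vector<uint8_t> data = {5, 10, 200,   // row 0
                                     7, 0,  33};   // row 1
  auto global_bin = [&](size_t i) {
    return static_cast<uint32_t>(data[i]) + offset[i % n_features];
  };
  std::cout << global_bin(2) << "\n";  // 200 + 512 = 712
  std::cout << global_bin(4) << "\n";  // 0 + 256 = 256
  return 0;
}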
/*!
* \brief preprocessed global index matrix, in CSR format
*
@ -219,19 +314,31 @@ struct GHistIndexMatrix {
/*! \brief row pointer to rows by element position */
std::vector<size_t> row_ptr;
/*! \brief The index data */
std::vector<uint32_t> index;
Index index;
/*! \brief hit count of each index */
std::vector<size_t> hit_count;
/*! \brief The corresponding cuts */
HistogramCuts cut;
DMatrix* p_fmat_;
size_t max_num_bins_;
// Create a global histogram matrix, given cut
void Init(DMatrix* p_fmat, int max_num_bins);
// get i-th row
inline GHistIndexRow operator[](size_t i) const {
return {&index[0] + row_ptr[i],
static_cast<GHistIndexRow::index_type>(
row_ptr[i + 1] - row_ptr[i])};
}
template<typename BinIdxType>
void SetIndexDataForDense(common::Span<BinIdxType> index_data_span,
size_t batch_threads, const SparsePage& batch,
size_t rbegin, common::Span<const uint32_t> offsets_span,
size_t nbins);
// specific method for sparse data, as there is no possibility to reduce the allocated memory
void SetIndexDataForSparse(common::Span<uint32_t> index_data_span,
size_t batch_threads, const SparsePage& batch,
size_t rbegin, size_t nbins);
void ResizeIndex(const size_t rbegin, const SparsePage& batch,
const size_t n_offsets, const size_t n_index,
const bool isDense);
inline void GetFeatureCounts(size_t* counts) const {
auto nfeature = cut.Ptrs().size() - 1;
for (unsigned fid = 0; fid < nfeature; ++fid) {
@ -242,9 +349,13 @@ struct GHistIndexMatrix {
}
}
}
inline bool IsDense() const {
return isDense_;
}
private:
std::vector<size_t> hit_count_tloc_;
bool isDense_;
};
struct GHistIndexBlock {

View File

@ -30,7 +30,6 @@
#include "../common/column_matrix.h"
#include "../common/threading_utils.h"
namespace xgboost {
namespace tree {
@ -58,6 +57,7 @@ void QuantileHistMaker::Update(HostDeviceVector<GradientPair> *gpair,
if (dmat != p_last_dmat_ || is_gmat_initialized_ == false) {
gmat_.Init(dmat, static_cast<uint32_t>(param_.max_bin));
column_matrix_.Init(gmat_, param_.sparse_threshold);
if (param_.enable_feature_grouping > 0) {
gmatb_.Init(gmat_, column_matrix_, param_);
}
@ -184,7 +184,6 @@ void QuantileHistMaker::Builder::BuildLocalHistograms(
builder_monitor_.Start("BuildLocalHistograms");
const size_t n_nodes = nodes_for_explicit_hist_build_.size();
// create space of size (# rows in each node)
common::BlockedSpace2d space(n_nodes, [&](size_t node) {
const int32_t nid = nodes_for_explicit_hist_build_[node].nid;
@ -292,7 +291,6 @@ void QuantileHistMaker::Builder::EvaluateAndApplySplits(
std::vector<ExpandEntry> nodes_for_apply_split;
AddSplitsToTree(gmat, p_tree, num_leaves, depth, timestamp,
&nodes_for_apply_split, temp_qexpand_depth);
ApplySplit(nodes_for_apply_split, gmat, column_matrix, hist_, p_tree);
}
@ -777,69 +775,66 @@ void QuantileHistMaker::Builder::EvaluateSplits(const std::vector<ExpandEntry>&
// on comparison of index values (idx_span) and the split point (split_cond)
// Handle dense columns
// Analog of std::stable_partition, but not in-place
template <bool default_left>
inline std::pair<size_t, size_t> PartitionDenseKernel(
common::Span<const size_t> rid_span, common::Span<const uint32_t> idx_span,
const int32_t split_cond, const uint32_t offset,
template <bool default_left, typename BinIdxType>
inline std::pair<size_t, size_t> PartitionDenseKernel(const common::DenseColumn<BinIdxType>& column,
common::Span<const size_t> rid_span, const int32_t split_cond,
common::Span<size_t> left_part, common::Span<size_t> right_part) {
const uint32_t* idx = idx_span.data();
const int32_t offset = column.GetBaseIdx();
const BinIdxType* idx = column.GetFeatureBinIdxPtr().data();
size_t* p_left_part = left_part.data();
size_t* p_right_part = right_part.data();
size_t nleft_elems = 0;
size_t nright_elems = 0;
const uint32_t missing_val = std::numeric_limits<uint32_t>::max();
for (auto rid : rid_span) {
if (idx[rid] == missing_val) {
if (column.IsMissing(rid)) {
if (default_left) {
p_left_part[nleft_elems++] = rid;
} else {
p_right_part[nright_elems++] = rid;
}
} else {
if (static_cast<int32_t>(idx[rid] + offset) <= split_cond) {
if ((static_cast<int32_t>(idx[rid]) + offset) <= split_cond) {
p_left_part[nleft_elems++] = rid;
} else {
p_right_part[nright_elems++] = rid;
}
}
}
return {nleft_elems, nright_elems};
}
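The dense partition rule above, in a standalone sketch (editor's illustration, not xgboost code): a row goes left when its global bin id is <= split_cond, and missing rows follow the default direction.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>
int main() {
  const int32_t split_cond = 3;
  const bool default_left = true;
  // -1 marks a missing value in this toy example (the real code uses missing_flags_).
  const std::vector<int32_t> global_bin = {1, 5, -1, 3, 7};
  std::vector<size_t> left, right;
  for (size_t rid = 0; rid < global_bin.size(); ++rid) {
    if (global_bin[rid] < 0) {
      (default_left ? left : right).push_back(rid);
    } else if (global_bin[rid] <= split_cond) {
      left.push_back(rid);
    } else {
      right.push_back(rid);
    }
  }
  std::cout << "left rows: " << left.size() << ", right rows: " << right.size() << "\n";  // 3, 2
  return 0;
}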
// Split row indexes (rid_span) into 2 parts (left_part, right_part) depending
// on comparison of index values (idx_span) and the split point (split_cond).
// Handle sparse columns
template<bool default_left>
template<bool default_left, typename BinIdxType>
inline std::pair<size_t, size_t> PartitionSparseKernel(
common::Span<const size_t> rid_span, const int32_t split_cond, const Column& column,
common::Span<size_t> left_part, common::Span<size_t> right_part) {
common::Span<const size_t> rid_span, const int32_t split_cond,
const common::SparseColumn<BinIdxType>& column, common::Span<size_t> left_part,
common::Span<size_t> right_part) {
size_t* p_left_part = left_part.data();
size_t* p_right_part = right_part.data();
size_t nleft_elems = 0;
size_t nright_elems = 0;
const size_t* row_data = column.GetRowData();
const size_t column_size = column.Size();
if (rid_span.size()) { // ensure that rid_span is nonempty range
// search first nonzero row with index >= rid_span.front()
const size_t* p = std::lower_bound(column.GetRowData(),
column.GetRowData() + column.Size(),
const size_t* p = std::lower_bound(row_data, row_data + column_size,
rid_span.front());
if (p != column.GetRowData() + column.Size() && *p <= rid_span.back()) {
size_t cursor = p - column.GetRowData();
if (p != row_data + column_size && *p <= rid_span.back()) {
size_t cursor = p - row_data;
for (auto rid : rid_span) {
while (cursor < column.Size()
while (cursor < column_size
&& column.GetRowIdx(cursor) < rid
&& column.GetRowIdx(cursor) <= rid_span.back()) {
++cursor;
}
if (cursor < column.Size() && column.GetRowIdx(cursor) == rid) {
const uint32_t rbin = column.GetFeatureBinIdx(cursor);
if (static_cast<int32_t>(rbin + column.GetBaseIdx()) <= split_cond) {
if (cursor < column_size && column.GetRowIdx(cursor) == rid) {
if (static_cast<int32_t>(column.GetGlobalBinIdx(cursor)) <= split_cond) {
p_left_part[nleft_elems++] = rid;
} else {
p_right_part[nright_elems++] = rid;
@ -868,10 +863,10 @@ inline std::pair<size_t, size_t> PartitionSparseKernel(
return {nleft_elems, nright_elems};
}
template <typename BinIdxType>
void QuantileHistMaker::Builder::PartitionKernel(
const size_t node_in_set, const size_t nid, common::Range1d range,
const int32_t split_cond, const ColumnMatrix& column_matrix,
const GHistIndexMatrix& gmat, const RegTree& tree) {
const int32_t split_cond, const ColumnMatrix& column_matrix, const RegTree& tree) {
const size_t* rid = row_set_collection_[nid].begin;
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
common::Span<size_t> left = partition_builder_.GetLeftBuffer(node_in_set,
@ -880,21 +875,21 @@ void QuantileHistMaker::Builder::PartitionKernel(
range.begin(), range.end());
const bst_uint fid = tree[nid].SplitIndex();
const bool default_left = tree[nid].DefaultLeft();
const auto column = column_matrix.GetColumn(fid);
const uint32_t offset = column.GetBaseIdx();
common::Span<const uint32_t> idx_spin = column.GetFeatureBinIdxPtr();
const auto column_ptr = column_matrix.GetColumn<BinIdxType>(fid);
std::pair<size_t, size_t> child_nodes_sizes;
if (column.GetType() == xgboost::common::kDenseColumn) {
if (column_ptr->GetType() == xgboost::common::kDenseColumn) {
const common::DenseColumn<BinIdxType>& column =
static_cast<const common::DenseColumn<BinIdxType>& >(*(column_ptr.get()));
if (default_left) {
child_nodes_sizes = PartitionDenseKernel<true>(
rid_span, idx_spin, split_cond, offset, left, right);
child_nodes_sizes = PartitionDenseKernel<true>(column, rid_span, split_cond, left, right);
} else {
child_nodes_sizes = PartitionDenseKernel<false>(
rid_span, idx_spin, split_cond, offset, left, right);
child_nodes_sizes = PartitionDenseKernel<false>(column, rid_span, split_cond, left, right);
}
} else {
const common::SparseColumn<BinIdxType>& column
= static_cast<const common::SparseColumn<BinIdxType>& >(*(column_ptr.get()));
if (default_left) {
child_nodes_sizes = PartitionSparseKernel<true>(rid_span, split_cond, column, left, right);
} else {
@ -982,9 +977,23 @@ void QuantileHistMaker::Builder::ApplySplit(const std::vector<ExpandEntry> nodes
// Store results in intermediate buffers from partition_builder_
common::ParallelFor2d(space, this->nthread_, [&](size_t node_in_set, common::Range1d r) {
const int32_t nid = nodes[node_in_set].nid;
PartitionKernel(node_in_set, nid, r,
split_conditions[node_in_set], column_matrix, gmat, *p_tree);
});
switch (column_matrix.GetTypeSize()) {
case common::UINT8_BINS_TYPE_SIZE:
PartitionKernel<uint8_t>(node_in_set, nid, r,
split_conditions[node_in_set], column_matrix, *p_tree);
break;
case common::UINT16_BINS_TYPE_SIZE:
PartitionKernel<uint16_t>(node_in_set, nid, r,
split_conditions[node_in_set], column_matrix, *p_tree);
break;
case common::UINT32_BINS_TYPE_SIZE:
PartitionKernel<uint32_t>(node_in_set, nid, r,
split_conditions[node_in_set], column_matrix, *p_tree);
break;
default:
CHECK(false); // no default behavior
}
});
// 3. Compute offsets to copy blocks of row-indexes
// from partition_builder_ to row_set_collection_

View File

@ -212,10 +212,10 @@ class QuantileHistMaker: public TreeUpdater {
const HistCollection& hist,
RegTree* p_tree);
template <typename BinIdxType>
void PartitionKernel(const size_t node_in_set, const size_t nid, common::Range1d range,
const int32_t split_cond,
const ColumnMatrix& column_matrix, const GHistIndexMatrix& gmat,
const RegTree& tree);
const ColumnMatrix& column_matrix, const RegTree& tree);
void AddSplitsToRowSet(const std::vector<ExpandEntry>& nodes, RegTree* p_tree);

View File

@ -9,28 +9,46 @@ namespace xgboost {
namespace common {
TEST(DenseColumn, Test) {
auto dmat = RandomDataGenerator(100, 10, 0.0).GenerateDMatix();
GHistIndexMatrix gmat;
gmat.Init(dmat.get(), 256);
ColumnMatrix column_matrix;
column_matrix.Init(gmat, 0.2);
uint64_t max_num_bins[] = {static_cast<uint64_t>(std::numeric_limits<uint8_t>::max()) + 1,
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 1,
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 2};
for (size_t max_num_bin : max_num_bins) {
auto dmat = RandomDataGenerator(100, 10, 0.0).GenerateDMatix();
GHistIndexMatrix gmat;
gmat.Init(dmat.get(), max_num_bin);
ColumnMatrix column_matrix;
column_matrix.Init(gmat, 0.2);
for (auto i = 0ull; i < dmat->Info().num_row_; i++) {
for (auto j = 0ull; j < dmat->Info().num_col_; j++) {
auto col = column_matrix.GetColumn(j);
ASSERT_EQ(gmat.index[i * dmat->Info().num_col_ + j],
col.GetGlobalBinIdx(i));
for (auto i = 0ull; i < dmat->Info().num_row_; i++) {
for (auto j = 0ull; j < dmat->Info().num_col_; j++) {
switch (column_matrix.GetTypeSize()) {
case UINT8_BINS_TYPE_SIZE: {
auto col = column_matrix.GetColumn<uint8_t>(j);
ASSERT_EQ(gmat.index[i * dmat->Info().num_col_ + j],
(*col.get()).GetGlobalBinIdx(i));
}
break;
case UINT16_BINS_TYPE_SIZE: {
auto col = column_matrix.GetColumn<uint16_t>(j);
ASSERT_EQ(gmat.index[i * dmat->Info().num_col_ + j],
(*col.get()).GetGlobalBinIdx(i));
}
break;
case UINT32_BINS_TYPE_SIZE: {
auto col = column_matrix.GetColumn<uint32_t>(j);
ASSERT_EQ(gmat.index[i * dmat->Info().num_col_ + j],
(*col.get()).GetGlobalBinIdx(i));
}
break;
}
}
}
}
}
TEST(SparseColumn, Test) {
auto dmat = RandomDataGenerator(100, 1, 0.85).GenerateDMatix();
GHistIndexMatrix gmat;
gmat.Init(dmat.get(), 256);
ColumnMatrix column_matrix;
column_matrix.Init(gmat, 0.5);
auto col = column_matrix.GetColumn(0);
template<typename BinIdxType>
inline void CheckSparseColumn(const Column<BinIdxType>& col_input, const GHistIndexMatrix& gmat) {
const SparseColumn<BinIdxType>& col = static_cast<const SparseColumn<BinIdxType>& >(col_input);
ASSERT_EQ(col.Size(), gmat.index.size());
for (auto i = 0ull; i < col.Size(); i++) {
ASSERT_EQ(gmat.index[gmat.row_ptr[col.GetRowIdx(i)]],
@ -38,20 +56,77 @@ TEST(SparseColumn, Test) {
}
}
TEST(DenseColumnWithMissing, Test) {
auto dmat = RandomDataGenerator(100, 1, 0.5).GenerateDMatix();
GHistIndexMatrix gmat;
gmat.Init(dmat.get(), 256);
ColumnMatrix column_matrix;
column_matrix.Init(gmat, 0.2);
auto col = column_matrix.GetColumn(0);
TEST(SparseColumn, Test) {
uint64_t max_num_bins[] = {static_cast<uint64_t>(std::numeric_limits<uint8_t>::max()) + 1,
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 1,
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 2};
for (size_t max_num_bin : max_num_bins) {
auto dmat = RandomDataGenerator(100, 1, 0.85).GenerateDMatix();
GHistIndexMatrix gmat;
gmat.Init(dmat.get(), max_num_bin);
ColumnMatrix column_matrix;
column_matrix.Init(gmat, 0.5);
switch (column_matrix.GetTypeSize()) {
case UINT8_BINS_TYPE_SIZE: {
auto col = column_matrix.GetColumn<uint8_t>(0);
CheckSparseColumn(*col.get(), gmat);
}
break;
case UINT16_BINS_TYPE_SIZE: {
auto col = column_matrix.GetColumn<uint16_t>(0);
CheckSparseColumn(*col.get(), gmat);
}
break;
case UINT32_BINS_TYPE_SIZE: {
auto col = column_matrix.GetColumn<uint32_t>(0);
CheckSparseColumn(*col.get(), gmat);
}
break;
}
}
}
template<typename BinIdxType>
inline void CheckColumWithMissingValue(const Column<BinIdxType>& col_input,
const GHistIndexMatrix& gmat) {
const DenseColumn<BinIdxType>& col = static_cast<const DenseColumn<BinIdxType>& >(col_input);
for (auto i = 0ull; i < col.Size(); i++) {
if (col.IsMissing(i)) continue;
EXPECT_EQ(gmat.index[gmat.row_ptr[col.GetRowIdx(i)]],
EXPECT_EQ(gmat.index[gmat.row_ptr[i]],
col.GetGlobalBinIdx(i));
}
}
TEST(DenseColumnWithMissing, Test) {
uint64_t max_num_bins[] = { static_cast<uint64_t>(std::numeric_limits<uint8_t>::max()) + 1,
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 1,
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 2 };
for (size_t max_num_bin : max_num_bins) {
auto dmat = RandomDataGenerator(100, 1, 0.5).GenerateDMatix();
GHistIndexMatrix gmat;
gmat.Init(dmat.get(), max_num_bin);
ColumnMatrix column_matrix;
column_matrix.Init(gmat, 0.2);
switch (column_matrix.GetTypeSize()) {
case UINT8_BINS_TYPE_SIZE: {
auto col = column_matrix.GetColumn<uint8_t>(0);
CheckColumWithMissingValue(*col.get(), gmat);
}
break;
case UINT16_BINS_TYPE_SIZE: {
auto col = column_matrix.GetColumn<uint16_t>(0);
CheckColumWithMissingValue(*col.get(), gmat);
}
break;
case UINT32_BINS_TYPE_SIZE: {
auto col = column_matrix.GetColumn<uint32_t>(0);
CheckColumWithMissingValue(*col.get(), gmat);
}
break;
}
}
}
void TestGHistIndexMatrixCreation(size_t nthreads) {
dmlc::TemporaryDirectory tmpdir;
std::string filename = tmpdir.path + "/big.libsvm";

View File

@ -347,5 +347,106 @@ TEST(hist_util, SparseCutsExternalMemory) {
}
}
}
TEST(hist_util, IndexBinBound) {
uint64_t bin_sizes[] = { static_cast<uint64_t>(std::numeric_limits<uint8_t>::max()) + 1,
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 1,
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 2 };
BinTypeSize expected_bin_type_sizes[] = {UINT8_BINS_TYPE_SIZE,
UINT16_BINS_TYPE_SIZE,
UINT32_BINS_TYPE_SIZE};
size_t constexpr kRows = 100;
size_t constexpr kCols = 10;
size_t bin_id = 0;
for (auto max_bin : bin_sizes) {
auto p_fmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatix();
common::GHistIndexMatrix hmat;
hmat.Init(p_fmat.get(), max_bin);
EXPECT_EQ(hmat.index.size(), kRows*kCols);
EXPECT_EQ(expected_bin_type_sizes[bin_id++], hmat.index.getBinTypeSize());
}
}
TEST(hist_util, SparseIndexBinBound) {
uint64_t bin_sizes[] = { static_cast<uint64_t>(std::numeric_limits<uint8_t>::max()) + 1,
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 1,
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 2 };
BinTypeSize expected_bin_type_sizes[] = { UINT32_BINS_TYPE_SIZE,
UINT32_BINS_TYPE_SIZE,
UINT32_BINS_TYPE_SIZE };
size_t constexpr kRows = 100;
size_t constexpr kCols = 10;
size_t bin_id = 0;
for (auto max_bin : bin_sizes) {
auto p_fmat = RandomDataGenerator(kRows, kCols, 0.2).GenerateDMatix();
common::GHistIndexMatrix hmat;
hmat.Init(p_fmat.get(), max_bin);
EXPECT_EQ(expected_bin_type_sizes[bin_id++], hmat.index.getBinTypeSize());
}
}
template <typename T>
void CheckIndexData(T* data_ptr, uint32_t* offsets,
const common::GHistIndexMatrix& hmat, size_t n_cols) {
for (size_t i = 0; i < hmat.index.size(); ++i) {
EXPECT_EQ(data_ptr[i] + offsets[i % n_cols], hmat.index[i]);
}
}
TEST(hist_util, IndexBinData) {
uint64_t constexpr kBinSizes[] = { static_cast<uint64_t>(std::numeric_limits<uint8_t>::max()) + 1,
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 1,
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 2 };
size_t constexpr kRows = 100;
size_t constexpr kCols = 10;
size_t bin_id = 0;
for (auto max_bin : kBinSizes) {
auto p_fmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatix();
common::GHistIndexMatrix hmat;
hmat.Init(p_fmat.get(), max_bin);
uint32_t* offsets = hmat.index.offset();
EXPECT_EQ(hmat.index.size(), kRows*kCols);
switch (max_bin) {
case kBinSizes[0]:
CheckIndexData(hmat.index.data<uint8_t>(),
offsets, hmat, kCols);
break;
case kBinSizes[1]:
CheckIndexData(hmat.index.data<uint16_t>(),
offsets, hmat, kCols);
break;
case kBinSizes[2]:
CheckIndexData(hmat.index.data<uint32_t>(),
offsets, hmat, kCols);
break;
}
}
}
TEST(hist_util, SparseIndexBinData) {
uint64_t bin_sizes[] = { static_cast<uint64_t>(std::numeric_limits<uint8_t>::max()) + 1,
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 1,
static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 2 };
size_t constexpr kRows = 100;
size_t constexpr kCols = 10;
size_t bin_id = 0;
for (auto max_bin : bin_sizes) {
auto p_fmat = RandomDataGenerator(kRows, kCols, 0.2).GenerateDMatix();
common::GHistIndexMatrix hmat;
hmat.Init(p_fmat.get(), max_bin);
EXPECT_EQ(hmat.index.offset(), nullptr);
uint32_t* data_ptr = hmat.index.data<uint32_t>();
for (size_t i = 0; i < hmat.index.size(); ++i) {
EXPECT_EQ(data_ptr[i], hmat.index[i]);
}
}
}
} // namespace common
} // namespace xgboost