[GPU-Plugin] Fix gpu_hist to allow matrices with more than 2^{32} elements. Also fix the CPU hist algorithm. (#2518)

2017-07-17 16:19:27 -07:00
parent c85bf9859e
commit ca7fc9fda3
11 changed files with 413 additions and 283 deletions
--- a/src/common/column_matrix.h
+++ b/src/common/column_matrix.h
@@ -57,7 +57,7 @@ class Column {
  ColumnType type;
  const T* index;
  uint32_t index_base;
-  const uint32_t* row_ind;
+  const size_t* row_ind;
  size_t len;
 };

@@ -66,8 +66,8 @@ class Column {
 class ColumnMatrix {
 public:
  // get number of features
-  inline uint32_t GetNumFeature() const {
-    return type_.size();
+  inline bst_uint GetNumFeature() const {
+    return static_cast<bst_uint>(type_.size());
  }

  // construct column matrix from GHistIndexMatrix
@@ -78,8 +78,8 @@ class ColumnMatrix {
       slot of internal buffer. */
    packing_factor_ = sizeof(uint32_t) / static_cast<size_t>(this->dtype);

-    const uint32_t nfeature = gmat.cut->row_ptr.size() - 1;
-    const omp_ulong nrow = static_cast<omp_ulong>(gmat.row_ptr.size() - 1);
+    const bst_uint nfeature = static_cast<bst_uint>(gmat.cut->row_ptr.size() - 1);
+    const size_t nrow = gmat.row_ptr.size() - 1;

    // identify type of each column
    feature_counts_.resize(nfeature);
@@ -90,13 +90,13 @@ class ColumnMatrix {
    XGBOOST_TYPE_SWITCH(this->dtype, {
      max_val = static_cast<uint32_t>(std::numeric_limits<DType>::max());
    });
-    for (uint32_t fid = 0; fid < nfeature; ++fid) {
+    for (bst_uint fid = 0; fid < nfeature; ++fid) {
      CHECK_LE(gmat.cut->row_ptr[fid + 1] - gmat.cut->row_ptr[fid], max_val);
    }

    gmat.GetFeatureCounts(&feature_counts_[0]);
    // classify features
-    for (uint32_t fid = 0; fid < nfeature; ++fid) {
+    for (bst_uint fid = 0; fid < nfeature; ++fid) {
      if (static_cast<double>(feature_counts_[fid])
                 < param.sparse_threshold * nrow) {
        type_[fid] = kSparseColumn;
@@ -108,13 +108,13 @@ class ColumnMatrix {
    // want to compute storage boundary for each feature
    // using variants of prefix sum scan
    boundary_.resize(nfeature);
-    bst_uint accum_index_ = 0;
-    bst_uint accum_row_ind_ = 0;
-    for (uint32_t fid = 0; fid < nfeature; ++fid) {
+    size_t accum_index_ = 0;
+    size_t accum_row_ind_ = 0;
+    for (bst_uint fid = 0; fid < nfeature; ++fid) {
      boundary_[fid].index_begin = accum_index_;
      boundary_[fid].row_ind_begin = accum_row_ind_;
      if (type_[fid] == kDenseColumn) {
-        accum_index_ += nrow;
+        accum_index_ += static_cast<size_t>(nrow);
      } else {
        accum_index_ += feature_counts_[fid];
        accum_row_ind_ += feature_counts_[fid];
@@ -129,14 +129,14 @@ class ColumnMatrix {

    // store least bin id for each feature
    index_base_.resize(nfeature);
-    for (uint32_t fid = 0; fid < nfeature; ++fid) {
+    for (bst_uint fid = 0; fid < nfeature; ++fid) {
      index_base_[fid] = gmat.cut->row_ptr[fid];
    }

-    // fill index_ for dense columns
-    for (uint32_t fid = 0; fid < nfeature; ++fid) {
+    // pre-fill index_ for dense columns
+    for (bst_uint fid = 0; fid < nfeature; ++fid) {
      if (type_[fid] == kDenseColumn) {
-        const uint32_t ibegin = boundary_[fid].index_begin;
+        const size_t ibegin = boundary_[fid].index_begin;
        XGBOOST_TYPE_SWITCH(this->dtype, {
          const size_t block_offset = ibegin / packing_factor_;
          const size_t elem_offset = ibegin % packing_factor_;
@@ -150,15 +150,15 @@ class ColumnMatrix {

    // loop over all rows and fill column entries
    // num_nonzeros[fid] = how many nonzeros have this feature accumulated so far?
-    std::vector<uint32_t> num_nonzeros;
+    std::vector<size_t> num_nonzeros;
    num_nonzeros.resize(nfeature);
    std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
-    for (uint32_t rid = 0; rid < nrow; ++rid) {
-      const size_t ibegin = static_cast<size_t>(gmat.row_ptr[rid]);
-      const size_t iend = static_cast<size_t>(gmat.row_ptr[rid + 1]);
+    for (size_t rid = 0; rid < nrow; ++rid) {
+      const size_t ibegin = gmat.row_ptr[rid];
+      const size_t iend = gmat.row_ptr[rid + 1];
      size_t fid = 0;
      for (size_t i = ibegin; i < iend; ++i) {
-        const size_t bin_id = gmat.index[i];
+        const uint32_t bin_id = gmat.index[i];
        while (bin_id >= gmat.cut->row_ptr[fid + 1]) {
          ++fid;
        }
@@ -167,14 +167,14 @@ class ColumnMatrix {
            const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
            const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
            DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
-            begin[rid] = bin_id - index_base_[fid];
+            begin[rid] = static_cast<DType>(bin_id - index_base_[fid]);
          });
        } else {
          XGBOOST_TYPE_SWITCH(this->dtype, {
            const size_t block_offset = boundary_[fid].index_begin / packing_factor_;
            const size_t elem_offset = boundary_[fid].index_begin % packing_factor_;
            DType* begin = reinterpret_cast<DType*>(&index_[block_offset]) + elem_offset;
-            begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
+            begin[num_nonzeros[fid]] = static_cast<DType>(bin_id - index_base_[fid]);
          });
          row_ind_[boundary_[fid].row_ind_begin + num_nonzeros[fid]] = rid;
          ++num_nonzeros[fid];
@@ -213,16 +213,16 @@ class ColumnMatrix {
    // indicate where each column's index and row_ind is stored.
    // index_begin and index_end are logical offsets, so they should be converted to
    // actual offsets by scaling with packing_factor_
-    unsigned index_begin;
-    unsigned index_end;
-    unsigned row_ind_begin;
-    unsigned row_ind_end;
+    size_t index_begin;
+    size_t index_end;
+    size_t row_ind_begin;
+    size_t row_ind_end;
  };

-  std::vector<bst_uint> feature_counts_;
+  std::vector<size_t> feature_counts_;
  std::vector<ColumnType> type_;
  std::vector<uint32_t> index_;  // index_: may store smaller integers; needs padding
-  std::vector<uint32_t> row_ind_;
+  std::vector<size_t> row_ind_;
  std::vector<ColumnBoundary> boundary_;

  size_t packing_factor_;  // how many integers are stored in each slot of index_
--- a/src/common/compressed_iterator.h
+++ b/src/common/compressed_iterator.h
@@ -46,11 +46,11 @@ static int SymbolBits(int num_symbols) {

 class CompressedBufferWriter {
 private:
-  int symbol_bits_;
+  size_t symbol_bits_;
  size_t offset_;

 public:
-  explicit CompressedBufferWriter(int num_symbols) : offset_(0) {
+  explicit CompressedBufferWriter(size_t num_symbols) : offset_(0) {
    symbol_bits_ = detail::SymbolBits(num_symbols);
  }

@@ -70,9 +70,9 @@ class CompressedBufferWriter {
   * \return  The calculated buffer size.
   */

-  static size_t CalculateBufferSize(int num_elements, int num_symbols) {
+  static size_t CalculateBufferSize(size_t num_elements, size_t num_symbols) {
    const int bits_per_byte = 8;
-    int compressed_size = std::ceil(
+    size_t compressed_size = std::ceil(
        static_cast<double>(detail::SymbolBits(num_symbols) * num_elements) /
        bits_per_byte);
    return compressed_size + detail::padding;
@@ -82,10 +82,10 @@ class CompressedBufferWriter {
  void WriteSymbol(compressed_byte_t *buffer, T symbol, size_t offset) {
    const int bits_per_byte = 8;

-    for (int i = 0; i < symbol_bits_; i++) {
+    for (size_t i = 0; i < symbol_bits_; i++) {
      size_t byte_idx = ((offset + 1) * symbol_bits_ - (i + 1)) / bits_per_byte;
      byte_idx += detail::padding;
-      int bit_idx =
+      size_t bit_idx =
          ((bits_per_byte + i) - ((offset + 1) * symbol_bits_)) % bits_per_byte;

      if (detail::CheckBit(symbol, i)) {
@@ -100,14 +100,14 @@ class CompressedBufferWriter {
    uint64_t tmp = 0;
    int stored_bits = 0;
    const int max_stored_bits = 64 - symbol_bits_;
-    int buffer_position = detail::padding;
-    const int num_symbols = input_end - input_begin;
-    for (int i = 0; i < num_symbols; i++) {
+    size_t buffer_position = detail::padding;
+    const size_t num_symbols = input_end - input_begin;
+    for (size_t i = 0; i < num_symbols; i++) {
      typename std::iterator_traits<iter_t>::value_type symbol = input_begin[i];
      if (stored_bits > max_stored_bits) {
        // Eject only full bytes
-        int tmp_bytes = stored_bits / 8;
-        for (int j = 0; j < tmp_bytes; j++) {
+        size_t tmp_bytes = stored_bits / 8;
+        for (size_t j = 0; j < tmp_bytes; j++) {
          buffer[buffer_position] = tmp >> (stored_bits - (j + 1) * 8);
          buffer_position++;
        }
@@ -121,8 +121,8 @@ class CompressedBufferWriter {
    }

    // Eject all bytes
-    int tmp_bytes = std::ceil(static_cast<float>(stored_bits) / 8);
-    for (int j = 0; j < tmp_bytes; j++) {
+    size_t tmp_bytes = std::ceil(static_cast<float>(stored_bits) / 8);
+    for (size_t j = 0; j < tmp_bytes; j++) {
      int shift_bits = stored_bits - (j + 1) * 8;
      if (shift_bits >= 0) {
        buffer[buffer_position] = tmp >> shift_bits;
@@ -159,7 +159,7 @@ class CompressedIterator {
                                 /// iterator can point to
 private:
  compressed_byte_t *buffer_;
-  int symbol_bits_;
+  size_t symbol_bits_;
  size_t offset_;

 public:
@@ -189,7 +189,7 @@ class CompressedIterator {
    return static_cast<T>(tmp & mask);
  }

-  XGBOOST_DEVICE reference operator[](int idx) const {
+  XGBOOST_DEVICE reference operator[](size_t idx) const {
    self_type offset = (*this);
    offset.offset_ += idx;
    return *offset;
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -16,7 +16,7 @@
 namespace xgboost {
 namespace common {

-void HistCutMatrix::Init(DMatrix* p_fmat, size_t max_num_bins) {
+void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
  typedef common::WXQuantileSketch<bst_float, bst_float> WXQSketch;
  const MetaInfo& info = p_fmat->info();

@@ -44,7 +44,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, size_t max_num_bins) {
      unsigned begin = std::min(nstep * tid, ncol);
      unsigned end = std::min(nstep * (tid + 1), ncol);
      for (size_t i = 0; i < batch.size; ++i) { // NOLINT(*)
-        bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
+        size_t ridx = batch.base_rowid + i;
        RowBatch::Inst inst = batch[i];
        for (bst_uint j = 0; j < inst.length; ++j) {
          if (inst[j].index >= begin && inst[j].index < end) {
@@ -108,7 +108,7 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
  dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();

  const int nthread = omp_get_max_threads();
-  const unsigned nbins = cut->row_ptr.back();
+  const uint32_t nbins = cut->row_ptr.back();
  hit_count.resize(nbins, 0);
  hit_count_tloc_.resize(nthread * nbins, 0);

@@ -116,7 +116,7 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
  row_ptr.push_back(0);
  while (iter->Next()) {
    const RowBatch& batch = iter->Value();
-    size_t rbegin = row_ptr.size() - 1;
+    const size_t rbegin = row_ptr.size() - 1;
    for (size_t i = 0; i < batch.size; ++i) {
      row_ptr.push_back(batch[i].length + row_ptr.back());
    }
@@ -140,7 +140,7 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
        CHECK(cbegin != cend);
        auto it = std::upper_bound(cbegin, cend, inst[j].fvalue);
        if (it == cend) it = cend - 1;
-        unsigned idx = static_cast<unsigned>(it - cut->cut.begin());
+        uint32_t idx = static_cast<uint32_t>(it - cut->cut.begin());
        index[ibegin + j] = idx;
        ++hit_count_tloc_[tid * nbins + idx];
      }
@@ -148,7 +148,7 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
    }

    #pragma omp parallel for num_threads(nthread) schedule(static)
-    for (omp_ulong idx = 0; idx < nbins; ++idx) {
+    for (bst_omp_uint idx = 0; idx < nbins; ++idx) {
      for (int tid = 0; tid < nthread; ++tid) {
        hit_count[idx] += hit_count_tloc_[tid * nbins + idx];
      }
@@ -157,10 +157,10 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
 }

 template <typename T>
-static unsigned GetConflictCount(const std::vector<bool>& mark,
-                                 const Column<T>& column,
-                                 unsigned max_cnt) {
-  unsigned ret = 0;
+static size_t GetConflictCount(const std::vector<bool>& mark,
+                               const Column<T>& column,
+                               size_t max_cnt) {
+  size_t ret = 0;
  if (column.type == xgboost::common::kDenseColumn) {
    for (size_t i = 0; i < column.len; ++i) {
      if (column.index[i] != std::numeric_limits<T>::max() && mark[i]) {
@@ -203,9 +203,9 @@ MarkUsed(std::vector<bool>* p_mark, const Column<T>& column) {
 template <typename T>
 inline std::vector<std::vector<unsigned>>
 FindGroups_(const std::vector<unsigned>& feature_list,
-            const std::vector<bst_uint>& feature_nnz,
+            const std::vector<size_t>& feature_nnz,
            const ColumnMatrix& colmat,
-            unsigned nrow,
+            size_t nrow,
            const FastHistParam& param) {
  /* Goal: Bundle features together that has little or no "overlap", i.e.
           only a few data points should have nonzero values for
@@ -214,10 +214,10 @@ FindGroups_(const std::vector<unsigned>& feature_list,

  std::vector<std::vector<unsigned>> groups;
  std::vector<std::vector<bool>> conflict_marks;
-  std::vector<unsigned> group_nnz;
-  std::vector<unsigned> group_conflict_cnt;
-  const unsigned max_conflict_cnt
-    = static_cast<unsigned>(param.max_conflict_rate * nrow);
+  std::vector<size_t> group_nnz;
+  std::vector<size_t> group_conflict_cnt;
+  const size_t max_conflict_cnt
+    = static_cast<size_t>(param.max_conflict_rate * nrow);

  for (auto fid : feature_list) {
    const Column<T>& column = colmat.GetColumn<T>(fid);
@@ -239,8 +239,8 @@ FindGroups_(const std::vector<unsigned>& feature_list,

    // examine each candidate group: is it okay to insert fid?
    for (auto gid : search_groups) {
-      const unsigned rest_max_cnt = max_conflict_cnt - group_conflict_cnt[gid];
-      const unsigned cnt = GetConflictCount(conflict_marks[gid], column, rest_max_cnt);
+      const size_t rest_max_cnt = max_conflict_cnt - group_conflict_cnt[gid];
+      const size_t cnt = GetConflictCount(conflict_marks[gid], column, rest_max_cnt);
      if (cnt <= rest_max_cnt) {
        need_new_group = false;
        groups[gid].push_back(fid);
@@ -267,9 +267,9 @@ FindGroups_(const std::vector<unsigned>& feature_list,

 inline std::vector<std::vector<unsigned>>
 FindGroups(const std::vector<unsigned>& feature_list,
-           const std::vector<bst_uint>& feature_nnz,
+           const std::vector<size_t>& feature_nnz,
           const ColumnMatrix& colmat,
-           unsigned nrow,
+           size_t nrow,
           const FastHistParam& param) {
  XGBOOST_TYPE_SWITCH(colmat.dtype, {
    return FindGroups_<DType>(feature_list, feature_nnz, colmat, nrow, param);
@@ -288,11 +288,11 @@ FastFeatureGrouping(const GHistIndexMatrix& gmat,
  std::iota(feature_list.begin(), feature_list.end(), 0);

  // sort features by nonzero counts, descending order
-  std::vector<bst_uint> feature_nnz(nfeature);
+  std::vector<size_t> feature_nnz(nfeature);
  std::vector<unsigned> features_by_nnz(feature_list);
  gmat.GetFeatureCounts(&feature_nnz[0]);
  std::sort(features_by_nnz.begin(), features_by_nnz.end(),
-            [&feature_nnz](int a, int b) {
+            [&feature_nnz](unsigned a, unsigned b) {
    return feature_nnz[a] > feature_nnz[b];
  });

@@ -307,7 +307,7 @@ FastFeatureGrouping(const GHistIndexMatrix& gmat,
      if (group.size() <= 1 || group.size() >= 5) {
        ret.push_back(group);  // keep singleton groups and large (5+) groups
      } else {
-        unsigned nnz = 0;
+        size_t nnz = 0;
        for (auto fid : group) {
          nnz += feature_nnz[fid];
        }
@@ -338,37 +338,37 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
  cut = gmat.cut;

  const size_t nrow = gmat.row_ptr.size() - 1;
-  const size_t nbins = gmat.cut->row_ptr.back();
+  const uint32_t nbins = gmat.cut->row_ptr.back();

  /* step 1: form feature groups */
  auto groups = FastFeatureGrouping(gmat, colmat, param);
-  const size_t nblock = groups.size();
+  const uint32_t nblock = static_cast<uint32_t>(groups.size());

  /* step 2: build a new CSR matrix for each feature group */
-  std::vector<unsigned> bin2block(nbins);  // lookup table [bin id] => [block id]
-  for (size_t group_id = 0; group_id < nblock; ++group_id) {
+  std::vector<uint32_t> bin2block(nbins);  // lookup table [bin id] => [block id]
+  for (uint32_t group_id = 0; group_id < nblock; ++group_id) {
    for (auto& fid : groups[group_id]) {
-      const unsigned bin_begin = gmat.cut->row_ptr[fid];
-      const unsigned bin_end = gmat.cut->row_ptr[fid + 1];
-      for (unsigned bin_id = bin_begin; bin_id < bin_end; ++bin_id) {
+      const uint32_t bin_begin = gmat.cut->row_ptr[fid];
+      const uint32_t bin_end = gmat.cut->row_ptr[fid + 1];
+      for (uint32_t bin_id = bin_begin; bin_id < bin_end; ++bin_id) {
        bin2block[bin_id] = group_id;
      }
    }
  }
-  std::vector<std::vector<unsigned>> index_temp(nblock);
-  std::vector<std::vector<unsigned>> row_ptr_temp(nblock);
-  for (size_t block_id = 0; block_id < nblock; ++block_id) {
+  std::vector<std::vector<uint32_t>> index_temp(nblock);
+  std::vector<std::vector<size_t>> row_ptr_temp(nblock);
+  for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
    row_ptr_temp[block_id].push_back(0);
  }
  for (size_t rid = 0; rid < nrow; ++rid) {
-    const size_t ibegin = static_cast<size_t>(gmat.row_ptr[rid]);
-    const size_t iend = static_cast<size_t>(gmat.row_ptr[rid + 1]);
+    const size_t ibegin = gmat.row_ptr[rid];
+    const size_t iend = gmat.row_ptr[rid + 1];
    for (size_t j = ibegin; j < iend; ++j) {
-      const size_t bin_id = gmat.index[j];
-      const size_t block_id = bin2block[bin_id];
+      const uint32_t bin_id = gmat.index[j];
+      const uint32_t block_id = bin2block[bin_id];
      index_temp[block_id].push_back(bin_id);
    }
-    for (size_t block_id = 0; block_id < nblock; ++block_id) {
+    for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
      row_ptr_temp[block_id].push_back(index_temp[block_id].size());
    }
  }
@@ -378,7 +378,7 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
  std::vector<size_t> row_ptr_blk_ptr;
  index_blk_ptr.push_back(0);
  row_ptr_blk_ptr.push_back(0);
-  for (size_t block_id = 0; block_id < nblock; ++block_id) {
+  for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
    index.insert(index.end(), index_temp[block_id].begin(), index_temp[block_id].end());
    row_ptr.insert(row_ptr.end(), row_ptr_temp[block_id].begin(), row_ptr_temp[block_id].end());
    index_blk_ptr.push_back(index.size());
@@ -386,7 +386,7 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
  }

  // save shortcut for each block
-  for (size_t block_id = 0; block_id < nblock; ++block_id) {
+  for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
    Block blk;
    blk.index_begin = &index[index_blk_ptr[block_id]];
    blk.row_ptr_begin = &row_ptr[row_ptr_blk_ptr[block_id]];
@@ -406,14 +406,14 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,

  const int K = 8;  // loop unrolling factor
  const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
-  const bst_omp_uint nrows = row_indices.end - row_indices.begin;
-  const bst_omp_uint rest = nrows % K;
+  const size_t nrows = row_indices.end - row_indices.begin;
+  const size_t rest = nrows % K;

  #pragma omp parallel for num_threads(nthread) schedule(guided)
  for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
    const bst_omp_uint tid = omp_get_thread_num();
    const size_t off = tid * nbins_;
-    bst_uint rid[K];
+    size_t rid[K];
    size_t ibegin[K];
    size_t iend[K];
    bst_gpair stat[K];
@@ -421,32 +421,32 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
      rid[k] = row_indices.begin[i + k];
    }
    for (int k = 0; k < K; ++k) {
-      ibegin[k] = static_cast<size_t>(gmat.row_ptr[rid[k]]);
-      iend[k] = static_cast<size_t>(gmat.row_ptr[rid[k] + 1]);
+      ibegin[k] = gmat.row_ptr[rid[k]];
+      iend[k] = gmat.row_ptr[rid[k] + 1];
    }
    for (int k = 0; k < K; ++k) {
      stat[k] = gpair[rid[k]];
    }
    for (int k = 0; k < K; ++k) {
      for (size_t j = ibegin[k]; j < iend[k]; ++j) {
-        const size_t bin = gmat.index[j];
+        const uint32_t bin = gmat.index[j];
        data_[off + bin].Add(stat[k]);
      }
    }
  }
  for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
-    const bst_uint rid = row_indices.begin[i];
-    const size_t ibegin = static_cast<size_t>(gmat.row_ptr[rid]);
-    const size_t iend = static_cast<size_t>(gmat.row_ptr[rid + 1]);
+    const size_t rid = row_indices.begin[i];
+    const size_t ibegin = gmat.row_ptr[rid];
+    const size_t iend = gmat.row_ptr[rid + 1];
    const bst_gpair stat = gpair[rid];
    for (size_t j = ibegin; j < iend; ++j) {
-      const size_t bin = gmat.index[j];
+      const uint32_t bin = gmat.index[j];
      data_[bin].Add(stat);
    }
  }

  /* reduction */
-  const bst_omp_uint nbins = static_cast<bst_omp_uint>(nbins_);
+  const uint32_t nbins = nbins_;
  #pragma omp parallel for num_threads(nthread) schedule(static)
  for (bst_omp_uint bin_id = 0; bin_id < nbins; ++bin_id) {
    for (bst_omp_uint tid = 0; tid < nthread; ++tid) {
@@ -462,16 +462,16 @@ void GHistBuilder::BuildBlockHist(const std::vector<bst_gpair>& gpair,
                                  GHistRow hist) {
  const int K = 8;  // loop unrolling factor
  const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
-  const bst_omp_uint nblock = gmatb.GetNumBlock();
-  const bst_omp_uint nrows = row_indices.end - row_indices.begin;
-  const bst_omp_uint rest = nrows % K;
+  const uint32_t nblock = gmatb.GetNumBlock();
+  const size_t nrows = row_indices.end - row_indices.begin;
+  const size_t rest = nrows % K;

  #pragma omp parallel for num_threads(nthread) schedule(guided)
  for (bst_omp_uint bid = 0; bid < nblock; ++bid) {
    auto gmat = gmatb[bid];

-    for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
-      bst_uint rid[K];
+    for (size_t i = 0; i < nrows - rest; i += K) {
+      size_t rid[K];
      size_t ibegin[K];
      size_t iend[K];
      bst_gpair stat[K];
@@ -479,26 +479,26 @@ void GHistBuilder::BuildBlockHist(const std::vector<bst_gpair>& gpair,
        rid[k] = row_indices.begin[i + k];
      }
      for (int k = 0; k < K; ++k) {
-        ibegin[k] = static_cast<size_t>(gmat.row_ptr[rid[k]]);
-        iend[k] = static_cast<size_t>(gmat.row_ptr[rid[k] + 1]);
+        ibegin[k] = gmat.row_ptr[rid[k]];
+        iend[k] = gmat.row_ptr[rid[k] + 1];
      }
      for (int k = 0; k < K; ++k) {
        stat[k] = gpair[rid[k]];
      }
      for (int k = 0; k < K; ++k) {
        for (size_t j = ibegin[k]; j < iend[k]; ++j) {
-          const size_t bin = gmat.index[j];
+          const uint32_t bin = gmat.index[j];
          hist.begin[bin].Add(stat[k]);
        }
      }
    }
    for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
-      const bst_uint rid = row_indices.begin[i];
-      const size_t ibegin = static_cast<size_t>(gmat.row_ptr[rid]);
-      const size_t iend = static_cast<size_t>(gmat.row_ptr[rid + 1]);
+      const size_t rid = row_indices.begin[i];
+      const size_t ibegin = gmat.row_ptr[rid];
+      const size_t iend = gmat.row_ptr[rid + 1];
      const bst_gpair stat = gpair[rid];
      for (size_t j = ibegin; j < iend; ++j) {
-        const size_t bin = gmat.index[j];
+        const uint32_t bin = gmat.index[j];
        hist.begin[bin].Add(stat);
      }
    }
@@ -507,9 +507,9 @@ void GHistBuilder::BuildBlockHist(const std::vector<bst_gpair>& gpair,

 void GHistBuilder::SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
  const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
-  const bst_omp_uint nbins = static_cast<bst_omp_uint>(nbins_);
+  const uint32_t nbins = static_cast<bst_omp_uint>(nbins_);
  const int K = 8;  // loop unrolling factor
-  const bst_omp_uint rest = nbins % K;
+  const uint32_t rest = nbins % K;
  #pragma omp parallel for num_threads(nthread) schedule(static)
  for (bst_omp_uint bin_id = 0; bin_id < nbins - rest; bin_id += K) {
    GHistEntry pb[K];
@@ -524,7 +524,7 @@ void GHistBuilder::SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow pa
      self.begin[bin_id + k].SetSubtract(pb[k], sb[k]);
    }
  }
-  for (bst_omp_uint bin_id = nbins - rest; bin_id < nbins; ++bin_id) {
+  for (uint32_t bin_id = nbins - rest; bin_id < nbins; ++bin_id) {
    self.begin[bin_id].SetSubtract(parent.begin[bin_id], sibling.begin[bin_id]);
  }
 }
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -56,30 +56,30 @@ struct HistCutUnit {
  /*! \brief the index pointer of each histunit */
  const bst_float* cut;
  /*! \brief number of cutting point, containing the maximum point */
-  size_t size;
+  uint32_t size;
  // default constructor
  HistCutUnit() {}
  // constructor
-  HistCutUnit(const bst_float* cut, unsigned size)
+  HistCutUnit(const bst_float* cut, uint32_t size)
      : cut(cut), size(size) {}
 };

 /*! \brief cut configuration for all the features */
 struct HistCutMatrix {
-  /*! \brief actual unit pointer */
-  std::vector<unsigned> row_ptr;
+  /*! \brief unit pointer to rows by element position */
+  std::vector<uint32_t> row_ptr;
  /*! \brief minimum value of each feature */
  std::vector<bst_float> min_val;
  /*! \brief the cut field */
  std::vector<bst_float> cut;
  /*! \brief Get histogram bound for fid */
-  inline HistCutUnit operator[](unsigned fid) const {
+  inline HistCutUnit operator[](bst_uint fid) const {
    return HistCutUnit(dmlc::BeginPtr(cut) + row_ptr[fid],
                       row_ptr[fid + 1] - row_ptr[fid]);
  }
  // create histogram cut matrix given statistics from data
  // using approximate quantile sketch approach
-  void Init(DMatrix* p_fmat, size_t max_num_bins);
+  void Init(DMatrix* p_fmat, uint32_t max_num_bins);
 };


@@ -89,11 +89,11 @@ struct HistCutMatrix {
 */
 struct GHistIndexRow {
  /*! \brief The index of the histogram */
-  const unsigned* index;
+  const uint32_t* index;
  /*! \brief The size of the histogram */
-  unsigned size;
+  size_t size;
  GHistIndexRow() {}
-  GHistIndexRow(const unsigned* index, unsigned size)
+  GHistIndexRow(const uint32_t* index, size_t size)
      : index(index), size(size) {}
 };

@@ -103,21 +103,21 @@ struct GHistIndexRow {
 *  This is a global histogram index.
 */
 struct GHistIndexMatrix {
-  /*! \brief row pointer */
-  std::vector<unsigned> row_ptr;
+  /*! \brief row pointer to rows by element position */
+  std::vector<size_t> row_ptr;
  /*! \brief The index data */
-  std::vector<unsigned> index;
+  std::vector<uint32_t> index;
  /*! \brief hit count of each index */
-  std::vector<unsigned> hit_count;
+  std::vector<size_t> hit_count;
  /*! \brief The corresponding cuts */
  const HistCutMatrix* cut;
  // Create a global histogram matrix, given cut
  void Init(DMatrix* p_fmat);
  // get i-th row
-  inline GHistIndexRow operator[](bst_uint i) const {
+  inline GHistIndexRow operator[](size_t i) const {
    return GHistIndexRow(&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]);
  }
-  inline void GetFeatureCounts(bst_uint* counts) const {
+  inline void GetFeatureCounts(size_t* counts) const {
    const unsigned nfeature = cut->row_ptr.size() - 1;
    for (unsigned fid = 0; fid < nfeature; ++fid) {
      const unsigned ibegin = cut->row_ptr[fid];
@@ -129,18 +129,18 @@ struct GHistIndexMatrix {
  }

 private:
-  std::vector<unsigned> hit_count_tloc_;
+  std::vector<size_t> hit_count_tloc_;
 };

 struct GHistIndexBlock {
-  const unsigned* row_ptr;
-  const unsigned* index;
+  const size_t* row_ptr;
+  const uint32_t* index;

-  inline GHistIndexBlock(const unsigned* row_ptr, const unsigned* index)
+  inline GHistIndexBlock(const size_t* row_ptr, const uint32_t* index)
    : row_ptr(row_ptr), index(index) {}

  // get i-th row
-  inline GHistIndexRow operator[](bst_uint i) const {
+  inline GHistIndexRow operator[](size_t i) const {
    return GHistIndexRow(&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]);
  }
 };
@@ -153,23 +153,23 @@ class GHistIndexBlockMatrix {
            const ColumnMatrix& colmat,
            const FastHistParam& param);

-  inline GHistIndexBlock operator[](bst_uint i) const {
+  inline GHistIndexBlock operator[](size_t i) const {
    return GHistIndexBlock(blocks[i].row_ptr_begin, blocks[i].index_begin);
  }

-  inline unsigned GetNumBlock() const {
+  inline size_t GetNumBlock() const {
    return blocks.size();
  }

 private:
-  std::vector<unsigned> row_ptr;
-  std::vector<unsigned> index;
+  std::vector<size_t> row_ptr;
+  std::vector<uint32_t> index;
  const HistCutMatrix* cut;
  struct Block {
-    const unsigned* row_ptr_begin;
-    const unsigned* row_ptr_end;
-    const unsigned* index_begin;
-    const unsigned* index_end;
+    const size_t* row_ptr_begin;
+    const size_t* row_ptr_end;
+    const uint32_t* index_begin;
+    const uint32_t* index_end;
  };
  std::vector<Block> blocks;
 };
@@ -184,10 +184,10 @@ struct GHistRow {
  /*! \brief base pointer to first entry */
  GHistEntry* begin;
  /*! \brief number of entries */
-  unsigned size;
+  uint32_t size;

  GHistRow() {}
-  GHistRow(GHistEntry* begin, unsigned size)
+  GHistRow(GHistEntry* begin, uint32_t size)
    : begin(begin), size(size) {}
 };

@@ -198,19 +198,19 @@ class HistCollection {
 public:
  // access histogram for i-th node
  inline GHistRow operator[](bst_uint nid) const {
-    const size_t kMax = std::numeric_limits<size_t>::max();
+    const uint32_t kMax = std::numeric_limits<uint32_t>::max();
    CHECK_NE(row_ptr_[nid], kMax);
    return GHistRow(const_cast<GHistEntry*>(dmlc::BeginPtr(data_) + row_ptr_[nid]), nbins_);
  }

  // have we computed a histogram for i-th node?
  inline bool RowExists(bst_uint nid) const {
-    const size_t kMax = std::numeric_limits<size_t>::max();
+    const uint32_t kMax = std::numeric_limits<uint32_t>::max();
    return (nid < row_ptr_.size() && row_ptr_[nid] != kMax);
  }

  // initialize histogram collection
-  inline void Init(size_t nbins) {
+  inline void Init(uint32_t nbins) {
    nbins_ = nbins;
    row_ptr_.clear();
    data_.clear();
@@ -218,7 +218,7 @@ class HistCollection {

  // create an empty histogram for i-th node
  inline void AddHistRow(bst_uint nid) {
-    const size_t kMax = std::numeric_limits<size_t>::max();
+    const uint32_t kMax = std::numeric_limits<uint32_t>::max();
    if (nid >= row_ptr_.size()) {
      row_ptr_.resize(nid + 1, kMax);
    }
@@ -230,12 +230,12 @@ class HistCollection {

 private:
  /*! \brief number of all bins over all features */
-  size_t nbins_;
+  uint32_t nbins_;

  std::vector<GHistEntry> data_;

  /*! \brief row_ptr_[nid] locates bin for historgram of node nid */
-  std::vector<size_t> row_ptr_;
+  std::vector<uint32_t> row_ptr_;
 };

 /*!
@@ -244,7 +244,7 @@ class HistCollection {
 class GHistBuilder {
 public:
  // initialize builder
-  inline void Init(size_t nthread, size_t nbins) {
+  inline void Init(size_t nthread, uint32_t nbins) {
    nthread_ = nthread;
    nbins_ = nbins;
  }
@@ -268,7 +268,7 @@ class GHistBuilder {
  /*! \brief number of threads for parallel computation */
  size_t nthread_;
  /*! \brief number of all bins over all features */
-  size_t nbins_;
+  uint32_t nbins_;
  std::vector<GHistEntry> data_;
 };

--- a/src/common/row_set.h
+++ b/src/common/row_set.h
@@ -21,14 +21,14 @@ class RowSetCollection {
   *  rows (instances) associated with a particular node in a decision
   *  tree. */
  struct Elem {
-    const bst_uint* begin;
-    const bst_uint* end;
+    const size_t* begin;
+    const size_t* end;
    int node_id;
      // id of node associated with this instance set; -1 means uninitialized
    Elem(void)
        : begin(nullptr), end(nullptr), node_id(-1) {}
-    Elem(const bst_uint* begin,
-         const bst_uint* end,
+    Elem(const size_t* begin,
+         const size_t* end,
         int node_id)
        : begin(begin), end(end), node_id(node_id) {}

@@ -38,8 +38,8 @@ class RowSetCollection {
  };
  /* \brief specifies how to split a rowset into two */
  struct Split {
-    std::vector<bst_uint> left;
-    std::vector<bst_uint> right;
+    std::vector<size_t> left;
+    std::vector<size_t> right;
  };

  inline std::vector<Elem>::const_iterator begin() const {
@@ -65,8 +65,8 @@ class RowSetCollection {
  // initialize node id 0->everything
  inline void Init() {
    CHECK_EQ(elem_of_each_node_.size(), 0U);
-    const bst_uint* begin = dmlc::BeginPtr(row_indices_);
-    const bst_uint* end = dmlc::BeginPtr(row_indices_) + row_indices_.size();
+    const size_t* begin = dmlc::BeginPtr(row_indices_);
+    const size_t* end = dmlc::BeginPtr(row_indices_) + row_indices_.size();
    elem_of_each_node_.emplace_back(Elem(begin, end, 0));
  }
  // split rowset into two
@@ -77,16 +77,15 @@ class RowSetCollection {
    const Elem e = elem_of_each_node_[node_id];
    const unsigned nthread = row_split_tloc.size();
    CHECK(e.begin != nullptr);
-    bst_uint* all_begin = dmlc::BeginPtr(row_indices_);
-    bst_uint* begin = all_begin + (e.begin - all_begin);
+    size_t* all_begin = dmlc::BeginPtr(row_indices_);
+    size_t* begin = all_begin + (e.begin - all_begin);

-    bst_uint* it = begin;
-    // TODO(hcho3): parallelize this section
+    size_t* it = begin;
    for (bst_omp_uint tid = 0; tid < nthread; ++tid) {
      std::copy(row_split_tloc[tid].left.begin(), row_split_tloc[tid].left.end(), it);
      it += row_split_tloc[tid].left.size();
    }
-    bst_uint* split_pt = it;
+    size_t* split_pt = it;
    for (bst_omp_uint tid = 0; tid < nthread; ++tid) {
      std::copy(row_split_tloc[tid].right.begin(), row_split_tloc[tid].right.end(), it);
      it += row_split_tloc[tid].right.size();
@@ -105,7 +104,7 @@ class RowSetCollection {
  }

  // stores the row indices in the set
-  std::vector<bst_uint> row_indices_;
+  std::vector<size_t> row_indices_;

 private:
  // vector: node_id -> elements