Simplify sparse and dense CPU hist kernels (#7029)
* Simplify sparse and dense kernels * Extract row partitioner. Co-authored-by: Kirill Shvets <kirill.shvets@intel.com>
This commit is contained in:
@@ -30,6 +30,8 @@ enum ColumnType {
|
||||
template <typename BinIdxType>
|
||||
class Column {
|
||||
public:
|
||||
static constexpr int32_t kMissingId = -1;
|
||||
|
||||
Column(ColumnType type, common::Span<const BinIdxType> index, const uint32_t index_base)
|
||||
: type_(type),
|
||||
index_(index),
|
||||
@@ -71,6 +73,30 @@ class SparseColumn: public Column<BinIdxType> {
|
||||
|
||||
// Raw pointer to this column's stored (non-missing) row indices, sorted ascending.
const size_t* GetRowData() const { return row_ind_.data(); }
|
||||
|
||||
int32_t GetBinIdx(size_t rid, size_t* state) const {
|
||||
const size_t column_size = this->Size();
|
||||
if (!((*state) < column_size)) {
|
||||
return this->kMissingId;
|
||||
}
|
||||
while ((*state) < column_size && GetRowIdx(*state) < rid) {
|
||||
++(*state);
|
||||
}
|
||||
if (((*state) < column_size) && GetRowIdx(*state) == rid) {
|
||||
return this->GetGlobalBinIdx(*state);
|
||||
} else {
|
||||
return this->kMissingId;
|
||||
}
|
||||
}
|
||||
|
||||
size_t GetInitialState(const size_t first_row_id) const {
|
||||
const size_t* row_data = GetRowData();
|
||||
const size_t column_size = this->Size();
|
||||
// search first nonzero row with index >= rid_span.front()
|
||||
const size_t* p = std::lower_bound(row_data, row_data + column_size, first_row_id);
|
||||
// column_size if all messing
|
||||
return p - row_data;
|
||||
}
|
||||
|
||||
size_t GetRowIdx(size_t idx) const {
|
||||
return row_ind_.data()[idx];
|
||||
}
|
||||
@@ -80,7 +106,7 @@ class SparseColumn: public Column<BinIdxType> {
|
||||
common::Span<const size_t> row_ind_;
|
||||
};
|
||||
|
||||
template <typename BinIdxType>
|
||||
template <typename BinIdxType, bool any_missing>
|
||||
class DenseColumn: public Column<BinIdxType> {
|
||||
public:
|
||||
DenseColumn(ColumnType type, common::Span<const BinIdxType> index,
|
||||
@@ -90,6 +116,19 @@ class DenseColumn: public Column<BinIdxType> {
|
||||
missing_flags_(missing_flags),
|
||||
feature_offset_(feature_offset) {}
|
||||
// True iff the value of this feature is missing for sample `idx`
// (missing_flags_ is indexed per-feature via feature_offset_).
bool IsMissing(size_t idx) const { return missing_flags_[feature_offset_ + idx]; }
|
||||
|
||||
int32_t GetBinIdx(size_t idx, size_t* state) const {
|
||||
if (any_missing) {
|
||||
return IsMissing(idx) ? this->kMissingId : this->GetGlobalBinIdx(idx);
|
||||
} else {
|
||||
return this->GetGlobalBinIdx(idx);
|
||||
}
|
||||
}
|
||||
|
||||
// Dense columns are addressed directly by row id, so there is no scan cursor;
// the initial state is always zero (interface parity with SparseColumn).
size_t GetInitialState(const size_t first_row_id) const {
  (void)first_row_id;  // intentionally unused
  return 0;
}
|
||||
|
||||
private:
|
||||
/* flags for missing values in dense columns */
|
||||
const std::vector<bool>& missing_flags_;
|
||||
@@ -202,7 +241,7 @@ class ColumnMatrix {
|
||||
|
||||
/* Fetch an individual column. This code should be used with type swith
|
||||
to determine type of bin id's */
|
||||
template <typename BinIdxType>
|
||||
template <typename BinIdxType, bool any_missing>
|
||||
std::unique_ptr<const Column<BinIdxType> > GetColumn(unsigned fid) const {
|
||||
CHECK_EQ(sizeof(BinIdxType), bins_type_size_);
|
||||
|
||||
@@ -213,7 +252,8 @@ class ColumnMatrix {
|
||||
column_size };
|
||||
std::unique_ptr<const Column<BinIdxType> > res;
|
||||
if (type_[fid] == ColumnType::kDenseColumn) {
|
||||
res.reset(new DenseColumn<BinIdxType>(type_[fid], bin_index, index_base_[fid],
|
||||
CHECK_EQ(any_missing, any_missing_);
|
||||
res.reset(new DenseColumn<BinIdxType, any_missing>(type_[fid], bin_index, index_base_[fid],
|
||||
missing_flags_, feature_offset));
|
||||
} else {
|
||||
res.reset(new SparseColumn<BinIdxType>(type_[fid], bin_index, index_base_[fid],
|
||||
|
||||
@@ -287,57 +287,18 @@ struct Prefetch {
|
||||
constexpr size_t Prefetch::kNoPrefetchSize;
|
||||
|
||||
|
||||
template<typename FPType, bool do_prefetch, typename BinIdxType>
|
||||
void BuildHistDenseKernel(const std::vector<GradientPair>& gpair,
|
||||
template<typename FPType, bool do_prefetch, typename BinIdxType, bool any_missing = true>
|
||||
void BuildHistKernel(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
const size_t n_features,
|
||||
GHistRow<FPType> hist) {
|
||||
const size_t size = row_indices.Size();
|
||||
const size_t* rid = row_indices.begin;
|
||||
const float* pgh = reinterpret_cast<const float*>(gpair.data());
|
||||
const BinIdxType* gradient_index = gmat.index.data<BinIdxType>();
|
||||
const uint32_t* offsets = gmat.index.Offset();
|
||||
FPType* hist_data = reinterpret_cast<FPType*>(hist.data());
|
||||
const uint32_t two {2}; // Each element from 'gpair' and 'hist' contains
|
||||
// 2 FP values: gradient and hessian.
|
||||
// So we need to multiply each row-index/bin-index by 2
|
||||
// to work with gradient pairs as a singe row FP array
|
||||
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
const size_t icol_start = rid[i] * n_features;
|
||||
const size_t idx_gh = two * rid[i];
|
||||
|
||||
if (do_prefetch) {
|
||||
const size_t icol_start_prefetch = rid[i + Prefetch::kPrefetchOffset] * n_features;
|
||||
|
||||
PREFETCH_READ_T0(pgh + two * rid[i + Prefetch::kPrefetchOffset]);
|
||||
for (size_t j = icol_start_prefetch; j < icol_start_prefetch + n_features;
|
||||
j += Prefetch::GetPrefetchStep<BinIdxType>()) {
|
||||
PREFETCH_READ_T0(gradient_index + j);
|
||||
}
|
||||
}
|
||||
const BinIdxType* gr_index_local = gradient_index + icol_start;
|
||||
for (size_t j = 0; j < n_features; ++j) {
|
||||
const uint32_t idx_bin = two * (static_cast<uint32_t>(gr_index_local[j]) +
|
||||
offsets[j]);
|
||||
|
||||
hist_data[idx_bin] += pgh[idx_gh];
|
||||
hist_data[idx_bin+1] += pgh[idx_gh+1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename FPType, bool do_prefetch>
|
||||
void BuildHistSparseKernel(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
GHistRow<FPType> hist) {
|
||||
const size_t size = row_indices.Size();
|
||||
const size_t* rid = row_indices.begin;
|
||||
const float* pgh = reinterpret_cast<const float*>(gpair.data());
|
||||
const uint32_t* gradient_index = gmat.index.data<uint32_t>();
|
||||
const size_t* row_ptr = gmat.row_ptr.data();
|
||||
const uint32_t* offsets = gmat.index.Offset();
|
||||
const size_t n_features = row_ptr[row_indices.begin[0]+1] - row_ptr[row_indices.begin[0]];
|
||||
FPType* hist_data = reinterpret_cast<FPType*>(hist.data());
|
||||
const uint32_t two {2}; // Each element from 'gpair' and 'hist' contains
|
||||
// 2 FP values: gradient and hessian.
|
||||
@@ -345,13 +306,16 @@ void BuildHistSparseKernel(const std::vector<GradientPair>& gpair,
|
||||
// to work with gradient pairs as a singe row FP array
|
||||
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
const size_t icol_start = row_ptr[rid[i]];
|
||||
const size_t icol_end = row_ptr[rid[i]+1];
|
||||
const size_t icol_start = any_missing ? row_ptr[rid[i]] : rid[i] * n_features;
|
||||
const size_t icol_end = any_missing ? row_ptr[rid[i]+1] : icol_start + n_features;
|
||||
const size_t row_size = icol_end - icol_start;
|
||||
const size_t idx_gh = two * rid[i];
|
||||
|
||||
if (do_prefetch) {
|
||||
const size_t icol_start_prftch = row_ptr[rid[i+Prefetch::kPrefetchOffset]];
|
||||
const size_t icol_end_prefect = row_ptr[rid[i+Prefetch::kPrefetchOffset]+1];
|
||||
const size_t icol_start_prftch = any_missing ? row_ptr[rid[i+Prefetch::kPrefetchOffset]] :
|
||||
rid[i + Prefetch::kPrefetchOffset] * n_features;
|
||||
const size_t icol_end_prefect = any_missing ? row_ptr[rid[i+Prefetch::kPrefetchOffset]+1] :
|
||||
icol_start_prftch + n_features;
|
||||
|
||||
PREFETCH_READ_T0(pgh + two * rid[i + Prefetch::kPrefetchOffset]);
|
||||
for (size_t j = icol_start_prftch; j < icol_end_prefect;
|
||||
@@ -359,47 +323,34 @@ void BuildHistSparseKernel(const std::vector<GradientPair>& gpair,
|
||||
PREFETCH_READ_T0(gradient_index + j);
|
||||
}
|
||||
}
|
||||
for (size_t j = icol_start; j < icol_end; ++j) {
|
||||
const uint32_t idx_bin = two * gradient_index[j];
|
||||
const BinIdxType* gr_index_local = gradient_index + icol_start;
|
||||
|
||||
for (size_t j = 0; j < row_size; ++j) {
|
||||
const uint32_t idx_bin = two * (static_cast<uint32_t>(gr_index_local[j]) + (
|
||||
any_missing ? 0 : offsets[j]));
|
||||
|
||||
hist_data[idx_bin] += pgh[idx_gh];
|
||||
hist_data[idx_bin+1] += pgh[idx_gh+1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename FPType, bool do_prefetch, typename BinIdxType>
|
||||
void BuildHistDispatchKernel(const std::vector<GradientPair>& gpair,
|
||||
template<typename FPType, bool do_prefetch, bool any_missing>
|
||||
void BuildHistDispatch(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexMatrix& gmat, GHistRow<FPType> hist, bool isDense) {
|
||||
if (isDense) {
|
||||
const size_t* row_ptr = gmat.row_ptr.data();
|
||||
const size_t n_features = row_ptr[row_indices.begin[0]+1] - row_ptr[row_indices.begin[0]];
|
||||
BuildHistDenseKernel<FPType, do_prefetch, BinIdxType>(gpair, row_indices,
|
||||
gmat, n_features, hist);
|
||||
} else {
|
||||
BuildHistSparseKernel<FPType, do_prefetch>(gpair, row_indices,
|
||||
gmat, hist);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename FPType, bool do_prefetch>
|
||||
void BuildHistKernel(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexMatrix& gmat, const bool isDense, GHistRow<FPType> hist) {
|
||||
const bool is_dense = row_indices.Size() && isDense;
|
||||
const GHistIndexMatrix& gmat, GHistRow<FPType> hist) {
|
||||
switch (gmat.index.GetBinTypeSize()) {
|
||||
case kUint8BinsTypeSize:
|
||||
BuildHistDispatchKernel<FPType, do_prefetch, uint8_t>(gpair, row_indices,
|
||||
gmat, hist, is_dense);
|
||||
BuildHistKernel<FPType, do_prefetch, uint8_t, any_missing>(gpair, row_indices,
|
||||
gmat, hist);
|
||||
break;
|
||||
case kUint16BinsTypeSize:
|
||||
BuildHistDispatchKernel<FPType, do_prefetch, uint16_t>(gpair, row_indices,
|
||||
gmat, hist, is_dense);
|
||||
BuildHistKernel<FPType, do_prefetch, uint16_t, any_missing>(gpair, row_indices,
|
||||
gmat, hist);
|
||||
break;
|
||||
case kUint32BinsTypeSize:
|
||||
BuildHistDispatchKernel<FPType, do_prefetch, uint32_t>(gpair, row_indices,
|
||||
gmat, hist, is_dense);
|
||||
BuildHistKernel<FPType, do_prefetch, uint32_t, any_missing>(gpair, row_indices,
|
||||
gmat, hist);
|
||||
break;
|
||||
default:
|
||||
CHECK(false); // no default behavior
|
||||
@@ -407,10 +358,12 @@ void BuildHistKernel(const std::vector<GradientPair>& gpair,
|
||||
}
|
||||
|
||||
template <typename GradientSumT>
|
||||
template <bool any_missing>
|
||||
void GHistBuilder<GradientSumT>::BuildHist(
|
||||
const std::vector<GradientPair> &gpair,
|
||||
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
|
||||
GHistRowT hist, bool isDense) {
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexMatrix &gmat,
|
||||
GHistRowT hist) {
|
||||
const size_t nrows = row_indices.Size();
|
||||
const size_t no_prefetch_size = Prefetch::NoPrefetchSize(nrows);
|
||||
|
||||
@@ -419,28 +372,36 @@ void GHistBuilder<GradientSumT>::BuildHist(
|
||||
|
||||
if (contiguousBlock) {
|
||||
// contiguous memory access, built-in HW prefetching is enough
|
||||
BuildHistKernel<GradientSumT, false>(gpair, row_indices, gmat, isDense, hist);
|
||||
BuildHistDispatch<GradientSumT, false, any_missing>(gpair, row_indices, gmat, hist);
|
||||
} else {
|
||||
const RowSetCollection::Elem span1(row_indices.begin, row_indices.end - no_prefetch_size);
|
||||
const RowSetCollection::Elem span2(row_indices.end - no_prefetch_size, row_indices.end);
|
||||
|
||||
BuildHistKernel<GradientSumT, true>(gpair, span1, gmat, isDense, hist);
|
||||
BuildHistDispatch<GradientSumT, true, any_missing>(gpair, span1, gmat, hist);
|
||||
// no prefetching to avoid loading extra memory
|
||||
BuildHistKernel<GradientSumT, false>(gpair, span2, gmat, isDense, hist);
|
||||
BuildHistDispatch<GradientSumT, false, any_missing>(gpair, span2, gmat, hist);
|
||||
}
|
||||
}
|
||||
template
|
||||
void GHistBuilder<float>::BuildHist(const std::vector<GradientPair>& gpair,
|
||||
void GHistBuilder<float>::BuildHist<true>(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
GHistRow<float> hist,
|
||||
bool isDense);
|
||||
GHistRow<float> hist);
|
||||
template
|
||||
void GHistBuilder<double>::BuildHist(const std::vector<GradientPair>& gpair,
|
||||
void GHistBuilder<float>::BuildHist<false>(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
GHistRow<double> hist,
|
||||
bool isDense);
|
||||
GHistRow<float> hist);
|
||||
template
|
||||
void GHistBuilder<double>::BuildHist<true>(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
GHistRow<double> hist);
|
||||
template
|
||||
void GHistBuilder<double>::BuildHist<false>(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
GHistRow<double> hist);
|
||||
|
||||
template<typename GradientSumT>
|
||||
void GHistBuilder<GradientSumT>::SubtractionTrick(GHistRowT self,
|
||||
|
||||
@@ -627,11 +627,11 @@ class GHistBuilder {
|
||||
GHistBuilder(size_t nthread, uint32_t nbins) : nthread_{nthread}, nbins_{nbins} {}
|
||||
|
||||
// construct a histogram via histogram aggregation
|
||||
template <bool any_missing>
|
||||
void BuildHist(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
GHistRowT hist,
|
||||
bool isDense);
|
||||
GHistRowT hist);
|
||||
// construct a histogram via subtraction trick
|
||||
void SubtractionTrick(GHistRowT self,
|
||||
GHistRowT sibling,
|
||||
|
||||
228
src/common/partition_builder.h
Normal file
228
src/common/partition_builder.h
Normal file
@@ -0,0 +1,228 @@
|
||||
|
||||
/*!
|
||||
* Copyright 2021 by Contributors
|
||||
* \file row_set.h
|
||||
* \brief Quick Utility to compute subset of rows
|
||||
* \author Philip Cho, Tianqi Chen
|
||||
*/
|
||||
#ifndef XGBOOST_COMMON_PARTITION_BUILDER_H_
|
||||
#define XGBOOST_COMMON_PARTITION_BUILDER_H_
|
||||
|
||||
#include <xgboost/data.h>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#include <memory>
|
||||
#include "xgboost/tree_model.h"
|
||||
#include "../common/column_matrix.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
// The builder is required for samples partition to left and rights children for set of nodes
|
||||
// Responsible for:
|
||||
// 1) Effective memory allocation for intermediate results for multi-thread work
|
||||
// 2) Merging partial results produced by threads into original row set (row_set_collection_)
|
||||
// BlockSize is template to enable memory alignment easily with C++11 'alignas()' feature
|
||||
template<size_t BlockSize>
|
||||
class PartitionBuilder {
|
||||
public:
|
||||
template<typename Func>
|
||||
void Init(const size_t n_tasks, size_t n_nodes, Func funcNTaks) {
|
||||
left_right_nodes_sizes_.resize(n_nodes);
|
||||
blocks_offsets_.resize(n_nodes+1);
|
||||
|
||||
blocks_offsets_[0] = 0;
|
||||
for (size_t i = 1; i < n_nodes+1; ++i) {
|
||||
blocks_offsets_[i] = blocks_offsets_[i-1] + funcNTaks(i-1);
|
||||
}
|
||||
|
||||
if (n_tasks > max_n_tasks_) {
|
||||
mem_blocks_.resize(n_tasks);
|
||||
max_n_tasks_ = n_tasks;
|
||||
}
|
||||
}
|
||||
|
||||
// split row indexes (rid_span) to 2 parts (left_part, right_part) depending
|
||||
// on comparison of indexes values (idx_span) and split point (split_cond)
|
||||
// Handle dense columns
|
||||
// Analog of std::stable_partition, but in no-inplace manner
|
||||
template <bool default_left, bool any_missing, typename ColumnType>
|
||||
inline std::pair<size_t, size_t> PartitionKernel(const ColumnType& column,
|
||||
common::Span<const size_t> rid_span, const int32_t split_cond,
|
||||
common::Span<size_t> left_part, common::Span<size_t> right_part) {
|
||||
size_t* p_left_part = left_part.data();
|
||||
size_t* p_right_part = right_part.data();
|
||||
size_t nleft_elems = 0;
|
||||
size_t nright_elems = 0;
|
||||
auto state = column.GetInitialState(rid_span.front());
|
||||
|
||||
for (auto rid : rid_span) {
|
||||
const int32_t bin_id = column.GetBinIdx(rid, &state);
|
||||
if (any_missing && bin_id == ColumnType::kMissingId) {
|
||||
if (default_left) {
|
||||
p_left_part[nleft_elems++] = rid;
|
||||
} else {
|
||||
p_right_part[nright_elems++] = rid;
|
||||
}
|
||||
} else {
|
||||
if (bin_id <= split_cond) {
|
||||
p_left_part[nleft_elems++] = rid;
|
||||
} else {
|
||||
p_right_part[nright_elems++] = rid;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return {nleft_elems, nright_elems};
|
||||
}
|
||||
|
||||
|
||||
template <typename BinIdxType, bool any_missing>
|
||||
void Partition(const size_t node_in_set, const size_t nid, const common::Range1d range,
|
||||
const int32_t split_cond,
|
||||
const ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid) {
|
||||
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
|
||||
common::Span<size_t> left = GetLeftBuffer(node_in_set,
|
||||
range.begin(), range.end());
|
||||
common::Span<size_t> right = GetRightBuffer(node_in_set,
|
||||
range.begin(), range.end());
|
||||
const bst_uint fid = tree[nid].SplitIndex();
|
||||
const bool default_left = tree[nid].DefaultLeft();
|
||||
const auto column_ptr = column_matrix.GetColumn<BinIdxType, any_missing>(fid);
|
||||
|
||||
std::pair<size_t, size_t> child_nodes_sizes;
|
||||
|
||||
if (column_ptr->GetType() == xgboost::common::kDenseColumn) {
|
||||
const common::DenseColumn<BinIdxType, any_missing>& column =
|
||||
static_cast<const common::DenseColumn<BinIdxType, any_missing>& >(*(column_ptr.get()));
|
||||
if (default_left) {
|
||||
child_nodes_sizes = PartitionKernel<true, any_missing>(column, rid_span,
|
||||
split_cond, left, right);
|
||||
} else {
|
||||
child_nodes_sizes = PartitionKernel<false, any_missing>(column, rid_span,
|
||||
split_cond, left, right);
|
||||
}
|
||||
} else {
|
||||
CHECK_EQ(any_missing, true);
|
||||
const common::SparseColumn<BinIdxType>& column
|
||||
= static_cast<const common::SparseColumn<BinIdxType>& >(*(column_ptr.get()));
|
||||
if (default_left) {
|
||||
child_nodes_sizes = PartitionKernel<true, any_missing>(column, rid_span,
|
||||
split_cond, left, right);
|
||||
} else {
|
||||
child_nodes_sizes = PartitionKernel<false, any_missing>(column, rid_span,
|
||||
split_cond, left, right);
|
||||
}
|
||||
}
|
||||
|
||||
const size_t n_left = child_nodes_sizes.first;
|
||||
const size_t n_right = child_nodes_sizes.second;
|
||||
|
||||
SetNLeftElems(node_in_set, range.begin(), range.end(), n_left);
|
||||
SetNRightElems(node_in_set, range.begin(), range.end(), n_right);
|
||||
}
|
||||
|
||||
|
||||
// allocate thread local memory, should be called for each specific task
|
||||
void AllocateForTask(size_t id) {
|
||||
if (mem_blocks_[id].get() == nullptr) {
|
||||
BlockInfo* local_block_ptr = new BlockInfo;
|
||||
CHECK_NE(local_block_ptr, (BlockInfo*)nullptr);
|
||||
mem_blocks_[id].reset(local_block_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
common::Span<size_t> GetLeftBuffer(int nid, size_t begin, size_t end) {
|
||||
const size_t task_idx = GetTaskIdx(nid, begin);
|
||||
return { mem_blocks_.at(task_idx)->Left(), end - begin };
|
||||
}
|
||||
|
||||
common::Span<size_t> GetRightBuffer(int nid, size_t begin, size_t end) {
|
||||
const size_t task_idx = GetTaskIdx(nid, begin);
|
||||
return { mem_blocks_.at(task_idx)->Right(), end - begin };
|
||||
}
|
||||
|
||||
void SetNLeftElems(int nid, size_t begin, size_t end, size_t n_left) {
|
||||
size_t task_idx = GetTaskIdx(nid, begin);
|
||||
mem_blocks_.at(task_idx)->n_left = n_left;
|
||||
}
|
||||
|
||||
void SetNRightElems(int nid, size_t begin, size_t end, size_t n_right) {
|
||||
size_t task_idx = GetTaskIdx(nid, begin);
|
||||
mem_blocks_.at(task_idx)->n_right = n_right;
|
||||
}
|
||||
|
||||
|
||||
size_t GetNLeftElems(int nid) const {
|
||||
return left_right_nodes_sizes_[nid].first;
|
||||
}
|
||||
|
||||
size_t GetNRightElems(int nid) const {
|
||||
return left_right_nodes_sizes_[nid].second;
|
||||
}
|
||||
|
||||
// Each thread has partial results for some set of tree-nodes
|
||||
// The function decides order of merging partial results into final row set
|
||||
void CalculateRowOffsets() {
|
||||
for (size_t i = 0; i < blocks_offsets_.size()-1; ++i) {
|
||||
size_t n_left = 0;
|
||||
for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) {
|
||||
mem_blocks_[j]->n_offset_left = n_left;
|
||||
n_left += mem_blocks_[j]->n_left;
|
||||
}
|
||||
size_t n_right = 0;
|
||||
for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) {
|
||||
mem_blocks_[j]->n_offset_right = n_left + n_right;
|
||||
n_right += mem_blocks_[j]->n_right;
|
||||
}
|
||||
left_right_nodes_sizes_[i] = {n_left, n_right};
|
||||
}
|
||||
}
|
||||
|
||||
void MergeToArray(int nid, size_t begin, size_t* rows_indexes) {
|
||||
size_t task_idx = GetTaskIdx(nid, begin);
|
||||
|
||||
size_t* left_result = rows_indexes + mem_blocks_[task_idx]->n_offset_left;
|
||||
size_t* right_result = rows_indexes + mem_blocks_[task_idx]->n_offset_right;
|
||||
|
||||
const size_t* left = mem_blocks_[task_idx]->Left();
|
||||
const size_t* right = mem_blocks_[task_idx]->Right();
|
||||
|
||||
std::copy_n(left, mem_blocks_[task_idx]->n_left, left_result);
|
||||
std::copy_n(right, mem_blocks_[task_idx]->n_right, right_result);
|
||||
}
|
||||
|
||||
size_t GetTaskIdx(int nid, size_t begin) {
|
||||
return blocks_offsets_[nid] + begin / BlockSize;
|
||||
}
|
||||
|
||||
protected:
|
||||
struct BlockInfo{
|
||||
size_t n_left;
|
||||
size_t n_right;
|
||||
|
||||
size_t n_offset_left;
|
||||
size_t n_offset_right;
|
||||
|
||||
size_t* Left() {
|
||||
return &left_data_[0];
|
||||
}
|
||||
|
||||
size_t* Right() {
|
||||
return &right_data_[0];
|
||||
}
|
||||
private:
|
||||
size_t left_data_[BlockSize];
|
||||
size_t right_data_[BlockSize];
|
||||
};
|
||||
std::vector<std::pair<size_t, size_t>> left_right_nodes_sizes_;
|
||||
std::vector<size_t> blocks_offsets_;
|
||||
std::vector<std::shared_ptr<BlockInfo>> mem_blocks_;
|
||||
size_t max_n_tasks_ = 0;
|
||||
};
|
||||
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
#endif // XGBOOST_COMMON_PARTITION_BUILDER_H_
|
||||
@@ -126,130 +126,6 @@ class RowSetCollection {
|
||||
std::vector<Elem> elem_of_each_node_;
|
||||
};
|
||||
|
||||
|
||||
// The builder is required for samples partition to left and rights children for set of nodes
|
||||
// Responsible for:
|
||||
// 1) Effective memory allocation for intermediate results for multi-thread work
|
||||
// 2) Merging partial results produced by threads into original row set (row_set_collection_)
|
||||
// BlockSize is template to enable memory alignment easily with C++11 'alignas()' feature
|
||||
template<size_t BlockSize>
|
||||
class PartitionBuilder {
|
||||
public:
|
||||
template<typename Func>
|
||||
void Init(const size_t n_tasks, size_t n_nodes, Func funcNTaks) {
|
||||
left_right_nodes_sizes_.resize(n_nodes);
|
||||
blocks_offsets_.resize(n_nodes+1);
|
||||
|
||||
blocks_offsets_[0] = 0;
|
||||
for (size_t i = 1; i < n_nodes+1; ++i) {
|
||||
blocks_offsets_[i] = blocks_offsets_[i-1] + funcNTaks(i-1);
|
||||
}
|
||||
|
||||
if (n_tasks > max_n_tasks_) {
|
||||
mem_blocks_.resize(n_tasks);
|
||||
max_n_tasks_ = n_tasks;
|
||||
}
|
||||
}
|
||||
|
||||
// allocate thread local memory, should be called for each specific task
|
||||
void AllocateForTask(size_t id) {
|
||||
if (mem_blocks_[id].get() == nullptr) {
|
||||
BlockInfo* local_block_ptr = new BlockInfo;
|
||||
CHECK_NE(local_block_ptr, (BlockInfo*)nullptr);
|
||||
mem_blocks_[id].reset(local_block_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
common::Span<size_t> GetLeftBuffer(int nid, size_t begin, size_t end) {
|
||||
const size_t task_idx = GetTaskIdx(nid, begin);
|
||||
return { mem_blocks_.at(task_idx)->Left(), end - begin };
|
||||
}
|
||||
|
||||
common::Span<size_t> GetRightBuffer(int nid, size_t begin, size_t end) {
|
||||
const size_t task_idx = GetTaskIdx(nid, begin);
|
||||
return { mem_blocks_.at(task_idx)->Right(), end - begin };
|
||||
}
|
||||
|
||||
void SetNLeftElems(int nid, size_t begin, size_t end, size_t n_left) {
|
||||
size_t task_idx = GetTaskIdx(nid, begin);
|
||||
mem_blocks_.at(task_idx)->n_left = n_left;
|
||||
}
|
||||
|
||||
void SetNRightElems(int nid, size_t begin, size_t end, size_t n_right) {
|
||||
size_t task_idx = GetTaskIdx(nid, begin);
|
||||
mem_blocks_.at(task_idx)->n_right = n_right;
|
||||
}
|
||||
|
||||
|
||||
size_t GetNLeftElems(int nid) const {
|
||||
return left_right_nodes_sizes_[nid].first;
|
||||
}
|
||||
|
||||
size_t GetNRightElems(int nid) const {
|
||||
return left_right_nodes_sizes_[nid].second;
|
||||
}
|
||||
|
||||
// Each thread has partial results for some set of tree-nodes
|
||||
// The function decides order of merging partial results into final row set
|
||||
void CalculateRowOffsets() {
|
||||
for (size_t i = 0; i < blocks_offsets_.size()-1; ++i) {
|
||||
size_t n_left = 0;
|
||||
for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) {
|
||||
mem_blocks_[j]->n_offset_left = n_left;
|
||||
n_left += mem_blocks_[j]->n_left;
|
||||
}
|
||||
size_t n_right = 0;
|
||||
for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) {
|
||||
mem_blocks_[j]->n_offset_right = n_left + n_right;
|
||||
n_right += mem_blocks_[j]->n_right;
|
||||
}
|
||||
left_right_nodes_sizes_[i] = {n_left, n_right};
|
||||
}
|
||||
}
|
||||
|
||||
void MergeToArray(int nid, size_t begin, size_t* rows_indexes) {
|
||||
size_t task_idx = GetTaskIdx(nid, begin);
|
||||
|
||||
size_t* left_result = rows_indexes + mem_blocks_[task_idx]->n_offset_left;
|
||||
size_t* right_result = rows_indexes + mem_blocks_[task_idx]->n_offset_right;
|
||||
|
||||
const size_t* left = mem_blocks_[task_idx]->Left();
|
||||
const size_t* right = mem_blocks_[task_idx]->Right();
|
||||
|
||||
std::copy_n(left, mem_blocks_[task_idx]->n_left, left_result);
|
||||
std::copy_n(right, mem_blocks_[task_idx]->n_right, right_result);
|
||||
}
|
||||
|
||||
size_t GetTaskIdx(int nid, size_t begin) {
|
||||
return blocks_offsets_[nid] + begin / BlockSize;
|
||||
}
|
||||
|
||||
protected:
|
||||
struct BlockInfo{
|
||||
size_t n_left;
|
||||
size_t n_right;
|
||||
|
||||
size_t n_offset_left;
|
||||
size_t n_offset_right;
|
||||
|
||||
size_t* Left() {
|
||||
return &left_data_[0];
|
||||
}
|
||||
|
||||
size_t* Right() {
|
||||
return &right_data_[0];
|
||||
}
|
||||
private:
|
||||
size_t left_data_[BlockSize];
|
||||
size_t right_data_[BlockSize];
|
||||
};
|
||||
std::vector<std::pair<size_t, size_t>> left_right_nodes_sizes_;
|
||||
std::vector<size_t> blocks_offsets_;
|
||||
std::vector<std::shared_ptr<BlockInfo>> mem_blocks_;
|
||||
size_t max_n_tasks_ = 0;
|
||||
};
|
||||
|
||||
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
|
||||
Reference in New Issue
Block a user