Optimized ApplySplit, BuildHist and UpdatePredictCache functions on CPU (#5244)

* Split up sparse and dense build hist kernels. * Add `PartitionBuilder`.
2020-02-29 11:11:42 +03:00
parent b81f8cbbc0
commit 1b97eaf7a7
9 changed files with 694 additions and 387 deletions
--- a/src/common/column_matrix.h
+++ b/src/common/column_matrix.h
@@ -37,6 +37,7 @@ class Column {
  size_t Size() const { return len_; }
  uint32_t GetGlobalBinIdx(size_t idx) const { return index_base_ + index_[idx]; }
  uint32_t GetFeatureBinIdx(size_t idx) const { return index_[idx]; }
+  common::Span<const uint32_t> GetFeatureBinIdxPtr() const { return { index_, len_ }; }
  // column.GetFeatureBinIdx(idx) + column.GetBaseIdx(idx) ==
  // column.GetGlobalBinIdx(idx)
  uint32_t GetBaseIdx() const { return index_base_; }
@@ -186,8 +187,8 @@ class ColumnMatrix {

  std::vector<size_t> feature_counts_;
  std::vector<ColumnType> type_;
-  SimpleArray<uint32_t> index_;  // index_: may store smaller integers; needs padding
-  SimpleArray<size_t> row_ind_;
+  std::vector<uint32_t> index_;  // index_: may store smaller integers; needs padding
+  std::vector<size_t> row_ind_;
  std::vector<ColumnBoundary> boundary_;

  // index_base_[fid]: least bin id for feature fid
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -672,7 +672,7 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
 }

 /*!
- * \brief fill a histogram by zeroes
+ * \brief fill a histogram with zeros in range [begin, end)
 */
 void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end) {
  memset(hist.data() + begin, '\0', (end-begin)*sizeof(tree::GradStats));
@@ -719,40 +719,141 @@ void SubtractionHist(GHistRow dst, const GHistRow src1, const GHistRow src2,
  }
 }

+struct Prefetch {
+ public:
+  static constexpr size_t kCacheLineSize = 64;
+  static constexpr size_t kPrefetchOffset = 10;
+  static constexpr size_t kPrefetchStep =
+      kCacheLineSize / sizeof(decltype(GHistIndexMatrix::index)::value_type);
+
+ private:
+  static constexpr size_t kNoPrefetchSize =
+      kPrefetchOffset + kCacheLineSize /
+      sizeof(decltype(GHistIndexMatrix::row_ptr)::value_type);
+
+ public:
+  static size_t NoPrefetchSize(size_t rows) {
+    return std::min(rows, kNoPrefetchSize);
+  }
+};
+
+constexpr size_t Prefetch::kNoPrefetchSize;
+
+template<typename FPType, bool do_prefetch>
+void BuildHistDenseKernel(const std::vector<GradientPair>& gpair,
+                          const RowSetCollection::Elem row_indices,
+                          const GHistIndexMatrix& gmat,
+                          const size_t n_features,
+                          GHistRow hist) {
+  const size_t size = row_indices.Size();
+  const size_t* rid = row_indices.begin;
+  const float* pgh = reinterpret_cast<const float*>(gpair.data());
+  const uint32_t* gradient_index = gmat.index.data();
+  FPType* hist_data = reinterpret_cast<FPType*>(hist.data());
+
+  const uint32_t two {2};  // Each element from 'gpair' and 'hist' contains
+                           // 2 FP values: gradient and hessian.
+                           // So we need to multiply each row-index/bin-index by 2
+                           // to work with gradient pairs as a single row FP array
+
+  for (size_t i = 0; i < size; ++i) {
+    const size_t icol_start = rid[i] * n_features;
+    const size_t idx_gh = two * rid[i];
+
+    if (do_prefetch) {
+      const size_t icol_start_prefetch = rid[i + Prefetch::kPrefetchOffset] * n_features;
+
+      PREFETCH_READ_T0(pgh + two * rid[i + Prefetch::kPrefetchOffset]);
+      for (size_t j = icol_start_prefetch; j < icol_start_prefetch + n_features;
+           j += Prefetch::kPrefetchStep) {
+        PREFETCH_READ_T0(gradient_index + j);
+      }
+    }
+
+    for (size_t j = icol_start; j < icol_start + n_features; ++j) {
+      const uint32_t idx_bin = two * gradient_index[j];
+
+      hist_data[idx_bin]   += pgh[idx_gh];
+      hist_data[idx_bin+1] += pgh[idx_gh+1];
+    }
+  }
+}
+
+template<typename FPType, bool do_prefetch>
+void BuildHistSparseKernel(const std::vector<GradientPair>& gpair,
+                           const RowSetCollection::Elem row_indices,
+                           const GHistIndexMatrix& gmat,
+                           GHistRow hist) {
+  const size_t size = row_indices.Size();
+  const size_t* rid = row_indices.begin;
+  const float* pgh = reinterpret_cast<const float*>(gpair.data());
+  const uint32_t* gradient_index = gmat.index.data();
+  const size_t* row_ptr =  gmat.row_ptr.data();
+  FPType* hist_data = reinterpret_cast<FPType*>(hist.data());
+
+  const uint32_t two {2};  // Each element from 'gpair' and 'hist' contains
+                           // 2 FP values: gradient and hessian.
+                           // So we need to multiply each row-index/bin-index by 2
+                           // to work with gradient pairs as a single row FP array
+
+  for (size_t i = 0; i < size; ++i) {
+    const size_t icol_start = row_ptr[rid[i]];
+    const size_t icol_end = row_ptr[rid[i]+1];
+    const size_t idx_gh = two * rid[i];
+
+    if (do_prefetch) {
+      const size_t icol_start_prftch = row_ptr[rid[i+Prefetch::kPrefetchOffset]];
+      const size_t icol_end_prefect = row_ptr[rid[i+Prefetch::kPrefetchOffset]+1];
+
+      PREFETCH_READ_T0(pgh + two * rid[i + Prefetch::kPrefetchOffset]);
+      for (size_t j = icol_start_prftch; j < icol_end_prefect; j+=Prefetch::kPrefetchStep) {
+        PREFETCH_READ_T0(gradient_index + j);
+      }
+    }
+
+    for (size_t j = icol_start; j < icol_end; ++j) {
+      const uint32_t idx_bin = two * gradient_index[j];
+      hist_data[idx_bin]   += pgh[idx_gh];
+      hist_data[idx_bin+1] += pgh[idx_gh+1];
+    }
+  }
+}
+
+template<typename FPType, bool do_prefetch>
+void BuildHistKernel(const std::vector<GradientPair>& gpair,
+                     const RowSetCollection::Elem row_indices,
+                     const GHistIndexMatrix& gmat, const bool isDense, GHistRow hist) {
+  if (row_indices.Size() && isDense) {
+    const size_t* row_ptr =  gmat.row_ptr.data();
+    const size_t n_features = row_ptr[row_indices.begin[0]+1] - row_ptr[row_indices.begin[0]];
+    BuildHistDenseKernel<FPType, do_prefetch>(gpair, row_indices, gmat, n_features, hist);
+  } else {
+    BuildHistSparseKernel<FPType, do_prefetch>(gpair, row_indices, gmat, hist);
+  }
+}

 void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
                             const RowSetCollection::Elem row_indices,
                             const GHistIndexMatrix& gmat,
-                             GHistRow hist) {
-  const size_t* rid =  row_indices.begin;
+                             GHistRow hist,
+                             bool isDense) {
+  using FPType = decltype(tree::GradStats::sum_grad);
  const size_t nrows = row_indices.Size();
-  const uint32_t* index = gmat.index.data();
-  const size_t* row_ptr =  gmat.row_ptr.data();
-  const float* pgh = reinterpret_cast<const float*>(gpair.data());
+  const size_t no_prefetch_size = Prefetch::NoPrefetchSize(nrows);

-  double* hist_data = reinterpret_cast<double*>(hist.data());
+  // check whether the rows form one contiguous block of the bin-matrix (e.g. root node)
+  const bool contiguousBlock = (row_indices.begin[nrows - 1] - row_indices.begin[0]) == (nrows - 1);

-  const size_t cache_line_size = 64;
-  const size_t prefetch_offset = 10;
-  size_t no_prefetch_size = prefetch_offset + cache_line_size/sizeof(*rid);
-  no_prefetch_size = no_prefetch_size > nrows ? nrows : no_prefetch_size;
+  if (contiguousBlock) {
+    // contiguous memory access, built-in HW prefetching is enough
+    BuildHistKernel<FPType, false>(gpair, row_indices, gmat, isDense, hist);
+  } else {
+    const RowSetCollection::Elem span1(row_indices.begin, row_indices.end - no_prefetch_size);
+    const RowSetCollection::Elem span2(row_indices.end - no_prefetch_size, row_indices.end);

-  for (size_t i = 0; i < nrows; ++i) {
-    const size_t icol_start = row_ptr[rid[i]];
-    const size_t icol_end = row_ptr[rid[i]+1];
-
-    if (i < nrows - no_prefetch_size) {
-      PREFETCH_READ_T0(row_ptr + rid[i + prefetch_offset]);
-      PREFETCH_READ_T0(pgh + 2*rid[i + prefetch_offset]);
-    }
-
-    for (size_t j = icol_start; j < icol_end; ++j) {
-      const uint32_t idx_bin = 2*index[j];
-      const size_t idx_gh = 2*rid[i];
-
-      hist_data[idx_bin] += pgh[idx_gh];
-      hist_data[idx_bin+1] += pgh[idx_gh+1];
-    }
+    BuildHistKernel<FPType, true>(gpair, span1, gmat, isDense, hist);
+    // no prefetching to avoid loading extra memory
+    BuildHistKernel<FPType, false>(gpair, span2, gmat, isDense, hist);
  }
 }

--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2017 by Contributors
+ * Copyright 2017-2020 by Contributors
 * \file hist_util.h
 * \brief Utility for fast histogram aggregation
 * \author Philip Cho, Tianqi Chen
@@ -25,75 +25,6 @@

 namespace xgboost {
 namespace common {
-
-/*
- * \brief A thin wrapper around dynamically allocated C-style array.
- * Make sure to call resize() before use.
- */
-template<typename T>
-struct SimpleArray {
-  ~SimpleArray() {
-    std::free(ptr_);
-    ptr_ = nullptr;
-  }
-
-  void resize(size_t n) {
-    T* ptr = static_cast<T*>(std::malloc(n * sizeof(T)));
-    CHECK(ptr) << "Failed to allocate memory";
-    if (ptr_) {
-      std::memcpy(ptr, ptr_, n_ * sizeof(T));
-      std::free(ptr_);
-    }
-    ptr_ = ptr;
-    n_ = n;
-  }
-
-  T& operator[](size_t idx) {
-    return ptr_[idx];
-  }
-
-  T& operator[](size_t idx) const {
-    return ptr_[idx];
-  }
-
-  size_t size() const {
-    return n_;
-  }
-
-  T back() const {
-    return ptr_[n_-1];
-  }
-
-  T* data() {
-    return ptr_;
-  }
-
-  const T* data() const {
-    return ptr_;
-  }
-
-
-  T* begin() {
-    return ptr_;
-  }
-
-  const T* begin() const {
-    return ptr_;
-  }
-
-  T* end() {
-    return ptr_ + n_;
-  }
-
-  const T* end() const {
-    return ptr_ + n_;
-  }
-
- private:
-  T* ptr_ = nullptr;
-  size_t n_ = 0;
-};
-
 /*!
 * \brief A single row in global histogram index.
 *  Directly represent the global index in the histogram entry.
@@ -161,7 +92,7 @@ class HistogramCuts {
    return idx;
  }

-  BinIdx SearchBin(Entry const& e) {
+  BinIdx SearchBin(Entry const& e) const {
    return SearchBin(e.fvalue, e.index);
  }
 };
@@ -261,8 +192,9 @@ size_t DeviceSketch(int device,

 /*!
 * \brief preprocessed global index matrix, in CSR format
- *  Transform floating values to integer index in histogram
- *  This is a global histogram index.
+ *
+ *  Transform floating values to integer index in histogram.  This is a global histogram
+ *  index for CPU histogram.  On GPU, ellpack page is used.
 */
 struct GHistIndexMatrix {
  /*! \brief row pointer to rows by element position */
@@ -606,17 +538,15 @@ class ParallelGHistBuilder {
 */
 class GHistBuilder {
 public:
-  // initialize builder
-  inline void Init(size_t nthread, uint32_t nbins) {
-    nthread_ = nthread;
-    nbins_ = nbins;
-  }
+  GHistBuilder() : nthread_{0}, nbins_{0} {}
+  GHistBuilder(size_t nthread, uint32_t nbins) : nthread_{nthread}, nbins_{nbins} {}

  // construct a histogram via histogram aggregation
  void BuildHist(const std::vector<GradientPair>& gpair,
                 const RowSetCollection::Elem row_indices,
                 const GHistIndexMatrix& gmat,
-                 GHistRow hist);
+                 GHistRow hist,
+                 bool isDense);
  // same, with feature grouping
  void BuildBlockHist(const std::vector<GradientPair>& gpair,
                      const RowSetCollection::Elem row_indices,
@@ -625,7 +555,7 @@ class GHistBuilder {
  // construct a histogram via subtraction trick
  void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent);

-  uint32_t GetNumBins() {
+  uint32_t GetNumBins() const {
      return nbins_;
  }

--- a/src/common/row_set.h
+++ b/src/common/row_set.h
@@ -10,6 +10,7 @@
 #include <xgboost/data.h>
 #include <algorithm>
 #include <vector>
+#include <utility>

 namespace xgboost {
 namespace common {
@@ -29,7 +30,7 @@ class RowSetCollection {
         = default;
    Elem(const size_t* begin,
         const size_t* end,
-         int node_id)
+         int node_id = -1)
        : begin(begin), end(end), node_id(node_id) {}

    inline size_t Size() const {
@@ -57,6 +58,13 @@ class RowSetCollection {
        << "access element that is not in the set";
    return e;
  }
+
+  /*! \brief return corresponding element set given the node_id */
+  inline Elem& operator[](unsigned node_id) {
+    Elem& e = elem_of_each_node_[node_id];
+    return e;
+  }
+
  // clear up things
  inline void Clear() {
    elem_of_each_node_.clear();
@@ -83,25 +91,18 @@ class RowSetCollection {
  }
  // split rowset into two
  inline void AddSplit(unsigned node_id,
-                       const std::vector<Split>& row_split_tloc,
                       unsigned left_node_id,
-                       unsigned right_node_id) {
+                       unsigned right_node_id,
+                       size_t n_left,
+                       size_t n_right) {
    const Elem e = elem_of_each_node_[node_id];
-    const auto nthread = static_cast<bst_omp_uint>(row_split_tloc.size());
    CHECK(e.begin != nullptr);
    size_t* all_begin = dmlc::BeginPtr(row_indices_);
    size_t* begin = all_begin + (e.begin - all_begin);

-    size_t* it = begin;
-    for (bst_omp_uint tid = 0; tid < nthread; ++tid) {
-      std::copy(row_split_tloc[tid].left.begin(), row_split_tloc[tid].left.end(), it);
-      it += row_split_tloc[tid].left.size();
-    }
-    size_t* split_pt = it;
-    for (bst_omp_uint tid = 0; tid < nthread; ++tid) {
-      std::copy(row_split_tloc[tid].right.begin(), row_split_tloc[tid].right.end(), it);
-      it += row_split_tloc[tid].right.size();
-    }
+    CHECK_EQ(n_left + n_right, e.Size());
+    CHECK_LE(begin + n_left, e.end);
+    CHECK_EQ(begin + n_left + n_right, e.end);

    if (left_node_id >= elem_of_each_node_.size()) {
      elem_of_each_node_.resize(left_node_id + 1, Elem(nullptr, nullptr, -1));
@@ -110,12 +111,12 @@ class RowSetCollection {
      elem_of_each_node_.resize(right_node_id + 1, Elem(nullptr, nullptr, -1));
    }

-    elem_of_each_node_[left_node_id] = Elem(begin, split_pt, left_node_id);
-    elem_of_each_node_[right_node_id] = Elem(split_pt, e.end, right_node_id);
+    elem_of_each_node_[left_node_id] = Elem(begin, begin + n_left, left_node_id);
+    elem_of_each_node_[right_node_id] = Elem(begin + n_left, e.end, right_node_id);
    elem_of_each_node_[node_id] = Elem(nullptr, nullptr, -1);
  }

-  // stores the row indices in the set
+  // stores the row indexes in the set
  std::vector<size_t> row_indices_;

 private:
@@ -123,6 +124,121 @@ class RowSetCollection {
  std::vector<Elem> elem_of_each_node_;
 };

+
+// The builder partitions samples into left and right children for a set of nodes.
+// Responsible for:
+// 1) Efficient memory allocation for intermediate results for multi-threaded work
+// 2) Merging partial results produced by threads into the original row set (row_set_collection_)
+// BlockSize is a template parameter to enable easy memory alignment with C++11 'alignas()'
+template<size_t BlockSize>
+class PartitionBuilder {
+ public:
+  template<typename Func>
+  void Init(const size_t n_tasks, size_t n_nodes, Func funcNTaks) {
+    left_right_nodes_sizes_.resize(n_nodes);
+    blocks_offsets_.resize(n_nodes+1);
+
+    blocks_offsets_[0] = 0;
+    for (size_t i = 1; i < n_nodes+1; ++i) {
+      blocks_offsets_[i] = blocks_offsets_[i-1] + funcNTaks(i-1);
+    }
+
+    if (n_tasks > max_n_tasks_) {
+      mem_blocks_.resize(n_tasks);
+      max_n_tasks_ = n_tasks;
+    }
+  }
+
+  common::Span<size_t> GetLeftBuffer(int nid, size_t begin, size_t end) {
+    const size_t task_idx = GetTaskIdx(nid, begin);
+    return { mem_blocks_.at(task_idx).left(), end - begin };
+  }
+
+  common::Span<size_t> GetRightBuffer(int nid, size_t begin, size_t end) {
+    const size_t task_idx = GetTaskIdx(nid, begin);
+    return { mem_blocks_.at(task_idx).right(), end - begin };
+  }
+
+  void SetNLeftElems(int nid, size_t begin, size_t end, size_t n_left) {
+    size_t task_idx = GetTaskIdx(nid, begin);
+    mem_blocks_.at(task_idx).n_left = n_left;
+  }
+
+  void SetNRightElems(int nid, size_t begin, size_t end, size_t n_right) {
+    size_t task_idx = GetTaskIdx(nid, begin);
+    mem_blocks_.at(task_idx).n_right = n_right;
+  }
+
+
+  size_t GetNLeftElems(int nid) const {
+    return left_right_nodes_sizes_[nid].first;
+  }
+
+  size_t GetNRightElems(int nid) const {
+    return left_right_nodes_sizes_[nid].second;
+  }
+
+  // Each thread has partial results for some set of tree-nodes
+  // The function decides order of merging partial results into final row set
+  void CalculateRowOffsets() {
+    for (size_t i = 0; i < blocks_offsets_.size()-1; ++i) {
+      size_t n_left = 0;
+      for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) {
+        mem_blocks_[j].n_offset_left = n_left;
+        n_left += mem_blocks_[j].n_left;
+      }
+      size_t n_right = 0;
+      for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) {
+        mem_blocks_[j].n_offset_right = n_left + n_right;
+        n_right += mem_blocks_[j].n_right;
+      }
+      left_right_nodes_sizes_[i] = {n_left, n_right};
+    }
+  }
+
+  void MergeToArray(int nid, size_t begin, size_t* rows_indexes) {
+    size_t task_idx = GetTaskIdx(nid, begin);
+
+    size_t* left_result  = rows_indexes + mem_blocks_[task_idx].n_offset_left;
+    size_t* right_result = rows_indexes + mem_blocks_[task_idx].n_offset_right;
+
+    const size_t* left = mem_blocks_[task_idx].left();
+    const size_t* right = mem_blocks_[task_idx].right();
+
+    std::copy_n(left, mem_blocks_[task_idx].n_left, left_result);
+    std::copy_n(right, mem_blocks_[task_idx].n_right, right_result);
+  }
+
+ protected:
+  size_t GetTaskIdx(int nid, size_t begin) {
+    return blocks_offsets_[nid] + begin / BlockSize;
+  }
+
+  struct BlockInfo{
+    size_t n_left;
+    size_t n_right;
+
+    size_t n_offset_left;
+    size_t n_offset_right;
+
+    size_t* left() {
+      return &left_data_[0];
+    }
+
+    size_t* right() {
+      return &right_data_[0];
+    }
+   private:
+    alignas(128) size_t left_data_[BlockSize];
+    alignas(128) size_t right_data_[BlockSize];
+  };
+  std::vector<std::pair<size_t, size_t>> left_right_nodes_sizes_;
+  std::vector<size_t> blocks_offsets_;
+  std::vector<BlockInfo> mem_blocks_;
+  size_t max_n_tasks_ = 0;
+};
+
+
 }  // namespace common
 }  // namespace xgboost

--- a/src/common/threading_utils.h
+++ b/src/common/threading_utils.h
@@ -9,6 +9,8 @@
 #include <vector>
 #include <algorithm>

+#include "xgboost/logging.h"
+
 namespace xgboost {
 namespace common {

@@ -20,11 +22,11 @@ class Range1d {
    CHECK_LT(begin, end);
  }

-  size_t begin() {
+  size_t begin() const {  // NOLINT
    return begin_;
  }

-  size_t end() {
+  size_t end() const {  // NOLINT
    return end_;
  }