Use view for SparsePage exclusively. (#6590)

2021-01-11 18:04:55 +08:00
parent 78f2cd83d7
commit f2f7dd87b8
23 changed files with 151 additions and 113 deletions
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -78,6 +78,7 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_bins) {
    const size_t batch_threads = std::max(
        size_t(1),
        std::min(batch.Size(), static_cast<size_t>(omp_get_max_threads())));
+    auto page = batch.GetView();
    MemStackAllocator<size_t, 128> partial_sums(batch_threads);
    size_t* p_part = partial_sums.Get();

@@ -92,7 +93,7 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_bins) {

        size_t sum = 0;
        for (size_t i = ibegin; i < iend; ++i) {
-          sum += batch[i].size();
+          sum += page[i].size();
          row_ptr[rbegin + 1 + i] = sum;
        }
      }
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -825,19 +825,20 @@ SparsePage SparsePage::GetTranspose(int num_columns) const {
  const int nthread = omp_get_max_threads();
  builder.InitBudget(num_columns, nthread);
  long batch_size = static_cast<long>(this->Size());  // NOLINT(*)
-#pragma omp parallel for default(none) shared(batch_size, builder) schedule(static)
+    auto page = this->GetView();
+#pragma omp parallel for default(none) shared(batch_size, builder, page) schedule(static)
  for (long i = 0; i < batch_size; ++i) {  // NOLINT(*)
    int tid = omp_get_thread_num();
-    auto inst = (*this)[i];
+    auto inst = page[i];
    for (const auto& entry : inst) {
      builder.AddBudget(entry.index, tid);
    }
  }
  builder.InitStorage();
-#pragma omp parallel for default(none) shared(batch_size, builder) schedule(static)
+#pragma omp parallel for default(none) shared(batch_size, builder, page) schedule(static)
  for (long i = 0; i < batch_size; ++i) {  // NOLINT(*)
    int tid = omp_get_thread_num();
-    auto inst = (*this)[i];
+    auto inst = page[i];
    for (const auto& entry : inst) {
      builder.Push(
          entry.index,
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -28,13 +28,12 @@ DMatrix* SimpleDMatrix::Slice(common::Span<int32_t const> ridxs) {
  auto out = new SimpleDMatrix;
  SparsePage& out_page = out->sparse_page_;
  for (auto const &page : this->GetBatches<SparsePage>()) {
-    page.data.HostVector();
-    page.offset.HostVector();
+    auto batch = page.GetView();
    auto& h_data = out_page.data.HostVector();
    auto& h_offset = out_page.offset.HostVector();
    size_t rptr{0};
    for (auto ridx : ridxs) {
-      auto inst = page[ridx];
+      auto inst = batch[ridx];
      rptr += inst.size();
      std::copy(inst.begin(), inst.end(), std::back_inserter(h_data));
      h_offset.emplace_back(rptr);
--- a/src/gbm/gblinear.cc
+++ b/src/gbm/gblinear.cc
@@ -173,9 +173,10 @@ class GBLinear : public GradientBooster {
    for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
      // parallel over local batch
      const auto nsize = static_cast<bst_omp_uint>(batch.Size());
+      auto page = batch.GetView();
 #pragma omp parallel for schedule(static)
      for (bst_omp_uint i = 0; i < nsize; ++i) {
-        auto inst = batch[i];
+        auto inst = page[i];
        auto row_idx = static_cast<size_t>(batch.base_rowid + i);
        // loop over output groups
        for (int gid = 0; gid < ngroup; ++gid) {
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -678,6 +678,7 @@ class Dart : public GBTree {
    CHECK_EQ(preds.size(), p_fmat->Info().num_row_ * num_group);
    // start collecting the prediction
    for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
+      auto page = batch.GetView();
      constexpr int kUnroll = 8;
      const auto nsize = static_cast<bst_omp_uint>(batch.Size());
      const bst_omp_uint rest = nsize % kUnroll;
@@ -692,7 +693,7 @@ class Dart : public GBTree {
            ridx[k] = static_cast<int64_t>(batch.base_rowid + i + k);
          }
          for (int k = 0; k < kUnroll; ++k) {
-            inst[k] = batch[i + k];
+            inst[k] = page[i + k];
          }
          for (int k = 0; k < kUnroll; ++k) {
            for (int gid = 0; gid < num_group; ++gid) {
@@ -707,7 +708,7 @@ class Dart : public GBTree {
      for (bst_omp_uint i = nsize - rest; i < nsize; ++i) {
        RegTree::FVec& feats = thread_temp_[0];
        const auto ridx = static_cast<int64_t>(batch.base_rowid + i);
-        const SparsePage::Inst inst = batch[i];
+        const SparsePage::Inst inst = page[i];
        for (int gid = 0; gid < num_group; ++gid) {
          const size_t offset = ridx * num_group + gid;
          preds[offset] +=
--- a/src/linear/coordinate_common.h
+++ b/src/linear/coordinate_common.h
@@ -82,7 +82,8 @@ inline std::pair<double, double> GetGradient(int group_idx, int num_group, int f
                                             DMatrix *p_fmat) {
  double sum_grad = 0.0, sum_hess = 0.0;
  for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
-    auto col = batch[fidx];
+    auto page = batch.GetView();
+    auto col = page[fidx];
    const auto ndata = static_cast<bst_omp_uint>(col.size());
    for (bst_omp_uint j = 0; j < ndata; ++j) {
      const bst_float v = col[j].fvalue;
@@ -111,7 +112,8 @@ inline std::pair<double, double> GetGradientParallel(int group_idx, int num_grou
                                                     DMatrix *p_fmat) {
  double sum_grad = 0.0, sum_hess = 0.0;
  for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
-    auto col = batch[fidx];
+    auto page = batch.GetView();
+    auto col = page[fidx];
    const auto ndata = static_cast<bst_omp_uint>(col.size());
 #pragma omp parallel for schedule(static) reduction(+ : sum_grad, sum_hess)
    for (bst_omp_uint j = 0; j < ndata; ++j) {
@@ -166,7 +168,8 @@ inline void UpdateResidualParallel(int fidx, int group_idx, int num_group,
                                   DMatrix *p_fmat) {
  if (dw == 0.0f) return;
  for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
-    auto col = batch[fidx];
+    auto page = batch.GetView();
+    auto col = page[fidx];
    // update grad value
    const auto num_row = static_cast<bst_omp_uint>(col.size());
 #pragma omp parallel for schedule(static)
@@ -334,9 +337,10 @@ class GreedyFeatureSelector : public FeatureSelector {
    // Calculate univariate gradient sums
    std::fill(gpair_sums_.begin(), gpair_sums_.end(), std::make_pair(0., 0.));
  for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
-      #pragma omp parallel for schedule(static)
+    auto page = batch.GetView();
+#pragma omp parallel for schedule(static)
      for (bst_omp_uint i = 0; i < nfeat; ++i) {
-        const auto col = batch[i];
+        const auto col = page[i];
        const bst_uint ndata = col.size();
        auto &sums = gpair_sums_[group_idx * nfeat + i];
        for (bst_uint j = 0u; j < ndata; ++j) {
@@ -399,10 +403,11 @@ class ThriftyFeatureSelector : public FeatureSelector {
    // Calculate univariate gradient sums
    std::fill(gpair_sums_.begin(), gpair_sums_.end(), std::make_pair(0., 0.));
    for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
-// column-parallel is usually faster than row-parallel
+      auto page = batch.GetView();
+      // column-parallel is usually faster than row-parallel
 #pragma omp parallel for schedule(static)
      for (bst_omp_uint i = 0; i < nfeat; ++i) {
-        const auto col = batch[i];
+        const auto col = page[i];
        const bst_uint ndata = col.size();
        for (bst_uint gid = 0u; gid < ngroup; ++gid) {
          auto &sums = gpair_sums_[gid * nfeat + i];
--- a/src/linear/updater_gpu_coordinate.cu
+++ b/src/linear/updater_gpu_coordinate.cu
@@ -60,6 +60,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT

    CHECK(p_fmat->SingleColBlock());
    SparsePage const& batch = *(p_fmat->GetBatches<CSCPage>().begin());
+    auto page = batch.GetView();

    if (IsEmpty()) {
      return;
@@ -72,7 +73,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
    row_ptr_ = {0};
    // iterate through columns
    for (size_t fidx = 0; fidx < batch.Size(); fidx++) {
-      common::Span<Entry const> col = batch[fidx];
+      common::Span<Entry const> col = page[fidx];
      auto cmp = [](Entry e1, Entry e2) {
        return e1.index < e2.index;
      };
@@ -89,7 +90,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
    data_.resize(row_ptr_.back());
    gpair_.resize(num_row_ * model_param.num_output_group);
    for (size_t fidx = 0; fidx < batch.Size(); fidx++) {
-      auto col = batch[fidx];
+      auto col = page[fidx];
      auto seg = column_segments[fidx];
      dh::safe_cuda(cudaMemcpy(
          data_.data().get() + row_ptr_[fidx],
--- a/src/linear/updater_shotgun.cc
+++ b/src/linear/updater_shotgun.cc
@@ -52,6 +52,7 @@ class ShotgunUpdater : public LinearUpdater {
    selector_->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
                     param_.reg_alpha_denorm, param_.reg_lambda_denorm, 0);
    for (const auto &batch : p_fmat->GetBatches<CSCPage>()) {
+      auto page = batch.GetView();
      const auto nfeat = static_cast<bst_omp_uint>(batch.Size());
 #pragma omp parallel for schedule(static)
      for (bst_omp_uint i = 0; i < nfeat; ++i) {
@@ -60,7 +61,7 @@ class ShotgunUpdater : public LinearUpdater {
           param_.reg_lambda_denorm);
        if (ii < 0) continue;
        const bst_uint fid = ii;
-        auto col = batch[ii];
+        auto col = page[ii];
        for (int gid = 0; gid < ngroup; ++gid) {
          double sum_grad = 0.0, sum_hess = 0.0;
          for (auto& c : col) {
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -360,18 +360,19 @@ class CPUPredictor : public Predictor {
    // start collecting the prediction
    for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
      // parallel over local batch
+      auto page = batch.GetView();
      const auto nsize = static_cast<bst_omp_uint>(batch.Size());
 #pragma omp parallel for schedule(static)
      for (bst_omp_uint i = 0; i < nsize; ++i) {
        const int tid = omp_get_thread_num();
        auto ridx = static_cast<size_t>(batch.base_rowid + i);
        RegTree::FVec &feats = thread_temp_[tid];
-        feats.Fill(batch[i]);
+        feats.Fill(page[i]);
        for (unsigned j = 0; j < ntree_limit; ++j) {
          int tid = model.trees[j]->GetLeafIndex(feats);
          preds[ridx * ntree_limit + j] = static_cast<bst_float>(tid);
        }
-        feats.Drop(batch[i]);
+        feats.Drop(page[i]);
      }
    }
  }
@@ -407,6 +408,7 @@ class CPUPredictor : public Predictor {
    const std::vector<bst_float>& base_margin = info.base_margin_.HostVector();
    // start collecting the contributions
    for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
+      auto page = batch.GetView();
      // parallel over local batch
      const auto nsize = static_cast<bst_omp_uint>(batch.Size());
 #pragma omp parallel for schedule(static)
@@ -417,7 +419,7 @@ class CPUPredictor : public Predictor {
        // loop over all classes
        for (int gid = 0; gid < ngroup; ++gid) {
          bst_float* p_contribs = &contribs[(row_idx * ngroup + gid) * ncolumns];
-          feats.Fill(batch[i]);
+          feats.Fill(page[i]);
          // calculate contributions
          for (unsigned j = 0; j < ntree_limit; ++j) {
            std::fill(this_tree_contribs.begin(), this_tree_contribs.end(), 0);
@@ -435,7 +437,7 @@ class CPUPredictor : public Predictor {
                    (tree_weights == nullptr ? 1 : (*tree_weights)[j]);
            }
          }
-          feats.Drop(batch[i]);
+          feats.Drop(page[i]);
          // add base margin to BIAS
          if (base_margin.size() != 0) {
            p_contribs[ncolumns - 1] += base_margin[row_idx * ngroup + gid];
--- a/src/tree/updater_basemaker-inl.h
+++ b/src/tree/updater_basemaker-inl.h
@@ -59,8 +59,9 @@ class BaseMaker: public TreeUpdater {
                -std::numeric_limits<bst_float>::max());
      // start accumulating statistics
      for (const auto &batch : p_fmat->GetBatches<SortedCSCPage>()) {
+        auto page = batch.GetView();
        for (bst_uint fid = 0; fid < batch.Size(); ++fid) {
-          auto c = batch[fid];
+          auto c = page[fid];
          if (c.size() != 0) {
            CHECK_LT(fid * 2, fminmax_.size());
            fminmax_[fid * 2 + 0] =
@@ -249,8 +250,9 @@ class BaseMaker: public TreeUpdater {
  inline void CorrectNonDefaultPositionByBatch(
      const SparsePage &batch, const std::vector<bst_uint> &sorted_split_set,
      const RegTree &tree) {
+    auto page = batch.GetView();
    for (size_t fid = 0; fid < batch.Size(); ++fid) {
-      auto col = batch[fid];
+      auto col = page[fid];
      auto it = std::lower_bound(sorted_split_set.begin(), sorted_split_set.end(), fid);

      if (it != sorted_split_set.end() && *it == fid) {
@@ -308,10 +310,11 @@ class BaseMaker: public TreeUpdater {
    std::vector<unsigned> fsplits;
    this->GetSplitSet(nodes, tree, &fsplits);
    for (const auto &batch : p_fmat->GetBatches<SortedCSCPage>()) {
+      auto page = batch.GetView();
      for (auto fid : fsplits) {
-        auto col = batch[fid];
+        auto col = page[fid];
        const auto ndata = static_cast<bst_omp_uint>(col.size());
-        #pragma omp parallel for schedule(static)
+#pragma omp parallel for schedule(static)
        for (bst_omp_uint j = 0; j < ndata; ++j) {
          const bst_uint ridx = col[j].index;
          const bst_float fvalue = col[j].fvalue;
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -77,8 +77,9 @@ class ColMaker: public TreeUpdater {
    if (column_densities_.empty()) {
      std::vector<size_t> column_size(dmat->Info().num_col_);
      for (const auto &batch : dmat->GetBatches<SortedCSCPage>()) {
+        auto page = batch.GetView();
        for (auto i = 0u; i < batch.Size(); i++) {
-          column_size[i] += batch[i].size();
+          column_size[i] += page[i].size();
        }
      }
      column_densities_.resize(column_size.size());
@@ -447,13 +448,14 @@ class ColMaker: public TreeUpdater {
 #endif  // defined(_OPENMP)
      {
        dmlc::OMPException omp_handler;
+        auto page = batch.GetView();
 #pragma omp parallel for schedule(dynamic, batch_size)
        for (bst_omp_uint i = 0; i < num_features; ++i) {
          omp_handler.Run([&]() {
            auto evaluator = tree_evaluator_.GetEvaluator();
            bst_feature_t const fid = feat_set[i];
            int32_t const tid = omp_get_thread_num();
-            auto c = batch[fid];
+            auto c = page[fid];
            const bool ind =
                c.size() != 0 && c[0].fvalue == c[c.size() - 1].fvalue;
            if (colmaker_train_param_.NeedForwardSearch(
@@ -562,8 +564,9 @@ class ColMaker: public TreeUpdater {
      std::sort(fsplits.begin(), fsplits.end());
      fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
      for (const auto &batch : p_fmat->GetBatches<SortedCSCPage>()) {
+        auto page = batch.GetView();
        for (auto fid : fsplits) {
-          auto col = batch[fid];
+          auto col = page[fid];
          const auto ndata = static_cast<bst_omp_uint>(col.size());
 #pragma omp parallel for schedule(static)
          for (bst_omp_uint j = 0; j < ndata; ++j) {
--- a/src/tree/updater_histmaker.cc
+++ b/src/tree/updater_histmaker.cc
@@ -338,6 +338,7 @@ class CQHistMaker: public HistMaker {
      thread_hist_.resize(omp_get_max_threads());
      // start accumulating statistics
      for (const auto &batch : p_fmat->GetBatches<SortedCSCPage>()) {
+        auto page = batch.GetView();
        // start enumeration
        const auto nsize = static_cast<bst_omp_uint>(fset.size());
 #pragma omp parallel for schedule(dynamic, 1)
@@ -345,7 +346,7 @@ class CQHistMaker: public HistMaker {
          int fid = fset[i];
          int offset = feat2workindex_[fid];
          if (offset >= 0) {
-            this->UpdateHistCol(gpair, batch[fid], info, tree,
+            this->UpdateHistCol(gpair, page[fid], info, tree,
                                fset, offset,
                                &thread_hist_[omp_get_thread_num()]);
          }
@@ -413,15 +414,15 @@ class CQHistMaker: public HistMaker {
      for (const auto &batch : p_fmat->GetBatches<SortedCSCPage>()) {
        // TWOPASS: use the real set + split set in the column iteration.
        this->CorrectNonDefaultPositionByBatch(batch, fsplit_set_, tree);
-
+        auto page = batch.GetView();
        // start enumeration
        const auto nsize = static_cast<bst_omp_uint>(work_set_.size());
-        #pragma omp parallel for schedule(dynamic, 1)
+#pragma omp parallel for schedule(dynamic, 1)
        for (bst_omp_uint i = 0; i < nsize; ++i) {
          int fid = work_set_[i];
          int offset = feat2workindex_[fid];
          if (offset >= 0) {
-            this->UpdateSketchCol(gpair, batch[fid], tree,
+            this->UpdateSketchCol(gpair, page[fid], tree,
                                  work_set_size, offset,
                                  &thread_sketch_[omp_get_thread_num()]);
          }
@@ -696,6 +697,7 @@ class GlobalProposalHistMaker: public CQHistMaker {
      for (const auto &batch : p_fmat->GetBatches<SortedCSCPage>()) {
        // TWOPASS: use the real set + split set in the column iteration.
        this->CorrectNonDefaultPositionByBatch(batch, this->fsplit_set_, tree);
+        auto page = batch.GetView();

        // start enumeration
        const auto nsize = static_cast<bst_omp_uint>(this->work_set_.size());
@@ -704,7 +706,7 @@ class GlobalProposalHistMaker: public CQHistMaker {
          int fid = this->work_set_[i];
          int offset = this->feat2workindex_[fid];
          if (offset >= 0) {
-            this->UpdateHistCol(gpair, batch[fid], info, tree,
+            this->UpdateHistCol(gpair, page[fid], info, tree,
                                fset, offset,
                                &this->thread_hist_[omp_get_thread_num()]);
          }
--- a/src/tree/updater_refresh.cc
+++ b/src/tree/updater_refresh.cc
@@ -69,11 +69,12 @@ class TreeRefresher: public TreeUpdater {
      const MetaInfo &info = p_fmat->Info();
      // start accumulating statistics
      for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
+        auto page = batch.GetView();
        CHECK_LT(batch.Size(), std::numeric_limits<unsigned>::max());
        const auto nbatch = static_cast<bst_omp_uint>(batch.Size());
-        #pragma omp parallel for schedule(static)
+#pragma omp parallel for schedule(static)
        for (bst_omp_uint i = 0; i < nbatch; ++i) {
-          SparsePage::Inst inst = batch[i];
+          SparsePage::Inst inst = page[i];
          const int tid = omp_get_thread_num();
          const auto ridx = static_cast<bst_uint>(batch.base_rowid + i);
          RegTree::FVec &feats = fvec_temp[tid];