Batch UpdatePosition using cudaMemcpy (#7964)

2022-06-30 17:52:40 +02:00
parent 2407381c3d
commit bc4f802b17
5 changed files with 441 additions and 516 deletions
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -182,10 +182,11 @@ struct GPUHistMakerDevice {
  std::unique_ptr<RowPartitioner> row_partitioner;
  DeviceHistogramStorage<GradientSumT> hist{};

-  dh::caching_device_vector<GradientPair> d_gpair;  // storage for gpair;
+  dh::device_vector<GradientPair> d_gpair;  // storage for gpair;
  common::Span<GradientPair> gpair;

-  dh::caching_device_vector<int> monotone_constraints;
+  dh::device_vector<int> monotone_constraints;
+  dh::device_vector<float> update_predictions;

  /*! \brief Sum gradient for each node. */
  std::vector<GradientPairPrecise> node_sum_gradients;
@@ -356,36 +357,49 @@ struct GPUHistMakerDevice {
    return true;
  }

-  void UpdatePosition(const GPUExpandEntry &e, RegTree* p_tree) {
-    RegTree::Node split_node = (*p_tree)[e.nid];
-    auto split_type = p_tree->NodeSplitType(e.nid);
-    auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
-    auto node_cats = e.split.split_cats.Bits();
+  // Extra data for each node that is passed
+  // to the update position function
+  struct NodeSplitData {
+    RegTree::Node split_node;
+    FeatureType split_type;
+    common::CatBitField node_cats;
+  };

-    row_partitioner->UpdatePosition(
-        e.nid, split_node.LeftChild(), split_node.RightChild(),
-        [=] __device__(bst_uint ridx) {
+  void UpdatePosition(const std::vector<GPUExpandEntry>& candidates, RegTree* p_tree) {
+    if (candidates.empty()) return;
+    std::vector<int> nidx(candidates.size());
+    std::vector<int> left_nidx(candidates.size());
+    std::vector<int> right_nidx(candidates.size());
+    std::vector<NodeSplitData> split_data(candidates.size());
+    for (int i = 0; i < candidates.size(); i++) {
+      auto& e = candidates[i];
+      RegTree::Node split_node = (*p_tree)[e.nid];
+      auto split_type = p_tree->NodeSplitType(e.nid);
+      nidx.at(i) = e.nid;
+      left_nidx.at(i) = split_node.LeftChild();
+      right_nidx.at(i) = split_node.RightChild();
+      split_data.at(i) = NodeSplitData{split_node, split_type, e.split.split_cats};
+    }
+
+    auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
+    row_partitioner->UpdatePositionBatch(
+        nidx, left_nidx, right_nidx, split_data,
+        [=] __device__(bst_uint ridx, const NodeSplitData& data) {
-          // given a row index, returns the node id it belongs to
+          // given a row index, returns whether the row goes to the left child
-          bst_float cut_value =
-              d_matrix.GetFvalue(ridx, split_node.SplitIndex());
+          bst_float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex());
          // Missing value
-          bst_node_t new_position = 0;
+          bool go_left = true;
          if (isnan(cut_value)) {
-            new_position = split_node.DefaultChild();
+            go_left = data.split_node.DefaultLeft();
          } else {
-            bool go_left = true;
-            if (split_type == FeatureType::kCategorical) {
-              go_left = common::Decision<false>(node_cats, cut_value, split_node.DefaultLeft());
+            if (data.split_type == FeatureType::kCategorical) {
+              go_left = common::Decision<false>(data.node_cats.Bits(), cut_value,
+                                                data.split_node.DefaultLeft());
            } else {
-              go_left = cut_value <= split_node.SplitCond();
-            }
-            if (go_left) {
-              new_position = split_node.LeftChild();
-            } else {
-              new_position = split_node.RightChild();
+              go_left = cut_value <= data.split_node.SplitCond();
            }
          }
-          return new_position;
+          return go_left;
        });
  }

@@ -394,6 +408,16 @@ struct GPUHistMakerDevice {
  // prediction cache
  void FinalisePosition(RegTree const* p_tree, DMatrix* p_fmat, ObjInfo task,
                        HostDeviceVector<bst_node_t>* p_out_position) {
+    // Prediction cache will not be used with external memory
+    if (!p_fmat->SingleColBlock()) {
+      if (task.UpdateTreeLeaf()) {
+        LOG(FATAL) << "Current objective function can not be used with external memory.";
+      }
+      p_out_position->Resize(0);
+      update_predictions.clear();
+      return;
+    }
+
    dh::TemporaryArray<RegTree::Node> d_nodes(p_tree->GetNodes().size());
    dh::safe_cuda(cudaMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(),
                                  d_nodes.size() * sizeof(RegTree::Node),
@@ -412,25 +436,9 @@ struct GPUHistMakerDevice {
      dh::CopyToD(categories_segments, &d_categories_segments);
    }

-    if (row_partitioner->GetRows().size() != p_fmat->Info().num_row_) {
-      row_partitioner.reset();  // Release the device memory first before reallocating
-      row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, p_fmat->Info().num_row_));
-    }
-    if (task.UpdateTreeLeaf() && !p_fmat->SingleColBlock() && param.subsample != 1.0) {
-      // see comment in the `FinalisePositionInPage`.
-      LOG(FATAL) << "Current objective function can not be used with subsampled external memory.";
-    }
-    if (page->n_rows == p_fmat->Info().num_row_) {
-      FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types),
-                             dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), task,
-                             p_out_position);
-    } else {
-      for (auto const& batch : p_fmat->GetBatches<EllpackPage>(batch_param)) {
-        FinalisePositionInPage(batch.Impl(), dh::ToSpan(d_nodes), dh::ToSpan(d_split_types),
-                               dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), task,
-                               p_out_position);
-      }
-    }
+    FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types),
+                           dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments),
+                           p_out_position);
  }

  void FinalisePositionInPage(EllpackPageImpl const *page,
@@ -438,79 +446,73 @@ struct GPUHistMakerDevice {
                              common::Span<FeatureType const> d_feature_types,
                              common::Span<uint32_t const> categories,
                              common::Span<RegTree::Segment> categories_segments,
-                              ObjInfo task,
                              HostDeviceVector<bst_node_t>* p_out_position) {
    auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
    auto d_gpair = this->gpair;
-    row_partitioner->FinalisePosition(
-        ctx_, task, p_out_position,
-        [=] __device__(size_t row_id, int position) {
-          // What happens if user prune the tree?
-          if (!d_matrix.IsInRange(row_id)) {
-            return RowPartitioner::kIgnoredTreePosition;
-          }
-          auto node = d_nodes[position];
+    update_predictions.resize(row_partitioner->GetRows().size());
+    auto d_update_predictions = dh::ToSpan(update_predictions);
+    p_out_position->SetDevice(ctx_->gpu_id);
+    p_out_position->Resize(row_partitioner->GetRows().size());

-          while (!node.IsLeaf()) {
-            bst_float element = d_matrix.GetFvalue(row_id, node.SplitIndex());
-            // Missing value
-            if (isnan(element)) {
-              position = node.DefaultChild();
-            } else {
-              bool go_left = true;
-              if (common::IsCat(d_feature_types, position)) {
-                auto node_cats =
-                    categories.subspan(categories_segments[position].beg,
-                                       categories_segments[position].size);
-                go_left = common::Decision<false>(node_cats, element, node.DefaultLeft());
-              } else {
-                go_left = element <= node.SplitCond();
-              }
-              if (go_left) {
-                position = node.LeftChild();
-              } else {
-                position = node.RightChild();
-              }
-            }
-            node = d_nodes[position];
-          }
+    auto new_position_op = [=] __device__(size_t row_id, int position) {
+      // What happens if the user prunes the tree?
+      if (!d_matrix.IsInRange(row_id)) {
+        return RowPartitioner::kIgnoredTreePosition;
+      }
+      auto node = d_nodes[position];

-          return position;
-        },
-        [d_gpair] __device__(size_t ridx) {
-          // FIXME(jiamingy): Doesn't work when sampling is used with external memory as
-          // the sampler compacts the gradient vector.
-          return d_gpair[ridx].GetHess() - .0f == 0.f;
-        });
+      while (!node.IsLeaf()) {
+        bst_float element = d_matrix.GetFvalue(row_id, node.SplitIndex());
+        // Missing value
+        if (isnan(element)) {
+          position = node.DefaultChild();
+        } else {
+          bool go_left = true;
+          if (common::IsCat(d_feature_types, position)) {
+            auto node_cats = categories.subspan(categories_segments[position].beg,
+                                                categories_segments[position].size);
+            go_left = common::Decision<false>(node_cats, element, node.DefaultLeft());
+          } else {
+            go_left = element <= node.SplitCond();
+          }
+          if (go_left) {
+            position = node.LeftChild();
+          } else {
+            position = node.RightChild();
+          }
+        }
+
+        node = d_nodes[position];
+      }
+
+      d_update_predictions[row_id] = node.LeafValue();
+      return position;
+    };  // NOLINT
+
+    auto d_out_position = p_out_position->DeviceSpan();
+    row_partitioner->FinalisePosition(d_out_position, new_position_op);
+
+    dh::LaunchN(row_partitioner->GetRows().size(), [=] __device__(size_t idx) {
+      bst_node_t position = d_out_position[idx];
+      d_update_predictions[idx] = d_nodes[position].LeafValue();
+      bool is_row_sampled = d_gpair[idx].GetHess() - .0f == 0.f;
+      d_out_position[idx] = is_row_sampled ? ~position : position;
+    });
  }

-  void UpdatePredictionCache(linalg::VectorView<float> out_preds_d, RegTree const* p_tree) {
+  bool UpdatePredictionCache(linalg::VectorView<float> out_preds_d, RegTree const* p_tree) {
+    if (update_predictions.empty()) {
+      return false;
+    }
    CHECK(p_tree);
    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
    CHECK_EQ(out_preds_d.DeviceIdx(), ctx_->gpu_id);
-    auto d_ridx = row_partitioner->GetRows();
-
-    GPUTrainingParam param_d(param);
-    dh::TemporaryArray<GradientPairPrecise> device_node_sum_gradients(node_sum_gradients.size());
-
-    dh::safe_cuda(cudaMemcpyAsync(device_node_sum_gradients.data().get(), node_sum_gradients.data(),
-                                  sizeof(GradientPairPrecise) * node_sum_gradients.size(),
-                                  cudaMemcpyHostToDevice));
-    auto d_position = row_partitioner->GetPosition();
-    auto d_node_sum_gradients = device_node_sum_gradients.data().get();
-    auto tree_evaluator = evaluator_.GetEvaluator();
-
-    auto const& h_nodes = p_tree->GetNodes();
-    dh::caching_device_vector<RegTree::Node> nodes(h_nodes.size());
-    dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(),
-                                  h_nodes.size() * sizeof(RegTree::Node), cudaMemcpyHostToDevice));
-    auto d_nodes = dh::ToSpan(nodes);
-    dh::LaunchN(d_ridx.size(), [=] XGBOOST_DEVICE(size_t idx) mutable {
-      bst_node_t nidx = d_position[idx];
-      auto weight = d_nodes[nidx].LeafValue();
-      out_preds_d(d_ridx[idx]) += weight;
+    auto d_update_predictions = dh::ToSpan(update_predictions);
+    CHECK_EQ(out_preds_d.Size(), d_update_predictions.size());
+    dh::LaunchN(out_preds_d.Size(), [=] XGBOOST_DEVICE(size_t idx) mutable {
+      out_preds_d(idx) += d_update_predictions[idx];
    });
-    row_partitioner.reset();
+    return true;
  }

  // num histograms is the number of contiguous histograms in memory to reduce over
@@ -684,14 +686,12 @@ struct GPUHistMakerDevice {
      auto new_candidates =
          pinned.GetSpan<GPUExpandEntry>(filtered_expand_set.size() * 2, GPUExpandEntry());

-      for (const auto& e : filtered_expand_set) {
-        monitor.Start("UpdatePosition");
-        // Update position is only run when child is valid, instead of right after apply
-        // split (as in approx tree method).  Hense we have the finalise position call
-        // in GPU Hist.
-        this->UpdatePosition(e, p_tree);
-        monitor.Stop("UpdatePosition");
-      }
+      monitor.Start("UpdatePosition");
+      // Update position is only run when child is valid, instead of right after apply
+      // split (as in approx tree method).  Hence we have the finalise position call
+      // in GPU Hist.
+      this->UpdatePosition(filtered_expand_set, p_tree);
+      monitor.Stop("UpdatePosition");

      monitor.Start("BuildHist");
      this->BuildHistLeftRight(filtered_expand_set, reducer, tree);
@@ -844,9 +844,9 @@ class GPUHistMaker : public TreeUpdater {
      return false;
    }
    monitor_.Start("UpdatePredictionCache");
-    maker->UpdatePredictionCache(p_out_preds, p_last_tree_);
+    bool result = maker->UpdatePredictionCache(p_out_preds, p_last_tree_);
    monitor_.Stop("UpdatePredictionCache");
-    return true;
+    return result;
  }

  TrainParam param_;  // NOLINT