Modify caching allocator/vector and fix issues relating to the inability to train large datasets (#4615)

This commit is contained in:
sriramch
2019-07-08 23:33:27 -07:00
committed by Rory Mitchell
parent cd1526d3b1
commit 7a388cbf8b
5 changed files with 22 additions and 18 deletions

View File

@@ -611,8 +611,6 @@ struct DeviceShard {
/*! \brief Sum gradient for each node. */
std::vector<GradientPair> node_sum_gradients;
common::Span<GradientPair> node_sum_gradients_d;
/*! \brief On-device feature set, only actually used on one of the devices */
dh::device_vector<int> feature_set_d;
/*! The row offset for this shard. */
bst_uint row_begin_idx;
bst_uint row_end_idx;
@@ -700,6 +698,7 @@ struct DeviceShard {
this->interaction_constraints.Reset();
std::fill(node_sum_gradients.begin(), node_sum_gradients.end(),
GradientPair());
row_partitioner.reset(); // Release the device memory first before reallocating
row_partitioner.reset(new RowPartitioner(device_id, n_rows));
dh::safe_cuda(cudaMemcpyAsync(
@@ -921,6 +920,7 @@ struct DeviceShard {
dh::safe_cuda(cudaMemcpy(
out_preds_d, prediction_cache.data(),
prediction_cache.size() * sizeof(bst_float), cudaMemcpyDefault));
row_partitioner.reset();
}
void AllReduceHist(int nidx, dh::AllReducer* reducer) {