Replaced std::vector with HostDeviceVector in MetaInfo and SparsePage. (#3446)

* Replaced std::vector with HostDeviceVector in MetaInfo and SparsePage.

- added distributions to HostDeviceVector
- using HostDeviceVector for labels, weights and base margins in MetaInfo
- using HostDeviceVector for offset and data in SparsePage
- other necessary refactoring

* Added const version of HostDeviceVector API calls.

- const versions added to calls that can trigger data transfers, e.g. DevicePointer()
- updated the code that uses HostDeviceVector
- objective functions now accept const HostDeviceVector<bst_float>& for predictions

* Updated src/linear/updater_gpu_coordinate.cu.

* Added read-only state for HostDeviceVector sync.

- this means no copies are performed if both the host and the devices access
  the HostDeviceVector read-only

* Fixed linter and test errors.

- updated the lz4 plugin
- added ConstDeviceSpan to HostDeviceVector
- using device % dh::NVisibleDevices() for the physical device number,
  e.g. in calls to cudaSetDevice()

* Fixed explicit template instantiation errors for HostDeviceVector.

- replaced HostDeviceVector<unsigned int> with HostDeviceVector<int>

* Fixed HostDeviceVector tests that require multiple GPUs.

- added a mock set device handler; when set, it is called instead of cudaSetDevice()
This commit is contained in:
Andy Adinets
2018-08-30 04:28:47 +02:00
committed by Rory Mitchell
parent 58d783df16
commit 72cd1517d6
45 changed files with 1141 additions and 560 deletions

View File

@@ -41,7 +41,7 @@ class ColMaker: public TreeUpdater {
Builder builder(
param_,
std::unique_ptr<SplitEvaluator>(spliteval_->GetHostClone()));
builder.Update(gpair->HostVector(), dmat, tree);
builder.Update(gpair->ConstHostVector(), dmat, tree);
}
param_.learning_rate = lr;
}
@@ -784,7 +784,7 @@ class DistColMaker : public ColMaker {
param_,
std::unique_ptr<SplitEvaluator>(spliteval_->GetHostClone()));
// build the tree
builder.Update(gpair->HostVector(), dmat, trees[0]);
builder.Update(gpair->ConstHostVector(), dmat, trees[0]);
//// prune the tree, note that pruner will sync the tree
pruner_->Update(gpair, dmat, trees);
// update position after the tree is pruned

View File

@@ -164,7 +164,7 @@ class FastHistMaker: public TreeUpdater {
double time_evaluate_split = 0;
double time_apply_split = 0;
std::vector<GradientPair>& gpair_h = gpair->HostVector();
const std::vector<GradientPair>& gpair_h = gpair->ConstHostVector();
spliteval_->Reset();

View File

@@ -650,7 +650,7 @@ class GPUMaker : public TreeUpdater {
void convertToCsc(DMatrix* dmat, std::vector<float>* fval,
std::vector<int>* fId, std::vector<size_t>* offset) {
MetaInfo info = dmat->Info();
const MetaInfo& info = dmat->Info();
CHECK(info.num_col_ < std::numeric_limits<int>::max());
CHECK(info.num_row_ < std::numeric_limits<int>::max());
nRows = static_cast<int>(info.num_row_);

View File

@@ -387,11 +387,13 @@ struct DeviceShard {
void InitRowPtrs(const SparsePage& row_batch) {
dh::safe_cuda(cudaSetDevice(device_idx));
const auto& offset_vec = row_batch.offset.HostVector();
row_ptrs.resize(n_rows + 1);
thrust::copy(row_batch.offset.data() + row_begin_idx,
row_batch.offset.data() + row_end_idx + 1,
thrust::copy(offset_vec.data() + row_begin_idx,
offset_vec.data() + row_end_idx + 1,
row_ptrs.begin());
auto row_iter = row_ptrs.begin();
// find the maximum row size
auto get_size = [=] __device__(size_t row) {
return row_iter[row + 1] - row_iter[row];
}; // NOLINT
@@ -432,9 +434,12 @@ struct DeviceShard {
(dh::TotalMemory(device_idx) / (16 * row_stride * sizeof(Entry)),
static_cast<size_t>(n_rows));
thrust::device_vector<Entry> entries_d(gpu_batch_nrows * row_stride);
const auto& offset_vec = row_batch.offset.HostVector();
const auto& data_vec = row_batch.data.HostVector();
thrust::device_vector<Entry> entries_d(gpu_batch_nrows * row_stride);
size_t gpu_nbatches = dh::DivRoundUp(n_rows, gpu_batch_nrows);
for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
size_t batch_row_begin = gpu_batch * gpu_batch_nrows;
size_t batch_row_end = (gpu_batch + 1) * gpu_batch_nrows;
@@ -443,12 +448,12 @@ struct DeviceShard {
}
size_t batch_nrows = batch_row_end - batch_row_begin;
size_t n_entries =
row_batch.offset[row_begin_idx + batch_row_end] -
row_batch.offset[row_begin_idx + batch_row_begin];
offset_vec[row_begin_idx + batch_row_end] -
offset_vec[row_begin_idx + batch_row_begin];
dh::safe_cuda
(cudaMemcpy
(entries_d.data().get(),
&row_batch.data[row_batch.offset[row_begin_idx + batch_row_begin]],
data_vec.data() + offset_vec[row_begin_idx + batch_row_begin],
n_entries * sizeof(Entry), cudaMemcpyDefault));
dim3 block3(32, 8, 1);
dim3 grid3(dh::DivRoundUp(n_rows, block3.x),
@@ -458,7 +463,7 @@ struct DeviceShard {
row_ptrs.data().get() + batch_row_begin,
entries_d.data().get(), cuts_d.data().get(), cut_row_ptrs_d.data().get(),
batch_row_begin, batch_nrows,
row_batch.offset[row_begin_idx + batch_row_begin],
offset_vec[row_begin_idx + batch_row_begin],
row_stride, null_gidx_value);
dh::safe_cuda(cudaGetLastError());
@@ -538,7 +543,7 @@ struct DeviceShard {
std::fill(ridx_segments.begin(), ridx_segments.end(), Segment(0, 0));
ridx_segments.front() = Segment(0, ridx.Size());
this->gpair.copy(dh_gpair->tbegin(device_idx), dh_gpair->tend(device_idx));
this->gpair.copy(dh_gpair->tcbegin(device_idx), dh_gpair->tcend(device_idx));
SubsampleGradientPair(&gpair, param.subsample, row_begin_idx);
hist.Reset();
}

View File

@@ -30,7 +30,7 @@ class HistMaker: public BaseMaker {
param_.learning_rate = lr / trees.size();
// build tree
for (auto tree : trees) {
this->Update(gpair->HostVector(), p_fmat, tree);
this->Update(gpair->ConstHostVector(), p_fmat, tree);
}
param_.learning_rate = lr;
}

View File

@@ -29,7 +29,7 @@ class TreeRefresher: public TreeUpdater {
DMatrix *p_fmat,
const std::vector<RegTree*> &trees) override {
if (trees.size() == 0) return;
std::vector<GradientPair> &gpair_h = gpair->HostVector();
const std::vector<GradientPair> &gpair_h = gpair->ConstHostVector();
// number of threads
// thread temporal space
std::vector<std::vector<TStats> > stemp;

View File

@@ -30,7 +30,7 @@ class SketchMaker: public BaseMaker {
param_.learning_rate = lr / trees.size();
// build tree
for (auto tree : trees) {
this->Update(gpair->HostVector(), p_fmat, tree);
this->Update(gpair->ConstHostVector(), p_fmat, tree);
}
param_.learning_rate = lr;
}