Add experimental GPU algorithm for lossguided mode (#2755)

- Improved GPU algorithm unit tests
- Removed some thrust code to improve compile times
Rory Mitchell
2017-10-01 00:18:35 +13:00
committed by GitHub
parent 69c3b78a29
commit 4cb2f7598b
14 changed files with 1291 additions and 593 deletions


@@ -43,12 +43,14 @@ struct DeviceGMat {
gidx = common::CompressedIterator<uint32_t>(gidx_buffer.data(), n_bins);
// row_ptr
-    thrust::copy(gmat.row_ptr.data() + row_begin,
-                 gmat.row_ptr.data() + row_end + 1, row_ptr.tbegin());
+    dh::safe_cuda(cudaMemcpy(row_ptr.data(), gmat.row_ptr.data() + row_begin,
+                             row_ptr.size() * sizeof(size_t),
+                             cudaMemcpyHostToDevice));
// normalise row_ptr
size_t start = gmat.row_ptr[row_begin];
-    thrust::transform(row_ptr.tbegin(), row_ptr.tend(), row_ptr.tbegin(),
-                      [=] __device__(size_t val) { return val - start; });
+    auto d_row_ptr = row_ptr.data();
+    dh::launch_n(row_ptr.device_idx(), row_ptr.size(),
+                 [=] __device__(size_t idx) { d_row_ptr[idx] -= start; });
}
};
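The hunk above swaps thrust::copy and thrust::transform for an explicit cudaMemcpy and the repo's dh::launch_n helper, which runs a device lambda over n indices. A minimal sketch of what such a helper can look like (an assumption: the real dh::launch_n in device_helpers.cuh may differ, and device lambdas require nvcc's --expt-extended-lambda flag):

```cuda
#include <cuda_runtime.h>
#include <cstddef>

// Grid-stride kernel that applies a device lambda to every index in [0, n).
template <typename L>
__global__ void LaunchNKernel(size_t n, L lambda) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += static_cast<size_t>(gridDim.x) * blockDim.x) {
    lambda(i);
  }
}

// Sketch of a launch_n-style helper: select the device, size the grid,
// launch. The 256-thread block size is an assumption.
template <typename L>
void LaunchN(int device_idx, size_t n, L lambda) {
  if (n == 0) return;
  cudaSetDevice(device_idx);
  const int kBlockThreads = 256;
  const int grid_size =
      static_cast<int>((n + kBlockThreads - 1) / kBlockThreads);
  LaunchNKernel<<<grid_size, kBlockThreads>>>(n, lambda);
}
```

Compared with thrust::transform, a raw launch like this avoids instantiating heavyweight Thrust machinery in every translation unit, which is the compile-time win the commit message refers to.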
@@ -61,12 +63,15 @@ struct HistHelper {
__device__ void Add(bst_gpair gpair, int gidx, int nidx) const {
int hist_idx = nidx * n_bins + gidx;
-    auto dst_ptr = reinterpret_cast<unsigned long long int*>(&d_hist[hist_idx]);  // NOLINT
+    auto dst_ptr =
+        reinterpret_cast<unsigned long long int*>(&d_hist[hist_idx]);  // NOLINT
gpair_sum_t tmp(gpair.GetGrad(), gpair.GetHess());
auto src_ptr = reinterpret_cast<gpair_sum_t::value_t*>(&tmp);
-    atomicAdd(dst_ptr, static_cast<unsigned long long int>(*src_ptr));  // NOLINT
-    atomicAdd(dst_ptr + 1, static_cast<unsigned long long int>(*(src_ptr + 1)));  // NOLINT
+    atomicAdd(dst_ptr,
+              static_cast<unsigned long long int>(*src_ptr));  // NOLINT
+    atomicAdd(dst_ptr + 1,
+              static_cast<unsigned long long int>(*(src_ptr + 1)));  // NOLINT
}
__device__ gpair_sum_t Get(int gidx, int nidx) const {
return d_hist[nidx * n_bins + gidx];
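HistHelper::Add accumulates a two-component gradient pair with two 64-bit integer atomicAdds. That only yields correct sums if gpair_sum_t stores its gradient and hessian as 64-bit fixed-point integers, so that unsigned long long addition (with two's-complement wrap-around for negative values) is exact; the casts to gpair_sum_t::value_t strongly suggest this, but it is an inference. A sketch of the idea with a hypothetical GpairSum type and scale factor standing in for the real gpair_sum_t:

```cuda
#include <cstdint>

// Hypothetical fixed-point scale; the real gpair_sum_t defines its own.
constexpr double kFixedPointScale = 1e8;

struct GpairSum {
  int64_t grad;  // gradient, fixed point
  int64_t hess;  // hessian, fixed point
};

// Atomically accumulate a (grad, hess) pair into a histogram bin.
// Integer atomicAdd on each 64-bit half is exact and deterministic
// across runs, unlike floating-point atomics.
__device__ void AtomicAddGpair(GpairSum* dst, float grad, float hess) {
  GpairSum tmp{static_cast<int64_t>(grad * kFixedPointScale),
               static_cast<int64_t>(hess * kFixedPointScale)};
  auto* d = reinterpret_cast<unsigned long long*>(dst);
  auto* s = reinterpret_cast<unsigned long long*>(&tmp);
  atomicAdd(d, s[0]);      // grad component
  atomicAdd(d + 1, s[1]);  // hess component
}
```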
@@ -96,51 +101,10 @@ struct DeviceHist {
int LevelSize(int depth) { return n_bins * n_nodes_level(depth); }
};
-struct SplitCandidate {
-  float loss_chg;
-  bool missing_left;
-  float fvalue;
-  int findex;
-  gpair_sum_t left_sum;
-  gpair_sum_t right_sum;
-  __host__ __device__ SplitCandidate()
-      : loss_chg(-FLT_MAX), missing_left(true), fvalue(0), findex(-1) {}
-  __device__ void Update(float loss_chg_in, bool missing_left_in,
-                         float fvalue_in, int findex_in,
-                         gpair_sum_t left_sum_in, gpair_sum_t right_sum_in,
-                         const GPUTrainingParam& param) {
-    if (loss_chg_in > loss_chg &&
-        left_sum_in.GetHess() >= param.min_child_weight &&
-        right_sum_in.GetHess() >= param.min_child_weight) {
-      loss_chg = loss_chg_in;
-      missing_left = missing_left_in;
-      fvalue = fvalue_in;
-      left_sum = left_sum_in;
-      right_sum = right_sum_in;
-      findex = findex_in;
-    }
-  }
-  __device__ bool IsValid() const { return loss_chg > 0.0f; }
-};
-struct GpairCallbackOp {
-  // Running prefix
-  gpair_sum_t running_total;
-  // Constructor
-  __device__ GpairCallbackOp() : running_total(gpair_sum_t()) {}
-  __device__ bst_gpair operator()(bst_gpair block_aggregate) {
-    gpair_sum_t old_prefix = running_total;
-    running_total += block_aggregate;
-    return old_prefix;
-  }
-};
template <int BLOCK_THREADS>
__global__ void find_split_kernel(
const gpair_sum_t* d_level_hist, int* d_feature_segments, int depth,
-    int n_features, int n_bins, DeviceDenseNode* d_nodes,
+    int n_features, int n_bins, DeviceNodeStats* d_nodes,
int nodes_offset_device, float* d_fidx_min_map, float* d_gidx_fvalue_map,
GPUTrainingParam gpu_param, bool* d_left_child_smallest_temp,
bool colsample, int* d_feature_flags) {
@@ -156,15 +120,15 @@ __global__ void find_split_kernel(
typename SumReduceT::TempStorage sum_reduce;
};
-  __shared__ cub::Uninitialized<SplitCandidate> uninitialized_split;
-  SplitCandidate& split = uninitialized_split.Alias();
+  __shared__ cub::Uninitialized<DeviceSplitCandidate> uninitialized_split;
+  DeviceSplitCandidate& split = uninitialized_split.Alias();
__shared__ cub::Uninitialized<gpair_sum_t> uninitialized_sum;
gpair_sum_t& shared_sum = uninitialized_sum.Alias();
__shared__ ArgMaxT block_max;
__shared__ TempStorage temp_storage;
if (threadIdx.x == 0) {
-    split = SplitCandidate();
+    split = DeviceSplitCandidate();
}
__syncthreads();
@@ -197,7 +161,7 @@ __global__ void find_split_kernel(
}
// __syncthreads(); // no need to synch because below there is a Scan
-  GpairCallbackOp prefix_op = GpairCallbackOp();
+  auto prefix_op = SumCallbackOp<gpair_sum_t>();
for (int scan_begin = begin; scan_begin < end;
scan_begin += BLOCK_THREADS) {
bool thread_active = scan_begin + threadIdx.x < end;
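The deleted GpairCallbackOp (see the removed block above) is the standard CUB pattern for scanning data that spans multiple tiles: cub::BlockScan invokes the functor once per tile with that tile's aggregate, and the functor returns the running total as the prefix for the next tile. The replacement SumCallbackOp<gpair_sum_t> is presumably the same op generalised over the summed type, roughly:

```cuda
// Presumed shape of SumCallbackOp<T>: the removed GpairCallbackOp with the
// type made generic (the old op also mixed bst_gpair and gpair_sum_t in
// its signature, which a single template parameter cleans up).
template <typename T>
struct SumCallbackOp {
  T running_total{};  // prefix carried across tiles

  __device__ T operator()(T block_aggregate) {
    T old_prefix = running_total;
    running_total += block_aggregate;
    return old_prefix;  // exclusive prefix for the current tile
  }
};
```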
@@ -245,7 +209,8 @@ __global__ void find_split_kernel(
gpair_sum_t left = missing_left ? bin + missing : bin;
gpair_sum_t right = parent_sum - left;
-        split.Update(gain, missing_left, fvalue, fidx, left, right, gpu_param);
+        split.Update(gain, missing_left ? LeftDir : RightDir, fvalue, fidx,
+                     left, right, gpu_param);
}
__syncthreads();
} // end scan
@@ -253,17 +218,16 @@ __global__ void find_split_kernel(
// Create node
if (threadIdx.x == 0 && split.IsValid()) {
-    d_nodes[node_idx].SetSplit(split.fvalue, split.findex,
-                               split.missing_left ? LeftDir : RightDir);
+    d_nodes[node_idx].SetSplit(split);
-    DeviceDenseNode& left_child = d_nodes[left_child_nidx(node_idx)];
-    DeviceDenseNode& right_child = d_nodes[right_child_nidx(node_idx)];
+    DeviceNodeStats& left_child = d_nodes[left_child_nidx(node_idx)];
+    DeviceNodeStats& right_child = d_nodes[right_child_nidx(node_idx)];
bool& left_child_smallest = d_left_child_smallest_temp[node_idx];
    left_child =
-        DeviceDenseNode(split.left_sum, left_child_nidx(node_idx), gpu_param);
+        DeviceNodeStats(split.left_sum, left_child_nidx(node_idx), gpu_param);
    right_child =
-        DeviceDenseNode(split.right_sum, right_child_nidx(node_idx), gpu_param);
+        DeviceNodeStats(split.right_sum, right_child_nidx(node_idx), gpu_param);
// Record smallest node
if (split.left_sum.GetHess() <= split.right_sum.GetHess()) {
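SplitCandidate was deleted above in favour of a shared DeviceSplitCandidate, and its call sites now pass a DefaultDirection instead of a missing_left bool and hand the whole candidate to SetSplit. Reconstructing from the deleted struct and the new call sites, the shared type is presumably close to the following sketch (gpair_sum_t and GPUTrainingParam are as used in this file; the enum definition is assumed):

```cuda
#include <cfloat>

// Assumed from the call sites: direction the missing values take.
enum DefaultDirection { LeftDir = 0, RightDir };

// Presumed shape of DeviceSplitCandidate; the real definition lives in a
// shared header and may differ.
struct DeviceSplitCandidate {
  float loss_chg{-FLT_MAX};
  DefaultDirection dir{LeftDir};  // replaces the old missing_left flag
  float fvalue{0.0f};
  int findex{-1};
  gpair_sum_t left_sum;
  gpair_sum_t right_sum;

  __device__ void Update(float loss_chg_in, DefaultDirection dir_in,
                         float fvalue_in, int findex_in,
                         gpair_sum_t left_sum_in, gpair_sum_t right_sum_in,
                         const GPUTrainingParam& param) {
    // Keep the best candidate that satisfies the minimum child weight.
    if (loss_chg_in > loss_chg &&
        left_sum_in.GetHess() >= param.min_child_weight &&
        right_sum_in.GetHess() >= param.min_child_weight) {
      loss_chg = loss_chg_in;
      dir = dir_in;
      fvalue = fvalue_in;
      findex = findex_in;
      left_sum = left_sum_in;
      right_sum = right_sum_in;
    }
  }

  __device__ bool IsValid() const { return loss_chg > 0.0f; }
};
```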
@@ -336,7 +300,7 @@ class GPUHistMaker : public TreeUpdater {
// reset static timers used across iterations
cpu_init_time = 0;
gpu_init_time = 0;
-    cpu_time.reset();
+    cpu_time.Reset();
gpu_time = 0;
// set dList member
@@ -399,31 +363,31 @@ class GPUHistMaker : public TreeUpdater {
is_dense = info->num_nonzero == info->num_col * info->num_row;
dh::Timer time0;
hmat_.Init(&fmat, param.max_bin);
-    cpu_init_time += time0.elapsedSeconds();
+    cpu_init_time += time0.ElapsedSeconds();
if (param.debug_verbose) { // Only done once for each training session
LOG(CONSOLE) << "[GPU Plug-in] CPU Time for hmat_.Init "
-                   << time0.elapsedSeconds() << " sec";
+                   << time0.ElapsedSeconds() << " sec";
fflush(stdout);
}
-    time0.reset();
+    time0.Reset();
gmat_.cut = &hmat_;
-    cpu_init_time += time0.elapsedSeconds();
+    cpu_init_time += time0.ElapsedSeconds();
if (param.debug_verbose) { // Only done once for each training session
LOG(CONSOLE) << "[GPU Plug-in] CPU Time for gmat_.cut "
-                   << time0.elapsedSeconds() << " sec";
+                   << time0.ElapsedSeconds() << " sec";
fflush(stdout);
}
-    time0.reset();
+    time0.Reset();
gmat_.Init(&fmat);
-    cpu_init_time += time0.elapsedSeconds();
+    cpu_init_time += time0.ElapsedSeconds();
if (param.debug_verbose) { // Only done once for each training session
LOG(CONSOLE) << "[GPU Plug-in] CPU Time for gmat_.Init() "
-                   << time0.elapsedSeconds() << " sec";
+                   << time0.ElapsedSeconds() << " sec";
fflush(stdout);
}
-    time0.reset();
+    time0.Reset();
if (param.debug_verbose) { // Only done once for each training session
LOG(CONSOLE)
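The method renames in this hunk (reset to Reset, elapsedSeconds to ElapsedSeconds) bring the timer into Google C++ naming style. The interface used in this file amounts to something like the following std::chrono sketch; the real dh::Timer may differ:

```cuda
#include <chrono>

// Interface sketch of the Timer used above: Reset() restarts the clock,
// ElapsedSeconds() reports wall time since the last reset.
struct Timer {
  using Clock = std::chrono::high_resolution_clock;
  Clock::time_point start = Clock::now();

  void Reset() { start = Clock::now(); }
  double ElapsedSeconds() const {
    return std::chrono::duration<double>(Clock::now() - start).count();
  }
};
```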
@@ -563,9 +527,9 @@ class GPUHistMaker : public TreeUpdater {
int device_idx = dList[d_idx];
dh::safe_cuda(cudaSetDevice(device_idx));
-      nodes[d_idx].fill(DeviceDenseNode());
-      nodes_temp[d_idx].fill(DeviceDenseNode());
-      nodes_child_temp[d_idx].fill(DeviceDenseNode());
+      nodes[d_idx].fill(DeviceNodeStats());
+      nodes_temp[d_idx].fill(DeviceNodeStats());
+      nodes_child_temp[d_idx].fill(DeviceNodeStats());
position[d_idx].fill(0);
@@ -584,7 +548,7 @@ class GPUHistMaker : public TreeUpdater {
dh::synchronize_n_devices(n_devices, dList);
if (!initialised) {
-      gpu_init_time = time1.elapsedSeconds() - cpu_init_time;
+      gpu_init_time = time1.ElapsedSeconds() - cpu_init_time;
gpu_time = -cpu_init_time;
if (param.debug_verbose) { // Only done once for each training session
LOG(CONSOLE) << "[GPU Plug-in] Time for GPU operations during First "
@@ -701,12 +665,12 @@ class GPUHistMaker : public TreeUpdater {
dh::synchronize_n_devices(n_devices, dList);
}
}
-#define MIN_BLOCK_THREADS 32
-#define CHUNK_BLOCK_THREADS 32
+#define MIN_BLOCK_THREADS 128
+#define CHUNK_BLOCK_THREADS 128
// MAX_BLOCK_THREADS of 1024 is hard-coded maximum block size due
// to CUDA capability 35 and above requirement
// for Maximum number of threads per block
-#define MAX_BLOCK_THREADS 1024
+#define MAX_BLOCK_THREADS 512
void FindSplit(int depth) {
// Specialised based on max_bins
@@ -783,7 +747,7 @@ class GPUHistMaker : public TreeUpdater {
dh::launch_n(device_idx, 1, [=] __device__(int idx) {
bst_gpair sum_gradients = sum;
-      d_nodes[idx] = DeviceDenseNode(sum_gradients, 0, gpu_param);
+      d_nodes[idx] = DeviceNodeStats(sum_gradients, 0, gpu_param);
});
}
// synch all devices to host before moving on (No, can avoid because
@@ -802,7 +766,7 @@ class GPUHistMaker : public TreeUpdater {
int device_idx = dList[d_idx];
auto d_position = position[d_idx].data();
-      DeviceDenseNode* d_nodes = nodes[d_idx].data();
+      DeviceNodeStats* d_nodes = nodes[d_idx].data();
auto d_gidx_fvalue_map = gidx_fvalue_map[d_idx].data();
auto d_gidx = device_matrix[d_idx].gidx;
int n_columns = info->num_col;
@@ -814,7 +778,7 @@ class GPUHistMaker : public TreeUpdater {
if (!is_active(pos, depth)) {
return;
}
-        DeviceDenseNode node = d_nodes[pos];
+        DeviceNodeStats node = d_nodes[pos];
if (node.IsLeaf()) {
return;
@@ -842,7 +806,7 @@ class GPUHistMaker : public TreeUpdater {
auto d_position = position[d_idx].data();
auto d_position_tmp = position_tmp[d_idx].data();
-      DeviceDenseNode* d_nodes = nodes[d_idx].data();
+      DeviceNodeStats* d_nodes = nodes[d_idx].data();
auto d_gidx_feature_map = gidx_feature_map[d_idx].data();
auto d_gidx_fvalue_map = gidx_fvalue_map[d_idx].data();
auto d_gidx = device_matrix[d_idx].gidx;
@@ -862,7 +826,7 @@ class GPUHistMaker : public TreeUpdater {
return;
}
-        DeviceDenseNode node = d_nodes[pos];
+        DeviceNodeStats node = d_nodes[pos];
if (node.IsLeaf()) {
d_position_tmp[local_idx] = pos;
@@ -887,7 +851,7 @@ class GPUHistMaker : public TreeUpdater {
return;
}
-        DeviceDenseNode node = d_nodes[pos];
+        DeviceNodeStats node = d_nodes[pos];
if (node.IsLeaf()) {
return;
@@ -976,8 +940,10 @@ class GPUHistMaker : public TreeUpdater {
d_prediction_cache[local_idx] += d_nodes[pos].weight * eps;
});
-      thrust::copy(prediction_cache[d_idx].tbegin(),
-                   prediction_cache[d_idx].tend(), &out_preds[row_begin]);
+      dh::safe_cuda(
+          cudaMemcpy(&out_preds[row_begin], prediction_cache[d_idx].data(),
+                     prediction_cache[d_idx].size() * sizeof(bst_float),
+                     cudaMemcpyDeviceToHost));
}
dh::synchronize_n_devices(n_devices, dList);
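As in DeviceGMat earlier, a thrust::copy becomes an explicit cudaMemcpy wrapped in dh::safe_cuda. The wrapper is the usual check-every-CUDA-call pattern; a minimal sketch follows (the real macro in device_helpers.cuh may report errors differently):

```cuda
#include <cuda_runtime.h>
#include <sstream>
#include <stdexcept>

// Minimal safe_cuda-style check: evaluate the call, and on failure throw
// with the CUDA error string so host code never silently continues.
#define SAFE_CUDA(call)                                            \
  do {                                                             \
    cudaError_t status = (call);                                   \
    if (status != cudaSuccess) {                                   \
      std::ostringstream oss;                                      \
      oss << #call << " failed: " << cudaGetErrorString(status);   \
      throw std::runtime_error(oss.str());                         \
    }                                                              \
  } while (0)
```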
@@ -1003,7 +969,7 @@ class GPUHistMaker : public TreeUpdater {
dh::safe_cuda(cudaSetDevice(master_device));
dense2sparse_tree(p_tree, nodes[0], param);
-    gpu_time += time0.elapsedSeconds();
+    gpu_time += time0.ElapsedSeconds();
if (param.debug_verbose) {
LOG(CONSOLE)
@@ -1014,10 +980,10 @@ class GPUHistMaker : public TreeUpdater {
if (param.debug_verbose) {
LOG(CONSOLE) << "[GPU Plug-in] Cumulative CPU Time "
-                   << cpu_time.elapsedSeconds() << " sec";
+                   << cpu_time.ElapsedSeconds() << " sec";
LOG(CONSOLE)
<< "[GPU Plug-in] Cumulative CPU Time excluding initial time "
-          << (cpu_time.elapsedSeconds() - cpu_init_time - gpu_time) << " sec";
+          << (cpu_time.ElapsedSeconds() - cpu_init_time - gpu_time) << " sec";
fflush(stdout);
}
}
@@ -1048,9 +1014,9 @@ class GPUHistMaker : public TreeUpdater {
std::vector<dh::CubMemory> temp_memory;
std::vector<DeviceHist> hist_vec;
-  std::vector<dh::dvec<DeviceDenseNode>> nodes;
-  std::vector<dh::dvec<DeviceDenseNode>> nodes_temp;
-  std::vector<dh::dvec<DeviceDenseNode>> nodes_child_temp;
+  std::vector<dh::dvec<DeviceNodeStats>> nodes;
+  std::vector<dh::dvec<DeviceNodeStats>> nodes_temp;
+  std::vector<dh::dvec<DeviceNodeStats>> nodes_child_temp;
std::vector<dh::dvec<bool>> left_child_smallest;
std::vector<dh::dvec<bool>> left_child_smallest_temp;
std::vector<dh::dvec<int>> feature_flags;
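For reference, the dh::dvec<T> containers declared here expose, as used in this file, data(), size(), fill(), device_idx() and Thrust-compatible tbegin()/tend(). An interface sketch inferred from those call sites; the real class in device_helpers.cuh manages device memory (through a bulk allocator) and may differ:

```cuda
#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#include <cstddef>

namespace dh {
// Interface sketch of dvec<T>: modelled as a non-owning view here, whereas
// the real class owns its allocation.
template <typename T>
class dvec {
 public:
  T* data() { return ptr_; }             // raw device pointer
  size_t size() const { return size_; }
  int device_idx() const { return device_idx_; }
  thrust::device_ptr<T> tbegin() { return thrust::device_ptr<T>(ptr_); }
  thrust::device_ptr<T> tend() { return tbegin() + size_; }
  void fill(T value) { thrust::fill(tbegin(), tend(), value); }

 private:
  T* ptr_{nullptr};
  size_t size_{0};
  int device_idx_{0};
};
}  // namespace dh
```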