Fix several GPU bugs (#2916)

* Fix #2905
* Fix gpu_exact test failures
* Fix bug in GPU prediction where multiple calls to batch prediction can produce incorrect results
* Fix GPU documentation formatting
2017-12-04 08:27:49 +13:00
parent 1e3aabbadc
commit 1b77903eeb
7 changed files with 109 additions and 46 deletions
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -292,11 +292,9 @@ class GPUPredictor : public xgboost::Predictor {
    thrust::copy(model.tree_info.begin(), model.tree_info.end(),
                 tree_group.begin());

-    if (device_matrix->predictions.size() != out_preds->size()) {
-      device_matrix->predictions.resize(out_preds->size());
-      thrust::copy(out_preds->begin(), out_preds->end(),
-                   device_matrix->predictions.begin());
-    }
+    device_matrix->predictions.resize(out_preds->size());
+    thrust::copy(out_preds->begin(), out_preds->end(),
+                 device_matrix->predictions.begin());

    const int BLOCK_THREADS = 128;
    const int GRID_SIZE = static_cast<int>(
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -336,8 +336,8 @@ struct XGBOOST_ALIGNAS(16) GradStats {
    this->Add(b.GetGrad(), b.GetHess());
  }
  /*! \brief calculate leaf weight */
-template <typename param_t>
-  inline double CalcWeight(const param_t& param) const {
+  template <typename param_t>
+  XGBOOST_DEVICE inline double CalcWeight(const param_t &param) const {
    return xgboost::tree::CalcWeight(param, sum_grad, sum_hess);
  }
  /*! \brief calculate gain of the solution */
--- a/src/tree/updater_gpu.cu
+++ b/src/tree/updater_gpu.cu
@@ -302,7 +302,7 @@ DEV_INLINE void argMaxWithAtomics(
      ExactSplitCandidate s;
      bst_gpair missing = parentSum - colSum;
      s.score = loss_chg_missing(gradScans[id], missing, parentSum, parentGain,
-                                 param, 0, ValueConstraint(), tmp);
+                                 param, tmp);
      s.index = id;
      atomicArgMax(nodeSplits + uid, s);
    }  // end if nodeId != UNUSED_NODE
@@ -580,7 +580,7 @@ class GPUMaker : public TreeUpdater {
        // get the default direction for the current node
        bst_gpair missing = n.sum_gradients - gradSum;
        loss_chg_missing(gradScan, missing, n.sum_gradients, n.root_gain,
-                         gpu_param, 0, ValueConstraint(), missingLeft);
+                         gpu_param, missingLeft);
        // get the score/weight/id/gradSum for left and right child nodes
        bst_gpair lGradSum = missingLeft ? gradScan + missing : gradScan;
        bst_gpair rGradSum = n.sum_gradients - lGradSum;
--- a/src/tree/updater_gpu_common.cuh
+++ b/src/tree/updater_gpu_common.cuh
@@ -240,6 +240,29 @@ __device__ inline float device_calc_loss_chg(const GPUTrainingParam& param,
  return left_gain + right_gain - parent_gain;
 }

+// Without constraints: pick the missing direction with larger loss change
+template <typename gpair_t>
+__device__ float inline loss_chg_missing(const gpair_t& scan,
+                                         const gpair_t& missing,
+                                         const gpair_t& parent_sum,
+                                         const float& parent_gain,
+                                         const GPUTrainingParam& param,
+                                         bool& missing_left_out) {  // NOLINT
+  float missing_left_loss =  // loss change with missing added to scan
+      device_calc_loss_chg(param, scan + missing, parent_sum, parent_gain);
+  float missing_right_loss =  // loss change with scan alone
+      device_calc_loss_chg(param, scan, parent_sum, parent_gain);
+
+  if (missing_left_loss >= missing_right_loss) {  // ties go left
+    missing_left_out = true;
+    return missing_left_loss;
+  } else {
+    missing_left_out = false;
+    return missing_right_loss;
+  }
+}
+
+// With constraints
 template <typename gpair_t>
 __device__ float inline loss_chg_missing(
    const gpair_t& scan, const gpair_t& missing, const gpair_t& parent_sum,
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -287,6 +287,10 @@ struct DeviceShard {
    size_t compressed_size_bytes =
        common::CompressedBufferWriter::CalculateBufferSize(
            ellpack_matrix.size(), num_symbols);
+
+    CHECK(!(param.max_leaves == 0 && param.max_depth == 0))
+        << "Max leaves and max depth cannot both be unconstrained for "
+           "gpu_hist.";
    int max_nodes =
        param.max_leaves > 0 ? param.max_leaves * 2 : n_nodes(param.max_depth);
    ba.allocate(device_idx, param.silent, &gidx_buffer, compressed_size_bytes,