Predict on Ellpack. (#5327)

* Unify GPU prediction node. * Add `PageExists`. * Dispatch prediction on input data for GPU Predictor.
2020-02-23 06:27:03 +08:00
parent 70a91ec3ba
commit 655cf17b60
19 changed files with 320 additions and 134 deletions
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -14,6 +14,7 @@
 #include "xgboost/host_device_vector.h"

 #include "../gbm/gbtree_model.h"
+#include "../data/ellpack_page.cuh"
 #include "../common/common.h"
 #include "../common/device_helpers.cuh"

@@ -22,78 +23,32 @@ namespace predictor {

 DMLC_REGISTRY_FILE_TAG(gpu_predictor);

-/**
- * \struct  DevicePredictionNode
- *
- * \brief Packed 16 byte representation of a tree node for use in device
- * prediction
- */
-struct DevicePredictionNode {
-  XGBOOST_DEVICE DevicePredictionNode()
-      : fidx{-1}, left_child_idx{-1}, right_child_idx{-1} {}
+struct SparsePageView {
+  common::Span<const Entry> d_data;
+  common::Span<const bst_row_t> d_row_ptr;

-  union NodeValue {
-    float leaf_weight;
-    float fvalue;
-  };
-
-  int fidx;
-  int left_child_idx;
-  int right_child_idx;
-  NodeValue val{};
-
-  DevicePredictionNode(const RegTree::Node& n) {  // NOLINT
-    static_assert(sizeof(DevicePredictionNode) == 16, "Size is not 16 bytes");
-    this->left_child_idx = n.LeftChild();
-    this->right_child_idx = n.RightChild();
-    this->fidx = n.SplitIndex();
-    if (n.DefaultLeft()) {
-      fidx |= (1U << 31);
-    }
-
-    if (n.IsLeaf()) {
-      this->val.leaf_weight = n.LeafValue();
-    } else {
-      this->val.fvalue = n.SplitCond();
-    }
-  }
-
-  XGBOOST_DEVICE bool IsLeaf() const { return left_child_idx == -1; }
-
-  XGBOOST_DEVICE int GetFidx() const { return fidx & ((1U << 31) - 1U); }
-
-  XGBOOST_DEVICE bool MissingLeft() const { return (fidx >> 31) != 0; }
-
-  XGBOOST_DEVICE int MissingIdx() const {
-    if (MissingLeft()) {
-      return this->left_child_idx;
-    } else {
-      return this->right_child_idx;
-    }
-  }
-
-  XGBOOST_DEVICE float GetFvalue() const { return val.fvalue; }
-
-  XGBOOST_DEVICE float GetWeight() const { return val.leaf_weight; }
+  XGBOOST_DEVICE SparsePageView(common::Span<const Entry> data,
+                                common::Span<const bst_row_t> row_ptr) :
+      d_data{data}, d_row_ptr{row_ptr} {}
 };

-struct ElementLoader {
+struct SparsePageLoader {
  bool use_shared;
  common::Span<const bst_row_t> d_row_ptr;
  common::Span<const Entry> d_data;
-  int num_features;
+  bst_feature_t num_features;
  float* smem;
  size_t entry_start;

-  __device__ ElementLoader(bool use_shared, common::Span<const bst_row_t> row_ptr,
-                           common::Span<const Entry> entry, int num_features,
-                           float* smem, int num_rows, size_t entry_start)
+  __device__ SparsePageLoader(SparsePageView data, bool use_shared, bst_feature_t num_features,
+                              bst_row_t num_rows, size_t entry_start)
      : use_shared(use_shared),
-        d_row_ptr(row_ptr),
-        d_data(entry),
+        d_row_ptr(data.d_row_ptr),
+        d_data(data.d_data),
        num_features(num_features),
-        smem(smem),
        entry_start(entry_start) {
+    extern __shared__ float _smem[];
+    smem = _smem;
    // Copy instances
    if (use_shared) {
      bst_uint global_idx = blockDim.x * blockIdx.x + threadIdx.x;
@@ -111,7 +66,7 @@ struct ElementLoader {
      __syncthreads();
    }
  }
-  __device__ float GetFvalue(int ridx, int fidx) {
+  __device__ float GetFvalue(int ridx, int fidx) const {
    if (use_shared) {
      return smem[threadIdx.x * num_features + fidx];
    } else {
@@ -141,52 +96,69 @@ struct ElementLoader {
  }
 };

-__device__ float GetLeafWeight(bst_uint ridx, const DevicePredictionNode* tree,
-                               ElementLoader* loader) {
-  DevicePredictionNode n = tree[0];
+struct EllpackLoader {
+  EllpackMatrix const& matrix;
+  XGBOOST_DEVICE EllpackLoader(EllpackMatrix const& m, bool use_shared, bst_feature_t num_features,
+                               bst_row_t num_rows, size_t entry_start) : matrix{m} {}
+  __device__ __forceinline__ float GetFvalue(int ridx, int fidx) const {
+    auto gidx = matrix.GetBinIndex(ridx, fidx);
+    if (gidx == -1) {
+      return nan("");
+    }
+    // The gradient index needs to be shifted by one as min values are not included in the
+    // cuts.
+    if (gidx == matrix.info.feature_segments[fidx]) {
+      return matrix.info.min_fvalue[fidx];
+    }
+    return matrix.info.gidx_fvalue_map[gidx - 1];
+  }
+};
+
+template <typename Loader>
+__device__ float GetLeafWeight(bst_uint ridx, const RegTree::Node* tree,
+                               Loader* loader) {
+  RegTree::Node n = tree[0];
  while (!n.IsLeaf()) {
-    float fvalue = loader->GetFvalue(ridx, n.GetFidx());
+    float fvalue = loader->GetFvalue(ridx, n.SplitIndex());
    // Missing value
    if (isnan(fvalue)) {
-      n = tree[n.MissingIdx()];
+      n = tree[n.DefaultChild()];
    } else {
-      if (fvalue < n.GetFvalue()) {
-        n = tree[n.left_child_idx];
+      if (fvalue < n.SplitCond()) {
+        n = tree[n.LeftChild()];
      } else {
-        n = tree[n.right_child_idx];
+        n = tree[n.RightChild()];
      }
    }
  }
-  return n.GetWeight();
+  return n.LeafValue();
 }

-template <int BLOCK_THREADS>
-__global__ void PredictKernel(common::Span<const DevicePredictionNode> d_nodes,
+template <typename Loader, typename Data>
+__global__ void PredictKernel(Data data,
+                              common::Span<const RegTree::Node> d_nodes,
                              common::Span<float> d_out_predictions,
                              common::Span<size_t> d_tree_segments,
                              common::Span<int> d_tree_group,
-                              common::Span<const bst_row_t> d_row_ptr,
-                              common::Span<const Entry> d_data, size_t tree_begin,
-                              size_t tree_end, size_t num_features,
+                              size_t tree_begin, size_t tree_end, size_t num_features,
                              size_t num_rows, size_t entry_start,
                              bool use_shared, int num_group) {
-  extern __shared__ float smem[];
  bst_uint global_idx = blockDim.x * blockIdx.x + threadIdx.x;
-  ElementLoader loader(use_shared, d_row_ptr, d_data, num_features, smem,
-                       num_rows, entry_start);
+  Loader loader(data, use_shared, num_features, num_rows, entry_start);
  if (global_idx >= num_rows) return;
  if (num_group == 1) {
    float sum = 0;
    for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
-      const DevicePredictionNode* d_tree =
+      const RegTree::Node* d_tree =
          &d_nodes[d_tree_segments[tree_idx - tree_begin]];
-      sum += GetLeafWeight(global_idx, d_tree, &loader);
+      float leaf = GetLeafWeight(global_idx, d_tree, &loader);
+      sum += leaf;
    }
    d_out_predictions[global_idx] += sum;
  } else {
    for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
      int tree_group = d_tree_group[tree_idx];
-      const DevicePredictionNode* d_tree =
+      const RegTree::Node* d_tree =
          &d_nodes[d_tree_segments[tree_idx - tree_begin]];
      bst_uint out_prediction_idx = global_idx * num_group + tree_group;
      d_out_predictions[out_prediction_idx] +=
@@ -198,13 +170,13 @@ __global__ void PredictKernel(common::Span<const DevicePredictionNode> d_nodes,
 class GPUPredictor : public xgboost::Predictor {
 private:
  void InitModel(const gbm::GBTreeModel& model,
-   const thrust::host_vector<size_t>& h_tree_segments,
-   const thrust::host_vector<DevicePredictionNode>& h_nodes,
-   size_t tree_begin, size_t tree_end) {
+                 const thrust::host_vector<size_t>& h_tree_segments,
+                 const thrust::host_vector<RegTree::Node>& h_nodes,
+                 size_t tree_begin, size_t tree_end) {
    dh::safe_cuda(cudaSetDevice(generic_param_->gpu_id));
    nodes_.resize(h_nodes.size());
    dh::safe_cuda(cudaMemcpyAsync(nodes_.data().get(), h_nodes.data(),
-                                  sizeof(DevicePredictionNode) * h_nodes.size(),
+                                  sizeof(RegTree::Node) * h_nodes.size(),
                                  cudaMemcpyHostToDevice));
    tree_segments_.resize(h_tree_segments.size());
    dh::safe_cuda(cudaMemcpyAsync(tree_segments_.data().get(), h_tree_segments.data(),
@@ -219,15 +191,11 @@ class GPUPredictor : public xgboost::Predictor {
    this->num_group_ = model.learner_model_param_->num_output_group;
  }

-  void PredictInternal(const SparsePage& batch,
-                       size_t num_features,
+  void PredictInternal(const SparsePage& batch, size_t num_features,
                       HostDeviceVector<bst_float>* predictions,
                       size_t batch_offset) {
-    dh::safe_cuda(cudaSetDevice(generic_param_->gpu_id));
-    batch.data.SetDevice(generic_param_->gpu_id);
    batch.offset.SetDevice(generic_param_->gpu_id);
-    predictions->SetDevice(generic_param_->gpu_id);
-
+    batch.data.SetDevice(generic_param_->gpu_id);
    const uint32_t BLOCK_THREADS = 128;
    size_t num_rows = batch.Size();
    auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(num_rows, BLOCK_THREADS));
@@ -240,12 +208,29 @@ class GPUPredictor : public xgboost::Predictor {
      use_shared = false;
    }
    size_t entry_start = 0;
-
+    SparsePageView data{batch.data.DeviceSpan(), batch.offset.DeviceSpan()};
    dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes} (
-        PredictKernel<BLOCK_THREADS>,
+        PredictKernel<SparsePageLoader, SparsePageView>,
+        data,
        dh::ToSpan(nodes_), predictions->DeviceSpan().subspan(batch_offset),
-        dh::ToSpan(tree_segments_), dh::ToSpan(tree_group_), batch.offset.DeviceSpan(),
-        batch.data.DeviceSpan(), this->tree_begin_, this->tree_end_, num_features, num_rows,
+        dh::ToSpan(tree_segments_), dh::ToSpan(tree_group_),
+        this->tree_begin_, this->tree_end_, num_features, num_rows,
+        entry_start, use_shared, this->num_group_);
+  }
+  void PredictInternal(EllpackMatrix const& batch, HostDeviceVector<bst_float>* out_preds,
+                       size_t batch_offset) {
+    const uint32_t BLOCK_THREADS = 256;
+    size_t num_rows = batch.n_rows;
+    auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(num_rows, BLOCK_THREADS));
+
+    bool use_shared = false;
+    size_t entry_start = 0;
+    dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS} (
+        PredictKernel<EllpackLoader, EllpackMatrix>,
+        batch,
+        dh::ToSpan(nodes_), out_preds->DeviceSpan().subspan(batch_offset),
+        dh::ToSpan(tree_segments_), dh::ToSpan(tree_group_),
+        this->tree_begin_, this->tree_end_, batch.info.NumFeatures(), num_rows,
        entry_start, use_shared, this->num_group_);
  }

@@ -261,7 +246,7 @@ class GPUPredictor : public xgboost::Predictor {
      h_tree_segments.push_back(sum);
    }

-    thrust::host_vector<DevicePredictionNode> h_nodes(h_tree_segments.back());
+    thrust::host_vector<RegTree::Node> h_nodes(h_tree_segments.back());
    for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
      auto& src_nodes = model.trees.at(tree_idx)->GetNodes();
      std::copy(src_nodes.begin(), src_nodes.end(),
@@ -270,26 +255,31 @@ class GPUPredictor : public xgboost::Predictor {
    InitModel(model, h_tree_segments, h_nodes, tree_begin, tree_end);
  }

-  void DevicePredictInternal(DMatrix* dmat,
-                             HostDeviceVector<bst_float>* out_preds,
+  void DevicePredictInternal(DMatrix* dmat, HostDeviceVector<float>* out_preds,
                             const gbm::GBTreeModel& model, size_t tree_begin,
                             size_t tree_end) {
+    dh::safe_cuda(cudaSetDevice(generic_param_->gpu_id));
    if (tree_end - tree_begin == 0) {
      return;
    }
    monitor_.StartCuda("DevicePredictInternal");
-
    InitModel(model, tree_begin, tree_end);
+    out_preds->SetDevice(generic_param_->gpu_id);

-    size_t batch_offset = 0;
-    for (auto &batch : dmat->GetBatches<SparsePage>()) {
-      batch.offset.SetDevice(generic_param_->gpu_id);
-      batch.data.SetDevice(generic_param_->gpu_id);
-      PredictInternal(batch, model.learner_model_param_->num_feature,
-                      out_preds, batch_offset);
-      batch_offset += batch.Size() * model.learner_model_param_->num_output_group;
+    if (dmat->PageExists<EllpackPage>()) {
+      size_t batch_offset = 0;
+      for (auto const& page : dmat->GetBatches<EllpackPage>()) {
+        this->PredictInternal(page.Impl()->matrix, out_preds, batch_offset);
+        batch_offset += page.Impl()->matrix.n_rows;
+      }
+    } else {
+      size_t batch_offset = 0;
+      for (auto &batch : dmat->GetBatches<SparsePage>()) {
+        this->PredictInternal(batch, model.learner_model_param_->num_feature,
+                              out_preds, batch_offset);
+        batch_offset += batch.Size() * model.learner_model_param_->num_output_group;
+      }
    }
-
    monitor_.StopCuda("DevicePredictInternal");
  }

@@ -418,7 +408,7 @@ class GPUPredictor : public xgboost::Predictor {
  }

  common::Monitor monitor_;
-  dh::device_vector<DevicePredictionNode> nodes_;
+  dh::device_vector<RegTree::Node> nodes_;
  dh::device_vector<size_t> tree_segments_;
  dh::device_vector<int> tree_group_;
  size_t max_shared_memory_bytes_;