Remove column major specialization. (#5755)

commit cacff9232a (parent bd9d57f579)
Author: Jiaming Yuan
Date: 2020-06-05 16:19:14 +08:00
Committed by: GitHub
Co-authored-by: Hyunsu Cho <chohyu01@cs.washington.edu>
10 changed files with 70 additions and 204 deletions

src/predictor/gpu_predictor.cu

@@ -118,14 +118,18 @@ struct EllpackLoader {
   }
 };
 
-struct CuPyAdapterLoader {
-  data::CupyAdapterBatch batch;
+template <typename Batch>
+struct DeviceAdapterLoader {
+  Batch batch;
   bst_feature_t columns;
   float* smem;
   bool use_shared;
 
-  DEV_INLINE CuPyAdapterLoader(data::CupyAdapterBatch const batch, bool use_shared,
-                               bst_feature_t num_features, bst_row_t num_rows, size_t entry_start) :
+  using BatchT = Batch;
+
+  DEV_INLINE DeviceAdapterLoader(Batch const batch, bool use_shared,
+                                 bst_feature_t num_features, bst_row_t num_rows,
+                                 size_t entry_start) :
       batch{batch},
      columns{num_features},
      use_shared{use_shared} {
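
The `using BatchT = Batch;` alias added above is what lets callers drop the third template parameter later in this diff: the loader re-exports its batch type, so code can write `typename Loader::BatchT` instead of passing the batch type alongside the loader. A minimal, self-contained C++ sketch of that pattern (all names here are hypothetical, not XGBoost's):

#include <iostream>

// A loader template that re-exports its batch type as a member alias.
template <typename Batch>
struct Loader {
  using BatchT = Batch;  // callers recover the batch type from the loader
  Batch batch;
};

struct DenseBatch {
  int rows;
};

// Needs only the loader type; the batch type is derived from it, so this
// function template has one parameter instead of two.
template <typename L>
void Launch(typename L::BatchT batch) {
  std::cout << "rows: " << batch.rows << '\n';
}

int main() {
  Launch<Loader<DenseBatch>>(DenseBatch{8});  // prints "rows: 8"
  return 0;
}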
@@ -155,39 +159,6 @@ struct CuPyAdapterLoader {
   }
 };
 
-struct CuDFAdapterLoader {
-  data::CudfAdapterBatch batch;
-  bst_feature_t columns;
-  float* smem;
-  bool use_shared;
-
-  DEV_INLINE CuDFAdapterLoader(data::CudfAdapterBatch const batch, bool use_shared,
-                               bst_feature_t num_features,
-                               bst_row_t num_rows, size_t entry_start)
-      : batch{batch}, columns{num_features}, use_shared{use_shared} {
-    extern __shared__ float _smem[];
-    smem = _smem;
-    if (use_shared) {
-      uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x;
-      size_t shared_elements = blockDim.x * num_features;
-      dh::BlockFill(smem, shared_elements, nanf(""));
-      __syncthreads();
-      if (global_idx < num_rows) {
-        for (size_t i = 0; i < columns; ++i) {
-          smem[threadIdx.x * columns + i] = batch.GetValue(global_idx, i);
-        }
-      }
-    }
-    __syncthreads();
-  }
-
-  DEV_INLINE float GetFvalue(bst_row_t ridx, bst_feature_t fidx) const {
-    if (use_shared) {
-      return smem[threadIdx.x * columns + fidx];
-    }
-    return batch.GetValue(ridx, fidx);
-  }
-};
-
 template <typename Loader>
 __device__ float GetLeafWeight(bst_uint ridx, const RegTree::Node* tree,
                                Loader* loader) {
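
The deleted CuDFAdapterLoader duplicated, line for line, the row-caching scheme the new templated loader keeps in one place: each thread stages its row into a blockDim.x-by-columns shared-memory tile, then reads features from the tile instead of global memory. A standalone CUDA sketch of that scheme (hypothetical kernel name, dense row-major input assumed):

// Each thread caches one row into shared memory, then sums its features
// from the tile. Launch with dynamic shared memory sized to hold
// blockDim.x * columns floats:
//   CacheRows<<<grid, block, block * columns * sizeof(float)>>>(...);
__global__ void CacheRows(const float* data, float* out, int columns,
                          int num_rows) {
  extern __shared__ float smem[];
  int ridx = blockDim.x * blockIdx.x + threadIdx.x;
  if (ridx < num_rows) {
    for (int i = 0; i < columns; ++i) {
      smem[threadIdx.x * columns + i] = data[ridx * columns + i];
    }
  }
  __syncthreads();  // mirrors the barrier the loader places after the fill
  if (ridx < num_rows) {
    float sum = 0.0f;
    for (int i = 0; i < columns; ++i) {
      sum += smem[threadIdx.x * columns + i];  // every read hits shared memory
    }
    out[ridx] = sum;
  }
}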
@@ -429,7 +400,7 @@ class GPUPredictor : public xgboost::Predictor {
               out_preds->Size() == dmat->Info().num_row_);
   }
 
-  template <typename Adapter, typename Loader, typename Batch>
+  template <typename Adapter, typename Loader>
   void DispatchedInplacePredict(dmlc::any const &x,
                                 const gbm::GBTreeModel &model, float missing,
                                 PredictionCacheEntry *out_preds,
@@ -439,22 +410,22 @@ class GPUPredictor : public xgboost::Predictor {
     DeviceModel d_model;
     d_model.Init(model, tree_begin, tree_end, this->generic_param_->gpu_id);
 
-    auto m = dmlc::get<Adapter>(x);
-    CHECK_EQ(m.NumColumns(), model.learner_model_param->num_feature)
+    auto m = dmlc::get<std::shared_ptr<Adapter>>(x);
+    CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature)
         << "Number of columns in data must equal to trained model.";
-    CHECK_EQ(this->generic_param_->gpu_id, m.DeviceIdx())
+    CHECK_EQ(this->generic_param_->gpu_id, m->DeviceIdx())
         << "XGBoost is running on device: " << this->generic_param_->gpu_id << ", "
-        << "but data is on: " << m.DeviceIdx();
+        << "but data is on: " << m->DeviceIdx();
     MetaInfo info;
-    info.num_col_ = m.NumColumns();
-    info.num_row_ = m.NumRows();
+    info.num_col_ = m->NumColumns();
+    info.num_row_ = m->NumRows();
     this->InitOutPredictions(info, &(out_preds->predictions), model);
 
     const uint32_t BLOCK_THREADS = 128;
     auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(info.num_row_, BLOCK_THREADS));
 
     auto shared_memory_bytes =
-        static_cast<size_t>(sizeof(float) * m.NumColumns() * BLOCK_THREADS);
+        static_cast<size_t>(sizeof(float) * m->NumColumns() * BLOCK_THREADS);
     bool use_shared = true;
     if (shared_memory_bytes > max_shared_memory_bytes) {
       shared_memory_bytes = 0;
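
The sizing logic above is worth making concrete: at 128 threads per block, the tile costs sizeof(float) * columns * 128 bytes, so anything past 96 features already overflows a typical 48 KiB static shared-memory limit and triggers the global-memory fallback. A small host-side check illustrating the arithmetic (the 48 KiB cap is an assumption for the sketch; the predictor queries the real limit from the device):

#include <cstddef>
#include <cstdio>

int main() {
  const unsigned kBlockThreads = 128;        // mirrors BLOCK_THREADS above
  const size_t kMaxSharedBytes = 48 * 1024;  // assumed per-block limit
  for (size_t columns : {64, 96, 100}) {
    size_t bytes = sizeof(float) * columns * kBlockThreads;
    bool use_shared = bytes <= kMaxSharedBytes;
    std::printf("%3zu columns -> %6zu bytes, use_shared=%d\n",
                columns, bytes, use_shared);
  }
  return 0;
}
//  64 columns ->  32768 bytes, use_shared=1
//  96 columns ->  49152 bytes, use_shared=1
// 100 columns ->  51200 bytes, use_shared=0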
@@ -463,22 +434,24 @@ class GPUPredictor : public xgboost::Predictor {
     size_t entry_start = 0;
 
     dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes} (
-        PredictKernel<Loader, Batch>,
-        m.Value(),
+        PredictKernel<Loader, typename Loader::BatchT>,
+        m->Value(),
         dh::ToSpan(d_model.nodes), out_preds->predictions.DeviceSpan(),
         dh::ToSpan(d_model.tree_segments), dh::ToSpan(d_model.tree_group),
-        tree_begin, tree_end, m.NumColumns(), info.num_row_,
+        tree_begin, tree_end, m->NumColumns(), info.num_row_,
         entry_start, use_shared, output_groups);
   }
 
   void InplacePredict(dmlc::any const &x, const gbm::GBTreeModel &model,
                       float missing, PredictionCacheEntry *out_preds,
                       uint32_t tree_begin, unsigned tree_end) const override {
-    if (x.type() == typeid(data::CupyAdapter)) {
-      this->DispatchedInplacePredict<data::CupyAdapter, CuPyAdapterLoader, data::CupyAdapterBatch>(
+    if (x.type() == typeid(std::shared_ptr<data::CupyAdapter>)) {
+      this->DispatchedInplacePredict<
+          data::CupyAdapter, DeviceAdapterLoader<data::CupyAdapterBatch>>(
           x, model, missing, out_preds, tree_begin, tree_end);
-    } else if (x.type() == typeid(data::CudfAdapter)) {
-      this->DispatchedInplacePredict<data::CudfAdapter, CuDFAdapterLoader, data::CudfAdapterBatch>(
+    } else if (x.type() == typeid(std::shared_ptr<data::CudfAdapter>)) {
+      this->DispatchedInplacePredict<
+          data::CudfAdapter, DeviceAdapterLoader<data::CudfAdapterBatch>>(
           x, model, missing, out_preds, tree_begin, tree_end);
     } else {
       LOG(FATAL) << "Only CuPy and CuDF are supported by GPU Predictor.";
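
The dispatch change in this last hunk is easy to trip over: because the dmlc::any now stores a std::shared_ptr to the adapter rather than the adapter by value, both the typeid comparison and the get<>() must name the shared_ptr type. A self-contained sketch of the same pattern, using std::any as a stand-in for dmlc::any and hypothetical adapter types:

#include <any>
#include <iostream>
#include <memory>

struct CupyAdapter { int columns = 3; };
struct CudfAdapter { int columns = 5; };

template <typename Adapter>
void Predict(const std::any& x) {
  // The cast must name the stored type exactly: shared_ptr<Adapter>,
  // not Adapter, or it throws std::bad_any_cast.
  auto m = std::any_cast<std::shared_ptr<Adapter>>(x);
  std::cout << "columns: " << m->columns << '\n';
}

void Dispatch(const std::any& x) {
  if (x.type() == typeid(std::shared_ptr<CupyAdapter>)) {
    Predict<CupyAdapter>(x);
  } else if (x.type() == typeid(std::shared_ptr<CudfAdapter>)) {
    Predict<CudfAdapter>(x);
  } else {
    std::cout << "unsupported input type\n";
  }
}

int main() {
  Dispatch(std::any{std::make_shared<CupyAdapter>()});  // columns: 3
  Dispatch(std::any{std::make_shared<CudfAdapter>()});  // columns: 5
  return 0;
}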