Thread safe, inplace prediction. (#5389)

Normal prediction with DMatrix is now thread-safe by means of locks. The newly added in-place prediction is thread-safe without locks. When the input data is on device (cupy, cudf), the returned data is also on device. * Implementation for numpy, csr, cudf and cupy. * Implementation for dask. * Remove sync in simple dmatrix.
2020-03-30 15:35:28 +08:00
parent 7f980e9f83
commit 6601a641d7
25 changed files with 1217 additions and 167 deletions
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -12,6 +12,7 @@

 #include "xgboost/base.h"
 #include "xgboost/data.h"
+#include "xgboost/host_device_vector.h"
 #include "xgboost/learner.h"
 #include "xgboost/c_api.h"
 #include "xgboost/logging.h"
@@ -450,6 +451,95 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
  API_END();
 }

+// A hidden API as cache id is not being supported yet.  In-place prediction on dense data.
+XGB_DLL int XGBoosterPredictFromDense(BoosterHandle handle, float *values,
+                                      xgboost::bst_ulong n_rows,
+                                      xgboost::bst_ulong n_cols,
+                                      float missing,
+                                      unsigned iteration_begin,
+                                      unsigned iteration_end,
+                                      char const* c_type,
+                                      xgboost::bst_ulong cache_id,
+                                      xgboost::bst_ulong *out_len,
+                                      const float **out_result) {
+  API_BEGIN();
+  CHECK_HANDLE();
+  CHECK_EQ(cache_id, 0) << "Cache ID is not supported yet";
+  auto *learner = static_cast<xgboost::Learner *>(handle);
+
+  auto x = xgboost::data::DenseAdapter(values, n_rows, n_cols);
+  HostDeviceVector<float>* p_predt { nullptr };
+  std::string type { c_type };
+  // Forward the iteration range; it was previously accepted but never used.
+  learner->InplacePredict(x, type, missing, &p_predt, iteration_begin, iteration_end);
+  CHECK(p_predt);
+  *out_result = dmlc::BeginPtr(p_predt->HostVector());  // Host pointer into the result.
+  *out_len = static_cast<xgboost::bst_ulong>(p_predt->Size());
+  API_END();
+}
+
+// A hidden API as cache id is not being supported yet.  In-place prediction on CSR data.
+XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle,
+                                    const size_t* indptr,
+                                    const unsigned* indices,
+                                    const bst_float* data,
+                                    size_t nindptr,
+                                    size_t nelem,
+                                    size_t num_col,
+                                    float missing,
+                                    unsigned iteration_begin,
+                                    unsigned iteration_end,
+                                    char const *c_type,
+                                    xgboost::bst_ulong cache_id,
+                                    xgboost::bst_ulong *out_len,
+                                    const float **out_result) {
+  API_BEGIN();
+  CHECK_HANDLE();
+  CHECK_EQ(cache_id, 0) << "Cache ID is not supported yet";
+  auto *learner = static_cast<xgboost::Learner *>(handle);
+
+  auto x = data::CSRAdapter(indptr, indices, data, nindptr - 1, nelem, num_col);
+  HostDeviceVector<float>* p_predt { nullptr };
+  std::string type { c_type };
+  // Forward the iteration range; it was previously accepted but never used.
+  learner->InplacePredict(x, type, missing, &p_predt, iteration_begin, iteration_end);
+  CHECK(p_predt);
+  *out_result = dmlc::BeginPtr(p_predt->HostVector());  // Host pointer into the result.
+  *out_len = static_cast<xgboost::bst_ulong>(p_predt->Size());
+  API_END();
+}
+
+#if !defined(XGBOOST_USE_CUDA)
+XGB_DLL int XGBoosterPredictFromArrayInterfaceColumns(BoosterHandle handle,
+                                                      char const* c_json_strs,
+                                                      float missing,
+                                                      unsigned iteration_begin,
+                                                      unsigned iteration_end,
+                                                      char const* c_type,
+                                                      xgboost::bst_ulong cache_id,
+                                                      xgboost::bst_ulong *out_len,
+                                                      float const** out_result) {
+  API_BEGIN();
+  CHECK_HANDLE();
+  LOG(FATAL) << "XGBoost not compiled with CUDA.";  // Stub for builds without XGBOOST_USE_CUDA.
+  API_END();
+}
+XGB_DLL int XGBoosterPredictFromArrayInterface(BoosterHandle handle,
+                                               char const* c_json_strs,
+                                               float missing,
+                                               unsigned iteration_begin,
+                                               unsigned iteration_end,
+                                               char const* c_type,
+                                               xgboost::bst_ulong cache_id,
+                                               xgboost::bst_ulong *out_len,
+                                               const float **out_result) {
+  API_BEGIN();
+  CHECK_HANDLE();
+  LOG(FATAL) << "XGBoost not compiled with CUDA.";  // Stub for builds without XGBOOST_USE_CUDA.
+  API_END();
+}
+#endif  // !defined(XGBOOST_USE_CUDA)
+
 XGB_DLL int XGBoosterLoadModel(BoosterHandle handle, const char* fname) {
  API_BEGIN();
  CHECK_HANDLE();
--- a/src/c_api/c_api.cu
+++ b/src/c_api/c_api.cu
@@ -52,3 +52,60 @@ XGB_DLL int XGDeviceQuantileDMatrixCreateFromArrayInterface(char const* c_json_s
    new std::shared_ptr<DMatrix>(new data::DeviceDMatrix(&adapter, missing, nthread, max_bin));
  API_END();
 }
+
+// A hidden API as cache id is not being supported yet.  In-place prediction on cuDF columns.
+XGB_DLL int XGBoosterPredictFromArrayInterfaceColumns(BoosterHandle handle,
+                                                      char const* c_json_strs,
+                                                      float missing,
+                                                      unsigned iteration_begin,
+                                                      unsigned iteration_end,
+                                                      char const* c_type,
+                                                      xgboost::bst_ulong cache_id,
+                                                      xgboost::bst_ulong *out_len,
+                                                      float const** out_result) {
+  API_BEGIN();
+  CHECK_HANDLE();
+  CHECK_EQ(cache_id, 0) << "Cache ID is not supported yet";
+  auto *learner = static_cast<Learner*>(handle);
+
+  std::string json_str{c_json_strs};
+  auto x = data::CudfAdapter(json_str);
+  HostDeviceVector<float>* p_predt { nullptr };
+  std::string type { c_type };
+  // Forward the iteration range; it was previously accepted but never used.
+  learner->InplacePredict(x, type, missing, &p_predt, iteration_begin, iteration_end);
+  CHECK(p_predt);
+  CHECK(p_predt->DeviceCanRead());
+  *out_result = p_predt->ConstDevicePointer();  // Device pointer: the result is not copied to host.
+  *out_len = static_cast<xgboost::bst_ulong>(p_predt->Size());
+
+  API_END();
+}
+// A hidden API as cache id is not being supported yet.  In-place prediction on a CuPy array.
+XGB_DLL int XGBoosterPredictFromArrayInterface(BoosterHandle handle,
+                                               char const* c_json_strs,
+                                               float missing,
+                                               unsigned iteration_begin,
+                                               unsigned iteration_end,
+                                               char const* c_type,
+                                               xgboost::bst_ulong cache_id,
+                                               xgboost::bst_ulong *out_len,
+                                               float const** out_result) {
+  API_BEGIN();
+  CHECK_HANDLE();
+  CHECK_EQ(cache_id, 0) << "Cache ID is not supported yet";
+  auto *learner = static_cast<Learner*>(handle);
+
+  std::string json_str{c_json_strs};
+  auto x = data::CupyAdapter(json_str);
+  HostDeviceVector<float>* p_predt { nullptr };
+  std::string type { c_type };
+  // Forward the iteration range; it was previously accepted but never used.
+  learner->InplacePredict(x, type, missing, &p_predt, iteration_begin, iteration_end);
+  CHECK(p_predt);
+  CHECK(p_predt->DeviceCanRead());
+  *out_result = p_predt->ConstDevicePointer();  // Device pointer: the result is not copied to host.
+  *out_len = static_cast<xgboost::bst_ulong>(p_predt->Size());
+
+  API_END();
+}
--- a/src/data/device_adapter.cuh
+++ b/src/data/device_adapter.cuh
@@ -52,6 +52,13 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
                      : std::numeric_limits<float>::quiet_NaN();
    return COOTuple(row_idx, column_idx, value);
  }
+  __device__ float GetValue(size_t ridx, bst_feature_t fidx) const {  // Value at (ridx, fidx).
+    auto const& column = columns_[fidx];  // One ArrayInterface per feature column.
+    float value = column.valid.Data() == nullptr || column.valid.Check(ridx)
+                      ? column.GetElement(ridx)
+                      : std::numeric_limits<float>::quiet_NaN();  // `valid` present and check fails.
+    return value;
+  }

 private:
  common::Span<ArrayInterface> columns_;
@@ -129,6 +136,7 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
    for (auto& json_col : json_columns) {
      auto column = ArrayInterface(get<Object const>(json_col));
      columns.push_back(column);
+      CHECK_EQ(column.num_cols, 1);
      column_ptr.emplace_back(column_ptr.back() + column.num_rows);
      num_rows_ = std::max(num_rows_, size_t(column.num_rows));
      CHECK_EQ(device_idx_, dh::CudaGetPointerDevice(column.data))
--- a/src/data/simple_dmatrix.cu
+++ b/src/data/simple_dmatrix.cu
@@ -122,8 +122,6 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
    CopyDataColumnMajor(adapter, sparse_page_.data.DeviceSpan(),
                        adapter->DeviceIdx(), missing, s_offset);
  }
-  // Sync
-  sparse_page_.data.HostVector();

  info.num_col_ = adapter->NumColumns();
  info.num_row_ = adapter->NumRows();
--- a/src/gbm/gbtree.h
+++ b/src/gbm/gbtree.h
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2014-2019 by Contributors
+ * Copyright 2014-2020 by Contributors
 * \file gbtree.cc
 * \brief gradient boosted tree implementation.
 * \author Tianqi Chen
@@ -16,6 +16,7 @@
 #include <string>
 #include <unordered_map>

+#include "xgboost/base.h"
 #include "xgboost/data.h"
 #include "xgboost/logging.h"
 #include "xgboost/gbm.h"
@@ -203,6 +204,22 @@ class GBTree : public GradientBooster {
                    bool training,
                    unsigned ntree_limit) override;

+  void InplacePredict(dmlc::any const &x, float missing,
+                      PredictionCacheEntry *out_preds,
+                      uint32_t layer_begin = 0,
+                      unsigned layer_end = 0) const override {  // layer_end == 0 selects up to the last tree.
+    CHECK(configured_);
+    // From here on, layer becomes concrete trees.
+    bst_group_t groups = model_.learner_model_param_->num_output_group;
+    uint32_t tree_begin = layer_begin * groups * tparam_.num_parallel_tree;  // Trees per layer = groups * parallel.
+    uint32_t tree_end = layer_end * groups * tparam_.num_parallel_tree;
+    if (tree_end == 0 || tree_end > model_.trees.size()) {  // Unset or too large: clamp to the model size.
+      tree_end = static_cast<uint32_t>(model_.trees.size());
+    }
+    this->GetPredictor()->InplacePredict(x, model_, missing, out_preds,
+                                         tree_begin, tree_end);  // Delegates to the selected predictor.
+  }
+
  void PredictInstance(const SparsePage::Inst& inst,
                       std::vector<bst_float>* out_preds,
                       unsigned ntree_limit) override {
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -8,6 +8,8 @@
 #include <dmlc/parameter.h>
 #include <dmlc/thread_local.h>

+#include <atomic>
+#include <mutex>
 #include <algorithm>
 #include <iomanip>
 #include <limits>
@@ -18,6 +20,7 @@
 #include <utility>
 #include <vector>

+#include "dmlc/any.h"
 #include "xgboost/base.h"
 #include "xgboost/data.h"
 #include "xgboost/model.h"
@@ -205,7 +208,7 @@ class LearnerConfiguration : public Learner {
  PredictionContainer cache_;

 protected:
-  bool need_configuration_;
+  std::atomic<bool> need_configuration_;
  std::map<std::string, std::string> cfg_;
  // Stores information like best-iteration for early stopping.
  std::map<std::string, std::string> attributes_;
@@ -214,6 +217,7 @@ class LearnerConfiguration : public Learner {
  LearnerModelParam learner_model_param_;
  LearnerTrainParam tparam_;
  std::vector<std::string> metric_names_;
+  std::mutex config_lock_;

 public:
  explicit LearnerConfiguration(std::vector<std::shared_ptr<DMatrix> > cache)
@@ -226,6 +230,9 @@ class LearnerConfiguration : public Learner {
  // Configuration before data is known.

  void Configure() override {
+    // Variant of double checked lock
+    if (!this->need_configuration_) { return; }
+    std::lock_guard<std::mutex> gard(config_lock_);
    if (!this->need_configuration_) { return; }

    monitor_.Start("Configure");
@@ -1003,6 +1010,23 @@ class LearnerImpl : public LearnerIO {
  XGBAPIThreadLocalEntry& GetThreadLocal() const override {
    return (*XGBAPIThreadLocalStore::Get())[this];
  }
+
+  void InplacePredict(dmlc::any const &x, std::string const &type,
+                      float missing, HostDeviceVector<bst_float> **out_preds,
+                      uint32_t layer_begin = 0, uint32_t layer_end = 0) override {
+    this->Configure();
+    auto& out_predictions = this->GetThreadLocal().prediction_entry;  // Entry from GetThreadLocal().
+    this->gbm_->InplacePredict(x, missing, &out_predictions, layer_begin,
+                               layer_end);
+    if (type == "value") {
+      obj_->PredTransform(&out_predictions.predictions);  // Apply the objective's transform.
+    } else if (type == "margin") {  // Margin output: predictions are left as produced by gbm_.
+    } else {
+      LOG(FATAL) << "Unsupported prediction type:" << type;
+    }
+    *out_preds = &out_predictions.predictions;  // Hands out a pointer into that entry, not a copy.
+  }
+
  const std::map<std::string, std::string>& GetConfigurationArguments() const override {
    return cfg_;
  }
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -2,13 +2,22 @@
 * Copyright by Contributors 2017-2020
 */
 #include <dmlc/omp.h>
+#include <dmlc/any.h>

+#include <cstddef>
+#include <limits>
+#include <mutex>
+
+#include "xgboost/base.h"
+#include "xgboost/data.h"
 #include "xgboost/predictor.h"
 #include "xgboost/tree_model.h"
 #include "xgboost/tree_updater.h"
 #include "xgboost/logging.h"
 #include "xgboost/host_device_vector.h"

+#include "../data/adapter.h"
+#include "../common/math.h"
 #include "../gbm/gbtree_model.h"

 namespace xgboost {
@@ -16,89 +25,156 @@ namespace predictor {

 DMLC_REGISTRY_FILE_TAG(cpu_predictor);

+bst_float PredValue(const SparsePage::Inst &inst,
+                    const std::vector<std::unique_ptr<RegTree>> &trees,
+                    const std::vector<int> &tree_info, int bst_group,
+                    RegTree::FVec *p_feats, unsigned tree_begin,
+                    unsigned tree_end) {  // Sums leaf values over trees [tree_begin, tree_end) of one group.
+  bst_float psum = 0.0f;
+  p_feats->Fill(inst);  // Load the instance into the caller-provided feature buffer.
+  for (size_t i = tree_begin; i < tree_end; ++i) {
+    if (tree_info[i] == bst_group) {  // Skip trees that belong to other groups.
+      int tid = trees[i]->GetLeafIndex(*p_feats);  // Leaf selected for this instance.
+      psum += (*trees[i])[tid].LeafValue();
+    }
+  }
+  p_feats->Drop(inst);  // Presumably resets what Fill set so the buffer can be reused -- confirm.
+  return psum;
+}
+
+template <size_t kUnrollLen = 8>
+struct SparsePageView {  // Batch view over a SparsePage, as consumed by PredictBatchKernel.
+  SparsePage const* page;
+  bst_row_t base_rowid;  // Copied from the page at construction.
+  static size_t constexpr kUnroll = kUnrollLen;
+
+  explicit SparsePageView(SparsePage const *p)
+      : page{p}, base_rowid{page->base_rowid} {
+    // Pull to host before entering omp block, as this is not thread safe.
+    page->data.HostVector();
+    page->offset.HostVector();
+  }
+  SparsePage::Inst operator[](size_t i) { return (*page)[i]; }  // Forwards to the page's operator[].
+  size_t Size() const { return page->Size(); }
+};
+
+template <typename Adapter, size_t kUnrollLen = 8>
+class AdapterView {  // Exposes adapter rows as SparsePage::Inst, skipping missing values.
+  Adapter* adapter_;
+  float missing_;
+  common::Span<Entry> workspace_;  // Scratch entries; indexed per thread and per slot below.
+  std::vector<size_t> current_unroll_;  // Per-thread slot counter, wraps at kUnroll.
+
+ public:
+  static size_t constexpr kUnroll = kUnrollLen;
+
+ public:
+  explicit AdapterView(Adapter *adapter, float missing,
+                       common::Span<Entry> workplace)
+      : adapter_{adapter}, missing_{missing}, workspace_{workplace},
+        current_unroll_(omp_get_max_threads() > 0 ? omp_get_max_threads() : 1, 0) {}
+  SparsePage::Inst operator[](size_t i) {
+    bst_feature_t columns = adapter_->NumColumns();
+    auto const &batch = adapter_->Value();
+    auto row = batch.GetLine(i);
+    auto t = omp_get_thread_num();
+    auto const beg = (columns * kUnroll * t) + (current_unroll_[t] * columns);  // Start of this slot.
+    size_t non_missing {beg};
+    for (size_t c = 0; c < row.Size(); ++c) {
+      auto e = row.GetElement(c);
+      if (missing_ != e.value && !common::CheckNAN(e.value)) {  // Keep values that are not missing/NaN.
+        workspace_[non_missing] =
+            Entry{static_cast<bst_feature_t>(e.column_idx), e.value};
+        ++non_missing;
+      }
+    }
+    auto ret = workspace_.subspan(beg, non_missing - beg);  // View over the entries just written.
+    current_unroll_[t]++;
+    if (current_unroll_[t] == kUnroll) {  // Next call from this thread reuses slot 0.
+      current_unroll_[t] = 0;
+    }
+    return ret;
+  }
+
+  size_t Size() const { return adapter_->NumRows(); }
+
+  bst_row_t const static base_rowid = 0;  // NOLINT
+};
+
+template <typename DataView>
+void PredictBatchKernel(DataView batch, std::vector<bst_float> *out_preds,
+                        gbm::GBTreeModel const &model, int32_t tree_begin,
+                        int32_t tree_end,
+                        std::vector<RegTree::FVec> *p_thread_temp) {  // Adds tree outputs into *out_preds.
+  auto& thread_temp = *p_thread_temp;
+  int32_t const num_group = model.learner_model_param_->num_output_group;
+
+  std::vector<bst_float> &preds = *out_preds;
+  CHECK_EQ(model.param.size_leaf_vector, 0)
+      << "size_leaf_vector is enforced to 0 so far";
+  // parallel over local batch
+  const auto nsize = static_cast<bst_omp_uint>(batch.Size());
+  auto constexpr kUnroll = DataView::kUnroll;
+  const bst_omp_uint rest = nsize % kUnroll;  // Rows left after the full kUnroll-sized chunks.
+  if (nsize >= kUnroll) {
+#pragma omp parallel for schedule(static)
+    for (bst_omp_uint i = 0; i < nsize - rest; i += kUnroll) {  // One chunk of kUnroll rows per step.
+      const int tid = omp_get_thread_num();
+      RegTree::FVec &feats = thread_temp[tid];  // Feature buffer indexed by OpenMP thread id.
+      int64_t ridx[kUnroll];
+      SparsePage::Inst inst[kUnroll];
+      for (size_t k = 0; k < kUnroll; ++k) {
+        ridx[k] = static_cast<int64_t>(batch.base_rowid + i + k);
+      }
+      for (size_t k = 0; k < kUnroll; ++k) {
+        inst[k] = batch[i + k];
+      }
+      for (size_t k = 0; k < kUnroll; ++k) {
+        for (int gid = 0; gid < num_group; ++gid) {
+          const size_t offset = ridx[k] * num_group + gid;  // Output laid out as row * num_group + gid.
+          preds[offset] += PredValue(inst[k], model.trees, model.tree_info, gid,
+                                     &feats, tree_begin, tree_end);
+        }
+      }
+    }
+  }
+  for (bst_omp_uint i = nsize - rest; i < nsize; ++i) {  // Leftover rows, outside the OpenMP loop.
+    RegTree::FVec &feats = thread_temp[0];
+    const auto ridx = static_cast<int64_t>(batch.base_rowid + i);
+    auto inst = batch[i];
+    for (int gid = 0; gid < num_group; ++gid) {
+      const size_t offset = ridx * num_group + gid;
+      preds[offset] += PredValue(inst, model.trees, model.tree_info, gid,
+                                 &feats, tree_begin, tree_end);
+    }
+  }
+}
+
 class CPUPredictor : public Predictor {
 protected:
-  static bst_float PredValue(const SparsePage::Inst& inst,
-                             const std::vector<std::unique_ptr<RegTree>>& trees,
-                             const std::vector<int>& tree_info, int bst_group,
-                             RegTree::FVec* p_feats,
-                             unsigned tree_begin, unsigned tree_end) {
-    bst_float psum = 0.0f;
-    p_feats->Fill(inst);
-    for (size_t i = tree_begin; i < tree_end; ++i) {
-      if (tree_info[i] == bst_group) {
-        int tid = trees[i]->GetLeafIndex(*p_feats);
-        psum += (*trees[i])[tid].LeafValue();
-      }
-    }
-    p_feats->Drop(inst);
-    return psum;
-  }
-
  // init thread buffers
-  inline void InitThreadTemp(int nthread, int num_feature) {
-    int prev_thread_temp_size = thread_temp.size();
+  static void InitThreadTemp(int nthread, int num_feature, std::vector<RegTree::FVec>* out) {
+    int prev_thread_temp_size = out->size();
    if (prev_thread_temp_size < nthread) {
-      thread_temp.resize(nthread, RegTree::FVec());
+      out->resize(nthread, RegTree::FVec());
      for (int i = prev_thread_temp_size; i < nthread; ++i) {
-        thread_temp[i].Init(num_feature);
+        (*out)[i].Init(num_feature);
      }
    }
  }

-  void PredInternal(DMatrix *p_fmat, std::vector<bst_float> *out_preds,
-                    gbm::GBTreeModel const &model, int32_t tree_begin,
-                    int32_t tree_end) {
-    int32_t const num_group = model.learner_model_param_->num_output_group;
-    const int nthread = omp_get_max_threads();
-    InitThreadTemp(nthread, model.learner_model_param_->num_feature);
-    std::vector<bst_float>& preds = *out_preds;
-    CHECK_EQ(model.param.size_leaf_vector, 0)
-        << "size_leaf_vector is enforced to 0 so far";
-    CHECK_EQ(preds.size(), p_fmat->Info().num_row_ * num_group);
-    // start collecting the prediction
-    for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
-      // parallel over local batch
-      constexpr int kUnroll = 8;
-      const auto nsize = static_cast<bst_omp_uint>(batch.Size());
-      const bst_omp_uint rest = nsize % kUnroll;
-      // Pull to host before entering omp block, as this is not thread safe.
-      batch.data.HostVector();
-      batch.offset.HostVector();
-      if (nsize >= kUnroll) {
-#pragma omp parallel for schedule(static)
-        for (bst_omp_uint i = 0; i < nsize - rest; i += kUnroll) {
-          const int tid = omp_get_thread_num();
-          RegTree::FVec& feats = thread_temp[tid];
-          int64_t ridx[kUnroll];
-          SparsePage::Inst inst[kUnroll];
-          for (int k = 0; k < kUnroll; ++k) {
-            ridx[k] = static_cast<int64_t>(batch.base_rowid + i + k);
-          }
-          for (int k = 0; k < kUnroll; ++k) {
-            inst[k] = batch[i + k];
-          }
-          for (int k = 0; k < kUnroll; ++k) {
-            for (int gid = 0; gid < num_group; ++gid) {
-              const size_t offset = ridx[k] * num_group + gid;
-              preds[offset] += this->PredValue(
-                  inst[k], model.trees, model.tree_info, gid,
-                  &feats, tree_begin, tree_end);
-            }
-          }
-        }
-      }
-      for (bst_omp_uint i = nsize - rest; i < nsize; ++i) {
-        RegTree::FVec& feats = thread_temp[0];
-        const auto ridx = static_cast<int64_t>(batch.base_rowid + i);
-        auto inst = batch[i];
-        for (int gid = 0; gid < num_group; ++gid) {
-          const size_t offset = ridx * num_group + gid;
-          preds[offset] +=
-              this->PredValue(inst, model.trees, model.tree_info, gid,
-                              &feats, tree_begin, tree_end);
-        }
-      }
+  void PredictDMatrix(DMatrix *p_fmat, std::vector<bst_float> *out_preds,
+                      gbm::GBTreeModel const &model, int32_t tree_begin,
+                      int32_t tree_end) {
+    std::lock_guard<std::mutex> guard(lock_);  // Held while the member thread_temp_ is in use.
+    const int threads = omp_get_max_threads();
+    InitThreadTemp(threads, model.learner_model_param_->num_feature, &this->thread_temp_);
+    for (auto const& batch : p_fmat->GetBatches<SparsePage>()) {
+      CHECK_EQ(out_preds->size(),
+               p_fmat->Info().num_row_ * model.learner_model_param_->num_output_group);
+      size_t constexpr kUnroll = 8;
+      PredictBatchKernel(SparsePageView<kUnroll>{&batch}, out_preds, model, tree_begin,
+                         tree_end, &thread_temp_);  // Accumulates this page's rows into *out_preds.
     }
   }

@@ -175,9 +251,9 @@ class CPUPredictor : public Predictor {
    CHECK_LE(beg_version, end_version);

    if (beg_version < end_version) {
-      this->PredInternal(dmat, &out_preds->HostVector(), model,
-                         beg_version * output_groups,
-                         end_version * output_groups);
+      this->PredictDMatrix(dmat, &out_preds->HostVector(), model,
+                           beg_version * output_groups,
+                           end_version * output_groups);
    }

    // delta means {size of forest} * {number of newly accumulated layers}
@@ -189,12 +265,49 @@ class CPUPredictor : public Predictor {
          out_preds->Size() == dmat->Info().num_row_);
  }

+  template <typename Adapter>
+  void DispatchedInplacePredict(dmlc::any const &x,
+                                const gbm::GBTreeModel &model, float missing,
+                                PredictionCacheEntry *out_preds,
+                                uint32_t tree_begin, uint32_t tree_end) const {
+    auto threads = omp_get_max_threads();
+    auto m = dmlc::get<Adapter>(x);  // Local adapter object; AdapterView takes a non-const pointer.
+    CHECK_EQ(m.NumColumns(), model.learner_model_param_->num_feature)
+        << "Number of columns in data must equal to trained model.";
+    MetaInfo info;
+    info.num_col_ = m.NumColumns();
+    info.num_row_ = m.NumRows();
+    this->InitOutPredictions(info, &(out_preds->predictions), model);
+    size_t constexpr kUnroll = 8;  // Shared by the workspace size and the view's unroll length.
+    std::vector<Entry> workspace(info.num_col_ * kUnroll * threads);  // kUnroll rows per thread.
+    auto &predictions = out_preds->predictions.HostVector();
+    std::vector<RegTree::FVec> thread_temp;  // Local buffers rather than the member thread_temp_.
+    InitThreadTemp(threads, model.learner_model_param_->num_feature, &thread_temp);
+    PredictBatchKernel(AdapterView<Adapter, kUnroll>(
+                           &m, missing, common::Span<Entry>{workspace}),
+                       &predictions, model, tree_begin, tree_end, &thread_temp);
+  }
+
+  void InplacePredict(dmlc::any const &x, const gbm::GBTreeModel &model,
+                      float missing, PredictionCacheEntry *out_preds,
+                      uint32_t tree_begin, unsigned tree_end) const override {
+    if (x.type() == typeid(data::DenseAdapter)) {  // Dispatch on the adapter type held in `x`.
+      this->DispatchedInplacePredict<data::DenseAdapter>(
+          x, model, missing, out_preds, tree_begin, tree_end);
+    } else if (x.type() == typeid(data::CSRAdapter)) {
+      this->DispatchedInplacePredict<data::CSRAdapter>(
+          x, model, missing, out_preds, tree_begin, tree_end);
+    } else {  // Any other held type is rejected.
+      LOG(FATAL) << "Data type is not supported by CPU Predictor.";
+    }
+  }
+
  void PredictInstance(const SparsePage::Inst& inst,
                       std::vector<bst_float>* out_preds,
                       const gbm::GBTreeModel& model, unsigned ntree_limit) override {
-    if (thread_temp.size() == 0) {
-      thread_temp.resize(1, RegTree::FVec());
-      thread_temp[0].Init(model.learner_model_param_->num_feature);
+    if (thread_temp_.size() == 0) {
+      thread_temp_.resize(1, RegTree::FVec());
+      thread_temp_[0].Init(model.learner_model_param_->num_feature);
    }
    ntree_limit *= model.learner_model_param_->num_output_group;
    if (ntree_limit == 0 || ntree_limit > model.trees.size()) {
@@ -204,16 +317,16 @@ class CPUPredictor : public Predictor {
                      (model.param.size_leaf_vector + 1));
    // loop over output groups
    for (uint32_t gid = 0; gid < model.learner_model_param_->num_output_group; ++gid) {
-      (*out_preds)[gid] =
-          PredValue(inst, model.trees, model.tree_info, gid,
-                    &thread_temp[0], 0, ntree_limit) +
-          model.learner_model_param_->base_score;
+      (*out_preds)[gid] = PredValue(inst, model.trees, model.tree_info, gid,
+                                    &thread_temp_[0], 0, ntree_limit) +
+                          model.learner_model_param_->base_score;
    }
  }
+
  void PredictLeaf(DMatrix* p_fmat, std::vector<bst_float>* out_preds,
                   const gbm::GBTreeModel& model, unsigned ntree_limit) override {
    const int nthread = omp_get_max_threads();
-    InitThreadTemp(nthread, model.learner_model_param_->num_feature);
+    InitThreadTemp(nthread, model.learner_model_param_->num_feature, &this->thread_temp_);
    const MetaInfo& info = p_fmat->Info();
    // number of valid trees
    ntree_limit *= model.learner_model_param_->num_output_group;
@@ -230,7 +343,7 @@ class CPUPredictor : public Predictor {
      for (bst_omp_uint i = 0; i < nsize; ++i) {
        const int tid = omp_get_thread_num();
        auto ridx = static_cast<size_t>(batch.base_rowid + i);
-        RegTree::FVec& feats = thread_temp[tid];
+        RegTree::FVec &feats = thread_temp_[tid];
        feats.Fill(batch[i]);
        for (unsigned j = 0; j < ntree_limit; ++j) {
          int tid = model.trees[j]->GetLeafIndex(feats);
@@ -247,7 +360,7 @@ class CPUPredictor : public Predictor {
                           bool approximate, int condition,
                           unsigned condition_feature) override {
    const int nthread = omp_get_max_threads();
-    InitThreadTemp(nthread,  model.learner_model_param_->num_feature);
+    InitThreadTemp(nthread,  model.learner_model_param_->num_feature, &this->thread_temp_);
    const MetaInfo& info = p_fmat->Info();
    // number of valid trees
    ntree_limit *= model.learner_model_param_->num_output_group;
@@ -277,7 +390,7 @@ class CPUPredictor : public Predictor {
 #pragma omp parallel for schedule(static)
      for (bst_omp_uint i = 0; i < nsize; ++i) {
        auto row_idx = static_cast<size_t>(batch.base_rowid + i);
-        RegTree::FVec& feats = thread_temp[omp_get_thread_num()];
+        RegTree::FVec &feats = thread_temp_[omp_get_thread_num()];
        std::vector<bst_float> this_tree_contribs(ncolumns);
        // loop over all classes
        for (int gid = 0; gid < ngroup; ++gid) {
@@ -359,7 +472,10 @@ class CPUPredictor : public Predictor {
      }
    }
  }
-  std::vector<RegTree::FVec> thread_temp;
+
+ private:
+  std::mutex lock_;
+  std::vector<RegTree::FVec> thread_temp_;
 };

 XGBOOST_REGISTER_PREDICTOR(CPUPredictor, "cpu_predictor")
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -15,6 +15,7 @@

 #include "../gbm/gbtree_model.h"
 #include "../data/ellpack_page.cuh"
+#include "../data/device_adapter.cuh"
 #include "../common/common.h"
 #include "../common/device_helpers.cuh"

@@ -116,6 +117,76 @@ struct EllpackLoader {
  }
 };

+struct CuPyAdapterLoader {  // Reads feature values from a CupyAdapterBatch.
+  data::CupyAdapterBatch batch;
+  bst_feature_t columns;
+  float* smem;  // Points at the block's `extern __shared__` array.
+  bool use_shared;  // When true, each thread's row is cached in `smem`.
+
+  DEV_INLINE CuPyAdapterLoader(data::CupyAdapterBatch const batch, bool use_shared,
+                               bst_feature_t num_features, bst_row_t num_rows, size_t entry_start) :
+    batch{batch},
+    columns{num_features},
+    use_shared{use_shared} {
+      extern __shared__ float _smem[];
+      smem = _smem;
+      if (use_shared) {
+        uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x;
+        size_t shared_elements = blockDim.x * num_features;
+        dh::BlockFill(smem, shared_elements, nanf(""));
+        __syncthreads();
+        if (global_idx < num_rows) {
+          // Widen before multiplying: a 32-bit row * column product can wrap on large inputs.
+          size_t const beg = static_cast<size_t>(global_idx) * columns;
+          for (size_t c = 0; c < columns; ++c) {
+            smem[threadIdx.x * num_features + c] = batch.GetElement(beg + c).value;
+          }
+        }
+      }
+      __syncthreads();
+    }
+
+  DEV_INLINE float GetFvalue(bst_row_t ridx, bst_feature_t fidx) const {
+    if (use_shared) {
+      return smem[threadIdx.x * columns + fidx];
+    }
+    return batch.GetElement(ridx * columns + fidx).value;
+  }
+};
+
+struct CuDFAdapterLoader {  // Reads feature values from a CudfAdapterBatch.
+  data::CudfAdapterBatch batch;
+  bst_feature_t columns;
+  float* smem;  // Points at the block's `extern __shared__` array.
+  bool use_shared;  // When true, each thread's row is cached in `smem`.
+
+  DEV_INLINE CuDFAdapterLoader(data::CudfAdapterBatch const batch, bool use_shared,
+                               bst_feature_t num_features,
+                               bst_row_t num_rows, size_t entry_start)
+      : batch{batch}, columns{num_features}, use_shared{use_shared} {
+    extern __shared__ float _smem[];
+    smem = _smem;
+    if (use_shared) {
+      uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x;
+      size_t shared_elements = blockDim.x * num_features;
+      dh::BlockFill(smem, shared_elements, nanf(""));  // Start from NaN in every cached slot.
+      __syncthreads();
+      if (global_idx < num_rows) {  // Threads past the last row load nothing.
+        for (size_t i = 0; i < columns; ++i) {
+          smem[threadIdx.x * columns + i] = batch.GetValue(global_idx, i);  // Cache this thread's row.
+        }
+      }
+    }
+    __syncthreads();
+  }
+  DEV_INLINE float GetFvalue(bst_row_t ridx, bst_feature_t fidx) const {
+    if (use_shared) {
+      return smem[threadIdx.x * columns + fidx];  // Row cached by this thread in the constructor.
+    }
+    return batch.GetValue(ridx, fidx);  // Read straight from the batch.
+  }
+};
+
 template <typename Loader>
 __device__ float GetLeafWeight(bst_uint ridx, const RegTree::Node* tree,
                               Loader* loader) {
@@ -169,30 +240,61 @@ __global__ void PredictKernel(Data data,
  }
 }

-class GPUPredictor : public xgboost::Predictor {
- private:
-  void InitModel(const gbm::GBTreeModel& model,
+class DeviceModel {  // Device-side copy of trees [tree_beg_, tree_end_) of a GBTreeModel.
+ public:
+  dh::device_vector<RegTree::Node> nodes;  // nodes of the copied trees, concatenated
+  dh::device_vector<size_t> tree_segments;  // prefix sums of per-tree node counts
+  dh::device_vector<int> tree_group;  // device copy of model.tree_info
+  size_t tree_beg_;  // NOLINT
+  size_t tree_end_;  // NOLINT
+  int num_group;  // model.learner_model_param_->num_output_group
+
+  void CopyModel(const gbm::GBTreeModel& model,
                  const thrust::host_vector<size_t>& h_tree_segments,
                  const thrust::host_vector<RegTree::Node>& h_nodes,
                  size_t tree_begin, size_t tree_end) {
-    dh::safe_cuda(cudaSetDevice(generic_param_->gpu_id));
-    nodes_.resize(h_nodes.size());
-    dh::safe_cuda(cudaMemcpyAsync(nodes_.data().get(), h_nodes.data(),
+    nodes.resize(h_nodes.size());  // size the device buffer to match the host copy
+    dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(),
                                   sizeof(RegTree::Node) * h_nodes.size(),
                                   cudaMemcpyHostToDevice));
-    tree_segments_.resize(h_tree_segments.size());
-    dh::safe_cuda(cudaMemcpyAsync(tree_segments_.data().get(), h_tree_segments.data(),
+    tree_segments.resize(h_tree_segments.size());  // one entry per tree plus the leading 0
+    dh::safe_cuda(cudaMemcpyAsync(tree_segments.data().get(), h_tree_segments.data(),
                                   sizeof(size_t) * h_tree_segments.size(),
                                   cudaMemcpyHostToDevice));
-    tree_group_.resize(model.tree_info.size());
-    dh::safe_cuda(cudaMemcpyAsync(tree_group_.data().get(), model.tree_info.data(),
+    tree_group.resize(model.tree_info.size());  // covers all of tree_info, not only the range
+    dh::safe_cuda(cudaMemcpyAsync(tree_group.data().get(), model.tree_info.data(),
                                   sizeof(int) * model.tree_info.size(),
                                   cudaMemcpyHostToDevice));
-    this->tree_begin_ = tree_begin;
+    this->tree_beg_ = tree_begin;  // remember the first tree index that was copied
     this->tree_end_ = tree_end;
-    this->num_group_ = model.learner_model_param_->num_output_group;
+    this->num_group = model.learner_model_param_->num_output_group;  // output groups of the model
   }
 
+  void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) {
+    dh::safe_cuda(cudaSetDevice(gpu_id));  // make gpu_id the current CUDA device
+    CHECK_EQ(model.param.size_leaf_vector, 0);  // abort unless size_leaf_vector is 0
+    // Copy decision trees to device
+    thrust::host_vector<size_t> h_tree_segments{};
+    h_tree_segments.reserve((tree_end - tree_begin) + 1);  // one offset per tree plus the end
+    size_t sum = 0;  // running node count
+    h_tree_segments.push_back(sum);  // the first copied tree starts at offset 0
+    for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
+      sum += model.trees.at(tree_idx)->GetNodes().size();
+      h_tree_segments.push_back(sum);  // offset one past the last node of this tree
+    }
+
+    thrust::host_vector<RegTree::Node> h_nodes(h_tree_segments.back());  // total node count
+    for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
+      auto& src_nodes = model.trees.at(tree_idx)->GetNodes();
+      std::copy(src_nodes.begin(), src_nodes.end(),
+                h_nodes.begin() + h_tree_segments[tree_idx - tree_begin]);  // this tree's offset
+    }
+    CopyModel(model, h_tree_segments, h_nodes, tree_begin, tree_end);  // upload the host buffers
+  }
+};
+
+class GPUPredictor : public xgboost::Predictor {
+ private:
  void PredictInternal(const SparsePage& batch, size_t num_features,
                       HostDeviceVector<bst_float>* predictions,
                       size_t batch_offset) {
@@ -214,10 +316,10 @@ class GPUPredictor : public xgboost::Predictor {
    dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes} (
        PredictKernel<SparsePageLoader, SparsePageView>,
        data,
-        dh::ToSpan(nodes_), predictions->DeviceSpan().subspan(batch_offset),
-        dh::ToSpan(tree_segments_), dh::ToSpan(tree_group_),
-        this->tree_begin_, this->tree_end_, num_features, num_rows,
-        entry_start, use_shared, this->num_group_);
+        dh::ToSpan(model_.nodes), predictions->DeviceSpan().subspan(batch_offset),
+        dh::ToSpan(model_.tree_segments), dh::ToSpan(model_.tree_group),
+        model_.tree_beg_, model_.tree_end_, num_features, num_rows,
+        entry_start, use_shared, model_.num_group);
  }
  void PredictInternal(EllpackDeviceAccessor const& batch, HostDeviceVector<bst_float>* out_preds,
                       size_t batch_offset) {
@@ -230,31 +332,10 @@ class GPUPredictor : public xgboost::Predictor {
    dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS} (
        PredictKernel<EllpackLoader, EllpackDeviceAccessor>,
        batch,
-        dh::ToSpan(nodes_), out_preds->DeviceSpan().subspan(batch_offset),
-        dh::ToSpan(tree_segments_), dh::ToSpan(tree_group_),
-        this->tree_begin_, this->tree_end_, batch.NumFeatures(), num_rows,
-        entry_start, use_shared, this->num_group_);
-  }
-
-  void InitModel(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end) {
-    CHECK_EQ(model.param.size_leaf_vector, 0);
-    // Copy decision trees to device
-    thrust::host_vector<size_t> h_tree_segments{};
-    h_tree_segments.reserve((tree_end - tree_begin) + 1);
-    size_t sum = 0;
-    h_tree_segments.push_back(sum);
-    for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
-      sum += model.trees.at(tree_idx)->GetNodes().size();
-      h_tree_segments.push_back(sum);
-    }
-
-    thrust::host_vector<RegTree::Node> h_nodes(h_tree_segments.back());
-    for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
-      auto& src_nodes = model.trees.at(tree_idx)->GetNodes();
-      std::copy(src_nodes.begin(), src_nodes.end(),
-                h_nodes.begin() + h_tree_segments[tree_idx - tree_begin]);
-    }
-    InitModel(model, h_tree_segments, h_nodes, tree_begin, tree_end);
+        dh::ToSpan(model_.nodes), out_preds->DeviceSpan().subspan(batch_offset),
+        dh::ToSpan(model_.tree_segments), dh::ToSpan(model_.tree_group),
+        model_.tree_beg_, model_.tree_end_, batch.NumFeatures(), num_rows,
+        entry_start, use_shared, model_.num_group);
  }

  void DevicePredictInternal(DMatrix* dmat, HostDeviceVector<float>* out_preds,
@@ -264,8 +345,7 @@ class GPUPredictor : public xgboost::Predictor {
    if (tree_end - tree_begin == 0) {
      return;
    }
-    monitor_.StartCuda("DevicePredictInternal");
-    InitModel(model, tree_begin, tree_end);
+    model_.Init(model, tree_begin, tree_end, generic_param_->gpu_id);
    out_preds->SetDevice(generic_param_->gpu_id);

    if (dmat->PageExists<EllpackPage>()) {
@@ -284,7 +364,6 @@ class GPUPredictor : public xgboost::Predictor {
        batch_offset += batch.Size() * model.learner_model_param_->num_output_group;
      }
    }
-    monitor_.StopCuda("DevicePredictInternal");
  }

 public:
@@ -302,6 +381,7 @@ class GPUPredictor : public xgboost::Predictor {
                    unsigned ntree_limit = 0) override {
    // This function is duplicated with CPU predictor PredictBatch, see comments in there.
    // FIXME(trivialfis): Remove the duplication.
+    std::lock_guard<std::mutex> const guard(lock_);
    int device = generic_param_->gpu_id;
    CHECK_GE(device, 0) << "Set `gpu_id' to positive value for processing GPU data.";
    ConfigureDevice(device);
@@ -348,6 +428,63 @@ class GPUPredictor : public xgboost::Predictor {
          out_preds->Size() == dmat->Info().num_row_);
  }

+  template <typename Adapter, typename Loader, typename Batch>
+  void DispatchedInplacePredict(dmlc::any const &x,
+                                const gbm::GBTreeModel &model, float missing,
+                                PredictionCacheEntry *out_preds,
+                                uint32_t tree_begin, uint32_t tree_end) const {
+    // Runs PredictKernel<Loader, Batch> on the Adapter in `x` using a local DeviceModel.
+    // NOTE(review): `missing` is never referenced in this body -- confirm that is intended.
+    auto const device = this->generic_param_->gpu_id;
+    auto const smem_limit = dh::MaxSharedMemory(device);
+    uint32_t const n_groups = model.learner_model_param_->num_output_group;
+    DeviceModel d_model;
+    d_model.Init(model, tree_begin, tree_end, device);
+
+    auto adapter = dmlc::get<Adapter>(x);
+    CHECK_EQ(adapter.NumColumns(), model.learner_model_param_->num_feature)
+        << "Number of columns in data must equal to trained model.";
+    CHECK_EQ(device, adapter.DeviceIdx())
+        << "XGBoost is running on device: " << device << ", "
+        << "but data is on: " << adapter.DeviceIdx();
+    MetaInfo info;
+    info.num_col_ = adapter.NumColumns();
+    info.num_row_ = adapter.NumRows();
+    this->InitOutPredictions(info, &(out_preds->predictions), model);
+
+    uint32_t const kBlockThreads = 128;
+    auto const n_blocks = static_cast<uint32_t>(common::DivRoundUp(info.num_row_, kBlockThreads));
+    // One float per feature per thread; request no shared memory when that exceeds the limit.
+    auto const wanted_bytes =
+        static_cast<size_t>(sizeof(float) * adapter.NumColumns() * kBlockThreads);
+    bool const use_shared = wanted_bytes <= smem_limit;
+    size_t const shared_memory_bytes = use_shared ? wanted_bytes : 0;
+    size_t entry_start = 0;
+
+    dh::LaunchKernel {n_blocks, kBlockThreads, shared_memory_bytes} (
+        PredictKernel<Loader, Batch>,
+        adapter.Value(),
+        dh::ToSpan(d_model.nodes), out_preds->predictions.DeviceSpan(),
+        dh::ToSpan(d_model.tree_segments), dh::ToSpan(d_model.tree_group),
+        tree_begin, tree_end, adapter.NumColumns(), info.num_row_,
+        entry_start, use_shared, n_groups);
+  }
+
+  void InplacePredict(dmlc::any const &x, const gbm::GBTreeModel &model,
+                      float missing, PredictionCacheEntry *out_preds,
+                      uint32_t tree_begin, unsigned tree_end) const override {
+    // Dispatch on the adapter type held by `x`; any other type is a fatal error.
+    if (x.type() == typeid(data::CupyAdapter)) {
+      this->DispatchedInplacePredict<data::CupyAdapter, CuPyAdapterLoader, data::CupyAdapterBatch>(
+          x, model, missing, out_preds, tree_begin, tree_end);
+    } else if (x.type() == typeid(data::CudfAdapter)) {
+      this->DispatchedInplacePredict<data::CudfAdapter, CuDFAdapterLoader, data::CudfAdapterBatch>(
+          x, model, missing, out_preds, tree_begin, tree_end);
+    } else {
+      LOG(FATAL) << "Only CuPy and CuDF are supported by GPU Predictor.";
+    }
+  }
+
 protected:
  void InitOutPredictions(const MetaInfo& info,
                          HostDeviceVector<bst_float>* out_preds,
@@ -411,14 +548,9 @@ class GPUPredictor : public xgboost::Predictor {
    }
  }

-  common::Monitor monitor_;
-  dh::device_vector<RegTree::Node> nodes_;
-  dh::device_vector<size_t> tree_segments_;
-  dh::device_vector<int> tree_group_;
+  std::mutex lock_;
+  DeviceModel model_;
  size_t max_shared_memory_bytes_;
-  size_t tree_begin_;
-  size_t tree_end_;
-  int num_group_;
 };

 XGBOOST_REGISTER_PREDICTOR(GPUPredictor, "gpu_predictor")
--- a/src/predictor/predictor.cc
+++ b/src/predictor/predictor.cc
@@ -2,8 +2,9 @@
 * Copyright 2017-2020 by Contributors
 */
 #include <dmlc/registry.h>
-#include <xgboost/predictor.h>
+#include <mutex>

+#include "xgboost/predictor.h"
 #include "xgboost/data.h"
 #include "xgboost/generic_parameters.h"

@@ -25,6 +26,7 @@ void PredictionContainer::ClearExpiredEntries() {
 }

 PredictionCacheEntry &PredictionContainer::Cache(std::shared_ptr<DMatrix> m, int32_t device) {
+  std::lock_guard<std::mutex> guard { cache_lock_ };
  this->ClearExpiredEntries();
  container_[m.get()].ref = m;
  if (device != GenericParameter::kCpuId) {