[breaking] Add prediction function for DMatrix and use inplace predict for dask. (#6668)
* Add a new API function for predicting on `DMatrix`. This function aligns with the rest of the `XGBoosterPredictFrom*` functions in the semantics of its function arguments. * Purge `ntree_limit` from libxgboost; use iteration instead. * [dask] Use `inplace_predict` by default for dask sklearn models. * [dask] Run prediction shape inference on the worker instead of the client. The breaking change is in the Python sklearn `apply` function: I made it consistent with the other prediction functions, where `best_iteration` is used by default.
This commit is contained in:
@@ -619,20 +619,58 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
|
||||
CHECK_HANDLE();
|
||||
auto *learner = static_cast<Learner*>(handle);
|
||||
auto& entry = learner->GetThreadLocal().prediction_entry;
|
||||
learner->Predict(
|
||||
*static_cast<std::shared_ptr<DMatrix>*>(dmat),
|
||||
(option_mask & 1) != 0,
|
||||
&entry.predictions, ntree_limit,
|
||||
static_cast<bool>(training),
|
||||
(option_mask & 2) != 0,
|
||||
(option_mask & 4) != 0,
|
||||
(option_mask & 8) != 0,
|
||||
(option_mask & 16) != 0);
|
||||
auto iteration_end = GetIterationFromTreeLimit(ntree_limit, learner);
|
||||
learner->Predict(*static_cast<std::shared_ptr<DMatrix> *>(dmat),
|
||||
(option_mask & 1) != 0, &entry.predictions, 0, iteration_end,
|
||||
static_cast<bool>(training), (option_mask & 2) != 0,
|
||||
(option_mask & 4) != 0, (option_mask & 8) != 0,
|
||||
(option_mask & 16) != 0);
|
||||
*out_result = dmlc::BeginPtr(entry.predictions.ConstHostVector());
|
||||
*len = static_cast<xgboost::bst_ulong>(entry.predictions.Size());
|
||||
API_END();
|
||||
}
|
||||
|
||||
XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle,
|
||||
DMatrixHandle dmat,
|
||||
char const* c_json_config,
|
||||
xgboost::bst_ulong const **out_shape,
|
||||
xgboost::bst_ulong *out_dim,
|
||||
bst_float const **out_result) {
|
||||
API_BEGIN();
|
||||
if (handle == nullptr) {
|
||||
LOG(FATAL) << "Booster has not been intialized or has already been disposed.";
|
||||
}
|
||||
if (dmat == nullptr) {
|
||||
LOG(FATAL) << "DMatrix has not been intialized or has already been disposed.";
|
||||
}
|
||||
auto config = Json::Load(StringView{c_json_config});
|
||||
|
||||
auto *learner = static_cast<Learner*>(handle);
|
||||
auto& entry = learner->GetThreadLocal().prediction_entry;
|
||||
auto p_m = *static_cast<std::shared_ptr<DMatrix> *>(dmat);
|
||||
auto type = PredictionType(get<Integer const>(config["type"]));
|
||||
auto iteration_begin = get<Integer const>(config["iteration_begin"]);
|
||||
auto iteration_end = get<Integer const>(config["iteration_end"]);
|
||||
learner->Predict(
|
||||
*static_cast<std::shared_ptr<DMatrix> *>(dmat),
|
||||
type == PredictionType::kMargin, &entry.predictions, iteration_begin,
|
||||
iteration_end, get<Boolean const>(config["training"]),
|
||||
type == PredictionType::kLeaf, type == PredictionType::kContribution,
|
||||
type == PredictionType::kApproxContribution,
|
||||
type == PredictionType::kInteraction);
|
||||
*out_result = dmlc::BeginPtr(entry.predictions.ConstHostVector());
|
||||
auto &shape = learner->GetThreadLocal().prediction_shape;
|
||||
auto chunksize = p_m->Info().num_row_ == 0 ? 0 : entry.predictions.Size() / p_m->Info().num_row_;
|
||||
auto rounds = iteration_end - iteration_begin;
|
||||
rounds = rounds == 0 ? learner->BoostedRounds() : rounds;
|
||||
// Determine shape
|
||||
bool strict_shape = get<Boolean const>(config["strict_shape"]);
|
||||
CalcPredictShape(strict_shape, type, p_m->Info().num_row_,
|
||||
p_m->Info().num_col_, chunksize, learner->Groups(), rounds,
|
||||
&shape, out_dim);
|
||||
*out_shape = dmlc::BeginPtr(shape);
|
||||
API_END();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void InplacePredictImpl(std::shared_ptr<T> x, std::shared_ptr<DMatrix> p_m,
|
||||
@@ -705,7 +743,7 @@ XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle, char const *indptr,
|
||||
}
|
||||
|
||||
#if !defined(XGBOOST_USE_CUDA)
|
||||
XGB_DLL int XGBoosterPredictFromArrayInterface(
|
||||
XGB_DLL int XGBoosterPredictFromCUDAArray(
|
||||
BoosterHandle handle, char const *c_json_strs, char const *c_json_config,
|
||||
DMatrixHandle m, xgboost::bst_ulong const **out_shape, xgboost::bst_ulong *out_dim,
|
||||
const float **out_result) {
|
||||
@@ -715,7 +753,7 @@ XGB_DLL int XGBoosterPredictFromArrayInterface(
|
||||
API_END();
|
||||
}
|
||||
|
||||
XGB_DLL int XGBoosterPredictFromArrayInterfaceColumns(
|
||||
XGB_DLL int XGBoosterPredictFromCUDAColumnar(
|
||||
BoosterHandle handle, char const *c_json_strs, char const *c_json_config,
|
||||
DMatrixHandle m, xgboost::bst_ulong const **out_shape, xgboost::bst_ulong *out_dim,
|
||||
const float **out_result) {
|
||||
|
||||
@@ -66,8 +66,7 @@ int InplacePreidctCuda(BoosterHandle handle, char const *c_json_strs,
|
||||
API_END();
|
||||
}
|
||||
|
||||
// A hidden API as cache id is not being supported yet.
|
||||
XGB_DLL int XGBoosterPredictFromArrayInterfaceColumns(
|
||||
XGB_DLL int XGBoosterPredictFromCudaColumnar(
|
||||
BoosterHandle handle, char const *c_json_strs, char const *c_json_config,
|
||||
DMatrixHandle m, xgboost::bst_ulong const **out_shape,
|
||||
xgboost::bst_ulong *out_dim, const float **out_result) {
|
||||
@@ -79,8 +78,7 @@ XGB_DLL int XGBoosterPredictFromArrayInterfaceColumns(
|
||||
handle, c_json_strs, c_json_config, p_m, out_shape, out_dim, out_result);
|
||||
}
|
||||
|
||||
// A hidden API as cache id is not being supported yet.
|
||||
XGB_DLL int XGBoosterPredictFromArrayInterface(
|
||||
XGB_DLL int XGBoosterPredictFromCudaArray(
|
||||
BoosterHandle handle, char const *c_json_strs, char const *c_json_config,
|
||||
DMatrixHandle m, xgboost::bst_ulong const **out_shape,
|
||||
xgboost::bst_ulong *out_dim, const float **out_result) {
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "xgboost/logging.h"
|
||||
#include "xgboost/json.h"
|
||||
#include "xgboost/learner.h"
|
||||
|
||||
namespace xgboost {
|
||||
@@ -30,8 +31,8 @@ inline void CalcPredictShape(bool strict_shape, PredictionType type, size_t rows
|
||||
std::vector<bst_ulong> *out_shape,
|
||||
xgboost::bst_ulong *out_dim) {
|
||||
auto &shape = *out_shape;
|
||||
if ((type == PredictionType::kMargin || type == PredictionType::kValue) &&
|
||||
rows != 0) {
|
||||
if (type == PredictionType::kMargin && rows != 0) {
|
||||
// When kValue is used, softmax can change the chunksize.
|
||||
CHECK_EQ(chunksize, groups);
|
||||
}
|
||||
|
||||
@@ -110,5 +111,35 @@ inline void CalcPredictShape(bool strict_shape, PredictionType type, size_t rows
|
||||
std::accumulate(shape.cbegin(), shape.cend(), 1, std::multiplies<>{}),
|
||||
chunksize * rows);
|
||||
}
|
||||
|
||||
// Reverse the ntree_limit in old prediction API.
|
||||
inline uint32_t GetIterationFromTreeLimit(uint32_t ntree_limit, Learner *learner) {
|
||||
// On Python and R, `best_ntree_limit` is set to `best_iteration * num_parallel_tree`.
|
||||
// To reverse it we just divide it by `num_parallel_tree`.
|
||||
if (ntree_limit != 0) {
|
||||
learner->Configure();
|
||||
uint32_t num_parallel_tree = 0;
|
||||
|
||||
Json config{Object()};
|
||||
learner->SaveConfig(&config);
|
||||
auto const &booster =
|
||||
get<String const>(config["learner"]["gradient_booster"]["name"]);
|
||||
if (booster == "gblinear") {
|
||||
num_parallel_tree = 0;
|
||||
} else if (booster == "dart") {
|
||||
num_parallel_tree = std::stoi(
|
||||
get<String const>(config["learner"]["gradient_booster"]["gbtree"]
|
||||
["gbtree_train_param"]["num_parallel_tree"]));
|
||||
} else if (booster == "gbtree") {
|
||||
num_parallel_tree = std::stoi(get<String const>(
|
||||
(config["learner"]["gradient_booster"]["gbtree_train_param"]
|
||||
["num_parallel_tree"])));
|
||||
} else {
|
||||
LOG(FATAL) << "Unknown booster:" << booster;
|
||||
}
|
||||
ntree_limit /= std::max(num_parallel_tree, 1u);
|
||||
}
|
||||
return ntree_limit;
|
||||
}
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_C_API_C_API_UTILS_H_
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
#include "common/config.h"
|
||||
#include "common/io.h"
|
||||
#include "common/version.h"
|
||||
#include "c_api/c_api_utils.h"
|
||||
|
||||
namespace xgboost {
|
||||
enum CLITask {
|
||||
@@ -58,6 +59,8 @@ struct CLIParam : public XGBoostParameter<CLIParam> {
|
||||
int dsplit;
|
||||
/*!\brief limit number of trees in prediction */
|
||||
int ntree_limit;
|
||||
int iteration_begin;
|
||||
int iteration_end;
|
||||
/*!\brief whether to directly output margin value */
|
||||
bool pred_margin;
|
||||
/*! \brief whether dump statistics along with model */
|
||||
@@ -109,7 +112,11 @@ struct CLIParam : public XGBoostParameter<CLIParam> {
|
||||
.add_enum("row", 2)
|
||||
.describe("Data split mode.");
|
||||
DMLC_DECLARE_FIELD(ntree_limit).set_default(0).set_lower_bound(0)
|
||||
.describe("Number of trees used for prediction, 0 means use all trees.");
|
||||
.describe("(Deprecated) Use iteration_begin/iteration_end instead.");
|
||||
DMLC_DECLARE_FIELD(iteration_begin).set_default(0).set_lower_bound(0)
|
||||
.describe("Begining of boosted tree iteration used for prediction.");
|
||||
DMLC_DECLARE_FIELD(iteration_end).set_default(0).set_lower_bound(0)
|
||||
.describe("End of boosted tree iteration used for prediction. 0 means all the trees.");
|
||||
DMLC_DECLARE_FIELD(pred_margin).set_default(false)
|
||||
.describe("Whether to predict margin value instead of probability.");
|
||||
DMLC_DECLARE_FIELD(dump_stats).set_default(false)
|
||||
@@ -334,7 +341,13 @@ class CLI {
|
||||
|
||||
LOG(INFO) << "Start prediction...";
|
||||
HostDeviceVector<bst_float> preds;
|
||||
learner_->Predict(dtest, param_.pred_margin, &preds, param_.ntree_limit);
|
||||
if (param_.ntree_limit != 0) {
|
||||
param_.iteration_end = GetIterationFromTreeLimit(param_.ntree_limit, learner_.get());
|
||||
LOG(WARNING) << "`ntree_limit` is deprecated, use `iteration_begin` and "
|
||||
"`iteration_end` instead.";
|
||||
}
|
||||
learner_->Predict(dtest, param_.pred_margin, &preds, param_.iteration_begin,
|
||||
param_.iteration_end);
|
||||
LOG(CONSOLE) << "Writing prediction to " << param_.name_pred;
|
||||
|
||||
std::unique_ptr<dmlc::Stream> fo(
|
||||
|
||||
@@ -47,6 +47,12 @@ struct GBLinearTrainParam : public XGBoostParameter<GBLinearTrainParam> {
|
||||
.describe("Maximum rows per batch.");
|
||||
}
|
||||
};
|
||||
|
||||
void LinearCheckLayer(unsigned layer_begin, unsigned layer_end) {
|
||||
CHECK_EQ(layer_begin, 0) << "Linear booster does not support prediction range.";
|
||||
CHECK_EQ(layer_end, 0) << "Linear booster does not support prediction range.";
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief gradient boosted linear model
|
||||
*/
|
||||
@@ -130,20 +136,19 @@ class GBLinear : public GradientBooster {
|
||||
monitor_.Stop("DoBoost");
|
||||
}
|
||||
|
||||
void PredictBatch(DMatrix *p_fmat,
|
||||
PredictionCacheEntry *predts,
|
||||
bool, unsigned ntree_limit) override {
|
||||
void PredictBatch(DMatrix *p_fmat, PredictionCacheEntry *predts,
|
||||
bool training, unsigned layer_begin, unsigned layer_end) override {
|
||||
monitor_.Start("PredictBatch");
|
||||
LinearCheckLayer(layer_begin, layer_end);
|
||||
auto* out_preds = &predts->predictions;
|
||||
CHECK_EQ(ntree_limit, 0U)
|
||||
<< "GBLinear::Predict ntrees is only valid for gbtree predictor";
|
||||
this->PredictBatchInternal(p_fmat, &out_preds->HostVector());
|
||||
monitor_.Stop("PredictBatch");
|
||||
}
|
||||
// add base margin
|
||||
void PredictInstance(const SparsePage::Inst &inst,
|
||||
std::vector<bst_float> *out_preds,
|
||||
unsigned) override {
|
||||
unsigned layer_begin, unsigned layer_end) override {
|
||||
LinearCheckLayer(layer_begin, layer_end);
|
||||
const int ngroup = model_.learner_model_param->num_output_group;
|
||||
for (int gid = 0; gid < ngroup; ++gid) {
|
||||
this->Pred(inst, dmlc::BeginPtr(*out_preds), gid,
|
||||
@@ -151,16 +156,15 @@ class GBLinear : public GradientBooster {
|
||||
}
|
||||
}
|
||||
|
||||
void PredictLeaf(DMatrix *, HostDeviceVector<bst_float> *, unsigned) override {
|
||||
void PredictLeaf(DMatrix *, HostDeviceVector<bst_float> *, unsigned, unsigned) override {
|
||||
LOG(FATAL) << "gblinear does not support prediction of leaf index";
|
||||
}
|
||||
|
||||
void PredictContribution(DMatrix* p_fmat,
|
||||
HostDeviceVector<bst_float>* out_contribs,
|
||||
unsigned ntree_limit, bool, int, unsigned) override {
|
||||
unsigned layer_begin, unsigned layer_end, bool, int, unsigned) override {
|
||||
model_.LazyInitModel();
|
||||
CHECK_EQ(ntree_limit, 0U)
|
||||
<< "GBLinear::PredictContribution: ntrees is only valid for gbtree predictor";
|
||||
LinearCheckLayer(layer_begin, layer_end);
|
||||
const auto& base_margin = p_fmat->Info().base_margin_.ConstHostVector();
|
||||
const int ngroup = model_.learner_model_param->num_output_group;
|
||||
const size_t ncolumns = model_.learner_model_param->num_feature + 1;
|
||||
@@ -197,7 +201,8 @@ class GBLinear : public GradientBooster {
|
||||
|
||||
void PredictInteractionContributions(DMatrix* p_fmat,
|
||||
HostDeviceVector<bst_float>* out_contribs,
|
||||
unsigned, bool) override {
|
||||
unsigned layer_begin, unsigned layer_end, bool) override {
|
||||
LinearCheckLayer(layer_begin, layer_end);
|
||||
std::vector<bst_float>& contribs = out_contribs->HostVector();
|
||||
|
||||
// linear models have no interaction effects
|
||||
|
||||
@@ -414,7 +414,7 @@ void GBTree::Slice(int32_t layer_begin, int32_t layer_end, int32_t step,
|
||||
auto layer_trees = this->LayerTrees();
|
||||
|
||||
layer_end = layer_end == 0 ? model_.trees.size() / layer_trees : layer_end;
|
||||
CHECK_GE(layer_end, layer_begin);
|
||||
CHECK_GT(layer_end, layer_begin);
|
||||
CHECK_GE(step, 1);
|
||||
int32_t n_layers = (layer_end - layer_begin) / step;
|
||||
std::vector<std::unique_ptr<RegTree>> &out_trees = out_model.trees;
|
||||
@@ -438,10 +438,35 @@ void GBTree::Slice(int32_t layer_begin, int32_t layer_end, int32_t step,
|
||||
void GBTree::PredictBatch(DMatrix* p_fmat,
|
||||
PredictionCacheEntry* out_preds,
|
||||
bool,
|
||||
unsigned ntree_limit) {
|
||||
unsigned layer_begin,
|
||||
unsigned layer_end) {
|
||||
CHECK(configured_);
|
||||
if (layer_end == 0) {
|
||||
layer_end = this->BoostedRounds();
|
||||
}
|
||||
if (layer_begin != 0 || layer_end < out_preds->version) {
|
||||
// cache is dropped.
|
||||
out_preds->version = 0;
|
||||
}
|
||||
bool reset = false;
|
||||
if (layer_begin == 0) {
|
||||
layer_begin = out_preds->version;
|
||||
} else {
|
||||
// When begin layer is not 0, the cache is not useful.
|
||||
reset = true;
|
||||
}
|
||||
|
||||
uint32_t tree_begin, tree_end;
|
||||
std::tie(tree_begin, tree_end) =
|
||||
detail::LayerToTree(model_, tparam_, layer_begin, layer_end);
|
||||
GetPredictor(&out_preds->predictions, p_fmat)
|
||||
->PredictBatch(p_fmat, out_preds, model_, 0, ntree_limit);
|
||||
->PredictBatch(p_fmat, out_preds, model_, tree_begin, tree_end);
|
||||
if (reset) {
|
||||
out_preds->version = 0;
|
||||
} else {
|
||||
uint32_t delta = layer_end - out_preds->version;
|
||||
out_preds->Update(delta);
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<Predictor> const &
|
||||
@@ -603,13 +628,14 @@ class Dart : public GBTree {
|
||||
void PredictBatch(DMatrix* p_fmat,
|
||||
PredictionCacheEntry* p_out_preds,
|
||||
bool training,
|
||||
unsigned ntree_limit) override {
|
||||
unsigned layer_begin,
|
||||
unsigned layer_end) override {
|
||||
DropTrees(training);
|
||||
int num_group = model_.learner_model_param->num_output_group;
|
||||
ntree_limit *= num_group;
|
||||
if (ntree_limit == 0 || ntree_limit > model_.trees.size()) {
|
||||
ntree_limit = static_cast<unsigned>(model_.trees.size());
|
||||
}
|
||||
uint32_t tree_begin, tree_end;
|
||||
std::tie(tree_begin, tree_end) =
|
||||
detail::LayerToTree(model_, tparam_, layer_begin, layer_end);
|
||||
|
||||
size_t n = num_group * p_fmat->Info().num_row_;
|
||||
const auto &base_margin = p_fmat->Info().base_margin_.ConstHostVector();
|
||||
auto& out_preds = p_out_preds->predictions.HostVector();
|
||||
@@ -623,26 +649,24 @@ class Dart : public GBTree {
|
||||
}
|
||||
const int nthread = omp_get_max_threads();
|
||||
InitThreadTemp(nthread);
|
||||
PredLoopSpecalize(p_fmat, &out_preds, num_group, 0, ntree_limit);
|
||||
PredLoopSpecalize(p_fmat, &out_preds, num_group, tree_begin, tree_end);
|
||||
}
|
||||
|
||||
void PredictInstance(const SparsePage::Inst &inst,
|
||||
std::vector<bst_float> *out_preds,
|
||||
unsigned ntree_limit) override {
|
||||
unsigned layer_begin, unsigned layer_end) override {
|
||||
DropTrees(false);
|
||||
if (thread_temp_.size() == 0) {
|
||||
thread_temp_.resize(1, RegTree::FVec());
|
||||
thread_temp_[0].Init(model_.learner_model_param->num_feature);
|
||||
}
|
||||
out_preds->resize(model_.learner_model_param->num_output_group);
|
||||
ntree_limit *= model_.learner_model_param->num_output_group;
|
||||
if (ntree_limit == 0 || ntree_limit > model_.trees.size()) {
|
||||
ntree_limit = static_cast<unsigned>(model_.trees.size());
|
||||
}
|
||||
uint32_t tree_begin, tree_end;
|
||||
std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, tparam_, layer_begin, layer_end);
|
||||
// loop over output groups
|
||||
for (uint32_t gid = 0; gid < model_.learner_model_param->num_output_group; ++gid) {
|
||||
(*out_preds)[gid] =
|
||||
PredValue(inst, gid, &thread_temp_[0], 0, ntree_limit) +
|
||||
PredValue(inst, gid, &thread_temp_[0], 0, tree_end) +
|
||||
model_.learner_model_param->base_score;
|
||||
}
|
||||
}
|
||||
@@ -653,22 +677,25 @@ class Dart : public GBTree {
|
||||
|
||||
void PredictContribution(DMatrix* p_fmat,
|
||||
HostDeviceVector<bst_float>* out_contribs,
|
||||
unsigned ntree_limit, bool approximate, int,
|
||||
unsigned layer_begin, unsigned layer_end, bool approximate, int,
|
||||
unsigned) override {
|
||||
CHECK(configured_);
|
||||
uint32_t tree_begin, tree_end;
|
||||
std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, tparam_, layer_begin, layer_end);
|
||||
cpu_predictor_->PredictContribution(p_fmat, out_contribs, model_,
|
||||
ntree_limit, &weight_drop_, approximate);
|
||||
tree_end, &weight_drop_, approximate);
|
||||
}
|
||||
|
||||
void PredictInteractionContributions(DMatrix* p_fmat,
|
||||
HostDeviceVector<bst_float>* out_contribs,
|
||||
unsigned ntree_limit, bool approximate) override {
|
||||
void PredictInteractionContributions(
|
||||
DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
|
||||
unsigned layer_begin, unsigned layer_end, bool approximate) override {
|
||||
CHECK(configured_);
|
||||
uint32_t tree_begin, tree_end;
|
||||
std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, tparam_, layer_begin, layer_end);
|
||||
cpu_predictor_->PredictInteractionContributions(p_fmat, out_contribs, model_,
|
||||
ntree_limit, &weight_drop_, approximate);
|
||||
tree_end, &weight_drop_, approximate);
|
||||
}
|
||||
|
||||
|
||||
protected:
|
||||
inline void PredLoopSpecalize(
|
||||
DMatrix* p_fmat,
|
||||
|
||||
@@ -164,7 +164,9 @@ inline std::pair<uint32_t, uint32_t> LayerToTree(gbm::GBTreeModel const &model,
|
||||
if (tree_end == 0) {
|
||||
tree_end = static_cast<uint32_t>(model.trees.size());
|
||||
}
|
||||
CHECK_LT(tree_begin, tree_end);
|
||||
if (model.trees.size() != 0) {
|
||||
CHECK_LE(tree_begin, tree_end);
|
||||
}
|
||||
return {tree_begin, tree_end};
|
||||
}
|
||||
|
||||
@@ -260,10 +262,8 @@ class GBTree : public GradientBooster {
|
||||
return model_.trees.size() / this->LayerTrees();
|
||||
}
|
||||
|
||||
void PredictBatch(DMatrix* p_fmat,
|
||||
PredictionCacheEntry* out_preds,
|
||||
bool training,
|
||||
unsigned ntree_limit) override;
|
||||
void PredictBatch(DMatrix *p_fmat, PredictionCacheEntry *out_preds,
|
||||
bool training, unsigned layer_begin, unsigned layer_end) override;
|
||||
|
||||
void InplacePredict(dmlc::any const &x, std::shared_ptr<DMatrix> p_m,
|
||||
float missing, PredictionCacheEntry *out_preds,
|
||||
@@ -297,33 +297,49 @@ class GBTree : public GradientBooster {
|
||||
|
||||
void PredictInstance(const SparsePage::Inst& inst,
|
||||
std::vector<bst_float>* out_preds,
|
||||
unsigned ntree_limit) override {
|
||||
uint32_t layer_begin, uint32_t layer_end) override {
|
||||
CHECK(configured_);
|
||||
uint32_t tree_begin, tree_end;
|
||||
std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, tparam_, layer_begin, layer_end);
|
||||
cpu_predictor_->PredictInstance(inst, out_preds, model_,
|
||||
ntree_limit);
|
||||
tree_end);
|
||||
}
|
||||
|
||||
void PredictLeaf(DMatrix* p_fmat,
|
||||
HostDeviceVector<bst_float>* out_preds,
|
||||
unsigned ntree_limit) override {
|
||||
this->GetPredictor()->PredictLeaf(p_fmat, out_preds, model_, ntree_limit);
|
||||
uint32_t layer_begin, uint32_t layer_end) override {
|
||||
uint32_t tree_begin, tree_end;
|
||||
std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, tparam_, layer_begin, layer_end);
|
||||
CHECK_EQ(tree_begin, 0) << "Predict leaf supports only iteration end: (0, "
|
||||
"n_iteration), use model slicing instead.";
|
||||
this->GetPredictor()->PredictLeaf(p_fmat, out_preds, model_, tree_end);
|
||||
}
|
||||
|
||||
void PredictContribution(DMatrix* p_fmat,
|
||||
HostDeviceVector<bst_float>* out_contribs,
|
||||
unsigned ntree_limit, bool approximate,
|
||||
uint32_t layer_begin, uint32_t layer_end, bool approximate,
|
||||
int, unsigned) override {
|
||||
CHECK(configured_);
|
||||
uint32_t tree_begin, tree_end;
|
||||
std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, tparam_, layer_begin, layer_end);
|
||||
CHECK_EQ(tree_begin, 0)
|
||||
<< "Predict contribution supports only iteration end: (0, "
|
||||
"n_iteration), using model slicing instead.";
|
||||
this->GetPredictor()->PredictContribution(
|
||||
p_fmat, out_contribs, model_, ntree_limit, nullptr, approximate);
|
||||
p_fmat, out_contribs, model_, tree_end, nullptr, approximate);
|
||||
}
|
||||
|
||||
void PredictInteractionContributions(DMatrix* p_fmat,
|
||||
HostDeviceVector<bst_float>* out_contribs,
|
||||
unsigned ntree_limit, bool approximate) override {
|
||||
void PredictInteractionContributions(
|
||||
DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
|
||||
uint32_t layer_begin, uint32_t layer_end, bool approximate) override {
|
||||
CHECK(configured_);
|
||||
this->GetPredictor()->PredictInteractionContributions(p_fmat, out_contribs, model_,
|
||||
ntree_limit, nullptr, approximate);
|
||||
uint32_t tree_begin, tree_end;
|
||||
std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, tparam_, layer_begin, layer_end);
|
||||
CHECK_EQ(tree_begin, 0)
|
||||
<< "Predict interaction contribution supports only iteration end: (0, "
|
||||
"n_iteration), using model slicing instead.";
|
||||
this->GetPredictor()->PredictInteractionContributions(
|
||||
p_fmat, out_contribs, model_, tree_end, nullptr, approximate);
|
||||
}
|
||||
|
||||
std::vector<std::string> DumpModel(const FeatureMap& fmap,
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
|
||||
#include "dmlc/any.h"
|
||||
#include "xgboost/base.h"
|
||||
#include "xgboost/c_api.h"
|
||||
#include "xgboost/data.h"
|
||||
#include "xgboost/model.h"
|
||||
#include "xgboost/predictor.h"
|
||||
@@ -996,7 +997,7 @@ class LearnerImpl : public LearnerIO {
|
||||
auto& predt = local_cache->Cache(train, generic_parameters_.gpu_id);
|
||||
|
||||
monitor_.Start("PredictRaw");
|
||||
this->PredictRaw(train.get(), &predt, true);
|
||||
this->PredictRaw(train.get(), &predt, true, 0, 0);
|
||||
TrainingObserver::Instance().Observe(predt.predictions, "Predictions");
|
||||
monitor_.Stop("PredictRaw");
|
||||
|
||||
@@ -1057,7 +1058,7 @@ class LearnerImpl : public LearnerIO {
|
||||
std::shared_ptr<DMatrix> m = data_sets[i];
|
||||
auto &predt = local_cache->Cache(m, generic_parameters_.gpu_id);
|
||||
this->ValidateDMatrix(m.get(), false);
|
||||
this->PredictRaw(m.get(), &predt, false);
|
||||
this->PredictRaw(m.get(), &predt, false, 0, 0);
|
||||
|
||||
auto &out = output_predictions_.Cache(m, generic_parameters_.gpu_id).predictions;
|
||||
out.Resize(predt.predictions.Size());
|
||||
@@ -1075,8 +1076,8 @@ class LearnerImpl : public LearnerIO {
|
||||
}
|
||||
|
||||
void Predict(std::shared_ptr<DMatrix> data, bool output_margin,
|
||||
HostDeviceVector<bst_float>* out_preds, unsigned ntree_limit,
|
||||
bool training,
|
||||
HostDeviceVector<bst_float> *out_preds, unsigned layer_begin,
|
||||
unsigned layer_end, bool training,
|
||||
bool pred_leaf, bool pred_contribs, bool approx_contribs,
|
||||
bool pred_interactions) override {
|
||||
int multiple_predictions = static_cast<int>(pred_leaf) +
|
||||
@@ -1085,16 +1086,16 @@ class LearnerImpl : public LearnerIO {
|
||||
this->Configure();
|
||||
CHECK_LE(multiple_predictions, 1) << "Perform one kind of prediction at a time.";
|
||||
if (pred_contribs) {
|
||||
gbm_->PredictContribution(data.get(), out_preds, ntree_limit, approx_contribs);
|
||||
gbm_->PredictContribution(data.get(), out_preds, layer_begin, layer_end, approx_contribs);
|
||||
} else if (pred_interactions) {
|
||||
gbm_->PredictInteractionContributions(data.get(), out_preds, ntree_limit,
|
||||
gbm_->PredictInteractionContributions(data.get(), out_preds, layer_begin, layer_end,
|
||||
approx_contribs);
|
||||
} else if (pred_leaf) {
|
||||
gbm_->PredictLeaf(data.get(), out_preds, ntree_limit);
|
||||
gbm_->PredictLeaf(data.get(), out_preds, layer_begin, layer_end);
|
||||
} else {
|
||||
auto local_cache = this->GetPredictionCache();
|
||||
auto& prediction = local_cache->Cache(data, generic_parameters_.gpu_id);
|
||||
this->PredictRaw(data.get(), &prediction, training, ntree_limit);
|
||||
this->PredictRaw(data.get(), &prediction, training, layer_begin, layer_end);
|
||||
// Copy the prediction cache to output prediction. out_preds comes from C API
|
||||
out_preds->SetDevice(generic_parameters_.gpu_id);
|
||||
out_preds->Resize(prediction.predictions.Size());
|
||||
@@ -1151,12 +1152,11 @@ class LearnerImpl : public LearnerIO {
|
||||
* predictor, when it equals 0, this means we are using all the trees
|
||||
* \param training allow dropout when the DART booster is being used
|
||||
*/
|
||||
void PredictRaw(DMatrix* data, PredictionCacheEntry* out_preds,
|
||||
bool training,
|
||||
unsigned ntree_limit = 0) const {
|
||||
void PredictRaw(DMatrix *data, PredictionCacheEntry *out_preds, bool training,
|
||||
unsigned layer_begin, unsigned layer_end) const {
|
||||
CHECK(gbm_ != nullptr) << "Predict must happen after Load or configuration";
|
||||
this->ValidateDMatrix(data, false);
|
||||
gbm_->PredictBatch(data, out_preds, training, ntree_limit);
|
||||
gbm_->PredictBatch(data, out_preds, training, layer_begin, layer_end);
|
||||
}
|
||||
|
||||
void ValidateDMatrix(DMatrix* p_fmat, bool is_training) const {
|
||||
|
||||
@@ -234,56 +234,28 @@ class CPUPredictor : public Predictor {
|
||||
public:
|
||||
explicit CPUPredictor(GenericParameter const* generic_param) :
|
||||
Predictor::Predictor{generic_param} {}
|
||||
// ntree_limit is a very problematic parameter, as it's ambiguous in the context of
|
||||
// multi-output and forest. Same problem exists for tree_begin
|
||||
void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts,
|
||||
const gbm::GBTreeModel& model, int tree_begin,
|
||||
uint32_t const ntree_limit = 0) const override {
|
||||
// tree_begin is not used, right now we just enforce it to be 0.
|
||||
CHECK_EQ(tree_begin, 0);
|
||||
void PredictBatch(DMatrix *dmat, PredictionCacheEntry *predts,
|
||||
const gbm::GBTreeModel &model, uint32_t tree_begin,
|
||||
uint32_t tree_end = 0) const override {
|
||||
auto* out_preds = &predts->predictions;
|
||||
CHECK_GE(predts->version, tree_begin);
|
||||
if (out_preds->Size() == 0 && dmat->Info().num_row_ != 0) {
|
||||
CHECK_EQ(predts->version, 0);
|
||||
}
|
||||
// This is actually already handled in gbm, but large amount of tests rely on the
|
||||
// behaviour.
|
||||
if (tree_end == 0) {
|
||||
tree_end = model.trees.size();
|
||||
}
|
||||
if (predts->version == 0) {
|
||||
// out_preds->Size() can be non-zero as it's initialized here before any tree is
|
||||
// built at the 0^th iterator.
|
||||
this->InitOutPredictions(dmat->Info(), out_preds, model);
|
||||
}
|
||||
|
||||
uint32_t const output_groups = model.learner_model_param->num_output_group;
|
||||
CHECK_NE(output_groups, 0);
|
||||
// Right now we just assume ntree_limit provided by users means number of tree layers
|
||||
// in the context of multi-output model
|
||||
uint32_t real_ntree_limit = ntree_limit * output_groups;
|
||||
if (real_ntree_limit == 0 || real_ntree_limit > model.trees.size()) {
|
||||
real_ntree_limit = static_cast<uint32_t>(model.trees.size());
|
||||
if (tree_end - tree_begin == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t const end_version = (tree_begin + real_ntree_limit) / output_groups;
|
||||
// When users have provided ntree_limit, end_version can be lesser, cache is violated
|
||||
if (predts->version > end_version) {
|
||||
CHECK_NE(ntree_limit, 0);
|
||||
this->InitOutPredictions(dmat->Info(), out_preds, model);
|
||||
predts->version = 0;
|
||||
}
|
||||
uint32_t const beg_version = predts->version;
|
||||
CHECK_LE(beg_version, end_version);
|
||||
|
||||
if (beg_version < end_version) {
|
||||
this->PredictDMatrix(dmat, &out_preds->HostVector(), model,
|
||||
beg_version * output_groups,
|
||||
end_version * output_groups);
|
||||
}
|
||||
|
||||
// delta means {size of forest} * {number of newly accumulated layers}
|
||||
uint32_t delta = end_version - beg_version;
|
||||
CHECK_LE(delta, model.trees.size());
|
||||
predts->Update(delta);
|
||||
|
||||
CHECK(out_preds->Size() == output_groups * dmat->Info().num_row_ ||
|
||||
out_preds->Size() == dmat->Info().num_row_);
|
||||
this->PredictDMatrix(dmat, &out_preds->HostVector(), model, tree_begin,
|
||||
tree_end);
|
||||
}
|
||||
|
||||
template <typename Adapter>
|
||||
@@ -362,7 +334,6 @@ class CPUPredictor : public Predictor {
|
||||
InitThreadTemp(nthread, model.learner_model_param->num_feature, &feat_vecs);
|
||||
const MetaInfo& info = p_fmat->Info();
|
||||
// number of valid trees
|
||||
ntree_limit *= model.learner_model_param->num_output_group;
|
||||
if (ntree_limit == 0 || ntree_limit > model.trees.size()) {
|
||||
ntree_limit = static_cast<unsigned>(model.trees.size());
|
||||
}
|
||||
@@ -398,7 +369,6 @@ class CPUPredictor : public Predictor {
|
||||
InitThreadTemp(nthread, model.learner_model_param->num_feature, &feat_vecs);
|
||||
const MetaInfo& info = p_fmat->Info();
|
||||
// number of valid trees
|
||||
ntree_limit *= model.learner_model_param->num_output_group;
|
||||
if (ntree_limit == 0 || ntree_limit > model.trees.size()) {
|
||||
ntree_limit = static_cast<unsigned>(model.trees.size());
|
||||
}
|
||||
|
||||
@@ -536,6 +536,7 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
const uint32_t BLOCK_THREADS = 256;
|
||||
size_t num_rows = batch.n_rows;
|
||||
auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(num_rows, BLOCK_THREADS));
|
||||
DeviceModel d_model;
|
||||
|
||||
bool use_shared = false;
|
||||
size_t entry_start = 0;
|
||||
@@ -593,54 +594,27 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
}
|
||||
|
||||
void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts,
|
||||
const gbm::GBTreeModel& model, int tree_begin,
|
||||
unsigned ntree_limit = 0) const override {
|
||||
// This function is duplicated with CPU predictor PredictBatch, see comments in there.
|
||||
// FIXME(trivialfis): Remove the duplication.
|
||||
const gbm::GBTreeModel& model, uint32_t tree_begin,
|
||||
uint32_t tree_end = 0) const override {
|
||||
int device = generic_param_->gpu_id;
|
||||
CHECK_GE(device, 0) << "Set `gpu_id' to positive value for processing GPU data.";
|
||||
ConfigureDevice(device);
|
||||
|
||||
CHECK_EQ(tree_begin, 0);
|
||||
auto* out_preds = &predts->predictions;
|
||||
CHECK_GE(predts->version, tree_begin);
|
||||
|
||||
if (out_preds->Size() == 0 && dmat->Info().num_row_ != 0) {
|
||||
CHECK_EQ(predts->version, 0);
|
||||
}
|
||||
if (tree_end == 0) {
|
||||
tree_end = model.trees.size();
|
||||
}
|
||||
if (predts->version == 0) {
|
||||
// out_preds->Size() can be non-zero as it's initialized here before any tree is
|
||||
// built at the 0^th iterator.
|
||||
this->InitOutPredictions(dmat->Info(), out_preds, model);
|
||||
}
|
||||
|
||||
uint32_t const output_groups = model.learner_model_param->num_output_group;
|
||||
CHECK_NE(output_groups, 0);
|
||||
|
||||
uint32_t real_ntree_limit = ntree_limit * output_groups;
|
||||
if (real_ntree_limit == 0 || real_ntree_limit > model.trees.size()) {
|
||||
real_ntree_limit = static_cast<uint32_t>(model.trees.size());
|
||||
if (tree_end - tree_begin == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t const end_version = (tree_begin + real_ntree_limit) / output_groups;
|
||||
|
||||
if (predts->version > end_version) {
|
||||
CHECK_NE(ntree_limit, 0);
|
||||
this->InitOutPredictions(dmat->Info(), out_preds, model);
|
||||
predts->version = 0;
|
||||
}
|
||||
uint32_t const beg_version = predts->version;
|
||||
CHECK_LE(beg_version, end_version);
|
||||
|
||||
if (beg_version < end_version) {
|
||||
this->DevicePredictInternal(dmat, out_preds, model,
|
||||
beg_version * output_groups,
|
||||
end_version * output_groups);
|
||||
}
|
||||
|
||||
uint32_t delta = end_version - beg_version;
|
||||
CHECK_LE(delta, model.trees.size());
|
||||
predts->Update(delta);
|
||||
|
||||
CHECK(out_preds->Size() == output_groups * dmat->Info().num_row_ ||
|
||||
out_preds->Size() == dmat->Info().num_row_);
|
||||
this->DevicePredictInternal(dmat, out_preds, model, tree_begin, tree_end);
|
||||
}
|
||||
|
||||
template <typename Adapter, typename Loader>
|
||||
@@ -648,15 +622,12 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
const gbm::GBTreeModel &model, float,
|
||||
PredictionCacheEntry *out_preds,
|
||||
uint32_t tree_begin, uint32_t tree_end) const {
|
||||
auto max_shared_memory_bytes = dh::MaxSharedMemory(this->generic_param_->gpu_id);
|
||||
uint32_t const output_groups = model.learner_model_param->num_output_group;
|
||||
DeviceModel d_model;
|
||||
d_model.Init(model, tree_begin, tree_end, this->generic_param_->gpu_id);
|
||||
|
||||
auto m = dmlc::get<std::shared_ptr<Adapter>>(x);
|
||||
CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature)
|
||||
<< "Number of columns in data must equal to trained model.";
|
||||
CHECK_EQ(this->generic_param_->gpu_id, m->DeviceIdx())
|
||||
CHECK_EQ(dh::CurrentDevice(), m->DeviceIdx())
|
||||
<< "XGBoost is running on device: " << this->generic_param_->gpu_id << ", "
|
||||
<< "but data is on: " << m->DeviceIdx();
|
||||
if (p_m) {
|
||||
@@ -667,12 +638,17 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
info.num_row_ = m->NumRows();
|
||||
this->InitOutPredictions(info, &(out_preds->predictions), model);
|
||||
}
|
||||
out_preds->predictions.SetDevice(m->DeviceIdx());
|
||||
|
||||
const uint32_t BLOCK_THREADS = 128;
|
||||
auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(m->NumRows(), BLOCK_THREADS));
|
||||
|
||||
auto max_shared_memory_bytes = dh::MaxSharedMemory(m->DeviceIdx());
|
||||
size_t shared_memory_bytes =
|
||||
SharedMemoryBytes<BLOCK_THREADS>(m->NumColumns(), max_shared_memory_bytes);
|
||||
DeviceModel d_model;
|
||||
d_model.Init(model, tree_begin, tree_end, m->DeviceIdx());
|
||||
|
||||
bool use_shared = shared_memory_bytes != 0;
|
||||
size_t entry_start = 0;
|
||||
|
||||
@@ -707,20 +683,17 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
|
||||
void PredictContribution(DMatrix* p_fmat,
|
||||
HostDeviceVector<bst_float>* out_contribs,
|
||||
const gbm::GBTreeModel& model, unsigned ntree_limit,
|
||||
const gbm::GBTreeModel& model, unsigned tree_end,
|
||||
std::vector<bst_float>*,
|
||||
bool approximate, int,
|
||||
unsigned) const override {
|
||||
if (approximate) {
|
||||
LOG(FATAL) << "Approximated contribution is not implemented in GPU Predictor.";
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(generic_param_->gpu_id));
|
||||
out_contribs->SetDevice(generic_param_->gpu_id);
|
||||
uint32_t real_ntree_limit =
|
||||
ntree_limit * model.learner_model_param->num_output_group;
|
||||
if (real_ntree_limit == 0 || real_ntree_limit > model.trees.size()) {
|
||||
real_ntree_limit = static_cast<uint32_t>(model.trees.size());
|
||||
if (tree_end == 0 || tree_end > model.trees.size()) {
|
||||
tree_end = static_cast<uint32_t>(model.trees.size());
|
||||
}
|
||||
|
||||
const int ngroup = model.learner_model_param->num_output_group;
|
||||
@@ -734,8 +707,7 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
auto phis = out_contribs->DeviceSpan();
|
||||
|
||||
dh::device_vector<gpu_treeshap::PathElement> device_paths;
|
||||
ExtractPaths(&device_paths, model, real_ntree_limit,
|
||||
generic_param_->gpu_id);
|
||||
ExtractPaths(&device_paths, model, tree_end, generic_param_->gpu_id);
|
||||
for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
|
||||
batch.data.SetDevice(generic_param_->gpu_id);
|
||||
batch.offset.SetDevice(generic_param_->gpu_id);
|
||||
@@ -761,20 +733,17 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
void PredictInteractionContributions(DMatrix* p_fmat,
|
||||
HostDeviceVector<bst_float>* out_contribs,
|
||||
const gbm::GBTreeModel& model,
|
||||
unsigned ntree_limit,
|
||||
unsigned tree_end,
|
||||
std::vector<bst_float>*,
|
||||
bool approximate) const override {
|
||||
if (approximate) {
|
||||
LOG(FATAL) << "[Internal error]: " << __func__
|
||||
<< " approximate is not implemented in GPU Predictor.";
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(generic_param_->gpu_id));
|
||||
out_contribs->SetDevice(generic_param_->gpu_id);
|
||||
uint32_t real_ntree_limit =
|
||||
ntree_limit * model.learner_model_param->num_output_group;
|
||||
if (real_ntree_limit == 0 || real_ntree_limit > model.trees.size()) {
|
||||
real_ntree_limit = static_cast<uint32_t>(model.trees.size());
|
||||
if (tree_end == 0 || tree_end > model.trees.size()) {
|
||||
tree_end = static_cast<uint32_t>(model.trees.size());
|
||||
}
|
||||
|
||||
const int ngroup = model.learner_model_param->num_output_group;
|
||||
@@ -789,8 +758,7 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
auto phis = out_contribs->DeviceSpan();
|
||||
|
||||
dh::device_vector<gpu_treeshap::PathElement> device_paths;
|
||||
ExtractPaths(&device_paths, model, real_ntree_limit,
|
||||
generic_param_->gpu_id);
|
||||
ExtractPaths(&device_paths, model, tree_end, generic_param_->gpu_id);
|
||||
for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
|
||||
batch.data.SetDevice(generic_param_->gpu_id);
|
||||
batch.offset.SetDevice(generic_param_->gpu_id);
|
||||
@@ -841,29 +809,28 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
<< " is not implemented in GPU Predictor.";
|
||||
}
|
||||
|
||||
void PredictLeaf(DMatrix* p_fmat, HostDeviceVector<bst_float>* predictions,
|
||||
const gbm::GBTreeModel& model,
|
||||
unsigned ntree_limit) const override {
|
||||
void PredictLeaf(DMatrix *p_fmat, HostDeviceVector<bst_float> *predictions,
|
||||
const gbm::GBTreeModel &model,
|
||||
unsigned tree_end) const override {
|
||||
dh::safe_cuda(cudaSetDevice(generic_param_->gpu_id));
|
||||
auto max_shared_memory_bytes = ConfigureDevice(generic_param_->gpu_id);
|
||||
|
||||
const MetaInfo& info = p_fmat->Info();
|
||||
constexpr uint32_t kBlockThreads = 128;
|
||||
size_t shared_memory_bytes =
|
||||
SharedMemoryBytes<kBlockThreads>(info.num_col_, max_shared_memory_bytes);
|
||||
size_t shared_memory_bytes = SharedMemoryBytes<kBlockThreads>(
|
||||
info.num_col_, max_shared_memory_bytes);
|
||||
bool use_shared = shared_memory_bytes != 0;
|
||||
bst_feature_t num_features = info.num_col_;
|
||||
bst_row_t num_rows = info.num_row_;
|
||||
size_t entry_start = 0;
|
||||
|
||||
uint32_t real_ntree_limit = ntree_limit * model.learner_model_param->num_output_group;
|
||||
if (real_ntree_limit == 0 || real_ntree_limit > model.trees.size()) {
|
||||
real_ntree_limit = static_cast<uint32_t>(model.trees.size());
|
||||
if (tree_end == 0 || tree_end > model.trees.size()) {
|
||||
tree_end = static_cast<uint32_t>(model.trees.size());
|
||||
}
|
||||
predictions->SetDevice(generic_param_->gpu_id);
|
||||
predictions->Resize(num_rows * real_ntree_limit);
|
||||
predictions->Resize(num_rows * tree_end);
|
||||
DeviceModel d_model;
|
||||
d_model.Init(model, 0, real_ntree_limit, this->generic_param_->gpu_id);
|
||||
d_model.Init(model, 0, tree_end, this->generic_param_->gpu_id);
|
||||
|
||||
if (p_fmat->PageExists<SparsePage>()) {
|
||||
for (auto const& batch : p_fmat->GetBatches<SparsePage>()) {
|
||||
|
||||
Reference in New Issue
Block a user