From a5f232feb8d5657d7495f440431b11de7bd60ba5 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 5 Sep 2019 19:09:38 -0400 Subject: [PATCH] Fix calling GPU predictor (#4836) * Fix calling GPU predictor --- src/data/simple_dmatrix.cc | 1 + src/gbm/gbtree.h | 18 ++++++++-- src/predictor/cpu_predictor.cc | 4 +-- tests/cpp/gbm/test_gbtree.cc | 66 ++++++++++++++++++++++++++++++++++ tests/cpp/test_learner.cc | 1 - 5 files changed, 85 insertions(+), 5 deletions(-) diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index 8fb6e2d97..9f75ab055 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -49,6 +49,7 @@ class SimpleBatchIteratorImpl : public BatchIteratorImpl { }; BatchSet SimpleDMatrix::GetRowBatches() { + // since csr is the default data structure so `source_` is always available. auto cast = dynamic_cast(source_.get()); auto begin_iter = BatchIterator( new SimpleBatchIteratorImpl(&(cast->page_))); diff --git a/src/gbm/gbtree.h b/src/gbm/gbtree.h index 63c5263f7..9d65c7681 100644 --- a/src/gbm/gbtree.h +++ b/src/gbm/gbtree.h @@ -191,7 +191,7 @@ class GBTree : public GradientBooster { HostDeviceVector* out_preds, unsigned ntree_limit) override { CHECK(configured_); - GetPredictor()->PredictBatch(p_fmat, out_preds, model_, 0, ntree_limit); + GetPredictor(out_preds, p_fmat)->PredictBatch(p_fmat, out_preds, model_, 0, ntree_limit); } void PredictInstance(const SparsePage::Inst& inst, @@ -242,8 +242,22 @@ class GBTree : public GradientBooster { int bst_group, std::vector >* ret); - std::unique_ptr const& GetPredictor() const { + std::unique_ptr const& GetPredictor(HostDeviceVector const* out_pred = nullptr, + DMatrix* f_dmat = nullptr) const { CHECK(configured_); + // GPU_Hist by default has prediction cache calculated from quantile values, so GPU + // Predictor is not used for training dataset. 
But when XGBoost performs continue + training with an existing model, the prediction cache is not available and the number + of trees doesn't equal zero, the whole training dataset gets copied into GPU for + precise prediction. This condition tries to avoid such a copy by calling the CPU + Predictor. + if ((out_pred && out_pred->Size() == 0) && + (model_.param.num_trees != 0) && + // FIXME(trivialfis): Implement a better method for testing whether data is on + // device after DMatrix refactoring is done. + (f_dmat && !((*(f_dmat->GetBatches().begin())).data.DeviceCanRead()))) { + return cpu_predictor_; + } if (tparam_.predictor == "cpu_predictor") { CHECK(cpu_predictor_); return cpu_predictor_; diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc index 1bb740361..28dd1d655 100644 --- a/src/predictor/cpu_predictor.cc +++ b/src/predictor/cpu_predictor.cc @@ -134,7 +134,7 @@ class CPUPredictor : public Predictor { } else { if (!base_margin.empty()) { std::ostringstream oss; - oss << "Warning: Ignoring the base margin, since it has incorrect length. " << "The base margin must be an array of length "; + oss << "Ignoring the base margin, since it has incorrect length. " << "The base margin must be an array of length "; if (model.param.num_output_group > 1) { oss << "[num_class] * [number of data points], i.e. 
" @@ -145,7 +145,7 @@ class CPUPredictor : public Predictor { } oss << "Instead, all data points will use " << "base_score = " << model.base_margin; - LOG(INFO) << oss.str(); + LOG(WARNING) << oss.str(); } std::fill(out_preds_h.begin(), out_preds_h.end(), model.base_margin); } diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index fd48e9c77..d39aa9a85 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -1,5 +1,8 @@ #include +#include #include + +#include "xgboost/learner.h" #include "../helpers.h" #include "../../../src/gbm/gbtree.h" @@ -43,4 +46,67 @@ TEST(GBTree, SelectTreeMethod) { ASSERT_EQ(tparam.predictor, "gpu_predictor"); #endif } + +#ifdef XGBOOST_USE_CUDA +TEST(GBTree, ChoosePredictor) { + size_t constexpr kNumRows = 17; + size_t constexpr kCols = 15; + auto pp_mat = CreateDMatrix(kNumRows, kCols, 0); + auto& p_mat = *pp_mat; + + std::vector labels (kNumRows); + for (size_t i = 0; i < kNumRows; ++i) { + labels[i] = i % 2; + } + p_mat->Info().SetInfo("label", labels.data(), DataType::kFloat32, kNumRows); + + std::vector> mat = {p_mat}; + std::string n_feat = std::to_string(kCols); + Args args {{"tree_method", "approx"}, {"num_feature", n_feat}}; + GenericParameter generic_param; + generic_param.InitAllowUnknown(Args{{"gpu_id", "0"}}); + + auto& data = (*(p_mat->GetBatches().begin())).data; + + auto learner = std::unique_ptr(Learner::Create(mat)); + learner->SetParams(Args{{"tree_method", "gpu_hist"}}); + for (size_t i = 0; i < 4; ++i) { + learner->UpdateOneIter(i, p_mat.get()); + } + ASSERT_TRUE(data.HostCanWrite()); + dmlc::TemporaryDirectory tempdir; + const std::string fname = tempdir.path + "/model_para.bst"; + + { + std::unique_ptr fo(dmlc::Stream::Create(fname.c_str(), "w")); + learner->Save(fo.get()); + } + + // a new learner + learner = std::unique_ptr(Learner::Create(mat)); + { + std::unique_ptr fi(dmlc::Stream::Create(fname.c_str(), "r")); + learner->Load(fi.get()); + } + 
learner->SetParams(Args{{"tree_method", "gpu_hist"}, {"gpu_id", "0"}}); + for (size_t i = 0; i < 4; ++i) { + learner->UpdateOneIter(i, p_mat.get()); + } + ASSERT_TRUE(data.HostCanWrite()); + + // pull data into device. + data = HostDeviceVector(data.HostVector(), 0); + data.DeviceSpan(); + ASSERT_FALSE(data.HostCanWrite()); + + // another new learner + learner = std::unique_ptr(Learner::Create(mat)); + learner->SetParams(Args{{"tree_method", "gpu_hist"}, {"gpu_id", "0"}}); + for (size_t i = 0; i < 4; ++i) { + learner->UpdateOneIter(i, p_mat.get()); + } + // data is not pulled back into host + ASSERT_FALSE(data.HostCanWrite()); +} +#endif } // namespace xgboost diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc index 318d09628..0d7f61e7e 100644 --- a/tests/cpp/test_learner.cc +++ b/tests/cpp/test_learner.cc @@ -241,7 +241,6 @@ TEST(Learner, GPUConfiguration) { delete pp_dmat; } - #endif // XGBOOST_USE_CUDA } // namespace xgboost