Revert "Fix #3485, #3540: Don't use dropout for predicting test sets" (#3563)

* Revert "Fix #3485, #3540: Don't use dropout for predicting test sets (#3556)"

This reverts commit 44811f233071c5805d70c287abd22b155b732727.

* Document behavior of predict() for DART booster

* Add notice to parameter.rst
This commit is contained in:
Philip Hyunsu Cho 2018-08-08 09:48:55 -07:00 committed by GitHub
parent e3e776bd58
commit 3c72654e3b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 61 additions and 30 deletions

View File

@ -12,6 +12,10 @@ Before running XGBoost, we must set three types of parameters: general parameter
In R-package, you can use ``.`` (dot) to replace underscore in the parameters, for example, you can use ``max.depth`` to indicate ``max_depth``. The underscore parameters are also valid in R.
.. contents::
:backlinks: none
:local:
******************
General Parameters
******************
@ -172,6 +176,18 @@ Parameters for Tree Booster
Additional parameters for Dart Booster (``booster=dart``)
=========================================================
.. note:: Using ``predict()`` with DART booster
If the booster object is DART type, ``predict()`` will perform dropouts, i.e. only
some of the trees will be evaluated. This will produce incorrect results if ``data`` is
not the training data. To obtain correct results on test sets, set ``ntree_limit`` to
a nonzero value, e.g.
.. code-block:: python
preds = bst.predict(dtest, ntree_limit=num_round)
* ``sample_type`` [default= ``uniform``]
- Type of sampling algorithm.
@ -212,7 +228,7 @@ Additional parameters for Dart Booster (``booster=dart``)
- range: [0.0, 1.0]
Parameters for Linear Booster (``booster=gblinear``)
==================================================
====================================================
* ``lambda`` [default=0, alias: ``reg_lambda``]
- L2 regularization term on weights. Increasing this value will make model more conservative. Normalised to number of training examples.

View File

@ -111,3 +111,9 @@ Sample Script
# make prediction
# ntree_limit must not be 0
preds = bst.predict(dtest, ntree_limit=num_round)
.. note:: Specify ``ntree_limit`` when predicting with test sets
By default, ``bst.predict()`` will perform dropouts on trees. To obtain
correct results on test sets, disable dropouts by specifying
a nonzero value for ``ntree_limit``.

View File

@ -76,14 +76,11 @@ class GradientBooster {
* \brief generate predictions for given feature matrix
* \param dmat feature matrix
* \param out_preds output vector to hold the predictions
* \param dropout whether dropout should be applied to prediction
* This option is only meaningful if booster='dart'; otherwise ignored.
* \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means
* we do not limit number of trees, this parameter is only valid for gbtree, but not for gblinear
*/
virtual void PredictBatch(DMatrix* dmat,
HostDeviceVector<bst_float>* out_preds,
bool dropout = true,
unsigned ntree_limit = 0) = 0;
/*!
* \brief online prediction function, predict score for one instance at a time

View File

@ -1120,10 +1120,22 @@ class Booster(object):
"""
Predict with data.
NOTE: This function is not thread safe.
For each booster object, predict can only be called from one thread.
If you want to run prediction using multiple threads, call bst.copy() to make copies
of the model object and then call predict
.. note:: This function is not thread safe.
For each booster object, predict can only be called from one thread.
If you want to run prediction using multiple threads, call ``bst.copy()`` to make copies
of the model object and then call ``predict()``.
.. note:: Using ``predict()`` with DART booster
If the booster object is DART type, ``predict()`` will perform dropouts, i.e. only
some of the trees will be evaluated. This will produce incorrect results if ``data`` is
not the training data. To obtain correct results on test sets, set ``ntree_limit`` to
a nonzero value, e.g.
.. code-block:: python
preds = bst.predict(dtest, ntree_limit=num_round)
Parameters
----------

View File

@ -563,10 +563,24 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
def predict(self, data, output_margin=False, ntree_limit=None):
"""
Predict with `data`.
NOTE: This function is not thread safe.
For each booster object, predict can only be called from one thread.
If you want to run prediction using multiple threads, call xgb.copy() to make copies
of the model object and then call predict
.. note:: This function is not thread safe.
For each booster object, predict can only be called from one thread.
If you want to run prediction using multiple threads, call ``xgb.copy()`` to make copies
of the model object and then call ``predict()``.
.. note:: Using ``predict()`` with DART booster
If the booster object is DART type, ``predict()`` will perform dropouts, i.e. only
some of the trees will be evaluated. This will produce incorrect results if ``data`` is
not the training data. To obtain correct results on test sets, set ``ntree_limit`` to
a nonzero value, e.g.
.. code-block:: python
preds = bst.predict(dtest, ntree_limit=num_round)
Parameters
----------
data : DMatrix

View File

@ -103,7 +103,6 @@ class GBLinear : public GradientBooster {
void PredictBatch(DMatrix *p_fmat,
HostDeviceVector<bst_float> *out_preds,
bool dropout,
unsigned ntree_limit) override {
monitor_.Start("PredictBatch");
CHECK_EQ(ntree_limit, 0U)

View File

@ -217,7 +217,6 @@ class GBTree : public GradientBooster {
void PredictBatch(DMatrix* p_fmat,
HostDeviceVector<bst_float>* out_preds,
bool dropout,
unsigned ntree_limit) override {
predictor_->PredictBatch(p_fmat, out_preds, model_, 0, ntree_limit);
}
@ -357,11 +356,8 @@ class Dart : public GBTree {
// predict the leaf scores with dropout if ntree_limit = 0
void PredictBatch(DMatrix* p_fmat,
HostDeviceVector<bst_float>* out_preds,
bool dropout,
unsigned ntree_limit) override {
if (dropout) {
DropTrees(ntree_limit);
}
DropTrees(ntree_limit);
PredLoopInternal<Dart>(p_fmat, &out_preds->HostVector(), 0, ntree_limit, true);
}

View File

@ -469,7 +469,7 @@ class LearnerImpl : public Learner {
} else if (pred_leaf) {
gbm_->PredictLeaf(data, &out_preds->HostVector(), ntree_limit);
} else {
this->PredictRaw(data, out_preds, false, ntree_limit);
this->PredictRaw(data, out_preds, ntree_limit);
if (!output_margin) {
obj_->PredTransform(out_preds);
}
@ -560,16 +560,14 @@ class LearnerImpl : public Learner {
* \brief get un-transformed prediction
* \param data training data matrix
* \param out_preds output vector that stores the prediction
* \param dropout whether dropout should be applied to prediction.
* This option is only meaningful if booster='dart'; otherwise ignored.
* \param ntree_limit limit number of trees used for boosted tree
* predictor, when it equals 0, this means we are using all the trees
*/
inline void PredictRaw(DMatrix* data, HostDeviceVector<bst_float>* out_preds,
bool dropout = true, unsigned ntree_limit = 0) const {
unsigned ntree_limit = 0) const {
CHECK(gbm_ != nullptr)
<< "Predict must happen after Load or InitModel";
gbm_->PredictBatch(data, out_preds, dropout, ntree_limit);
gbm_->PredictBatch(data, out_preds, ntree_limit);
}
// model parameter

View File

@ -48,13 +48,6 @@ class TestModels(unittest.TestCase):
preds2 = bst2.predict(dtest2, ntree_limit=num_round)
# assert they are the same
assert np.sum(np.abs(preds2 - preds)) == 0
# regression test for issues #3485, #3540
for _ in range(10):
bst3 = xgb.Booster(params=param, model_file='xgb.model.dart')
dtest3 = xgb.DMatrix('dtest.buffer')
preds3 = bst3.predict(dtest3)
# assert they are the same
assert np.sum(np.abs(preds3 - preds)) == 0, 'preds3 = {}, preds = {}'.format(preds3, preds)
# check whether sample_type and normalize_type work
num_round = 50