Fix R dart prediction. (#5204)

* Fix R dart prediction and add test.
2020-01-16 12:11:04 +08:00 · 2020-01-16 12:11:04 +08:00 · 5199b86126
commit 5199b86126
parent 808f61081b
5 changed files with 80 additions and 35 deletions
--- a/R-package/R/xgboost.R
+++ b/R-package/R/xgboost.R
@ -5,8 +5,8 @@
 #' @export
 xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL,
                    params = list(), nrounds,
-                    verbose = 1, print_every_n = 1L, 
-                    early_stopping_rounds = NULL, maximize = NULL, 
+                    verbose = 1, print_every_n = 1L,
+                    early_stopping_rounds = NULL, maximize = NULL,
                    save_period = NULL, save_name = "xgboost.model",
                    xgb_model = NULL, callbacks = list(), ...) {

@ -18,16 +18,16 @@ xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL,
                   early_stopping_rounds = early_stopping_rounds, maximize = maximize,
                   save_period = save_period, save_name = save_name,
                   xgb_model = xgb_model, callbacks = callbacks, ...)
-  return(bst)
+  return (bst)
 }

 #' Training part from Mushroom Data Set
-#' 
+#'
 #' This data set is originally from the Mushroom data set,
 #' UCI Machine Learning Repository.
-#' 
+#'
 #' This data set includes the following fields:
-#' 
+#'
 #' \itemize{
 #'  \item \code{label} the label for each record
 #'  \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
@ -35,16 +35,16 @@ xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL,
 #'
 #' @references
 #' https://archive.ics.uci.edu/ml/datasets/Mushroom
-#' 
-#' Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository 
-#' [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, 
+#'
+#' Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
+#' [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
 #' School of Information and Computer Science.
-#' 
+#'
 #' @docType data
 #' @keywords datasets
 #' @name agaricus.train
 #' @usage data(agaricus.train)
-#' @format A list containing a label vector, and a dgCMatrix object with 6513 
+#' @format A list containing a label vector, and a dgCMatrix object with 6513
 #' rows and 127 variables
 NULL

@ -52,9 +52,9 @@ NULL
 #'
 #' This data set is originally from the Mushroom data set,
 #' UCI Machine Learning Repository.
-#' 
+#'
 #' This data set includes the following fields:
-#' 
+#'
 #' \itemize{
 #'  \item \code{label} the label for each record
 #'  \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
@ -62,16 +62,16 @@ NULL
 #'
 #' @references
 #' https://archive.ics.uci.edu/ml/datasets/Mushroom
-#' 
-#' Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository 
-#' [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, 
+#'
+#' Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
+#' [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
 #' School of Information and Computer Science.
-#' 
+#'
 #' @docType data
 #' @keywords datasets
 #' @name agaricus.test
 #' @usage data(agaricus.test)
-#' @format A list containing a label vector, and a dgCMatrix object with 1611 
+#' @format A list containing a label vector, and a dgCMatrix object with 1611
 #' rows and 126 variables
 NULL

@ -107,7 +107,7 @@ NULL
 #' @importFrom graphics par
 #' @importFrom graphics title
 #' @importFrom grDevices rgb
-#' 
+#'
 #' @import methods
 #' @useDynLib xgboost, .registration = TRUE
 NULL
--- a/R-package/src/xgboost_R.cc
+++ b/R-package/src/xgboost_R.cc
@ -313,7 +313,7 @@ SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask,
                              R_ExternalPtrAddr(dmat),
                              asInteger(option_mask),
                              asInteger(ntree_limit),
-                              0,
+                              asInteger(training),
                              &olen, &res));
  ret = PROTECT(allocVector(REALSXP, olen));
  for (size_t i = 0; i < olen; ++i) {
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@ -27,7 +27,7 @@ test_that("train and predict binary classification", {

  pred <- predict(bst, test$data)
  expect_length(pred, 1611)
-  
+
  pred1 <- predict(bst, train$data, ntreelimit = 1)
  expect_length(pred1, 6513)
  err_pred1 <- sum((pred1 > 0.5) != train$label)/length(train$label)
@ -35,6 +35,54 @@ test_that("train and predict binary classification", {
  expect_lt(abs(err_pred1 - err_log), 10e-6)
 })

+test_that("dart prediction works", {  # DART booster: xgboost() and xgb.train() must predict identically
+  nrounds = 32  # boosting rounds for both training calls; also reused as an explicit ntreelimit
+  set.seed(1994)
+  # Synthetic regression data: 100 rows, three rnorm() feature columns, nonlinear target plus noise.
+  d <- cbind(
+    x1 = rnorm(100),
+    x2 = rnorm(100),
+    x3 = rnorm(100))
+  y <- d[,"x1"] + d[,"x2"]^2 +
+    ifelse(d[,"x3"] > .5, d[,"x3"]^2, 2^d[,"x3"]) +
+    rnorm(100)
+  # Train through the xgboost() wrapper; RNG is re-seeded first (presumably so DART dropout is reproducible -- confirm).
+  set.seed(1994)
+  booster_by_xgboost <- xgboost(data = d, label = y, max_depth = 2, booster = "dart",
+                                rate_drop = 0.5, one_drop = TRUE,
+                                eta = 1, nthread = 2, nrounds = nrounds, objective = "reg:squarederror")
+  pred_by_xgboost_0 <- predict(booster_by_xgboost, newdata = d, ntreelimit = 0)
+  pred_by_xgboost_1 <- predict(booster_by_xgboost, newdata = d, ntreelimit = nrounds)
+  expect_true(all(matrix(pred_by_xgboost_0, byrow=TRUE) == matrix(pred_by_xgboost_1, byrow=TRUE)))  # ntreelimit = 0 must match ntreelimit = nrounds exactly
+  # Predictions made with training = TRUE are expected to differ from the default ones (presumably dropout is applied -- confirm).
+  pred_by_xgboost_2 <- predict(booster_by_xgboost, newdata = d, training = TRUE)
+  expect_false(all(matrix(pred_by_xgboost_0, byrow=TRUE) == matrix(pred_by_xgboost_2, byrow=TRUE)))
+  # Repeat with xgb.train() on an xgb.DMatrix after re-seeding; nthread, tree_method and verbosity differ from the call above.
+  set.seed(1994)
+  dtrain <- xgb.DMatrix(data=d, info = list(label=y))
+  booster_by_train <- xgb.train( params = list(
+                                   booster = "dart",
+                                   max_depth = 2,
+                                   eta = 1,
+                                   rate_drop = 0.5,
+                                   one_drop = TRUE,
+                                   nthread = 1,
+                                   tree_method= "exact",
+                                   verbosity = 3,
+                                   objective = "reg:squarederror"
+                                 ),
+                                data = dtrain,
+                                nrounds = nrounds
+                                )
+  pred_by_train_0 <- predict(booster_by_train, newdata = dtrain, ntreelimit = 0)
+  pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, ntreelimit = nrounds)
+  pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE)
+  # Both training entry points must give element-wise identical predictions in all three prediction modes.
+  expect_true(all(matrix(pred_by_train_0, byrow=TRUE) == matrix(pred_by_xgboost_0, byrow=TRUE)))
+  expect_true(all(matrix(pred_by_train_1, byrow=TRUE) == matrix(pred_by_xgboost_1, byrow=TRUE)))
+  expect_true(all(matrix(pred_by_train_2, byrow=TRUE) == matrix(pred_by_xgboost_2, byrow=TRUE)))
+})
+
 test_that("train and predict softprob", {
  lb <- as.numeric(iris$Species) - 1
  set.seed(11)
@ -74,7 +122,7 @@ test_that("train and predict softmax", {
  expect_false(is.null(bst$evaluation_log))
  expect_lt(bst$evaluation_log[, min(train_merror)], 0.025)
  expect_equal(bst$niter * 3, xgb.ntree(bst))
-  
+
  pred <- predict(bst, as.matrix(iris[, -5]))
  expect_length(pred, nrow(iris))
  err <- sum(pred != lb)/length(lb)
@ -90,12 +138,12 @@ test_that("train and predict RF", {
                 num_parallel_tree = 20, subsample = 0.6, colsample_bytree = 0.1)
  expect_equal(bst$niter, 1)
  expect_equal(xgb.ntree(bst), 20)
-  
+
  pred <- predict(bst, train$data)
  pred_err <- sum((pred > 0.5) != lb)/length(lb)
  expect_lt(abs(bst$evaluation_log[1, train_error] - pred_err), 10e-6)
  #expect_lt(pred_err, 0.03)
-  
+
  pred <- predict(bst, train$data, ntreelimit = 20)
  pred_err_20 <- sum((pred > 0.5) != lb)/length(lb)
  expect_equal(pred_err_20, pred_err)
@ -211,7 +259,7 @@ test_that("train and predict with non-strict classes", {
  bst <- xgboost(data = train_dense, label = train$label, max_depth = 2,
                 eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
  pr0 <- predict(bst, train_dense)
-  
+
  # dense matrix-like input of non-matrix class
  class(train_dense) <- 'shmatrix'
  expect_true(is.matrix(train_dense))
@ -221,7 +269,7 @@ test_that("train and predict with non-strict classes", {
    , regexp = NA)
  expect_error(pr <- predict(bst, train_dense), regexp = NA)
  expect_equal(pr0, pr)
-  
+
  # dense matrix-like input of non-matrix class with some inheritance
  class(train_dense) <- c('pphmatrix','shmatrix')
  expect_true(is.matrix(train_dense))
@ -231,7 +279,7 @@ test_that("train and predict with non-strict classes", {
    , regexp = NA)
  expect_error(pr <- predict(bst, train_dense), regexp = NA)
  expect_equal(pr0, pr)
-  
+
  # when someone inhertis from xgb.Booster, it should still be possible to use it as xgb.Booster
  class(bst) <- c('super.Booster', 'xgb.Booster')
  expect_error(pr <- predict(bst, train_dense), regexp = NA)
--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@ -157,7 +157,7 @@ test_that("SHAPs sum to predictions, with or without DART", {
      params = c(
        list(
          booster = booster,
-          objective = "reg:linear",
+          objective = "reg:squarederror",
          eval_metric = "rmse"),
        if (booster == "dart")
          list(rate_drop = .01, one_drop = T)),
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@ -435,9 +435,9 @@ class Dart : public GBTree {
      std::fill(out_preds.begin(), out_preds.end(),
                model_.learner_model_param_->base_score);
    }
-
-    PredLoopSpecalize(p_fmat, &out_preds, num_group, 0,
-                      ntree_limit, training);
+    const int nthread = omp_get_max_threads();
+    InitThreadTemp(nthread);
+    PredLoopSpecalize(p_fmat, &out_preds, num_group, 0, ntree_limit);
  }

  void PredictInstance(const SparsePage::Inst &inst,
@ -489,11 +489,8 @@ class Dart : public GBTree {
      std::vector<bst_float>* out_preds,
      int num_group,
      unsigned tree_begin,
-      unsigned tree_end,
-      bool training) {
-    const int nthread = omp_get_max_threads();
+      unsigned tree_end) {
    CHECK_EQ(num_group, model_.learner_model_param_->num_output_group);
-    InitThreadTemp(nthread);
    std::vector<bst_float>& preds = *out_preds;
    CHECK_EQ(model_.param.size_leaf_vector, 0)
        << "size_leaf_vector is enforced to 0 so far";