From bdfa8c0e09f046fe65389433a65426a0c3658ac1 Mon Sep 17 00:00:00 2001
From: Vadim Khotilovich
Date: Sat, 20 Aug 2016 05:09:03 -0500
Subject: [PATCH] [R-package] a few fixes for R (#1485)

* [R] fix #1465

* [R] add sanity check to fix #1434

* [R] some clean-ups for custom obj&eval; require maximize only for early stopping
---
 R-package/R/callbacks.R                   |  4 +++
 R-package/R/utils.R                       | 34 +++++++++++------------
 R-package/R/xgb.model.dt.tree.R           |  7 ++++-
 R-package/R/xgb.train.R                   | 28 +++++++++++++++----
 R-package/man/cb.cv.predict.Rd            |  1 +
 R-package/man/xgb.train.Rd                | 28 +++++++++++++++----
 R-package/tests/testthat/test_callbacks.R |  9 ++++++
 R-package/tests/testthat/test_helpers.R   |  4 +++
 8 files changed, 87 insertions(+), 28 deletions(-)

diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R
index 4a65c8c34..4d7916d38 100644
--- a/R-package/R/callbacks.R
+++ b/R-package/R/callbacks.R
@@ -458,6 +458,7 @@ cb.save.model <- function(save_period = 0, save_name = "xgboost.model") {
 #' \code{basket},
 #' \code{data},
 #' \code{end_iteration},
+#' \code{params},
 #' \code{num_parallel_tree},
 #' \code{num_class}.
 #'
@@ -491,6 +492,9 @@ cb.cv.predict <- function(save_models = FALSE) {
     ntreelimit <- NVL(env$basket$best_ntreelimit,
                       env$end_iteration * env$num_parallel_tree)
+    if (NVL(env$params[['booster']], '') == 'gblinear') {
+      ntreelimit <- 0 # must be 0 for gblinear
+    }
     for (fd in env$bst_folds) {
       pr <- predict(fd$bst, fd$watchlist[[2]], ntreelimit = ntreelimit, reshape = TRUE)
       if (is.matrix(pred)) {
diff --git a/R-package/R/utils.R b/R-package/R/utils.R
index 68c0e3321..2a1b79578 100644
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@@ -17,7 +17,7 @@ NVL <- function(x, val) {
   }
   if (typeof(x) == 'closure')
     return(x)
-  stop('x of unsupported for NVL type')
+  stop("typeof(x) == ", typeof(x), " is not supported by NVL")
 }
 
@@ -42,15 +42,15 @@ check.booster.params <- function(params, ...) {
     stop("Same parameters in 'params' and in the call are not allowed. Please check your 'params' list.")
   params <- c(params, dot_params)
 
-  # providing a parameter multiple times only makes sense for 'eval_metric'
+  # providing a parameter multiple times makes sense only for 'eval_metric'
   name_freqs <- table(names(params))
   multi_names <- setdiff(names(name_freqs[name_freqs > 1]), 'eval_metric')
   if (length(multi_names) > 0) {
     warning("The following parameters were provided multiple times:\n\t",
             paste(multi_names, collapse=', '), "\n Only the last value for each of them will be used.\n")
-    # While xgboost itself would choose the last value for a multi-parameter,
-    # will do some clean-up here b/c multi-parameters could be used further in R code, and R would
-    # pick the 1st (not the last) value when multiple elements with the same name are present in a list.
+    # While xgboost internals would choose the last value for a multiple-times parameter,
+    # enforce it here in R as well (b/c multi-parameters might be used further in R code,
+    # and R takes the 1st value when multiple elements with the same name are present in a list).
     for (n in multi_names) {
       del_idx <- which(n == names(params))
       del_idx <- del_idx[-length(del_idx)]
@@ -60,9 +60,9 @@ check.booster.params <- function(params, ...) {
 
   # for multiclass, expect num_class to be set
   if (typeof(params[['objective']]) == "character" &&
-      substr(NVL(params[['objective']], 'x'), 1, 6) == 'multi:') {
-    if (as.numeric(NVL(params[['num_class']], 0)) < 2)
-      stop("'num_class' > 1 parameter must be set for multiclass classification")
+      substr(NVL(params[['objective']], 'x'), 1, 6) == 'multi:' &&
+      as.numeric(NVL(params[['num_class']], 0)) < 2) {
+    stop("'num_class' > 1 parameter must be set for multiclass classification")
   }
 
   return(params)
@@ -82,9 +82,7 @@ check.custom.obj <- function(env = parent.frame()) {
   if (!is.null(env$params[['objective']]) &&
       typeof(env$params$objective) == 'closure') {
     env$obj <- env$params$objective
-    p <- env$params
-    p$objective <- NULL
-    env$params <- p
+    env$params$objective <- NULL
   }
 }
 
@@ -97,17 +95,19 @@ check.custom.eval <- function(env = parent.frame()) {
   if (!is.null(env$feval) &&
       typeof(env$feval) != 'closure')
     stop("'feval' must be a function")
 
-  if (!is.null(env$feval) && is.null(env$maximize))
-    stop("Please set 'maximize' to indicate whether the metric needs to be maximized or not")
-
   # handle a situation when custom eval function was provided through params
   if (!is.null(env$params[['eval_metric']]) &&
       typeof(env$params$eval_metric) == 'closure') {
     env$feval <- env$params$eval_metric
-    p <- env$params
-    p[ which(names(p) == 'eval_metric') ] <- NULL
-    env$params <- p
+    env$params$eval_metric <- NULL
   }
+
+  # require maximize to be set when custom feval and early stopping are used together
+  if (!is.null(env$feval) &&
+      is.null(env$maximize) && (
+        !is.null(env$early_stopping_rounds) ||
+        has.callbacks(env$callbacks, 'cb.early.stop')))
+    stop("Please set 'maximize' to indicate whether the evaluation metric needs to be maximized or not")
 }
diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R
index 81fd0fe35..2af98ca87 100644
--- a/R-package/R/xgb.model.dt.tree.R
+++ b/R-package/R/xgb.model.dt.tree.R
@@ -65,10 +65,15 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
     stop("n_first_tree: Has to be a numeric vector of size 1.")
   }
 
-  if(is.null(text)){
+  if (is.null(text)){
     text <- xgb.dump(model = model, with_stats = T)
   }
 
+  if (length(text) < 2 ||
+      sum(stri_detect_regex(text, 'yes=(\\d+),no=(\\d+)')) < 1) {
+    stop("Non-tree model detected! This function can only be used with tree models.")
+  }
+
   position <- which(!is.na(stri_match_first_regex(text, "booster")))
 
   add.tree.id <- function(x, i) paste(i, x, sep = "-")
diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R
index 022e02246..08cb71b84 100644
--- a/R-package/R/xgb.train.R
+++ b/R-package/R/xgb.train.R
@@ -173,9 +173,10 @@
 #' watchlist <- list(eval = dtest, train = dtrain)
 #'
 #' ## A simple xgb.train example:
-#' param <- list(max_depth = 2, eta = 1, silent = 1,
+#' param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2,
 #'               objective = "binary:logistic", eval_metric = "auc")
-#' bst <- xgb.train(param, dtrain, nthread = 2, nrounds = 2, watchlist)
+#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
+#'
 #'
 #' ## An xgb.train example where custom objective and evaluation metric are used:
 #' logregobj <- function(preds, dtrain) {
@@ -190,16 +191,33 @@
 #'   err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
 #'   return(list(metric = "error", value = err))
 #' }
-#' bst <- xgb.train(param, dtrain, nthread = 2, nrounds = 2, watchlist)
+#'
+#' # These functions could be used by passing them either:
+#' # as 'objective' and 'eval_metric' parameters in the params list:
+#' param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2,
+#'               objective = logregobj, eval_metric = evalerror)
+#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
+#'
+#' # or through the ... arguments:
+#' param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2)
+#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
+#'                  objective = logregobj, eval_metric = evalerror)
+#'
+#' # or as dedicated 'obj' and 'feval' parameters of xgb.train:
+#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
+#'                  obj = logregobj, feval = evalerror)
+#'
 #'
 #' ## An xgb.train example of using variable learning rates at each iteration:
+#' param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2)
 #' my_etas <- list(eta = c(0.5, 0.1))
-#' bst <- xgb.train(param, dtrain, nthread = 2, nrounds = 2, watchlist,
+#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
 #'                  callbacks = list(cb.reset.parameters(my_etas)))
 #'
+#'
 #' ## Explicit use of the cb.evaluation.log callback allows to run
 #' ## xgb.train silently but still store the evaluation results:
-#' bst <- xgb.train(param, dtrain, nthread = 2, nrounds = 2, watchlist,
+#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
 #'                  verbose = 0, callbacks = list(cb.evaluation.log()))
 #' print(bst$evaluation_log)
 #'
diff --git a/R-package/man/cb.cv.predict.Rd b/R-package/man/cb.cv.predict.Rd
index 34e9f813e..24f35c076 100644
--- a/R-package/man/cb.cv.predict.Rd
+++ b/R-package/man/cb.cv.predict.Rd
@@ -34,6 +34,7 @@ Callback function expects the following values to be set in its calling frame:
 \code{basket},
 \code{data},
 \code{end_iteration},
+\code{params},
 \code{num_parallel_tree},
 \code{num_class}.
 }
diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd
index a06e21b7b..a464c04e7 100644
--- a/R-package/man/xgb.train.Rd
+++ b/R-package/man/xgb.train.Rd
@@ -200,9 +200,10 @@ dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
 watchlist <- list(eval = dtest, train = dtrain)
 
 ## A simple xgb.train example:
-param <- list(max_depth = 2, eta = 1, silent = 1,
+param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2,
               objective = "binary:logistic", eval_metric = "auc")
-bst <- xgb.train(param, dtrain, nthread = 2, nrounds = 2, watchlist)
+bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
+
 
 ## An xgb.train example where custom objective and evaluation metric are used:
 logregobj <- function(preds, dtrain) {
@@ -217,16 +218,33 @@ evalerror <- function(preds, dtrain) {
   err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
   return(list(metric = "error", value = err))
 }
-bst <- xgb.train(param, dtrain, nthread = 2, nrounds = 2, watchlist)
+
+# These functions could be used by passing them either:
+# as 'objective' and 'eval_metric' parameters in the params list:
+param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2,
+              objective = logregobj, eval_metric = evalerror)
+bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
+
+# or through the ... arguments:
+param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2)
+bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
+                 objective = logregobj, eval_metric = evalerror)
+
+# or as dedicated 'obj' and 'feval' parameters of xgb.train:
+bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
+                 obj = logregobj, feval = evalerror)
+
 
 ## An xgb.train example of using variable learning rates at each iteration:
+param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2)
 my_etas <- list(eta = c(0.5, 0.1))
-bst <- xgb.train(param, dtrain, nthread = 2, nrounds = 2, watchlist,
+bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
                  callbacks = list(cb.reset.parameters(my_etas)))
 
+
 ## Explicit use of the cb.evaluation.log callback allows to run
 ## xgb.train silently but still store the evaluation results:
-bst <- xgb.train(param, dtrain, nthread = 2, nrounds = 2, watchlist,
+bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
                  verbose = 0, callbacks = list(cb.evaluation.log()))
 print(bst$evaluation_log)
diff --git a/R-package/tests/testthat/test_callbacks.R b/R-package/tests/testthat/test_callbacks.R
index fd0f5d56b..26253bf62 100644
--- a/R-package/tests/testthat/test_callbacks.R
+++ b/R-package/tests/testthat/test_callbacks.R
@@ -260,6 +260,15 @@ test_that("prediction in xgb.cv works", {
   expect_true(all(sapply(cvx$models, class) == 'xgb.Booster'))
 })
 
+test_that("prediction in xgb.cv works for gblinear too", {
+  set.seed(11)
+  p <- list(booster = 'gblinear', objective = "reg:logistic", nthread = 2)
+  cv <- xgb.cv(p, dtrain, nfold = 5, eta = 0.5, nrounds = 2, prediction = TRUE)
+  expect_false(is.null(cv$evaluation_log))
+  expect_false(is.null(cv$pred))
+  expect_length(cv$pred, nrow(train$data))
+})
+
 test_that("prediction in early-stopping xgb.cv works", {
   set.seed(1)
   expect_output(
diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R
index 4e5da982b..9c63101fb 100644
--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@@ -81,6 +81,10 @@ test_that("xgb.model.dt.tree works with and without feature names", {
   expect_output(str(xgb.model.dt.tree(model = bst.Tree)), 'Feature.*\\"3\\"')
 })
 
+test_that("xgb.model.dt.tree throws error for gblinear", {
+  expect_error(xgb.model.dt.tree(model = bst.GLM))
+})
+
 test_that("xgb.importance works with and without feature names", {
   importance.Tree <- xgb.importance(feature_names = feature.names, model = bst.Tree)
   expect_equal(dim(importance.Tree), c(7, 4))
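
What the cb.cv.predict change fixes, in isolation: the ntreelimit the callback derives from best_ntreelimit or end_iteration * num_parallel_tree is a tree-based notion, so for a linear booster it is now pinned to 0, which means "use the whole model". A minimal sketch, assuming the agaricus data shipped with the package (the new "prediction in xgb.cv works for gblinear too" test exercises the same path end-to-end through xgb.cv):

    library(xgboost)
    data(agaricus.train, package = 'xgboost')
    dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)

    # gblinear has no trees, so a tree-count limit is meaningless for it;
    # predicting without ntreelimit is equivalent to ntreelimit = 0
    bst <- xgb.train(list(booster = 'gblinear', objective = 'reg:logistic'),
                     dtrain, nrounds = 2)
    head(predict(bst, dtrain))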
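
The rewritten comment in check.booster.params rests on a base-R behavior that is easy to verify: when a list carries a duplicated name, R returns the first match, whereas the xgboost core keeps the last value it sees. A self-contained illustration mirroring the de-duplication loop from the hunk above:

    # R picks the FIRST element for a duplicated name ...
    p <- list(eta = 0.5, eta = 0.1)
    p[['eta']]   # 0.5

    # ... so, as in check.booster.params, drop all but the last occurrence
    del_idx <- which(names(p) == 'eta')
    p[del_idx[-length(del_idx)]] <- NULL
    p[['eta']]   # 0.1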
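
The check.custom.eval change relaxes the old rule that any custom feval required 'maximize': the metric's direction only matters when early stopping has to compare rounds. A sketch of the resulting behavior, reusing the evalerror metric from the documentation examples and assuming the early_stopping_rounds argument of xgb.train that the new check refers to (it also fires when a cb.early.stop callback is passed explicitly):

    library(xgboost)
    data(agaricus.train, package = 'xgboost')
    data(agaricus.test, package = 'xgboost')
    dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
    dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
    watchlist <- list(eval = dtest, train = dtrain)
    param <- list(max_depth = 2, eta = 1, silent = 1, objective = "binary:logistic")
    evalerror <- function(preds, dtrain) {
      labels <- getinfo(dtrain, "label")
      err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
      list(metric = "error", value = err)
    }

    # a custom metric alone no longer needs 'maximize' ...
    bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, feval = evalerror)

    # ... but with early stopping it is still required, since the callback
    # must know in which direction the metric improves
    bst2 <- xgb.train(param, dtrain, nrounds = 20, watchlist, feval = evalerror,
                      early_stopping_rounds = 3, maximize = FALSE)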
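
The sanity check added to xgb.model.dt.tree keys off the dump format: tree dumps contain split records such as yes=1,no=2, while a gblinear dump carries only bias and weights, so the regex never matches and the function stops early instead of failing later while parsing. A rough illustration, with the same agaricus data as above:

    library(stringi)
    bst.tree <- xgb.train(list(objective = 'binary:logistic'), dtrain, nrounds = 1)
    bst.glm <- xgb.train(list(booster = 'gblinear', objective = 'binary:logistic'),
                         dtrain, nrounds = 1)

    # the detection regex from the patch, applied to both dump flavors
    sum(stri_detect_regex(xgb.dump(model = bst.tree), 'yes=(\\d+),no=(\\d+)'))  # > 0
    sum(stri_detect_regex(xgb.dump(model = bst.glm), 'yes=(\\d+),no=(\\d+)'))   # 0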