R maintenance Feb2017 (#2045)

* [R] better argument check in xgb.DMatrix; fixes #1480 * [R] showsd was a dummy; fixes #2044 * [R] better categorical encoding explanation in vignette; fixes #1989 * [R] new roxygen version docs update
2017-02-20 12:02:40 -06:00
parent 63aec12a13
commit b4d97d3cb8
42 changed files with 19 additions and 48 deletions
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -37,4 +37,4 @@ Imports:
    data.table (>= 1.9.6),
    magrittr (>= 1.5),
    stringi (>= 0.5.2)
-RoxygenNote: 5.0.1
+RoxygenNote: 6.0.1
--- a/R-package/R/callbacks.R
+++ b/R-package/R/callbacks.R
@@ -41,6 +41,7 @@ NULL
 #' Callback closure for printing the result of evaluation
 #' 
 #' @param period  results would be printed every number of periods
+#' @param showsd  whether standard deviations should be printed (when available)
 #' 
 #' @details
 #' The callback function prints the result of evaluation at every \code{period} iterations.
@@ -56,7 +57,7 @@ NULL
 #' \code{\link{callbacks}}
 #' 
 #' @export
-cb.print.evaluation <- function(period=1) {
+cb.print.evaluation <- function(period=1, showsd=TRUE) {
  
  callback <- function(env = parent.frame()) {
    if (length(env$bst_evaluation) == 0 ||
@@ -68,7 +69,8 @@ cb.print.evaluation <- function(period=1) {
    if ((i-1) %% period == 0 ||
        i == env$begin_iteration ||
        i == env$end_iteration) {
-      msg <- format.eval.string(i, env$bst_evaluation, env$bst_evaluation_err)
+      stdev <- if (showsd) env$bst_evaluation_err else NULL
+      msg <- format.eval.string(i, env$bst_evaluation, stdev)
      cat(msg, '\n')
    }
  }
--- a/R-package/R/xgb.DMatrix.R
+++ b/R-package/R/xgb.DMatrix.R
@@ -20,6 +20,9 @@
 xgb.DMatrix <- function(data, info = list(), missing = NA, ...) {
  cnames <- NULL
  if (typeof(data) == "character") {
+    if (length(data) > 1)
+      stop("'data' has class 'character' and length ", length(data),
+           ".\n  'data' accepts either a numeric matrix or a single filename.")
    handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE),
                    PACKAGE = "xgboost")
  } else if (is.matrix(data)) {
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -153,7 +153,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
  params <- c(params, list(silent = 1))
  print_every_n <- max( as.integer(print_every_n), 1L)
  if (!has.callbacks(callbacks, 'cb.print.evaluation') && verbose) {
-    callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n))
+    callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n, showsd=showsd))
  }
  # evaluation log callback: always is on in CV
  evaluation_log <- list()
--- a/R-package/man/agaricus.test.Rd
+++ b/R-package/man/agaricus.test.Rd
@@ -29,4 +29,3 @@ Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
 School of Information and Computer Science.
 }
 \keyword{datasets}
-
--- a/R-package/man/agaricus.train.Rd
+++ b/R-package/man/agaricus.train.Rd
@@ -29,4 +29,3 @@ Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
 School of Information and Computer Science.
 }
 \keyword{datasets}
-
--- a/R-package/man/callbacks.Rd
+++ b/R-package/man/callbacks.Rd
@@ -35,4 +35,3 @@ with the objects available inside of the \code{xgb.train} and \code{xgb.cv} inte
 \code{\link{xgb.train}},
 \code{\link{xgb.cv}}
 }
-
--- a/R-package/man/cb.cv.predict.Rd
+++ b/R-package/man/cb.cv.predict.Rd
@@ -41,4 +41,3 @@ Callback function expects the following values to be set in its calling frame:
 \seealso{
 \code{\link{callbacks}}
 }
-
--- a/R-package/man/cb.early.stop.Rd
+++ b/R-package/man/cb.early.stop.Rd
@@ -60,4 +60,3 @@ Callback function expects the following values to be set in its calling frame:
 \code{\link{callbacks}},
 \code{\link{xgb.attr}}
 }
-
--- a/R-package/man/cb.evaluation.log.Rd
+++ b/R-package/man/cb.evaluation.log.Rd
@@ -29,4 +29,3 @@ Callback function expects the following values to be set in its calling frame:
 \seealso{
 \code{\link{callbacks}}
 }
-
--- a/R-package/man/cb.print.evaluation.Rd
+++ b/R-package/man/cb.print.evaluation.Rd
@@ -4,10 +4,12 @@
 \alias{cb.print.evaluation}
 \title{Callback closure for printing the result of evaluation}
 \usage{
-cb.print.evaluation(period = 1)
+cb.print.evaluation(period = 1, showsd = TRUE)
 }
 \arguments{
 \item{period}{results would be printed every number of periods}
+
+\item{showsd}{whether standard deviations should be printed (when available)}
 }
 \description{
 Callback closure for printing the result of evaluation
@@ -25,4 +27,3 @@ Callback function expects the following values to be set in its calling frame:
 \seealso{
 \code{\link{callbacks}}
 }
-
--- a/R-package/man/cb.reset.parameters.Rd
+++ b/R-package/man/cb.reset.parameters.Rd
@@ -34,4 +34,3 @@ Callback function expects the following values to be set in its calling frame:
 \seealso{
 \code{\link{callbacks}}
 }
-
--- a/R-package/man/cb.save.model.Rd
+++ b/R-package/man/cb.save.model.Rd
@@ -31,4 +31,3 @@ Callback function expects the following values to be set in its calling frame:
 \seealso{
 \code{\link{callbacks}}
 }
-
--- a/R-package/man/dim.xgb.DMatrix.Rd
+++ b/R-package/man/dim.xgb.DMatrix.Rd
@@ -26,4 +26,3 @@ stopifnot(ncol(dtrain) == ncol(train$data))
 stopifnot(all(dim(dtrain) == dim(train$data)))

 }
-
--- a/R-package/man/dimnames.xgb.DMatrix.Rd
+++ b/R-package/man/dimnames.xgb.DMatrix.Rd
@@ -33,4 +33,3 @@ colnames(dtrain) <- make.names(1:ncol(train$data))
 print(dtrain, verbose=TRUE)

 }
-
--- a/R-package/man/getinfo.Rd
+++ b/R-package/man/getinfo.Rd
@@ -40,4 +40,3 @@ setinfo(dtrain, 'label', 1-labels)
 labels2 <- getinfo(dtrain, 'label')
 stopifnot(all(labels2 == 1-labels))
 }
-
--- a/R-package/man/predict.xgb.Booster.Rd
+++ b/R-package/man/predict.xgb.Booster.Rd
@@ -126,4 +126,3 @@ plot(err, type='l', ylim=c(0,0.1), xlab='#trees')
 \seealso{
 \code{\link{xgb.train}}.
 }
-
--- a/R-package/man/print.xgb.Booster.Rd
+++ b/R-package/man/print.xgb.Booster.Rd
@@ -27,4 +27,3 @@ print(bst)
 print(bst, verbose=TRUE)

 }
-
--- a/R-package/man/print.xgb.DMatrix.Rd
+++ b/R-package/man/print.xgb.DMatrix.Rd
@@ -26,4 +26,3 @@ dtrain
 print(dtrain, verbose=TRUE)

 }
-
--- a/R-package/man/print.xgb.cv.Rd
+++ b/R-package/man/print.xgb.cv.Rd
@@ -29,4 +29,3 @@ print(cv)
 print(cv, verbose=TRUE)

 }
-
--- a/R-package/man/setinfo.Rd
+++ b/R-package/man/setinfo.Rd
@@ -41,4 +41,3 @@ setinfo(dtrain, 'label', 1-labels)
 labels2 <- getinfo(dtrain, 'label')
 stopifnot(all.equal(labels2, 1-labels))
 }
-
--- a/R-package/man/slice.xgb.DMatrix.Rd
+++ b/R-package/man/slice.xgb.DMatrix.Rd
@@ -1,9 +1,9 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/xgb.DMatrix.R
 \name{slice}
-\alias{[.xgb.DMatrix}
 \alias{slice}
 \alias{slice.xgb.DMatrix}
+\alias{[.xgb.DMatrix}
 \title{Get a new DMatrix containing the specified rows of
 orginal xgb.DMatrix object}
 \usage{
@@ -38,4 +38,3 @@ labels2 <- getinfo(dsub, 'label')
 all.equal(labels1, labels2)

 }
-
--- a/R-package/man/xgb.Booster.complete.Rd
+++ b/R-package/man/xgb.Booster.complete.Rd
@@ -46,4 +46,3 @@ bst1 <- xgb.Booster.complete(bst1)
 print(bst1$handle)

 }
-
--- a/R-package/man/xgb.DMatrix.Rd
+++ b/R-package/man/xgb.DMatrix.Rd
@@ -27,4 +27,3 @@ dtrain <- xgb.DMatrix(train$data, label=train$label)
 xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
 dtrain <- xgb.DMatrix('xgb.DMatrix.data')
 }
-
--- a/R-package/man/xgb.DMatrix.save.Rd
+++ b/R-package/man/xgb.DMatrix.save.Rd
@@ -21,4 +21,3 @@ dtrain <- xgb.DMatrix(train$data, label=train$label)
 xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
 dtrain <- xgb.DMatrix('xgb.DMatrix.data')
 }
-
--- a/R-package/man/xgb.attr.Rd
+++ b/R-package/man/xgb.attr.Rd
@@ -83,4 +83,3 @@ xgb.attributes(bst1) <- list(a = NULL, b = NULL)
 print(xgb.attributes(bst1))

 }
-
--- a/R-package/man/xgb.create.features.Rd
+++ b/R-package/man/xgb.create.features.Rd
@@ -90,4 +90,3 @@ cat(paste("The accuracy was", accuracy.before, "before adding leaf features and
          accuracy.after, "!\\n"))

 }
-
--- a/R-package/man/xgb.cv.Rd
+++ b/R-package/man/xgb.cv.Rd
@@ -140,4 +140,3 @@ print(cv)
 print(cv, verbose=TRUE)

 }
-
--- a/R-package/man/xgb.dump.Rd
+++ b/R-package/man/xgb.dump.Rd
@@ -53,4 +53,3 @@ print(xgb.dump(bst, with_stats = TRUE))
 cat(xgb.dump(bst, with_stats = TRUE, dump_format='json'))

 }
-
--- a/R-package/man/xgb.importance.Rd
+++ b/R-package/man/xgb.importance.Rd
@@ -63,4 +63,3 @@ bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_dep
 xgb.importance(model = bst)

 }
-
--- a/R-package/man/xgb.load.Rd
+++ b/R-package/man/xgb.load.Rd
@@ -38,4 +38,3 @@ pred <- predict(bst, test$data)
 \seealso{
 \code{\link{xgb.save}}, \code{\link{xgb.Booster.complete}}.
 }
-
--- a/R-package/man/xgb.model.dt.tree.Rd
+++ b/R-package/man/xgb.model.dt.tree.Rd
@@ -67,4 +67,3 @@ bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_dep
 merge(dt, dt[, .(ID, Y.Feature=Feature)], by.x='Yes', by.y='ID', all.x=TRUE)[order(Tree,Node)]
 
 }
-
--- a/R-package/man/xgb.parameters.Rd
+++ b/R-package/man/xgb.parameters.Rd
@@ -29,4 +29,3 @@ bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
 xgb.parameters(bst) <- list(eta = 0.1)

 }
-
--- a/R-package/man/xgb.plot.deepness.Rd
+++ b/R-package/man/xgb.plot.deepness.Rd
@@ -72,4 +72,3 @@ xgb.plot.deepness(bst, which='med.weight', pch=16, col=rgb(0,0,1,0.3), cex=2)
 \seealso{
 \code{\link{xgb.train}}, \code{\link{xgb.model.dt.tree}}.
 }
-
--- a/R-package/man/xgb.plot.importance.Rd
+++ b/R-package/man/xgb.plot.importance.Rd
@@ -79,4 +79,3 @@ gg + ggplot2::ylab("Frequency")
 \seealso{
 \code{\link[graphics]{barplot}}.
 }
-
--- a/R-package/man/xgb.plot.multi.trees.Rd
+++ b/R-package/man/xgb.plot.multi.trees.Rd
@@ -56,4 +56,3 @@ p <- xgb.plot.multi.trees(model = bst, feature_names = colnames(agaricus.train$d
 print(p)

 }
-
--- a/R-package/man/xgb.plot.tree.Rd
+++ b/R-package/man/xgb.plot.tree.Rd
@@ -74,4 +74,3 @@ xgb.plot.tree(feature_names = colnames(agaricus.train$data), model = bst,
              trees = 0, show_node_id = TRUE)

 }
-
--- a/R-package/man/xgb.save.Rd
+++ b/R-package/man/xgb.save.Rd
@@ -38,4 +38,3 @@ pred <- predict(bst, test$data)
 \seealso{
 \code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}.
 }
-
--- a/R-package/man/xgb.save.raw.Rd
+++ b/R-package/man/xgb.save.raw.Rd
@@ -25,4 +25,3 @@ bst <- xgb.load(raw)
 pred <- predict(bst, test$data)

 }
-
--- a/R-package/man/xgb.train.Rd
+++ b/R-package/man/xgb.train.Rd
@@ -264,4 +264,3 @@ pred <- predict(bst, agaricus.test$data)
 \code{\link{predict.xgb.Booster}},
 \code{\link{xgb.cv}}
 }
-
--- a/R-package/man/xgboost-deprecated.Rd
+++ b/R-package/man/xgboost-deprecated.Rd
@@ -14,4 +14,3 @@ A deprecation warning is shown when any of the deprecated parameters is used in
 An additional warning is shown when there was a partial match to a deprecated parameter 
 (as R is able to partially match parameter names).
 }
-
--- a/R-package/vignettes/discoverYourData.Rmd
+++ b/R-package/vignettes/discoverYourData.Rmd
@@ -134,23 +134,24 @@ levels(df[,Treatment])
 ```


-#### One-hot encoding
+#### Encoding categorical features

 Next step, we will transform the categorical data to dummy variables.
-This is the [one-hot encoding](http://en.wikipedia.org/wiki/One-hot) step.
+Several encoding methods exist, e.g., [one-hot encoding](http://en.wikipedia.org/wiki/One-hot) is a common approach.
+We will use the [dummy contrast coding](http://www.ats.ucla.edu/stat/r/library/contrast_coding.htm#dummy), which is popular because it produces a "full rank" encoding (also see [this blog post by Max Kuhn](http://appliedpredictivemodeling.com/blog/2013/10/23/the-basics-of-encoding-categorical-data-for-predictive-models)).

 The purpose is to transform each value of each *categorical* feature into a *binary* feature `{0, 1}`.

-For example, the column `Treatment` will be replaced by two columns, `Placebo`, and `Treated`. Each of them will be *binary*. Therefore, an observation which has the value `Placebo` in column `Treatment` before the transformation will have after the transformation the value `1` in the new column `Placebo` and the value `0` in the new column `Treated`. The column `Treatment` will disappear during the one-hot encoding.
+For example, the column `Treatment` will be replaced by two columns, `TreatmentPlacebo`, and `TreatmentTreated`. Each of them will be *binary*. Therefore, an observation which has the value `Placebo` in column `Treatment` before the transformation will have after the transformation the value `1` in the new column `TreatmentPlacebo` and the value `0` in the new column `TreatmentTreated`. The column `TreatmentPlacebo` will disappear during the contrast encoding, as it would be absorbed into a common constant intercept column.

 Column `Improved` is excluded because it will be our `label` column, the one we want to predict.

 ```{r, warning=FALSE,message=FALSE}
-sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
+sparse_matrix <- sparse.model.matrix(Improved ~ ., data = df)[,-1]
 head(sparse_matrix)
 ```

-> Formulae `Improved~.-1` used above means transform all *categorical* features but column `Improved` to binary values. The `-1` is here to remove the first column which is full of `1` (this column is generated by the conversion). For more information, you can type `?sparse.model.matrix` in the console.
+> The formula `Improved ~ .` used above means: transform all *categorical* features except column `Improved` into binary values. The `[,-1]` column selection removes the first column, the intercept, which is full of `1` (this column is generated by the conversion). For more information, you can type `?sparse.model.matrix` in the console.

 Create the output `numeric` vector (not as a sparse `Matrix`):