From b4d97d3cb81f72f8967ecd6c4b1e2412e0813239 Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Mon, 20 Feb 2017 12:02:40 -0600 Subject: [PATCH] R maintenance Feb2017 (#2045) * [R] better argument check in xgb.DMatrix; fixes #1480 * [R] showsd was a dummy; fixes #2044 * [R] better categorical encoding explanation in vignette; fixes #1989 * [R] new roxygen version docs update --- R-package/DESCRIPTION | 2 +- R-package/R/callbacks.R | 6 ++++-- R-package/R/xgb.DMatrix.R | 3 +++ R-package/R/xgb.cv.R | 2 +- R-package/man/agaricus.test.Rd | 1 - R-package/man/agaricus.train.Rd | 1 - R-package/man/callbacks.Rd | 1 - R-package/man/cb.cv.predict.Rd | 1 - R-package/man/cb.early.stop.Rd | 1 - R-package/man/cb.evaluation.log.Rd | 1 - R-package/man/cb.print.evaluation.Rd | 5 +++-- R-package/man/cb.reset.parameters.Rd | 1 - R-package/man/cb.save.model.Rd | 1 - R-package/man/dim.xgb.DMatrix.Rd | 1 - R-package/man/dimnames.xgb.DMatrix.Rd | 1 - R-package/man/getinfo.Rd | 1 - R-package/man/predict.xgb.Booster.Rd | 1 - R-package/man/print.xgb.Booster.Rd | 1 - R-package/man/print.xgb.DMatrix.Rd | 1 - R-package/man/print.xgb.cv.Rd | 1 - R-package/man/setinfo.Rd | 1 - R-package/man/slice.xgb.DMatrix.Rd | 3 +-- R-package/man/xgb.Booster.complete.Rd | 1 - R-package/man/xgb.DMatrix.Rd | 1 - R-package/man/xgb.DMatrix.save.Rd | 1 - R-package/man/xgb.attr.Rd | 1 - R-package/man/xgb.create.features.Rd | 1 - R-package/man/xgb.cv.Rd | 1 - R-package/man/xgb.dump.Rd | 1 - R-package/man/xgb.importance.Rd | 1 - R-package/man/xgb.load.Rd | 1 - R-package/man/xgb.model.dt.tree.Rd | 1 - R-package/man/xgb.parameters.Rd | 1 - R-package/man/xgb.plot.deepness.Rd | 1 - R-package/man/xgb.plot.importance.Rd | 1 - R-package/man/xgb.plot.multi.trees.Rd | 1 - R-package/man/xgb.plot.tree.Rd | 1 - R-package/man/xgb.save.Rd | 1 - R-package/man/xgb.save.raw.Rd | 1 - R-package/man/xgb.train.Rd | 1 - R-package/man/xgboost-deprecated.Rd | 1 - R-package/vignettes/discoverYourData.Rmd | 11 ++++++----- 42 files 
changed, 19 insertions(+), 48 deletions(-) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index ef16f5cba..1db99e49a 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -37,4 +37,4 @@ Imports: data.table (>= 1.9.6), magrittr (>= 1.5), stringi (>= 0.5.2) -RoxygenNote: 5.0.1 +RoxygenNote: 6.0.1 diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R index c66bbfd6b..9de59b6b6 100644 --- a/R-package/R/callbacks.R +++ b/R-package/R/callbacks.R @@ -41,6 +41,7 @@ NULL #' Callback closure for printing the result of evaluation #' #' @param period results would be printed every number of periods +#' @param showsd whether standard deviations should be printed (when available) #' #' @details #' The callback function prints the result of evaluation at every \code{period} iterations. @@ -56,7 +57,7 @@ NULL #' \code{\link{callbacks}} #' #' @export -cb.print.evaluation <- function(period=1) { +cb.print.evaluation <- function(period=1, showsd=TRUE) { callback <- function(env = parent.frame()) { if (length(env$bst_evaluation) == 0 || @@ -68,7 +69,8 @@ cb.print.evaluation <- function(period=1) { if ((i-1) %% period == 0 || i == env$begin_iteration || i == env$end_iteration) { - msg <- format.eval.string(i, env$bst_evaluation, env$bst_evaluation_err) + stdev <- if (showsd) env$bst_evaluation_err else NULL + msg <- format.eval.string(i, env$bst_evaluation, stdev) cat(msg, '\n') } } diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index 9dc0d1f26..94c5ad257 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -20,6 +20,9 @@ xgb.DMatrix <- function(data, info = list(), missing = NA, ...) 
{ cnames <- NULL if (typeof(data) == "character") { + if (length(data) > 1) + stop("'data' has class 'character' and length ", length(data), + ".\n 'data' accepts either a numeric matrix or a single filename.") handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE), PACKAGE = "xgboost") } else if (is.matrix(data)) { diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index d455a4079..f576b2430 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -153,7 +153,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = params <- c(params, list(silent = 1)) print_every_n <- max( as.integer(print_every_n), 1L) if (!has.callbacks(callbacks, 'cb.print.evaluation') && verbose) { - callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n)) + callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n, showsd=showsd)) } # evaluation log callback: always is on in CV evaluation_log <- list() diff --git a/R-package/man/agaricus.test.Rd b/R-package/man/agaricus.test.Rd index 52ff08f86..041ff4e6c 100644 --- a/R-package/man/agaricus.test.Rd +++ b/R-package/man/agaricus.test.Rd @@ -29,4 +29,3 @@ Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository School of Information and Computer Science. } \keyword{datasets} - diff --git a/R-package/man/agaricus.train.Rd b/R-package/man/agaricus.train.Rd index e27d3ac25..0c08e8080 100644 --- a/R-package/man/agaricus.train.Rd +++ b/R-package/man/agaricus.train.Rd @@ -29,4 +29,3 @@ Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository School of Information and Computer Science. 
} \keyword{datasets} - diff --git a/R-package/man/callbacks.Rd b/R-package/man/callbacks.Rd index d49f104f2..94070b867 100644 --- a/R-package/man/callbacks.Rd +++ b/R-package/man/callbacks.Rd @@ -35,4 +35,3 @@ with the objects available inside of the \code{xgb.train} and \code{xgb.cv} inte \code{\link{xgb.train}}, \code{\link{xgb.cv}} } - diff --git a/R-package/man/cb.cv.predict.Rd b/R-package/man/cb.cv.predict.Rd index 24f35c076..f43ccd019 100644 --- a/R-package/man/cb.cv.predict.Rd +++ b/R-package/man/cb.cv.predict.Rd @@ -41,4 +41,3 @@ Callback function expects the following values to be set in its calling frame: \seealso{ \code{\link{callbacks}} } - diff --git a/R-package/man/cb.early.stop.Rd b/R-package/man/cb.early.stop.Rd index eec30d7b5..97bf4cd49 100644 --- a/R-package/man/cb.early.stop.Rd +++ b/R-package/man/cb.early.stop.Rd @@ -60,4 +60,3 @@ Callback function expects the following values to be set in its calling frame: \code{\link{callbacks}}, \code{\link{xgb.attr}} } - diff --git a/R-package/man/cb.evaluation.log.Rd b/R-package/man/cb.evaluation.log.Rd index a71b7f8d3..1e5ffee31 100644 --- a/R-package/man/cb.evaluation.log.Rd +++ b/R-package/man/cb.evaluation.log.Rd @@ -29,4 +29,3 @@ Callback function expects the following values to be set in its calling frame: \seealso{ \code{\link{callbacks}} } - diff --git a/R-package/man/cb.print.evaluation.Rd b/R-package/man/cb.print.evaluation.Rd index aec57fe2d..59b9ba65e 100644 --- a/R-package/man/cb.print.evaluation.Rd +++ b/R-package/man/cb.print.evaluation.Rd @@ -4,10 +4,12 @@ \alias{cb.print.evaluation} \title{Callback closure for printing the result of evaluation} \usage{ -cb.print.evaluation(period = 1) +cb.print.evaluation(period = 1, showsd = TRUE) } \arguments{ \item{period}{results would be printed every number of periods} + +\item{showsd}{whether standard deviations should be printed (when available)} } \description{ Callback closure for printing the result of evaluation @@ -25,4 +27,3 @@ Callback 
function expects the following values to be set in its calling frame: \seealso{ \code{\link{callbacks}} } - diff --git a/R-package/man/cb.reset.parameters.Rd b/R-package/man/cb.reset.parameters.Rd index 24965c815..66f2a1c48 100644 --- a/R-package/man/cb.reset.parameters.Rd +++ b/R-package/man/cb.reset.parameters.Rd @@ -34,4 +34,3 @@ Callback function expects the following values to be set in its calling frame: \seealso{ \code{\link{callbacks}} } - diff --git a/R-package/man/cb.save.model.Rd b/R-package/man/cb.save.model.Rd index eef9b6295..d867d92d9 100644 --- a/R-package/man/cb.save.model.Rd +++ b/R-package/man/cb.save.model.Rd @@ -31,4 +31,3 @@ Callback function expects the following values to be set in its calling frame: \seealso{ \code{\link{callbacks}} } - diff --git a/R-package/man/dim.xgb.DMatrix.Rd b/R-package/man/dim.xgb.DMatrix.Rd index 168782dec..87b8d14f9 100644 --- a/R-package/man/dim.xgb.DMatrix.Rd +++ b/R-package/man/dim.xgb.DMatrix.Rd @@ -26,4 +26,3 @@ stopifnot(ncol(dtrain) == ncol(train$data)) stopifnot(all(dim(dtrain) == dim(train$data))) } - diff --git a/R-package/man/dimnames.xgb.DMatrix.Rd b/R-package/man/dimnames.xgb.DMatrix.Rd index 0877f294b..e17069d58 100644 --- a/R-package/man/dimnames.xgb.DMatrix.Rd +++ b/R-package/man/dimnames.xgb.DMatrix.Rd @@ -33,4 +33,3 @@ colnames(dtrain) <- make.names(1:ncol(train$data)) print(dtrain, verbose=TRUE) } - diff --git a/R-package/man/getinfo.Rd b/R-package/man/getinfo.Rd index 16fbe8a79..480ef6b3e 100644 --- a/R-package/man/getinfo.Rd +++ b/R-package/man/getinfo.Rd @@ -40,4 +40,3 @@ setinfo(dtrain, 'label', 1-labels) labels2 <- getinfo(dtrain, 'label') stopifnot(all(labels2 == 1-labels)) } - diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd index ed6c456b1..67a06fe2a 100644 --- a/R-package/man/predict.xgb.Booster.Rd +++ b/R-package/man/predict.xgb.Booster.Rd @@ -126,4 +126,3 @@ plot(err, type='l', ylim=c(0,0.1), xlab='#trees') \seealso{ \code{\link{xgb.train}}. 
} - diff --git a/R-package/man/print.xgb.Booster.Rd b/R-package/man/print.xgb.Booster.Rd index bec142f8b..d684882f5 100644 --- a/R-package/man/print.xgb.Booster.Rd +++ b/R-package/man/print.xgb.Booster.Rd @@ -27,4 +27,3 @@ print(bst) print(bst, verbose=TRUE) } - diff --git a/R-package/man/print.xgb.DMatrix.Rd b/R-package/man/print.xgb.DMatrix.Rd index 0b677c30c..117c9771b 100644 --- a/R-package/man/print.xgb.DMatrix.Rd +++ b/R-package/man/print.xgb.DMatrix.Rd @@ -26,4 +26,3 @@ dtrain print(dtrain, verbose=TRUE) } - diff --git a/R-package/man/print.xgb.cv.Rd b/R-package/man/print.xgb.cv.Rd index 731cc91ce..79c791dd7 100644 --- a/R-package/man/print.xgb.cv.Rd +++ b/R-package/man/print.xgb.cv.Rd @@ -29,4 +29,3 @@ print(cv) print(cv, verbose=TRUE) } - diff --git a/R-package/man/setinfo.Rd b/R-package/man/setinfo.Rd index 92c2e3294..e6cf3257f 100644 --- a/R-package/man/setinfo.Rd +++ b/R-package/man/setinfo.Rd @@ -41,4 +41,3 @@ setinfo(dtrain, 'label', 1-labels) labels2 <- getinfo(dtrain, 'label') stopifnot(all.equal(labels2, 1-labels)) } - diff --git a/R-package/man/slice.xgb.DMatrix.Rd b/R-package/man/slice.xgb.DMatrix.Rd index c352203fb..c3df25891 100644 --- a/R-package/man/slice.xgb.DMatrix.Rd +++ b/R-package/man/slice.xgb.DMatrix.Rd @@ -1,9 +1,9 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.DMatrix.R \name{slice} -\alias{[.xgb.DMatrix} \alias{slice} \alias{slice.xgb.DMatrix} +\alias{[.xgb.DMatrix} \title{Get a new DMatrix containing the specified rows of orginal xgb.DMatrix object} \usage{ @@ -38,4 +38,3 @@ labels2 <- getinfo(dsub, 'label') all.equal(labels1, labels2) } - diff --git a/R-package/man/xgb.Booster.complete.Rd b/R-package/man/xgb.Booster.complete.Rd index 0e821e33c..725c11bd5 100644 --- a/R-package/man/xgb.Booster.complete.Rd +++ b/R-package/man/xgb.Booster.complete.Rd @@ -46,4 +46,3 @@ bst1 <- xgb.Booster.complete(bst1) print(bst1$handle) } - diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd 
index cdb8fdf62..7f38c01ed 100644 --- a/R-package/man/xgb.DMatrix.Rd +++ b/R-package/man/xgb.DMatrix.Rd @@ -27,4 +27,3 @@ dtrain <- xgb.DMatrix(train$data, label=train$label) xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data') dtrain <- xgb.DMatrix('xgb.DMatrix.data') } - diff --git a/R-package/man/xgb.DMatrix.save.Rd b/R-package/man/xgb.DMatrix.save.Rd index 9b0e835be..eb4c95370 100644 --- a/R-package/man/xgb.DMatrix.save.Rd +++ b/R-package/man/xgb.DMatrix.save.Rd @@ -21,4 +21,3 @@ dtrain <- xgb.DMatrix(train$data, label=train$label) xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data') dtrain <- xgb.DMatrix('xgb.DMatrix.data') } - diff --git a/R-package/man/xgb.attr.Rd b/R-package/man/xgb.attr.Rd index 691d126cc..d501491bb 100644 --- a/R-package/man/xgb.attr.Rd +++ b/R-package/man/xgb.attr.Rd @@ -83,4 +83,3 @@ xgb.attributes(bst1) <- list(a = NULL, b = NULL) print(xgb.attributes(bst1)) } - diff --git a/R-package/man/xgb.create.features.Rd b/R-package/man/xgb.create.features.Rd index 4f799444b..2f3e5d099 100644 --- a/R-package/man/xgb.create.features.Rd +++ b/R-package/man/xgb.create.features.Rd @@ -90,4 +90,3 @@ cat(paste("The accuracy was", accuracy.before, "before adding leaf features and accuracy.after, "!\\n")) } - diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index e027df4a7..19bba4fdc 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -140,4 +140,3 @@ print(cv) print(cv, verbose=TRUE) } - diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd index bd536b4bf..411c456b3 100644 --- a/R-package/man/xgb.dump.Rd +++ b/R-package/man/xgb.dump.Rd @@ -53,4 +53,3 @@ print(xgb.dump(bst, with_stats = TRUE)) cat(xgb.dump(bst, with_stats = TRUE, dump_format='json')) } - diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd index 3270a1b70..11a6cc854 100644 --- a/R-package/man/xgb.importance.Rd +++ b/R-package/man/xgb.importance.Rd @@ -63,4 +63,3 @@ bst <- xgboost(data = agaricus.train$data, label = 
agaricus.train$label, max_dep xgb.importance(model = bst) } - diff --git a/R-package/man/xgb.load.Rd b/R-package/man/xgb.load.Rd index 1634a8a38..4cfccdd2d 100644 --- a/R-package/man/xgb.load.Rd +++ b/R-package/man/xgb.load.Rd @@ -38,4 +38,3 @@ pred <- predict(bst, test$data) \seealso{ \code{\link{xgb.save}}, \code{\link{xgb.Booster.complete}}. } - diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index 9d7a5056a..59dad6190 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -67,4 +67,3 @@ bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_dep merge(dt, dt[, .(ID, Y.Feature=Feature)], by.x='Yes', by.y='ID', all.x=TRUE)[order(Tree,Node)] } - diff --git a/R-package/man/xgb.parameters.Rd b/R-package/man/xgb.parameters.Rd index 3df866816..ab2695650 100644 --- a/R-package/man/xgb.parameters.Rd +++ b/R-package/man/xgb.parameters.Rd @@ -29,4 +29,3 @@ bst <- xgboost(data = train$data, label = train$label, max_depth = 2, xgb.parameters(bst) <- list(eta = 0.1) } - diff --git a/R-package/man/xgb.plot.deepness.Rd b/R-package/man/xgb.plot.deepness.Rd index 1d91a01f7..eebafdc36 100644 --- a/R-package/man/xgb.plot.deepness.Rd +++ b/R-package/man/xgb.plot.deepness.Rd @@ -72,4 +72,3 @@ xgb.plot.deepness(bst, which='med.weight', pch=16, col=rgb(0,0,1,0.3), cex=2) \seealso{ \code{\link{xgb.train}}, \code{\link{xgb.model.dt.tree}}. } - diff --git a/R-package/man/xgb.plot.importance.Rd b/R-package/man/xgb.plot.importance.Rd index 35b30d7af..fcb51e753 100644 --- a/R-package/man/xgb.plot.importance.Rd +++ b/R-package/man/xgb.plot.importance.Rd @@ -79,4 +79,3 @@ gg + ggplot2::ylab("Frequency") \seealso{ \code{\link[graphics]{barplot}}. 
} - diff --git a/R-package/man/xgb.plot.multi.trees.Rd b/R-package/man/xgb.plot.multi.trees.Rd index faef94555..3f8adcaaa 100644 --- a/R-package/man/xgb.plot.multi.trees.Rd +++ b/R-package/man/xgb.plot.multi.trees.Rd @@ -56,4 +56,3 @@ p <- xgb.plot.multi.trees(model = bst, feature_names = colnames(agaricus.train$d print(p) } - diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd index c4b7a6db3..f32a8b7e0 100644 --- a/R-package/man/xgb.plot.tree.Rd +++ b/R-package/man/xgb.plot.tree.Rd @@ -74,4 +74,3 @@ xgb.plot.tree(feature_names = colnames(agaricus.train$data), model = bst, trees = 0, show_node_id = TRUE) } - diff --git a/R-package/man/xgb.save.Rd b/R-package/man/xgb.save.Rd index 00b32ef78..08126bd87 100644 --- a/R-package/man/xgb.save.Rd +++ b/R-package/man/xgb.save.Rd @@ -38,4 +38,3 @@ pred <- predict(bst, test$data) \seealso{ \code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}. } - diff --git a/R-package/man/xgb.save.raw.Rd b/R-package/man/xgb.save.raw.Rd index 7f808529e..bbe7faf2c 100644 --- a/R-package/man/xgb.save.raw.Rd +++ b/R-package/man/xgb.save.raw.Rd @@ -25,4 +25,3 @@ bst <- xgb.load(raw) pred <- predict(bst, test$data) } - diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 269789b19..c05a8fe7c 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -264,4 +264,3 @@ pred <- predict(bst, agaricus.test$data) \code{\link{predict.xgb.Booster}}, \code{\link{xgb.cv}} } - diff --git a/R-package/man/xgboost-deprecated.Rd b/R-package/man/xgboost-deprecated.Rd index 2cb546212..03eb2b8e0 100644 --- a/R-package/man/xgboost-deprecated.Rd +++ b/R-package/man/xgboost-deprecated.Rd @@ -14,4 +14,3 @@ A deprecation warning is shown when any of the deprecated parameters is used in An additional warning is shown when there was a partial match to a deprecated parameter (as R is able to partially match parameter names). 
} - diff --git a/R-package/vignettes/discoverYourData.Rmd b/R-package/vignettes/discoverYourData.Rmd index 9a3d4f033..f5025c5d9 100644 --- a/R-package/vignettes/discoverYourData.Rmd +++ b/R-package/vignettes/discoverYourData.Rmd @@ -134,23 +134,24 @@ levels(df[,Treatment]) ``` -#### One-hot encoding +#### Encoding categorical features Next step, we will transform the categorical data to dummy variables. -This is the [one-hot encoding](http://en.wikipedia.org/wiki/One-hot) step. +Several encoding methods exist, e.g., [one-hot encoding](http://en.wikipedia.org/wiki/One-hot) is a common approach. +We will use the [dummy contrast coding](http://www.ats.ucla.edu/stat/r/library/contrast_coding.htm#dummy) which is popular because it produces "full rank" encoding (also see [this blog post by Max Kuhn](http://appliedpredictivemodeling.com/blog/2013/10/23/the-basics-of-encoding-categorical-data-for-predictive-models)). The purpose is to transform each value of each *categorical* feature into a *binary* feature `{0, 1}`. -For example, the column `Treatment` will be replaced by two columns, `Placebo`, and `Treated`. Each of them will be *binary*. Therefore, an observation which has the value `Placebo` in column `Treatment` before the transformation will have after the transformation the value `1` in the new column `Placebo` and the value `0` in the new column `Treated`. The column `Treatment` will disappear during the one-hot encoding. +For example, the column `Treatment` will be replaced by two columns, `TreatmentPlacebo`, and `TreatmentTreated`. Each of them will be *binary*. Therefore, an observation which has the value `Placebo` in column `Treatment` before the transformation will have after the transformation the value `1` in the new column `TreatmentPlacebo` and the value `0` in the new column `TreatmentTreated`. The column `TreatmentPlacebo` will disappear during the contrast encoding, as it would be absorbed into a common constant intercept column. 
Column `Improved` is excluded because it will be our `label` column, the one we want to predict. ```{r, warning=FALSE,message=FALSE} -sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df) +sparse_matrix <- sparse.model.matrix(Improved ~ ., data = df)[,-1] head(sparse_matrix) ``` -> Formulae `Improved~.-1` used above means transform all *categorical* features but column `Improved` to binary values. The `-1` is here to remove the first column which is full of `1` (this column is generated by the conversion). For more information, you can type `?sparse.model.matrix` in the console. +> Formula `Improved ~ .` used above means transform all *categorical* features but column `Improved` to binary values. The `-1` column selection removes the intercept column which is full of `1` (this column is generated by the conversion). For more information, you can type `?sparse.model.matrix` in the console. Create the output `numeric` vector (not as a sparse `Matrix`):