diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 4182179ad..c3fac04e3 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -24,6 +24,7 @@ export(cb.save.model) export(getinfo) export(setinfo) export(slice) +export(xgb.Booster.complete) export(xgb.DMatrix) export(xgb.DMatrix.save) export(xgb.attr) diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R index f35565273..c66bbfd6b 100644 --- a/R-package/R/callbacks.R +++ b/R-package/R/callbacks.R @@ -507,7 +507,7 @@ cb.cv.predict <- function(save_models = FALSE) { if (save_models) { env$basket$models <- lapply(env$bst_folds, function(fd) { xgb.attr(fd$bst, 'niter') <- env$end_iteration - 1 - xgb.Booster.check(xgb.handleToBooster(fd$bst), saveraw = TRUE) + xgb.Booster.complete(xgb.handleToBooster(fd$bst), saveraw = TRUE) }) } } diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index cf428e4e8..8aafee1f3 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -1,6 +1,6 @@ -# Construct a Booster from cachelist +# Construct an internal xgboost Booster and return a handle to it # internal utility function -xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) { +xgb.Booster.handle <- function(params = list(), cachelist = list(), modelfile = NULL) { if (typeof(cachelist) != "list" || any(sapply(cachelist, class) != 'xgb.DMatrix')) { stop("xgb.Booster only accepts list of DMatrix as cachelist") @@ -13,8 +13,8 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) { } else if (typeof(modelfile) == "raw") { .Call("XGBoosterLoadModelFromRaw_R", handle, modelfile, PACKAGE = "xgboost") } else if (class(modelfile) == "xgb.Booster") { - modelfile <- xgb.Booster.check(modelfile, saveraw=TRUE) - .Call("XGBoosterLoadModelFromRaw_R", handle, modelfile$raw, PACKAGE = "xgboost") + bst <- xgb.Booster.complete(modelfile, saveraw=TRUE) + .Call("XGBoosterLoadModelFromRaw_R", handle, bst$raw, PACKAGE = "xgboost") } else { stop("modelfile must be either character filename, or raw booster dump, or xgb.Booster object") } @@ -34,6 +34,17 @@ xgb.handleToBooster <- function(handle, raw = NULL) { return(bst) } +# Check whether xgb.Booster.handle is null +# internal utility function +is.null.handle <- function(handle) { + if (class(handle) != "xgb.Booster.handle") + stop("argument type must be xgb.Booster.handle") + + if (is.null(handle) || .Call("XGCheckNullPtr_R", handle, PACKAGE="xgboost")) + return(TRUE) + return(FALSE) +} + # Return a verified to be valid handle out of either xgb.Booster.handle or xgb.Booster # internal utility function xgb.get.handle <- function(object) { @@ -42,32 +53,65 @@ xgb.get.handle <- function(object) { xgb.Booster.handle = object, stop("argument must be of either xgb.Booster or xgb.Booster.handle class") ) - if (is.null(handle) || .Call("XGCheckNullPtr_R", handle, PACKAGE="xgboost")) { + if (is.null.handle(handle)) { stop("invalid xgb.Booster.handle") } handle } -# Check whether an xgb.Booster object is complete -# internal utility function -xgb.Booster.check <- function(bst, saveraw = TRUE) { - if (class(bst) != "xgb.Booster") +#' Restore missing parts of an incomplete xgb.Booster object. +#' +#' It attempts to complete an \code{xgb.Booster} object by restoring either its missing +#' raw model memory dump (when it has no \code{raw} data but its \code{xgb.Booster.handle} is valid) +#' or its missing internal handle (when its \code{xgb.Booster.handle} is not valid +#' but it has a raw Booster memory dump). 
+#' +#' @param object object of class \code{xgb.Booster} +#' @param saveraw a flag indicating whether to append \code{raw} Booster memory dump data +#' when it doesn't already exist. +#' +#' @details +#' +#' While this method is primarily for internal use, it might be useful in some practical situations. +#' +#' E.g., when an \code{xgb.Booster} model is saved as an R object and then is loaded as an R object, +#' its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods +#' should still work for such a model object since those methods would be using +#' \code{xgb.Booster.complete} internally. However, one might find it to be more efficient to call the +#' \code{xgb.Booster.complete} function once after loading a model as an R-object. That would +#' prevent further reconstruction (potentially, multiple times) of an internal booster model. +#' +#' @return +#' An object of \code{xgb.Booster} class. +#' +#' @examples +#' +#' data(agaricus.train, package='xgboost') +#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, +#'               eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") +#' saveRDS(bst, "xgb.model.rds") +#' +#' bst1 <- readRDS("xgb.model.rds") +#' # the handle is invalid: +#' print(bst1$handle) +#' bst1 <- xgb.Booster.complete(bst1) +#' # now the handle points to a valid internal booster model: +#' print(bst1$handle) +#' +#' @export +xgb.Booster.complete <- function(object, saveraw = TRUE) { +  if (class(object) != "xgb.Booster") stop("argument type must be xgb.Booster") -  isnull <- is.null(bst$handle) -  if (!isnull) { -    isnull <- .Call("XGCheckNullPtr_R", bst$handle, PACKAGE="xgboost") -  } -  if (isnull) { -    bst$handle <- xgb.Booster(modelfile = bst$raw) +  if (is.null.handle(object$handle)) { +    object$handle <- xgb.Booster.handle(modelfile = object$raw) } else { -    if (is.null(bst$raw) && saveraw) -      bst$raw <- xgb.save.raw(bst$handle) +    if (is.null(object$raw) && saveraw) +      object$raw <- xgb.save.raw(object$handle) } -  return(bst) +  return(object) } - #' Predict method for eXtreme Gradient Boosting model #' #' Predicted values based on either xgboost model or model handle object. @@ -180,7 +224,7 @@ xgb.Booster.check <- function(bst, saveraw = TRUE) { predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE, reshape = FALSE, ...) { -  object <- xgb.Booster.check(object, saveraw = FALSE) +  object <- xgb.Booster.complete(object, saveraw = FALSE) if (class(newdata) != "xgb.DMatrix") newdata <- xgb.DMatrix(newdata, missing = missing) if (is.null(ntreelimit)) @@ -429,11 +473,10 @@ xgb.ntree <- function(bst) { print.xgb.Booster <- function(x, verbose=FALSE, ...) { cat('##### xgb.Booster\n')   -  if (is.null(x$handle) || .Call("XGCheckNullPtr_R", x$handle, PACKAGE="xgboost")) { -    cat("handle is invalid\n") -    return(x) -  } -   +  valid_handle <- !is.null.handle(x$handle) +  if (!valid_handle) +    cat("Handle is invalid! Suggest using xgb.Booster.complete\n") +   cat('raw: ') if (!is.null(x$raw)) { cat(format(object.size(x$raw), units="auto"), '\n') @@ -454,7 +497,9 @@ print.xgb.Booster <- function(x, verbose=FALSE, ...) { } # TODO: need an interface to access all the xgboosts parameters -  attrs <- xgb.attributes(x) +  attrs <- character(0) +  if (valid_handle) +    attrs <- xgb.attributes(x) if (length(attrs) > 0) { cat('xgb.attributes:\n') if (verbose) { @@ -474,15 +519,19 @@ print.xgb.Booster <- function(x, verbose=FALSE, ...)
{ }) } + if (!is.null(x$feature_names)) + cat('# of features:', length(x$feature_names), '\n') + cat('niter: ', x$niter, '\n', sep='') # TODO: uncomment when faster xgb.ntree is implemented #cat('ntree: ', xgb.ntree(x), '\n', sep='') - for (n in setdiff(names(x), c('handle', 'raw', 'call', 'params', 'callbacks','evaluation_log','niter'))) { + for (n in setdiff(names(x), c('handle', 'raw', 'call', 'params', 'callbacks', + 'evaluation_log','niter','feature_names'))) { if (is.atomic(x[[n]])) { - cat(n, ': ', x[[n]], '\n', sep='') + cat(n, ':', x[[n]], '\n', sep=' ') } else { - cat(n, ':\n\t', sep='') + cat(n, ':\n\t', sep=' ') print(x[[n]]) } } diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index 732c5c726..9dc0d1f26 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -31,18 +31,13 @@ xgb.DMatrix <- function(data, info = list(), missing = NA, ...) { PACKAGE = "xgboost") cnames <- colnames(data) } else { - stop(paste("xgb.DMatrix: does not support to construct from ", - typeof(data))) + stop("xgb.DMatrix does not support construction from ", typeof(data)) } dmat <- handle attributes(dmat) <- list(.Dimnames = list(NULL, cnames), class = "xgb.DMatrix") - #dmat <- list(handle = handle, colnames = cnames) - #attr(dmat, 'class') <- "xgb.DMatrix" info <- append(info, list(...)) - if (length(info) == 0) - return(dmat) - for (i in 1:length(info)) { + for (i in seq_along(info)) { p <- info[i] setinfo(dmat, names(p), p[[1]]) } @@ -70,11 +65,10 @@ xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) { dtrain <- xgb.DMatrix(data) } else if (inClass == "xgb.DMatrix") { dtrain <- data - } else if (inClass == "data.frame") { - stop("xgboost only support numerical matrix input, - use 'data.matrix' to transform the data.") + } else if ("data.frame" %in% inClass) { + stop("xgboost doesn't support data.frame as input. Convert it to matrix first.") } else { - stop("xgboost: Invalid input of data") + stop("xgboost: invalid input data") } } return (dtrain) @@ -190,7 +184,7 @@ getinfo.xgb.DMatrix <- function(object, name, ...) { if (typeof(name) != "character" || length(name) != 1 || !name %in% c('label', 'weight', 'base_margin', 'nrow')) { - stop("getinfo: name must one of the following\n", + stop("getinfo: name must be one of the following\n", " 'label', 'weight', 'base_margin', 'nrow'") } if (name != "nrow"){ @@ -266,7 +260,7 @@ setinfo.xgb.DMatrix <- function(object, name, info, ...) 
{ PACKAGE = "xgboost") return(TRUE) } - stop(paste("setinfo: unknown info name", name)) + stop("setinfo: unknown info name ", name) return(FALSE) } diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 8325f5976..d455a4079 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -181,8 +181,8 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = bst_folds <- lapply(1:length(folds), function(k) { dtest <- slice(dall, folds[[k]]) dtrain <- slice(dall, unlist(folds[-k])) - bst <- xgb.Booster(params, list(dtrain, dtest)) - list(dtrain=dtrain, bst=bst, watchlist=list(train=dtrain, test=dtest), index=folds[[k]]) + handle <- xgb.Booster.handle(params, list(dtrain, dtest)) + list(dtrain=dtrain, bst=handle, watchlist=list(train=dtrain, test=dtest), index=folds[[k]]) }) # a "basket" to collect some results from callbacks basket <- list() diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index 86e97b4e7..f05961221 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -1,24 +1,26 @@ -#' Save xgboost model to text file +#' Dump an xgboost model in text format. #' -#' Save a xgboost model to text file. Could be parsed later. +#' Dump an xgboost model in text format. #' #' @param model the model object. -#' @param fname the name of the text file where to save the model text dump. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector. -#' @param fmap feature map file representing the type of feature. +#' @param fname the name of the text file where to save the model text dump. +#' If not provided or set to \code{NULL}, the model is returned as a \code{character} vector. +#' @param fmap feature map file representing feature types. #' Detailed description could be found at #' \url{https://github.com/dmlc/xgboost/wiki/Binary-Classification#dump-model}. #' See demo/ for walkthrough example in R, and #' \url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt} #' for example Format. -#' @param with_stats whether dump statistics of splits -#' When this option is on, the model dump comes with two additional statistics: +#' @param with_stats whether to dump some additional statistics about the splits. +#' When this option is on, the model dump contains two additional values: #' gain is the approximate loss function gain we get in each split; #' cover is the sum of second order gradient in each node. #' @param dump_format either 'text' or 'json' format could be specified. #' @param ... currently not used #' #' @return -#' if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}. +#' If fname is not provided or set to \code{NULL} the function will return the model +#' as a \code{character} vector. Otherwise it will return \code{TRUE}. #' #' @examples #' data(agaricus.train, package='xgboost') @@ -37,7 +39,8 @@ #' cat(xgb.dump(bst, with_stats = TRUE, dump_format='json')) #' #' @export -xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, dump_format = c("text", "json"), ...) { +xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, + dump_format = c("text", "json"), ...) { check.deprecation(...) 
dump_format <- match.arg(dump_format) if (class(model) != "xgb.Booster") @@ -47,7 +50,7 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, du if (!(class(fmap) %in% c("character", "NULL") && length(fmap) <= 1)) stop("fmap: argument must be of type character (when provided)") - model <- xgb.Booster.check(model) + model <- xgb.Booster.complete(model) model_dump <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with_stats), as.character(dump_format), PACKAGE = "xgboost") diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index 3c66fa8cf..4fb1f08c4 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -1,102 +1,92 @@ -#' Show importance of features in a model +#' Importance of features in a model. #' -#' Create a \code{data.table} of the most important features of a model. +#' Creates a \code{data.table} of feature importances in a model. #' -#' @param feature_names names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. -#' @param model generated by the \code{xgb.train} function. -#' @param data the dataset used for the training step. Will be used with \code{label} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional. -#' @param label the label vector used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional. -#' @param target a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurences in a binary classification. The \code{target} function should have only one parameter. This parameter will be used to provide each important feature vector after having applied the split condition, therefore these vector will be only made of 0 and 1 only, whatever was the information before. More information in \code{Detail} part. This parameter is optional. -#' -#' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. +#' @param feature_names character vector of feature names. If the model already +#' contains feature names, those would be used when \code{feature_names=NULL} (default value). +#' Non-null \code{feature_names} could be provided to override those in the model. +#' @param model object of class \code{xgb.Booster}. +#' @param data deprecated. +#' @param label deprecated. +#' @param target deprecated. #' #' @details -#' This function is for both linear and tree models. #' -#' \code{data.table} is returned by the function. -#' The columns are: +#' This function works for both linear and tree models. +#' +#' For linear models, the importance is the absolute magnitude of linear coefficients. +#' For that reason, in order to obtain a meaningful ranking by importance for a linear model, +#' the features need to be on the same scale (which you also would want to do when using either +#' L1 or L2 regularization). +#' +#' @return +#' +#' For a tree model, a \code{data.table} with the following columns: #' \itemize{ -#' \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump; -#' \item \code{Gain} contribution of each feature to the model. 
For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training (only available for tree models); -#' \item \code{Cover} metric of the number of observation related to this feature (only available for tree models); -#' \item \code{Weight} percentage representing the relative number of times a feature have been taken into trees. +#' \item \code{Features} names of the features used in the model; +#' \item \code{Gain} represents fractional contribution of each feature to the model based on +#' the total gain of this feature's splits. Higher percentage means a more important +#' predictive feature. +#' \item \code{Cover} metric of the number of observation related to this feature; +#' \item \code{Frequency} percentage representing the relative number of times +#' a feature have been used in trees. #' } #' -#' If you don't provide \code{feature_names}, index of the features will be used instead. +#' A linear model's importance \code{data.table} has only two columns: +#' \itemize{ +#' \item \code{Features} names of the features used in the model; +#' \item \code{Weight} the linear coefficient of this feature. +#' } #' -#' Because the index is extracted from the model dump (made on the C++ side), it starts at 0 (usual in C++) instead of 1 (usual in R). -#' -#' Co-occurence count -#' ------------------ -#' -#' The gain gives you indication about the information of how a feature is important in making a branch of a decision tree more pure. However, with this information only, you can't know if this feature has to be present or not to get a specific classification. In the example code, you may wonder if odor=none should be \code{TRUE} to not eat a mushroom. -#' -#' Co-occurence computation is here to help in understanding this relation between a predictor and a specific class. It will count how many observations are returned as \code{TRUE} by the \code{target} function (see parameters). When you execute the example below, there are 92 times only over the 3140 observations of the train dataset where a mushroom have no odor and can be eaten safely. -#' -#' If you need to remember only one thing: unless you want to leave us early, don't eat a mushroom which has no odor :-) +#' If you don't provide or \code{model} doesn't have \code{feature_names}, +#' index of the features will be used instead. Because the index is extracted from the model dump +#' (based on C++ code), it starts at 0 (as in C/C++ or Python) instead of 1 (usual in R). #' #' @examples +#' #' data(agaricus.train, package='xgboost') #' #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, #' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") #' -#' xgb.importance(colnames(agaricus.train$data), model = bst) -#' -#' # Same thing with co-occurence computation this time -#' xgb.importance(colnames(agaricus.train$data), model = bst, -#' data = agaricus.train$data, label = agaricus.train$label) +#' xgb.importance(model = bst) #' #' @export -xgb.importance <- function(feature_names = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ( (x + label) == 2)){ - if (!class(feature_names) %in% c("character", "NULL")) { - stop("feature_names: Has to be a vector of character or NULL if the model already contains feature name. 
Look at this function documentation to see where to get feature names.") - } - - if (class(model) != "xgb.Booster") { - stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.") - } - - if((is.null(data) & !is.null(label)) | (!is.null(data) & is.null(label))) { - stop("data/label: Provide the two arguments if you want co-occurence computation or none of them if you are not interested but not one of them only.") - } - - if(class(label) == "numeric"){ - if(sum(label == 0) / length(label) > 0.5) label <- as(label, "sparseVector") - } +xgb.importance <- function(feature_names = NULL, model = NULL, + data = NULL, label = NULL, target = NULL){ - treeDump <- function(feature_names, text, keepDetail){ - if(keepDetail) groupBy <- c("Feature", "Split", "MissingNo") else groupBy <- "Feature" - xgb.model.dt.tree(feature_names = feature_names, text = text)[,"MissingNo" := Missing == No ][Feature != "Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequency = .N), by = groupBy, with = T][,`:=`(Gain = Gain / sum(Gain), Cover = Cover / sum(Cover), Frequency = Frequency / sum(Frequency))][order(Gain, decreasing = T)] - } + if (!(is.null(data) && is.null(label) && is.null(target))) + warning("xgb.importance: parameters 'data', 'label' and 'target' are deprecated") - linearDump <- function(feature_names, text){ - weights <- which(text == "weight:") %>% {a =. + 1; text[a:length(text)]} %>% as.numeric - if(is.null(feature_names)) feature_names <- seq(to = length(weights)) - data.table(Feature = feature_names, Weight = weights) - } - - model.text.dump <- xgb.dump(model = model, with_stats = T) + if (class(model) != "xgb.Booster") + stop("Either 'model' has to be an object of class xgb.Booster") - if(model.text.dump[2] == "bias:"){ - result <- model.text.dump %>% linearDump(feature_names, .) - if(!is.null(data) | !is.null(label)) warning("data/label: these parameters should only be provided with decision tree based models.") - } else { - result <- treeDump(feature_names, text = model.text.dump, keepDetail = !is.null(data)) + if (is.null(feature_names) && !is.null(model$feature_names)) + feature_names <- model$feature_names + + if (!class(feature_names) %in% c("character", "NULL")) + stop("feature_names: Has to be a character vector") - # Co-occurence computation - if(!is.null(data) & !is.null(label) & nrow(result) > 0) { - # Take care of missing column - a <- data[, result[MissingNo == T,Feature], drop=FALSE] != 0 - # Bind the two Matrix and reorder columns - c <- data[, result[MissingNo == F,Feature], drop=FALSE] %>% cBind(a,.) %>% .[,result[,Feature]] - rm(a) - # Apply split - d <- data[, result[,Feature], drop=FALSE] < as.numeric(result[,Split]) - apply(c & d, 2, . %>% target %>% sum) -> vec - - result <- result[, "RealCover" := as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)][, MissingNo := NULL] - } + model_text_dump <- xgb.dump(model = model, with_stats = TRUE) + + # linear model + if(model_text_dump[2] == "bias:"){ + weights <- which(model_text_dump == "weight:") %>% + {model_text_dump[(. 
+ 1):length(model_text_dump)]} %>% + as.numeric + if(is.null(feature_names)) + feature_names <- seq(to = length(weights)) + result <- data.table(Feature = feature_names, Weight = weights)[order(-abs(Weight))] + } else { + # tree model + result <- xgb.model.dt.tree(feature_names = feature_names, text = model_text_dump)[ + Feature != "Leaf", .(Gain = sum(Quality), + Cover = sum(Cover), + Frequency = .N), by = Feature][ + ,`:=`(Gain = Gain / sum(Gain), + Cover = Cover / sum(Cover), + Frequency = Frequency / sum(Frequency))][ + order(Gain, decreasing = TRUE)] } result } @@ -104,4 +94,4 @@ xgb.importance <- function(feature_names = NULL, model = NULL, data = NULL, labe # Avoid error messages during CRAN check. # The reason is that these variables are never declared # They are mainly column names inferred by Data.table... -globalVariables(c(".", ".N", "Gain", "Frequency", "Feature", "Split", "No", "Missing", "MissingNo", "RealCover")) +globalVariables(c(".", ".N", "Gain", "Cover", "Frequency", "Feature")) diff --git a/R-package/R/xgb.load.R b/R-package/R/xgb.load.R index a93a2c042..cfe5dc87c 100644 --- a/R-package/R/xgb.load.R +++ b/R-package/R/xgb.load.R @@ -1,8 +1,23 @@ #' Load xgboost model from binary file #' -#' Load xgboost model from the binary model file +#' Load xgboost model from the binary model file. #' -#' @param modelfile the name of the binary file. +#' @param modelfile the name of the binary input file. +#' +#' @details +#' The input file is expected to contain a model saved in an xgboost-internal binary format +#' using either \code{\link{xgb.save}} or \code{\link{cb.save.model}} in R, or using some +#' appropriate methods from other xgboost interfaces. E.g., a model trained in Python and +#' saved from there in xgboost format, could be loaded from R. +#' +#' Note: a model saved as an R-object, has to be loaded using corresponding R-methods, +#' not \code{xgb.load}. +#' +#' @return +#' An object of \code{xgb.Booster} class. +#' +#' @seealso +#' \code{\link{xgb.save}}, \code{\link{xgb.Booster.complete}}. #' #' @examples #' data(agaricus.train, package='xgboost') @@ -19,13 +34,13 @@ xgb.load <- function(modelfile) { if (is.null(modelfile)) stop("xgb.load: modelfile cannot be NULL") - handle <- xgb.Booster(modelfile = modelfile) + handle <- xgb.Booster.handle(modelfile = modelfile) # re-use modelfile if it is raw so we do not need to serialize if (typeof(modelfile) == "raw") { bst <- xgb.handleToBooster(handle, modelfile) } else { bst <- xgb.handleToBooster(handle, NULL) } - bst <- xgb.Booster.check(bst, saveraw = TRUE) + bst <- xgb.Booster.complete(bst, saveraw = TRUE) return(bst) } diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index a364aaa70..75adf5f95 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -3,14 +3,16 @@ #' Parse a boosted tree model text dump into a \code{data.table} structure. #' #' @param feature_names character vector of feature names. If the model already -#' contains feature names, this argument should be \code{NULL} (default value) +#' contains feature names, those would be used when \code{feature_names=NULL} (default value). +#' Non-null \code{feature_names} could be provided to override those in the model. #' @param model object of class \code{xgb.Booster} #' @param text \code{character} vector previously generated by the \code{xgb.dump} #' function (where parameter \code{with_stats = TRUE} should have been set). +#' \code{text} takes precedence over \code{model}. 
#' @param trees an integer vector of tree indices that should be parsed. #' If set to \code{NULL}, all trees of the model are parsed. #'   It could be useful, e.g., in multiclass classification to get only -#'   the trees of one certain class. IMPORTANT: the tree index in xgboost model +#'   the trees of one certain class. IMPORTANT: the tree index in xgboost models #'   is zero-based (e.g., use \code{trees = 0:4} for first 5 trees). #' @param ... currently not used. #' @@ -43,7 +45,9 @@ #' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") #' #' (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst)) -#' +#' # This bst has feature_names stored in it, so those would be used when +#' # the feature_names parameter is not provided: +#' (dt <- xgb.model.dt.tree(model = bst)) #' #' # How to match feature names of splits that are following a current 'Yes' branch: #' @@ -53,11 +57,6 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, trees = NULL, ...){ check.deprecation(...) -  if (!class(feature_names) %in% c("character", "NULL")) { -    stop("feature_names: Has to be a vector of character\n", -         "  or NULL if the model dump already contains feature names.\n", -         "  Look at this function documentation to see where to get feature names.") -  } if (class(model) != "xgb.Booster" & class(text) != "character") { stop("Either 'model' has to be an object of class xgb.Booster\n", "  (or NULL if the model was provided).") } +  if (is.null(feature_names) && !is.null(model) && !is.null(model$feature_names)) +    feature_names <- model$feature_names + +  if (!class(feature_names) %in% c("character", "NULL")) { +    stop("feature_names: Has to be a character vector") +  } + if (!class(trees) %in% c("integer", "numeric", "NULL")) { stop("trees: Has to be a vector of integers.") } if (is.null(text)){ -    text <- xgb.dump(model = model, with_stats = T) +    text <- xgb.dump(model = model, with_stats = TRUE) } if (length(text) < 2 || diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 41b72c8a0..313cc9d6f 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -126,4 +126,4 @@ xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot # Avoid error messages during CRAN check. # The reason is that these variables are never declared # They are mainly column names inferred by Data.table... -globalVariables(c("Feature", "ID", "Cover", "Quality", "Split", "Yes", "No", ".", "shape", "filledcolor", "label")) +globalVariables(c("Feature", "ID", "Cover", "Quality", "Split", "Yes", "No", "Missing", ".", "shape", "filledcolor", "label")) diff --git a/R-package/R/xgb.save.R b/R-package/R/xgb.save.R index 5b2421b7f..8162f0fa2 100644 --- a/R-package/R/xgb.save.R +++ b/R-package/R/xgb.save.R @@ -1,9 +1,22 @@ #' Save xgboost model to binary file #' -#' Save xgboost model from xgboost or xgb.train +#' Save xgboost model to a file in binary format. #' -#' @param model the model object. -#' @param fname the name of the file to write. +#' @param model model object of \code{xgb.Booster} class. +#' @param fname name of the file to write. +#' +#' @details +#' This method allows saving a model in an xgboost-internal binary format which is universal +#' among the various xgboost interfaces.
In R, the saved model file could be read-in later +#' using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter +#' of \code{\link{xgb.train}}. +#' +#' Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{readRDS}} +#' or \code{\link[base]{save}}). However, it would then only be compatible with R, and +#' corresponding R-methods would need to be used to load it. +#' +#' @seealso +#' \code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}. #' #' @examples #' data(agaricus.train, package='xgboost') @@ -22,6 +35,7 @@ xgb.save <- function(model, fname) { if (class(model) != "xgb.Booster") stop("the input must be xgb.Booster. Use xgb.DMatrix.save to save xgb.DMatrix object.") + model <- xgb.Booster.complete(model, saveraw = FALSE) .Call("XGBoosterSaveModel_R", model$handle, fname, PACKAGE = "xgboost") return(TRUE) } diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 2ed2194d3..a9eb863ad 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -1,6 +1,7 @@ #' eXtreme Gradient Boosting Training #' -#' \code{xgb.train} is an advanced interface for training an xgboost model. The \code{xgboost} function provides a simpler interface. +#' \code{xgb.train} is an advanced interface for training an xgboost model. +#' The \code{xgboost} function is a simpler wrapper for \code{xgb.train}. #' #' @param params the list of parameters. #' The complete list of parameters is available at \url{http://xgboost.readthedocs.io/en/latest/parameter.html}. @@ -9,8 +10,7 @@ #' 1. General Parameters #' #' \itemize{ -#' \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree} -#' \item \code{silent} 0 means printing running messages, 1 means silent mode. Default: 0 +#' \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}. #' } #' #' 2. Booster Parameters @@ -54,24 +54,26 @@ #' \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section. #' } #' -#' @param data input dataset. \code{xgb.train} takes only an \code{xgb.DMatrix} as the input. -#' \code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or local data file. -#' @param nrounds the max number of iterations -#' @param watchlist what information should be printed when \code{verbose=1} or -#' \code{verbose=2}. Watchlist is used to specify validation set monitoring -#' during training. For example user can specify -#' watchlist=list(validation1=mat1, validation2=mat2) to watch -#' the performance of each round's model on mat1 and mat2 -#' +#' @param data training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input. +#' \code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file. +#' @param nrounds max number of boosting iterations. +#' @param watchlist named list of xgb.DMatrix datasets to use for evaluating model performance. +#' Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each +#' of these datasets during each boosting iteration, and stored in the end as a field named +#' \code{evaluation_log} in the resulting object. 
When either \code{verbose>=1} or +#' \code{\link{cb.print.evaluation}} callback is engaged, the performance results are continuously +#' printed out during the training. +#' E.g., specifying \code{watchlist=list(validation1=mat1, validation2=mat2)} allows tracking +#' the performance of each round's model on mat1 and mat2. #' @param obj customized objective function. Returns gradient and second order #' gradient with given prediction and dtrain. #' @param feval custimized evaluation function. Returns #' \code{list(metric='metric-name', value='metric-value')} with given #' prediction and dtrain. -#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print -#' information of performance. If 2, xgboost will print some additional information. -#' Setting \code{verbose > 0} automatically engages the \code{\link{cb.evaluation.log}} and -#' \code{\link{cb.print.evaluation}} callback functions. +#' @param verbose If 0, xgboost will stay silent. If 1, it will print information about performance. +#' If 2, some additional information will be printed out. +#' Note that setting \code{verbose > 0} automatically engages the +#' \code{cb.print.evaluation(period=1)} callback function. #' @param print_every_n Print each n-th iteration evaluation messages when \code{verbose>0}. #' Default is 1 which means all messages are printed. This parameter is passed to the #' \code{\link{cb.print.evaluation}} callback. @@ -106,7 +108,7 @@ #' #' The \code{xgb.train} interface supports advanced features such as \code{watchlist}, #' customized objective and evaluation metric functions, therefore it is more flexible -#' than the \code{\link{xgboost}} interface. +#' than the \code{xgboost} interface. #' #' Parallelization is automatically enabled if \code{OpenMP} is present. #' Number of threads can also be manually specified via \code{nthread} parameter. @@ -132,7 +134,7 @@ #' \itemize{ #'   \item \code{cb.print.evaluation} is turned on when \code{verbose > 0}; #'         and the \code{print_every_n} parameter is passed to it. -#'   \item \code{cb.evaluation.log} is on when \code{verbose > 0} and \code{watchlist} is present. +#'   \item \code{cb.evaluation.log} is on when \code{watchlist} is present. #'   \item \code{cb.early.stop}: when \code{early_stopping_rounds} is set. #'   \item \code{cb.save.model}: when \code{save_period > 0} is set. #' } @@ -158,6 +160,8 @@ #'         (only available with early stopping). #'   \item \code{best_score} the best evaluation metric value during early stopping. #'         (only available with early stopping). +#'   \item \code{feature_names} names of the training dataset features +#'         (only when column names were defined in training data).
#' } #' #' @seealso @@ -171,7 +175,7 @@ #' #' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) #' dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) -#' watchlist <- list(eval = dtest, train = dtrain) +#' watchlist <- list(train = dtrain, eval = dtest) #' #' ## A simple xgb.train example: #' param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2, @@ -210,17 +214,15 @@ #' #' #' ## An xgb.train example of using variable learning rates at each iteration: -#' param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2) +#' param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2, +#' objective = "binary:logistic", eval_metric = "auc") #' my_etas <- list(eta = c(0.5, 0.1)) #' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, #' callbacks = list(cb.reset.parameters(my_etas))) #' -#' -#' ## Explicit use of the cb.evaluation.log callback allows to run -#' ## xgb.train silently but still store the evaluation results: -#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, -#' verbose = 0, callbacks = list(cb.evaluation.log())) -#' print(bst$evaluation_log) +#' ## Early stopping: +#' bst <- xgb.train(param, dtrain, nrounds = 25, watchlist, +#' early_stopping_rounds = 3) #' #' ## An 'xgboost' interface example: #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, @@ -259,13 +261,13 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), # evaluation printing callback params <- c(params, list(silent = ifelse(verbose > 1, 0, 1))) print_every_n <- max( as.integer(print_every_n), 1L) - if (!has.callbacks(callbacks, 'cb.print.evaluation') && verbose) { + if (!has.callbacks(callbacks, 'cb.print.evaluation') && + verbose) { callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n)) } - # evaluation log callback: it is automatically enabled only when verbose > 0 + # evaluation log callback: it is automatically enabled when watchlist is provided evaluation_log <- list() - if (verbose > 0 && - !has.callbacks(callbacks, 'cb.evaluation.log') && + if (!has.callbacks(callbacks, 'cb.evaluation.log') && length(watchlist) > 0) { callbacks <- add.cb(callbacks, cb.evaluation.log()) } @@ -288,7 +290,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), is_update <- NVL(params[['process_type']], '.') == 'update' # Construct a booster (either a new one or load from xgb_model) - handle <- xgb.Booster(params, append(watchlist, dtrain), xgb_model) + handle <- xgb.Booster.handle(params, append(watchlist, dtrain), xgb_model) bst <- xgb.handleToBooster(handle) # extract parameters that can affect the relationship b/w #trees and #iterations @@ -332,7 +334,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), } for (f in cb$finalize) f(finalize=TRUE) - bst <- xgb.Booster.check(bst, saveraw = TRUE) + bst <- xgb.Booster.complete(bst, saveraw = TRUE) # store the total number of boosting iterations bst$niter = end_iteration @@ -354,6 +356,8 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), bst$call <- match.call() bst$params <- params bst$callbacks <- callbacks + if (!is.null(colnames(dtrain))) + bst$feature_names <- colnames(dtrain) return(bst) } diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index b7ce3d526..f2ce90b12 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -1,4 +1,4 @@ -# Simple interface for training an xgboost model. 
+# Simple interface for training an xgboost model that wraps \code{xgb.train} # Its documentation is combined with xgb.train. # #' @rdname xgb.train @@ -12,11 +12,9 @@ xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL, dtrain <- xgb.get.DMatrix(data, label, missing, weight) -  watchlist <- list() -  if (verbose > 0) -    watchlist$train = dtrain +  watchlist <- list(train = dtrain) -  bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print_every_n=print_every_n, +  bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print_every_n = print_every_n, early_stopping_rounds = early_stopping_rounds, maximize = maximize, save_period = save_period, save_name = save_name, xgb_model = xgb_model, callbacks = callbacks, ...) diff --git a/R-package/man/xgb.Booster.complete.Rd b/R-package/man/xgb.Booster.complete.Rd new file mode 100644 index 000000000..0e821e33c --- /dev/null +++ b/R-package/man/xgb.Booster.complete.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.Booster.R +\name{xgb.Booster.complete} +\alias{xgb.Booster.complete} +\title{Restore missing parts of an incomplete xgb.Booster object.} +\usage{ +xgb.Booster.complete(object, saveraw = TRUE) +} +\arguments{ +\item{object}{object of class \code{xgb.Booster}} + +\item{saveraw}{a flag indicating whether to append \code{raw} Booster memory dump data +when it doesn't already exist.} +} +\value{ +An object of \code{xgb.Booster} class. +} +\description{ +It attempts to complete an \code{xgb.Booster} object by restoring either its missing +raw model memory dump (when it has no \code{raw} data but its \code{xgb.Booster.handle} is valid) +or its missing internal handle (when its \code{xgb.Booster.handle} is not valid +but it has a raw Booster memory dump). +} +\details{ +While this method is primarily for internal use, it might be useful in some practical situations. + +E.g., when an \code{xgb.Booster} model is saved as an R object and then is loaded as an R object, +its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods +should still work for such a model object since those methods would be using +\code{xgb.Booster.complete} internally. However, one might find it to be more efficient to call the +\code{xgb.Booster.complete} function once after loading a model as an R-object. That would +prevent further reconstruction (potentially, multiple times) of an internal booster model. +} +\examples{ + +data(agaricus.train, package='xgboost') +bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, +               eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") +saveRDS(bst, "xgb.model.rds") + +bst1 <- readRDS("xgb.model.rds") +# the handle is invalid: +print(bst1$handle) +bst1 <- xgb.Booster.complete(bst1) +# now the handle points to a valid internal booster model: +print(bst1$handle) + +} + diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd index 2ec26c743..bd536b4bf 100644 --- a/R-package/man/xgb.dump.Rd +++ b/R-package/man/xgb.dump.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/xgb.dump.R \name{xgb.dump} \alias{xgb.dump} -\title{Save xgboost model to text file} +\title{Dump an xgboost model in text format.} \usage{ xgb.dump(model = NULL, fname = NULL, fmap = "", with_stats = FALSE, dump_format = c("text", "json"), ...)
@@ -10,17 +10,18 @@ xgb.dump(model = NULL, fname = NULL, fmap = "", with_stats = FALSE, \arguments{ \item{model}{the model object.} -\item{fname}{the name of the text file where to save the model text dump. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.} +\item{fname}{the name of the text file where to save the model text dump. +If not provided or set to \code{NULL}, the model is returned as a \code{character} vector.} -\item{fmap}{feature map file representing the type of feature. +\item{fmap}{feature map file representing feature types. Detailed description could be found at \url{https://github.com/dmlc/xgboost/wiki/Binary-Classification#dump-model}. See demo/ for walkthrough example in R, and \url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt} for example Format.} -\item{with_stats}{whether dump statistics of splits -When this option is on, the model dump comes with two additional statistics: +\item{with_stats}{whether to dump some additional statistics about the splits. +When this option is on, the model dump contains two additional values: gain is the approximate loss function gain we get in each split; cover is the sum of second order gradient in each node.} @@ -29,10 +30,11 @@ cover is the sum of second order gradient in each node.} \item{...}{currently not used} } \value{ -if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}. +If fname is not provided or set to \code{NULL} the function will return the model +as a \code{character} vector. Otherwise it will return \code{TRUE}. } \description{ -Save a xgboost model to text file. Could be parsed later. +Dump an xgboost model in text format. } \examples{ data(agaricus.train, package='xgboost') diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd index 2b0237aa0..3270a1b70 100644 --- a/R-package/man/xgb.importance.Rd +++ b/R-package/man/xgb.importance.Rd @@ -2,64 +2,65 @@ % Please edit documentation in R/xgb.importance.R \name{xgb.importance} \alias{xgb.importance} -\title{Show importance of features in a model} +\title{Importance of features in a model.} \usage{ xgb.importance(feature_names = NULL, model = NULL, data = NULL, - label = NULL, target = function(x) ((x + label) == 2)) + label = NULL, target = NULL) } \arguments{ -\item{feature_names}{names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.} +\item{feature_names}{character vector of feature names. If the model already +contains feature names, those would be used when \code{feature_names=NULL} (default value). +Non-null \code{feature_names} could be provided to override those in the model.} -\item{model}{generated by the \code{xgb.train} function.} +\item{model}{object of class \code{xgb.Booster}.} -\item{data}{the dataset used for the training step. Will be used with \code{label} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.} +\item{data}{deprecated.} -\item{label}{the label vector used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. 
This parameter is optional.} +\item{label}{deprecated.} -\item{target}{a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurences in a binary classification. The \code{target} function should have only one parameter. This parameter will be used to provide each important feature vector after having applied the split condition, therefore these vector will be only made of 0 and 1 only, whatever was the information before. More information in \code{Detail} part. This parameter is optional.} +\item{target}{deprecated.} } \value{ -A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. +For a tree model, a \code{data.table} with the following columns: +\itemize{ + \item \code{Features} names of the features used in the model; + \item \code{Gain} represents fractional contribution of each feature to the model based on + the total gain of this feature's splits. Higher percentage means a more important + predictive feature. + \item \code{Cover} metric of the number of observation related to this feature; + \item \code{Frequency} percentage representing the relative number of times + a feature have been used in trees. +} + +A linear model's importance \code{data.table} has only two columns: +\itemize{ + \item \code{Features} names of the features used in the model; + \item \code{Weight} the linear coefficient of this feature. +} + +If you don't provide or \code{model} doesn't have \code{feature_names}, +index of the features will be used instead. Because the index is extracted from the model dump +(based on C++ code), it starts at 0 (as in C/C++ or Python) instead of 1 (usual in R). } \description{ -Create a \code{data.table} of the most important features of a model. +Creates a \code{data.table} of feature importances in a model. } \details{ -This function is for both linear and tree models. +This function works for both linear and tree models. -\code{data.table} is returned by the function. -The columns are: -\itemize{ - \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump; - \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training (only available for tree models); - \item \code{Cover} metric of the number of observation related to this feature (only available for tree models); - \item \code{Weight} percentage representing the relative number of times a feature have been taken into trees. -} - -If you don't provide \code{feature_names}, index of the features will be used instead. - -Because the index is extracted from the model dump (made on the C++ side), it starts at 0 (usual in C++) instead of 1 (usual in R). - -Co-occurence count ------------------- - -The gain gives you indication about the information of how a feature is important in making a branch of a decision tree more pure. However, with this information only, you can't know if this feature has to be present or not to get a specific classification. In the example code, you may wonder if odor=none should be \code{TRUE} to not eat a mushroom. 
- -Co-occurence computation is here to help in understanding this relation between a predictor and a specific class. It will count how many observations are returned as \code{TRUE} by the \code{target} function (see parameters). When you execute the example below, there are 92 times only over the 3140 observations of the train dataset where a mushroom have no odor and can be eaten safely. - -If you need to remember only one thing: unless you want to leave us early, don't eat a mushroom which has no odor :-) +For linear models, the importance is the absolute magnitude of linear coefficients. +For that reason, in order to obtain a meaningful ranking by importance for a linear model, +the features need to be on the same scale (which you also would want to do when using either +L1 or L2 regularization). } \examples{ + data(agaricus.train, package='xgboost') bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") -xgb.importance(colnames(agaricus.train$data), model = bst) - -# Same thing with co-occurence computation this time -xgb.importance(colnames(agaricus.train$data), model = bst, - data = agaricus.train$data, label = agaricus.train$label) +xgb.importance(model = bst) } diff --git a/R-package/man/xgb.load.Rd b/R-package/man/xgb.load.Rd index 1499df2d4..1634a8a38 100644 --- a/R-package/man/xgb.load.Rd +++ b/R-package/man/xgb.load.Rd @@ -7,10 +7,22 @@ xgb.load(modelfile) } \arguments{ -\item{modelfile}{the name of the binary file.} +\item{modelfile}{the name of the binary input file.} +} +\value{ +An object of \code{xgb.Booster} class. } \description{ -Load xgboost model from the binary model file +Load xgboost model from the binary model file. +} +\details{ +The input file is expected to contain a model saved in an xgboost-internal binary format +using either \code{\link{xgb.save}} or \code{\link{cb.save.model}} in R, or using some +appropriate methods from other xgboost interfaces. E.g., a model trained in Python and +saved from there in xgboost format, could be loaded from R. + +Note: a model saved as an R-object, has to be loaded using corresponding R-methods, +not \code{xgb.load}. } \examples{ data(agaricus.train, package='xgboost') @@ -23,4 +35,7 @@ xgb.save(bst, 'xgb.model') bst <- xgb.load('xgb.model') pred <- predict(bst, test$data) } +\seealso{ +\code{\link{xgb.save}}, \code{\link{xgb.Booster.complete}}. +} diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index 8176303c6..9d7a5056a 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -9,17 +9,19 @@ xgb.model.dt.tree(feature_names = NULL, model = NULL, text = NULL, } \arguments{ \item{feature_names}{character vector of feature names. If the model already -contains feature names, this argument should be \code{NULL} (default value)} +contains feature names, those would be used when \code{feature_names=NULL} (default value). +Non-null \code{feature_names} could be provided to override those in the model.} \item{model}{object of class \code{xgb.Booster}} \item{text}{\code{character} vector previously generated by the \code{xgb.dump} -function (where parameter \code{with_stats = TRUE} should have been set).} +function (where parameter \code{with_stats = TRUE} should have been set). +\code{text} takes precedence over \code{model}.} \item{trees}{an integer vector of tree indices that should be parsed. If set to \code{NULL}, all trees of the model are parsed. 
It could be useful, e.g., in multiclass classification to get only -the trees of one certain class. IMPORTANT: the tree index in xgboost model +the trees of one certain class. IMPORTANT: the tree index in xgboost models is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).} \item{...}{currently not used.} @@ -56,7 +58,9 @@ bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_dep eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst)) - +# This bst has feature_names stored in it, so those would be used when +# the feature_names parameter is not provided: +(dt <- xgb.model.dt.tree(model = bst)) # How to match feature names of splits that are following a current 'Yes' branch: diff --git a/R-package/man/xgb.save.Rd b/R-package/man/xgb.save.Rd index 85acdecd0..00b32ef78 100644 --- a/R-package/man/xgb.save.Rd +++ b/R-package/man/xgb.save.Rd @@ -7,12 +7,22 @@ xgb.save(model, fname) } \arguments{ -\item{model}{the model object.} +\item{model}{model object of \code{xgb.Booster} class.} -\item{fname}{the name of the file to write.} +\item{fname}{name of the file to write.} } \description{ -Save xgboost model from xgboost or xgb.train +Save xgboost model to a file in binary format. +} +\details{ +This method allows saving a model in an xgboost-internal binary format which is universal +among the various xgboost interfaces. In R, the saved model file could be read-in later +using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter +of \code{\link{xgb.train}}. + +Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{readRDS}} +or \code{\link[base]{save}}). However, it would then only be compatible with R, and +corresponding R-methods would need to be used to load it. } \examples{ data(agaricus.train, package='xgboost') @@ -25,4 +35,7 @@ xgb.save(bst, 'xgb.model') bst <- xgb.load('xgb.model') pred <- predict(bst, test$data) } +\seealso{ +\code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}. +} diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 4f37b78b8..269789b19 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -23,8 +23,7 @@ xgboost(data = NULL, label = NULL, missing = NA, weight = NULL, 1. General Parameters \itemize{ -  \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree} -  \item \code{silent} 0 means printing running messages, 1 means silent mode. Default: 0 +  \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}. } 2. Booster Parameters @@ -68,16 +67,19 @@ xgboost(data = NULL, label = NULL, missing = NA, weight = NULL, \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section. }} -\item{data}{input dataset. \code{xgb.train} takes only an \code{xgb.DMatrix} as the input. -\code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or local data file.} +\item{data}{training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input.
+\code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file.} -\item{nrounds}{the max number of iterations} +\item{nrounds}{max number of boosting iterations.} -\item{watchlist}{what information should be printed when \code{verbose=1} or -\code{verbose=2}. Watchlist is used to specify validation set monitoring -during training. For example user can specify -watchlist=list(validation1=mat1, validation2=mat2) to watch -the performance of each round's model on mat1 and mat2} +\item{watchlist}{named list of xgb.DMatrix datasets to use for evaluating model performance. +Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each +of these datasets during each boosting iteration, and stored in the end as a field named +\code{evaluation_log} in the resulting object. When either \code{verbose>=1} or +\code{\link{cb.print.evaluation}} callback is engaged, the performance results are continuously +printed out during the training. +E.g., specifying \code{watchlist=list(validation1=mat1, validation2=mat2)} allows tracking +the performance of each round's model on mat1 and mat2.} \item{obj}{customized objective function. Returns gradient and second order gradient with given prediction and dtrain.} @@ -86,10 +88,10 @@ gradient with given prediction and dtrain.} \code{list(metric='metric-name', value='metric-value')} with given prediction and dtrain.} -\item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print -information of performance. If 2, xgboost will print some additional information. -Setting \code{verbose > 0} automatically engages the \code{\link{cb.evaluation.log}} and -\code{\link{cb.print.evaluation}} callback functions.} +\item{verbose}{If 0, xgboost will stay silent. If 1, it will print information about performance. +If 2, some additional information will be printed out. +Note that setting \code{verbose > 0} automatically engages the +\code{cb.print.evaluation(period=1)} callback function.} \item{print_every_n}{Print each n-th iteration evaluation messages when \code{verbose>0}. Default is 1 which means all messages are printed. This parameter is passed to the @@ -151,17 +153,20 @@ An object of class \code{xgb.Booster} with the following elements: (only available with early stopping). \item \code{best_score} the best evaluation metric value during early stopping. (only available with early stopping). +  \item \code{feature_names} names of the training dataset features +    (only when column names were defined in training data). } } \description{ -\code{xgb.train} is an advanced interface for training an xgboost model. The \code{xgboost} function provides a simpler interface. +\code{xgb.train} is an advanced interface for training an xgboost model. +The \code{xgboost} function is a simpler wrapper for \code{xgb.train}. } \details{ These are the training functions for \code{xgboost}. The \code{xgb.train} interface supports advanced features such as \code{watchlist}, customized objective and evaluation metric functions, therefore it is more flexible -than the \code{\link{xgboost}} interface. +than the \code{xgboost} interface. Parallelization is automatically enabled if \code{OpenMP} is present. Number of threads can also be manually specified via \code{nthread} parameter. @@ -187,7 +192,7 @@ The following callbacks are automatically created when certain parameters are se \itemize{ \item \code{cb.print.evaluation} is turned on when \code{verbose > 0}; and the \code{print_every_n} parameter is passed to it.
- \item \code{cb.evaluation.log} is on when \code{verbose > 0} and \code{watchlist} is present. + \item \code{cb.evaluation.log} is on when \code{watchlist} is present. \item \code{cb.early.stop}: when \code{early_stopping_rounds} is set. \item \code{cb.save.model}: when \code{save_period > 0} is set. } @@ -198,7 +203,7 @@ data(agaricus.test, package='xgboost') dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) -watchlist <- list(eval = dtest, train = dtrain) +watchlist <- list(train = dtrain, eval = dtest) ## A simple xgb.train example: param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2, @@ -237,17 +242,15 @@ bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, ## An xgb.train example of using variable learning rates at each iteration: -param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2) +param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2, + objective = "binary:logistic", eval_metric = "auc") my_etas <- list(eta = c(0.5, 0.1)) bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, callbacks = list(cb.reset.parameters(my_etas))) - -## Explicit use of the cb.evaluation.log callback allows to run -## xgb.train silently but still store the evaluation results: -bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, - verbose = 0, callbacks = list(cb.evaluation.log())) -print(bst$evaluation_log) +## Early stopping: +bst <- xgb.train(param, dtrain, nrounds = 25, watchlist, + early_stopping_rounds = 3) ## An 'xgboost' interface example: bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 7ca96077b..951f46217 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -8,7 +8,9 @@ train <- agaricus.train test <- agaricus.test set.seed(1994) -windows_flag = grepl('Windows', Sys.info()[['sysname']]) +# disable some tests for Win32 +windows_flag = .Platform$OS.type == "windows" && + .Machine$sizeof.pointer != 8 test_that("train and predict binary classification", { nrounds = 2 @@ -109,7 +111,7 @@ test_that("train and predict RF with softprob", { set.seed(11) bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, max_depth = 3, eta = 0.9, nthread = 2, nrounds = nrounds, - objective = "multi:softprob", num_class=3, + objective = "multi:softprob", num_class=3, verbose = 0, num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5) expect_equal(bst$niter, 15) expect_equal(xgb.ntree(bst), 15*3*4) @@ -144,25 +146,25 @@ test_that("training continuation works", { # for the reference, use 4 iterations at once: set.seed(11) - bst <- xgb.train(param, dtrain, nrounds = 4, watchlist) + bst <- xgb.train(param, dtrain, nrounds = 4, watchlist, verbose = 0) # first two iterations: set.seed(11) - bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist) + bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0) # continue for two more: - bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, xgb_model = bst1) + bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1) if (!windows_flag) expect_equal(bst$raw, bst2$raw) expect_false(is.null(bst2$evaluation_log)) expect_equal(dim(bst2$evaluation_log), c(4, 2)) expect_equal(bst2$evaluation_log, bst$evaluation_log) # test continuing from raw model data - bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, xgb_model = bst1$raw) + bst2 <- 
xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1$raw) if (!windows_flag) expect_equal(bst$raw, bst2$raw) expect_equal(dim(bst2$evaluation_log), c(2, 2)) # test continuing from a model in file xgb.save(bst1, "xgboost.model") - bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, xgb_model = "xgboost.model") + bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = "xgboost.model") if (!windows_flag) expect_equal(bst$raw, bst2$raw) expect_equal(dim(bst2$evaluation_log), c(2, 2)) @@ -171,9 +173,11 @@ test_that("training continuation works", { test_that("xgb.cv works", { set.seed(11) - cv <- xgb.cv(data = train$data, label = train$label, max_depth = 2, nfold = 5, - eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic", - verbose=TRUE) + expect_output( + cv <- xgb.cv(data = train$data, label = train$label, max_depth = 2, nfold = 5, + eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic", + verbose=TRUE) + , "train-error:") expect_is(cv, 'xgb.cv.synchronous') expect_false(is.null(cv$evaluation_log)) expect_lt(cv$evaluation_log[, min(test_error_mean)], 0.03) diff --git a/R-package/tests/testthat/test_callbacks.R b/R-package/tests/testthat/test_callbacks.R index a95d10797..57dc6c5d4 100644 --- a/R-package/tests/testthat/test_callbacks.R +++ b/R-package/tests/testthat/test_callbacks.R @@ -107,18 +107,27 @@ test_that("cb.evaluation.log works as expected", { param <- list(objective = "binary:logistic", max_depth = 4, nthread = 2) +test_that("can store evaluation_log without printing", { + expect_silent( + bst <- xgb.train(param, dtrain, nrounds = 10, watchlist, eta = 1, verbose = 0) + ) + expect_false(is.null(bst$evaluation_log)) + expect_false(is.null(bst$evaluation_log$train_error)) + expect_lt(bst$evaluation_log[, min(train_error)], 0.2) +}) + test_that("cb.reset.parameters works as expected", { # fixed eta set.seed(111) - bst0 <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 0.9) + bst0 <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 0.9, verbose = 0) expect_false(is.null(bst0$evaluation_log)) expect_false(is.null(bst0$evaluation_log$train_error)) # same eta but re-set as a vector parameter in the callback set.seed(111) my_par <- list(eta = c(0.9, 0.9)) - bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, + bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, callbacks = list(cb.reset.parameters(my_par))) expect_false(is.null(bst1$evaluation_log$train_error)) expect_equal(bst0$evaluation_log$train_error, @@ -127,7 +136,7 @@ test_that("cb.reset.parameters works as expected", { # same eta but re-set via a function in the callback set.seed(111) my_par <- list(eta = function(itr, itr_end) 0.9) - bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, + bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, callbacks = list(cb.reset.parameters(my_par))) expect_false(is.null(bst2$evaluation_log$train_error)) expect_equal(bst0$evaluation_log$train_error, @@ -136,7 +145,7 @@ test_that("cb.reset.parameters works as expected", { # different eta re-set as a vector parameter in the callback set.seed(111) my_par <- list(eta = c(0.6, 0.5)) - bst3 <- xgb.train(param, dtrain, nrounds = 2, watchlist, + bst3 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, callbacks = list(cb.reset.parameters(my_par))) expect_false(is.null(bst3$evaluation_log$train_error)) expect_false(all(bst0$evaluation_log$train_error == bst3$evaluation_log$train_error)) @@ -144,18 +153,18 
@@ test_that("cb.reset.parameters works as expected", { # resetting multiple parameters at the same time runs with no error my_par <- list(eta = c(1., 0.5), gamma = c(1, 2), max_depth = c(4, 8)) expect_error( - bst4 <- xgb.train(param, dtrain, nrounds = 2, watchlist, + bst4 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, callbacks = list(cb.reset.parameters(my_par))) , NA) # NA = no error # CV works as well expect_error( - bst4 <- xgb.cv(param, dtrain, nfold = 2, nrounds = 2, + bst4 <- xgb.cv(param, dtrain, nfold = 2, nrounds = 2, verbose = 0, callbacks = list(cb.reset.parameters(my_par))) , NA) # NA = no error # expect no learning with 0 learning rate my_par <- list(eta = c(0., 0.)) - bstX <- xgb.train(param, dtrain, nrounds = 2, watchlist, + bstX <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, callbacks = list(cb.reset.parameters(my_par))) expect_false(is.null(bstX$evaluation_log$train_error)) er <- unique(bstX$evaluation_log$train_error) @@ -167,7 +176,7 @@ test_that("cb.save.model works as expected", { files <- c('xgboost_01.model', 'xgboost_02.model', 'xgboost.model') for (f in files) if (file.exists(f)) file.remove(f) - bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, + bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, verbose = 0, save_period = 1, save_name = "xgboost_%02d.model") expect_true(file.exists('xgboost_01.model')) expect_true(file.exists('xgboost_02.model')) @@ -178,7 +187,8 @@ test_that("cb.save.model works as expected", { expect_equal(bst$raw, b2$raw) # save_period = 0 saves the last iteration's model - bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, save_period = 0) + bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, verbose = 0, + save_period = 0) expect_true(file.exists('xgboost.model')) b2 <- xgb.load('xgboost.model') expect_equal(bst$raw, b2$raw) @@ -186,16 +196,6 @@ test_that("cb.save.model works as expected", { for (f in files) if (file.exists(f)) file.remove(f) }) -test_that("can store evaluation_log without printing", { - expect_silent( - bst <- xgb.train(param, dtrain, nrounds = 10, watchlist, eta = 1, - verbose = 0, callbacks = list(cb.evaluation.log())) - ) - expect_false(is.null(bst$evaluation_log)) - expect_false(is.null(bst$evaluation_log$train_error)) - expect_lt(bst$evaluation_log[, min(train_error)], 0.2) -}) - test_that("early stopping xgb.train works", { set.seed(11) expect_output( @@ -211,6 +211,13 @@ test_that("early stopping xgb.train works", { err_pred <- err(ltest, pred) err_log <- bst$evaluation_log[bst$best_iteration, test_error] expect_equal(err_log, err_pred, tolerance = 5e-6) + + set.seed(11) + expect_silent( + bst0 <- xgb.train(param, dtrain, nrounds = 20, watchlist, eta = 0.3, + early_stopping_rounds = 3, maximize = FALSE, verbose = 0) + ) + expect_equal(bst$evaluation_log, bst0$evaluation_log) }) test_that("early stopping using a specific metric works", { @@ -248,7 +255,7 @@ test_that("early stopping xgb.cv works", { test_that("prediction in xgb.cv works", { set.seed(11) nrounds = 4 - cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE) + cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE, verbose = 0) expect_false(is.null(cv$evaluation_log)) expect_false(is.null(cv$pred)) expect_length(cv$pred, nrow(train$data)) @@ -258,7 +265,7 @@ test_that("prediction in xgb.cv works", { # save CV models set.seed(11) - cvx <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = 
nrounds, prediction = TRUE, + cvx <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE, verbose = 0, callbacks = list(cb.cv.predict(save_models = TRUE))) expect_equal(cv$evaluation_log, cvx$evaluation_log) expect_length(cvx$models, 5) @@ -268,7 +275,7 @@ test_that("prediction in xgb.cv works", { test_that("prediction in xgb.cv works for gblinear too", { set.seed(11) p <- list(booster = 'gblinear', objective = "reg:logistic", nthread = 2) - cv <- xgb.cv(p, dtrain, nfold = 5, eta = 0.5, nrounds = 2, prediction = TRUE) + cv <- xgb.cv(p, dtrain, nfold = 5, eta = 0.5, nrounds = 2, prediction = TRUE, verbose = 0) expect_false(is.null(cv$evaluation_log)) expect_false(is.null(cv$pred)) expect_length(cv$pred, nrow(train$data)) @@ -300,7 +307,7 @@ test_that("prediction in xgb.cv for softprob works", { expect_warning( cv <- xgb.cv(data = as.matrix(iris[, -5]), label = lb, nfold = 4, eta = 0.5, nrounds = 5, max_depth = 3, nthread = 2, - subsample = 0.8, gamma = 2, + subsample = 0.8, gamma = 2, verbose = 0, prediction = TRUE, objective = "multi:softprob", num_class = 3) , NA) expect_false(is.null(cv$pred)) diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index 0aea2f0a7..965e3f480 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -21,6 +21,15 @@ test_that("xgb.DMatrix: basic construction, saving, loading", { dtest3 <- xgb.DMatrix(tmp_file) unlink(tmp_file) expect_equal(getinfo(dtest1, 'label'), getinfo(dtest3, 'label')) + + # from a libsvm text file + tmp <- c("0 1:1 2:1","1 3:1","0 1:1") + tmp_file <- 'tmp.libsvm' + writeLines(tmp, tmp_file) + dtest4 <- xgb.DMatrix(tmp_file) + expect_equal(dim(dtest4), c(3, 4)) + expect_equal(getinfo(dtest4, 'label'), c(0,1,0)) + unlink(tmp_file) }) test_that("xgb.DMatrix: getinfo & setinfo", { diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index 4536fe50e..b12d98d29 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -3,7 +3,7 @@ context('Test helper functions') require(xgboost) require(data.table) require(Matrix) -require(vcd) +require(vcd, quietly = TRUE) set.seed(1982) data(Arthritis) @@ -15,10 +15,12 @@ sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df) label <- df[, ifelse(Improved == "Marked", 1, 0)] bst.Tree <- xgboost(data = sparse_matrix, label = label, max_depth = 9, - eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic", booster = "gbtree") + eta = 1, nthread = 2, nrounds = 10, verbose = 0, + objective = "binary:logistic", booster = "gbtree") bst.GLM <- xgboost(data = sparse_matrix, label = label, - eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic", booster = "gblinear") + eta = 1, nthread = 2, nrounds = 10, verbose = 0, + objective = "binary:logistic", booster = "gblinear") feature.names <- colnames(sparse_matrix) @@ -100,12 +102,37 @@ if (grepl('Windows', Sys.info()[['sysname']]) || grepl('Linux', Sys.info()[['sys }) } +test_that("xgb.Booster serializing as R object works", { + saveRDS(bst.Tree, 'xgb.model.rds') + bst <- readRDS('xgb.model.rds') + dtrain <- xgb.DMatrix(sparse_matrix, label = label) + expect_equal(predict(bst.Tree, dtrain), predict(bst, dtrain)) + expect_equal(xgb.dump(bst.Tree), xgb.dump(bst)) + xgb.save(bst, 'xgb.model') + nil_ptr <- new("externalptr") + class(nil_ptr) <- "xgb.Booster.handle" + expect_true(identical(bst$handle, nil_ptr)) + bst <- xgb.Booster.complete(bst) + 
expect_true(!identical(bst$handle, nil_ptr)) + expect_equal(predict(bst.Tree, dtrain), predict(bst, dtrain)) +}) + test_that("xgb.model.dt.tree works with and without feature names", { names.dt.trees <- c("Tree", "Node", "ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover") dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst.Tree) expect_equal(names.dt.trees, names(dt.tree)) expect_equal(dim(dt.tree), c(162, 10)) - expect_output(str(xgb.model.dt.tree(model = bst.Tree)), 'Feature.*\\"3\\"') + expect_output(str(dt.tree), 'Feature.*\\"Age\\"') + + dt.tree.0 <- xgb.model.dt.tree(model = bst.Tree) + expect_equal(dt.tree, dt.tree.0) + + # when model contains no feature names: + bst.Tree.x <- bst.Tree + bst.Tree.x$feature_names <- NULL + dt.tree.x <- xgb.model.dt.tree(model = bst.Tree.x) + expect_output(str(dt.tree.x), 'Feature.*\\"3\\"') + expect_equal(dt.tree[, -4, with=FALSE], dt.tree.x[, -4, with=FALSE]) }) test_that("xgb.model.dt.tree throws error for gblinear", { @@ -116,7 +143,17 @@ test_that("xgb.importance works with and without feature names", { importance.Tree <- xgb.importance(feature_names = feature.names, model = bst.Tree) expect_equal(dim(importance.Tree), c(7, 4)) expect_equal(colnames(importance.Tree), c("Feature", "Gain", "Cover", "Frequency")) - expect_output(str(xgb.importance(model = bst.Tree)), 'Feature.*\\"3\\"') + expect_output(str(importance.Tree), 'Feature.*\\"Age\\"') + + importance.Tree.0 <- xgb.importance(model = bst.Tree) + expect_equal(importance.Tree, importance.Tree.0) + + # when model contains no feature names: + bst.Tree.x <- bst.Tree + bst.Tree.x$feature_names <- NULL + importance.Tree.x <- xgb.importance(model = bst.Tree.x) + expect_equal(importance.Tree[, -1, with=FALSE], importance.Tree.x[, -1, with=FALSE]) + imp2plot <- xgb.plot.importance(importance_matrix = importance.Tree) expect_equal(colnames(imp2plot), c("Feature", "Gain", "Cover", "Frequency", "Importance")) xgb.ggplot.importance(importance_matrix = importance.Tree) diff --git a/R-package/tests/testthat/test_monotone.R b/R-package/tests/testthat/test_monotone.R index 822fefb65..9991e917e 100644 --- a/R-package/tests/testthat/test_monotone.R +++ b/R-package/tests/testthat/test_monotone.R @@ -10,8 +10,8 @@ train = matrix(x, ncol = 1) test_that("monotone constraints for regression", { bst = xgboost(data = train, label = y, max_depth = 2, - eta = 0.1, nthread = 2, nrounds = 100, - monotone_constraints = -1) + eta = 0.1, nthread = 2, nrounds = 100, verbose = 0, + monotone_constraints = -1) pred = predict(bst, train) diff --git a/R-package/tests/testthat/test_update.R b/R-package/tests/testthat/test_update.R index 1cbed1c33..8518711fb 100644 --- a/R-package/tests/testthat/test_update.R +++ b/R-package/tests/testthat/test_update.R @@ -9,24 +9,23 @@ dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) test_that("updating the model works", { watchlist = list(train = dtrain, test = dtest) - cb = list(cb.evaluation.log()) # to run silent, but store eval.
log - + # no-subsampling p1 <- list(objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2) set.seed(11) - bst1 <- xgb.train(p1, dtrain, nrounds = 10, watchlist, verbose = 0, callbacks = cb) + bst1 <- xgb.train(p1, dtrain, nrounds = 10, watchlist, verbose = 0) tr1 <- xgb.model.dt.tree(model = bst1) # with subsampling p2 <- modifyList(p1, list(subsample = 0.1)) set.seed(11) - bst2 <- xgb.train(p2, dtrain, nrounds = 10, watchlist, verbose = 0, callbacks = cb) + bst2 <- xgb.train(p2, dtrain, nrounds = 10, watchlist, verbose = 0) tr2 <- xgb.model.dt.tree(model = bst2) # the same no-subsampling boosting with an extra 'refresh' updater: p1r <- modifyList(p1, list(updater = 'grow_colmaker,prune,refresh', refresh_leaf = FALSE)) set.seed(11) - bst1r <- xgb.train(p1r, dtrain, nrounds = 10, watchlist, verbose = 0, callbacks = cb) + bst1r <- xgb.train(p1r, dtrain, nrounds = 10, watchlist, verbose = 0) tr1r <- xgb.model.dt.tree(model = bst1r) # all should be the same when no subsampling expect_equal(bst1$evaluation_log, bst1r$evaluation_log) @@ -35,7 +34,7 @@ test_that("updating the model works", { # the same boosting with subsampling with an extra 'refresh' updater: p2r <- modifyList(p2, list(updater = 'grow_colmaker,prune,refresh', refresh_leaf = FALSE)) set.seed(11) - bst2r <- xgb.train(p2r, dtrain, nrounds = 10, watchlist, verbose = 0, callbacks = cb) + bst2r <- xgb.train(p2r, dtrain, nrounds = 10, watchlist, verbose = 0) tr2r <- xgb.model.dt.tree(model = bst2r) # should be the same evaluation but different gains and larger cover expect_equal(bst2$evaluation_log, bst2r$evaluation_log) @@ -45,7 +44,7 @@ test_that("updating the model works", { # process type 'update' for no-subsampling model, refreshing the tree stats AND leaves from training data: p1u <- modifyList(p1, list(process_type = 'update', updater = 'refresh', refresh_leaf = TRUE)) - bst1u <- xgb.train(p1u, dtrain, nrounds = 10, watchlist, verbose = 0, callbacks = cb, xgb_model = bst1) + bst1u <- xgb.train(p1u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = bst1) tr1u <- xgb.model.dt.tree(model = bst1u) # all should be the same when no subsampling expect_equal(bst1$evaluation_log, bst1u$evaluation_log) @@ -53,7 +52,7 @@ test_that("updating the model works", { # process type 'update' for model with subsampling, refreshing only the tree stats from training data: p2u <- modifyList(p2, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE)) - bst2u <- xgb.train(p2u, dtrain, nrounds = 10, watchlist, verbose = 0, callbacks = cb, xgb_model = bst2) + bst2u <- xgb.train(p2u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = bst2) tr2u <- xgb.model.dt.tree(model = bst2u) # should be the same evaluation but different gains and larger cover expect_equal(bst2$evaluation_log, bst2u$evaluation_log) @@ -66,7 +65,7 @@ test_that("updating the model works", { # process type 'update' for no-subsampling model, refreshing only the tree stats from TEST data: p1ut <- modifyList(p1, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE)) - bst1ut <- xgb.train(p1ut, dtest, nrounds = 10, watchlist, verbose = 0, callbacks = cb, xgb_model = bst1) + bst1ut <- xgb.train(p1ut, dtest, nrounds = 10, watchlist, verbose = 0, xgb_model = bst1) tr1ut <- xgb.model.dt.tree(model = bst1ut) # should be the same evaluations but different gains and smaller cover (test data is smaller) expect_equal(bst1$evaluation_log, bst1ut$evaluation_log) diff --git a/doc/build.md b/doc/build.md index d41cc5f0e..d2eaf99cd 100644 
--- a/doc/build.md +++ b/doc/build.md @@ -189,7 +189,15 @@ There are several ways to install the package: ## R Package Installation -You can install R package from cran just like other packages, or you can install from our weekly updated drat repo: +### Installing the pre-packaged version + +You can install xgboost from CRAN just like any other R package: + +```r +install.packages("xgboost") +``` + +Or you can install it from our weekly updated drat repo: ```r install.packages("drat", repos="https://cran.rstudio.com") @@ -197,10 +205,8 @@ drat:::addRepo("dmlc") install.packages("xgboost", repos="http://dmlc.ml/drat/", type = "source") ``` -If you would like to use the latest xgboost version and already compiled xgboost, use `library(devtools); install('xgboost/R-package')` to install manually xgboost package (change the path accordingly to where you compiled xgboost). - -For OSX users, single threaded version will be installed, to install multi-threaded version. -First follow [Building on OSX](#building-on-osx) to get the OpenMP enabled compiler, then: +For OSX users, the single-threaded version will be installed. To install the multi-threaded version, +first follow [Building on OSX](#building-on-osx) to get an OpenMP-enabled compiler, then: - Set the `Makevars` file in highest piority for R. @@ -214,24 +220,35 @@ First follow [Building on OSX](#building-on-osx) to get the OpenMP enabled compi install.packages("xgboost", repos="http://dmlc.ml/drat/", type = "source") ``` -Due to the usage of submodule, `install_github` is no longer support to install the -latest version of R package. To install the latest version run the following bash script, +### Installing the development version + +Make sure you have installed git and a recent C++ compiler supporting C++11 (e.g., g++-4.6 or higher). +On Windows, Rtools must be installed, and its bin directory has to be added to `PATH` during the installation. +Also see the previous subsection for an OSX tip. + +Due to the use of git-submodules, `devtools::install_github` can no longer be used to install the latest version of the R package. +Thus, one has to run git to check out the code first: ```bash git clone --recursive https://github.com/dmlc/xgboost cd xgboost git submodule init git submodule update -alias make='mingw32-make' -cd dmlc-core -make -j4 -cd ../rabit -make lib/librabit_empty.a -j4 -cd .. -cp make/mingw64.mk config.mk -make -j4 +cd R-package +R CMD INSTALL . ``` +If the last line fails because of "R: command not found", it means that R was not set up to run from the command line. +In this case, just start R as you normally would and run the following: + +```r +setwd('wherever/you/cloned/it/xgboost/R-package/') +install.packages('.', repos = NULL, type="source") +``` + +If all else fails, try [building the shared library](#build-the-shared-library) to see whether the problem is specific to the R package or not. + + ## Trouble Shooting 1. **Compile failed after `git pull`**