[R] various R code maintenance (#1964)
* [R] xgb.save must work when handle is nil but raw exists
* [R] print.xgb.Booster should still print other info when handle is nil
* [R] rename internal function xgb.Booster to xgb.Booster.handle to make its intent clear
* [R] rename xgb.Booster.check to xgb.Booster.complete and make it visible; more docs
* [R] storing evaluation_log should depend only on watchlist, not on verbose
* [R] reduce the excessive chattiness of unit tests
* [R] only disable some tests on Windows when it's not 64-bit
* [R] clean up xgb.DMatrix
* [R] test xgb.DMatrix loading from a libsvm text file
* [R] store feature_names in xgb.Booster, use them from utility functions
* [R] remove non-functional co-occurence computation from xgb.importance
* [R] verbose=0 is enough without a callback
* [R] added forgotten xgb.Booster.complete.Rd; CRAN check fixes
* [R] update installation instructions
This commit is contained in:
parent a073a2c3d4
commit 2b5b96d760
@@ -24,6 +24,7 @@ export(cb.save.model)
 export(getinfo)
 export(setinfo)
 export(slice)
+export(xgb.Booster.complete)
 export(xgb.DMatrix)
 export(xgb.DMatrix.save)
 export(xgb.attr)
@@ -507,7 +507,7 @@ cb.cv.predict <- function(save_models = FALSE) {
     if (save_models) {
       env$basket$models <- lapply(env$bst_folds, function(fd) {
         xgb.attr(fd$bst, 'niter') <- env$end_iteration - 1
-        xgb.Booster.check(xgb.handleToBooster(fd$bst), saveraw = TRUE)
+        xgb.Booster.complete(xgb.handleToBooster(fd$bst), saveraw = TRUE)
       })
     }
   }
@@ -1,6 +1,6 @@
-# Construct a Booster from cachelist
+# Construct an internal xgboost Booster and return a handle to it
 # internal utility function
-xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
+xgb.Booster.handle <- function(params = list(), cachelist = list(), modelfile = NULL) {
   if (typeof(cachelist) != "list" ||
       any(sapply(cachelist, class) != 'xgb.DMatrix')) {
     stop("xgb.Booster only accepts list of DMatrix as cachelist")
@@ -13,8 +13,8 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
   } else if (typeof(modelfile) == "raw") {
     .Call("XGBoosterLoadModelFromRaw_R", handle, modelfile, PACKAGE = "xgboost")
   } else if (class(modelfile) == "xgb.Booster") {
-    modelfile <- xgb.Booster.check(modelfile, saveraw=TRUE)
-    .Call("XGBoosterLoadModelFromRaw_R", handle, modelfile$raw, PACKAGE = "xgboost")
+    bst <- xgb.Booster.complete(modelfile, saveraw=TRUE)
+    .Call("XGBoosterLoadModelFromRaw_R", handle, bst$raw, PACKAGE = "xgboost")
   } else {
     stop("modelfile must be either character filename, or raw booster dump, or xgb.Booster object")
   }
@@ -34,6 +34,17 @@ xgb.handleToBooster <- function(handle, raw = NULL) {
   return(bst)
 }

+# Check whether xgb.Booster.handle is null
+# internal utility function
+is.null.handle <- function(handle) {
+  if (class(handle) != "xgb.Booster.handle")
+    stop("argument type must be xgb.Booster.handle")
+
+  if (is.null(handle) || .Call("XGCheckNullPtr_R", handle, PACKAGE="xgboost"))
+    return(TRUE)
+  return(FALSE)
+}
+
 # Return a verified to be valid handle out of either xgb.Booster.handle or xgb.Booster
 # internal utility function
 xgb.get.handle <- function(object) {
@@ -42,32 +53,65 @@ xgb.get.handle <- function(object) {
     xgb.Booster.handle = object,
     stop("argument must be of either xgb.Booster or xgb.Booster.handle class")
   )
-  if (is.null(handle) || .Call("XGCheckNullPtr_R", handle, PACKAGE="xgboost")) {
+  if (is.null.handle(handle)) {
     stop("invalid xgb.Booster.handle")
   }
   handle
 }

-# Check whether an xgb.Booster object is complete
-# internal utility function
-xgb.Booster.check <- function(bst, saveraw = TRUE) {
-  if (class(bst) != "xgb.Booster")
+#' Restore missing parts of an incomplete xgb.Booster object.
+#'
+#' It attempts to complete an \code{xgb.Booster} object by restoring either its missing
+#' raw model memory dump (when it has no \code{raw} data but its \code{xgb.Booster.handle} is valid)
+#' or its missing internal handle (when its \code{xgb.Booster.handle} is not valid
+#' but it has a raw Booster memory dump).
+#'
+#' @param object object of class \code{xgb.Booster}
+#' @param saveraw a flag indicating whether to append \code{raw} Booster memory dump data
+#' when it doesn't already exist.
+#'
+#' @details
+#'
+#' While this method is primarily for internal use, it might be useful in some practical situations.
+#'
+#' E.g., when an \code{xgb.Booster} model is saved as an R object and then is loaded as an R object,
+#' its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods
+#' should still work for such a model object since those methods would be using
+#' \code{xgb.Booster.complete} internally. However, one might find it to be more efficient to call the
+#' \code{xgb.Booster.complete} function once after loading a model as an R-object. That which would
+#' prevent further reconstruction (potentially, multiple times) of an internal booster model.
+#'
+#' @return
+#' An object of \code{xgb.Booster} class.
+#'
+#' @examples
+#'
+#' data(agaricus.train, package='xgboost')
+#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
+#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+#' saveRDS(bst, "xgb.model.rds")
+#'
+#' bst1 <- readRDS("xgb.model.rds")
+#' # the handle is invalid:
+#' print(bst1$handle)
+#' bst1 <- xgb.Booster.complete(bst1)
+#' # now the handle points to a valid internal booster model:
+#' print(bst1$handle)
+#'
+#' @export
+xgb.Booster.complete <- function(object, saveraw = TRUE) {
+  if (class(object) != "xgb.Booster")
     stop("argument type must be xgb.Booster")

-  isnull <- is.null(bst$handle)
-  if (!isnull) {
-    isnull <- .Call("XGCheckNullPtr_R", bst$handle, PACKAGE="xgboost")
-  }
-  if (isnull) {
-    bst$handle <- xgb.Booster(modelfile = bst$raw)
+  if (is.null.handle(object$handle)) {
+    object$handle <- xgb.Booster.handle(modelfile = object$raw)
   } else {
-    if (is.null(bst$raw) && saveraw)
-      bst$raw <- xgb.save.raw(bst$handle)
+    if (is.null(object$raw) && saveraw)
+      object$raw <- xgb.save.raw(object$handle)
   }
-  return(bst)
+  return(object)
 }


 #' Predict method for eXtreme Gradient Boosting model
 #'
 #' Predicted values based on either xgboost model or model handle object.
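For reference, a minimal sketch of the workflow the new xgb.Booster.complete() targets, adapted from the roxygen example above (the .rds file name is only illustrative):

    library(xgboost)
    data(agaricus.train, package = 'xgboost')
    bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
                   max_depth = 2, eta = 1, nthread = 2, nrounds = 2,
                   objective = "binary:logistic")
    saveRDS(bst, "xgb.model.rds")      # serializes the raw memory dump, not the handle

    bst1 <- readRDS("xgb.model.rds")   # handle (external pointer) is now nil
    bst1 <- xgb.Booster.complete(bst1) # rebuilds the handle from bst1$raw once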
@@ -180,7 +224,7 @@ xgb.Booster.check <- function(bst, saveraw = TRUE) {
 predict.xgb.Booster <- function(object, newdata, missing = NA,
                                 outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE, reshape = FALSE, ...) {

-  object <- xgb.Booster.check(object, saveraw = FALSE)
+  object <- xgb.Booster.complete(object, saveraw = FALSE)
   if (class(newdata) != "xgb.DMatrix")
     newdata <- xgb.DMatrix(newdata, missing = missing)
   if (is.null(ntreelimit))
@@ -429,10 +473,9 @@ xgb.ntree <- function(bst) {
 print.xgb.Booster <- function(x, verbose=FALSE, ...) {
   cat('##### xgb.Booster\n')

-  if (is.null(x$handle) || .Call("XGCheckNullPtr_R", x$handle, PACKAGE="xgboost")) {
-    cat("handle is invalid\n")
-    return(x)
-  }
+  valid_handle <- !is.null.handle(x$handle)
+  if (!valid_handle)
+    cat("Handle is invalid! Suggest using xgb.Booster.complete\n")

   cat('raw: ')
   if (!is.null(x$raw)) {
@@ -454,6 +497,8 @@ print.xgb.Booster <- function(x, verbose=FALSE, ...) {
   }
   # TODO: need an interface to access all the xgboosts parameters

+  attrs <- character(0)
+  if (valid_handle)
     attrs <- xgb.attributes(x)
   if (length(attrs) > 0) {
     cat('xgb.attributes:\n')
@@ -474,11 +519,15 @@ print.xgb.Booster <- function(x, verbose=FALSE, ...) {
     })
   }

+  if (!is.null(x$feature_names))
+    cat('# of features:', length(x$feature_names), '\n')
+
   cat('niter: ', x$niter, '\n', sep='')
   # TODO: uncomment when faster xgb.ntree is implemented
   #cat('ntree: ', xgb.ntree(x), '\n', sep='')

-  for (n in setdiff(names(x), c('handle', 'raw', 'call', 'params', 'callbacks','evaluation_log','niter'))) {
+  for (n in setdiff(names(x), c('handle', 'raw', 'call', 'params', 'callbacks',
+                                'evaluation_log','niter','feature_names'))) {
     if (is.atomic(x[[n]])) {
       cat(n, ':', x[[n]], '\n', sep=' ')
     } else {
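A small sketch of the intended behaviour of the reworked print method (exact output wording aside): printing a deserialized model no longer returns early on a nil handle; it only notes the problem and still reports the remaining fields.

    bst1 <- readRDS("xgb.model.rds")   # handle is nil after deserialization
    print(bst1)                        # notes the invalid handle, then still prints
                                       # raw size, niter, # of features, params, etc.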
@@ -31,18 +31,13 @@ xgb.DMatrix <- function(data, info = list(), missing = NA, ...) {
                    PACKAGE = "xgboost")
     cnames <- colnames(data)
   } else {
-    stop(paste("xgb.DMatrix: does not support to construct from ",
-               typeof(data)))
+    stop("xgb.DMatrix does not support construction from ", typeof(data))
   }
   dmat <- handle
   attributes(dmat) <- list(.Dimnames = list(NULL, cnames), class = "xgb.DMatrix")
-  #dmat <- list(handle = handle, colnames = cnames)
-  #attr(dmat, 'class') <- "xgb.DMatrix"

   info <- append(info, list(...))
-  if (length(info) == 0)
-    return(dmat)
-  for (i in 1:length(info)) {
+  for (i in seq_along(info)) {
     p <- info[i]
     setinfo(dmat, names(p), p[[1]])
   }
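As a quick illustration of the simplified info loop above, any extra named arguments (or entries of the info list) are applied through setinfo(); for example, assuming the agaricus data used in the other examples:

    dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
    head(getinfo(dtrain, 'label'))   # the label was attached via setinfo() in the loop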
@@ -70,11 +65,10 @@ xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) {
       dtrain <- xgb.DMatrix(data)
     } else if (inClass == "xgb.DMatrix") {
       dtrain <- data
-    } else if (inClass == "data.frame") {
-      stop("xgboost only support numerical matrix input,
-           use 'data.matrix' to transform the data.")
+    } else if ("data.frame" %in% inClass) {
+      stop("xgboost doesn't support data.frame as input. Convert it to matrix first.")
     } else {
-      stop("xgboost: Invalid input of data")
+      stop("xgboost: invalid input data")
     }
   }
   return (dtrain)
@@ -190,7 +184,7 @@ getinfo.xgb.DMatrix <- function(object, name, ...) {
   if (typeof(name) != "character" ||
       length(name) != 1 ||
       !name %in% c('label', 'weight', 'base_margin', 'nrow')) {
-    stop("getinfo: name must one of the following\n",
+    stop("getinfo: name must be one of the following\n",
         " 'label', 'weight', 'base_margin', 'nrow'")
   }
   if (name != "nrow"){
@@ -266,7 +260,7 @@ setinfo.xgb.DMatrix <- function(object, name, info, ...) {
           PACKAGE = "xgboost")
     return(TRUE)
   }
-  stop(paste("setinfo: unknown info name", name))
+  stop("setinfo: unknown info name ", name)
   return(FALSE)
 }

@@ -181,8 +181,8 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
   bst_folds <- lapply(1:length(folds), function(k) {
     dtest <- slice(dall, folds[[k]])
     dtrain <- slice(dall, unlist(folds[-k]))
-    bst <- xgb.Booster(params, list(dtrain, dtest))
-    list(dtrain=dtrain, bst=bst, watchlist=list(train=dtrain, test=dtest), index=folds[[k]])
+    handle <- xgb.Booster.handle(params, list(dtrain, dtest))
+    list(dtrain=dtrain, bst=handle, watchlist=list(train=dtrain, test=dtest), index=folds[[k]])
   })
   # a "basket" to collect some results from callbacks
   basket <- list()
@@ -1,24 +1,26 @@
-#' Save xgboost model to text file
+#' Dump an xgboost model in text format.
 #'
-#' Save a xgboost model to text file. Could be parsed later.
+#' Dump an xgboost model in text format.
 #'
 #' @param model the model object.
-#' @param fname the name of the text file where to save the model text dump. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.
-#' @param fmap feature map file representing the type of feature.
+#' @param fname the name of the text file where to save the model text dump.
+#' If not provided or set to \code{NULL}, the model is returned as a \code{character} vector.
+#' @param fmap feature map file representing feature types.
 #' Detailed description could be found at
 #' \url{https://github.com/dmlc/xgboost/wiki/Binary-Classification#dump-model}.
 #' See demo/ for walkthrough example in R, and
 #' \url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt}
 #' for example Format.
-#' @param with_stats whether dump statistics of splits
-#' When this option is on, the model dump comes with two additional statistics:
+#' @param with_stats whether to dump some additional statistics about the splits.
+#' When this option is on, the model dump contains two additional values:
 #' gain is the approximate loss function gain we get in each split;
 #' cover is the sum of second order gradient in each node.
 #' @param dump_format either 'text' or 'json' format could be specified.
 #' @param ... currently not used
 #'
 #' @return
-#' if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}.
+#' If fname is not provided or set to \code{NULL} the function will return the model
+#' as a \code{character} vector. Otherwise it will return \code{TRUE}.
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
@@ -37,7 +39,8 @@
 #' cat(xgb.dump(bst, with_stats = TRUE, dump_format='json'))
 #'
 #' @export
-xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, dump_format = c("text", "json"), ...) {
+xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE,
+                     dump_format = c("text", "json"), ...) {
   check.deprecation(...)
   dump_format <- match.arg(dump_format)
   if (class(model) != "xgb.Booster")
@@ -47,7 +50,7 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, du
   if (!(class(fmap) %in% c("character", "NULL") && length(fmap) <= 1))
     stop("fmap: argument must be of type character (when provided)")

-  model <- xgb.Booster.check(model)
+  model <- xgb.Booster.complete(model)
   model_dump <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with_stats),
                       as.character(dump_format), PACKAGE = "xgboost")

@@ -1,102 +1,92 @@
-#' Show importance of features in a model
+#' Importance of features in a model.
 #'
-#' Create a \code{data.table} of the most important features of a model.
+#' Creates a \code{data.table} of feature importances in a model.
 #'
-#' @param feature_names names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
-#' @param model generated by the \code{xgb.train} function.
-#' @param data the dataset used for the training step. Will be used with \code{label} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.
-#' @param label the label vector used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.
-#' @param target a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurences in a binary classification. The \code{target} function should have only one parameter. This parameter will be used to provide each important feature vector after having applied the split condition, therefore these vector will be only made of 0 and 1 only, whatever was the information before. More information in \code{Detail} part. This parameter is optional.
-#'
-#' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
+#' @param feature_names character vector of feature names. If the model already
+#' contains feature names, those would be used when \code{feature_names=NULL} (default value).
+#' Non-null \code{feature_names} could be provided to override those in the model.
+#' @param model object of class \code{xgb.Booster}.
+#' @param data deprecated.
+#' @param label deprecated.
+#' @param target deprecated.
 #'
 #' @details
-#' This function is for both linear and tree models.
 #'
-#' \code{data.table} is returned by the function.
-#' The columns are:
+#' This function works for both linear and tree models.
+#'
+#' For linear models, the importance is the absolute magnitude of linear coefficients.
+#' For that reason, in order to obtain a meaningful ranking by importance for a linear model,
+#' the features need to be on the same scale (which you also would want to do when using either
+#' L1 or L2 regularization).
+#'
+#' @return
+#'
+#' For a tree model, a \code{data.table} with the following columns:
 #' \itemize{
-#' \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump;
-#' \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training (only available for tree models);
-#' \item \code{Cover} metric of the number of observation related to this feature (only available for tree models);
-#' \item \code{Weight} percentage representing the relative number of times a feature have been taken into trees.
+#' \item \code{Features} names of the features used in the model;
+#' \item \code{Gain} represents fractional contribution of each feature to the model based on
+#' the total gain of this feature's splits. Higher percentage means a more important
+#' predictive feature.
+#' \item \code{Cover} metric of the number of observation related to this feature;
+#' \item \code{Frequency} percentage representing the relative number of times
+#' a feature have been used in trees.
 #' }
 #'
-#' If you don't provide \code{feature_names}, index of the features will be used instead.
+#' A linear model's importance \code{data.table} has only two columns:
+#' \itemize{
+#' \item \code{Features} names of the features used in the model;
+#' \item \code{Weight} the linear coefficient of this feature.
+#' }
 #'
-#' Because the index is extracted from the model dump (made on the C++ side), it starts at 0 (usual in C++) instead of 1 (usual in R).
-#'
-#' Co-occurence count
-#' ------------------
-#'
-#' The gain gives you indication about the information of how a feature is important in making a branch of a decision tree more pure. However, with this information only, you can't know if this feature has to be present or not to get a specific classification. In the example code, you may wonder if odor=none should be \code{TRUE} to not eat a mushroom.
-#'
-#' Co-occurence computation is here to help in understanding this relation between a predictor and a specific class. It will count how many observations are returned as \code{TRUE} by the \code{target} function (see parameters). When you execute the example below, there are 92 times only over the 3140 observations of the train dataset where a mushroom have no odor and can be eaten safely.
-#'
-#' If you need to remember only one thing: unless you want to leave us early, don't eat a mushroom which has no odor :-)
+#' If you don't provide or \code{model} doesn't have \code{feature_names},
+#' index of the features will be used instead. Because the index is extracted from the model dump
+#' (based on C++ code), it starts at 0 (as in C/C++ or Python) instead of 1 (usual in R).
 #'
 #' @examples
+#'
 #' data(agaricus.train, package='xgboost')
 #'
 #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
 #' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
 #'
-#' xgb.importance(colnames(agaricus.train$data), model = bst)
-#'
-#' # Same thing with co-occurence computation this time
-#' xgb.importance(colnames(agaricus.train$data), model = bst,
-#' data = agaricus.train$data, label = agaricus.train$label)
+#' xgb.importance(model = bst)
 #'
 #' @export
-xgb.importance <- function(feature_names = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ( (x + label) == 2)){
-  if (!class(feature_names) %in% c("character", "NULL")) {
-    stop("feature_names: Has to be a vector of character or NULL if the model already contains feature name. Look at this function documentation to see where to get feature names.")
-  }
+xgb.importance <- function(feature_names = NULL, model = NULL,
+                           data = NULL, label = NULL, target = NULL){

-  if (class(model) != "xgb.Booster") {
-    stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
-  }
+  if (!(is.null(data) && is.null(label) && is.null(target)))
+    warning("xgb.importance: parameters 'data', 'label' and 'target' are deprecated")

-  if((is.null(data) & !is.null(label)) | (!is.null(data) & is.null(label))) {
-    stop("data/label: Provide the two arguments if you want co-occurence computation or none of them if you are not interested but not one of them only.")
-  }
+  if (class(model) != "xgb.Booster")
+    stop("Either 'model' has to be an object of class xgb.Booster")

-  if(class(label) == "numeric"){
-    if(sum(label == 0) / length(label) > 0.5) label <- as(label, "sparseVector")
-  }
+  if (is.null(feature_names) && !is.null(model$feature_names))
+    feature_names <- model$feature_names

-  treeDump <- function(feature_names, text, keepDetail){
-    if(keepDetail) groupBy <- c("Feature", "Split", "MissingNo") else groupBy <- "Feature"
-    xgb.model.dt.tree(feature_names = feature_names, text = text)[,"MissingNo" := Missing == No ][Feature != "Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequency = .N), by = groupBy, with = T][,`:=`(Gain = Gain / sum(Gain), Cover = Cover / sum(Cover), Frequency = Frequency / sum(Frequency))][order(Gain, decreasing = T)]
-  }
+  if (!class(feature_names) %in% c("character", "NULL"))
+    stop("feature_names: Has to be a character vector")

-  linearDump <- function(feature_names, text){
-    weights <- which(text == "weight:") %>% {a =. + 1; text[a:length(text)]} %>% as.numeric
-    if(is.null(feature_names)) feature_names <- seq(to = length(weights))
-    data.table(Feature = feature_names, Weight = weights)
-  }
+  model_text_dump <- xgb.dump(model = model, with_stats = TRUE)

-  model.text.dump <- xgb.dump(model = model, with_stats = T)
-
-  if(model.text.dump[2] == "bias:"){
-    result <- model.text.dump %>% linearDump(feature_names, .)
-    if(!is.null(data) | !is.null(label)) warning("data/label: these parameters should only be provided with decision tree based models.")
+  # linear model
+  if(model_text_dump[2] == "bias:"){
+    weights <- which(model_text_dump == "weight:") %>%
+               {model_text_dump[(. + 1):length(model_text_dump)]} %>%
+               as.numeric
+    if(is.null(feature_names))
+      feature_names <- seq(to = length(weights))
+    result <- data.table(Feature = feature_names, Weight = weights)[order(-abs(Weight))]
   } else {
-    result <- treeDump(feature_names, text = model.text.dump, keepDetail = !is.null(data))
-
-    # Co-occurence computation
-    if(!is.null(data) & !is.null(label) & nrow(result) > 0) {
-      # Take care of missing column
-      a <- data[, result[MissingNo == T,Feature], drop=FALSE] != 0
-      # Bind the two Matrix and reorder columns
-      c <- data[, result[MissingNo == F,Feature], drop=FALSE] %>% cBind(a,.) %>% .[,result[,Feature]]
-      rm(a)
-      # Apply split
-      d <- data[, result[,Feature], drop=FALSE] < as.numeric(result[,Split])
-      apply(c & d, 2, . %>% target %>% sum) -> vec
-
-      result <- result[, "RealCover" := as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)][, MissingNo := NULL]
-    }
+    # tree model
+    result <- xgb.model.dt.tree(feature_names = feature_names, text = model_text_dump)[
+                Feature != "Leaf", .(Gain = sum(Quality),
+                                     Cover = sum(Cover),
+                                     Frequency = .N), by = Feature][
+                ,`:=`(Gain = Gain / sum(Gain),
+                      Cover = Cover / sum(Cover),
+                      Frequency = Frequency / sum(Frequency))][
+                order(Gain, decreasing = TRUE)]
   }
   result
 }
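A short sketch of the simplified interface documented above: with feature names now stored in the booster, the deprecated data/label/target arguments are no longer needed.

    bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
                   max_depth = 2, eta = 1, nthread = 2, nrounds = 2,
                   objective = "binary:logistic")
    xgb.importance(model = bst)   # tree model: Feature, Gain, Cover, Frequency columns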
@@ -104,4 +94,4 @@ xgb.importance <- function(feature_names = NULL, model = NULL, data = NULL, labe
 # Avoid error messages during CRAN check.
 # The reason is that these variables are never declared
 # They are mainly column names inferred by Data.table...
-globalVariables(c(".", ".N", "Gain", "Frequency", "Feature", "Split", "No", "Missing", "MissingNo", "RealCover"))
+globalVariables(c(".", ".N", "Gain", "Cover", "Frequency", "Feature"))
@@ -1,8 +1,23 @@
 #' Load xgboost model from binary file
 #'
-#' Load xgboost model from the binary model file
+#' Load xgboost model from the binary model file.
 #'
-#' @param modelfile the name of the binary file.
+#' @param modelfile the name of the binary input file.
+#'
+#' @details
+#' The input file is expected to contain a model saved in an xgboost-internal binary format
+#' using either \code{\link{xgb.save}} or \code{\link{cb.save.model}} in R, or using some
+#' appropriate methods from other xgboost interfaces. E.g., a model trained in Python and
+#' saved from there in xgboost format, could be loaded from R.
+#'
+#' Note: a model saved as an R-object, has to be loaded using corresponding R-methods,
+#' not \code{xgb.load}.
+#'
+#' @return
+#' An object of \code{xgb.Booster} class.
+#'
+#' @seealso
+#' \code{\link{xgb.save}}, \code{\link{xgb.Booster.complete}}.
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
@@ -19,13 +34,13 @@ xgb.load <- function(modelfile) {
   if (is.null(modelfile))
     stop("xgb.load: modelfile cannot be NULL")

-  handle <- xgb.Booster(modelfile = modelfile)
+  handle <- xgb.Booster.handle(modelfile = modelfile)
   # re-use modelfile if it is raw so we do not need to serialize
   if (typeof(modelfile) == "raw") {
     bst <- xgb.handleToBooster(handle, modelfile)
   } else {
     bst <- xgb.handleToBooster(handle, NULL)
   }
-  bst <- xgb.Booster.check(bst, saveraw = TRUE)
+  bst <- xgb.Booster.complete(bst, saveraw = TRUE)
   return(bst)
 }
@@ -3,14 +3,16 @@
 #' Parse a boosted tree model text dump into a \code{data.table} structure.
 #'
 #' @param feature_names character vector of feature names. If the model already
-#' contains feature names, this argument should be \code{NULL} (default value)
+#' contains feature names, those would be used when \code{feature_names=NULL} (default value).
+#' Non-null \code{feature_names} could be provided to override those in the model.
 #' @param model object of class \code{xgb.Booster}
 #' @param text \code{character} vector previously generated by the \code{xgb.dump}
 #' function (where parameter \code{with_stats = TRUE} should have been set).
+#' \code{text} takes precedence over \code{model}.
 #' @param trees an integer vector of tree indices that should be parsed.
 #' If set to \code{NULL}, all trees of the model are parsed.
 #' It could be useful, e.g., in multiclass classification to get only
-#' the trees of one certain class. IMPORTANT: the tree index in xgboost model
+#' the trees of one certain class. IMPORTANT: the tree index in xgboost models
 #' is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).
 #' @param ... currently not used.
 #'
@@ -43,7 +45,9 @@
 #' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
 #'
 #' (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))
-#'
+#' # This bst has feature_names stored in it, so those would be used when
+#' # the feature_names parameter is not provided:
+#' (dt <- xgb.model.dt.tree(model = bst))
 #'
 #' # How to match feature names of splits that are following a current 'Yes' branch:
 #'
@@ -53,11 +57,6 @@
 xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
                               trees = NULL, ...){
   check.deprecation(...)
-  if (!class(feature_names) %in% c("character", "NULL")) {
-    stop("feature_names: Has to be a vector of character\n",
-         " or NULL if the model dump already contains feature names.\n",
-         " Look at this function documentation to see where to get feature names.")
-  }

   if (class(model) != "xgb.Booster" & class(text) != "character") {
     stop("Either 'model' has to be an object of class xgb.Booster\n",
@@ -65,12 +64,19 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
          " (or NULL if the model was provided).")
   }

+  if (is.null(feature_names) && !is.null(model) && !is.null(model$feature_names))
+    feature_names <- model$feature_names
+
+  if (!class(feature_names) %in% c("character", "NULL")) {
+    stop("feature_names: Has to be a character vector")
+  }
+
   if (!class(trees) %in% c("integer", "numeric", "NULL")) {
     stop("trees: Has to be a vector of integers.")
   }

   if (is.null(text)){
-    text <- xgb.dump(model = model, with_stats = T)
+    text <- xgb.dump(model = model, with_stats = TRUE)
   }

   if (length(text) < 2 ||
@@ -126,4 +126,4 @@ xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot
 # Avoid error messages during CRAN check.
 # The reason is that these variables are never declared
 # They are mainly column names inferred by Data.table...
-globalVariables(c("Feature", "ID", "Cover", "Quality", "Split", "Yes", "No", ".", "shape", "filledcolor", "label"))
+globalVariables(c("Feature", "ID", "Cover", "Quality", "Split", "Yes", "No", "Missing", ".", "shape", "filledcolor", "label"))
@@ -1,9 +1,22 @@
 #' Save xgboost model to binary file
 #'
-#' Save xgboost model from xgboost or xgb.train
+#' Save xgboost model to a file in binary format.
 #'
-#' @param model the model object.
-#' @param fname the name of the file to write.
+#' @param model model object of \code{xgb.Booster} class.
+#' @param fname name of the file to write.
+#'
+#' @details
+#' This methods allows to save a model in an xgboost-internal binary format which is universal
+#' among the various xgboost interfaces. In R, the saved model file could be read-in later
+#' using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter
+#' of \code{\link{xgb.train}}.
+#'
+#' Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{readRDS}}
+#' or \code{\link[base]{save}}). However, it would then only be compatible with R, and
+#' corresponding R-methods would need to be used to load it.
+#'
+#' @seealso
+#' \code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}.
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
@@ -22,6 +35,7 @@ xgb.save <- function(model, fname) {
   if (class(model) != "xgb.Booster")
     stop("the input must be xgb.Booster. Use xgb.DMatrix.save to save xgb.DMatrix object.")

+  model <- xgb.Booster.complete(model, saveraw = FALSE)
   .Call("XGBoosterSaveModel_R", model$handle, fname, PACKAGE = "xgboost")
   return(TRUE)
 }
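A minimal sketch of what the added xgb.Booster.complete() call enables: a booster whose handle is nil (e.g., one that was just read back with readRDS) can still be saved in the binary format; the file names are illustrative.

    bst1 <- readRDS("xgb.model.rds")   # handle is nil, but bst1$raw is present
    xgb.save(bst1, "xgboost.model")    # works: the handle is restored internally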
@@ -1,6 +1,7 @@
 #' eXtreme Gradient Boosting Training
 #'
-#' \code{xgb.train} is an advanced interface for training an xgboost model. The \code{xgboost} function provides a simpler interface.
+#' \code{xgb.train} is an advanced interface for training an xgboost model.
+#' The \code{xgboost} function is a simpler wrapper for \code{xgb.train}.
 #'
 #' @param params the list of parameters.
 #' The complete list of parameters is available at \url{http://xgboost.readthedocs.io/en/latest/parameter.html}.
@@ -9,8 +10,7 @@
 #' 1. General Parameters
 #'
 #' \itemize{
-#' \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}
-#' \item \code{silent} 0 means printing running messages, 1 means silent mode. Default: 0
+#' \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}.
 #' }
 #'
 #' 2. Booster Parameters
@@ -54,24 +54,26 @@
 #' \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
 #' }
 #'
-#' @param data input dataset. \code{xgb.train} takes only an \code{xgb.DMatrix} as the input.
-#' \code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or local data file.
-#' @param nrounds the max number of iterations
-#' @param watchlist what information should be printed when \code{verbose=1} or
-#' \code{verbose=2}. Watchlist is used to specify validation set monitoring
-#' during training. For example user can specify
-#' watchlist=list(validation1=mat1, validation2=mat2) to watch
-#' the performance of each round's model on mat1 and mat2
-#'
+#' @param data training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input.
+#' \code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file.
+#' @param nrounds max number of boosting iterations.
+#' @param watchlist named list of xgb.DMatrix datasets to use for evaluating model performance.
+#' Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each
+#' of these datasets during each boosting iteration, and stored in the end as a field named
+#' \code{evaluation_log} in the resulting object. When either \code{verbose>=1} or
+#' \code{\link{cb.print.evaluation}} callback is engaged, the performance results are continuously
+#' printed out during the training.
+#' E.g., specifying \code{watchlist=list(validation1=mat1, validation2=mat2)} allows to track
+#' the performance of each round's model on mat1 and mat2.
 #' @param obj customized objective function. Returns gradient and second order
 #' gradient with given prediction and dtrain.
 #' @param feval custimized evaluation function. Returns
 #' \code{list(metric='metric-name', value='metric-value')} with given
 #' prediction and dtrain.
-#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
-#' information of performance. If 2, xgboost will print some additional information.
-#' Setting \code{verbose > 0} automatically engages the \code{\link{cb.evaluation.log}} and
-#' \code{\link{cb.print.evaluation}} callback functions.
+#' @param verbose If 0, xgboost will stay silent. If 1, it will print information about performance.
+#' If 2, some additional information will be printed out.
+#' Note that setting \code{verbose > 0} automatically engages the
+#' \code{cb.print.evaluation(period=1)} callback function.
 #' @param print_every_n Print each n-th iteration evaluation messages when \code{verbose>0}.
 #' Default is 1 which means all messages are printed. This parameter is passed to the
 #' \code{\link{cb.print.evaluation}} callback.
@@ -106,7 +108,7 @@
 #'
 #' The \code{xgb.train} interface supports advanced features such as \code{watchlist},
 #' customized objective and evaluation metric functions, therefore it is more flexible
-#' than the \code{\link{xgboost}} interface.
+#' than the \code{xgboost} interface.
 #'
 #' Parallelization is automatically enabled if \code{OpenMP} is present.
 #' Number of threads can also be manually specified via \code{nthread} parameter.
@@ -132,7 +134,7 @@
 #' \itemize{
 #' \item \code{cb.print.evaluation} is turned on when \code{verbose > 0};
 #' and the \code{print_every_n} parameter is passed to it.
-#' \item \code{cb.evaluation.log} is on when \code{verbose > 0} and \code{watchlist} is present.
+#' \item \code{cb.evaluation.log} is on when \code{watchlist} is present.
 #' \item \code{cb.early.stop}: when \code{early_stopping_rounds} is set.
 #' \item \code{cb.save.model}: when \code{save_period > 0} is set.
 #' }
@@ -158,6 +160,8 @@
 #' (only available with early stopping).
 #' \item \code{best_score} the best evaluation metric value during early stopping.
 #' (only available with early stopping).
+#' \item \code{feature_names} names of the training dataset features
+#' (only when comun names were defined in training data).
 #' }
 #'
 #' @seealso
@@ -171,7 +175,7 @@
 #'
 #' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
 #' dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
-#' watchlist <- list(eval = dtest, train = dtrain)
+#' watchlist <- list(train = dtrain, eval = dtest)
 #'
 #' ## A simple xgb.train example:
 #' param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2,
@@ -210,17 +214,15 @@
 #'
 #'
 #' ## An xgb.train example of using variable learning rates at each iteration:
-#' param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2)
+#' param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2,
+#' objective = "binary:logistic", eval_metric = "auc")
 #' my_etas <- list(eta = c(0.5, 0.1))
 #' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
 #' callbacks = list(cb.reset.parameters(my_etas)))
 #'
-#'
-#' ## Explicit use of the cb.evaluation.log callback allows to run
-#' ## xgb.train silently but still store the evaluation results:
-#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
-#' verbose = 0, callbacks = list(cb.evaluation.log()))
-#' print(bst$evaluation_log)
+#' ## Early stopping:
+#' bst <- xgb.train(param, dtrain, nrounds = 25, watchlist,
+#' early_stopping_rounds = 3)
 #'
 #' ## An 'xgboost' interface example:
 #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
@@ -259,13 +261,13 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
   # evaluation printing callback
   params <- c(params, list(silent = ifelse(verbose > 1, 0, 1)))
   print_every_n <- max( as.integer(print_every_n), 1L)
-  if (!has.callbacks(callbacks, 'cb.print.evaluation') && verbose) {
+  if (!has.callbacks(callbacks, 'cb.print.evaluation') &&
+      verbose) {
     callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n))
   }
-  # evaluation log callback: it is automatically enabled only when verbose > 0
+  # evaluation log callback: it is automatically enabled when watchlist is provided
   evaluation_log <- list()
-  if (verbose > 0 &&
-      !has.callbacks(callbacks, 'cb.evaluation.log') &&
+  if (!has.callbacks(callbacks, 'cb.evaluation.log') &&
       length(watchlist) > 0) {
     callbacks <- add.cb(callbacks, cb.evaluation.log())
   }
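A sketch of the behaviour change above, reusing param, dtrain and watchlist from the roxygen examples: with a watchlist present, the evaluation log is now kept even for a completely silent run, without adding cb.evaluation.log() by hand.

    bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0)
    print(bst$evaluation_log)   # per-iteration train/eval metrics are still stored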
@@ -288,7 +290,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
   is_update <- NVL(params[['process_type']], '.') == 'update'

   # Construct a booster (either a new one or load from xgb_model)
-  handle <- xgb.Booster(params, append(watchlist, dtrain), xgb_model)
+  handle <- xgb.Booster.handle(params, append(watchlist, dtrain), xgb_model)
   bst <- xgb.handleToBooster(handle)

   # extract parameters that can affect the relationship b/w #trees and #iterations
@@ -332,7 +334,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
   }
   for (f in cb$finalize) f(finalize=TRUE)

-  bst <- xgb.Booster.check(bst, saveraw = TRUE)
+  bst <- xgb.Booster.complete(bst, saveraw = TRUE)

   # store the total number of boosting iterations
   bst$niter = end_iteration
@@ -354,6 +356,8 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
   bst$call <- match.call()
   bst$params <- params
   bst$callbacks <- callbacks
+  if (!is.null(colnames(dtrain)))
+    bst$feature_names <- colnames(dtrain)

   return(bst)
 }
@@ -1,4 +1,4 @@
-# Simple interface for training an xgboost model.
+# Simple interface for training an xgboost model that wraps \code{xgb.train}
 # Its documentation is combined with xgb.train.
 #
 #' @rdname xgb.train
@@ -12,9 +12,7 @@ xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL,

   dtrain <- xgb.get.DMatrix(data, label, missing, weight)

-  watchlist <- list()
-  if (verbose > 0)
-    watchlist$train = dtrain
+  watchlist <- list(train = dtrain)

   bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print_every_n = print_every_n,
                    early_stopping_rounds = early_stopping_rounds, maximize = maximize,
R-package/man/xgb.Booster.complete.Rd (new file, 49 lines)
@@ -0,0 +1,49 @@
|
|||||||
|
% Generated by roxygen2: do not edit by hand
|
||||||
|
% Please edit documentation in R/xgb.Booster.R
|
||||||
|
\name{xgb.Booster.complete}
|
||||||
|
\alias{xgb.Booster.complete}
|
||||||
|
\title{Restore missing parts of an incomplete xgb.Booster object.}
|
||||||
|
\usage{
|
||||||
|
xgb.Booster.complete(object, saveraw = TRUE)
|
||||||
|
}
|
||||||
|
\arguments{
|
||||||
|
\item{object}{object of class \code{xgb.Booster}}
|
||||||
|
|
||||||
|
\item{saveraw}{a flag indicating whether to append \code{raw} Booster memory dump data
|
||||||
|
when it doesn't already exist.}
|
||||||
|
}
|
||||||
|
\value{
|
||||||
|
An object of \code{xgb.Booster} class.
|
||||||
|
}
|
||||||
|
\description{
|
||||||
|
It attempts to complete an \code{xgb.Booster} object by restoring either its missing
|
||||||
|
raw model memory dump (when it has no \code{raw} data but its \code{xgb.Booster.handle} is valid)
|
||||||
|
or its missing internal handle (when its \code{xgb.Booster.handle} is not valid
|
||||||
|
but it has a raw Booster memory dump).
|
||||||
|
}
|
||||||
|
\details{
|
||||||
|
While this method is primarily for internal use, it might be useful in some practical situations.
|
||||||
|
|
||||||
|
E.g., when an \code{xgb.Booster} model is saved as an R object and then is loaded as an R object,
|
||||||
|
its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods
|
||||||
|
should still work for such a model object since those methods would be using
|
||||||
|
\code{xgb.Booster.complete} internally. However, one might find it to be more efficient to call the
|
||||||
|
\code{xgb.Booster.complete} function once after loading a model as an R-object. That which would
|
||||||
|
prevent further reconstruction (potentially, multiple times) of an internal booster model.
|
||||||
|
}
|
||||||
|
\examples{
|
||||||
|
|
||||||
|
data(agaricus.train, package='xgboost')
|
||||||
|
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
|
||||||
|
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
|
||||||
|
saveRDS(bst, "xgb.model.rds")
|
||||||
|
|
||||||
|
bst1 <- readRDS("xgb.model.rds")
|
||||||
|
# the handle is invalid:
|
||||||
|
print(bst1$handle)
|
||||||
|
bst1 <- xgb.Booster.complete(bst1)
|
||||||
|
# now the handle points to a valid internal booster model:
|
||||||
|
print(bst1$handle)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
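A follow-on usage sketch for the help page above (the file name is reused from its example; the efficiency benefit is only hinted at, not measured here):

```r
bst1 <- readRDS("xgb.model.rds")

# completing once rebuilds the internal booster from bst1$raw a single time;
# otherwise each method call on the nil-handle object would redo that work
bst1 <- xgb.Booster.complete(bst1)

data(agaricus.test, package = 'xgboost')
pred <- predict(bst1, agaricus.test$data)
```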
@@ -2,7 +2,7 @@
% Please edit documentation in R/xgb.dump.R
\name{xgb.dump}
\alias{xgb.dump}
-\title{Save xgboost model to text file}
+\title{Dump an xgboost model in text format.}
\usage{
xgb.dump(model = NULL, fname = NULL, fmap = "", with_stats = FALSE,
dump_format = c("text", "json"), ...)
@@ -10,17 +10,18 @@ xgb.dump(model = NULL, fname = NULL, fmap = "", with_stats = FALSE,
\arguments{
\item{model}{the model object.}

-\item{fname}{the name of the text file where to save the model text dump. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.}
+\item{fname}{the name of the text file where to save the model text dump.
+If not provided or set to \code{NULL}, the model is returned as a \code{character} vector.}

-\item{fmap}{feature map file representing the type of feature.
+\item{fmap}{feature map file representing feature types.
Detailed description could be found at
\url{https://github.com/dmlc/xgboost/wiki/Binary-Classification#dump-model}.
See demo/ for walkthrough example in R, and
\url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt}
for example Format.}

-\item{with_stats}{whether dump statistics of splits
-When this option is on, the model dump comes with two additional statistics:
+\item{with_stats}{whether to dump some additional statistics about the splits.
+When this option is on, the model dump contains two additional values:
gain is the approximate loss function gain we get in each split;
cover is the sum of second order gradient in each node.}

@@ -29,10 +30,11 @@ cover is the sum of second order gradient in each node.}
\item{...}{currently not used}
}
\value{
-if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}.
+If fname is not provided or set to \code{NULL} the function will return the model
+as a \code{character} vector. Otherwise it will return \code{TRUE}.
}
\description{
-Save a xgboost model to text file. Could be parsed later.
+Dump an xgboost model in text format.
}
\examples{
data(agaricus.train, package='xgboost')
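A quick sketch of the two return modes documented above (assumes a trained booster `bst`; the output file name is illustrative):

```r
# no fname: the dump comes back as a character vector
dump_lines <- xgb.dump(bst, with_stats = TRUE)
head(dump_lines)

# with fname: the dump is written to disk and TRUE is returned
xgb.dump(bst, fname = "model.dump.txt", with_stats = TRUE)
```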
@@ -2,64 +2,65 @@
% Please edit documentation in R/xgb.importance.R
\name{xgb.importance}
\alias{xgb.importance}
-\title{Show importance of features in a model}
+\title{Importance of features in a model.}
\usage{
xgb.importance(feature_names = NULL, model = NULL, data = NULL,
-label = NULL, target = function(x) ((x + label) == 2))
+label = NULL, target = NULL)
}
\arguments{
-\item{feature_names}{names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
+\item{feature_names}{character vector of feature names. If the model already
+contains feature names, those would be used when \code{feature_names=NULL} (default value).
+Non-null \code{feature_names} could be provided to override those in the model.}

-\item{model}{generated by the \code{xgb.train} function.}
+\item{model}{object of class \code{xgb.Booster}.}

-\item{data}{the dataset used for the training step. Will be used with \code{label} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.}
+\item{data}{deprecated.}

-\item{label}{the label vector used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.}
+\item{label}{deprecated.}

-\item{target}{a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurences in a binary classification. The \code{target} function should have only one parameter. This parameter will be used to provide each important feature vector after having applied the split condition, therefore these vector will be only made of 0 and 1 only, whatever was the information before. More information in \code{Detail} part. This parameter is optional.}
+\item{target}{deprecated.}
}
\value{
-A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
+For a tree model, a \code{data.table} with the following columns:
+\itemize{
+\item \code{Features} names of the features used in the model;
+\item \code{Gain} represents fractional contribution of each feature to the model based on
+the total gain of this feature's splits. Higher percentage means a more important
+predictive feature.
+\item \code{Cover} metric of the number of observations related to this feature;
+\item \code{Frequency} percentage representing the relative number of times
+a feature has been used in trees.
+}
+
+A linear model's importance \code{data.table} has only two columns:
+\itemize{
+\item \code{Features} names of the features used in the model;
+\item \code{Weight} the linear coefficient of this feature.
+}
+
+If you don't provide or \code{model} doesn't have \code{feature_names},
+index of the features will be used instead. Because the index is extracted from the model dump
+(based on C++ code), it starts at 0 (as in C/C++ or Python) instead of 1 (usual in R).
}
\description{
-Create a \code{data.table} of the most important features of a model.
+Creates a \code{data.table} of feature importances in a model.
}
\details{
-This function is for both linear and tree models.
+This function works for both linear and tree models.

-\code{data.table} is returned by the function.
-The columns are:
-\itemize{
-\item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump;
-\item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training (only available for tree models);
-\item \code{Cover} metric of the number of observation related to this feature (only available for tree models);
-\item \code{Weight} percentage representing the relative number of times a feature have been taken into trees.
-}
-
-If you don't provide \code{feature_names}, index of the features will be used instead.
-
-Because the index is extracted from the model dump (made on the C++ side), it starts at 0 (usual in C++) instead of 1 (usual in R).
-
-Co-occurence count
-------------------
-
-The gain gives you indication about the information of how a feature is important in making a branch of a decision tree more pure. However, with this information only, you can't know if this feature has to be present or not to get a specific classification. In the example code, you may wonder if odor=none should be \code{TRUE} to not eat a mushroom.
-
-Co-occurence computation is here to help in understanding this relation between a predictor and a specific class. It will count how many observations are returned as \code{TRUE} by the \code{target} function (see parameters). When you execute the example below, there are 92 times only over the 3140 observations of the train dataset where a mushroom have no odor and can be eaten safely.
-
-If you need to remember only one thing: unless you want to leave us early, don't eat a mushroom which has no odor :-)
+For linear models, the importance is the absolute magnitude of linear coefficients.
+For that reason, in order to obtain a meaningful ranking by importance for a linear model,
+the features need to be on the same scale (which you also would want to do when using either
+L1 or L2 regularization).
}
\examples{

data(agaricus.train, package='xgboost')

bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")

-xgb.importance(colnames(agaricus.train$data), model = bst)
+xgb.importance(model = bst)

-# Same thing with co-occurence computation this time
-xgb.importance(colnames(agaricus.train$data), model = bst,
-data = agaricus.train$data, label = agaricus.train$label)
-
}

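To contrast the two output shapes described in the \value section, a sketch (the gblinear fit is an illustrative assumption, not from the patch):

```r
data(agaricus.train, package = 'xgboost')

bst.tree <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
                    max_depth = 2, eta = 1, nrounds = 2, nthread = 2,
                    objective = "binary:logistic", verbose = 0)
# tree model: per-feature Gain, Cover and Frequency
xgb.importance(model = bst.tree)

bst.lin <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
                   booster = "gblinear", nrounds = 2, nthread = 2,
                   objective = "binary:logistic", verbose = 0)
# linear model: per-feature Weight (the linear coefficient)
xgb.importance(model = bst.lin)
```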
@@ -7,10 +7,22 @@
xgb.load(modelfile)
}
\arguments{
-\item{modelfile}{the name of the binary file.}
+\item{modelfile}{the name of the binary input file.}
+}
+\value{
+An object of \code{xgb.Booster} class.
}
\description{
-Load xgboost model from the binary model file
+Load xgboost model from the binary model file.
+}
+\details{
+The input file is expected to contain a model saved in an xgboost-internal binary format
+using either \code{\link{xgb.save}} or \code{\link{cb.save.model}} in R, or using some
+appropriate methods from other xgboost interfaces. E.g., a model trained in Python and
+saved from there in xgboost format, could be loaded from R.
+
+Note: a model saved as an R-object has to be loaded using corresponding R-methods,
+not \code{xgb.load}.
}
\examples{
data(agaricus.train, package='xgboost')
@@ -23,4 +35,7 @@ xgb.save(bst, 'xgb.model')
bst <- xgb.load('xgb.model')
pred <- predict(bst, test$data)
}
+\seealso{
+\code{\link{xgb.save}}, \code{\link{xgb.Booster.complete}}.
+}

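A sketch separating the two load paths that the new details and note distinguish (file names are illustrative):

```r
# xgboost-internal binary format: written by xgb.save() (or another xgboost interface)
xgb.save(bst, "xgb.model")
bst2 <- xgb.load("xgb.model")

# R serialization: must be read back with R methods, not with xgb.load()
saveRDS(bst, "xgb.model.rds")
bst3 <- readRDS("xgb.model.rds")
```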
@@ -9,17 +9,19 @@ xgb.model.dt.tree(feature_names = NULL, model = NULL, text = NULL,
}
\arguments{
\item{feature_names}{character vector of feature names. If the model already
-contains feature names, this argument should be \code{NULL} (default value)}
+contains feature names, those would be used when \code{feature_names=NULL} (default value).
+Non-null \code{feature_names} could be provided to override those in the model.}

\item{model}{object of class \code{xgb.Booster}}

\item{text}{\code{character} vector previously generated by the \code{xgb.dump}
-function (where parameter \code{with_stats = TRUE} should have been set).}
+function (where parameter \code{with_stats = TRUE} should have been set).
+\code{text} takes precedence over \code{model}.}

\item{trees}{an integer vector of tree indices that should be parsed.
If set to \code{NULL}, all trees of the model are parsed.
It could be useful, e.g., in multiclass classification to get only
-the trees of one certain class. IMPORTANT: the tree index in xgboost model
+the trees of one certain class. IMPORTANT: the tree index in xgboost models
is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).}

\item{...}{currently not used.}
@@ -56,7 +58,9 @@ bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_dep
eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")

(dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))
+# This bst has feature_names stored in it, so those would be used when
+# the feature_names parameter is not provided:
+(dt <- xgb.model.dt.tree(model = bst))

# How to match feature names of splits that are following a current 'Yes' branch:

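A brief sketch of the other input routes mentioned in the updated arguments (reusing the `bst` and data from the example above):

```r
# parse only the first two trees; tree indices are zero-based
xgb.model.dt.tree(model = bst, trees = 0:1)

# parse from a text dump produced with statistics; text takes precedence over model
dump_with_stats <- xgb.dump(bst, with_stats = TRUE)
xgb.model.dt.tree(feature_names = colnames(agaricus.train$data), text = dump_with_stats)
```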
@@ -7,12 +7,22 @@
xgb.save(model, fname)
}
\arguments{
-\item{model}{the model object.}
+\item{model}{model object of \code{xgb.Booster} class.}

-\item{fname}{the name of the file to write.}
+\item{fname}{name of the file to write.}
}
\description{
-Save xgboost model from xgboost or xgb.train
+Save xgboost model to a file in binary format.
+}
+\details{
+This method allows saving a model in an xgboost-internal binary format which is universal
+among the various xgboost interfaces. In R, the saved model file could be read-in later
+using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter
+of \code{\link{xgb.train}}.
+
+Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{saveRDS}}
+or \code{\link[base]{save}}). However, it would then only be compatible with R, and
+corresponding R-methods would need to be used to load it.
}
\examples{
data(agaricus.train, package='xgboost')
@@ -25,4 +35,7 @@ xgb.save(bst, 'xgb.model')
bst <- xgb.load('xgb.model')
pred <- predict(bst, test$data)
}
+\seealso{
+\code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}.
+}

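A sketch of the two read-back routes named in the new details section (the `dtrain` xgb.DMatrix and the parameter list are assumed for illustration):

```r
xgb.save(bst, "xgb.model")

# read the binary file back directly
bst2 <- xgb.load("xgb.model")

# or continue boosting from it via the xgb_model parameter of xgb.train
bst3 <- xgb.train(params = list(objective = "binary:logistic", max_depth = 2),
                  data = dtrain, nrounds = 2, xgb_model = "xgb.model")
```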
@@ -23,8 +23,7 @@ xgboost(data = NULL, label = NULL, missing = NA, weight = NULL,
1. General Parameters

\itemize{
-\item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}
-\item \code{silent} 0 means printing running messages, 1 means silent mode. Default: 0
+\item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}.
}

2. Booster Parameters
@@ -68,16 +67,19 @@ xgboost(data = NULL, label = NULL, missing = NA, weight = NULL,
\item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
}}

-\item{data}{input dataset. \code{xgb.train} takes only an \code{xgb.DMatrix} as the input.
-\code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or local data file.}
+\item{data}{training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input.
+\code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file.}

-\item{nrounds}{the max number of iterations}
+\item{nrounds}{max number of boosting iterations.}

-\item{watchlist}{what information should be printed when \code{verbose=1} or
-\code{verbose=2}. Watchlist is used to specify validation set monitoring
-during training. For example user can specify
-watchlist=list(validation1=mat1, validation2=mat2) to watch
-the performance of each round's model on mat1 and mat2}
+\item{watchlist}{named list of xgb.DMatrix datasets to use for evaluating model performance.
+Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each
+of these datasets during each boosting iteration, and stored in the end as a field named
+\code{evaluation_log} in the resulting object. When either \code{verbose>=1} or
+\code{\link{cb.print.evaluation}} callback is engaged, the performance results are continuously
+printed out during the training.
+E.g., specifying \code{watchlist=list(validation1=mat1, validation2=mat2)} allows tracking
+the performance of each round's model on mat1 and mat2.}

\item{obj}{customized objective function. Returns gradient and second order
gradient with given prediction and dtrain.}
@@ -86,10 +88,10 @@ gradient with given prediction and dtrain.}
\code{list(metric='metric-name', value='metric-value')} with given
prediction and dtrain.}

-\item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print
-information of performance. If 2, xgboost will print some additional information.
-Setting \code{verbose > 0} automatically engages the \code{\link{cb.evaluation.log}} and
-\code{\link{cb.print.evaluation}} callback functions.}
+\item{verbose}{If 0, xgboost will stay silent. If 1, it will print information about performance.
+If 2, some additional information will be printed out.
+Note that setting \code{verbose > 0} automatically engages the
+\code{cb.print.evaluation(period=1)} callback function.}

\item{print_every_n}{Print each n-th iteration evaluation messages when \code{verbose>0}.
Default is 1 which means all messages are printed. This parameter is passed to the
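To illustrate the reworded watchlist/verbose semantics, a sketch (it assumes the agaricus demo data is loaded, as in the examples further down):

```r
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest  <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)

# metrics for every watchlist entry are computed each round; with verbose = 0
# nothing is printed, but the results are still stored in evaluation_log
bst <- xgb.train(list(objective = "binary:logistic", max_depth = 2, eta = 1, nthread = 2),
                 dtrain, nrounds = 2,
                 watchlist = list(train = dtrain, eval = dtest), verbose = 0)
print(bst$evaluation_log)
```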
@@ -151,17 +153,20 @@ An object of class \code{xgb.Booster} with the following elements:
(only available with early stopping).
\item \code{best_score} the best evaluation metric value during early stopping.
(only available with early stopping).
+\item \code{feature_names} names of the training dataset features
+(only when column names were defined in training data).
}
}
\description{
-\code{xgb.train} is an advanced interface for training an xgboost model. The \code{xgboost} function provides a simpler interface.
+\code{xgb.train} is an advanced interface for training an xgboost model.
+The \code{xgboost} function is a simpler wrapper for \code{xgb.train}.
}
\details{
These are the training functions for \code{xgboost}.

The \code{xgb.train} interface supports advanced features such as \code{watchlist},
customized objective and evaluation metric functions, therefore it is more flexible
-than the \code{\link{xgboost}} interface.
+than the \code{xgboost} interface.

Parallelization is automatically enabled if \code{OpenMP} is present.
Number of threads can also be manually specified via \code{nthread} parameter.
@@ -187,7 +192,7 @@ The following callbacks are automatically created when certain parameters are se
\itemize{
\item \code{cb.print.evaluation} is turned on when \code{verbose > 0};
and the \code{print_every_n} parameter is passed to it.
-\item \code{cb.evaluation.log} is on when \code{verbose > 0} and \code{watchlist} is present.
+\item \code{cb.evaluation.log} is on when \code{watchlist} is present.
\item \code{cb.early.stop}: when \code{early_stopping_rounds} is set.
\item \code{cb.save.model}: when \code{save_period > 0} is set.
}
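To make the callback mapping concrete, a sketch that triggers two of them through plain parameters (values mirror the examples and tests elsewhere in this change; `param`, `dtrain` and `watchlist` are assumed):

```r
# save_period/save_name engage cb.save.model: here, one model file per iteration
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
                 save_period = 1, save_name = "xgboost_%02d.model")

# early_stopping_rounds engages cb.early.stop
bst <- xgb.train(param, dtrain, nrounds = 25, watchlist,
                 early_stopping_rounds = 3)
```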
@@ -198,7 +203,7 @@ data(agaricus.test, package='xgboost')

dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
-watchlist <- list(eval = dtest, train = dtrain)
+watchlist <- list(train = dtrain, eval = dtest)

## A simple xgb.train example:
param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2,
@@ -237,17 +242,15 @@ bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,


## An xgb.train example of using variable learning rates at each iteration:
-param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2)
+param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2,
+objective = "binary:logistic", eval_metric = "auc")
my_etas <- list(eta = c(0.5, 0.1))
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
callbacks = list(cb.reset.parameters(my_etas)))

-## Explicit use of the cb.evaluation.log callback allows to run
-## xgb.train silently but still store the evaluation results:
-bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
-verbose = 0, callbacks = list(cb.evaluation.log()))
-print(bst$evaluation_log)
+## Early stopping:
+bst <- xgb.train(param, dtrain, nrounds = 25, watchlist,
+early_stopping_rounds = 3)

## An 'xgboost' interface example:
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
@@ -8,7 +8,9 @@ train <- agaricus.train
test <- agaricus.test
set.seed(1994)

-windows_flag = grepl('Windows', Sys.info()[['sysname']])
+# disable some tests for Win32
+windows_flag = .Platform$OS.type == "windows" &&
+.Machine$sizeof.pointer != 8

test_that("train and predict binary classification", {
nrounds = 2
@@ -109,7 +111,7 @@ test_that("train and predict RF with softprob", {
set.seed(11)
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
max_depth = 3, eta = 0.9, nthread = 2, nrounds = nrounds,
-objective = "multi:softprob", num_class=3,
+objective = "multi:softprob", num_class=3, verbose = 0,
num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5)
expect_equal(bst$niter, 15)
expect_equal(xgb.ntree(bst), 15*3*4)
@@ -144,25 +146,25 @@ test_that("training continuation works", {

# for the reference, use 4 iterations at once:
set.seed(11)
-bst <- xgb.train(param, dtrain, nrounds = 4, watchlist)
+bst <- xgb.train(param, dtrain, nrounds = 4, watchlist, verbose = 0)
# first two iterations:
set.seed(11)
-bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist)
+bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0)
# continue for two more:
-bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, xgb_model = bst1)
+bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1)
if (!windows_flag)
expect_equal(bst$raw, bst2$raw)
expect_false(is.null(bst2$evaluation_log))
expect_equal(dim(bst2$evaluation_log), c(4, 2))
expect_equal(bst2$evaluation_log, bst$evaluation_log)
# test continuing from raw model data
-bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, xgb_model = bst1$raw)
+bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1$raw)
if (!windows_flag)
expect_equal(bst$raw, bst2$raw)
expect_equal(dim(bst2$evaluation_log), c(2, 2))
# test continuing from a model in file
xgb.save(bst1, "xgboost.model")
-bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, xgb_model = "xgboost.model")
+bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = "xgboost.model")
if (!windows_flag)
expect_equal(bst$raw, bst2$raw)
expect_equal(dim(bst2$evaluation_log), c(2, 2))
@@ -171,9 +173,11 @@ test_that("training continuation works", {

test_that("xgb.cv works", {
set.seed(11)
+expect_output(
cv <- xgb.cv(data = train$data, label = train$label, max_depth = 2, nfold = 5,
eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic",
verbose=TRUE)
+, "train-error:")
expect_is(cv, 'xgb.cv.synchronous')
expect_false(is.null(cv$evaluation_log))
expect_lt(cv$evaluation_log[, min(test_error_mean)], 0.03)
@@ -107,18 +107,27 @@ test_that("cb.evaluation.log works as expected", {

param <- list(objective = "binary:logistic", max_depth = 4, nthread = 2)

+test_that("can store evaluation_log without printing", {
+expect_silent(
+bst <- xgb.train(param, dtrain, nrounds = 10, watchlist, eta = 1, verbose = 0)
+)
+expect_false(is.null(bst$evaluation_log))
+expect_false(is.null(bst$evaluation_log$train_error))
+expect_lt(bst$evaluation_log[, min(train_error)], 0.2)
+})
+
test_that("cb.reset.parameters works as expected", {

# fixed eta
set.seed(111)
-bst0 <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 0.9)
+bst0 <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 0.9, verbose = 0)
expect_false(is.null(bst0$evaluation_log))
expect_false(is.null(bst0$evaluation_log$train_error))

# same eta but re-set as a vector parameter in the callback
set.seed(111)
my_par <- list(eta = c(0.9, 0.9))
-bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist,
+bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
callbacks = list(cb.reset.parameters(my_par)))
expect_false(is.null(bst1$evaluation_log$train_error))
expect_equal(bst0$evaluation_log$train_error,
@@ -127,7 +136,7 @@ test_that("cb.reset.parameters works as expected", {
# same eta but re-set via a function in the callback
set.seed(111)
my_par <- list(eta = function(itr, itr_end) 0.9)
-bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist,
+bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
callbacks = list(cb.reset.parameters(my_par)))
expect_false(is.null(bst2$evaluation_log$train_error))
expect_equal(bst0$evaluation_log$train_error,
@@ -136,7 +145,7 @@ test_that("cb.reset.parameters works as expected", {
# different eta re-set as a vector parameter in the callback
set.seed(111)
my_par <- list(eta = c(0.6, 0.5))
-bst3 <- xgb.train(param, dtrain, nrounds = 2, watchlist,
+bst3 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
callbacks = list(cb.reset.parameters(my_par)))
expect_false(is.null(bst3$evaluation_log$train_error))
expect_false(all(bst0$evaluation_log$train_error == bst3$evaluation_log$train_error))
@@ -144,18 +153,18 @@ test_that("cb.reset.parameters works as expected", {
# resetting multiple parameters at the same time runs with no error
my_par <- list(eta = c(1., 0.5), gamma = c(1, 2), max_depth = c(4, 8))
expect_error(
-bst4 <- xgb.train(param, dtrain, nrounds = 2, watchlist,
+bst4 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
callbacks = list(cb.reset.parameters(my_par)))
, NA) # NA = no error
# CV works as well
expect_error(
-bst4 <- xgb.cv(param, dtrain, nfold = 2, nrounds = 2,
+bst4 <- xgb.cv(param, dtrain, nfold = 2, nrounds = 2, verbose = 0,
callbacks = list(cb.reset.parameters(my_par)))
, NA) # NA = no error

# expect no learning with 0 learning rate
my_par <- list(eta = c(0., 0.))
-bstX <- xgb.train(param, dtrain, nrounds = 2, watchlist,
+bstX <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
callbacks = list(cb.reset.parameters(my_par)))
expect_false(is.null(bstX$evaluation_log$train_error))
er <- unique(bstX$evaluation_log$train_error)
@@ -167,7 +176,7 @@ test_that("cb.save.model works as expected", {
files <- c('xgboost_01.model', 'xgboost_02.model', 'xgboost.model')
for (f in files) if (file.exists(f)) file.remove(f)

-bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1,
+bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, verbose = 0,
save_period = 1, save_name = "xgboost_%02d.model")
expect_true(file.exists('xgboost_01.model'))
expect_true(file.exists('xgboost_02.model'))
@@ -178,7 +187,8 @@ test_that("cb.save.model works as expected", {
expect_equal(bst$raw, b2$raw)

# save_period = 0 saves the last iteration's model
-bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, save_period = 0)
+bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, verbose = 0,
+save_period = 0)
expect_true(file.exists('xgboost.model'))
b2 <- xgb.load('xgboost.model')
expect_equal(bst$raw, b2$raw)
@@ -186,16 +196,6 @@ test_that("cb.save.model works as expected", {
for (f in files) if (file.exists(f)) file.remove(f)
})

-test_that("can store evaluation_log without printing", {
-expect_silent(
-bst <- xgb.train(param, dtrain, nrounds = 10, watchlist, eta = 1,
-verbose = 0, callbacks = list(cb.evaluation.log()))
-)
-expect_false(is.null(bst$evaluation_log))
-expect_false(is.null(bst$evaluation_log$train_error))
-expect_lt(bst$evaluation_log[, min(train_error)], 0.2)
-})
-
test_that("early stopping xgb.train works", {
set.seed(11)
expect_output(
@@ -211,6 +211,13 @@ test_that("early stopping xgb.train works", {
err_pred <- err(ltest, pred)
err_log <- bst$evaluation_log[bst$best_iteration, test_error]
expect_equal(err_log, err_pred, tolerance = 5e-6)
+
+set.seed(11)
+expect_silent(
+bst0 <- xgb.train(param, dtrain, nrounds = 20, watchlist, eta = 0.3,
+early_stopping_rounds = 3, maximize = FALSE, verbose = 0)
+)
+expect_equal(bst$evaluation_log, bst0$evaluation_log)
})

test_that("early stopping using a specific metric works", {
@@ -248,7 +255,7 @@ test_that("early stopping xgb.cv works", {
test_that("prediction in xgb.cv works", {
set.seed(11)
nrounds = 4
-cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE)
+cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE, verbose = 0)
expect_false(is.null(cv$evaluation_log))
expect_false(is.null(cv$pred))
expect_length(cv$pred, nrow(train$data))
@@ -258,7 +265,7 @@ test_that("prediction in xgb.cv works", {

# save CV models
set.seed(11)
-cvx <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE,
+cvx <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE, verbose = 0,
callbacks = list(cb.cv.predict(save_models = TRUE)))
expect_equal(cv$evaluation_log, cvx$evaluation_log)
expect_length(cvx$models, 5)
@@ -268,7 +275,7 @@ test_that("prediction in xgb.cv works", {
test_that("prediction in xgb.cv works for gblinear too", {
set.seed(11)
p <- list(booster = 'gblinear', objective = "reg:logistic", nthread = 2)
-cv <- xgb.cv(p, dtrain, nfold = 5, eta = 0.5, nrounds = 2, prediction = TRUE)
+cv <- xgb.cv(p, dtrain, nfold = 5, eta = 0.5, nrounds = 2, prediction = TRUE, verbose = 0)
expect_false(is.null(cv$evaluation_log))
expect_false(is.null(cv$pred))
expect_length(cv$pred, nrow(train$data))
@@ -300,7 +307,7 @@ test_that("prediction in xgb.cv for softprob works", {
expect_warning(
cv <- xgb.cv(data = as.matrix(iris[, -5]), label = lb, nfold = 4,
eta = 0.5, nrounds = 5, max_depth = 3, nthread = 2,
-subsample = 0.8, gamma = 2,
+subsample = 0.8, gamma = 2, verbose = 0,
prediction = TRUE, objective = "multi:softprob", num_class = 3)
, NA)
expect_false(is.null(cv$pred))
@@ -21,6 +21,15 @@ test_that("xgb.DMatrix: basic construction, saving, loading", {
dtest3 <- xgb.DMatrix(tmp_file)
unlink(tmp_file)
expect_equal(getinfo(dtest1, 'label'), getinfo(dtest3, 'label'))
+
+# from a libsvm text file
+tmp <- c("0 1:1 2:1","1 3:1","0 1:1")
+tmp_file <- 'tmp.libsvm'
+writeLines(tmp, tmp_file)
+dtest4 <- xgb.DMatrix(tmp_file)
+expect_equal(dim(dtest4), c(3, 4))
+expect_equal(getinfo(dtest4, 'label'), c(0,1,0))
+unlink(tmp_file)
})

test_that("xgb.DMatrix: getinfo & setinfo", {
@@ -3,7 +3,7 @@ context('Test helper functions')
require(xgboost)
require(data.table)
require(Matrix)
-require(vcd)
+require(vcd, quietly = TRUE)

set.seed(1982)
data(Arthritis)
@@ -15,10 +15,12 @@ sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
label <- df[, ifelse(Improved == "Marked", 1, 0)]

bst.Tree <- xgboost(data = sparse_matrix, label = label, max_depth = 9,
-eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic", booster = "gbtree")
+eta = 1, nthread = 2, nrounds = 10, verbose = 0,
+objective = "binary:logistic", booster = "gbtree")

bst.GLM <- xgboost(data = sparse_matrix, label = label,
-eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic", booster = "gblinear")
+eta = 1, nthread = 2, nrounds = 10, verbose = 0,
+objective = "binary:logistic", booster = "gblinear")

feature.names <- colnames(sparse_matrix)

@@ -100,12 +102,37 @@ if (grepl('Windows', Sys.info()[['sysname']]) || grepl('Linux', Sys.info()[['sys
})
}

+test_that("xgb.Booster serializing as R object works", {
+saveRDS(bst.Tree, 'xgb.model.rds')
+bst <- readRDS('xgb.model.rds')
+dtrain <- xgb.DMatrix(sparse_matrix, label = label)
+expect_equal(predict(bst.Tree, dtrain), predict(bst, dtrain))
+expect_equal(xgb.dump(bst.Tree), xgb.dump(bst))
+xgb.save(bst, 'xgb.model')
+nil_ptr <- new("externalptr")
+class(nil_ptr) <- "xgb.Booster.handle"
+expect_true(identical(bst$handle, nil_ptr))
+bst <- xgb.Booster.complete(bst)
+expect_true(!identical(bst$handle, nil_ptr))
+expect_equal(predict(bst.Tree, dtrain), predict(bst, dtrain))
+})
+
test_that("xgb.model.dt.tree works with and without feature names", {
names.dt.trees <- c("Tree", "Node", "ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover")
dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst.Tree)
expect_equal(names.dt.trees, names(dt.tree))
expect_equal(dim(dt.tree), c(162, 10))
-expect_output(str(xgb.model.dt.tree(model = bst.Tree)), 'Feature.*\\"3\\"')
+expect_output(str(dt.tree), 'Feature.*\\"Age\\"')
+
+dt.tree.0 <- xgb.model.dt.tree(model = bst.Tree)
+expect_equal(dt.tree, dt.tree.0)
+
+# when model contains no feature names:
+bst.Tree.x <- bst.Tree
+bst.Tree.x$feature_names <- NULL
+dt.tree.x <- xgb.model.dt.tree(model = bst.Tree.x)
+expect_output(str(dt.tree.x), 'Feature.*\\"3\\"')
+expect_equal(dt.tree[, -4, with=FALSE], dt.tree.x[, -4, with=FALSE])
})

test_that("xgb.model.dt.tree throws error for gblinear", {
@@ -116,7 +143,17 @@ test_that("xgb.importance works with and without feature names", {
importance.Tree <- xgb.importance(feature_names = feature.names, model = bst.Tree)
expect_equal(dim(importance.Tree), c(7, 4))
expect_equal(colnames(importance.Tree), c("Feature", "Gain", "Cover", "Frequency"))
-expect_output(str(xgb.importance(model = bst.Tree)), 'Feature.*\\"3\\"')
+expect_output(str(importance.Tree), 'Feature.*\\"Age\\"')
+
+importance.Tree.0 <- xgb.importance(model = bst.Tree)
+expect_equal(importance.Tree, importance.Tree.0)
+
+# when model contains no feature names:
+bst.Tree.x <- bst.Tree
+bst.Tree.x$feature_names <- NULL
+importance.Tree.x <- xgb.importance(model = bst.Tree)
+expect_equal(importance.Tree[, -1, with=FALSE], importance.Tree.x[, -1, with=FALSE])
+
imp2plot <- xgb.plot.importance(importance_matrix = importance.Tree)
expect_equal(colnames(imp2plot), c("Feature", "Gain", "Cover", "Frequency", "Importance"))
xgb.ggplot.importance(importance_matrix = importance.Tree)
@@ -10,7 +10,7 @@ train = matrix(x, ncol = 1)

test_that("monotone constraints for regression", {
bst = xgboost(data = train, label = y, max_depth = 2,
-eta = 0.1, nthread = 2, nrounds = 100,
+eta = 0.1, nthread = 2, nrounds = 100, verbose = 0,
monotone_constraints = -1)

pred = predict(bst, train)
@@ -9,24 +9,23 @@ dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)

test_that("updating the model works", {
watchlist = list(train = dtrain, test = dtest)
-cb = list(cb.evaluation.log()) # to run silent, but store eval. log

# no-subsampling
p1 <- list(objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2)
set.seed(11)
-bst1 <- xgb.train(p1, dtrain, nrounds = 10, watchlist, verbose = 0, callbacks = cb)
+bst1 <- xgb.train(p1, dtrain, nrounds = 10, watchlist, verbose = 0)
tr1 <- xgb.model.dt.tree(model = bst1)

# with subsampling
p2 <- modifyList(p1, list(subsample = 0.1))
set.seed(11)
-bst2 <- xgb.train(p2, dtrain, nrounds = 10, watchlist, verbose = 0, callbacks = cb)
+bst2 <- xgb.train(p2, dtrain, nrounds = 10, watchlist, verbose = 0)
tr2 <- xgb.model.dt.tree(model = bst2)

# the same no-subsampling boosting with an extra 'refresh' updater:
p1r <- modifyList(p1, list(updater = 'grow_colmaker,prune,refresh', refresh_leaf = FALSE))
set.seed(11)
-bst1r <- xgb.train(p1r, dtrain, nrounds = 10, watchlist, verbose = 0, callbacks = cb)
+bst1r <- xgb.train(p1r, dtrain, nrounds = 10, watchlist, verbose = 0)
tr1r <- xgb.model.dt.tree(model = bst1r)
# all should be the same when no subsampling
expect_equal(bst1$evaluation_log, bst1r$evaluation_log)
@@ -35,7 +34,7 @@ test_that("updating the model works", {
# the same boosting with subsampling with an extra 'refresh' updater:
p2r <- modifyList(p2, list(updater = 'grow_colmaker,prune,refresh', refresh_leaf = FALSE))
set.seed(11)
-bst2r <- xgb.train(p2r, dtrain, nrounds = 10, watchlist, verbose = 0, callbacks = cb)
+bst2r <- xgb.train(p2r, dtrain, nrounds = 10, watchlist, verbose = 0)
tr2r <- xgb.model.dt.tree(model = bst2r)
# should be the same evaluation but different gains and larger cover
expect_equal(bst2$evaluation_log, bst2r$evaluation_log)
@@ -45,7 +44,7 @@ test_that("updating the model works", {

# process type 'update' for no-subsampling model, refreshing the tree stats AND leaves from training data:
p1u <- modifyList(p1, list(process_type = 'update', updater = 'refresh', refresh_leaf = TRUE))
-bst1u <- xgb.train(p1u, dtrain, nrounds = 10, watchlist, verbose = 0, callbacks = cb, xgb_model = bst1)
+bst1u <- xgb.train(p1u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = bst1)
tr1u <- xgb.model.dt.tree(model = bst1u)
# all should be the same when no subsampling
expect_equal(bst1$evaluation_log, bst1u$evaluation_log)
@@ -53,7 +52,7 @@ test_that("updating the model works", {

# process type 'update' for model with subsampling, refreshing only the tree stats from training data:
p2u <- modifyList(p2, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE))
-bst2u <- xgb.train(p2u, dtrain, nrounds = 10, watchlist, verbose = 0, callbacks = cb, xgb_model = bst2)
+bst2u <- xgb.train(p2u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = bst2)
tr2u <- xgb.model.dt.tree(model = bst2u)
# should be the same evaluation but different gains and larger cover
expect_equal(bst2$evaluation_log, bst2u$evaluation_log)
@@ -66,7 +65,7 @@ test_that("updating the model works", {

# process type 'update' for no-subsampling model, refreshing only the tree stats from TEST data:
p1ut <- modifyList(p1, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE))
-bst1ut <- xgb.train(p1ut, dtest, nrounds = 10, watchlist, verbose = 0, callbacks = cb, xgb_model = bst1)
+bst1ut <- xgb.train(p1ut, dtest, nrounds = 10, watchlist, verbose = 0, xgb_model = bst1)
tr1ut <- xgb.model.dt.tree(model = bst1ut)
# should be the same evaluations but different gains and smaller cover (test data is smaller)
expect_equal(bst1$evaluation_log, bst1ut$evaluation_log)
47
doc/build.md
47
doc/build.md
@ -189,7 +189,15 @@ There are several ways to install the package:
 
 ## R Package Installation
 
-You can install R package from cran just like other packages, or you can install from our weekly updated drat repo:
+### Installing pre-packaged version
+
+You can install xgboost from CRAN just like any other R package:
+
+```r
+install.packages("xgboost")
+```
+
+Or you can install it from our weekly updated drat repo:
+
 ```r
 install.packages("drat", repos="https://cran.rstudio.com")
@ -197,10 +205,8 @@ drat:::addRepo("dmlc")
 install.packages("xgboost", repos="http://dmlc.ml/drat/", type = "source")
 ```
 
-If you would like to use the latest xgboost version and already compiled xgboost, use `library(devtools); install('xgboost/R-package')` to install manually xgboost package (change the path accordingly to where you compiled xgboost).
-
-For OSX users, single threaded version will be installed, to install multi-threaded version.
-First follow [Building on OSX](#building-on-osx) to get the OpenMP enabled compiler, then:
+For OSX users, single threaded version will be installed. To install multi-threaded version,
+first follow [Building on OSX](#building-on-osx) to get the OpenMP enabled compiler, then:
 
 - Set the `Makevars` file in highest piority for R.
 
@ -214,24 +220,35 @@ First follow [Building on OSX](#building-on-osx) to get the OpenMP enabled compi
 install.packages("xgboost", repos="http://dmlc.ml/drat/", type = "source")
 ```
 
-Due to the usage of submodule, `install_github` is no longer support to install the
-latest version of R package. To install the latest version run the following bash script,
+### Installing the development version
+
+Make sure you have installed git and a recent C++ compiler supporting C++11 (e.g., g++-4.6 or higher).
+On Windows, Rtools must be installed, and its bin directory has to be added to PATH during the installation.
+And see the previous subsection for an OSX tip.
+
+Due to the use of git-submodules, `devtools::install_github` can no longer be used to install the latest version of R package.
+Thus, one has to run git to check out the code first:
 
 ```bash
 git clone --recursive https://github.com/dmlc/xgboost
 cd xgboost
 git submodule init
 git submodule update
-alias make='mingw32-make'
-cd dmlc-core
-make -j4
-cd ../rabit
-make lib/librabit_empty.a -j4
-cd ..
-cp make/mingw64.mk config.mk
-make -j4
+cd R-package
+R CMD INSTALL .
 ```
 
+If the last line fails because of "R: command not found", it means that R was not set up to run from command line.
+In this case, just start R as you would normally do and run the following:
+
+```r
+setwd('wherever/you/cloned/it/xgboost/R-package/')
+install.packages('.', repos = NULL, type="source")
+```
+
+If all fails, try [building the shared library](#build-the-shared-library) to see whether a problem is specific to R package or not.
+
 
 ## Trouble Shooting
 
 1. **Compile failed after `git pull`**
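Not part of the diff above: once the package is installed by any of the routes described in build.md, a quick smoke test such as the following can confirm that the installation works (and, via `nthread`, whether a multi-threaded build is active). The dataset and parameter choices are illustrative assumptions, not taken from build.md.

```r
library(xgboost)

# Train a tiny model on the demo data shipped with the package.
data(agaricus.train, package = 'xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               nrounds = 2, objective = 'binary:logistic',
               nthread = 2)  # >1 only has an effect with an OpenMP-enabled build

# A working install trains two rounds and produces predictions without error.
head(predict(bst, agaricus.train$data))
```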