[R] maintenance Apr 2017 (#2237)

* [R] make sure things work for a single split model; fixes #2191

* [R] add option use_int_id to xgb.model.dt.tree

* [R] add example of exporting tree plot to a file

* [R] set save_period = NULL as default in xgboost() to be the same as in xgb.train; fixes #2182

* [R] it's a good practice after CRAN releases to bump up package version in dev

* [R] allow xgb.DMatrix construction from integer dense matrices

* [R] xgb.DMatrix: silent parameter; improve documentation

* [R] xgb.model.dt.tree code style changes

* [R] update NEWS with parameter changes

* [R] code safety & style; handle non-strict matrix and inherited classes of input and model; fixes #2242

* [R] change to x.y.z.p R-package versioning scheme and set version to 0.6.4.3

* [R] add an R package versioning section to the contributors guide

* [R] R-package/README.md: clean up the redundant old installation instructions, link the contributors guide
This commit is contained in:
Vadim Khotilovich 2017-05-02 00:51:34 -05:00 committed by Tong He
parent d769b6bcb5
commit a375ad2822
29 changed files with 351 additions and 246 deletions

View File

@ -12,6 +12,11 @@ This file records the changes in xgboost library in reverse chronological order.
- Thread local variable is upgraded so it is automatically freed at thread exit. - Thread local variable is upgraded so it is automatically freed at thread exit.
* Migrate to C++11 * Migrate to C++11
- The current master version now requires C++11 enabled compiled(g++4.8 or higher) - The current master version now requires C++11 enabled compiled(g++4.8 or higher)
* R package:
- New parameters:
- `silent` in `xgb.DMatrix()`
- `use_int_id` in `xgb.model.dt.tree()`
- Default value of the `save_period` parameter in `xgboost()` changed to NULL (consistent with `xgb.train()`).
## v0.6 (2016.07.29) ## v0.6 (2016.07.29)
* Version 0.5 is skipped due to major improvements in the core * Version 0.5 is skipped due to major improvements in the core

View File

@ -1,7 +1,7 @@
Package: xgboost Package: xgboost
Type: Package Type: Package
Title: Extreme Gradient Boosting Title: Extreme Gradient Boosting
Version: 0.6-4 Version: 0.6.4.3
Date: 2017-01-04 Date: 2017-01-04
Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>, Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>,
Michael Benesty <michael@benesty.fr>, Vadim Khotilovich <khotilovich@gmail.com>, Michael Benesty <michael@benesty.fr>, Vadim Khotilovich <khotilovich@gmail.com>,

View File

@ -57,7 +57,7 @@ NULL
#' \code{\link{callbacks}} #' \code{\link{callbacks}}
#' #'
#' @export #' @export
cb.print.evaluation <- function(period=1, showsd=TRUE) { cb.print.evaluation <- function(period = 1, showsd = TRUE) {
callback <- function(env = parent.frame()) { callback <- function(env = parent.frame()) {
if (length(env$bst_evaluation) == 0 || if (length(env$bst_evaluation) == 0 ||
@ -132,7 +132,7 @@ cb.evaluation.log <- function() {
cnames <- numeric(len) cnames <- numeric(len)
cnames[c(TRUE, FALSE)] <- means cnames[c(TRUE, FALSE)] <- means
cnames[c(FALSE, TRUE)] <- stds cnames[c(FALSE, TRUE)] <- stds
env$evaluation_log <- env$evaluation_log[, c('iter', cnames), with=FALSE] env$evaluation_log <- env$evaluation_log[, c('iter', cnames), with = FALSE]
} }
} }
@ -290,8 +290,8 @@ cb.reset.parameters <- function(new_params) {
#' \code{\link{xgb.attr}} #' \code{\link{xgb.attr}}
#' #'
#' @export #' @export
cb.early.stop <- function(stopping_rounds, maximize=FALSE, cb.early.stop <- function(stopping_rounds, maximize = FALSE,
metric_name=NULL, verbose=TRUE) { metric_name = NULL, verbose = TRUE) {
# state variables # state variables
best_iteration <- -1 best_iteration <- -1
best_ntreelimit <- -1 best_ntreelimit <- -1
@ -308,7 +308,7 @@ cb.early.stop <- function(stopping_rounds, maximize=FALSE,
metric_idx <<- which(gsub('-', '_', metric_name) == eval_names) metric_idx <<- which(gsub('-', '_', metric_name) == eval_names)
if (length(metric_idx) == 0) if (length(metric_idx) == 0)
stop("'metric_name' for early stopping is not one of the following:\n", stop("'metric_name' for early stopping is not one of the following:\n",
paste(eval_names, collapse=' '), '\n') paste(eval_names, collapse = ' '), '\n')
} }
if (is.null(metric_name) && if (is.null(metric_name) &&
length(env$bst_evaluation) > 1) { length(env$bst_evaluation) > 1) {
@ -334,7 +334,7 @@ cb.early.stop <- function(stopping_rounds, maximize=FALSE,
env$stop_condition <- FALSE env$stop_condition <- FALSE
if (!is.null(env$bst)) { if (!is.null(env$bst)) {
if (class(env$bst) != 'xgb.Booster') if (!inherits(env$bst, 'xgb.Booster'))
stop("'bst' in the parent frame must be an 'xgb.Booster'") stop("'bst' in the parent frame must be an 'xgb.Booster'")
if (!is.null(best_score <- xgb.attr(env$bst$handle, 'best_score'))) { if (!is.null(best_score <- xgb.attr(env$bst$handle, 'best_score'))) {
best_score <<- as.numeric(best_score) best_score <<- as.numeric(best_score)
@ -529,7 +529,7 @@ cb.cv.predict <- function(save_models = FALSE) {
# #
# Format the evaluation metric string # Format the evaluation metric string
format.eval.string <- function(iter, eval_res, eval_err=NULL) { format.eval.string <- function(iter, eval_res, eval_err = NULL) {
if (length(eval_res) == 0) if (length(eval_res) == 0)
stop('no evaluation results') stop('no evaluation results')
enames <- names(eval_res) enames <- names(eval_res)
@ -539,9 +539,9 @@ format.eval.string <- function(iter, eval_res, eval_err=NULL) {
if (!is.null(eval_err)) { if (!is.null(eval_err)) {
if (length(eval_res) != length(eval_err)) if (length(eval_res) != length(eval_err))
stop('eval_res & eval_err lengths mismatch') stop('eval_res & eval_err lengths mismatch')
res <- paste0(sprintf("%s:%f+%f", enames, eval_res, eval_err), collapse='\t') res <- paste0(sprintf("%s:%f+%f", enames, eval_res, eval_err), collapse = '\t')
} else { } else {
res <- paste0(sprintf("%s:%f", enames, eval_res), collapse='\t') res <- paste0(sprintf("%s:%f", enames, eval_res), collapse = '\t')
} }
return(paste0(iter, res)) return(paste0(iter, res))
} }

View File

@ -47,7 +47,7 @@ check.booster.params <- function(params, ...) {
multi_names <- setdiff(names(name_freqs[name_freqs > 1]), 'eval_metric') multi_names <- setdiff(names(name_freqs[name_freqs > 1]), 'eval_metric')
if (length(multi_names) > 0) { if (length(multi_names) > 0) {
warning("The following parameters were provided multiple times:\n\t", warning("The following parameters were provided multiple times:\n\t",
paste(multi_names, collapse=', '), "\n Only the last value for each of them will be used.\n") paste(multi_names, collapse = ', '), "\n Only the last value for each of them will be used.\n")
# While xgboost internals would choose the last value for a multiple-times parameter, # While xgboost internals would choose the last value for a multiple-times parameter,
# enforce it here in R as well (b/c multi-parameters might be used further in R code, # enforce it here in R as well (b/c multi-parameters might be used further in R code,
# and R takes the 1st value when multiple elements with the same name are present in a list). # and R takes the 1st value when multiple elements with the same name are present in a list).
@ -120,22 +120,22 @@ check.custom.eval <- function(env = parent.frame()) {
} }
# Update booster with dtrain for an iteration # Update a booster handle for an iteration with dtrain data
xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) { xgb.iter.update <- function(booster_handle, dtrain, iter, obj = NULL) {
if (class(booster) != "xgb.Booster.handle") { if (!identical(class(booster_handle), "xgb.Booster.handle")) {
stop("first argument type must be xgb.Booster.handle") stop("booster_handle must be of xgb.Booster.handle class")
} }
if (class(dtrain) != "xgb.DMatrix") { if (!inherits(dtrain, "xgb.DMatrix")) {
stop("second argument type must be xgb.DMatrix") stop("dtrain must be of xgb.DMatrix class")
} }
if (is.null(obj)) { if (is.null(obj)) {
.Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain, .Call("XGBoosterUpdateOneIter_R", booster_handle, as.integer(iter), dtrain,
PACKAGE = "xgboost") PACKAGE = "xgboost")
} else { } else {
pred <- predict(booster, dtrain) pred <- predict(booster_handle, dtrain)
gpair <- obj(pred, dtrain) gpair <- obj(pred, dtrain)
.Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess, PACKAGE = "xgboost") .Call("XGBoosterBoostOneIter_R", booster_handle, dtrain, gpair$grad, gpair$hess, PACKAGE = "xgboost")
} }
return(TRUE) return(TRUE)
} }
@ -144,16 +144,16 @@ xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) {
# Evaluate one iteration. # Evaluate one iteration.
# Returns a named vector of evaluation metrics # Returns a named vector of evaluation metrics
# with the names in a 'datasetname-metricname' format. # with the names in a 'datasetname-metricname' format.
xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) { xgb.iter.eval <- function(booster_handle, watchlist, iter, feval = NULL) {
if (class(booster) != "xgb.Booster.handle") if (!identical(class(booster_handle), "xgb.Booster.handle"))
stop("first argument type must be xgb.Booster.handle") stop("class of booster_handle must be xgb.Booster.handle")
if (length(watchlist) == 0) if (length(watchlist) == 0)
return(NULL) return(NULL)
evnames <- names(watchlist) evnames <- names(watchlist)
if (is.null(feval)) { if (is.null(feval)) {
msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, msg <- .Call("XGBoosterEvalOneIter_R", booster_handle, as.integer(iter), watchlist,
as.list(evnames), PACKAGE = "xgboost") as.list(evnames), PACKAGE = "xgboost")
msg <- stri_split_regex(msg, '(\\s+|:|\\s+)')[[1]][-1] msg <- stri_split_regex(msg, '(\\s+|:|\\s+)')[[1]][-1]
res <- as.numeric(msg[c(FALSE,TRUE)]) # even indices are the values res <- as.numeric(msg[c(FALSE,TRUE)]) # even indices are the values
@ -161,7 +161,7 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
} else { } else {
res <- sapply(seq_along(watchlist), function(j) { res <- sapply(seq_along(watchlist), function(j) {
w <- watchlist[[j]] w <- watchlist[[j]]
preds <- predict(booster, w) # predict using all trees preds <- predict(booster_handle, w) # predict using all trees
eval_res <- feval(preds, w) eval_res <- feval(preds, w)
out <- eval_res$value out <- eval_res$value
names(out) <- paste0(evnames[j], "-", eval_res$metric) names(out) <- paste0(evnames[j], "-", eval_res$metric)
@ -180,7 +180,7 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
generate.cv.folds <- function(nfold, nrows, stratified, label, params) { generate.cv.folds <- function(nfold, nrows, stratified, label, params) {
# cannot do it for rank # cannot do it for rank
if (exists('objective', where=params) && if (exists('objective', where = params) &&
is.character(params$objective) && is.character(params$objective) &&
strtrim(params$objective, 5) == 'rank:') { strtrim(params$objective, 5) == 'rank:') {
stop("\n\tAutomatic generation of CV-folds is not implemented for ranking!\n", stop("\n\tAutomatic generation of CV-folds is not implemented for ranking!\n",
@ -195,7 +195,7 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, params) {
# - For classification, need to convert y labels to factor before making the folds, # - For classification, need to convert y labels to factor before making the folds,
# and then do stratification by factor levels. # and then do stratification by factor levels.
# - For regression, leave y numeric and do stratification by quantiles. # - For regression, leave y numeric and do stratification by quantiles.
if (exists('objective', where=params) && if (exists('objective', where = params) &&
is.character(params$objective)) { is.character(params$objective)) {
# If 'objective' provided in params, assume that y is a classification label # If 'objective' provided in params, assume that y is a classification label
# unless objective is reg:linear # unless objective is reg:linear
@ -306,7 +306,7 @@ depr_par_lut <- matrix(c(
'plot.width','plot_width', 'plot.width','plot_width',
'n_first_tree', 'trees', 'n_first_tree', 'trees',
'dummy', 'DUMMY' 'dummy', 'DUMMY'
), ncol=2, byrow = TRUE) ), ncol = 2, byrow = TRUE)
colnames(depr_par_lut) <- c('old', 'new') colnames(depr_par_lut) <- c('old', 'new')
# Checks the dot-parameters for deprecated names # Checks the dot-parameters for deprecated names
@ -331,7 +331,7 @@ check.deprecation <- function(..., env = parent.frame()) {
if (!ex_match[i]) { if (!ex_match[i]) {
warning("'", pars_par, "' was partially matched to '", old_par,"'") warning("'", pars_par, "' was partially matched to '", old_par,"'")
} }
.Deprecated(new_par, old=old_par, package = 'xgboost') .Deprecated(new_par, old = old_par, package = 'xgboost')
if (new_par != 'NULL') { if (new_par != 'NULL') {
eval(parse(text = paste(new_par, '<-', pars[[pars_par]])), envir = env) eval(parse(text = paste(new_par, '<-', pars[[pars_par]])), envir = env)
} }

View File

@ -1,19 +1,19 @@
# Construct an internal xgboost Booster and return a handle to it # Construct an internal xgboost Booster and return a handle to it.
# internal utility function # internal utility function
xgb.Booster.handle <- function(params = list(), cachelist = list(), modelfile = NULL) { xgb.Booster.handle <- function(params = list(), cachelist = list(), modelfile = NULL) {
if (typeof(cachelist) != "list" || if (typeof(cachelist) != "list" ||
any(sapply(cachelist, class) != 'xgb.DMatrix')) { !all(sapply(cachelist, inherits, 'xgb.DMatrix'))) {
stop("xgb.Booster only accepts list of DMatrix as cachelist") stop("cachelist must be a list of xgb.DMatrix objects")
} }
handle <- .Call("XGBoosterCreate_R", cachelist, PACKAGE = "xgboost") handle <- .Call("XGBoosterCreate_R", cachelist, PACKAGE = "xgboost")
if (!is.null(modelfile)) { if (!is.null(modelfile)) {
if (typeof(modelfile) == "character") { if (typeof(modelfile) == "character") {
.Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE = "xgboost") .Call("XGBoosterLoadModel_R", handle, modelfile[1], PACKAGE = "xgboost")
} else if (typeof(modelfile) == "raw") { } else if (typeof(modelfile) == "raw") {
.Call("XGBoosterLoadModelFromRaw_R", handle, modelfile, PACKAGE = "xgboost") .Call("XGBoosterLoadModelFromRaw_R", handle, modelfile, PACKAGE = "xgboost")
} else if (class(modelfile) == "xgb.Booster") { } else if (inherits(modelfile, "xgb.Booster")) {
bst <- xgb.Booster.complete(modelfile, saveraw=TRUE) bst <- xgb.Booster.complete(modelfile, saveraw = TRUE)
.Call("XGBoosterLoadModelFromRaw_R", handle, bst$raw, PACKAGE = "xgboost") .Call("XGBoosterLoadModelFromRaw_R", handle, bst$raw, PACKAGE = "xgboost")
} else { } else {
stop("modelfile must be either character filename, or raw booster dump, or xgb.Booster object") stop("modelfile must be either character filename, or raw booster dump, or xgb.Booster object")
@ -37,10 +37,10 @@ xgb.handleToBooster <- function(handle, raw = NULL) {
# Check whether xgb.Booster.handle is null # Check whether xgb.Booster.handle is null
# internal utility function # internal utility function
is.null.handle <- function(handle) { is.null.handle <- function(handle) {
if (class(handle) != "xgb.Booster.handle") if (!identical(class(handle), "xgb.Booster.handle"))
stop("argument type must be xgb.Booster.handle") stop("argument type must be xgb.Booster.handle")
if (is.null(handle) || .Call("XGCheckNullPtr_R", handle, PACKAGE="xgboost")) if (is.null(handle) || .Call("XGCheckNullPtr_R", handle, PACKAGE = "xgboost"))
return(TRUE) return(TRUE)
return(FALSE) return(FALSE)
} }
@ -78,8 +78,8 @@ xgb.get.handle <- function(object) {
#' its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods #' its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods
#' should still work for such a model object since those methods would be using #' should still work for such a model object since those methods would be using
#' \code{xgb.Booster.complete} internally. However, one might find it to be more efficient to call the #' \code{xgb.Booster.complete} internally. However, one might find it to be more efficient to call the
#' \code{xgb.Booster.complete} function once after loading a model as an R-object. That which would #' \code{xgb.Booster.complete} function explicitely once after loading a model as an R-object.
#' prevent further reconstruction (potentially, multiple times) of an internal booster model. #' That would prevent further repeated implicit reconstruction of an internal booster model.
#' #'
#' @return #' @return
#' An object of \code{xgb.Booster} class. #' An object of \code{xgb.Booster} class.
@ -94,13 +94,14 @@ xgb.get.handle <- function(object) {
#' bst1 <- readRDS("xgb.model.rds") #' bst1 <- readRDS("xgb.model.rds")
#' # the handle is invalid: #' # the handle is invalid:
#' print(bst1$handle) #' print(bst1$handle)
#'
#' bst1 <- xgb.Booster.complete(bst1) #' bst1 <- xgb.Booster.complete(bst1)
#' # now the handle points to a valid internal booster model: #' # now the handle points to a valid internal booster model:
#' print(bst1$handle) #' print(bst1$handle)
#' #'
#' @export #' @export
xgb.Booster.complete <- function(object, saveraw = TRUE) { xgb.Booster.complete <- function(object, saveraw = TRUE) {
if (class(object) != "xgb.Booster") if (!inherits(object, "xgb.Booster"))
stop("argument type must be xgb.Booster") stop("argument type must be xgb.Booster")
if (is.null.handle(object$handle)) { if (is.null.handle(object$handle)) {
@ -225,7 +226,7 @@ predict.xgb.Booster <- function(object, newdata, missing = NA,
outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE, reshape = FALSE, ...) { outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE, reshape = FALSE, ...) {
object <- xgb.Booster.complete(object, saveraw = FALSE) object <- xgb.Booster.complete(object, saveraw = FALSE)
if (class(newdata) != "xgb.DMatrix") if (!inherits(newdata, "xgb.DMatrix"))
newdata <- xgb.DMatrix(newdata, missing = missing) newdata <- xgb.DMatrix(newdata, missing = missing)
if (is.null(ntreelimit)) if (is.null(ntreelimit))
ntreelimit <- NVL(object$best_ntreelimit, 0) ntreelimit <- NVL(object$best_ntreelimit, 0)
@ -337,7 +338,7 @@ predict.xgb.Booster.handle <- function(object, ...) {
xgb.attr <- function(object, name) { xgb.attr <- function(object, name) {
if (is.null(name) || nchar(as.character(name[1])) == 0) stop("invalid attribute name") if (is.null(name) || nchar(as.character(name[1])) == 0) stop("invalid attribute name")
handle <- xgb.get.handle(object) handle <- xgb.get.handle(object)
.Call("XGBoosterGetAttr_R", handle, as.character(name[1]), PACKAGE="xgboost") .Call("XGBoosterGetAttr_R", handle, as.character(name[1]), PACKAGE = "xgboost")
} }
#' @rdname xgb.attr #' @rdname xgb.attr
@ -354,7 +355,7 @@ xgb.attr <- function(object, name) {
value <- as.character(value[1]) value <- as.character(value[1])
} }
} }
.Call("XGBoosterSetAttr_R", handle, as.character(name[1]), value, PACKAGE="xgboost") .Call("XGBoosterSetAttr_R", handle, as.character(name[1]), value, PACKAGE = "xgboost")
if (is(object, 'xgb.Booster') && !is.null(object$raw)) { if (is(object, 'xgb.Booster') && !is.null(object$raw)) {
object$raw <- xgb.save.raw(object$handle) object$raw <- xgb.save.raw(object$handle)
} }
@ -365,10 +366,10 @@ xgb.attr <- function(object, name) {
#' @export #' @export
xgb.attributes <- function(object) { xgb.attributes <- function(object) {
handle <- xgb.get.handle(object) handle <- xgb.get.handle(object)
attr_names <- .Call("XGBoosterGetAttrNames_R", handle, PACKAGE="xgboost") attr_names <- .Call("XGBoosterGetAttrNames_R", handle, PACKAGE = "xgboost")
if (is.null(attr_names)) return(NULL) if (is.null(attr_names)) return(NULL)
res <- lapply(attr_names, function(x) { res <- lapply(attr_names, function(x) {
.Call("XGBoosterGetAttr_R", handle, x, PACKAGE="xgboost") .Call("XGBoosterGetAttr_R", handle, x, PACKAGE = "xgboost")
}) })
names(res) <- attr_names names(res) <- attr_names
res res
@ -393,7 +394,7 @@ xgb.attributes <- function(object) {
}) })
handle <- xgb.get.handle(object) handle <- xgb.get.handle(object)
for (i in seq_along(a)) { for (i in seq_along(a)) {
.Call("XGBoosterSetAttr_R", handle, names(a[i]), a[[i]], PACKAGE="xgboost") .Call("XGBoosterSetAttr_R", handle, names(a[i]), a[[i]], PACKAGE = "xgboost")
} }
if (is(object, 'xgb.Booster') && !is.null(object$raw)) { if (is(object, 'xgb.Booster') && !is.null(object$raw)) {
object$raw <- xgb.save.raw(object$handle) object$raw <- xgb.save.raw(object$handle)
@ -442,8 +443,8 @@ xgb.attributes <- function(object) {
object object
} }
# Extract # of trees in a model # Extract the number of trees in a model.
# TODO: either add a getter to C-interface, or simply set an 'ntree' attribute after each iteration # TODO: either add a getter to C-interface, or simply set an 'ntree' attribute after each iteration.
# internal utility function # internal utility function
xgb.ntree <- function(bst) { xgb.ntree <- function(bst) {
length(grep('^booster', xgb.dump(bst))) length(grep('^booster', xgb.dump(bst)))
@ -470,7 +471,7 @@ xgb.ntree <- function(bst) {
#' #'
#' @method print xgb.Booster #' @method print xgb.Booster
#' @export #' @export
print.xgb.Booster <- function(x, verbose=FALSE, ...) { print.xgb.Booster <- function(x, verbose = FALSE, ...) {
cat('##### xgb.Booster\n') cat('##### xgb.Booster\n')
valid_handle <- is.null.handle(x$handle) valid_handle <- is.null.handle(x$handle)
@ -479,7 +480,7 @@ print.xgb.Booster <- function(x, verbose=FALSE, ...) {
cat('raw: ') cat('raw: ')
if (!is.null(x$raw)) { if (!is.null(x$raw)) {
cat(format(object.size(x$raw), units="auto"), '\n') cat(format(object.size(x$raw), units = "auto"), '\n')
} else { } else {
cat('NULL\n') cat('NULL\n')
} }
@ -493,7 +494,7 @@ print.xgb.Booster <- function(x, verbose=FALSE, ...) {
cat( ' ', cat( ' ',
paste(names(x$params), paste(names(x$params),
paste0('"', unlist(x$params), '"'), paste0('"', unlist(x$params), '"'),
sep=' = ', collapse=', '), '\n', sep='') sep = ' = ', collapse = ', '), '\n', sep = '')
} }
# TODO: need an interface to access all the xgboosts parameters # TODO: need an interface to access all the xgboosts parameters
@ -505,9 +506,9 @@ print.xgb.Booster <- function(x, verbose=FALSE, ...) {
if (verbose) { if (verbose) {
cat( paste(paste0(' ',names(attrs)), cat( paste(paste0(' ',names(attrs)),
paste0('"', unlist(attrs), '"'), paste0('"', unlist(attrs), '"'),
sep=' = ', collapse='\n'), '\n', sep='') sep = ' = ', collapse = '\n'), '\n', sep = '')
} else { } else {
cat(' ', paste(names(attrs), collapse=', '), '\n', sep='') cat(' ', paste(names(attrs), collapse = ', '), '\n', sep = '')
} }
} }
@ -522,16 +523,16 @@ print.xgb.Booster <- function(x, verbose=FALSE, ...) {
if (!is.null(x$feature_names)) if (!is.null(x$feature_names))
cat('# of features:', length(x$feature_names), '\n') cat('# of features:', length(x$feature_names), '\n')
cat('niter: ', x$niter, '\n', sep='') cat('niter: ', x$niter, '\n', sep = '')
# TODO: uncomment when faster xgb.ntree is implemented # TODO: uncomment when faster xgb.ntree is implemented
#cat('ntree: ', xgb.ntree(x), '\n', sep='') #cat('ntree: ', xgb.ntree(x), '\n', sep='')
for (n in setdiff(names(x), c('handle', 'raw', 'call', 'params', 'callbacks', for (n in setdiff(names(x), c('handle', 'raw', 'call', 'params', 'callbacks',
'evaluation_log','niter','feature_names'))) { 'evaluation_log','niter','feature_names'))) {
if (is.atomic(x[[n]])) { if (is.atomic(x[[n]])) {
cat(n, ':', x[[n]], '\n', sep=' ') cat(n, ':', x[[n]], '\n', sep = ' ')
} else { } else {
cat(n, ':\n\t', sep=' ') cat(n, ':\n\t', sep = ' ')
print(x[[n]]) print(x[[n]])
} }
} }

View File

@ -1,14 +1,17 @@
#' Construct xgb.DMatrix object #' Construct xgb.DMatrix object
#' #'
#' Contruct xgb.DMatrix object from dense matrix, sparse matrix #' Construct xgb.DMatrix object from either a dense matrix, a sparse matrix, or a local file.
#' or local file (that was created previously by saving an \code{xgb.DMatrix}). #' Supported input file formats are either a libsvm text file or a binary file that was created previously by
#' \code{\link{xgb.DMatrix.save}}).
#' #'
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename #' @param data a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object, or a character
#' @param info a list of information of the xgb.DMatrix object #' string representing a filename.
#' @param missing Missing is only used when input is dense matrix, pick a float #' @param info a named list of additional information to store in the \code{xgb.DMatrix} object.
#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. #' See \code{\link{setinfo}} for the specific allowed kinds of
# #' @param missing a float value to represents missing values in data (used only when input is a dense matrix).
#' @param ... other information to pass to \code{info}. #' It is useful when a 0 or some other extreme value represents missing values in data.
#' @param silent whether to suppress printing an informational message after loading from a file.
#' @param ... the \code{info} data could be passed directly as parameters, without creating an \code{info} list.
#' #'
#' @examples #' @examples
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
@ -17,19 +20,19 @@
#' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data') #' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
#' dtrain <- xgb.DMatrix('xgb.DMatrix.data') #' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
#' @export #' @export
xgb.DMatrix <- function(data, info = list(), missing = NA, ...) { xgb.DMatrix <- function(data, info = list(), missing = NA, silent = FALSE, ...) {
cnames <- NULL cnames <- NULL
if (typeof(data) == "character") { if (typeof(data) == "character") {
if (length(data) > 1) if (length(data) > 1)
stop("'data' has class 'character' and length ", length(data), stop("'data' has class 'character' and length ", length(data),
".\n 'data' accepts either a numeric matrix or a single filename.") ".\n 'data' accepts either a numeric matrix or a single filename.")
handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE), handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(silent),
PACKAGE = "xgboost") PACKAGE = "xgboost")
} else if (is.matrix(data)) { } else if (is.matrix(data)) {
handle <- .Call("XGDMatrixCreateFromMat_R", data, missing, handle <- .Call("XGDMatrixCreateFromMat_R", data, missing,
PACKAGE = "xgboost") PACKAGE = "xgboost")
cnames <- colnames(data) cnames <- colnames(data)
} else if (class(data) == "dgCMatrix") { } else if (inherits(data, "dgCMatrix")) {
handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x, nrow(data), handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x, nrow(data),
PACKAGE = "xgboost") PACKAGE = "xgboost")
cnames <- colnames(data) cnames <- colnames(data)
@ -51,10 +54,9 @@ xgb.DMatrix <- function(data, info = list(), missing = NA, ...) {
# get dmatrix from data, label # get dmatrix from data, label
# internal helper method # internal helper method
xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) { xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) {
inClass <- class(data) if (inherits(data, "dgCMatrix") || is.matrix(data)) {
if ("dgCMatrix" %in% inClass || "matrix" %in% inClass ) {
if (is.null(label)) { if (is.null(label)) {
stop("xgboost: need label when data is a matrix") stop("label must be provided when data is a matrix")
} }
dtrain <- xgb.DMatrix(data, label = label, missing = missing) dtrain <- xgb.DMatrix(data, label = label, missing = missing)
if (!is.null(weight)){ if (!is.null(weight)){
@ -64,11 +66,11 @@ xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) {
if (!is.null(label)) { if (!is.null(label)) {
warning("xgboost: label will be ignored.") warning("xgboost: label will be ignored.")
} }
if (inClass == "character") { if (is.character(data)) {
dtrain <- xgb.DMatrix(data) dtrain <- xgb.DMatrix(data[1])
} else if (inClass == "xgb.DMatrix") { } else if (inherits(data, "xgb.DMatrix")) {
dtrain <- data dtrain <- data
} else if ("data.frame" %in% inClass) { } else if (inherits(data, "data.frame")) {
stop("xgboost doesn't support data.frame as input. Convert it to matrix first.") stop("xgboost doesn't support data.frame as input. Convert it to matrix first.")
} else { } else {
stop("xgboost: invalid input data") stop("xgboost: invalid input data")
@ -98,8 +100,8 @@ xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) {
#' #'
#' @export #' @export
dim.xgb.DMatrix <- function(x) { dim.xgb.DMatrix <- function(x) {
c(.Call("XGDMatrixNumRow_R", x, PACKAGE="xgboost"), c(.Call("XGDMatrixNumRow_R", x, PACKAGE = "xgboost"),
.Call("XGDMatrixNumCol_R", x, PACKAGE="xgboost")) .Call("XGDMatrixNumCol_R", x, PACKAGE = "xgboost"))
} }
@ -297,8 +299,8 @@ slice <- function(object, ...) UseMethod("slice")
#' @rdname slice.xgb.DMatrix #' @rdname slice.xgb.DMatrix
#' @export #' @export
slice.xgb.DMatrix <- function(object, idxset, ...) { slice.xgb.DMatrix <- function(object, idxset, ...) {
if (class(object) != "xgb.DMatrix") { if (!inherits(object, "xgb.DMatrix")) {
stop("slice: first argument dtrain must be xgb.DMatrix") stop("object must be xgb.DMatrix")
} }
ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, PACKAGE = "xgboost") ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, PACKAGE = "xgboost")
@ -317,7 +319,7 @@ slice.xgb.DMatrix <- function(object, idxset, ...) {
#' @rdname slice.xgb.DMatrix #' @rdname slice.xgb.DMatrix
#' @export #' @export
`[.xgb.DMatrix` <- function(object, idxset, colset=NULL) { `[.xgb.DMatrix` <- function(object, idxset, colset = NULL) {
slice(object, idxset) slice(object, idxset)
} }
@ -341,7 +343,7 @@ slice.xgb.DMatrix <- function(object, idxset, ...) {
#' #'
#' @method print xgb.DMatrix #' @method print xgb.DMatrix
#' @export #' @export
print.xgb.DMatrix <- function(x, verbose=FALSE, ...) { print.xgb.DMatrix <- function(x, verbose = FALSE, ...) {
cat('xgb.DMatrix dim:', nrow(x), 'x', ncol(x), ' info: ') cat('xgb.DMatrix dim:', nrow(x), 'x', ncol(x), ' info: ')
infos <- c() infos <- c()
if(length(getinfo(x, 'label')) > 0) infos <- 'label' if(length(getinfo(x, 'label')) > 0) infos <- 'label'
@ -353,7 +355,7 @@ print.xgb.DMatrix <- function(x, verbose=FALSE, ...) {
cat(' colnames:') cat(' colnames:')
if (verbose & !is.null(cnames)) { if (verbose & !is.null(cnames)) {
cat("\n'") cat("\n'")
cat(cnames, sep="','") cat(cnames, sep = "','")
cat("'") cat("'")
} else { } else {
if (is.null(cnames)) cat(' no') if (is.null(cnames)) cat(' no')

View File

@ -15,9 +15,9 @@
xgb.DMatrix.save <- function(dmatrix, fname) { xgb.DMatrix.save <- function(dmatrix, fname) {
if (typeof(fname) != "character") if (typeof(fname) != "character")
stop("fname must be character") stop("fname must be character")
if (class(dmatrix) != "xgb.DMatrix") if (!inherits(dmatrix, "xgb.DMatrix"))
stop("the input data must be xgb.DMatrix") stop("dmatrix must be xgb.DMatrix")
.Call("XGDMatrixSaveBinary_R", dmatrix, fname, 0L, PACKAGE = "xgboost") .Call("XGDMatrixSaveBinary_R", dmatrix, fname[1], 0L, PACKAGE = "xgboost")
return(TRUE) return(TRUE)
} }

View File

@ -130,13 +130,13 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
# stop("Either 'eval_metric' or 'feval' must be provided for CV") # stop("Either 'eval_metric' or 'feval' must be provided for CV")
# Check the labels # Check the labels
if ( (class(data) == 'xgb.DMatrix' && is.null(getinfo(data, 'label'))) || if ( (inherits(data, 'xgb.DMatrix') && is.null(getinfo(data, 'label'))) ||
(class(data) != 'xgb.DMatrix' && is.null(label))) (!inherits(data, 'xgb.DMatrix') && is.null(label)))
stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix") stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix")
# CV folds # CV folds
if(!is.null(folds)) { if(!is.null(folds)) {
if(class(folds) != "list" || length(folds) < 2) if(!is.list(folds) || length(folds) < 2)
stop("'folds' must be a list with 2 or more elements that are vectors of indices for each CV-fold") stop("'folds' must be a list with 2 or more elements that are vectors of indices for each CV-fold")
nfold <- length(folds) nfold <- length(folds)
} else { } else {
@ -153,7 +153,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
params <- c(params, list(silent = 1)) params <- c(params, list(silent = 1))
print_every_n <- max( as.integer(print_every_n), 1L) print_every_n <- max( as.integer(print_every_n), 1L)
if (!has.callbacks(callbacks, 'cb.print.evaluation') && verbose) { if (!has.callbacks(callbacks, 'cb.print.evaluation') && verbose) {
callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n, showsd=showsd)) callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n, showsd = showsd))
} }
# evaluation log callback: always is on in CV # evaluation log callback: always is on in CV
evaluation_log <- list() evaluation_log <- list()
@ -165,12 +165,12 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
if (!is.null(early_stopping_rounds) && if (!is.null(early_stopping_rounds) &&
!has.callbacks(callbacks, 'cb.early.stop')) { !has.callbacks(callbacks, 'cb.early.stop')) {
callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds, callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds,
maximize=maximize, verbose=verbose)) maximize = maximize, verbose = verbose))
} }
# CV-predictions callback # CV-predictions callback
if (prediction && if (prediction &&
!has.callbacks(callbacks, 'cb.cv.predict')) { !has.callbacks(callbacks, 'cb.cv.predict')) {
callbacks <- add.cb(callbacks, cb.cv.predict(save_models=FALSE)) callbacks <- add.cb(callbacks, cb.cv.predict(save_models = FALSE))
} }
# Sort the callbacks into categories # Sort the callbacks into categories
cb <- categorize.callbacks(callbacks) cb <- categorize.callbacks(callbacks)
@ -182,7 +182,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
dtest <- slice(dall, folds[[k]]) dtest <- slice(dall, folds[[k]])
dtrain <- slice(dall, unlist(folds[-k])) dtrain <- slice(dall, unlist(folds[-k]))
handle <- xgb.Booster.handle(params, list(dtrain, dtest)) handle <- xgb.Booster.handle(params, list(dtrain, dtest))
list(dtrain=dtrain, bst=handle, watchlist=list(train=dtrain, test=dtest), index=folds[[k]]) list(dtrain = dtrain, bst = handle, watchlist = list(train = dtrain, test=dtest), index = folds[[k]])
}) })
# a "basket" to collect some results from callbacks # a "basket" to collect some results from callbacks
basket <- list() basket <- list()
@ -212,7 +212,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
if (stop_condition) break if (stop_condition) break
} }
for (f in cb$finalize) f(finalize=TRUE) for (f in cb$finalize) f(finalize = TRUE)
# the CV result # the CV result
ret <- list( ret <- list(
@ -254,8 +254,8 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
#' @rdname print.xgb.cv #' @rdname print.xgb.cv
#' @method print xgb.cv.synchronous #' @method print xgb.cv.synchronous
#' @export #' @export
print.xgb.cv.synchronous <- function(x, verbose=FALSE, ...) { print.xgb.cv.synchronous <- function(x, verbose = FALSE, ...) {
cat('##### xgb.cv ', length(x$folds), '-folds\n', sep='') cat('##### xgb.cv ', length(x$folds), '-folds\n', sep = '')
if (verbose) { if (verbose) {
if (!is.null(x$call)) { if (!is.null(x$call)) {
@ -267,7 +267,7 @@ print.xgb.cv.synchronous <- function(x, verbose=FALSE, ...) {
cat( ' ', cat( ' ',
paste(names(x$params), paste(names(x$params),
paste0('"', unlist(x$params), '"'), paste0('"', unlist(x$params), '"'),
sep=' = ', collapse=', '), '\n', sep='') sep = ' = ', collapse = ', '), '\n', sep = '')
} }
if (!is.null(x$callbacks) && length(x$callbacks) > 0) { if (!is.null(x$callbacks) && length(x$callbacks) > 0) {
cat('callbacks:\n') cat('callbacks:\n')
@ -280,7 +280,7 @@ print.xgb.cv.synchronous <- function(x, verbose=FALSE, ...) {
for (n in c('niter', 'best_iteration', 'best_ntreelimit')) { for (n in c('niter', 'best_iteration', 'best_ntreelimit')) {
if (is.null(x[[n]])) if (is.null(x[[n]]))
next next
cat(n, ': ', x[[n]], '\n', sep='') cat(n, ': ', x[[n]], '\n', sep = '')
} }
if (!is.null(x$pred)) { if (!is.null(x$pred)) {

View File

@ -39,19 +39,19 @@
#' cat(xgb.dump(bst, with_stats = TRUE, dump_format='json')) #' cat(xgb.dump(bst, with_stats = TRUE, dump_format='json'))
#' #'
#' @export #' @export
xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, xgb.dump <- function(model, fname = NULL, fmap = "", with_stats=FALSE,
dump_format = c("text", "json"), ...) { dump_format = c("text", "json"), ...) {
check.deprecation(...) check.deprecation(...)
dump_format <- match.arg(dump_format) dump_format <- match.arg(dump_format)
if (class(model) != "xgb.Booster") if (!inherits(model, "xgb.Booster"))
stop("model: argument must be of type xgb.Booster") stop("model: argument must be of type xgb.Booster")
if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1)) if (!(is.null(fname) || is.character(fname)))
stop("fname: argument must be of type character (when provided)") stop("fname: argument must be a character string (when provided)")
if (!(class(fmap) %in% c("character", "NULL") && length(fmap) <= 1)) if (!(is.null(fmap) || is.character(fmap)))
stop("fmap: argument must be of type character (when provided)") stop("fmap: argument must be a character string (when provided)")
model <- xgb.Booster.complete(model) model <- xgb.Booster.complete(model)
model_dump <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with_stats), model_dump <- .Call("XGBoosterDumpModel_R", model$handle, NVL(fmap, "")[1], as.integer(with_stats),
as.character(dump_format), PACKAGE = "xgboost") as.character(dump_format), PACKAGE = "xgboost")
if (is.null(fname)) if (is.null(fname))
@ -65,7 +65,7 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE,
if (is.null(fname)) { if (is.null(fname)) {
return(model_dump) return(model_dump)
} else { } else {
writeLines(model_dump, fname) writeLines(model_dump, fname[1])
return(TRUE) return(TRUE)
} }
} }

View File

@ -58,13 +58,13 @@ xgb.importance <- function(feature_names = NULL, model = NULL,
if (!(is.null(data) && is.null(label) && is.null(target))) if (!(is.null(data) && is.null(label) && is.null(target)))
warning("xgb.importance: parameters 'data', 'label' and 'target' are deprecated") warning("xgb.importance: parameters 'data', 'label' and 'target' are deprecated")
if (class(model) != "xgb.Booster") if (!inherits(model, "xgb.Booster"))
stop("Either 'model' has to be an object of class xgb.Booster") stop("model: must be an object of class xgb.Booster")
if (is.null(feature_names) && !is.null(model$feature_names)) if (is.null(feature_names) && !is.null(model$feature_names))
feature_names <- model$feature_names feature_names <- model$feature_names
if (!class(feature_names) %in% c("character", "NULL")) if (!(is.null(feature_names) || is.character(feature_names)))
stop("feature_names: Has to be a character vector") stop("feature_names: Has to be a character vector")
model_text_dump <- xgb.dump(model = model, with_stats = TRUE) model_text_dump <- xgb.dump(model = model, with_stats = TRUE)
@ -76,6 +76,8 @@ xgb.importance <- function(feature_names = NULL, model = NULL,
as.numeric as.numeric
if(is.null(feature_names)) if(is.null(feature_names))
feature_names <- seq(to = length(weights)) feature_names <- seq(to = length(weights))
if (length(feature_names) != length(weights))
stop("feature_names has less elements than there are features used in the model")
result <- data.table(Feature = feature_names, Weight = weights)[order(-abs(Weight))] result <- data.table(Feature = feature_names, Weight = weights)[order(-abs(Weight))]
} else { } else {
# tree model # tree model

View File

@ -14,6 +14,8 @@
#' It could be useful, e.g., in multiclass classification to get only #' It could be useful, e.g., in multiclass classification to get only
#' the trees of one certain class. IMPORTANT: the tree index in xgboost models #' the trees of one certain class. IMPORTANT: the tree index in xgboost models
#' is zero-based (e.g., use \code{trees = 0:4} for first 5 trees). #' is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).
#' @param use_int_id a logical flag indicating whether nodes in columns "Yes", "No", "Missing" should be
#' represented as integers (when FALSE) or as "Tree-Node" character strings (when FALSE).
#' @param ... currently not used. #' @param ... currently not used.
#' #'
#' @return #' @return
@ -22,9 +24,9 @@
#' The columns of the \code{data.table} are: #' The columns of the \code{data.table} are:
#' #'
#' \itemize{ #' \itemize{
#' \item \code{Tree}: ID of a tree in a model (integer) #' \item \code{Tree}: integer ID of a tree in a model (zero-based index)
#' \item \code{Node}: integer ID of a node in a tree (integer) #' \item \code{Node}: integer ID of a node in a tree (zero-based index)
#' \item \code{ID}: identifier of a node in a model (character) #' \item \code{ID}: character identifier of a node in a model (only when \code{use_int_id=FALSE})
#' \item \code{Feature}: for a branch node, it's a feature id or name (when available); #' \item \code{Feature}: for a branch node, it's a feature id or name (when available);
#' for a leaf note, it simply labels it as \code{'Leaf'} #' for a leaf note, it simply labels it as \code{'Leaf'}
#' \item \code{Split}: location of the split for a branch node (split condition is always "less than") #' \item \code{Split}: location of the split for a branch node (split condition is always "less than")
@ -36,6 +38,10 @@
#' or collected by a leaf during training. #' or collected by a leaf during training.
#' } #' }
#' #'
#' When \code{use_int_id=FALSE}, columns "Yes", "No", and "Missing" point to model-wide node identifiers
#' in the "ID" column. When \code{use_int_id=TRUE}, those columns point to node identifiers from
#' the corresponding trees in the "Node" column.
#'
#' @examples #' @examples
#' # Basic use: #' # Basic use:
#' #'
@ -45,8 +51,9 @@
#' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") #' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
#' #'
#' (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst)) #' (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))
#' # This bst has feature_names stored in it, so those would be used when #'
#' # the feature_names parameter is not provided: #' # This bst model already has feature_names stored with it, so those would be used when
#' # feature_names is not set:
#' (dt <- xgb.model.dt.tree(model = bst)) #' (dt <- xgb.model.dt.tree(model = bst))
#' #'
#' # How to match feature names of splits that are following a current 'Yes' branch: #' # How to match feature names of splits that are following a current 'Yes' branch:
@ -55,24 +62,24 @@
#' #'
#' @export #' @export
xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
trees = NULL, ...){ trees = NULL, use_int_id = FALSE, ...){
check.deprecation(...) check.deprecation(...)
if (class(model) != "xgb.Booster" & class(text) != "character") { if (!inherits(model, "xgb.Booster") & !is.character(text)) {
stop("Either 'model' has to be an object of class xgb.Booster\n", stop("Either 'model' must be an object of class xgb.Booster\n",
" or 'text' has to be a character vector with the result of xgb.dump\n", " or 'text' must be a character vector with the result of xgb.dump\n",
" (or NULL if the model was provided).") " (or NULL if 'model' was provided).")
} }
if (is.null(feature_names) && !is.null(model) && !is.null(model$feature_names)) if (is.null(feature_names) && !is.null(model) && !is.null(model$feature_names))
feature_names <- model$feature_names feature_names <- model$feature_names
if (!class(feature_names) %in% c("character", "NULL")) { if (!(is.null(feature_names) || is.character(feature_names))) {
stop("feature_names: Has to be a character vector") stop("feature_names: must be a character vector")
} }
if (!class(trees) %in% c("integer", "numeric", "NULL")) { if (!(is.null(trees) || is.numeric(trees))) {
stop("trees: Has to be a vector of integers.") stop("trees: must be a vector of integers.")
} }
if (is.null(text)){ if (is.null(text)){
@ -86,11 +93,11 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
position <- which(!is.na(stri_match_first_regex(text, "booster"))) position <- which(!is.na(stri_match_first_regex(text, "booster")))
add.tree.id <- function(x, i) paste(i, x, sep = "-") add.tree.id <- function(node, tree) if (use_int_id) node else paste(tree, node, sep = "-")
anynumber_regex <- "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?" anynumber_regex <- "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?"
td <- data.table(t=text) td <- data.table(t = text)
td[position, Tree := 1L] td[position, Tree := 1L]
td[, Tree := cumsum(ifelse(is.na(Tree), 0L, Tree)) - 1L] td[, Tree := cumsum(ifelse(is.na(Tree), 0L, Tree)) - 1L]
@ -102,32 +109,43 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
td <- td[Tree %in% trees & !grepl('^booster', t)] td <- td[Tree %in% trees & !grepl('^booster', t)]
td[, Node := stri_match_first_regex(t, "(\\d+):")[,2] %>% as.integer ] td[, Node := stri_match_first_regex(t, "(\\d+):")[,2] %>% as.integer ]
td[, ID := add.tree.id(Node, Tree)] if (!use_int_id) td[, ID := add.tree.id(Node, Tree)]
td[, isLeaf := !is.na(stri_match_first_regex(t, "leaf"))] td[, isLeaf := !is.na(stri_match_first_regex(t, "leaf"))]
# parse branch lines # parse branch lines
td[isLeaf==FALSE, c("Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover") := { branch_rx <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
rx <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
"gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")") "gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
branch_cols <- c("Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover")
td[isLeaf == FALSE,
(branch_cols) := {
# skip some indices with spurious capture groups from anynumber_regex # skip some indices with spurious capture groups from anynumber_regex
xtr <- stri_match_first_regex(t, rx)[, c(2,3,5,6,7,8,10)] xtr <- stri_match_first_regex(t, branch_rx)[, c(2,3,5,6,7,8,10), drop = FALSE]
xtr[, 3:5] <- add.tree.id(xtr[, 3:5], Tree) xtr[, 3:5] <- add.tree.id(xtr[, 3:5], Tree)
lapply(1:ncol(xtr), function(i) xtr[,i]) lapply(1:ncol(xtr), function(i) xtr[,i])
}] }]
# assign feature_names when available # assign feature_names when available
td[isLeaf==FALSE & !is.null(feature_names), if (!is.null(feature_names)) {
Feature := feature_names[as.numeric(Feature) + 1] ] if (length(feature_names) <= max(as.numeric(td$Feature), na.rm = TRUE))
stop("feature_names has less elements than there are features used in the model")
td[isLeaf == FALSE, Feature := feature_names[as.numeric(Feature) + 1] ]
}
# parse leaf lines # parse leaf lines
td[isLeaf==TRUE, c("Feature", "Quality", "Cover") := { leaf_rx <- paste0("leaf=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
rx <- paste0("leaf=(", anynumber_regex, "),cover=(", anynumber_regex, ")") leaf_cols <- c("Feature", "Quality", "Cover")
xtr <- stri_match_first_regex(t, rx)[, c(2,4)] td[isLeaf == TRUE,
(leaf_cols) := {
xtr <- stri_match_first_regex(t, leaf_rx)[, c(2,4)]
c("Leaf", lapply(1:ncol(xtr), function(i) xtr[,i])) c("Leaf", lapply(1:ncol(xtr), function(i) xtr[,i]))
}] }]
# convert some columns to numeric # convert some columns to numeric
numeric_cols <- c("Split", "Quality", "Cover") numeric_cols <- c("Split", "Quality", "Cover")
td[, (numeric_cols) := lapply(.SD, as.numeric), .SDcols=numeric_cols] td[, (numeric_cols) := lapply(.SD, as.numeric), .SDcols = numeric_cols]
if (use_int_id) {
int_cols <- c("Yes", "No", "Missing")
td[, (int_cols) := lapply(.SD, as.integer), .SDcols = int_cols]
}
td[, t := NULL] td[, t := NULL]
td[, isLeaf := NULL] td[, isLeaf := NULL]

View File

@ -63,7 +63,7 @@
xgb.plot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.depth", "med.weight"), xgb.plot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.depth", "med.weight"),
plot = TRUE, ...) { plot = TRUE, ...) {
if (!(class(model) == "xgb.Booster" || is.data.table(model))) if (!(inherits(model, "xgb.Booster") || is.data.table(model)))
stop("model: Has to be either an xgb.Booster model generaged by the xgb.train function\n", stop("model: Has to be either an xgb.Booster model generaged by the xgb.train function\n",
"or a data.table result of the xgb.importance function") "or a data.table result of the xgb.importance function")
@ -73,14 +73,14 @@ xgb.plot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.d
which <- match.arg(which) which <- match.arg(which)
dt_tree <- model dt_tree <- model
if (class(model) == "xgb.Booster") if (inherits(model, "xgb.Booster"))
dt_tree <- xgb.model.dt.tree(model = model) dt_tree <- xgb.model.dt.tree(model = model)
if (!all(c("Feature", "Tree", "ID", "Yes", "No", "Cover") %in% colnames(dt_tree))) if (!all(c("Feature", "Tree", "ID", "Yes", "No", "Cover") %in% colnames(dt_tree)))
stop("Model tree columns are not as expected!\n", stop("Model tree columns are not as expected!\n",
" Note that this function works only for tree models.") " Note that this function works only for tree models.")
dt_depths <- merge(get.leaf.depth(dt_tree), dt_tree[, .(ID, Cover, Weight=Quality)], by = "ID") dt_depths <- merge(get.leaf.depth(dt_tree), dt_tree[, .(ID, Cover, Weight = Quality)], by = "ID")
setkeyv(dt_depths, c("Tree", "ID")) setkeyv(dt_depths, c("Tree", "ID"))
# count by depth levels, and also calculate average cover at a depth # count by depth levels, and also calculate average cover at a depth
dt_summaries <- dt_depths[, .(.N, Cover = mean(Cover)), Depth] dt_summaries <- dt_depths[, .(.N, Cover = mean(Cover)), Depth]
@ -89,13 +89,13 @@ xgb.plot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.d
if (plot) { if (plot) {
if (which == "2x1") { if (which == "2x1") {
op <- par(no.readonly = TRUE) op <- par(no.readonly = TRUE)
par(mfrow=c(2,1), par(mfrow = c(2,1),
oma = c(3,1,3,1) + 0.1, oma = c(3,1,3,1) + 0.1,
mar = c(1,4,1,0) + 0.1) mar = c(1,4,1,0) + 0.1)
dt_summaries[, barplot(N, border=NA, ylab = 'Number of leafs', ...)] dt_summaries[, barplot(N, border = NA, ylab = 'Number of leafs', ...)]
dt_summaries[, barplot(Cover, border=NA, ylab = "Weighted cover", names.arg=Depth, ...)] dt_summaries[, barplot(Cover, border = NA, ylab = "Weighted cover", names.arg = Depth, ...)]
title("Model complexity", xlab = "Leaf depth", outer = TRUE, line = 1) title("Model complexity", xlab = "Leaf depth", outer = TRUE, line = 1)
par(op) par(op)
@ -119,8 +119,8 @@ xgb.plot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.d
get.leaf.depth <- function(dt_tree) { get.leaf.depth <- function(dt_tree) {
# extract tree graph's edges # extract tree graph's edges
dt_edges <- rbindlist(list( dt_edges <- rbindlist(list(
dt_tree[Feature != "Leaf", .(ID, To=Yes, Tree)], dt_tree[Feature != "Leaf", .(ID, To = Yes, Tree)],
dt_tree[Feature != "Leaf", .(ID, To=No, Tree)] dt_tree[Feature != "Leaf", .(ID, To = No, Tree)]
)) ))
# whether "To" is a leaf: # whether "To" is a leaf:
dt_edges <- dt_edges <-

View File

@ -61,8 +61,8 @@
xgb.plot.importance <- function(importance_matrix = NULL, top_n = NULL, measure = NULL, xgb.plot.importance <- function(importance_matrix = NULL, top_n = NULL, measure = NULL,
rel_to_first = FALSE, left_margin = 10, cex = NULL, plot = TRUE, ...) { rel_to_first = FALSE, left_margin = 10, cex = NULL, plot = TRUE, ...) {
check.deprecation(...) check.deprecation(...)
if (!"data.table" %in% class(importance_matrix)) { if (!is.data.table(importance_matrix)) {
stop("importance_matrix: Should be a data.table.") stop("importance_matrix: must be a data.table")
} }
imp_names <- colnames(importance_matrix) imp_names <- colnames(importance_matrix)
@ -107,12 +107,12 @@ xgb.plot.importance <- function(importance_matrix = NULL, top_n = NULL, measure
# reverse the order of rows to have the highest ranked at the top # reverse the order of rows to have the highest ranked at the top
importance_matrix[nrow(importance_matrix):1, importance_matrix[nrow(importance_matrix):1,
barplot(Importance, horiz=TRUE, border=NA, cex.names=cex, barplot(Importance, horiz = TRUE, border = NA, cex.names = cex,
names.arg=Feature, las=1, ...)] names.arg = Feature, las = 1, ...)]
grid(NULL, NA) grid(NULL, NA)
# redraw over the grid # redraw over the grid
importance_matrix[nrow(importance_matrix):1, importance_matrix[nrow(importance_matrix):1,
barplot(Importance, horiz=TRUE, border=NA, add=TRUE)] barplot(Importance, horiz = TRUE, border = NA, add = TRUE)]
par(op) par(op)
} }

View File

@ -11,7 +11,7 @@
#' @param plot_width the width of the diagram in pixels. #' @param plot_width the width of the diagram in pixels.
#' @param plot_height the height of the diagram in pixels. #' @param plot_height the height of the diagram in pixels.
#' @param render a logical flag for whether the graph should be rendered (see Value). #' @param render a logical flag for whether the graph should be rendered (see Value).
#' @param show_node_id a logical flag for whether to include node id's in the graph. #' @param show_node_id a logical flag for whether to show node id's in the graph.
#' @param ... currently not used. #' @param ... currently not used.
#' #'
#' @details #' @details
@ -53,17 +53,25 @@
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3, #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
#' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") #' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
#' # plot all the trees #' # plot all the trees
#' xgb.plot.tree(feature_names = colnames(agaricus.train$data), model = bst) #' xgb.plot.tree(model = bst)
#' # plot only the first tree and include the node ID: #' # plot only the first tree and display the node ID:
#' xgb.plot.tree(feature_names = colnames(agaricus.train$data), model = bst, #' xgb.plot.tree(model = bst, trees = 0, show_node_id = TRUE)
#' trees = 0, show_node_id = TRUE) #'
#' \dontrun{
#' # Below is an example of how to save this plot to a file.
#' # Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed.
#' library(DiagrammeR)
#' gr <- xgb.plot.tree(model=bst, trees=0:1, render=FALSE)
#' export_graph(gr, 'tree.pdf', width=1500, height=1900)
#' export_graph(gr, 'tree.png', width=1500, height=1900)
#' }
#' #'
#' @export #' @export
xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot_width = NULL, plot_height = NULL, xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot_width = NULL, plot_height = NULL,
render = TRUE, show_node_id = FALSE, ...){ render = TRUE, show_node_id = FALSE, ...){
check.deprecation(...) check.deprecation(...)
if (class(model) != "xgb.Booster") { if (!inherits(model, "xgb.Booster")) {
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.") stop("model: Has to be an object of class xgb.Booster")
} }
if (!requireNamespace("DiagrammeR", quietly = TRUE)) { if (!requireNamespace("DiagrammeR", quietly = TRUE)) {

View File

@ -32,10 +32,11 @@
xgb.save <- function(model, fname) { xgb.save <- function(model, fname) {
if (typeof(fname) != "character") if (typeof(fname) != "character")
stop("fname must be character") stop("fname must be character")
if (class(model) != "xgb.Booster") if (!inherits(model, "xgb.Booster")) {
stop("the input must be xgb.Booster. Use xgb.DMatrix.save to save xgb.DMatrix object.") stop("model must be xgb.Booster.",
if (inherits(model, "xgb.DMatrix")) " Use xgb.DMatrix.save to save an xgb.DMatrix object." else "")
}
model <- xgb.Booster.complete(model, saveraw = FALSE) model <- xgb.Booster.complete(model, saveraw = FALSE)
.Call("XGBoosterSaveModel_R", model$handle, fname, PACKAGE = "xgboost") .Call("XGBoosterSaveModel_R", model$handle, fname[1], PACKAGE = "xgboost")
return(TRUE) return(TRUE)
} }

View File

@ -233,7 +233,7 @@
#' @rdname xgb.train #' @rdname xgb.train
#' @export #' @export
xgb.train <- function(params = list(), data, nrounds, watchlist = list(), xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
obj = NULL, feval = NULL, verbose = 1, print_every_n=1L, obj = NULL, feval = NULL, verbose = 1, print_every_n = 1L,
early_stopping_rounds = NULL, maximize = NULL, early_stopping_rounds = NULL, maximize = NULL,
save_period = NULL, save_name = "xgboost.model", save_period = NULL, save_name = "xgboost.model",
xgb_model = NULL, callbacks = list(), ...) { xgb_model = NULL, callbacks = list(), ...) {
@ -247,11 +247,11 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
# data & watchlist checks # data & watchlist checks
dtrain <- data dtrain <- data
if (class(dtrain) != "xgb.DMatrix") if (!inherits(dtrain, "xgb.DMatrix"))
stop("second argument dtrain must be xgb.DMatrix") stop("second argument dtrain must be xgb.DMatrix")
if (length(watchlist) > 0) { if (length(watchlist) > 0) {
if (typeof(watchlist) != "list" || if (typeof(watchlist) != "list" ||
!all(sapply(watchlist, class) == "xgb.DMatrix")) !all(sapply(watchlist, inherits, 'xgb.DMatrix')))
stop("watchlist must be a list of xgb.DMatrix elements") stop("watchlist must be a list of xgb.DMatrix elements")
evnames <- names(watchlist) evnames <- names(watchlist)
if (is.null(evnames) || any(evnames == "")) if (is.null(evnames) || any(evnames == ""))
@ -281,7 +281,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
if (!is.null(early_stopping_rounds) && if (!is.null(early_stopping_rounds) &&
!has.callbacks(callbacks, 'cb.early.stop')) { !has.callbacks(callbacks, 'cb.early.stop')) {
callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds, callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds,
maximize=maximize, verbose=verbose)) maximize = maximize, verbose = verbose))
} }
# Sort the callbacks into categories # Sort the callbacks into categories
cb <- categorize.callbacks(callbacks) cb <- categorize.callbacks(callbacks)
@ -332,7 +332,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
if (stop_condition) break if (stop_condition) break
} }
for (f in cb$finalize) f(finalize=TRUE) for (f in cb$finalize) f(finalize = TRUE)
bst <- xgb.Booster.complete(bst, saveraw = TRUE) bst <- xgb.Booster.complete(bst, saveraw = TRUE)
@ -343,7 +343,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
if (length(evaluation_log) > 0 && if (length(evaluation_log) > 0 &&
nrow(evaluation_log) > 0) { nrow(evaluation_log) > 0) {
# include the previous compatible history when available # include the previous compatible history when available
if (class(xgb_model) == 'xgb.Booster' && if (inherits(xgb_model, 'xgb.Booster') &&
!is_update && !is_update &&
!is.null(xgb_model$evaluation_log) && !is.null(xgb_model$evaluation_log) &&
all.equal(colnames(evaluation_log), all.equal(colnames(evaluation_log),

View File

@ -1,4 +1,4 @@
# Simple interface for training an xgboost model that wraps \code{xgb.train} # Simple interface for training an xgboost model that wraps \code{xgb.train}.
# Its documentation is combined with xgb.train. # Its documentation is combined with xgb.train.
# #
#' @rdname xgb.train #' @rdname xgb.train
@ -7,7 +7,7 @@ xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL,
params = list(), nrounds, params = list(), nrounds,
verbose = 1, print_every_n = 1L, verbose = 1, print_every_n = 1L,
early_stopping_rounds = NULL, maximize = NULL, early_stopping_rounds = NULL, maximize = NULL,
save_period = 0, save_name = "xgboost.model", save_period = NULL, save_name = "xgboost.model",
xgb_model = NULL, callbacks = list(), ...) { xgb_model = NULL, callbacks = list(), ...) {
dtrain <- xgb.get.DMatrix(data, label, missing, weight) dtrain <- xgb.get.DMatrix(data, label, missing, weight)

View File

@ -19,49 +19,6 @@ We are [on CRAN](https://cran.r-project.org/web/packages/xgboost/index.html) now
install.packages('xgboost') install.packages('xgboost')
``` ```
You can also install from our weekly updated drat repo:
```r
install.packages("xgboost", repos=c("http://dmlc.ml/drat/", getOption("repos")), type="source")
```
***Important*** Due to the usage of submodule, `install_github` is no longer support to install the
latest version of R package.
For up-to-date version, please install from github.
Windows users will need to install [RTools](https://cran.r-project.org/bin/windows/Rtools/) first. They also need to download [MinGW-W64](http://iweb.dl.sourceforge.net/project/mingw-w64/Toolchains%20targetting%20Win32/Personal%20Builds/mingw-builds/installer/mingw-w64-install.exe) using x86_64 architecture during installation.
Run the following command to add MinGW to PATH in Windows if not already added.
```cmd
PATH %PATH%;C:\Program Files\mingw-w64\x86_64-5.3.0-posix-seh-rt_v4-rev0\mingw64\bin
```
To compile xgboost at the root of your storage, run the following bash script.
```bash
git clone --recursive https://github.com/dmlc/xgboost
cd xgboost
git submodule init
git submodule update
alias make='mingw32-make'
cd dmlc-core
make -j4
cd ../rabit
make lib/librabit_empty.a -j4
cd ..
cp make/mingw64.mk config.mk
make -j4
```
Run the following R script to install xgboost package from the root directory.
```r
install.packages('devtools') # if not installed
setwd('C:/xgboost/')
library(devtools)
install('R-package')
```
For more detailed installation instructions, please see [here](http://xgboost.readthedocs.org/en/latest/build.html#r-package-installation). For more detailed installation instructions, please see [here](http://xgboost.readthedocs.org/en/latest/build.html#r-package-installation).
Examples Examples
@ -69,3 +26,8 @@ Examples
* Please visit [walk through example](demo). * Please visit [walk through example](demo).
* See also the [example scripts](../demo/kaggle-higgs) for Kaggle Higgs Challenge, including [speedtest script](../demo/kaggle-higgs/speedtest.R) on this dataset and the one related to [Otto challenge](../demo/kaggle-otto), including a [RMarkdown documentation](../demo/kaggle-otto/understandingXGBoostModel.Rmd). * See also the [example scripts](../demo/kaggle-higgs) for Kaggle Higgs Challenge, including [speedtest script](../demo/kaggle-higgs/speedtest.R) on this dataset and the one related to [Otto challenge](../demo/kaggle-otto), including a [RMarkdown documentation](../demo/kaggle-otto/understandingXGBoostModel.Rmd).
Development
-----------
* See the [R Package section](https://xgboost.readthedocs.io/en/latest/how_to/contribute.html#r-package) of the contributiors guide.

View File

@ -28,8 +28,8 @@ E.g., when an \code{xgb.Booster} model is saved as an R object and then is loade
its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods
should still work for such a model object since those methods would be using should still work for such a model object since those methods would be using
\code{xgb.Booster.complete} internally. However, one might find it to be more efficient to call the \code{xgb.Booster.complete} internally. However, one might find it to be more efficient to call the
\code{xgb.Booster.complete} function once after loading a model as an R-object. That which would \code{xgb.Booster.complete} function explicitely once after loading a model as an R-object.
prevent further reconstruction (potentially, multiple times) of an internal booster model. That would prevent further repeated implicit reconstruction of an internal booster model.
} }
\examples{ \examples{
@ -41,6 +41,7 @@ saveRDS(bst, "xgb.model.rds")
bst1 <- readRDS("xgb.model.rds") bst1 <- readRDS("xgb.model.rds")
# the handle is invalid: # the handle is invalid:
print(bst1$handle) print(bst1$handle)
bst1 <- xgb.Booster.complete(bst1) bst1 <- xgb.Booster.complete(bst1)
# now the handle points to a valid internal booster model: # now the handle points to a valid internal booster model:
print(bst1$handle) print(bst1$handle)

View File

@ -2,23 +2,28 @@
% Please edit documentation in R/xgb.DMatrix.R % Please edit documentation in R/xgb.DMatrix.R
\name{xgb.DMatrix} \name{xgb.DMatrix}
\alias{xgb.DMatrix} \alias{xgb.DMatrix}
\title{Contruct xgb.DMatrix object} \title{Construct xgb.DMatrix object}
\usage{ \usage{
xgb.DMatrix(data, info = list(), missing = NA, ...) xgb.DMatrix(data, info = list(), missing = NA, silent = FALSE, ...)
} }
\arguments{ \arguments{
\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename} \item{data}{a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object, or a character
string representing a filename.}
\item{info}{a list of information of the xgb.DMatrix object} \item{info}{a named list of additional information to store in the \code{xgb.DMatrix} object.
See \code{\link{setinfo}} for the specific allowed kinds of}
\item{missing}{Missing is only used when input is dense matrix, pick a float \item{missing}{a float value to represents missing values in data (used only when input is a dense matrix).
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.} It is useful when a 0 or some other extreme value represents missing values in data.}
\item{...}{other information to pass to \code{info}.} \item{silent}{whether to suppress printing an informational message after loading from a file.}
\item{...}{the \code{info} data could be passed directly as parameters, without creating an \code{info} list.}
} }
\description{ \description{
Contruct xgb.DMatrix object from dense matrix, sparse matrix Construct xgb.DMatrix object from either a dense matrix, a sparse matrix, or a local file.
or local file (that was created previously by saving an \code{xgb.DMatrix}). Supported input file formats are either a libsvm text file or a binary file that was created previously by
\code{\link{xgb.DMatrix.save}}).
} }
\examples{ \examples{
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')

View File

@ -4,7 +4,7 @@
\alias{xgb.dump} \alias{xgb.dump}
\title{Dump an xgboost model in text format.} \title{Dump an xgboost model in text format.}
\usage{ \usage{
xgb.dump(model = NULL, fname = NULL, fmap = "", with_stats = FALSE, xgb.dump(model, fname = NULL, fmap = "", with_stats = FALSE,
dump_format = c("text", "json"), ...) dump_format = c("text", "json"), ...)
} }
\arguments{ \arguments{

View File

@ -5,7 +5,7 @@
\title{Parse a boosted tree model text dump} \title{Parse a boosted tree model text dump}
\usage{ \usage{
xgb.model.dt.tree(feature_names = NULL, model = NULL, text = NULL, xgb.model.dt.tree(feature_names = NULL, model = NULL, text = NULL,
trees = NULL, ...) trees = NULL, use_int_id = FALSE, ...)
} }
\arguments{ \arguments{
\item{feature_names}{character vector of feature names. If the model already \item{feature_names}{character vector of feature names. If the model already
@ -24,6 +24,9 @@ It could be useful, e.g., in multiclass classification to get only
the trees of one certain class. IMPORTANT: the tree index in xgboost models the trees of one certain class. IMPORTANT: the tree index in xgboost models
is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).} is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).}
\item{use_int_id}{a logical flag indicating whether nodes in columns "Yes", "No", "Missing" should be
represented as integers (when FALSE) or as "Tree-Node" character strings (when FALSE).}
\item{...}{currently not used.} \item{...}{currently not used.}
} }
\value{ \value{
@ -32,9 +35,9 @@ A \code{data.table} with detailed information about model trees' nodes.
The columns of the \code{data.table} are: The columns of the \code{data.table} are:
\itemize{ \itemize{
\item \code{Tree}: ID of a tree in a model (integer) \item \code{Tree}: integer ID of a tree in a model (zero-based index)
\item \code{Node}: integer ID of a node in a tree (integer) \item \code{Node}: integer ID of a node in a tree (zero-based index)
\item \code{ID}: identifier of a node in a model (character) \item \code{ID}: character identifier of a node in a model (only when \code{use_int_id=FALSE})
\item \code{Feature}: for a branch node, it's a feature id or name (when available); \item \code{Feature}: for a branch node, it's a feature id or name (when available);
for a leaf note, it simply labels it as \code{'Leaf'} for a leaf note, it simply labels it as \code{'Leaf'}
\item \code{Split}: location of the split for a branch node (split condition is always "less than") \item \code{Split}: location of the split for a branch node (split condition is always "less than")
@ -45,6 +48,10 @@ The columns of the \code{data.table} are:
\item \code{Cover}: metric related to the number of observation either seen by a split \item \code{Cover}: metric related to the number of observation either seen by a split
or collected by a leaf during training. or collected by a leaf during training.
} }
When \code{use_int_id=FALSE}, columns "Yes", "No", and "Missing" point to model-wide node identifiers
in the "ID" column. When \code{use_int_id=TRUE}, those columns point to node identifiers from
the corresponding trees in the "Node" column.
} }
\description{ \description{
Parse a boosted tree model text dump into a \code{data.table} structure. Parse a boosted tree model text dump into a \code{data.table} structure.
@ -58,8 +65,9 @@ bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_dep
eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
(dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst)) (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))
# This bst has feature_names stored in it, so those would be used when
# the feature_names parameter is not provided: # This bst model already has feature_names stored with it, so those would be used when
# feature_names is not set:
(dt <- xgb.model.dt.tree(model = bst)) (dt <- xgb.model.dt.tree(model = bst))
# How to match feature names of splits that are following a current 'Yes' branch: # How to match feature names of splits that are following a current 'Yes' branch:

View File

@ -24,7 +24,7 @@ IMPORTANT: the tree index in xgboost model is zero-based
\item{render}{a logical flag for whether the graph should be rendered (see Value).} \item{render}{a logical flag for whether the graph should be rendered (see Value).}
\item{show_node_id}{a logical flag for whether to include node id's in the graph.} \item{show_node_id}{a logical flag for whether to show node id's in the graph.}
\item{...}{currently not used.} \item{...}{currently not used.}
} }
@ -68,9 +68,17 @@ data(agaricus.train, package='xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3, bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
# plot all the trees # plot all the trees
xgb.plot.tree(feature_names = colnames(agaricus.train$data), model = bst) xgb.plot.tree(model = bst)
# plot only the first tree and include the node ID: # plot only the first tree and display the node ID:
xgb.plot.tree(feature_names = colnames(agaricus.train$data), model = bst, xgb.plot.tree(model = bst, trees = 0, show_node_id = TRUE)
trees = 0, show_node_id = TRUE)
\dontrun{
# Below is an example of how to save this plot to a file.
# Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed.
library(DiagrammeR)
gr <- xgb.plot.tree(model=bst, trees=0:1, render=FALSE)
export_graph(gr, 'tree.pdf', width=1500, height=1900)
export_graph(gr, 'tree.png', width=1500, height=1900)
}
} }

View File

@ -12,7 +12,7 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
xgboost(data = NULL, label = NULL, missing = NA, weight = NULL, xgboost(data = NULL, label = NULL, missing = NA, weight = NULL,
params = list(), nrounds, verbose = 1, print_every_n = 1L, params = list(), nrounds, verbose = 1, print_every_n = 1L,
early_stopping_rounds = NULL, maximize = NULL, save_period = 0, early_stopping_rounds = NULL, maximize = NULL, save_period = NULL,
save_name = "xgboost.model", xgb_model = NULL, callbacks = list(), ...) save_name = "xgboost.model", xgb_model = NULL, callbacks = list(), ...)
} }
\arguments{ \arguments{

View File

@ -68,12 +68,19 @@ SEXP XGDMatrixCreateFromMat_R(SEXP mat,
SEXP dim = getAttrib(mat, R_DimSymbol); SEXP dim = getAttrib(mat, R_DimSymbol);
size_t nrow = static_cast<size_t>(INTEGER(dim)[0]); size_t nrow = static_cast<size_t>(INTEGER(dim)[0]);
size_t ncol = static_cast<size_t>(INTEGER(dim)[1]); size_t ncol = static_cast<size_t>(INTEGER(dim)[1]);
double *din = REAL(mat); const bool is_int = TYPEOF(mat) == INTSXP;
double *din;
int *iin;
if (is_int) {
iin = INTEGER(mat);
} else {
din = REAL(mat);
}
std::vector<float> data(nrow * ncol); std::vector<float> data(nrow * ncol);
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (omp_ulong i = 0; i < nrow; ++i) { for (omp_ulong i = 0; i < nrow; ++i) {
for (size_t j = 0; j < ncol; ++j) { for (size_t j = 0; j < ncol; ++j) {
data[i * ncol +j] = din[i + nrow * j]; data[i * ncol +j] = is_int ? static_cast<float>(iin[i + nrow * j]) : din[i + nrow * j];
} }
} }
DMatrixHandle handle; DMatrixHandle handle;

View File

@ -189,3 +189,36 @@ test_that("xgb.cv works", {
expect_false(is.null(cv$callbacks)) expect_false(is.null(cv$callbacks))
expect_false(is.null(cv$call)) expect_false(is.null(cv$call))
}) })
test_that("train and predict with non-strict classes", {
# standard dense matrix input
train_dense <- as.matrix(train$data)
bst <- xgboost(data = train_dense, label = train$label, max_depth = 2,
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
pr0 <- predict(bst, train_dense)
# dense matrix-like input of non-matrix class
class(train_dense) <- 'shmatrix'
expect_true(is.matrix(train_dense))
expect_error(
bst <- xgboost(data = train_dense, label = train$label, max_depth = 2,
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
, regexp = NA)
expect_error(pr <- predict(bst, train_dense), regexp = NA)
expect_equal(pr0, pr)
# dense matrix-like input of non-matrix class with some inheritance
class(train_dense) <- c('pphmatrix','shmatrix')
expect_true(is.matrix(train_dense))
expect_error(
bst <- xgboost(data = train_dense, label = train$label, max_depth = 2,
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
, regexp = NA)
expect_error(pr <- predict(bst, train_dense), regexp = NA)
expect_equal(pr0, pr)
# when someone inhertis from xgb.Booster, it should still be possible to use it as xgb.Booster
class(bst) <- c('super.Booster', 'xgb.Booster')
expect_error(pr <- predict(bst, train_dense), regexp = NA)
expect_equal(pr0, pr)
})

View File

@ -7,18 +7,30 @@ data(agaricus.test, package='xgboost')
test_data <- agaricus.test$data[1:100,] test_data <- agaricus.test$data[1:100,]
test_label <- agaricus.test$label[1:100] test_label <- agaricus.test$label[1:100]
test_that("xgb.DMatrix: basic construction, saving, loading", { test_that("xgb.DMatrix: basic construction", {
# from sparse matrix # from sparse matrix
dtest1 <- xgb.DMatrix(test_data, label=test_label) dtest1 <- xgb.DMatrix(test_data, label=test_label)
# from dense matrix # from dense matrix
dtest2 <- xgb.DMatrix(as.matrix(test_data), label=test_label) dtest2 <- xgb.DMatrix(as.matrix(test_data), label=test_label)
expect_equal(getinfo(dtest1, 'label'), getinfo(dtest2, 'label')) expect_equal(getinfo(dtest1, 'label'), getinfo(dtest2, 'label'))
expect_equal(dim(dtest1), dim(dtest2))
#from dense integer matrix
int_data <- as.matrix(test_data)
storage.mode(int_data) <- "integer"
dtest3 <- xgb.DMatrix(int_data, label=test_label)
expect_equal(dim(dtest1), dim(dtest3))
})
test_that("xgb.DMatrix: saving, loading", {
# save to a local file # save to a local file
dtest1 <- xgb.DMatrix(test_data, label=test_label)
tmp_file <- tempfile('xgb.DMatrix_') tmp_file <- tempfile('xgb.DMatrix_')
expect_true(xgb.DMatrix.save(dtest1, tmp_file)) expect_true(xgb.DMatrix.save(dtest1, tmp_file))
# read from a local file # read from a local file
dtest3 <- xgb.DMatrix(tmp_file) expect_output(dtest3 <- xgb.DMatrix(tmp_file), "entries loaded from")
expect_output(dtest3 <- xgb.DMatrix(tmp_file, silent = TRUE), NA)
unlink(tmp_file) unlink(tmp_file)
expect_equal(getinfo(dtest1, 'label'), getinfo(dtest3, 'label')) expect_equal(getinfo(dtest1, 'label'), getinfo(dtest3, 'label'))
@ -26,7 +38,7 @@ test_that("xgb.DMatrix: basic construction, saving, loading", {
tmp <- c("0 1:1 2:1","1 3:1","0 1:1") tmp <- c("0 1:1 2:1","1 3:1","0 1:1")
tmp_file <- 'tmp.libsvm' tmp_file <- 'tmp.libsvm'
writeLines(tmp, tmp_file) writeLines(tmp, tmp_file)
dtest4 <- xgb.DMatrix(tmp_file) dtest4 <- xgb.DMatrix(tmp_file, silent = TRUE)
expect_equal(dim(dtest4), c(3, 4)) expect_equal(dim(dtest4), c(3, 4))
expect_equal(getinfo(dtest4, 'label'), c(0,1,0)) expect_equal(getinfo(dtest4, 'label'), c(0,1,0))
unlink(tmp_file) unlink(tmp_file)

View File

@ -133,6 +133,12 @@ test_that("xgb.model.dt.tree works with and without feature names", {
dt.tree.x <- xgb.model.dt.tree(model = bst.Tree.x) dt.tree.x <- xgb.model.dt.tree(model = bst.Tree.x)
expect_output(str(dt.tree.x), 'Feature.*\\"3\\"') expect_output(str(dt.tree.x), 'Feature.*\\"3\\"')
expect_equal(dt.tree[, -4, with=FALSE], dt.tree.x[, -4, with=FALSE]) expect_equal(dt.tree[, -4, with=FALSE], dt.tree.x[, -4, with=FALSE])
# using integer node ID instead of character
dt.tree.int <- xgb.model.dt.tree(model = bst.Tree, use_int_id = TRUE)
expect_equal(as.integer(tstrsplit(dt.tree$Yes, '-')[[2]]), dt.tree.int$Yes)
expect_equal(as.integer(tstrsplit(dt.tree$No, '-')[[2]]), dt.tree.int$No)
expect_equal(as.integer(tstrsplit(dt.tree$Missing, '-')[[2]]), dt.tree.int$Missing)
}) })
test_that("xgb.model.dt.tree throws error for gblinear", { test_that("xgb.model.dt.tree throws error for gblinear", {
@ -169,6 +175,17 @@ test_that("xgb.importance works with GLM model", {
xgb.ggplot.importance(importance.GLM) xgb.ggplot.importance(importance.GLM)
}) })
test_that("xgb.model.dt.tree and xgb.importance work with a single split model", {
bst1 <- xgboost(data = sparse_matrix, label = label, max_depth = 1,
eta = 1, nthread = 2, nrounds = 1, verbose = 0,
objective = "binary:logistic")
expect_error(dt <- xgb.model.dt.tree(model = bst1), regexp = NA) # no error
expect_equal(nrow(dt), 3)
expect_error(imp <- xgb.importance(model = bst1), regexp = NA) # no error
expect_equal(nrow(imp), 1)
expect_equal(imp$Gain, 1)
})
test_that("xgb.plot.tree works with and without feature names", { test_that("xgb.plot.tree works with and without feature names", {
xgb.plot.tree(feature_names = feature.names, model = bst.Tree) xgb.plot.tree(feature_names = feature.names, model = bst.Tree)
xgb.plot.tree(model = bst.Tree) xgb.plot.tree(model = bst.Tree)

View File

@ -120,6 +120,7 @@ R Package
make rcpplint make rcpplint
``` ```
- When needed, you can disable the linter warning of certain line with ```// NOLINT(*)``` comments. - When needed, you can disable the linter warning of certain line with ```// NOLINT(*)``` comments.
- We use [roxygen](https://cran.r-project.org/web/packages/roxygen2/vignettes/roxygen2.html) for documenting the R package.
### Rmarkdown Vignettes ### Rmarkdown Vignettes
Rmarkdown vignettes are placed in [R-package/vignettes](../R-package/vignettes) Rmarkdown vignettes are placed in [R-package/vignettes](../R-package/vignettes)
@ -143,3 +144,17 @@ make the-markdown-to-make.md
make html make html
``` ```
The reason we do this is to avoid exploded repo size due to generated images sizes. The reason we do this is to avoid exploded repo size due to generated images sizes.
### R package versioning
Since version 0.6.4.3, we have adopted a versioning system that uses an ```x.y.z``` (or ```core_major.core_minor.cran_release```)
format for CRAN releases and an ```x.y.z.p``` (or ```core_major.core_minor.cran_release.patch```) format for development patch versions.
This approach is similar to the one described in Yihui Xie's
[blog post on R Package Versioning](https://yihui.name/en/2013/06/r-package-versioning/),
except we need an additional field to accomodate the ```x.y``` core library version.
Each new CRAN release bumps up the 3rd field, while developments in-between CRAN releases
would be marked by an additional 4th field on the top of an existing CRAN release version.
Some additional consideration is needed when the core library version changes.
E.g., after the core changes from 0.6 to 0.7, the R package development version would become 0.7.0.1, working towards
a 0.7.1 CRAN release. The 0.7.0 would not be released to CRAN, unless it would require almost no additional development.