[R] maintenance Apr 2017 (#2237)

* [R] make sure things work for a single split model; fixes #2191

* [R] add option use_int_id to xgb.model.dt.tree

* [R] add example of exporting tree plot to a file

* [R] set save_period = NULL as default in xgboost() to be the same as in xgb.train; fixes #2182

* [R] it's a good practice after CRAN releases to bump up package version in dev

* [R] allow xgb.DMatrix construction from integer dense matrices

* [R] xgb.DMatrix: silent parameter; improve documentation

* [R] xgb.model.dt.tree code style changes

* [R] update NEWS with parameter changes

* [R] code safety & style; handle non-strict matrix and inherited classes of input and model; fixes #2242

* [R] change to x.y.z.p R-package versioning scheme and set version to 0.6.4.3

* [R] add an R package versioning section to the contributors guide

* [R] R-package/README.md: clean up the redundant old installation instructions, link the contributors guide
This commit is contained in:
Vadim Khotilovich 2017-05-02 00:51:34 -05:00 committed by Tong He
parent d769b6bcb5
commit a375ad2822
29 changed files with 351 additions and 246 deletions

View File

@ -12,6 +12,11 @@ This file records the changes in xgboost library in reverse chronological order.
- Thread local variable is upgraded so it is automatically freed at thread exit.
* Migrate to C++11
- The current master version now requires C++11 enabled compiled(g++4.8 or higher)
* R package:
- New parameters:
- `silent` in `xgb.DMatrix()`
- `use_int_id` in `xgb.model.dt.tree()`
- Default value of the `save_period` parameter in `xgboost()` changed to NULL (consistent with `xgb.train()`).
## v0.6 (2016.07.29)
* Version 0.5 is skipped due to major improvements in the core

View File

@ -1,7 +1,7 @@
Package: xgboost
Type: Package
Title: Extreme Gradient Boosting
Version: 0.6-4
Version: 0.6.4.3
Date: 2017-01-04
Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>,
Michael Benesty <michael@benesty.fr>, Vadim Khotilovich <khotilovich@gmail.com>,

View File

@ -334,7 +334,7 @@ cb.early.stop <- function(stopping_rounds, maximize=FALSE,
env$stop_condition <- FALSE
if (!is.null(env$bst)) {
if (class(env$bst) != 'xgb.Booster')
if (!inherits(env$bst, 'xgb.Booster'))
stop("'bst' in the parent frame must be an 'xgb.Booster'")
if (!is.null(best_score <- xgb.attr(env$bst$handle, 'best_score'))) {
best_score <<- as.numeric(best_score)

View File

@ -120,22 +120,22 @@ check.custom.eval <- function(env = parent.frame()) {
}
# Update booster with dtrain for an iteration
xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) {
if (class(booster) != "xgb.Booster.handle") {
stop("first argument type must be xgb.Booster.handle")
# Update a booster handle for an iteration with dtrain data
xgb.iter.update <- function(booster_handle, dtrain, iter, obj = NULL) {
if (!identical(class(booster_handle), "xgb.Booster.handle")) {
stop("booster_handle must be of xgb.Booster.handle class")
}
if (class(dtrain) != "xgb.DMatrix") {
stop("second argument type must be xgb.DMatrix")
if (!inherits(dtrain, "xgb.DMatrix")) {
stop("dtrain must be of xgb.DMatrix class")
}
if (is.null(obj)) {
.Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain,
.Call("XGBoosterUpdateOneIter_R", booster_handle, as.integer(iter), dtrain,
PACKAGE = "xgboost")
} else {
pred <- predict(booster, dtrain)
pred <- predict(booster_handle, dtrain)
gpair <- obj(pred, dtrain)
.Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess, PACKAGE = "xgboost")
.Call("XGBoosterBoostOneIter_R", booster_handle, dtrain, gpair$grad, gpair$hess, PACKAGE = "xgboost")
}
return(TRUE)
}
@ -144,16 +144,16 @@ xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) {
# Evaluate one iteration.
# Returns a named vector of evaluation metrics
# with the names in a 'datasetname-metricname' format.
xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
if (class(booster) != "xgb.Booster.handle")
stop("first argument type must be xgb.Booster.handle")
xgb.iter.eval <- function(booster_handle, watchlist, iter, feval = NULL) {
if (!identical(class(booster_handle), "xgb.Booster.handle"))
stop("class of booster_handle must be xgb.Booster.handle")
if (length(watchlist) == 0)
return(NULL)
evnames <- names(watchlist)
if (is.null(feval)) {
msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist,
msg <- .Call("XGBoosterEvalOneIter_R", booster_handle, as.integer(iter), watchlist,
as.list(evnames), PACKAGE = "xgboost")
msg <- stri_split_regex(msg, '(\\s+|:|\\s+)')[[1]][-1]
res <- as.numeric(msg[c(FALSE,TRUE)]) # even indices are the values
@ -161,7 +161,7 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
} else {
res <- sapply(seq_along(watchlist), function(j) {
w <- watchlist[[j]]
preds <- predict(booster, w) # predict using all trees
preds <- predict(booster_handle, w) # predict using all trees
eval_res <- feval(preds, w)
out <- eval_res$value
names(out) <- paste0(evnames[j], "-", eval_res$metric)

View File

@ -1,18 +1,18 @@
# Construct an internal xgboost Booster and return a handle to it
# Construct an internal xgboost Booster and return a handle to it.
# internal utility function
xgb.Booster.handle <- function(params = list(), cachelist = list(), modelfile = NULL) {
if (typeof(cachelist) != "list" ||
any(sapply(cachelist, class) != 'xgb.DMatrix')) {
stop("xgb.Booster only accepts list of DMatrix as cachelist")
!all(sapply(cachelist, inherits, 'xgb.DMatrix'))) {
stop("cachelist must be a list of xgb.DMatrix objects")
}
handle <- .Call("XGBoosterCreate_R", cachelist, PACKAGE = "xgboost")
if (!is.null(modelfile)) {
if (typeof(modelfile) == "character") {
.Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE = "xgboost")
.Call("XGBoosterLoadModel_R", handle, modelfile[1], PACKAGE = "xgboost")
} else if (typeof(modelfile) == "raw") {
.Call("XGBoosterLoadModelFromRaw_R", handle, modelfile, PACKAGE = "xgboost")
} else if (class(modelfile) == "xgb.Booster") {
} else if (inherits(modelfile, "xgb.Booster")) {
bst <- xgb.Booster.complete(modelfile, saveraw = TRUE)
.Call("XGBoosterLoadModelFromRaw_R", handle, bst$raw, PACKAGE = "xgboost")
} else {
@ -37,7 +37,7 @@ xgb.handleToBooster <- function(handle, raw = NULL) {
# Check whether xgb.Booster.handle is null
# internal utility function
is.null.handle <- function(handle) {
if (class(handle) != "xgb.Booster.handle")
if (!identical(class(handle), "xgb.Booster.handle"))
stop("argument type must be xgb.Booster.handle")
if (is.null(handle) || .Call("XGCheckNullPtr_R", handle, PACKAGE = "xgboost"))
@ -78,8 +78,8 @@ xgb.get.handle <- function(object) {
#' its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods
#' should still work for such a model object since those methods would be using
#' \code{xgb.Booster.complete} internally. However, one might find it to be more efficient to call the
#' \code{xgb.Booster.complete} function once after loading a model as an R-object. That which would
#' prevent further reconstruction (potentially, multiple times) of an internal booster model.
#' \code{xgb.Booster.complete} function explicitely once after loading a model as an R-object.
#' That would prevent further repeated implicit reconstruction of an internal booster model.
#'
#' @return
#' An object of \code{xgb.Booster} class.
@ -94,13 +94,14 @@ xgb.get.handle <- function(object) {
#' bst1 <- readRDS("xgb.model.rds")
#' # the handle is invalid:
#' print(bst1$handle)
#'
#' bst1 <- xgb.Booster.complete(bst1)
#' # now the handle points to a valid internal booster model:
#' print(bst1$handle)
#'
#' @export
xgb.Booster.complete <- function(object, saveraw = TRUE) {
if (class(object) != "xgb.Booster")
if (!inherits(object, "xgb.Booster"))
stop("argument type must be xgb.Booster")
if (is.null.handle(object$handle)) {
@ -225,7 +226,7 @@ predict.xgb.Booster <- function(object, newdata, missing = NA,
outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE, reshape = FALSE, ...) {
object <- xgb.Booster.complete(object, saveraw = FALSE)
if (class(newdata) != "xgb.DMatrix")
if (!inherits(newdata, "xgb.DMatrix"))
newdata <- xgb.DMatrix(newdata, missing = missing)
if (is.null(ntreelimit))
ntreelimit <- NVL(object$best_ntreelimit, 0)
@ -442,8 +443,8 @@ xgb.attributes <- function(object) {
object
}
# Extract # of trees in a model
# TODO: either add a getter to C-interface, or simply set an 'ntree' attribute after each iteration
# Extract the number of trees in a model.
# TODO: either add a getter to C-interface, or simply set an 'ntree' attribute after each iteration.
# internal utility function
xgb.ntree <- function(bst) {
length(grep('^booster', xgb.dump(bst)))

View File

@ -1,14 +1,17 @@
#' Construct xgb.DMatrix object
#'
#' Contruct xgb.DMatrix object from dense matrix, sparse matrix
#' or local file (that was created previously by saving an \code{xgb.DMatrix}).
#' Construct xgb.DMatrix object from either a dense matrix, a sparse matrix, or a local file.
#' Supported input file formats are either a libsvm text file or a binary file that was created previously by
#' \code{\link{xgb.DMatrix.save}}).
#'
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
#' @param info a list of information of the xgb.DMatrix object
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#
#' @param ... other information to pass to \code{info}.
#' @param data a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object, or a character
#' string representing a filename.
#' @param info a named list of additional information to store in the \code{xgb.DMatrix} object.
#' See \code{\link{setinfo}} for the specific allowed kinds of
#' @param missing a float value to represents missing values in data (used only when input is a dense matrix).
#' It is useful when a 0 or some other extreme value represents missing values in data.
#' @param silent whether to suppress printing an informational message after loading from a file.
#' @param ... the \code{info} data could be passed directly as parameters, without creating an \code{info} list.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
@ -17,19 +20,19 @@
#' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
#' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
#' @export
xgb.DMatrix <- function(data, info = list(), missing = NA, ...) {
xgb.DMatrix <- function(data, info = list(), missing = NA, silent = FALSE, ...) {
cnames <- NULL
if (typeof(data) == "character") {
if (length(data) > 1)
stop("'data' has class 'character' and length ", length(data),
".\n 'data' accepts either a numeric matrix or a single filename.")
handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE),
handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(silent),
PACKAGE = "xgboost")
} else if (is.matrix(data)) {
handle <- .Call("XGDMatrixCreateFromMat_R", data, missing,
PACKAGE = "xgboost")
cnames <- colnames(data)
} else if (class(data) == "dgCMatrix") {
} else if (inherits(data, "dgCMatrix")) {
handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x, nrow(data),
PACKAGE = "xgboost")
cnames <- colnames(data)
@ -51,10 +54,9 @@ xgb.DMatrix <- function(data, info = list(), missing = NA, ...) {
# get dmatrix from data, label
# internal helper method
xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) {
inClass <- class(data)
if ("dgCMatrix" %in% inClass || "matrix" %in% inClass ) {
if (inherits(data, "dgCMatrix") || is.matrix(data)) {
if (is.null(label)) {
stop("xgboost: need label when data is a matrix")
stop("label must be provided when data is a matrix")
}
dtrain <- xgb.DMatrix(data, label = label, missing = missing)
if (!is.null(weight)){
@ -64,11 +66,11 @@ xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) {
if (!is.null(label)) {
warning("xgboost: label will be ignored.")
}
if (inClass == "character") {
dtrain <- xgb.DMatrix(data)
} else if (inClass == "xgb.DMatrix") {
if (is.character(data)) {
dtrain <- xgb.DMatrix(data[1])
} else if (inherits(data, "xgb.DMatrix")) {
dtrain <- data
} else if ("data.frame" %in% inClass) {
} else if (inherits(data, "data.frame")) {
stop("xgboost doesn't support data.frame as input. Convert it to matrix first.")
} else {
stop("xgboost: invalid input data")
@ -297,8 +299,8 @@ slice <- function(object, ...) UseMethod("slice")
#' @rdname slice.xgb.DMatrix
#' @export
slice.xgb.DMatrix <- function(object, idxset, ...) {
if (class(object) != "xgb.DMatrix") {
stop("slice: first argument dtrain must be xgb.DMatrix")
if (!inherits(object, "xgb.DMatrix")) {
stop("object must be xgb.DMatrix")
}
ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, PACKAGE = "xgboost")

View File

@ -15,9 +15,9 @@
xgb.DMatrix.save <- function(dmatrix, fname) {
if (typeof(fname) != "character")
stop("fname must be character")
if (class(dmatrix) != "xgb.DMatrix")
stop("the input data must be xgb.DMatrix")
if (!inherits(dmatrix, "xgb.DMatrix"))
stop("dmatrix must be xgb.DMatrix")
.Call("XGDMatrixSaveBinary_R", dmatrix, fname, 0L, PACKAGE = "xgboost")
.Call("XGDMatrixSaveBinary_R", dmatrix, fname[1], 0L, PACKAGE = "xgboost")
return(TRUE)
}

View File

@ -130,13 +130,13 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
# stop("Either 'eval_metric' or 'feval' must be provided for CV")
# Check the labels
if ( (class(data) == 'xgb.DMatrix' && is.null(getinfo(data, 'label'))) ||
(class(data) != 'xgb.DMatrix' && is.null(label)))
if ( (inherits(data, 'xgb.DMatrix') && is.null(getinfo(data, 'label'))) ||
(!inherits(data, 'xgb.DMatrix') && is.null(label)))
stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix")
# CV folds
if(!is.null(folds)) {
if(class(folds) != "list" || length(folds) < 2)
if(!is.list(folds) || length(folds) < 2)
stop("'folds' must be a list with 2 or more elements that are vectors of indices for each CV-fold")
nfold <- length(folds)
} else {

View File

@ -39,19 +39,19 @@
#' cat(xgb.dump(bst, with_stats = TRUE, dump_format='json'))
#'
#' @export
xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE,
xgb.dump <- function(model, fname = NULL, fmap = "", with_stats=FALSE,
dump_format = c("text", "json"), ...) {
check.deprecation(...)
dump_format <- match.arg(dump_format)
if (class(model) != "xgb.Booster")
if (!inherits(model, "xgb.Booster"))
stop("model: argument must be of type xgb.Booster")
if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1))
stop("fname: argument must be of type character (when provided)")
if (!(class(fmap) %in% c("character", "NULL") && length(fmap) <= 1))
stop("fmap: argument must be of type character (when provided)")
if (!(is.null(fname) || is.character(fname)))
stop("fname: argument must be a character string (when provided)")
if (!(is.null(fmap) || is.character(fmap)))
stop("fmap: argument must be a character string (when provided)")
model <- xgb.Booster.complete(model)
model_dump <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with_stats),
model_dump <- .Call("XGBoosterDumpModel_R", model$handle, NVL(fmap, "")[1], as.integer(with_stats),
as.character(dump_format), PACKAGE = "xgboost")
if (is.null(fname))
@ -65,7 +65,7 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE,
if (is.null(fname)) {
return(model_dump)
} else {
writeLines(model_dump, fname)
writeLines(model_dump, fname[1])
return(TRUE)
}
}

View File

@ -58,13 +58,13 @@ xgb.importance <- function(feature_names = NULL, model = NULL,
if (!(is.null(data) && is.null(label) && is.null(target)))
warning("xgb.importance: parameters 'data', 'label' and 'target' are deprecated")
if (class(model) != "xgb.Booster")
stop("Either 'model' has to be an object of class xgb.Booster")
if (!inherits(model, "xgb.Booster"))
stop("model: must be an object of class xgb.Booster")
if (is.null(feature_names) && !is.null(model$feature_names))
feature_names <- model$feature_names
if (!class(feature_names) %in% c("character", "NULL"))
if (!(is.null(feature_names) || is.character(feature_names)))
stop("feature_names: Has to be a character vector")
model_text_dump <- xgb.dump(model = model, with_stats = TRUE)
@ -76,6 +76,8 @@ xgb.importance <- function(feature_names = NULL, model = NULL,
as.numeric
if(is.null(feature_names))
feature_names <- seq(to = length(weights))
if (length(feature_names) != length(weights))
stop("feature_names has less elements than there are features used in the model")
result <- data.table(Feature = feature_names, Weight = weights)[order(-abs(Weight))]
} else {
# tree model

View File

@ -14,6 +14,8 @@
#' It could be useful, e.g., in multiclass classification to get only
#' the trees of one certain class. IMPORTANT: the tree index in xgboost models
#' is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).
#' @param use_int_id a logical flag indicating whether nodes in columns "Yes", "No", "Missing" should be
#' represented as integers (when FALSE) or as "Tree-Node" character strings (when FALSE).
#' @param ... currently not used.
#'
#' @return
@ -22,9 +24,9 @@
#' The columns of the \code{data.table} are:
#'
#' \itemize{
#' \item \code{Tree}: ID of a tree in a model (integer)
#' \item \code{Node}: integer ID of a node in a tree (integer)
#' \item \code{ID}: identifier of a node in a model (character)
#' \item \code{Tree}: integer ID of a tree in a model (zero-based index)
#' \item \code{Node}: integer ID of a node in a tree (zero-based index)
#' \item \code{ID}: character identifier of a node in a model (only when \code{use_int_id=FALSE})
#' \item \code{Feature}: for a branch node, it's a feature id or name (when available);
#' for a leaf note, it simply labels it as \code{'Leaf'}
#' \item \code{Split}: location of the split for a branch node (split condition is always "less than")
@ -36,6 +38,10 @@
#' or collected by a leaf during training.
#' }
#'
#' When \code{use_int_id=FALSE}, columns "Yes", "No", and "Missing" point to model-wide node identifiers
#' in the "ID" column. When \code{use_int_id=TRUE}, those columns point to node identifiers from
#' the corresponding trees in the "Node" column.
#'
#' @examples
#' # Basic use:
#'
@ -45,8 +51,9 @@
#' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
#'
#' (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))
#' # This bst has feature_names stored in it, so those would be used when
#' # the feature_names parameter is not provided:
#'
#' # This bst model already has feature_names stored with it, so those would be used when
#' # feature_names is not set:
#' (dt <- xgb.model.dt.tree(model = bst))
#'
#' # How to match feature names of splits that are following a current 'Yes' branch:
@ -55,24 +62,24 @@
#'
#' @export
xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
trees = NULL, ...){
trees = NULL, use_int_id = FALSE, ...){
check.deprecation(...)
if (class(model) != "xgb.Booster" & class(text) != "character") {
stop("Either 'model' has to be an object of class xgb.Booster\n",
" or 'text' has to be a character vector with the result of xgb.dump\n",
" (or NULL if the model was provided).")
if (!inherits(model, "xgb.Booster") & !is.character(text)) {
stop("Either 'model' must be an object of class xgb.Booster\n",
" or 'text' must be a character vector with the result of xgb.dump\n",
" (or NULL if 'model' was provided).")
}
if (is.null(feature_names) && !is.null(model) && !is.null(model$feature_names))
feature_names <- model$feature_names
if (!class(feature_names) %in% c("character", "NULL")) {
stop("feature_names: Has to be a character vector")
if (!(is.null(feature_names) || is.character(feature_names))) {
stop("feature_names: must be a character vector")
}
if (!class(trees) %in% c("integer", "numeric", "NULL")) {
stop("trees: Has to be a vector of integers.")
if (!(is.null(trees) || is.numeric(trees))) {
stop("trees: must be a vector of integers.")
}
if (is.null(text)){
@ -86,7 +93,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
position <- which(!is.na(stri_match_first_regex(text, "booster")))
add.tree.id <- function(x, i) paste(i, x, sep = "-")
add.tree.id <- function(node, tree) if (use_int_id) node else paste(tree, node, sep = "-")
anynumber_regex <- "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?"
@ -102,32 +109,43 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
td <- td[Tree %in% trees & !grepl('^booster', t)]
td[, Node := stri_match_first_regex(t, "(\\d+):")[,2] %>% as.integer ]
td[, ID := add.tree.id(Node, Tree)]
if (!use_int_id) td[, ID := add.tree.id(Node, Tree)]
td[, isLeaf := !is.na(stri_match_first_regex(t, "leaf"))]
# parse branch lines
td[isLeaf==FALSE, c("Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover") := {
rx <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
branch_rx <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
"gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
branch_cols <- c("Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover")
td[isLeaf == FALSE,
(branch_cols) := {
# skip some indices with spurious capture groups from anynumber_regex
xtr <- stri_match_first_regex(t, rx)[, c(2,3,5,6,7,8,10)]
xtr <- stri_match_first_regex(t, branch_rx)[, c(2,3,5,6,7,8,10), drop = FALSE]
xtr[, 3:5] <- add.tree.id(xtr[, 3:5], Tree)
lapply(1:ncol(xtr), function(i) xtr[,i])
}]
# assign feature_names when available
td[isLeaf==FALSE & !is.null(feature_names),
Feature := feature_names[as.numeric(Feature) + 1] ]
if (!is.null(feature_names)) {
if (length(feature_names) <= max(as.numeric(td$Feature), na.rm = TRUE))
stop("feature_names has less elements than there are features used in the model")
td[isLeaf == FALSE, Feature := feature_names[as.numeric(Feature) + 1] ]
}
# parse leaf lines
td[isLeaf==TRUE, c("Feature", "Quality", "Cover") := {
rx <- paste0("leaf=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
xtr <- stri_match_first_regex(t, rx)[, c(2,4)]
leaf_rx <- paste0("leaf=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
leaf_cols <- c("Feature", "Quality", "Cover")
td[isLeaf == TRUE,
(leaf_cols) := {
xtr <- stri_match_first_regex(t, leaf_rx)[, c(2,4)]
c("Leaf", lapply(1:ncol(xtr), function(i) xtr[,i]))
}]
# convert some columns to numeric
numeric_cols <- c("Split", "Quality", "Cover")
td[, (numeric_cols) := lapply(.SD, as.numeric), .SDcols = numeric_cols]
if (use_int_id) {
int_cols <- c("Yes", "No", "Missing")
td[, (int_cols) := lapply(.SD, as.integer), .SDcols = int_cols]
}
td[, t := NULL]
td[, isLeaf := NULL]

View File

@ -63,7 +63,7 @@
xgb.plot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.depth", "med.weight"),
plot = TRUE, ...) {
if (!(class(model) == "xgb.Booster" || is.data.table(model)))
if (!(inherits(model, "xgb.Booster") || is.data.table(model)))
stop("model: Has to be either an xgb.Booster model generaged by the xgb.train function\n",
"or a data.table result of the xgb.importance function")
@ -73,7 +73,7 @@ xgb.plot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.d
which <- match.arg(which)
dt_tree <- model
if (class(model) == "xgb.Booster")
if (inherits(model, "xgb.Booster"))
dt_tree <- xgb.model.dt.tree(model = model)
if (!all(c("Feature", "Tree", "ID", "Yes", "No", "Cover") %in% colnames(dt_tree)))

View File

@ -61,8 +61,8 @@
xgb.plot.importance <- function(importance_matrix = NULL, top_n = NULL, measure = NULL,
rel_to_first = FALSE, left_margin = 10, cex = NULL, plot = TRUE, ...) {
check.deprecation(...)
if (!"data.table" %in% class(importance_matrix)) {
stop("importance_matrix: Should be a data.table.")
if (!is.data.table(importance_matrix)) {
stop("importance_matrix: must be a data.table")
}
imp_names <- colnames(importance_matrix)

View File

@ -11,7 +11,7 @@
#' @param plot_width the width of the diagram in pixels.
#' @param plot_height the height of the diagram in pixels.
#' @param render a logical flag for whether the graph should be rendered (see Value).
#' @param show_node_id a logical flag for whether to include node id's in the graph.
#' @param show_node_id a logical flag for whether to show node id's in the graph.
#' @param ... currently not used.
#'
#' @details
@ -53,17 +53,25 @@
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
#' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
#' # plot all the trees
#' xgb.plot.tree(feature_names = colnames(agaricus.train$data), model = bst)
#' # plot only the first tree and include the node ID:
#' xgb.plot.tree(feature_names = colnames(agaricus.train$data), model = bst,
#' trees = 0, show_node_id = TRUE)
#' xgb.plot.tree(model = bst)
#' # plot only the first tree and display the node ID:
#' xgb.plot.tree(model = bst, trees = 0, show_node_id = TRUE)
#'
#' \dontrun{
#' # Below is an example of how to save this plot to a file.
#' # Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed.
#' library(DiagrammeR)
#' gr <- xgb.plot.tree(model=bst, trees=0:1, render=FALSE)
#' export_graph(gr, 'tree.pdf', width=1500, height=1900)
#' export_graph(gr, 'tree.png', width=1500, height=1900)
#' }
#'
#' @export
xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot_width = NULL, plot_height = NULL,
render = TRUE, show_node_id = FALSE, ...){
check.deprecation(...)
if (class(model) != "xgb.Booster") {
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
if (!inherits(model, "xgb.Booster")) {
stop("model: Has to be an object of class xgb.Booster")
}
if (!requireNamespace("DiagrammeR", quietly = TRUE)) {

View File

@ -32,10 +32,11 @@
xgb.save <- function(model, fname) {
if (typeof(fname) != "character")
stop("fname must be character")
if (class(model) != "xgb.Booster")
stop("the input must be xgb.Booster. Use xgb.DMatrix.save to save xgb.DMatrix object.")
if (!inherits(model, "xgb.Booster")) {
stop("model must be xgb.Booster.",
if (inherits(model, "xgb.DMatrix")) " Use xgb.DMatrix.save to save an xgb.DMatrix object." else "")
}
model <- xgb.Booster.complete(model, saveraw = FALSE)
.Call("XGBoosterSaveModel_R", model$handle, fname, PACKAGE = "xgboost")
.Call("XGBoosterSaveModel_R", model$handle, fname[1], PACKAGE = "xgboost")
return(TRUE)
}

View File

@ -247,11 +247,11 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
# data & watchlist checks
dtrain <- data
if (class(dtrain) != "xgb.DMatrix")
if (!inherits(dtrain, "xgb.DMatrix"))
stop("second argument dtrain must be xgb.DMatrix")
if (length(watchlist) > 0) {
if (typeof(watchlist) != "list" ||
!all(sapply(watchlist, class) == "xgb.DMatrix"))
!all(sapply(watchlist, inherits, 'xgb.DMatrix')))
stop("watchlist must be a list of xgb.DMatrix elements")
evnames <- names(watchlist)
if (is.null(evnames) || any(evnames == ""))
@ -343,7 +343,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
if (length(evaluation_log) > 0 &&
nrow(evaluation_log) > 0) {
# include the previous compatible history when available
if (class(xgb_model) == 'xgb.Booster' &&
if (inherits(xgb_model, 'xgb.Booster') &&
!is_update &&
!is.null(xgb_model$evaluation_log) &&
all.equal(colnames(evaluation_log),

View File

@ -1,4 +1,4 @@
# Simple interface for training an xgboost model that wraps \code{xgb.train}
# Simple interface for training an xgboost model that wraps \code{xgb.train}.
# Its documentation is combined with xgb.train.
#
#' @rdname xgb.train
@ -7,7 +7,7 @@ xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL,
params = list(), nrounds,
verbose = 1, print_every_n = 1L,
early_stopping_rounds = NULL, maximize = NULL,
save_period = 0, save_name = "xgboost.model",
save_period = NULL, save_name = "xgboost.model",
xgb_model = NULL, callbacks = list(), ...) {
dtrain <- xgb.get.DMatrix(data, label, missing, weight)

View File

@ -19,49 +19,6 @@ We are [on CRAN](https://cran.r-project.org/web/packages/xgboost/index.html) now
install.packages('xgboost')
```
You can also install from our weekly updated drat repo:
```r
install.packages("xgboost", repos=c("http://dmlc.ml/drat/", getOption("repos")), type="source")
```
***Important*** Due to the usage of submodule, `install_github` is no longer support to install the
latest version of R package.
For up-to-date version, please install from github.
Windows users will need to install [RTools](https://cran.r-project.org/bin/windows/Rtools/) first. They also need to download [MinGW-W64](http://iweb.dl.sourceforge.net/project/mingw-w64/Toolchains%20targetting%20Win32/Personal%20Builds/mingw-builds/installer/mingw-w64-install.exe) using x86_64 architecture during installation.
Run the following command to add MinGW to PATH in Windows if not already added.
```cmd
PATH %PATH%;C:\Program Files\mingw-w64\x86_64-5.3.0-posix-seh-rt_v4-rev0\mingw64\bin
```
To compile xgboost at the root of your storage, run the following bash script.
```bash
git clone --recursive https://github.com/dmlc/xgboost
cd xgboost
git submodule init
git submodule update
alias make='mingw32-make'
cd dmlc-core
make -j4
cd ../rabit
make lib/librabit_empty.a -j4
cd ..
cp make/mingw64.mk config.mk
make -j4
```
Run the following R script to install xgboost package from the root directory.
```r
install.packages('devtools') # if not installed
setwd('C:/xgboost/')
library(devtools)
install('R-package')
```
For more detailed installation instructions, please see [here](http://xgboost.readthedocs.org/en/latest/build.html#r-package-installation).
Examples
@ -69,3 +26,8 @@ Examples
* Please visit [walk through example](demo).
* See also the [example scripts](../demo/kaggle-higgs) for Kaggle Higgs Challenge, including [speedtest script](../demo/kaggle-higgs/speedtest.R) on this dataset and the one related to [Otto challenge](../demo/kaggle-otto), including a [RMarkdown documentation](../demo/kaggle-otto/understandingXGBoostModel.Rmd).
Development
-----------
* See the [R Package section](https://xgboost.readthedocs.io/en/latest/how_to/contribute.html#r-package) of the contributiors guide.

View File

@ -28,8 +28,8 @@ E.g., when an \code{xgb.Booster} model is saved as an R object and then is loade
its handle (pointer) to an internal xgboost model would be invalid. The majority of xgboost methods
should still work for such a model object since those methods would be using
\code{xgb.Booster.complete} internally. However, one might find it to be more efficient to call the
\code{xgb.Booster.complete} function once after loading a model as an R-object. That which would
prevent further reconstruction (potentially, multiple times) of an internal booster model.
\code{xgb.Booster.complete} function explicitely once after loading a model as an R-object.
That would prevent further repeated implicit reconstruction of an internal booster model.
}
\examples{
@ -41,6 +41,7 @@ saveRDS(bst, "xgb.model.rds")
bst1 <- readRDS("xgb.model.rds")
# the handle is invalid:
print(bst1$handle)
bst1 <- xgb.Booster.complete(bst1)
# now the handle points to a valid internal booster model:
print(bst1$handle)

View File

@ -2,23 +2,28 @@
% Please edit documentation in R/xgb.DMatrix.R
\name{xgb.DMatrix}
\alias{xgb.DMatrix}
\title{Contruct xgb.DMatrix object}
\title{Construct xgb.DMatrix object}
\usage{
xgb.DMatrix(data, info = list(), missing = NA, ...)
xgb.DMatrix(data, info = list(), missing = NA, silent = FALSE, ...)
}
\arguments{
\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename}
\item{data}{a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object, or a character
string representing a filename.}
\item{info}{a list of information of the xgb.DMatrix object}
\item{info}{a named list of additional information to store in the \code{xgb.DMatrix} object.
See \code{\link{setinfo}} for the specific allowed kinds of}
\item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}
\item{missing}{a float value to represents missing values in data (used only when input is a dense matrix).
It is useful when a 0 or some other extreme value represents missing values in data.}
\item{...}{other information to pass to \code{info}.}
\item{silent}{whether to suppress printing an informational message after loading from a file.}
\item{...}{the \code{info} data could be passed directly as parameters, without creating an \code{info} list.}
}
\description{
Contruct xgb.DMatrix object from dense matrix, sparse matrix
or local file (that was created previously by saving an \code{xgb.DMatrix}).
Construct xgb.DMatrix object from either a dense matrix, a sparse matrix, or a local file.
Supported input file formats are either a libsvm text file or a binary file that was created previously by
\code{\link{xgb.DMatrix.save}}).
}
\examples{
data(agaricus.train, package='xgboost')

View File

@ -4,7 +4,7 @@
\alias{xgb.dump}
\title{Dump an xgboost model in text format.}
\usage{
xgb.dump(model = NULL, fname = NULL, fmap = "", with_stats = FALSE,
xgb.dump(model, fname = NULL, fmap = "", with_stats = FALSE,
dump_format = c("text", "json"), ...)
}
\arguments{

View File

@ -5,7 +5,7 @@
\title{Parse a boosted tree model text dump}
\usage{
xgb.model.dt.tree(feature_names = NULL, model = NULL, text = NULL,
trees = NULL, ...)
trees = NULL, use_int_id = FALSE, ...)
}
\arguments{
\item{feature_names}{character vector of feature names. If the model already
@ -24,6 +24,9 @@ It could be useful, e.g., in multiclass classification to get only
the trees of one certain class. IMPORTANT: the tree index in xgboost models
is zero-based (e.g., use \code{trees = 0:4} for first 5 trees).}
\item{use_int_id}{a logical flag indicating whether nodes in columns "Yes", "No", "Missing" should be
represented as integers (when FALSE) or as "Tree-Node" character strings (when FALSE).}
\item{...}{currently not used.}
}
\value{
@ -32,9 +35,9 @@ A \code{data.table} with detailed information about model trees' nodes.
The columns of the \code{data.table} are:
\itemize{
\item \code{Tree}: ID of a tree in a model (integer)
\item \code{Node}: integer ID of a node in a tree (integer)
\item \code{ID}: identifier of a node in a model (character)
\item \code{Tree}: integer ID of a tree in a model (zero-based index)
\item \code{Node}: integer ID of a node in a tree (zero-based index)
\item \code{ID}: character identifier of a node in a model (only when \code{use_int_id=FALSE})
\item \code{Feature}: for a branch node, it's a feature id or name (when available);
for a leaf note, it simply labels it as \code{'Leaf'}
\item \code{Split}: location of the split for a branch node (split condition is always "less than")
@ -45,6 +48,10 @@ The columns of the \code{data.table} are:
\item \code{Cover}: metric related to the number of observation either seen by a split
or collected by a leaf during training.
}
When \code{use_int_id=FALSE}, columns "Yes", "No", and "Missing" point to model-wide node identifiers
in the "ID" column. When \code{use_int_id=TRUE}, those columns point to node identifiers from
the corresponding trees in the "Node" column.
}
\description{
Parse a boosted tree model text dump into a \code{data.table} structure.
@ -58,8 +65,9 @@ bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_dep
eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
(dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))
# This bst has feature_names stored in it, so those would be used when
# the feature_names parameter is not provided:
# This bst model already has feature_names stored with it, so those would be used when
# feature_names is not set:
(dt <- xgb.model.dt.tree(model = bst))
# How to match feature names of splits that are following a current 'Yes' branch:

View File

@ -24,7 +24,7 @@ IMPORTANT: the tree index in xgboost model is zero-based
\item{render}{a logical flag for whether the graph should be rendered (see Value).}
\item{show_node_id}{a logical flag for whether to include node id's in the graph.}
\item{show_node_id}{a logical flag for whether to show node id's in the graph.}
\item{...}{currently not used.}
}
@ -68,9 +68,17 @@ data(agaricus.train, package='xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
# plot all the trees
xgb.plot.tree(feature_names = colnames(agaricus.train$data), model = bst)
# plot only the first tree and include the node ID:
xgb.plot.tree(feature_names = colnames(agaricus.train$data), model = bst,
trees = 0, show_node_id = TRUE)
xgb.plot.tree(model = bst)
# plot only the first tree and display the node ID:
xgb.plot.tree(model = bst, trees = 0, show_node_id = TRUE)
\dontrun{
# Below is an example of how to save this plot to a file.
# Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed.
library(DiagrammeR)
gr <- xgb.plot.tree(model=bst, trees=0:1, render=FALSE)
export_graph(gr, 'tree.pdf', width=1500, height=1900)
export_graph(gr, 'tree.png', width=1500, height=1900)
}
}

View File

@ -12,7 +12,7 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
xgboost(data = NULL, label = NULL, missing = NA, weight = NULL,
params = list(), nrounds, verbose = 1, print_every_n = 1L,
early_stopping_rounds = NULL, maximize = NULL, save_period = 0,
early_stopping_rounds = NULL, maximize = NULL, save_period = NULL,
save_name = "xgboost.model", xgb_model = NULL, callbacks = list(), ...)
}
\arguments{

View File

@ -68,12 +68,19 @@ SEXP XGDMatrixCreateFromMat_R(SEXP mat,
SEXP dim = getAttrib(mat, R_DimSymbol);
size_t nrow = static_cast<size_t>(INTEGER(dim)[0]);
size_t ncol = static_cast<size_t>(INTEGER(dim)[1]);
double *din = REAL(mat);
const bool is_int = TYPEOF(mat) == INTSXP;
double *din;
int *iin;
if (is_int) {
iin = INTEGER(mat);
} else {
din = REAL(mat);
}
std::vector<float> data(nrow * ncol);
#pragma omp parallel for schedule(static)
for (omp_ulong i = 0; i < nrow; ++i) {
for (size_t j = 0; j < ncol; ++j) {
data[i * ncol +j] = din[i + nrow * j];
data[i * ncol +j] = is_int ? static_cast<float>(iin[i + nrow * j]) : din[i + nrow * j];
}
}
DMatrixHandle handle;

View File

@ -189,3 +189,36 @@ test_that("xgb.cv works", {
expect_false(is.null(cv$callbacks))
expect_false(is.null(cv$call))
})
test_that("train and predict with non-strict classes", {
# standard dense matrix input
train_dense <- as.matrix(train$data)
bst <- xgboost(data = train_dense, label = train$label, max_depth = 2,
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
pr0 <- predict(bst, train_dense)
# dense matrix-like input of non-matrix class
class(train_dense) <- 'shmatrix'
expect_true(is.matrix(train_dense))
expect_error(
bst <- xgboost(data = train_dense, label = train$label, max_depth = 2,
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
, regexp = NA)
expect_error(pr <- predict(bst, train_dense), regexp = NA)
expect_equal(pr0, pr)
# dense matrix-like input of non-matrix class with some inheritance
class(train_dense) <- c('pphmatrix','shmatrix')
expect_true(is.matrix(train_dense))
expect_error(
bst <- xgboost(data = train_dense, label = train$label, max_depth = 2,
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
, regexp = NA)
expect_error(pr <- predict(bst, train_dense), regexp = NA)
expect_equal(pr0, pr)
# when someone inhertis from xgb.Booster, it should still be possible to use it as xgb.Booster
class(bst) <- c('super.Booster', 'xgb.Booster')
expect_error(pr <- predict(bst, train_dense), regexp = NA)
expect_equal(pr0, pr)
})

View File

@ -7,18 +7,30 @@ data(agaricus.test, package='xgboost')
test_data <- agaricus.test$data[1:100,]
test_label <- agaricus.test$label[1:100]
test_that("xgb.DMatrix: basic construction, saving, loading", {
test_that("xgb.DMatrix: basic construction", {
# from sparse matrix
dtest1 <- xgb.DMatrix(test_data, label=test_label)
# from dense matrix
dtest2 <- xgb.DMatrix(as.matrix(test_data), label=test_label)
expect_equal(getinfo(dtest1, 'label'), getinfo(dtest2, 'label'))
expect_equal(dim(dtest1), dim(dtest2))
#from dense integer matrix
int_data <- as.matrix(test_data)
storage.mode(int_data) <- "integer"
dtest3 <- xgb.DMatrix(int_data, label=test_label)
expect_equal(dim(dtest1), dim(dtest3))
})
test_that("xgb.DMatrix: saving, loading", {
# save to a local file
dtest1 <- xgb.DMatrix(test_data, label=test_label)
tmp_file <- tempfile('xgb.DMatrix_')
expect_true(xgb.DMatrix.save(dtest1, tmp_file))
# read from a local file
dtest3 <- xgb.DMatrix(tmp_file)
expect_output(dtest3 <- xgb.DMatrix(tmp_file), "entries loaded from")
expect_output(dtest3 <- xgb.DMatrix(tmp_file, silent = TRUE), NA)
unlink(tmp_file)
expect_equal(getinfo(dtest1, 'label'), getinfo(dtest3, 'label'))
@ -26,7 +38,7 @@ test_that("xgb.DMatrix: basic construction, saving, loading", {
tmp <- c("0 1:1 2:1","1 3:1","0 1:1")
tmp_file <- 'tmp.libsvm'
writeLines(tmp, tmp_file)
dtest4 <- xgb.DMatrix(tmp_file)
dtest4 <- xgb.DMatrix(tmp_file, silent = TRUE)
expect_equal(dim(dtest4), c(3, 4))
expect_equal(getinfo(dtest4, 'label'), c(0,1,0))
unlink(tmp_file)

View File

@ -133,6 +133,12 @@ test_that("xgb.model.dt.tree works with and without feature names", {
dt.tree.x <- xgb.model.dt.tree(model = bst.Tree.x)
expect_output(str(dt.tree.x), 'Feature.*\\"3\\"')
expect_equal(dt.tree[, -4, with=FALSE], dt.tree.x[, -4, with=FALSE])
# using integer node ID instead of character
dt.tree.int <- xgb.model.dt.tree(model = bst.Tree, use_int_id = TRUE)
expect_equal(as.integer(tstrsplit(dt.tree$Yes, '-')[[2]]), dt.tree.int$Yes)
expect_equal(as.integer(tstrsplit(dt.tree$No, '-')[[2]]), dt.tree.int$No)
expect_equal(as.integer(tstrsplit(dt.tree$Missing, '-')[[2]]), dt.tree.int$Missing)
})
test_that("xgb.model.dt.tree throws error for gblinear", {
@ -169,6 +175,17 @@ test_that("xgb.importance works with GLM model", {
xgb.ggplot.importance(importance.GLM)
})
test_that("xgb.model.dt.tree and xgb.importance work with a single split model", {
bst1 <- xgboost(data = sparse_matrix, label = label, max_depth = 1,
eta = 1, nthread = 2, nrounds = 1, verbose = 0,
objective = "binary:logistic")
expect_error(dt <- xgb.model.dt.tree(model = bst1), regexp = NA) # no error
expect_equal(nrow(dt), 3)
expect_error(imp <- xgb.importance(model = bst1), regexp = NA) # no error
expect_equal(nrow(imp), 1)
expect_equal(imp$Gain, 1)
})
test_that("xgb.plot.tree works with and without feature names", {
xgb.plot.tree(feature_names = feature.names, model = bst.Tree)
xgb.plot.tree(model = bst.Tree)

View File

@ -120,6 +120,7 @@ R Package
make rcpplint
```
- When needed, you can disable the linter warning of certain line with ```// NOLINT(*)``` comments.
- We use [roxygen](https://cran.r-project.org/web/packages/roxygen2/vignettes/roxygen2.html) for documenting the R package.
### Rmarkdown Vignettes
Rmarkdown vignettes are placed in [R-package/vignettes](../R-package/vignettes)
@ -143,3 +144,17 @@ make the-markdown-to-make.md
make html
```
The reason we do this is to avoid exploded repo size due to generated images sizes.
### R package versioning
Since version 0.6.4.3, we have adopted a versioning system that uses an ```x.y.z``` (or ```core_major.core_minor.cran_release```)
format for CRAN releases and an ```x.y.z.p``` (or ```core_major.core_minor.cran_release.patch```) format for development patch versions.
This approach is similar to the one described in Yihui Xie's
[blog post on R Package Versioning](https://yihui.name/en/2013/06/r-package-versioning/),
except we need an additional field to accomodate the ```x.y``` core library version.
Each new CRAN release bumps up the 3rd field, while developments in-between CRAN releases
would be marked by an additional 4th field on the top of an existing CRAN release version.
Some additional consideration is needed when the core library version changes.
E.g., after the core changes from 0.6 to 0.7, the R package development version would become 0.7.0.1, working towards
a 0.7.1 CRAN release. The 0.7.0 would not be released to CRAN, unless it would require almost no additional development.