[R-package] GPL2 dependency reduction and some fixes (#1401)

* [R] do not remove zero coefficients from gblinear dump

* [R] switch from stringr to stringi

* fix #1399

* [R] separate ggplot backend, add base r graphics, cleanup, more plots, tests

* add missing include in amalgamation - fixes building R package in linux

* add forgotten file

* [R] fix DESCRIPTION

* [R] fix travis check issue and some cleanup
This commit is contained in:
Vadim Khotilovich 2016-07-27 02:05:04 -05:00 committed by Tong He
parent f6423056c0
commit d5c143367d
19 changed files with 548 additions and 312 deletions

View File

@ -35,5 +35,5 @@ Imports:
methods,
data.table (>= 1.9.6),
magrittr (>= 1.5),
stringr (>= 0.6.2)
stringi (>= 0.5.2)
RoxygenNote: 5.0.1

View File

@ -31,6 +31,8 @@ export(xgb.attributes)
export(xgb.create.features)
export(xgb.cv)
export(xgb.dump)
export(xgb.ggplot.deepness)
export(xgb.ggplot.importance)
export(xgb.importance)
export(xgb.load)
export(xgb.model.dt.tree)
@ -53,15 +55,16 @@ importFrom(data.table,":=")
importFrom(data.table,as.data.table)
importFrom(data.table,data.table)
importFrom(data.table,rbindlist)
importFrom(data.table,setkey)
importFrom(data.table,setkeyv)
importFrom(data.table,setnames)
importFrom(magrittr,"%>%")
importFrom(stats,predict)
importFrom(stringr,str_detect)
importFrom(stringr,str_extract)
importFrom(stringr,str_match)
importFrom(stringr,str_replace)
importFrom(stringr,str_replace_all)
importFrom(stringr,str_split)
importFrom(stringi,stri_detect_regex)
importFrom(stringi,stri_match_first_regex)
importFrom(stringi,stri_replace_all_regex)
importFrom(stringi,stri_replace_first_regex)
importFrom(stringi,stri_split_regex)
importFrom(utils,object.size)
importFrom(utils,str)
importFrom(utils,tail)

View File

@ -482,9 +482,12 @@ cb.cv.predict <- function(save_models = FALSE) {
stop("'cb.cv.predict' callback requires 'basket' and 'bst_folds' lists in its calling frame")
N <- nrow(env$data)
pred <- ifelse(env$num_class > 1,
matrix(NA_real_, N, env$num_class),
rep(NA_real_, N))
pred <-
if (env$num_class > 1) {
matrix(NA_real_, N, env$num_class)
} else {
rep(NA_real_, N)
}
ntreelimit <- NVL(env$basket$best_ntreelimit,
env$end_iteration * env$num_parallel_tree)

View File

@ -146,7 +146,7 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
if (is.null(feval)) {
msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist,
as.list(evnames), PACKAGE = "xgboost")
msg <- str_split(msg, '(\\s+|:|\\s+)')[[1]][-1]
msg <- stri_split_regex(msg, '(\\s+|:|\\s+)')[[1]][-1]
res <- as.numeric(msg[c(FALSE,TRUE)]) # even indices are the values
names(res) <- msg[c(TRUE,FALSE)] # odds are the names
} else {

View File

@ -45,10 +45,10 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, ..
model_dump <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with_stats), PACKAGE = "xgboost")
if (is.null(fname))
model_dump <- str_replace_all(model_dump, '\t', '')
model_dump <- stri_replace_all_regex(model_dump, '\t', '')
model_dump <- unlist(str_split(model_dump, '\n'))
model_dump <- grep('(^$|^0$)', model_dump, invert = TRUE, value = TRUE)
model_dump <- unlist(stri_split_regex(model_dump, '\n'))
model_dump <- grep('^\\s*$', model_dump, invert = TRUE, value = TRUE)
if (is.null(fname)) {
return(model_dump)

135
R-package/R/xgb.ggplot.R Normal file
View File

@ -0,0 +1,135 @@
# ggplot backend for the xgboost plotting facilities
#' @rdname xgb.plot.importance
#' @export
xgb.ggplot.importance <- function(importance_matrix = NULL, top_n = NULL, measure = NULL,
rel_to_first = FALSE, n_clusters = c(1:10), ...) {
importance_matrix <- xgb.plot.importance(importance_matrix, top_n = top_n, measure = measure,
rel_to_first = rel_to_first, plot = FALSE, ...)
if (!requireNamespace("ggplot2", quietly = TRUE)) {
stop("ggplot2 package is required", call. = FALSE)
}
if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) {
stop("Ckmeans.1d.dp package is required", call. = FALSE)
}
clusters <- suppressWarnings(
Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix$Importance, n_clusters)
)
importance_matrix[, Cluster := as.character(clusters$cluster)]
plot <-
ggplot2::ggplot(importance_matrix,
ggplot2::aes(x = factor(Feature, levels = rev(Feature)), y = Importance, width = 0.05),
environment = environment()) +
ggplot2::geom_bar(ggplot2::aes(fill = Cluster), stat = "identity", position = "identity") +
ggplot2::coord_flip() +
ggplot2::xlab("Features") +
ggplot2::ggtitle("Feature importance") +
ggplot2::theme(plot.title = ggplot2::element_text(lineheight = .9, face = "bold"),
panel.grid.major.y = ggplot2::element_blank())
return(plot)
}
#' @rdname xgb.plot.deepness
#' @export
xgb.ggplot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.depth", "med.weight")) {
if (!requireNamespace("ggplot2", quietly = TRUE))
stop("ggplot2 package is required for plotting the graph deepness.", call. = FALSE)
which <- match.arg(which)
dt_depths <- xgb.plot.deepness(model = model, plot = FALSE)
dt_summaries <- dt_depths[, .(.N, Cover = mean(Cover)), Depth]
setkey(dt_summaries, 'Depth')
if (which == "2x1") {
p1 <-
ggplot2::ggplot(dt_summaries) +
ggplot2::geom_bar(ggplot2::aes(x = Depth, y = N), stat = "Identity") +
ggplot2::xlab("") +
ggplot2::ylab("Number of leafs") +
ggplot2::ggtitle("Model complexity") +
ggplot2::theme(
plot.title = ggplot2::element_text(lineheight = 0.9, face = "bold"),
panel.grid.major.y = ggplot2::element_blank(),
axis.ticks = ggplot2::element_blank(),
axis.text.x = ggplot2::element_blank()
)
p2 <-
ggplot2::ggplot(dt_summaries) +
ggplot2::geom_bar(ggplot2::aes(x = Depth, y = Cover), stat = "Identity") +
ggplot2::xlab("Leaf depth") +
ggplot2::ylab("Weighted cover")
multiplot(p1, p2, cols = 1)
return(invisible(list(p1, p2)))
} else if (which == "max.depth") {
p <-
ggplot2::ggplot(dt_depths[, max(Depth), Tree]) +
ggplot2::geom_jitter(ggplot2::aes(x = Tree, y = V1),
height = 0.15, alpha=0.4, size=3, stroke=0) +
ggplot2::xlab("tree #") +
ggplot2::ylab("Max tree leaf depth")
return(p)
} else if (which == "med.depth") {
p <-
ggplot2::ggplot(dt_depths[, median(as.numeric(Depth)), Tree]) +
ggplot2::geom_jitter(ggplot2::aes(x = Tree, y = V1),
height = 0.15, alpha=0.4, size=3, stroke=0) +
ggplot2::xlab("tree #") +
ggplot2::ylab("Median tree leaf depth")
return(p)
} else if (which == "med.weight") {
p <-
ggplot2::ggplot(dt_depths[, median(abs(Weight)), Tree]) +
ggplot2::geom_point(ggplot2::aes(x = Tree, y = V1),
alpha=0.4, size=3, stroke=0) +
ggplot2::xlab("tree #") +
ggplot2::ylab("Median absolute leaf weight")
return(p)
}
}
# Plot multiple ggplot graph aligned by rows and columns.
# ... the plots
# cols number of columns
# internal utility function
multiplot <- function(..., cols = 1) {
plots <- list(...)
num_plots = length(plots)
layout <- matrix(seq(1, cols * ceiling(num_plots / cols)),
ncol = cols, nrow = ceiling(num_plots / cols))
if (num_plots == 1) {
print(plots[[1]])
} else {
grid::grid.newpage()
grid::pushViewport(grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout))))
for (i in 1:num_plots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.table(which(layout == i, arr.ind = TRUE))
print(
plots[[i]], vp = grid::viewport(
layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col
)
)
}
}
}
globalVariables(c(
"Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme",
"element_blank", "element_text"
))

View File

@ -69,7 +69,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
text <- xgb.dump(model = model, with_stats = T)
}
position <- which(!is.na(str_match(text, "booster")))
position <- which(!is.na(stri_match_first_regex(text, "booster")))
add.tree.id <- function(x, i) paste(i, x, sep = "-")
@ -82,16 +82,16 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
n_first_tree <- min(max(td$Tree), n_first_tree)
td <- td[Tree <= n_first_tree & !grepl('^booster', t)]
td[, Node := str_match(t, "(\\d+):")[,2] %>% as.numeric ]
td[, Node := stri_match_first_regex(t, "(\\d+):")[,2] %>% as.numeric ]
td[, ID := add.tree.id(Node, Tree)]
td[, isLeaf := !is.na(str_match(t, "leaf"))]
td[, isLeaf := !is.na(stri_match_first_regex(t, "leaf"))]
# parse branch lines
td[isLeaf==FALSE, c("Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover") := {
rx <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
"gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
# skip some indices with spurious capture groups from anynumber_regex
xtr <- str_match(t, rx)[, c(2,3,5,6,7,8,10)]
xtr <- stri_match_first_regex(t, rx)[, c(2,3,5,6,7,8,10)]
xtr[, 3:5] <- add.tree.id(xtr[, 3:5], Tree)
lapply(1:ncol(xtr), function(i) xtr[,i])
}]
@ -102,7 +102,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
# parse leaf lines
td[isLeaf==TRUE, c("Feature", "Quality", "Cover") := {
rx <- paste0("leaf=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
xtr <- str_match(t, rx)[, c(2,4)]
xtr <- stri_match_first_regex(t, rx)[, c(2,4)]
c("Leaf", lapply(1:ncol(xtr), function(i) xtr[,i]))
}]

View File

@ -1,148 +1,142 @@
#' Plot multiple graphs at the same time
#'
#' Plot multiple graph aligned by rows and columns.
#'
#' @param ... the plots
#' @param cols number of columns
#' @return NULL
multiplot <- function(..., cols = 1) {
plots <- list(...)
numPlots = length(plots)
layout <- matrix(seq(1, cols * ceiling(numPlots / cols)),
ncol = cols, nrow = ceiling(numPlots / cols))
if (numPlots == 1) {
print(plots[[1]])
} else {
grid::grid.newpage()
grid::pushViewport(grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout))))
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.table(which(layout == i, arr.ind = TRUE))
print(
plots[[i]], vp = grid::viewport(
layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col
)
)
}
}
}
#' Parse the graph to extract vector of edges
#' @param element igraph object containing the path from the root to the leaf.
edge.parser <- function(element) {
edges.vector <- igraph::as_ids(element)
t <- tail(edges.vector, n = 1)
l <- length(edges.vector)
list(t,l)
}
#' Extract path from root to leaf from data.table
#' @param dt_tree data.table containing the nodes and edges of the trees
get.paths.to.leaf <- function(dt_tree) {
dt.not.leaf.edges <-
dt_tree[Feature != "Leaf",.(ID, Yes, Tree)] %>% list(dt_tree[Feature != "Leaf",.(ID, No, Tree)]) %>% rbindlist(use.names = F)
trees <- dt_tree[,unique(Tree)]
paths <- list()
for (tree in trees) {
graph <-
igraph::graph_from_data_frame(dt.not.leaf.edges[Tree == tree])
paths.tmp <-
igraph::shortest_paths(graph, from = paste0(tree, "-0"), to = dt_tree[Tree == tree &
Feature == "Leaf", c(ID)])
paths <- c(paths, paths.tmp$vpath)
}
paths
}
#' Plot model trees deepness
#'
#' Generate a graph to plot the distribution of deepness among trees.
#'
#' @param model dump generated by the \code{xgb.train} function.
#'
#' @return Two graphs showing the distribution of the model deepness.
#'
#' Visualizes distributions related to depth of tree leafs.
#' \code{xgb.plot.deepness} uses base R graphics, while \code{xgb.ggplot.deepness} uses the ggplot backend.
#'
#' @param model either an \code{xgb.Booster} model generated by the \code{xgb.train} function
#' or a data.table result of the \code{xgb.model.dt.tree} function.
#' @param plot (base R barplot) whether a barplot should be produced.
#' If FALSE, only a data.table is returned.
#' @param which which distribution to plot (see details).
#' @param ... other parameters passed to \code{barplot} or \code{plot}.
#'
#' @details
#' Display both the number of \code{leaf} and the distribution of \code{weighted observations}
#' by tree deepness level.
#'
#' The purpose of this function is to help the user to find the best trade-off to set
#' the \code{max_depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.
#'
#' See \link{xgb.train} for more information about these parameters.
#'
#' The graph is made of two parts:
#'
#' When \code{which="2x1"}, two distributions with respect to the leaf depth
#' are plotted on top of each other:
#' \itemize{
#' \item Count: number of leaf per level of deepness;
#' \item Weighted cover: noramlized weighted cover per leaf (weighted number of instances).
#' \item the distribution of the number of leafs in a tree model at a certain depth;
#' \item the distribution of average weighted number of observations ("cover")
#' ending up in leafs at certain depth.
#' }
#' Those could be helpful in determining sensible ranges of the \code{max_depth}
#' and \code{min_child_weight} parameters.
#'
#' When \code{which="max.depth"} or \code{which="med.depth"}, plots of either maximum or median depth
#' per tree with respect to tree number are created. And \code{which="med.weight"} allows to see how
#' a tree's median absolute leaf weight changes through the iterations.
#'
#' This function is inspired by the blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
#' This function was inspired by the blog post
#' \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}.
#'
#' @return
#'
#' Other than producing plots (when \code{plot=TRUE}), the \code{xgb.plot.deepness} function
#' silently returns a processed data.table where each row corresponds to a terminal leaf in a tree model,
#' and contains information about leaf's depth, cover, and weight (which is used in calculating predictions).
#'
#' The \code{xgb.ggplot.deepness} silently returns either a list of two ggplot graphs when \code{which="2x1"}
#' or a single ggplot graph for the other \code{which} options.
#'
#' @seealso
#'
#' \code{\link{xgb.train}}, \code{\link{xgb.model.dt.tree}}.
#'
#' @examples
#'
#' data(agaricus.train, package='xgboost')
#'
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
#' eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
#' min_child_weight = 50)
#' eta = 0.1, nthread = 2, nrounds = 50, objective = "binary:logistic",
#' subsample = 0.5, min_child_weight = 2)
#'
#' xgb.plot.deepness(model = bst)
#' xgb.plot.deepness(bst)
#' xgb.ggplot.deepness(bst)
#'
#' xgb.plot.deepness(bst, which='max.depth', pch=16, col=rgb(0,0,1,0.3), cex=2)
#'
#' xgb.plot.deepness(bst, which='med.weight', pch=16, col=rgb(0,0,1,0.3), cex=2)
#'
#' @rdname xgb.plot.deepness
#' @export
xgb.plot.deepness <- function(model = NULL) {
if (!requireNamespace("ggplot2", quietly = TRUE)) {
stop("ggplot2 package is required for plotting the graph deepness.",
call. = FALSE)
xgb.plot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.depth", "med.weight"),
plot = TRUE, ...) {
if (!(class(model) == "xgb.Booster" || is.data.table(model)))
stop("model: Has to be either an xgb.Booster model generaged by the xgb.train function\n",
"or a data.table result of the xgb.importance function")
if (!requireNamespace("igraph", quietly = TRUE))
stop("igraph package is required for plotting the graph deepness.", call. = FALSE)
which <- match.arg(which)
dt_tree <- model
if (class(model) == "xgb.Booster")
dt_tree <- xgb.model.dt.tree(model = model)
if (!all(c("Feature", "Tree", "ID", "Yes", "No", "Cover") %in% colnames(dt_tree)))
stop("Model tree columns are not as expected!\n",
" Note that this function works only for tree models.")
dt_depths <- merge(get.leaf.depth(dt_tree), dt_tree[, .(ID, Cover, Weight=Quality)], by = "ID")
setkeyv(dt_depths, c("Tree", "ID"))
# count by depth levels, and also calculate average cover at a depth
dt_summaries <- dt_depths[, .(.N, Cover = mean(Cover)), Depth]
setkey(dt_summaries, "Depth")
if (plot) {
if (which == "2x1") {
op <- par(no.readonly = TRUE)
par(mfrow=c(2,1),
oma = c(3,1,3,1) + 0.1,
mar = c(1,4,1,0) + 0.1)
dt_summaries[, barplot(N, border=NA, ylab = 'Number of leafs', ...)]
dt_summaries[, barplot(Cover, border=NA, ylab = "Weighted cover", names.arg=Depth, ...)]
title("Model complexity", xlab = "Leaf depth", outer = TRUE, line = 1)
par(op)
} else if (which == "max.depth") {
dt_depths[, max(Depth), Tree][
, plot(jitter(V1, amount = 0.1) ~ Tree, ylab = 'Max tree leaf depth', xlab = "tree #", ...)]
} else if (which == "med.depth") {
dt_depths[, median(as.numeric(Depth)), Tree][
, plot(jitter(V1, amount = 0.1) ~ Tree, ylab = 'Median tree leaf depth', xlab = "tree #", ...)]
} else if (which == "med.weight") {
dt_depths[, median(abs(Weight)), Tree][
, plot(V1 ~ Tree, ylab = 'Median absolute leaf weight', xlab = "tree #", ...)]
}
}
if (!requireNamespace("igraph", quietly = TRUE)) {
stop("igraph package is required for plotting the graph deepness.",
call. = FALSE)
}
if (!requireNamespace("grid", quietly = TRUE)) {
stop("grid package is required for plotting the graph deepness.",
call. = FALSE)
}
if (class(model) != "xgb.Booster") {
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
}
dt.tree <- xgb.model.dt.tree(model = model)
dt.edge.elements <- data.table()
paths <- get.paths.to.leaf(dt.tree)
dt.edge.elements <-
lapply(paths, edge.parser) %>% rbindlist %>% setnames(c("last.edge", "size")) %>%
merge(dt.tree, by.x = "last.edge", by.y = "ID") %>% rbind(dt.edge.elements)
dt.edge.summuize <-
dt.edge.elements[, .(.N, Cover = sum(Cover)), size][,Cover:= Cover / sum(Cover)]
p1 <-
ggplot2::ggplot(dt.edge.summuize) + ggplot2::geom_line(ggplot2::aes(x = size, y = N, group = 1)) +
ggplot2::xlab("") + ggplot2::ylab("Count") + ggplot2::ggtitle("Model complexity") +
ggplot2::theme(
plot.title = ggplot2::element_text(lineheight = 0.9, face = "bold"),
panel.grid.major.y = ggplot2::element_blank(),
axis.ticks = ggplot2::element_blank(),
axis.text.x = ggplot2::element_blank()
)
p2 <-
ggplot2::ggplot(dt.edge.summuize) + ggplot2::geom_line(ggplot2::aes(x =size, y = Cover, group = 1)) +
ggplot2::xlab("From root to leaf path length") + ggplot2::ylab("Weighted cover")
multiplot(p1,p2,cols = 1)
invisible(dt_depths)
}
# Extract path depths from root to leaf
# from data.table containing the nodes and edges of the trees.
# internal utility function
get.leaf.depth <- function(dt_tree) {
# extract tree graph's edges
dt_edges <- rbindlist(list(
dt_tree[Feature != "Leaf", .(ID, To=Yes, Tree)],
dt_tree[Feature != "Leaf", .(ID, To=No, Tree)]
))
# whether "To" is a leaf:
dt_edges <-
merge(dt_edges,
dt_tree[Feature == "Leaf", .(ID, Leaf = TRUE)],
all.x = TRUE, by.x = "To", by.y = "ID")
dt_edges[is.na(Leaf), Leaf := FALSE]
dt_edges[, {
graph <- igraph::graph_from_data_frame(.SD[,.(ID, To)])
# min(ID) in a tree is a root node
paths_tmp <- igraph::shortest_paths(graph, from = min(ID), to = To[Leaf == TRUE])
# list of paths to each leaf in a tree
paths <- lapply(paths_tmp$vpath, names)
# combine into a resulting path lengths table for a tree
data.table(Depth = sapply(paths, length), ID = To[Leaf == TRUE])
}, by = Tree]
}
# Avoid error messages during CRAN check.
@ -150,6 +144,6 @@ xgb.plot.deepness <- function(model = NULL) {
# They are mainly column names inferred by Data.table...
globalVariables(
c(
".N", "N", "size", "Feature", "Count", "ggplot", "aes", "geom_bar", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text", "ID", "Yes", "No", "Tree"
".N", "N", "Depth", "Quality", "Cover", "Tree", "ID", "Yes", "No", "Feature"
)
)

View File

@ -1,79 +1,125 @@
#' Plot feature importance bar graph
#' Plot feature importance as a bar graph
#'
#' Read a data.table containing feature importance details and plot it (for both GLM and Trees).
#' Represents previously calculated feature importance as a bar graph.
#' \code{xgb.plot.importance} uses base R graphics, while \code{xgb.ggplot.importance} uses the ggplot backend.
#'
#' @param importance_matrix a \code{data.table} returned by the \code{xgb.importance} function.
#' @param n_clusters a \code{numeric} vector containing the min and the max range of the possible number of clusters of bars.
#' @param ... currently not used
#'
#' @return A \code{ggplot2} bar graph representing each feature by a horizontal bar. Longer is the bar, more important is the feature. Features are classified by importance and clustered by importance. The group is represented through the color of the bar.
#' @param importance_matrix a \code{data.table} returned by \code{\link{xgb.importance}}.
#' @param top_n maximal number of top features to include into the plot.
#' @param measure the name of importance measure to plot.
#' When \code{NULL}, 'Gain' would be used for trees and 'Weight' would be used for gblinear.
#' @param rel_to_first whether importance values should be represented as relative to the highest ranked feature.
#' See Details.
#' @param left_margin (base R barplot) allows to adjust the left margin size to fit feature names.
#' When it is NULL, the existing \code{par('mar')} is used.
#' @param cex (base R barplot) passed as \code{cex.names} parameter to \code{barplot}.
#' @param plot (base R barplot) whether a barplot should be produced.
#' If FALSE, only a data.table is returned.
#' @param n_clusters (ggplot only) a \code{numeric} vector containing the min and the max range
#' of the possible number of clusters of bars.
#' @param ... other parameters passed to \code{barplot} (except horiz, border, cex.names, names.arg, and las).
#'
#' @details
#' The purpose of this function is to easily represent the importance of each feature of a model.
#' The function returns a ggplot graph, therefore each of its characteristic can be overriden (to customize it).
#' In particular you may want to override the title of the graph. To do so, add \code{+ ggtitle("A GRAPH NAME")} next to the value returned by this function.
#' The graph represents each feature as a horizontal bar of length proportional to the importance of a feature.
#' Features are shown ranked in a decreasing importance order.
#' It works for importances from both \code{gblinear} and \code{gbtree} models.
#'
#' When \code{rel_to_first = FALSE}, the values would be plotted as they were in \code{importance_matrix}.
#' For gbtree model, that would mean being normalized to the total of 1
#' ("what is feature's importance contribution relative to the whole model?").
#' For linear models, \code{rel_to_first = FALSE} would show actual values of the coefficients.
#' Setting \code{rel_to_first = TRUE} allows to see the picture from the perspective of
#' "what is feature's importance contribution relative to the most important feature?"
#'
#' The ggplot-backend method also performs 1-D custering of the importance values,
#' with bar colors coresponding to different clusters that have somewhat similar importance values.
#'
#' @return
#' The \code{xgb.plot.importance} function creates a \code{barplot} (when \code{plot=TRUE})
#' and silently returns a processed data.table with \code{n_top} features sorted by importance.
#'
#' The \code{xgb.ggplot.importance} function returns a ggplot graph which could be customized afterwards.
#' E.g., to change the title of the graph, add \code{+ ggtitle("A GRAPH NAME")} to the result.
#'
#' @seealso
#' \code{\link[graphics]{barplot}}.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.train)
#'
#' #Both dataset are list with two items, a sparse matrix and labels
#' #(labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#'
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
#'
#' importance_matrix <- xgb.importance(colnames(agaricus.train$data), model = bst)
#' xgb.plot.importance(importance_matrix)
#'
#' xgb.plot.importance(importance_matrix, rel_to_first = TRUE, xlab = "Relative importance")
#'
#' (gg <- xgb.ggplot.importance(importance_matrix, measure = "Frequency", rel_to_first = TRUE))
#' gg + ggplot2::ylab("Frequency")
#'
#' @rdname xgb.plot.importance
#' @export
xgb.plot.importance <-
function(importance_matrix = NULL, n_clusters = c(1:10), ...) {
check.deprecation(...)
if (!"data.table" %in% class(importance_matrix)) {
stop("importance_matrix: Should be a data.table.")
}
if (!requireNamespace("ggplot2", quietly = TRUE)) {
stop("ggplot2 package is required for plotting the importance", call. = FALSE)
}
if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) {
stop("Ckmeans.1d.dp package is required for plotting the importance", call. = FALSE)
}
if(isTRUE(all.equal(colnames(importance_matrix), c("Feature", "Gain", "Cover", "Frequency")))){
y.axe.name <- "Gain"
} else if(isTRUE(all.equal(colnames(importance_matrix), c("Feature", "Weight")))){
y.axe.name <- "Weight"
} else {
stop("Importance matrix is not correct (column names issue)")
}
# To avoid issues in clustering when co-occurences are used
importance_matrix <-
importance_matrix[, .(Gain.or.Weight = sum(get(y.axe.name))), by = Feature]
clusters <-
suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain.or.Weight], n_clusters))
importance_matrix[,"Cluster":= clusters$cluster %>% as.character]
plot <-
ggplot2::ggplot(
importance_matrix, ggplot2::aes(
x = stats::reorder(Feature, Gain.or.Weight), y = Gain.or.Weight, width = 0.05
), environment = environment()
) + ggplot2::geom_bar(ggplot2::aes(fill = Cluster), stat = "identity", position =
"identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab(y.axe.name) + ggplot2::ggtitle("Feature importance") + ggplot2::theme(
plot.title = ggplot2::element_text(lineheight = .9, face = "bold"), panel.grid.major.y = ggplot2::element_blank()
)
return(plot)
xgb.plot.importance <- function(importance_matrix = NULL, top_n = NULL, measure = NULL,
rel_to_first = FALSE, left_margin = 10, cex = NULL, plot = TRUE, ...) {
check.deprecation(...)
if (!"data.table" %in% class(importance_matrix)) {
stop("importance_matrix: Should be a data.table.")
}
imp_names <- colnames(importance_matrix)
if (is.null(measure)) {
if (all(c("Feature", "Gain") %in% imp_names)) {
measure <- "Gain"
} else if (all(c("Feature", "Weight") %in% imp_names)) {
measure <- "Weight"
} else {
stop("Importance matrix column names are not as expected!")
}
} else {
if (!measure %in% imp_names)
stop("Invalid `measure`")
if (!"Feature" %in% imp_names)
stop("Importance matrix column names are not as expected!")
}
# also aggregate, just in case when the values were not yet summed up by feature
importance_matrix <- importance_matrix[, Importance := sum(get(measure)), by = Feature]
# make sure it's ordered
importance_matrix <- importance_matrix[order(-abs(Importance))]
if (!is.null(top_n)) {
top_n <- min(top_n, nrow(importance_matrix))
importance_matrix <- head(importance_matrix, top_n)
}
if (rel_to_first) {
importance_matrix[, Importance := Importance/max(abs(Importance))]
}
if (is.null(cex)) {
cex <- 2.5/log2(1 + nrow(importance_matrix))
}
if (plot) {
op <- par(no.readonly = TRUE)
mar <- op$mar
if (!is.null(left_margin))
mar[2] <- left_margin
par(mar = mar)
# reverse the order of rows to have the highest ranked at the top
importance_matrix[nrow(importance_matrix):1,
barplot(Importance, horiz=TRUE, border=NA, cex.names=cex,
names.arg=Feature, las=1, ...)]
grid(NULL, NA)
# redraw over the grid
importance_matrix[nrow(importance_matrix):1,
barplot(Importance, horiz=TRUE, border=NA, add=TRUE)]
par(op)
}
invisible(importance_matrix)
}
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(
c(
"Feature", "Gain.or.Weight", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text", "Gain.or.Weight"
)
)
globalVariables(c("Feature", "Importance"))

View File

@ -49,7 +49,7 @@ xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5,
# first number of the path represents the tree, then the following numbers are related to the path to follow
# root init
root.nodes <- tree.matrix[str_detect(ID, "\\d+-0"), ID]
root.nodes <- tree.matrix[stri_detect_regex(ID, "\\d+-0"), ID]
tree.matrix[ID %in% root.nodes, abs.node.position:=root.nodes]
precedent.nodes <- root.nodes
@ -70,7 +70,7 @@ xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5,
remove.tree <- . %>% str_replace(pattern = "^\\d+-", replacement = "")
remove.tree <- . %>% stri_replace_first_regex(pattern = "^\\d+-", replacement = "")
tree.matrix[,`:=`(abs.node.position=remove.tree(abs.node.position), Yes=remove.tree(Yes), No=remove.tree(No))]

View File

@ -87,14 +87,15 @@ NULL
#' @importFrom data.table as.data.table
#' @importFrom data.table :=
#' @importFrom data.table rbindlist
#' @importFrom data.table setkey
#' @importFrom data.table setkeyv
#' @importFrom data.table setnames
#' @importFrom magrittr %>%
#' @importFrom stringr str_detect
#' @importFrom stringr str_extract
#' @importFrom stringr str_match
#' @importFrom stringr str_replace
#' @importFrom stringr str_replace_all
#' @importFrom stringr str_split
#' @importFrom stringi stri_detect_regex
#' @importFrom stringi stri_match_first_regex
#' @importFrom stringi stri_replace_first_regex
#' @importFrom stringi stri_replace_all_regex
#' @importFrom stringi stri_split_regex
#' @importFrom utils object.size str tail
#' @importFrom stats predict
#'

View File

@ -1,15 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.deepness.R
\name{edge.parser}
\alias{edge.parser}
\title{Parse the graph to extract vector of edges}
\usage{
edge.parser(element)
}
\arguments{
\item{element}{igraph object containing the path from the root to the leaf.}
}
\description{
Parse the graph to extract vector of edges
}

View File

@ -1,15 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.deepness.R
\name{get.paths.to.leaf}
\alias{get.paths.to.leaf}
\title{Extract path from root to leaf from data.table}
\usage{
get.paths.to.leaf(dt_tree)
}
\arguments{
\item{dt_tree}{data.table containing the nodes and edges of the trees}
}
\description{
Extract path from root to leaf from data.table
}

View File

@ -1,17 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.deepness.R
\name{multiplot}
\alias{multiplot}
\title{Plot multiple graphs at the same time}
\usage{
multiplot(..., cols = 1)
}
\arguments{
\item{...}{the plots}
\item{cols}{number of columns}
}
\description{
Plot multiple graph aligned by rows and columns.
}

View File

@ -1,46 +1,74 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.deepness.R
\name{xgb.plot.deepness}
% Please edit documentation in R/xgb.ggplot.R, R/xgb.plot.deepness.R
\name{xgb.ggplot.deepness}
\alias{xgb.ggplot.deepness}
\alias{xgb.plot.deepness}
\title{Plot model trees deepness}
\usage{
xgb.plot.deepness(model = NULL)
xgb.ggplot.deepness(model = NULL, which = c("2x1", "max.depth", "med.depth",
"med.weight"))
xgb.plot.deepness(model = NULL, which = c("2x1", "max.depth", "med.depth",
"med.weight"), plot = TRUE, ...)
}
\arguments{
\item{model}{dump generated by the \code{xgb.train} function.}
\item{model}{either an \code{xgb.Booster} model generated by the \code{xgb.train} function
or a data.table result of the \code{xgb.model.dt.tree} function.}
\item{which}{which distribution to plot (see details).}
\item{plot}{(base R barplot) whether a barplot should be produced.
If FALSE, only a data.table is returned.}
\item{...}{other parameters passed to \code{barplot} or \code{plot}.}
}
\value{
Two graphs showing the distribution of the model deepness.
Other than producing plots (when \code{plot=TRUE}), the \code{xgb.plot.deepness} function
silently returns a processed data.table where each row corresponds to a terminal leaf in a tree model,
and contains information about leaf's depth, cover, and weight (which is used in calculating predictions).
The \code{xgb.ggplot.deepness} silently returns either a list of two ggplot graphs when \code{which="2x1"}
or a single ggplot graph for the other \code{which} options.
}
\description{
Generate a graph to plot the distribution of deepness among trees.
Visualizes distributions related to depth of tree leafs.
\code{xgb.plot.deepness} uses base R graphics, while \code{xgb.ggplot.deepness} uses the ggplot backend.
}
\details{
Display both the number of \code{leaf} and the distribution of \code{weighted observations}
by tree deepness level.
The purpose of this function is to help the user to find the best trade-off to set
the \code{max_depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.
See \link{xgb.train} for more information about these parameters.
The graph is made of two parts:
When \code{which="2x1"}, two distributions with respect to the leaf depth
are plotted on top of each other:
\itemize{
\item Count: number of leaf per level of deepness;
\item Weighted cover: noramlized weighted cover per leaf (weighted number of instances).
\item the distribution of the number of leafs in a tree model at a certain depth;
\item the distribution of average weighted number of observations ("cover")
ending up in leafs at certain depth.
}
Those could be helpful in determining sensible ranges of the \code{max_depth}
and \code{min_child_weight} parameters.
This function is inspired by the blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
When \code{which="max.depth"} or \code{which="med.depth"}, plots of either maximum or median depth
per tree with respect to tree number are created. And \code{which="med.weight"} allows to see how
a tree's median absolute leaf weight changes through the iterations.
This function was inspired by the blog post
\url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}.
}
\examples{
data(agaricus.train, package='xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
min_child_weight = 50)
eta = 0.1, nthread = 2, nrounds = 50, objective = "binary:logistic",
subsample = 0.5, min_child_weight = 2)
xgb.plot.deepness(model = bst)
xgb.plot.deepness(bst)
xgb.ggplot.deepness(bst)
xgb.plot.deepness(bst, which='max.depth', pch=16, col=rgb(0,0,1,0.3), cex=2)
xgb.plot.deepness(bst, which='med.weight', pch=16, col=rgb(0,0,1,0.3), cex=2)
}
\seealso{
\code{\link{xgb.train}}, \code{\link{xgb.model.dt.tree}}.
}

View File

@ -1,41 +1,82 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.importance.R
\name{xgb.plot.importance}
% Please edit documentation in R/xgb.ggplot.R, R/xgb.plot.importance.R
\name{xgb.ggplot.importance}
\alias{xgb.ggplot.importance}
\alias{xgb.plot.importance}
\title{Plot feature importance bar graph}
\title{Plot feature importance as a bar graph}
\usage{
xgb.plot.importance(importance_matrix = NULL, n_clusters = c(1:10), ...)
xgb.ggplot.importance(importance_matrix = NULL, top_n = NULL,
measure = NULL, rel_to_first = FALSE, n_clusters = c(1:10), ...)
xgb.plot.importance(importance_matrix = NULL, top_n = NULL,
measure = NULL, rel_to_first = FALSE, left_margin = 10, cex = NULL,
plot = TRUE, ...)
}
\arguments{
\item{importance_matrix}{a \code{data.table} returned by the \code{xgb.importance} function.}
\item{importance_matrix}{a \code{data.table} returned by \code{\link{xgb.importance}}.}
\item{n_clusters}{a \code{numeric} vector containing the min and the max range of the possible number of clusters of bars.}
\item{top_n}{maximal number of top features to include into the plot.}
\item{...}{currently not used}
\item{measure}{the name of importance measure to plot.
When \code{NULL}, 'Gain' would be used for trees and 'Weight' would be used for gblinear.}
\item{rel_to_first}{whether importance values should be represented as relative to the highest ranked feature.
See Details.}
\item{n_clusters}{(ggplot only) a \code{numeric} vector containing the min and the max range
of the possible number of clusters of bars.}
\item{...}{other parameters passed to \code{barplot} (except horiz, border, cex.names, names.arg, and las).}
\item{left_margin}{(base R barplot) allows to adjust the left margin size to fit feature names.
When it is NULL, the existing \code{par('mar')} is used.}
\item{cex}{(base R barplot) passed as \code{cex.names} parameter to \code{barplot}.}
\item{plot}{(base R barplot) whether a barplot should be produced.
If FALSE, only a data.table is returned.}
}
\value{
A \code{ggplot2} bar graph representing each feature by a horizontal bar. Longer is the bar, more important is the feature. Features are classified by importance and clustered by importance. The group is represented through the color of the bar.
The \code{xgb.plot.importance} function creates a \code{barplot} (when \code{plot=TRUE})
and silently returns a processed data.table with \code{n_top} features sorted by importance.
The \code{xgb.ggplot.importance} function returns a ggplot graph which could be customized afterwards.
E.g., to change the title of the graph, add \code{+ ggtitle("A GRAPH NAME")} to the result.
}
\description{
Read a data.table containing feature importance details and plot it (for both GLM and Trees).
Represents previously calculated feature importance as a bar graph.
\code{xgb.plot.importance} uses base R graphics, while \code{xgb.ggplot.importance} uses the ggplot backend.
}
\details{
The purpose of this function is to easily represent the importance of each feature of a model.
The function returns a ggplot graph, therefore each of its characteristic can be overriden (to customize it).
In particular you may want to override the title of the graph. To do so, add \code{+ ggtitle("A GRAPH NAME")} next to the value returned by this function.
The graph represents each feature as a horizontal bar of length proportional to the importance of a feature.
Features are shown ranked in a decreasing importance order.
It works for importances from both \code{gblinear} and \code{gbtree} models.
When \code{rel_to_first = FALSE}, the values would be plotted as they were in \code{importance_matrix}.
For gbtree model, that would mean being normalized to the total of 1
("what is feature's importance contribution relative to the whole model?").
For linear models, \code{rel_to_first = FALSE} would show actual values of the coefficients.
Setting \code{rel_to_first = TRUE} allows to see the picture from the perspective of
"what is feature's importance contribution relative to the most important feature?"
The ggplot-backend method also performs 1-D custering of the importance values,
with bar colors coresponding to different clusters that have somewhat similar importance values.
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.train)
#Both dataset are list with two items, a sparse matrix and labels
#(labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
importance_matrix <- xgb.importance(colnames(agaricus.train$data), model = bst)
xgb.plot.importance(importance_matrix)
xgb.plot.importance(importance_matrix, rel_to_first = TRUE, xlab = "Relative importance")
(gg <- xgb.ggplot.importance(importance_matrix, measure = "Frequency", rel_to_first = TRUE))
gg + ggplot2::ylab("Frequency")
}
\seealso{
\code{\link[graphics]{barplot}}.
}

View File

@ -264,7 +264,7 @@ test_that("prediction in early-stopping xgb.cv works", {
set.seed(1)
expect_output(
cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.1, nrounds = 20,
early_stopping_rounds = 5, maximize = FALSE, prediction=TRUE)
early_stopping_rounds = 5, maximize = FALSE, prediction = TRUE)
, "Stopping. Best iteration")
expect_false(is.null(cv$best_iteration))
@ -279,3 +279,17 @@ test_that("prediction in early-stopping xgb.cv works", {
err_log_last <- cv$evaluation_log[cv$niter, test_error_mean]
expect_gt(abs(err_pred - err_log_last), 1e-4)
})
test_that("prediction in xgb.cv for softprob works", {
lb <- as.numeric(iris$Species) - 1
set.seed(11)
expect_warning(
cv <- xgb.cv(data = as.matrix(iris[, -5]), label = lb, nfold = 4,
eta = 0.5, nrounds = 5, max_depth = 3, nthread = 2,
subsample = 0.8, gamma = 2,
prediction = TRUE, objective = "multi:softprob", num_class = 3)
, NA)
expect_false(is.null(cv$pred))
expect_equal(dim(cv$pred), c(nrow(iris), 3))
expect_lt(diff(range(rowSums(cv$pred))), 1e-6)
})

View File

@ -24,12 +24,22 @@ feature.names <- colnames(sparse_matrix)
test_that("xgb.dump works", {
expect_length(xgb.dump(bst.Tree), 172)
expect_length(xgb.dump(bst.GLM), 14)
expect_true(xgb.dump(bst.Tree, 'xgb.model.dump', with_stats = T))
expect_true(file.exists('xgb.model.dump'))
expect_gt(file.size('xgb.model.dump'), 8000)
})
test_that("xgb.dump works for gblinear", {
expect_length(xgb.dump(bst.GLM), 14)
# also make sure that it works properly for a sparse model where some coefficients
# are 0 from setting large L1 regularization:
bst.GLM.sp <- xgboost(data = sparse_matrix, label = label, eta = 1, nthread = 2, nrounds = 1,
alpha=2, objective = "binary:logistic", booster = "gblinear")
d.sp <- xgb.dump(bst.GLM.sp)
expect_length(d.sp, 14)
expect_gt(sum(d.sp == "0"), 0)
})
test_that("xgb-attribute functionality", {
val <- "my attribute value"
list.val <- list(my_attr=val, a=123, b='ok')
@ -76,7 +86,9 @@ test_that("xgb.importance works with and without feature names", {
expect_equal(dim(importance.Tree), c(7, 4))
expect_equal(colnames(importance.Tree), c("Feature", "Gain", "Cover", "Frequency"))
expect_output(str(xgb.importance(model = bst.Tree)), 'Feature.*\\"3\\"')
xgb.plot.importance(importance_matrix = importance.Tree)
imp2plot <- xgb.plot.importance(importance_matrix = importance.Tree)
expect_equal(colnames(imp2plot), c("Feature", "Gain", "Cover", "Frequency", "Importance"))
xgb.ggplot.importance(importance_matrix = importance.Tree)
})
test_that("xgb.importance works with GLM model", {
@ -84,7 +96,9 @@ test_that("xgb.importance works with GLM model", {
expect_equal(dim(importance.GLM), c(10, 2))
expect_equal(colnames(importance.GLM), c("Feature", "Weight"))
xgb.importance(model = bst.GLM)
xgb.plot.importance(importance.GLM)
imp2plot <- xgb.plot.importance(importance.GLM)
expect_equal(colnames(imp2plot), c("Feature", "Weight", "Importance"))
xgb.ggplot.importance(importance.GLM)
})
test_that("xgb.plot.tree works with and without feature names", {
@ -98,7 +112,10 @@ test_that("xgb.plot.multi.trees works with and without feature names", {
})
test_that("xgb.plot.deepness works", {
xgb.plot.deepness(model = bst.Tree)
d2p <- xgb.plot.deepness(model = bst.Tree)
expect_equal(colnames(d2p), c("ID", "Tree", "Depth", "Cover", "Weight"))
xgb.plot.deepness(model = bst.Tree, which = "med.depth")
xgb.ggplot.deepness(model = bst.Tree)
})
test_that("check.deprecation works", {

View File

@ -35,6 +35,7 @@
#if DMLC_ENABLE_STD_THREAD
#include "../src/data/sparse_page_source.cc"
#include "../src/data/sparse_page_dmatrix.cc"
#include "../src/data/sparse_page_writer.cc"
#endif
// tress