[R-package] GPL2 dependency reduction and some fixes (#1401)
* [R] do not remove zero coefficients from gblinear dump * [R] switch from stringr to stringi * fix #1399 * [R] separate ggplot backend, add base r graphics, cleanup, more plots, tests * add missing include in amalgamation - fixes building R package in linux * add forgotten file * [R] fix DESCRIPTION * [R] fix travis check issue and some cleanup
This commit is contained in:
committed by
Tong He
parent
f6423056c0
commit
d5c143367d
@@ -1,148 +1,142 @@
|
||||
#' Plot multiple graphs at the same time
|
||||
#'
|
||||
#' Plot multiple graph aligned by rows and columns.
|
||||
#'
|
||||
#' @param ... the plots
|
||||
#' @param cols number of columns
|
||||
#' @return NULL
|
||||
multiplot <- function(..., cols = 1) {
|
||||
plots <- list(...)
|
||||
numPlots = length(plots)
|
||||
|
||||
layout <- matrix(seq(1, cols * ceiling(numPlots / cols)),
|
||||
ncol = cols, nrow = ceiling(numPlots / cols))
|
||||
|
||||
if (numPlots == 1) {
|
||||
print(plots[[1]])
|
||||
} else {
|
||||
grid::grid.newpage()
|
||||
grid::pushViewport(grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout))))
|
||||
for (i in 1:numPlots) {
|
||||
# Get the i,j matrix positions of the regions that contain this subplot
|
||||
matchidx <- as.data.table(which(layout == i, arr.ind = TRUE))
|
||||
|
||||
print(
|
||||
plots[[i]], vp = grid::viewport(
|
||||
layout.pos.row = matchidx$row,
|
||||
layout.pos.col = matchidx$col
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#' Parse the graph to extract vector of edges
|
||||
#' @param element igraph object containing the path from the root to the leaf.
|
||||
edge.parser <- function(element) {
|
||||
edges.vector <- igraph::as_ids(element)
|
||||
t <- tail(edges.vector, n = 1)
|
||||
l <- length(edges.vector)
|
||||
list(t,l)
|
||||
}
|
||||
|
||||
#' Extract path from root to leaf from data.table
|
||||
#' @param dt_tree data.table containing the nodes and edges of the trees
|
||||
get.paths.to.leaf <- function(dt_tree) {
|
||||
dt.not.leaf.edges <-
|
||||
dt_tree[Feature != "Leaf",.(ID, Yes, Tree)] %>% list(dt_tree[Feature != "Leaf",.(ID, No, Tree)]) %>% rbindlist(use.names = F)
|
||||
|
||||
trees <- dt_tree[,unique(Tree)]
|
||||
|
||||
paths <- list()
|
||||
for (tree in trees) {
|
||||
graph <-
|
||||
igraph::graph_from_data_frame(dt.not.leaf.edges[Tree == tree])
|
||||
paths.tmp <-
|
||||
igraph::shortest_paths(graph, from = paste0(tree, "-0"), to = dt_tree[Tree == tree &
|
||||
Feature == "Leaf", c(ID)])
|
||||
paths <- c(paths, paths.tmp$vpath)
|
||||
}
|
||||
paths
|
||||
}
|
||||
|
||||
#' Plot model trees deepness
|
||||
#'
|
||||
#' Generate a graph to plot the distribution of deepness among trees.
|
||||
#'
|
||||
#' @param model dump generated by the \code{xgb.train} function.
|
||||
#'
|
||||
#' @return Two graphs showing the distribution of the model deepness.
|
||||
#'
|
||||
#' Visualizes distributions related to depth of tree leafs.
|
||||
#' \code{xgb.plot.deepness} uses base R graphics, while \code{xgb.ggplot.deepness} uses the ggplot backend.
|
||||
#'
|
||||
#' @param model either an \code{xgb.Booster} model generated by the \code{xgb.train} function
|
||||
#' or a data.table result of the \code{xgb.model.dt.tree} function.
|
||||
#' @param plot (base R barplot) whether a barplot should be produced.
|
||||
#' If FALSE, only a data.table is returned.
|
||||
#' @param which which distribution to plot (see details).
|
||||
#' @param ... other parameters passed to \code{barplot} or \code{plot}.
|
||||
#'
|
||||
#' @details
|
||||
#' Display both the number of \code{leaf} and the distribution of \code{weighted observations}
|
||||
#' by tree deepness level.
|
||||
#'
|
||||
#' The purpose of this function is to help the user to find the best trade-off to set
|
||||
#' the \code{max_depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.
|
||||
#'
|
||||
#' See \link{xgb.train} for more information about these parameters.
|
||||
#'
|
||||
#' The graph is made of two parts:
|
||||
#'
|
||||
#' When \code{which="2x1"}, two distributions with respect to the leaf depth
|
||||
#' are plotted on top of each other:
|
||||
#' \itemize{
|
||||
#' \item Count: number of leaf per level of deepness;
|
||||
#' \item Weighted cover: noramlized weighted cover per leaf (weighted number of instances).
|
||||
#' \item the distribution of the number of leafs in a tree model at a certain depth;
|
||||
#' \item the distribution of average weighted number of observations ("cover")
|
||||
#' ending up in leafs at certain depth.
|
||||
#' }
|
||||
#' Those could be helpful in determining sensible ranges of the \code{max_depth}
|
||||
#' and \code{min_child_weight} parameters.
|
||||
#'
|
||||
#' When \code{which="max.depth"} or \code{which="med.depth"}, plots of either maximum or median depth
|
||||
#' per tree with respect to tree number are created. And \code{which="med.weight"} allows to see how
|
||||
#' a tree's median absolute leaf weight changes through the iterations.
|
||||
#'
|
||||
#' This function is inspired by the blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
|
||||
#' This function was inspired by the blog post
|
||||
#' \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}.
|
||||
#'
|
||||
#' @return
|
||||
#'
|
||||
#' Other than producing plots (when \code{plot=TRUE}), the \code{xgb.plot.deepness} function
|
||||
#' silently returns a processed data.table where each row corresponds to a terminal leaf in a tree model,
|
||||
#' and contains information about leaf's depth, cover, and weight (which is used in calculating predictions).
|
||||
#'
|
||||
#' The \code{xgb.ggplot.deepness} silently returns either a list of two ggplot graphs when \code{which="2x1"}
|
||||
#' or a single ggplot graph for the other \code{which} options.
|
||||
#'
|
||||
#' @seealso
|
||||
#'
|
||||
#' \code{\link{xgb.train}}, \code{\link{xgb.model.dt.tree}}.
|
||||
#'
|
||||
#' @examples
|
||||
#'
|
||||
#' data(agaricus.train, package='xgboost')
|
||||
#'
|
||||
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
|
||||
#' eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
|
||||
#' min_child_weight = 50)
|
||||
#' eta = 0.1, nthread = 2, nrounds = 50, objective = "binary:logistic",
|
||||
#' subsample = 0.5, min_child_weight = 2)
|
||||
#'
|
||||
#' xgb.plot.deepness(model = bst)
|
||||
#' xgb.plot.deepness(bst)
|
||||
#' xgb.ggplot.deepness(bst)
|
||||
#'
|
||||
#' xgb.plot.deepness(bst, which='max.depth', pch=16, col=rgb(0,0,1,0.3), cex=2)
|
||||
#'
|
||||
#' xgb.plot.deepness(bst, which='med.weight', pch=16, col=rgb(0,0,1,0.3), cex=2)
|
||||
#'
|
||||
#' @rdname xgb.plot.deepness
|
||||
#' @export
|
||||
xgb.plot.deepness <- function(model = NULL) {
|
||||
if (!requireNamespace("ggplot2", quietly = TRUE)) {
|
||||
stop("ggplot2 package is required for plotting the graph deepness.",
|
||||
call. = FALSE)
|
||||
xgb.plot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.depth", "med.weight"),
|
||||
plot = TRUE, ...) {
|
||||
|
||||
if (!(class(model) == "xgb.Booster" || is.data.table(model)))
|
||||
stop("model: Has to be either an xgb.Booster model generaged by the xgb.train function\n",
|
||||
"or a data.table result of the xgb.importance function")
|
||||
|
||||
if (!requireNamespace("igraph", quietly = TRUE))
|
||||
stop("igraph package is required for plotting the graph deepness.", call. = FALSE)
|
||||
|
||||
which <- match.arg(which)
|
||||
|
||||
dt_tree <- model
|
||||
if (class(model) == "xgb.Booster")
|
||||
dt_tree <- xgb.model.dt.tree(model = model)
|
||||
|
||||
if (!all(c("Feature", "Tree", "ID", "Yes", "No", "Cover") %in% colnames(dt_tree)))
|
||||
stop("Model tree columns are not as expected!\n",
|
||||
" Note that this function works only for tree models.")
|
||||
|
||||
dt_depths <- merge(get.leaf.depth(dt_tree), dt_tree[, .(ID, Cover, Weight=Quality)], by = "ID")
|
||||
setkeyv(dt_depths, c("Tree", "ID"))
|
||||
# count by depth levels, and also calculate average cover at a depth
|
||||
dt_summaries <- dt_depths[, .(.N, Cover = mean(Cover)), Depth]
|
||||
setkey(dt_summaries, "Depth")
|
||||
|
||||
if (plot) {
|
||||
if (which == "2x1") {
|
||||
op <- par(no.readonly = TRUE)
|
||||
par(mfrow=c(2,1),
|
||||
oma = c(3,1,3,1) + 0.1,
|
||||
mar = c(1,4,1,0) + 0.1)
|
||||
|
||||
dt_summaries[, barplot(N, border=NA, ylab = 'Number of leafs', ...)]
|
||||
|
||||
dt_summaries[, barplot(Cover, border=NA, ylab = "Weighted cover", names.arg=Depth, ...)]
|
||||
|
||||
title("Model complexity", xlab = "Leaf depth", outer = TRUE, line = 1)
|
||||
par(op)
|
||||
} else if (which == "max.depth") {
|
||||
dt_depths[, max(Depth), Tree][
|
||||
, plot(jitter(V1, amount = 0.1) ~ Tree, ylab = 'Max tree leaf depth', xlab = "tree #", ...)]
|
||||
} else if (which == "med.depth") {
|
||||
dt_depths[, median(as.numeric(Depth)), Tree][
|
||||
, plot(jitter(V1, amount = 0.1) ~ Tree, ylab = 'Median tree leaf depth', xlab = "tree #", ...)]
|
||||
} else if (which == "med.weight") {
|
||||
dt_depths[, median(abs(Weight)), Tree][
|
||||
, plot(V1 ~ Tree, ylab = 'Median absolute leaf weight', xlab = "tree #", ...)]
|
||||
}
|
||||
}
|
||||
|
||||
if (!requireNamespace("igraph", quietly = TRUE)) {
|
||||
stop("igraph package is required for plotting the graph deepness.",
|
||||
call. = FALSE)
|
||||
}
|
||||
|
||||
if (!requireNamespace("grid", quietly = TRUE)) {
|
||||
stop("grid package is required for plotting the graph deepness.",
|
||||
call. = FALSE)
|
||||
}
|
||||
|
||||
if (class(model) != "xgb.Booster") {
|
||||
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
|
||||
}
|
||||
|
||||
dt.tree <- xgb.model.dt.tree(model = model)
|
||||
|
||||
dt.edge.elements <- data.table()
|
||||
paths <- get.paths.to.leaf(dt.tree)
|
||||
|
||||
dt.edge.elements <-
|
||||
lapply(paths, edge.parser) %>% rbindlist %>% setnames(c("last.edge", "size")) %>%
|
||||
merge(dt.tree, by.x = "last.edge", by.y = "ID") %>% rbind(dt.edge.elements)
|
||||
|
||||
dt.edge.summuize <-
|
||||
dt.edge.elements[, .(.N, Cover = sum(Cover)), size][,Cover:= Cover / sum(Cover)]
|
||||
|
||||
p1 <-
|
||||
ggplot2::ggplot(dt.edge.summuize) + ggplot2::geom_line(ggplot2::aes(x = size, y = N, group = 1)) +
|
||||
ggplot2::xlab("") + ggplot2::ylab("Count") + ggplot2::ggtitle("Model complexity") +
|
||||
ggplot2::theme(
|
||||
plot.title = ggplot2::element_text(lineheight = 0.9, face = "bold"),
|
||||
panel.grid.major.y = ggplot2::element_blank(),
|
||||
axis.ticks = ggplot2::element_blank(),
|
||||
axis.text.x = ggplot2::element_blank()
|
||||
)
|
||||
|
||||
p2 <-
|
||||
ggplot2::ggplot(dt.edge.summuize) + ggplot2::geom_line(ggplot2::aes(x =size, y = Cover, group = 1)) +
|
||||
ggplot2::xlab("From root to leaf path length") + ggplot2::ylab("Weighted cover")
|
||||
|
||||
multiplot(p1,p2,cols = 1)
|
||||
invisible(dt_depths)
|
||||
}
|
||||
|
||||
# Extract path depths from root to leaf
|
||||
# from data.table containing the nodes and edges of the trees.
|
||||
# internal utility function
|
||||
get.leaf.depth <- function(dt_tree) {
|
||||
# extract tree graph's edges
|
||||
dt_edges <- rbindlist(list(
|
||||
dt_tree[Feature != "Leaf", .(ID, To=Yes, Tree)],
|
||||
dt_tree[Feature != "Leaf", .(ID, To=No, Tree)]
|
||||
))
|
||||
# whether "To" is a leaf:
|
||||
dt_edges <-
|
||||
merge(dt_edges,
|
||||
dt_tree[Feature == "Leaf", .(ID, Leaf = TRUE)],
|
||||
all.x = TRUE, by.x = "To", by.y = "ID")
|
||||
dt_edges[is.na(Leaf), Leaf := FALSE]
|
||||
|
||||
dt_edges[, {
|
||||
graph <- igraph::graph_from_data_frame(.SD[,.(ID, To)])
|
||||
# min(ID) in a tree is a root node
|
||||
paths_tmp <- igraph::shortest_paths(graph, from = min(ID), to = To[Leaf == TRUE])
|
||||
# list of paths to each leaf in a tree
|
||||
paths <- lapply(paths_tmp$vpath, names)
|
||||
# combine into a resulting path lengths table for a tree
|
||||
data.table(Depth = sapply(paths, length), ID = To[Leaf == TRUE])
|
||||
}, by = Tree]
|
||||
}
|
||||
|
||||
# Avoid error messages during CRAN check.
|
||||
@@ -150,6 +144,6 @@ xgb.plot.deepness <- function(model = NULL) {
|
||||
# They are mainly column names inferred by Data.table...
|
||||
globalVariables(
|
||||
c(
|
||||
".N", "N", "size", "Feature", "Count", "ggplot", "aes", "geom_bar", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text", "ID", "Yes", "No", "Tree"
|
||||
".N", "N", "Depth", "Quality", "Cover", "Tree", "ID", "Yes", "No", "Feature"
|
||||
)
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user