[R-package] GPL2 dependency reduction and some fixes (#1401)
* [R] do not remove zero coefficients from gblinear dump * [R] switch from stringr to stringi * fix #1399 * [R] separate ggplot backend, add base r graphics, cleanup, more plots, tests * add missing include in amalgamation - fixes building R package in linux * add forgotten file * [R] fix DESCRIPTION * [R] fix travis check issue and some cleanup
This commit is contained in:
parent
f6423056c0
commit
d5c143367d
@ -35,5 +35,5 @@ Imports:
|
|||||||
methods,
|
methods,
|
||||||
data.table (>= 1.9.6),
|
data.table (>= 1.9.6),
|
||||||
magrittr (>= 1.5),
|
magrittr (>= 1.5),
|
||||||
stringr (>= 0.6.2)
|
stringi (>= 0.5.2)
|
||||||
RoxygenNote: 5.0.1
|
RoxygenNote: 5.0.1
|
||||||
|
|||||||
@ -31,6 +31,8 @@ export(xgb.attributes)
|
|||||||
export(xgb.create.features)
|
export(xgb.create.features)
|
||||||
export(xgb.cv)
|
export(xgb.cv)
|
||||||
export(xgb.dump)
|
export(xgb.dump)
|
||||||
|
export(xgb.ggplot.deepness)
|
||||||
|
export(xgb.ggplot.importance)
|
||||||
export(xgb.importance)
|
export(xgb.importance)
|
||||||
export(xgb.load)
|
export(xgb.load)
|
||||||
export(xgb.model.dt.tree)
|
export(xgb.model.dt.tree)
|
||||||
@ -53,15 +55,16 @@ importFrom(data.table,":=")
|
|||||||
importFrom(data.table,as.data.table)
|
importFrom(data.table,as.data.table)
|
||||||
importFrom(data.table,data.table)
|
importFrom(data.table,data.table)
|
||||||
importFrom(data.table,rbindlist)
|
importFrom(data.table,rbindlist)
|
||||||
|
importFrom(data.table,setkey)
|
||||||
|
importFrom(data.table,setkeyv)
|
||||||
importFrom(data.table,setnames)
|
importFrom(data.table,setnames)
|
||||||
importFrom(magrittr,"%>%")
|
importFrom(magrittr,"%>%")
|
||||||
importFrom(stats,predict)
|
importFrom(stats,predict)
|
||||||
importFrom(stringr,str_detect)
|
importFrom(stringi,stri_detect_regex)
|
||||||
importFrom(stringr,str_extract)
|
importFrom(stringi,stri_match_first_regex)
|
||||||
importFrom(stringr,str_match)
|
importFrom(stringi,stri_replace_all_regex)
|
||||||
importFrom(stringr,str_replace)
|
importFrom(stringi,stri_replace_first_regex)
|
||||||
importFrom(stringr,str_replace_all)
|
importFrom(stringi,stri_split_regex)
|
||||||
importFrom(stringr,str_split)
|
|
||||||
importFrom(utils,object.size)
|
importFrom(utils,object.size)
|
||||||
importFrom(utils,str)
|
importFrom(utils,str)
|
||||||
importFrom(utils,tail)
|
importFrom(utils,tail)
|
||||||
|
|||||||
@ -482,9 +482,12 @@ cb.cv.predict <- function(save_models = FALSE) {
|
|||||||
stop("'cb.cv.predict' callback requires 'basket' and 'bst_folds' lists in its calling frame")
|
stop("'cb.cv.predict' callback requires 'basket' and 'bst_folds' lists in its calling frame")
|
||||||
|
|
||||||
N <- nrow(env$data)
|
N <- nrow(env$data)
|
||||||
pred <- ifelse(env$num_class > 1,
|
pred <-
|
||||||
matrix(NA_real_, N, env$num_class),
|
if (env$num_class > 1) {
|
||||||
rep(NA_real_, N))
|
matrix(NA_real_, N, env$num_class)
|
||||||
|
} else {
|
||||||
|
rep(NA_real_, N)
|
||||||
|
}
|
||||||
|
|
||||||
ntreelimit <- NVL(env$basket$best_ntreelimit,
|
ntreelimit <- NVL(env$basket$best_ntreelimit,
|
||||||
env$end_iteration * env$num_parallel_tree)
|
env$end_iteration * env$num_parallel_tree)
|
||||||
|
|||||||
@ -146,7 +146,7 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
|
|||||||
if (is.null(feval)) {
|
if (is.null(feval)) {
|
||||||
msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist,
|
msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist,
|
||||||
as.list(evnames), PACKAGE = "xgboost")
|
as.list(evnames), PACKAGE = "xgboost")
|
||||||
msg <- str_split(msg, '(\\s+|:|\\s+)')[[1]][-1]
|
msg <- stri_split_regex(msg, '(\\s+|:|\\s+)')[[1]][-1]
|
||||||
res <- as.numeric(msg[c(FALSE,TRUE)]) # even indices are the values
|
res <- as.numeric(msg[c(FALSE,TRUE)]) # even indices are the values
|
||||||
names(res) <- msg[c(TRUE,FALSE)] # odds are the names
|
names(res) <- msg[c(TRUE,FALSE)] # odds are the names
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@ -45,10 +45,10 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, ..
|
|||||||
model_dump <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with_stats), PACKAGE = "xgboost")
|
model_dump <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with_stats), PACKAGE = "xgboost")
|
||||||
|
|
||||||
if (is.null(fname))
|
if (is.null(fname))
|
||||||
model_dump <- str_replace_all(model_dump, '\t', '')
|
model_dump <- stri_replace_all_regex(model_dump, '\t', '')
|
||||||
|
|
||||||
model_dump <- unlist(str_split(model_dump, '\n'))
|
model_dump <- unlist(stri_split_regex(model_dump, '\n'))
|
||||||
model_dump <- grep('(^$|^0$)', model_dump, invert = TRUE, value = TRUE)
|
model_dump <- grep('^\\s*$', model_dump, invert = TRUE, value = TRUE)
|
||||||
|
|
||||||
if (is.null(fname)) {
|
if (is.null(fname)) {
|
||||||
return(model_dump)
|
return(model_dump)
|
||||||
|
|||||||
135
R-package/R/xgb.ggplot.R
Normal file
135
R-package/R/xgb.ggplot.R
Normal file
@ -0,0 +1,135 @@
|
|||||||
|
# ggplot backend for the xgboost plotting facilities
|
||||||
|
|
||||||
|
|
||||||
|
#' @rdname xgb.plot.importance
|
||||||
|
#' @export
|
||||||
|
xgb.ggplot.importance <- function(importance_matrix = NULL, top_n = NULL, measure = NULL,
|
||||||
|
rel_to_first = FALSE, n_clusters = c(1:10), ...) {
|
||||||
|
|
||||||
|
importance_matrix <- xgb.plot.importance(importance_matrix, top_n = top_n, measure = measure,
|
||||||
|
rel_to_first = rel_to_first, plot = FALSE, ...)
|
||||||
|
if (!requireNamespace("ggplot2", quietly = TRUE)) {
|
||||||
|
stop("ggplot2 package is required", call. = FALSE)
|
||||||
|
}
|
||||||
|
if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) {
|
||||||
|
stop("Ckmeans.1d.dp package is required", call. = FALSE)
|
||||||
|
}
|
||||||
|
|
||||||
|
clusters <- suppressWarnings(
|
||||||
|
Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix$Importance, n_clusters)
|
||||||
|
)
|
||||||
|
importance_matrix[, Cluster := as.character(clusters$cluster)]
|
||||||
|
|
||||||
|
plot <-
|
||||||
|
ggplot2::ggplot(importance_matrix,
|
||||||
|
ggplot2::aes(x = factor(Feature, levels = rev(Feature)), y = Importance, width = 0.05),
|
||||||
|
environment = environment()) +
|
||||||
|
ggplot2::geom_bar(ggplot2::aes(fill = Cluster), stat = "identity", position = "identity") +
|
||||||
|
ggplot2::coord_flip() +
|
||||||
|
ggplot2::xlab("Features") +
|
||||||
|
ggplot2::ggtitle("Feature importance") +
|
||||||
|
ggplot2::theme(plot.title = ggplot2::element_text(lineheight = .9, face = "bold"),
|
||||||
|
panel.grid.major.y = ggplot2::element_blank())
|
||||||
|
return(plot)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#' @rdname xgb.plot.deepness
|
||||||
|
#' @export
|
||||||
|
xgb.ggplot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.depth", "med.weight")) {
|
||||||
|
|
||||||
|
if (!requireNamespace("ggplot2", quietly = TRUE))
|
||||||
|
stop("ggplot2 package is required for plotting the graph deepness.", call. = FALSE)
|
||||||
|
|
||||||
|
which <- match.arg(which)
|
||||||
|
|
||||||
|
dt_depths <- xgb.plot.deepness(model = model, plot = FALSE)
|
||||||
|
dt_summaries <- dt_depths[, .(.N, Cover = mean(Cover)), Depth]
|
||||||
|
setkey(dt_summaries, 'Depth')
|
||||||
|
|
||||||
|
if (which == "2x1") {
|
||||||
|
p1 <-
|
||||||
|
ggplot2::ggplot(dt_summaries) +
|
||||||
|
ggplot2::geom_bar(ggplot2::aes(x = Depth, y = N), stat = "Identity") +
|
||||||
|
ggplot2::xlab("") +
|
||||||
|
ggplot2::ylab("Number of leafs") +
|
||||||
|
ggplot2::ggtitle("Model complexity") +
|
||||||
|
ggplot2::theme(
|
||||||
|
plot.title = ggplot2::element_text(lineheight = 0.9, face = "bold"),
|
||||||
|
panel.grid.major.y = ggplot2::element_blank(),
|
||||||
|
axis.ticks = ggplot2::element_blank(),
|
||||||
|
axis.text.x = ggplot2::element_blank()
|
||||||
|
)
|
||||||
|
|
||||||
|
p2 <-
|
||||||
|
ggplot2::ggplot(dt_summaries) +
|
||||||
|
ggplot2::geom_bar(ggplot2::aes(x = Depth, y = Cover), stat = "Identity") +
|
||||||
|
ggplot2::xlab("Leaf depth") +
|
||||||
|
ggplot2::ylab("Weighted cover")
|
||||||
|
|
||||||
|
multiplot(p1, p2, cols = 1)
|
||||||
|
return(invisible(list(p1, p2)))
|
||||||
|
|
||||||
|
} else if (which == "max.depth") {
|
||||||
|
p <-
|
||||||
|
ggplot2::ggplot(dt_depths[, max(Depth), Tree]) +
|
||||||
|
ggplot2::geom_jitter(ggplot2::aes(x = Tree, y = V1),
|
||||||
|
height = 0.15, alpha=0.4, size=3, stroke=0) +
|
||||||
|
ggplot2::xlab("tree #") +
|
||||||
|
ggplot2::ylab("Max tree leaf depth")
|
||||||
|
return(p)
|
||||||
|
|
||||||
|
} else if (which == "med.depth") {
|
||||||
|
p <-
|
||||||
|
ggplot2::ggplot(dt_depths[, median(as.numeric(Depth)), Tree]) +
|
||||||
|
ggplot2::geom_jitter(ggplot2::aes(x = Tree, y = V1),
|
||||||
|
height = 0.15, alpha=0.4, size=3, stroke=0) +
|
||||||
|
ggplot2::xlab("tree #") +
|
||||||
|
ggplot2::ylab("Median tree leaf depth")
|
||||||
|
return(p)
|
||||||
|
|
||||||
|
} else if (which == "med.weight") {
|
||||||
|
p <-
|
||||||
|
ggplot2::ggplot(dt_depths[, median(abs(Weight)), Tree]) +
|
||||||
|
ggplot2::geom_point(ggplot2::aes(x = Tree, y = V1),
|
||||||
|
alpha=0.4, size=3, stroke=0) +
|
||||||
|
ggplot2::xlab("tree #") +
|
||||||
|
ggplot2::ylab("Median absolute leaf weight")
|
||||||
|
return(p)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Plot multiple ggplot graph aligned by rows and columns.
|
||||||
|
# ... the plots
|
||||||
|
# cols number of columns
|
||||||
|
# internal utility function
|
||||||
|
multiplot <- function(..., cols = 1) {
|
||||||
|
plots <- list(...)
|
||||||
|
num_plots = length(plots)
|
||||||
|
|
||||||
|
layout <- matrix(seq(1, cols * ceiling(num_plots / cols)),
|
||||||
|
ncol = cols, nrow = ceiling(num_plots / cols))
|
||||||
|
|
||||||
|
if (num_plots == 1) {
|
||||||
|
print(plots[[1]])
|
||||||
|
} else {
|
||||||
|
grid::grid.newpage()
|
||||||
|
grid::pushViewport(grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout))))
|
||||||
|
for (i in 1:num_plots) {
|
||||||
|
# Get the i,j matrix positions of the regions that contain this subplot
|
||||||
|
matchidx <- as.data.table(which(layout == i, arr.ind = TRUE))
|
||||||
|
|
||||||
|
print(
|
||||||
|
plots[[i]], vp = grid::viewport(
|
||||||
|
layout.pos.row = matchidx$row,
|
||||||
|
layout.pos.col = matchidx$col
|
||||||
|
)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
globalVariables(c(
|
||||||
|
"Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme",
|
||||||
|
"element_blank", "element_text"
|
||||||
|
))
|
||||||
@ -69,7 +69,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
|
|||||||
text <- xgb.dump(model = model, with_stats = T)
|
text <- xgb.dump(model = model, with_stats = T)
|
||||||
}
|
}
|
||||||
|
|
||||||
position <- which(!is.na(str_match(text, "booster")))
|
position <- which(!is.na(stri_match_first_regex(text, "booster")))
|
||||||
|
|
||||||
add.tree.id <- function(x, i) paste(i, x, sep = "-")
|
add.tree.id <- function(x, i) paste(i, x, sep = "-")
|
||||||
|
|
||||||
@ -82,16 +82,16 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
|
|||||||
n_first_tree <- min(max(td$Tree), n_first_tree)
|
n_first_tree <- min(max(td$Tree), n_first_tree)
|
||||||
td <- td[Tree <= n_first_tree & !grepl('^booster', t)]
|
td <- td[Tree <= n_first_tree & !grepl('^booster', t)]
|
||||||
|
|
||||||
td[, Node := str_match(t, "(\\d+):")[,2] %>% as.numeric ]
|
td[, Node := stri_match_first_regex(t, "(\\d+):")[,2] %>% as.numeric ]
|
||||||
td[, ID := add.tree.id(Node, Tree)]
|
td[, ID := add.tree.id(Node, Tree)]
|
||||||
td[, isLeaf := !is.na(str_match(t, "leaf"))]
|
td[, isLeaf := !is.na(stri_match_first_regex(t, "leaf"))]
|
||||||
|
|
||||||
# parse branch lines
|
# parse branch lines
|
||||||
td[isLeaf==FALSE, c("Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover") := {
|
td[isLeaf==FALSE, c("Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover") := {
|
||||||
rx <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
|
rx <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
|
||||||
"gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
|
"gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
|
||||||
# skip some indices with spurious capture groups from anynumber_regex
|
# skip some indices with spurious capture groups from anynumber_regex
|
||||||
xtr <- str_match(t, rx)[, c(2,3,5,6,7,8,10)]
|
xtr <- stri_match_first_regex(t, rx)[, c(2,3,5,6,7,8,10)]
|
||||||
xtr[, 3:5] <- add.tree.id(xtr[, 3:5], Tree)
|
xtr[, 3:5] <- add.tree.id(xtr[, 3:5], Tree)
|
||||||
lapply(1:ncol(xtr), function(i) xtr[,i])
|
lapply(1:ncol(xtr), function(i) xtr[,i])
|
||||||
}]
|
}]
|
||||||
@ -102,7 +102,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
|
|||||||
# parse leaf lines
|
# parse leaf lines
|
||||||
td[isLeaf==TRUE, c("Feature", "Quality", "Cover") := {
|
td[isLeaf==TRUE, c("Feature", "Quality", "Cover") := {
|
||||||
rx <- paste0("leaf=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
|
rx <- paste0("leaf=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
|
||||||
xtr <- str_match(t, rx)[, c(2,4)]
|
xtr <- stri_match_first_regex(t, rx)[, c(2,4)]
|
||||||
c("Leaf", lapply(1:ncol(xtr), function(i) xtr[,i]))
|
c("Leaf", lapply(1:ncol(xtr), function(i) xtr[,i]))
|
||||||
}]
|
}]
|
||||||
|
|
||||||
|
|||||||
@ -1,148 +1,142 @@
|
|||||||
#' Plot multiple graphs at the same time
|
|
||||||
#'
|
|
||||||
#' Plot multiple graph aligned by rows and columns.
|
|
||||||
#'
|
|
||||||
#' @param ... the plots
|
|
||||||
#' @param cols number of columns
|
|
||||||
#' @return NULL
|
|
||||||
multiplot <- function(..., cols = 1) {
|
|
||||||
plots <- list(...)
|
|
||||||
numPlots = length(plots)
|
|
||||||
|
|
||||||
layout <- matrix(seq(1, cols * ceiling(numPlots / cols)),
|
|
||||||
ncol = cols, nrow = ceiling(numPlots / cols))
|
|
||||||
|
|
||||||
if (numPlots == 1) {
|
|
||||||
print(plots[[1]])
|
|
||||||
} else {
|
|
||||||
grid::grid.newpage()
|
|
||||||
grid::pushViewport(grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout))))
|
|
||||||
for (i in 1:numPlots) {
|
|
||||||
# Get the i,j matrix positions of the regions that contain this subplot
|
|
||||||
matchidx <- as.data.table(which(layout == i, arr.ind = TRUE))
|
|
||||||
|
|
||||||
print(
|
|
||||||
plots[[i]], vp = grid::viewport(
|
|
||||||
layout.pos.row = matchidx$row,
|
|
||||||
layout.pos.col = matchidx$col
|
|
||||||
)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#' Parse the graph to extract vector of edges
|
|
||||||
#' @param element igraph object containing the path from the root to the leaf.
|
|
||||||
edge.parser <- function(element) {
|
|
||||||
edges.vector <- igraph::as_ids(element)
|
|
||||||
t <- tail(edges.vector, n = 1)
|
|
||||||
l <- length(edges.vector)
|
|
||||||
list(t,l)
|
|
||||||
}
|
|
||||||
|
|
||||||
#' Extract path from root to leaf from data.table
|
|
||||||
#' @param dt_tree data.table containing the nodes and edges of the trees
|
|
||||||
get.paths.to.leaf <- function(dt_tree) {
|
|
||||||
dt.not.leaf.edges <-
|
|
||||||
dt_tree[Feature != "Leaf",.(ID, Yes, Tree)] %>% list(dt_tree[Feature != "Leaf",.(ID, No, Tree)]) %>% rbindlist(use.names = F)
|
|
||||||
|
|
||||||
trees <- dt_tree[,unique(Tree)]
|
|
||||||
|
|
||||||
paths <- list()
|
|
||||||
for (tree in trees) {
|
|
||||||
graph <-
|
|
||||||
igraph::graph_from_data_frame(dt.not.leaf.edges[Tree == tree])
|
|
||||||
paths.tmp <-
|
|
||||||
igraph::shortest_paths(graph, from = paste0(tree, "-0"), to = dt_tree[Tree == tree &
|
|
||||||
Feature == "Leaf", c(ID)])
|
|
||||||
paths <- c(paths, paths.tmp$vpath)
|
|
||||||
}
|
|
||||||
paths
|
|
||||||
}
|
|
||||||
|
|
||||||
#' Plot model trees deepness
|
#' Plot model trees deepness
|
||||||
#'
|
#'
|
||||||
#' Generate a graph to plot the distribution of deepness among trees.
|
#' Visualizes distributions related to depth of tree leafs.
|
||||||
|
#' \code{xgb.plot.deepness} uses base R graphics, while \code{xgb.ggplot.deepness} uses the ggplot backend.
|
||||||
#'
|
#'
|
||||||
#' @param model dump generated by the \code{xgb.train} function.
|
#' @param model either an \code{xgb.Booster} model generated by the \code{xgb.train} function
|
||||||
#'
|
#' or a data.table result of the \code{xgb.model.dt.tree} function.
|
||||||
#' @return Two graphs showing the distribution of the model deepness.
|
#' @param plot (base R barplot) whether a barplot should be produced.
|
||||||
|
#' If FALSE, only a data.table is returned.
|
||||||
|
#' @param which which distribution to plot (see details).
|
||||||
|
#' @param ... other parameters passed to \code{barplot} or \code{plot}.
|
||||||
#'
|
#'
|
||||||
#' @details
|
#' @details
|
||||||
#' Display both the number of \code{leaf} and the distribution of \code{weighted observations}
|
|
||||||
#' by tree deepness level.
|
|
||||||
#'
|
|
||||||
#' The purpose of this function is to help the user to find the best trade-off to set
|
|
||||||
#' the \code{max_depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.
|
|
||||||
#'
|
|
||||||
#' See \link{xgb.train} for more information about these parameters.
|
|
||||||
#'
|
|
||||||
#' The graph is made of two parts:
|
|
||||||
#'
|
#'
|
||||||
|
#' When \code{which="2x1"}, two distributions with respect to the leaf depth
|
||||||
|
#' are plotted on top of each other:
|
||||||
#' \itemize{
|
#' \itemize{
|
||||||
#' \item Count: number of leaf per level of deepness;
|
#' \item the distribution of the number of leafs in a tree model at a certain depth;
|
||||||
#' \item Weighted cover: noramlized weighted cover per leaf (weighted number of instances).
|
#' \item the distribution of average weighted number of observations ("cover")
|
||||||
|
#' ending up in leafs at certain depth.
|
||||||
#' }
|
#' }
|
||||||
|
#' Those could be helpful in determining sensible ranges of the \code{max_depth}
|
||||||
|
#' and \code{min_child_weight} parameters.
|
||||||
#'
|
#'
|
||||||
#' This function is inspired by the blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
|
#' When \code{which="max.depth"} or \code{which="med.depth"}, plots of either maximum or median depth
|
||||||
|
#' per tree with respect to tree number are created. And \code{which="med.weight"} allows to see how
|
||||||
|
#' a tree's median absolute leaf weight changes through the iterations.
|
||||||
|
#'
|
||||||
|
#' This function was inspired by the blog post
|
||||||
|
#' \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}.
|
||||||
|
#'
|
||||||
|
#' @return
|
||||||
|
#'
|
||||||
|
#' Other than producing plots (when \code{plot=TRUE}), the \code{xgb.plot.deepness} function
|
||||||
|
#' silently returns a processed data.table where each row corresponds to a terminal leaf in a tree model,
|
||||||
|
#' and contains information about leaf's depth, cover, and weight (which is used in calculating predictions).
|
||||||
|
#'
|
||||||
|
#' The \code{xgb.ggplot.deepness} silently returns either a list of two ggplot graphs when \code{which="2x1"}
|
||||||
|
#' or a single ggplot graph for the other \code{which} options.
|
||||||
|
#'
|
||||||
|
#' @seealso
|
||||||
|
#'
|
||||||
|
#' \code{\link{xgb.train}}, \code{\link{xgb.model.dt.tree}}.
|
||||||
#'
|
#'
|
||||||
#' @examples
|
#' @examples
|
||||||
|
#'
|
||||||
#' data(agaricus.train, package='xgboost')
|
#' data(agaricus.train, package='xgboost')
|
||||||
#'
|
#'
|
||||||
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
|
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
|
||||||
#' eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
|
#' eta = 0.1, nthread = 2, nrounds = 50, objective = "binary:logistic",
|
||||||
#' min_child_weight = 50)
|
#' subsample = 0.5, min_child_weight = 2)
|
||||||
#'
|
#'
|
||||||
#' xgb.plot.deepness(model = bst)
|
#' xgb.plot.deepness(bst)
|
||||||
|
#' xgb.ggplot.deepness(bst)
|
||||||
#'
|
#'
|
||||||
|
#' xgb.plot.deepness(bst, which='max.depth', pch=16, col=rgb(0,0,1,0.3), cex=2)
|
||||||
|
#'
|
||||||
|
#' xgb.plot.deepness(bst, which='med.weight', pch=16, col=rgb(0,0,1,0.3), cex=2)
|
||||||
|
#'
|
||||||
|
#' @rdname xgb.plot.deepness
|
||||||
#' @export
|
#' @export
|
||||||
xgb.plot.deepness <- function(model = NULL) {
|
xgb.plot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.depth", "med.weight"),
|
||||||
if (!requireNamespace("ggplot2", quietly = TRUE)) {
|
plot = TRUE, ...) {
|
||||||
stop("ggplot2 package is required for plotting the graph deepness.",
|
|
||||||
call. = FALSE)
|
if (!(class(model) == "xgb.Booster" || is.data.table(model)))
|
||||||
|
stop("model: Has to be either an xgb.Booster model generaged by the xgb.train function\n",
|
||||||
|
"or a data.table result of the xgb.importance function")
|
||||||
|
|
||||||
|
if (!requireNamespace("igraph", quietly = TRUE))
|
||||||
|
stop("igraph package is required for plotting the graph deepness.", call. = FALSE)
|
||||||
|
|
||||||
|
which <- match.arg(which)
|
||||||
|
|
||||||
|
dt_tree <- model
|
||||||
|
if (class(model) == "xgb.Booster")
|
||||||
|
dt_tree <- xgb.model.dt.tree(model = model)
|
||||||
|
|
||||||
|
if (!all(c("Feature", "Tree", "ID", "Yes", "No", "Cover") %in% colnames(dt_tree)))
|
||||||
|
stop("Model tree columns are not as expected!\n",
|
||||||
|
" Note that this function works only for tree models.")
|
||||||
|
|
||||||
|
dt_depths <- merge(get.leaf.depth(dt_tree), dt_tree[, .(ID, Cover, Weight=Quality)], by = "ID")
|
||||||
|
setkeyv(dt_depths, c("Tree", "ID"))
|
||||||
|
# count by depth levels, and also calculate average cover at a depth
|
||||||
|
dt_summaries <- dt_depths[, .(.N, Cover = mean(Cover)), Depth]
|
||||||
|
setkey(dt_summaries, "Depth")
|
||||||
|
|
||||||
|
if (plot) {
|
||||||
|
if (which == "2x1") {
|
||||||
|
op <- par(no.readonly = TRUE)
|
||||||
|
par(mfrow=c(2,1),
|
||||||
|
oma = c(3,1,3,1) + 0.1,
|
||||||
|
mar = c(1,4,1,0) + 0.1)
|
||||||
|
|
||||||
|
dt_summaries[, barplot(N, border=NA, ylab = 'Number of leafs', ...)]
|
||||||
|
|
||||||
|
dt_summaries[, barplot(Cover, border=NA, ylab = "Weighted cover", names.arg=Depth, ...)]
|
||||||
|
|
||||||
|
title("Model complexity", xlab = "Leaf depth", outer = TRUE, line = 1)
|
||||||
|
par(op)
|
||||||
|
} else if (which == "max.depth") {
|
||||||
|
dt_depths[, max(Depth), Tree][
|
||||||
|
, plot(jitter(V1, amount = 0.1) ~ Tree, ylab = 'Max tree leaf depth', xlab = "tree #", ...)]
|
||||||
|
} else if (which == "med.depth") {
|
||||||
|
dt_depths[, median(as.numeric(Depth)), Tree][
|
||||||
|
, plot(jitter(V1, amount = 0.1) ~ Tree, ylab = 'Median tree leaf depth', xlab = "tree #", ...)]
|
||||||
|
} else if (which == "med.weight") {
|
||||||
|
dt_depths[, median(abs(Weight)), Tree][
|
||||||
|
, plot(V1 ~ Tree, ylab = 'Median absolute leaf weight', xlab = "tree #", ...)]
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
invisible(dt_depths)
|
||||||
|
}
|
||||||
|
|
||||||
if (!requireNamespace("igraph", quietly = TRUE)) {
|
# Extract path depths from root to leaf
|
||||||
stop("igraph package is required for plotting the graph deepness.",
|
# from data.table containing the nodes and edges of the trees.
|
||||||
call. = FALSE)
|
# internal utility function
|
||||||
}
|
get.leaf.depth <- function(dt_tree) {
|
||||||
|
# extract tree graph's edges
|
||||||
|
dt_edges <- rbindlist(list(
|
||||||
|
dt_tree[Feature != "Leaf", .(ID, To=Yes, Tree)],
|
||||||
|
dt_tree[Feature != "Leaf", .(ID, To=No, Tree)]
|
||||||
|
))
|
||||||
|
# whether "To" is a leaf:
|
||||||
|
dt_edges <-
|
||||||
|
merge(dt_edges,
|
||||||
|
dt_tree[Feature == "Leaf", .(ID, Leaf = TRUE)],
|
||||||
|
all.x = TRUE, by.x = "To", by.y = "ID")
|
||||||
|
dt_edges[is.na(Leaf), Leaf := FALSE]
|
||||||
|
|
||||||
if (!requireNamespace("grid", quietly = TRUE)) {
|
dt_edges[, {
|
||||||
stop("grid package is required for plotting the graph deepness.",
|
graph <- igraph::graph_from_data_frame(.SD[,.(ID, To)])
|
||||||
call. = FALSE)
|
# min(ID) in a tree is a root node
|
||||||
}
|
paths_tmp <- igraph::shortest_paths(graph, from = min(ID), to = To[Leaf == TRUE])
|
||||||
|
# list of paths to each leaf in a tree
|
||||||
if (class(model) != "xgb.Booster") {
|
paths <- lapply(paths_tmp$vpath, names)
|
||||||
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
|
# combine into a resulting path lengths table for a tree
|
||||||
}
|
data.table(Depth = sapply(paths, length), ID = To[Leaf == TRUE])
|
||||||
|
}, by = Tree]
|
||||||
dt.tree <- xgb.model.dt.tree(model = model)
|
|
||||||
|
|
||||||
dt.edge.elements <- data.table()
|
|
||||||
paths <- get.paths.to.leaf(dt.tree)
|
|
||||||
|
|
||||||
dt.edge.elements <-
|
|
||||||
lapply(paths, edge.parser) %>% rbindlist %>% setnames(c("last.edge", "size")) %>%
|
|
||||||
merge(dt.tree, by.x = "last.edge", by.y = "ID") %>% rbind(dt.edge.elements)
|
|
||||||
|
|
||||||
dt.edge.summuize <-
|
|
||||||
dt.edge.elements[, .(.N, Cover = sum(Cover)), size][,Cover:= Cover / sum(Cover)]
|
|
||||||
|
|
||||||
p1 <-
|
|
||||||
ggplot2::ggplot(dt.edge.summuize) + ggplot2::geom_line(ggplot2::aes(x = size, y = N, group = 1)) +
|
|
||||||
ggplot2::xlab("") + ggplot2::ylab("Count") + ggplot2::ggtitle("Model complexity") +
|
|
||||||
ggplot2::theme(
|
|
||||||
plot.title = ggplot2::element_text(lineheight = 0.9, face = "bold"),
|
|
||||||
panel.grid.major.y = ggplot2::element_blank(),
|
|
||||||
axis.ticks = ggplot2::element_blank(),
|
|
||||||
axis.text.x = ggplot2::element_blank()
|
|
||||||
)
|
|
||||||
|
|
||||||
p2 <-
|
|
||||||
ggplot2::ggplot(dt.edge.summuize) + ggplot2::geom_line(ggplot2::aes(x =size, y = Cover, group = 1)) +
|
|
||||||
ggplot2::xlab("From root to leaf path length") + ggplot2::ylab("Weighted cover")
|
|
||||||
|
|
||||||
multiplot(p1,p2,cols = 1)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# Avoid error messages during CRAN check.
|
# Avoid error messages during CRAN check.
|
||||||
@ -150,6 +144,6 @@ xgb.plot.deepness <- function(model = NULL) {
|
|||||||
# They are mainly column names inferred by Data.table...
|
# They are mainly column names inferred by Data.table...
|
||||||
globalVariables(
|
globalVariables(
|
||||||
c(
|
c(
|
||||||
".N", "N", "size", "Feature", "Count", "ggplot", "aes", "geom_bar", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text", "ID", "Yes", "No", "Tree"
|
".N", "N", "Depth", "Quality", "Cover", "Tree", "ID", "Yes", "No", "Feature"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|||||||
@ -1,79 +1,125 @@
|
|||||||
#' Plot feature importance bar graph
|
#' Plot feature importance as a bar graph
|
||||||
#'
|
#'
|
||||||
#' Read a data.table containing feature importance details and plot it (for both GLM and Trees).
|
#' Represents previously calculated feature importance as a bar graph.
|
||||||
|
#' \code{xgb.plot.importance} uses base R graphics, while \code{xgb.ggplot.importance} uses the ggplot backend.
|
||||||
#'
|
#'
|
||||||
#' @param importance_matrix a \code{data.table} returned by the \code{xgb.importance} function.
|
#' @param importance_matrix a \code{data.table} returned by \code{\link{xgb.importance}}.
|
||||||
#' @param n_clusters a \code{numeric} vector containing the min and the max range of the possible number of clusters of bars.
|
#' @param top_n maximal number of top features to include into the plot.
|
||||||
#' @param ... currently not used
|
#' @param measure the name of importance measure to plot.
|
||||||
#'
|
#' When \code{NULL}, 'Gain' would be used for trees and 'Weight' would be used for gblinear.
|
||||||
#' @return A \code{ggplot2} bar graph representing each feature by a horizontal bar. Longer is the bar, more important is the feature. Features are classified by importance and clustered by importance. The group is represented through the color of the bar.
|
#' @param rel_to_first whether importance values should be represented as relative to the highest ranked feature.
|
||||||
|
#' See Details.
|
||||||
|
#' @param left_margin (base R barplot) allows to adjust the left margin size to fit feature names.
|
||||||
|
#' When it is NULL, the existing \code{par('mar')} is used.
|
||||||
|
#' @param cex (base R barplot) passed as \code{cex.names} parameter to \code{barplot}.
|
||||||
|
#' @param plot (base R barplot) whether a barplot should be produced.
|
||||||
|
#' If FALSE, only a data.table is returned.
|
||||||
|
#' @param n_clusters (ggplot only) a \code{numeric} vector containing the min and the max range
|
||||||
|
#' of the possible number of clusters of bars.
|
||||||
|
#' @param ... other parameters passed to \code{barplot} (except horiz, border, cex.names, names.arg, and las).
|
||||||
#'
|
#'
|
||||||
#' @details
|
#' @details
|
||||||
#' The purpose of this function is to easily represent the importance of each feature of a model.
|
#' The graph represents each feature as a horizontal bar of length proportional to the importance of a feature.
|
||||||
#' The function returns a ggplot graph, therefore each of its characteristic can be overriden (to customize it).
|
#' Features are shown ranked in a decreasing importance order.
|
||||||
#' In particular you may want to override the title of the graph. To do so, add \code{+ ggtitle("A GRAPH NAME")} next to the value returned by this function.
|
#' It works for importances from both \code{gblinear} and \code{gbtree} models.
|
||||||
|
#'
|
||||||
|
#' When \code{rel_to_first = FALSE}, the values would be plotted as they were in \code{importance_matrix}.
|
||||||
|
#' For gbtree model, that would mean being normalized to the total of 1
|
||||||
|
#' ("what is feature's importance contribution relative to the whole model?").
|
||||||
|
#' For linear models, \code{rel_to_first = FALSE} would show actual values of the coefficients.
|
||||||
|
#' Setting \code{rel_to_first = TRUE} allows to see the picture from the perspective of
|
||||||
|
#' "what is feature's importance contribution relative to the most important feature?"
|
||||||
|
#'
|
||||||
|
#' The ggplot-backend method also performs 1-D custering of the importance values,
|
||||||
|
#' with bar colors coresponding to different clusters that have somewhat similar importance values.
|
||||||
|
#'
|
||||||
|
#' @return
|
||||||
|
#' The \code{xgb.plot.importance} function creates a \code{barplot} (when \code{plot=TRUE})
|
||||||
|
#' and silently returns a processed data.table with \code{n_top} features sorted by importance.
|
||||||
|
#'
|
||||||
|
#' The \code{xgb.ggplot.importance} function returns a ggplot graph which could be customized afterwards.
|
||||||
|
#' E.g., to change the title of the graph, add \code{+ ggtitle("A GRAPH NAME")} to the result.
|
||||||
|
#'
|
||||||
|
#' @seealso
|
||||||
|
#' \code{\link[graphics]{barplot}}.
|
||||||
#'
|
#'
|
||||||
#' @examples
|
#' @examples
|
||||||
#' data(agaricus.train, package='xgboost')
|
#' data(agaricus.train)
|
||||||
#'
|
#'
|
||||||
#' #Both dataset are list with two items, a sparse matrix and labels
|
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
|
||||||
#' #(labels = outcome column which will be learned).
|
|
||||||
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
|
|
||||||
#'
|
|
||||||
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
|
|
||||||
#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
|
#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
|
||||||
#'
|
#'
|
||||||
#' importance_matrix <- xgb.importance(colnames(agaricus.train$data), model = bst)
|
#' importance_matrix <- xgb.importance(colnames(agaricus.train$data), model = bst)
|
||||||
#' xgb.plot.importance(importance_matrix)
|
|
||||||
#'
|
#'
|
||||||
|
#' xgb.plot.importance(importance_matrix, rel_to_first = TRUE, xlab = "Relative importance")
|
||||||
|
#'
|
||||||
|
#' (gg <- xgb.ggplot.importance(importance_matrix, measure = "Frequency", rel_to_first = TRUE))
|
||||||
|
#' gg + ggplot2::ylab("Frequency")
|
||||||
|
#'
|
||||||
|
#' @rdname xgb.plot.importance
|
||||||
#' @export
|
#' @export
|
||||||
xgb.plot.importance <-
|
xgb.plot.importance <- function(importance_matrix = NULL, top_n = NULL, measure = NULL,
|
||||||
function(importance_matrix = NULL, n_clusters = c(1:10), ...) {
|
rel_to_first = FALSE, left_margin = 10, cex = NULL, plot = TRUE, ...) {
|
||||||
check.deprecation(...)
|
check.deprecation(...)
|
||||||
if (!"data.table" %in% class(importance_matrix)) {
|
if (!"data.table" %in% class(importance_matrix)) {
|
||||||
stop("importance_matrix: Should be a data.table.")
|
stop("importance_matrix: Should be a data.table.")
|
||||||
}
|
|
||||||
if (!requireNamespace("ggplot2", quietly = TRUE)) {
|
|
||||||
stop("ggplot2 package is required for plotting the importance", call. = FALSE)
|
|
||||||
}
|
|
||||||
if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) {
|
|
||||||
stop("Ckmeans.1d.dp package is required for plotting the importance", call. = FALSE)
|
|
||||||
}
|
|
||||||
|
|
||||||
if(isTRUE(all.equal(colnames(importance_matrix), c("Feature", "Gain", "Cover", "Frequency")))){
|
|
||||||
y.axe.name <- "Gain"
|
|
||||||
} else if(isTRUE(all.equal(colnames(importance_matrix), c("Feature", "Weight")))){
|
|
||||||
y.axe.name <- "Weight"
|
|
||||||
} else {
|
|
||||||
stop("Importance matrix is not correct (column names issue)")
|
|
||||||
}
|
|
||||||
|
|
||||||
# To avoid issues in clustering when co-occurences are used
|
|
||||||
importance_matrix <-
|
|
||||||
importance_matrix[, .(Gain.or.Weight = sum(get(y.axe.name))), by = Feature]
|
|
||||||
|
|
||||||
clusters <-
|
|
||||||
suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain.or.Weight], n_clusters))
|
|
||||||
importance_matrix[,"Cluster":= clusters$cluster %>% as.character]
|
|
||||||
|
|
||||||
plot <-
|
|
||||||
ggplot2::ggplot(
|
|
||||||
importance_matrix, ggplot2::aes(
|
|
||||||
x = stats::reorder(Feature, Gain.or.Weight), y = Gain.or.Weight, width = 0.05
|
|
||||||
), environment = environment()
|
|
||||||
) + ggplot2::geom_bar(ggplot2::aes(fill = Cluster), stat = "identity", position =
|
|
||||||
"identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab(y.axe.name) + ggplot2::ggtitle("Feature importance") + ggplot2::theme(
|
|
||||||
plot.title = ggplot2::element_text(lineheight = .9, face = "bold"), panel.grid.major.y = ggplot2::element_blank()
|
|
||||||
)
|
|
||||||
|
|
||||||
return(plot)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
imp_names <- colnames(importance_matrix)
|
||||||
|
if (is.null(measure)) {
|
||||||
|
if (all(c("Feature", "Gain") %in% imp_names)) {
|
||||||
|
measure <- "Gain"
|
||||||
|
} else if (all(c("Feature", "Weight") %in% imp_names)) {
|
||||||
|
measure <- "Weight"
|
||||||
|
} else {
|
||||||
|
stop("Importance matrix column names are not as expected!")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (!measure %in% imp_names)
|
||||||
|
stop("Invalid `measure`")
|
||||||
|
if (!"Feature" %in% imp_names)
|
||||||
|
stop("Importance matrix column names are not as expected!")
|
||||||
|
}
|
||||||
|
|
||||||
|
# also aggregate, just in case when the values were not yet summed up by feature
|
||||||
|
importance_matrix <- importance_matrix[, Importance := sum(get(measure)), by = Feature]
|
||||||
|
|
||||||
|
# make sure it's ordered
|
||||||
|
importance_matrix <- importance_matrix[order(-abs(Importance))]
|
||||||
|
|
||||||
|
if (!is.null(top_n)) {
|
||||||
|
top_n <- min(top_n, nrow(importance_matrix))
|
||||||
|
importance_matrix <- head(importance_matrix, top_n)
|
||||||
|
}
|
||||||
|
if (rel_to_first) {
|
||||||
|
importance_matrix[, Importance := Importance/max(abs(Importance))]
|
||||||
|
}
|
||||||
|
if (is.null(cex)) {
|
||||||
|
cex <- 2.5/log2(1 + nrow(importance_matrix))
|
||||||
|
}
|
||||||
|
|
||||||
|
if (plot) {
|
||||||
|
op <- par(no.readonly = TRUE)
|
||||||
|
mar <- op$mar
|
||||||
|
if (!is.null(left_margin))
|
||||||
|
mar[2] <- left_margin
|
||||||
|
par(mar = mar)
|
||||||
|
|
||||||
|
# reverse the order of rows to have the highest ranked at the top
|
||||||
|
importance_matrix[nrow(importance_matrix):1,
|
||||||
|
barplot(Importance, horiz=TRUE, border=NA, cex.names=cex,
|
||||||
|
names.arg=Feature, las=1, ...)]
|
||||||
|
grid(NULL, NA)
|
||||||
|
# redraw over the grid
|
||||||
|
importance_matrix[nrow(importance_matrix):1,
|
||||||
|
barplot(Importance, horiz=TRUE, border=NA, add=TRUE)]
|
||||||
|
par(op)
|
||||||
|
}
|
||||||
|
|
||||||
|
invisible(importance_matrix)
|
||||||
|
}
|
||||||
|
|
||||||
# Avoid error messages during CRAN check.
|
# Avoid error messages during CRAN check.
|
||||||
# The reason is that these variables are never declared
|
# The reason is that these variables are never declared
|
||||||
# They are mainly column names inferred by Data.table...
|
# They are mainly column names inferred by Data.table...
|
||||||
globalVariables(
|
globalVariables(c("Feature", "Importance"))
|
||||||
c(
|
|
||||||
"Feature", "Gain.or.Weight", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text", "Gain.or.Weight"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|||||||
@ -49,7 +49,7 @@ xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5,
|
|||||||
|
|
||||||
# first number of the path represents the tree, then the following numbers are related to the path to follow
|
# first number of the path represents the tree, then the following numbers are related to the path to follow
|
||||||
# root init
|
# root init
|
||||||
root.nodes <- tree.matrix[str_detect(ID, "\\d+-0"), ID]
|
root.nodes <- tree.matrix[stri_detect_regex(ID, "\\d+-0"), ID]
|
||||||
tree.matrix[ID %in% root.nodes, abs.node.position:=root.nodes]
|
tree.matrix[ID %in% root.nodes, abs.node.position:=root.nodes]
|
||||||
|
|
||||||
precedent.nodes <- root.nodes
|
precedent.nodes <- root.nodes
|
||||||
@ -70,7 +70,7 @@ xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5,
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
remove.tree <- . %>% str_replace(pattern = "^\\d+-", replacement = "")
|
remove.tree <- . %>% stri_replace_first_regex(pattern = "^\\d+-", replacement = "")
|
||||||
|
|
||||||
tree.matrix[,`:=`(abs.node.position=remove.tree(abs.node.position), Yes=remove.tree(Yes), No=remove.tree(No))]
|
tree.matrix[,`:=`(abs.node.position=remove.tree(abs.node.position), Yes=remove.tree(Yes), No=remove.tree(No))]
|
||||||
|
|
||||||
|
|||||||
@ -87,14 +87,15 @@ NULL
|
|||||||
#' @importFrom data.table as.data.table
|
#' @importFrom data.table as.data.table
|
||||||
#' @importFrom data.table :=
|
#' @importFrom data.table :=
|
||||||
#' @importFrom data.table rbindlist
|
#' @importFrom data.table rbindlist
|
||||||
|
#' @importFrom data.table setkey
|
||||||
|
#' @importFrom data.table setkeyv
|
||||||
#' @importFrom data.table setnames
|
#' @importFrom data.table setnames
|
||||||
#' @importFrom magrittr %>%
|
#' @importFrom magrittr %>%
|
||||||
#' @importFrom stringr str_detect
|
#' @importFrom stringi stri_detect_regex
|
||||||
#' @importFrom stringr str_extract
|
#' @importFrom stringi stri_match_first_regex
|
||||||
#' @importFrom stringr str_match
|
#' @importFrom stringi stri_replace_first_regex
|
||||||
#' @importFrom stringr str_replace
|
#' @importFrom stringi stri_replace_all_regex
|
||||||
#' @importFrom stringr str_replace_all
|
#' @importFrom stringi stri_split_regex
|
||||||
#' @importFrom stringr str_split
|
|
||||||
#' @importFrom utils object.size str tail
|
#' @importFrom utils object.size str tail
|
||||||
#' @importFrom stats predict
|
#' @importFrom stats predict
|
||||||
#'
|
#'
|
||||||
|
|||||||
@ -1,15 +0,0 @@
|
|||||||
% Generated by roxygen2: do not edit by hand
|
|
||||||
% Please edit documentation in R/xgb.plot.deepness.R
|
|
||||||
\name{edge.parser}
|
|
||||||
\alias{edge.parser}
|
|
||||||
\title{Parse the graph to extract vector of edges}
|
|
||||||
\usage{
|
|
||||||
edge.parser(element)
|
|
||||||
}
|
|
||||||
\arguments{
|
|
||||||
\item{element}{igraph object containing the path from the root to the leaf.}
|
|
||||||
}
|
|
||||||
\description{
|
|
||||||
Parse the graph to extract vector of edges
|
|
||||||
}
|
|
||||||
|
|
||||||
@ -1,15 +0,0 @@
|
|||||||
% Generated by roxygen2: do not edit by hand
|
|
||||||
% Please edit documentation in R/xgb.plot.deepness.R
|
|
||||||
\name{get.paths.to.leaf}
|
|
||||||
\alias{get.paths.to.leaf}
|
|
||||||
\title{Extract path from root to leaf from data.table}
|
|
||||||
\usage{
|
|
||||||
get.paths.to.leaf(dt_tree)
|
|
||||||
}
|
|
||||||
\arguments{
|
|
||||||
\item{dt_tree}{data.table containing the nodes and edges of the trees}
|
|
||||||
}
|
|
||||||
\description{
|
|
||||||
Extract path from root to leaf from data.table
|
|
||||||
}
|
|
||||||
|
|
||||||
@ -1,17 +0,0 @@
|
|||||||
% Generated by roxygen2: do not edit by hand
|
|
||||||
% Please edit documentation in R/xgb.plot.deepness.R
|
|
||||||
\name{multiplot}
|
|
||||||
\alias{multiplot}
|
|
||||||
\title{Plot multiple graphs at the same time}
|
|
||||||
\usage{
|
|
||||||
multiplot(..., cols = 1)
|
|
||||||
}
|
|
||||||
\arguments{
|
|
||||||
\item{...}{the plots}
|
|
||||||
|
|
||||||
\item{cols}{number of columns}
|
|
||||||
}
|
|
||||||
\description{
|
|
||||||
Plot multiple graph aligned by rows and columns.
|
|
||||||
}
|
|
||||||
|
|
||||||
@ -1,46 +1,74 @@
|
|||||||
% Generated by roxygen2: do not edit by hand
|
% Generated by roxygen2: do not edit by hand
|
||||||
% Please edit documentation in R/xgb.plot.deepness.R
|
% Please edit documentation in R/xgb.ggplot.R, R/xgb.plot.deepness.R
|
||||||
\name{xgb.plot.deepness}
|
\name{xgb.ggplot.deepness}
|
||||||
|
\alias{xgb.ggplot.deepness}
|
||||||
\alias{xgb.plot.deepness}
|
\alias{xgb.plot.deepness}
|
||||||
\title{Plot model trees deepness}
|
\title{Plot model trees deepness}
|
||||||
\usage{
|
\usage{
|
||||||
xgb.plot.deepness(model = NULL)
|
xgb.ggplot.deepness(model = NULL, which = c("2x1", "max.depth", "med.depth",
|
||||||
|
"med.weight"))
|
||||||
|
|
||||||
|
xgb.plot.deepness(model = NULL, which = c("2x1", "max.depth", "med.depth",
|
||||||
|
"med.weight"), plot = TRUE, ...)
|
||||||
}
|
}
|
||||||
\arguments{
|
\arguments{
|
||||||
\item{model}{dump generated by the \code{xgb.train} function.}
|
\item{model}{either an \code{xgb.Booster} model generated by the \code{xgb.train} function
|
||||||
|
or a data.table result of the \code{xgb.model.dt.tree} function.}
|
||||||
|
|
||||||
|
\item{which}{which distribution to plot (see details).}
|
||||||
|
|
||||||
|
\item{plot}{(base R barplot) whether a barplot should be produced.
|
||||||
|
If FALSE, only a data.table is returned.}
|
||||||
|
|
||||||
|
\item{...}{other parameters passed to \code{barplot} or \code{plot}.}
|
||||||
}
|
}
|
||||||
\value{
|
\value{
|
||||||
Two graphs showing the distribution of the model deepness.
|
Other than producing plots (when \code{plot=TRUE}), the \code{xgb.plot.deepness} function
|
||||||
|
silently returns a processed data.table where each row corresponds to a terminal leaf in a tree model,
|
||||||
|
and contains information about leaf's depth, cover, and weight (which is used in calculating predictions).
|
||||||
|
|
||||||
|
The \code{xgb.ggplot.deepness} silently returns either a list of two ggplot graphs when \code{which="2x1"}
|
||||||
|
or a single ggplot graph for the other \code{which} options.
|
||||||
}
|
}
|
||||||
\description{
|
\description{
|
||||||
Generate a graph to plot the distribution of deepness among trees.
|
Visualizes distributions related to depth of tree leafs.
|
||||||
|
\code{xgb.plot.deepness} uses base R graphics, while \code{xgb.ggplot.deepness} uses the ggplot backend.
|
||||||
}
|
}
|
||||||
\details{
|
\details{
|
||||||
Display both the number of \code{leaf} and the distribution of \code{weighted observations}
|
When \code{which="2x1"}, two distributions with respect to the leaf depth
|
||||||
by tree deepness level.
|
are plotted on top of each other:
|
||||||
|
|
||||||
The purpose of this function is to help the user to find the best trade-off to set
|
|
||||||
the \code{max_depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.
|
|
||||||
|
|
||||||
See \link{xgb.train} for more information about these parameters.
|
|
||||||
|
|
||||||
The graph is made of two parts:
|
|
||||||
|
|
||||||
\itemize{
|
\itemize{
|
||||||
\item Count: number of leaf per level of deepness;
|
\item the distribution of the number of leafs in a tree model at a certain depth;
|
||||||
\item Weighted cover: noramlized weighted cover per leaf (weighted number of instances).
|
\item the distribution of average weighted number of observations ("cover")
|
||||||
|
ending up in leafs at certain depth.
|
||||||
}
|
}
|
||||||
|
Those could be helpful in determining sensible ranges of the \code{max_depth}
|
||||||
|
and \code{min_child_weight} parameters.
|
||||||
|
|
||||||
This function is inspired by the blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
|
When \code{which="max.depth"} or \code{which="med.depth"}, plots of either maximum or median depth
|
||||||
|
per tree with respect to tree number are created. And \code{which="med.weight"} allows to see how
|
||||||
|
a tree's median absolute leaf weight changes through the iterations.
|
||||||
|
|
||||||
|
This function was inspired by the blog post
|
||||||
|
\url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}.
|
||||||
}
|
}
|
||||||
\examples{
|
\examples{
|
||||||
|
|
||||||
data(agaricus.train, package='xgboost')
|
data(agaricus.train, package='xgboost')
|
||||||
|
|
||||||
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
|
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
|
||||||
eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
|
eta = 0.1, nthread = 2, nrounds = 50, objective = "binary:logistic",
|
||||||
min_child_weight = 50)
|
subsample = 0.5, min_child_weight = 2)
|
||||||
|
|
||||||
xgb.plot.deepness(model = bst)
|
xgb.plot.deepness(bst)
|
||||||
|
xgb.ggplot.deepness(bst)
|
||||||
|
|
||||||
|
xgb.plot.deepness(bst, which='max.depth', pch=16, col=rgb(0,0,1,0.3), cex=2)
|
||||||
|
|
||||||
|
xgb.plot.deepness(bst, which='med.weight', pch=16, col=rgb(0,0,1,0.3), cex=2)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
\seealso{
|
||||||
|
\code{\link{xgb.train}}, \code{\link{xgb.model.dt.tree}}.
|
||||||
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,41 +1,82 @@
|
|||||||
% Generated by roxygen2: do not edit by hand
|
% Generated by roxygen2: do not edit by hand
|
||||||
% Please edit documentation in R/xgb.plot.importance.R
|
% Please edit documentation in R/xgb.ggplot.R, R/xgb.plot.importance.R
|
||||||
\name{xgb.plot.importance}
|
\name{xgb.ggplot.importance}
|
||||||
|
\alias{xgb.ggplot.importance}
|
||||||
\alias{xgb.plot.importance}
|
\alias{xgb.plot.importance}
|
||||||
\title{Plot feature importance bar graph}
|
\title{Plot feature importance as a bar graph}
|
||||||
\usage{
|
\usage{
|
||||||
xgb.plot.importance(importance_matrix = NULL, n_clusters = c(1:10), ...)
|
xgb.ggplot.importance(importance_matrix = NULL, top_n = NULL,
|
||||||
|
measure = NULL, rel_to_first = FALSE, n_clusters = c(1:10), ...)
|
||||||
|
|
||||||
|
xgb.plot.importance(importance_matrix = NULL, top_n = NULL,
|
||||||
|
measure = NULL, rel_to_first = FALSE, left_margin = 10, cex = NULL,
|
||||||
|
plot = TRUE, ...)
|
||||||
}
|
}
|
||||||
\arguments{
|
\arguments{
|
||||||
\item{importance_matrix}{a \code{data.table} returned by the \code{xgb.importance} function.}
|
\item{importance_matrix}{a \code{data.table} returned by \code{\link{xgb.importance}}.}
|
||||||
|
|
||||||
\item{n_clusters}{a \code{numeric} vector containing the min and the max range of the possible number of clusters of bars.}
|
\item{top_n}{maximal number of top features to include into the plot.}
|
||||||
|
|
||||||
\item{...}{currently not used}
|
\item{measure}{the name of importance measure to plot.
|
||||||
|
When \code{NULL}, 'Gain' would be used for trees and 'Weight' would be used for gblinear.}
|
||||||
|
|
||||||
|
\item{rel_to_first}{whether importance values should be represented as relative to the highest ranked feature.
|
||||||
|
See Details.}
|
||||||
|
|
||||||
|
\item{n_clusters}{(ggplot only) a \code{numeric} vector containing the min and the max range
|
||||||
|
of the possible number of clusters of bars.}
|
||||||
|
|
||||||
|
\item{...}{other parameters passed to \code{barplot} (except horiz, border, cex.names, names.arg, and las).}
|
||||||
|
|
||||||
|
\item{left_margin}{(base R barplot) allows to adjust the left margin size to fit feature names.
|
||||||
|
When it is NULL, the existing \code{par('mar')} is used.}
|
||||||
|
|
||||||
|
\item{cex}{(base R barplot) passed as \code{cex.names} parameter to \code{barplot}.}
|
||||||
|
|
||||||
|
\item{plot}{(base R barplot) whether a barplot should be produced.
|
||||||
|
If FALSE, only a data.table is returned.}
|
||||||
}
|
}
|
||||||
\value{
|
\value{
|
||||||
A \code{ggplot2} bar graph representing each feature by a horizontal bar. Longer is the bar, more important is the feature. Features are classified by importance and clustered by importance. The group is represented through the color of the bar.
|
The \code{xgb.plot.importance} function creates a \code{barplot} (when \code{plot=TRUE})
|
||||||
|
and silently returns a processed data.table with \code{n_top} features sorted by importance.
|
||||||
|
|
||||||
|
The \code{xgb.ggplot.importance} function returns a ggplot graph which could be customized afterwards.
|
||||||
|
E.g., to change the title of the graph, add \code{+ ggtitle("A GRAPH NAME")} to the result.
|
||||||
}
|
}
|
||||||
\description{
|
\description{
|
||||||
Read a data.table containing feature importance details and plot it (for both GLM and Trees).
|
Represents previously calculated feature importance as a bar graph.
|
||||||
|
\code{xgb.plot.importance} uses base R graphics, while \code{xgb.ggplot.importance} uses the ggplot backend.
|
||||||
}
|
}
|
||||||
\details{
|
\details{
|
||||||
The purpose of this function is to easily represent the importance of each feature of a model.
|
The graph represents each feature as a horizontal bar of length proportional to the importance of a feature.
|
||||||
The function returns a ggplot graph, therefore each of its characteristic can be overriden (to customize it).
|
Features are shown ranked in a decreasing importance order.
|
||||||
In particular you may want to override the title of the graph. To do so, add \code{+ ggtitle("A GRAPH NAME")} next to the value returned by this function.
|
It works for importances from both \code{gblinear} and \code{gbtree} models.
|
||||||
|
|
||||||
|
When \code{rel_to_first = FALSE}, the values would be plotted as they were in \code{importance_matrix}.
|
||||||
|
For gbtree model, that would mean being normalized to the total of 1
|
||||||
|
("what is feature's importance contribution relative to the whole model?").
|
||||||
|
For linear models, \code{rel_to_first = FALSE} would show actual values of the coefficients.
|
||||||
|
Setting \code{rel_to_first = TRUE} allows to see the picture from the perspective of
|
||||||
|
"what is feature's importance contribution relative to the most important feature?"
|
||||||
|
|
||||||
|
The ggplot-backend method also performs 1-D custering of the importance values,
|
||||||
|
with bar colors coresponding to different clusters that have somewhat similar importance values.
|
||||||
}
|
}
|
||||||
\examples{
|
\examples{
|
||||||
data(agaricus.train, package='xgboost')
|
data(agaricus.train)
|
||||||
|
|
||||||
#Both dataset are list with two items, a sparse matrix and labels
|
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
|
||||||
#(labels = outcome column which will be learned).
|
|
||||||
#Each column of the sparse Matrix is a feature in one hot encoding format.
|
|
||||||
|
|
||||||
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
|
|
||||||
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
|
eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
|
||||||
|
|
||||||
importance_matrix <- xgb.importance(colnames(agaricus.train$data), model = bst)
|
importance_matrix <- xgb.importance(colnames(agaricus.train$data), model = bst)
|
||||||
xgb.plot.importance(importance_matrix)
|
|
||||||
|
xgb.plot.importance(importance_matrix, rel_to_first = TRUE, xlab = "Relative importance")
|
||||||
|
|
||||||
|
(gg <- xgb.ggplot.importance(importance_matrix, measure = "Frequency", rel_to_first = TRUE))
|
||||||
|
gg + ggplot2::ylab("Frequency")
|
||||||
|
|
||||||
}
|
}
|
||||||
|
\seealso{
|
||||||
|
\code{\link[graphics]{barplot}}.
|
||||||
|
}
|
||||||
|
|
||||||
|
|||||||
@ -264,7 +264,7 @@ test_that("prediction in early-stopping xgb.cv works", {
|
|||||||
set.seed(1)
|
set.seed(1)
|
||||||
expect_output(
|
expect_output(
|
||||||
cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.1, nrounds = 20,
|
cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.1, nrounds = 20,
|
||||||
early_stopping_rounds = 5, maximize = FALSE, prediction=TRUE)
|
early_stopping_rounds = 5, maximize = FALSE, prediction = TRUE)
|
||||||
, "Stopping. Best iteration")
|
, "Stopping. Best iteration")
|
||||||
|
|
||||||
expect_false(is.null(cv$best_iteration))
|
expect_false(is.null(cv$best_iteration))
|
||||||
@ -279,3 +279,17 @@ test_that("prediction in early-stopping xgb.cv works", {
|
|||||||
err_log_last <- cv$evaluation_log[cv$niter, test_error_mean]
|
err_log_last <- cv$evaluation_log[cv$niter, test_error_mean]
|
||||||
expect_gt(abs(err_pred - err_log_last), 1e-4)
|
expect_gt(abs(err_pred - err_log_last), 1e-4)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
test_that("prediction in xgb.cv for softprob works", {
|
||||||
|
lb <- as.numeric(iris$Species) - 1
|
||||||
|
set.seed(11)
|
||||||
|
expect_warning(
|
||||||
|
cv <- xgb.cv(data = as.matrix(iris[, -5]), label = lb, nfold = 4,
|
||||||
|
eta = 0.5, nrounds = 5, max_depth = 3, nthread = 2,
|
||||||
|
subsample = 0.8, gamma = 2,
|
||||||
|
prediction = TRUE, objective = "multi:softprob", num_class = 3)
|
||||||
|
, NA)
|
||||||
|
expect_false(is.null(cv$pred))
|
||||||
|
expect_equal(dim(cv$pred), c(nrow(iris), 3))
|
||||||
|
expect_lt(diff(range(rowSums(cv$pred))), 1e-6)
|
||||||
|
})
|
||||||
|
|||||||
@ -24,12 +24,22 @@ feature.names <- colnames(sparse_matrix)
|
|||||||
|
|
||||||
test_that("xgb.dump works", {
|
test_that("xgb.dump works", {
|
||||||
expect_length(xgb.dump(bst.Tree), 172)
|
expect_length(xgb.dump(bst.Tree), 172)
|
||||||
expect_length(xgb.dump(bst.GLM), 14)
|
|
||||||
expect_true(xgb.dump(bst.Tree, 'xgb.model.dump', with_stats = T))
|
expect_true(xgb.dump(bst.Tree, 'xgb.model.dump', with_stats = T))
|
||||||
expect_true(file.exists('xgb.model.dump'))
|
expect_true(file.exists('xgb.model.dump'))
|
||||||
expect_gt(file.size('xgb.model.dump'), 8000)
|
expect_gt(file.size('xgb.model.dump'), 8000)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
test_that("xgb.dump works for gblinear", {
|
||||||
|
expect_length(xgb.dump(bst.GLM), 14)
|
||||||
|
# also make sure that it works properly for a sparse model where some coefficients
|
||||||
|
# are 0 from setting large L1 regularization:
|
||||||
|
bst.GLM.sp <- xgboost(data = sparse_matrix, label = label, eta = 1, nthread = 2, nrounds = 1,
|
||||||
|
alpha=2, objective = "binary:logistic", booster = "gblinear")
|
||||||
|
d.sp <- xgb.dump(bst.GLM.sp)
|
||||||
|
expect_length(d.sp, 14)
|
||||||
|
expect_gt(sum(d.sp == "0"), 0)
|
||||||
|
})
|
||||||
|
|
||||||
test_that("xgb-attribute functionality", {
|
test_that("xgb-attribute functionality", {
|
||||||
val <- "my attribute value"
|
val <- "my attribute value"
|
||||||
list.val <- list(my_attr=val, a=123, b='ok')
|
list.val <- list(my_attr=val, a=123, b='ok')
|
||||||
@ -76,7 +86,9 @@ test_that("xgb.importance works with and without feature names", {
|
|||||||
expect_equal(dim(importance.Tree), c(7, 4))
|
expect_equal(dim(importance.Tree), c(7, 4))
|
||||||
expect_equal(colnames(importance.Tree), c("Feature", "Gain", "Cover", "Frequency"))
|
expect_equal(colnames(importance.Tree), c("Feature", "Gain", "Cover", "Frequency"))
|
||||||
expect_output(str(xgb.importance(model = bst.Tree)), 'Feature.*\\"3\\"')
|
expect_output(str(xgb.importance(model = bst.Tree)), 'Feature.*\\"3\\"')
|
||||||
xgb.plot.importance(importance_matrix = importance.Tree)
|
imp2plot <- xgb.plot.importance(importance_matrix = importance.Tree)
|
||||||
|
expect_equal(colnames(imp2plot), c("Feature", "Gain", "Cover", "Frequency", "Importance"))
|
||||||
|
xgb.ggplot.importance(importance_matrix = importance.Tree)
|
||||||
})
|
})
|
||||||
|
|
||||||
test_that("xgb.importance works with GLM model", {
|
test_that("xgb.importance works with GLM model", {
|
||||||
@ -84,7 +96,9 @@ test_that("xgb.importance works with GLM model", {
|
|||||||
expect_equal(dim(importance.GLM), c(10, 2))
|
expect_equal(dim(importance.GLM), c(10, 2))
|
||||||
expect_equal(colnames(importance.GLM), c("Feature", "Weight"))
|
expect_equal(colnames(importance.GLM), c("Feature", "Weight"))
|
||||||
xgb.importance(model = bst.GLM)
|
xgb.importance(model = bst.GLM)
|
||||||
xgb.plot.importance(importance.GLM)
|
imp2plot <- xgb.plot.importance(importance.GLM)
|
||||||
|
expect_equal(colnames(imp2plot), c("Feature", "Weight", "Importance"))
|
||||||
|
xgb.ggplot.importance(importance.GLM)
|
||||||
})
|
})
|
||||||
|
|
||||||
test_that("xgb.plot.tree works with and without feature names", {
|
test_that("xgb.plot.tree works with and without feature names", {
|
||||||
@ -98,7 +112,10 @@ test_that("xgb.plot.multi.trees works with and without feature names", {
|
|||||||
})
|
})
|
||||||
|
|
||||||
test_that("xgb.plot.deepness works", {
|
test_that("xgb.plot.deepness works", {
|
||||||
xgb.plot.deepness(model = bst.Tree)
|
d2p <- xgb.plot.deepness(model = bst.Tree)
|
||||||
|
expect_equal(colnames(d2p), c("ID", "Tree", "Depth", "Cover", "Weight"))
|
||||||
|
xgb.plot.deepness(model = bst.Tree, which = "med.depth")
|
||||||
|
xgb.ggplot.deepness(model = bst.Tree)
|
||||||
})
|
})
|
||||||
|
|
||||||
test_that("check.deprecation works", {
|
test_that("check.deprecation works", {
|
||||||
|
|||||||
@ -35,6 +35,7 @@
|
|||||||
#if DMLC_ENABLE_STD_THREAD
|
#if DMLC_ENABLE_STD_THREAD
|
||||||
#include "../src/data/sparse_page_source.cc"
|
#include "../src/data/sparse_page_source.cc"
|
||||||
#include "../src/data/sparse_page_dmatrix.cc"
|
#include "../src/data/sparse_page_dmatrix.cc"
|
||||||
|
#include "../src/data/sparse_page_writer.cc"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// tress
|
// tress
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user