Plot model deepness
New function to explore the model by plotting how the splits are made.
This commit is contained in:
parent
fe7cdcefb4
commit
d9fe9c5d8a
@ -33,4 +33,4 @@ Imports:
|
||||
data.table (>= 1.9.6),
|
||||
magrittr (>= 1.5),
|
||||
stringr (>= 0.6.2)
|
||||
RoxygenNote: 5.0.0
|
||||
RoxygenNote: 5.0.1
|
||||
|
||||
@ -10,6 +10,7 @@ export(xgb.dump)
|
||||
export(xgb.importance)
|
||||
export(xgb.load)
|
||||
export(xgb.model.dt.tree)
|
||||
export(xgb.plot.deepness)
|
||||
export(xgb.plot.importance)
|
||||
export(xgb.plot.tree)
|
||||
export(xgb.save)
|
||||
|
||||
172
R-package/R/xgb.plot.deepness.R
Normal file
172
R-package/R/xgb.plot.deepness.R
Normal file
@ -0,0 +1,172 @@
|
||||
#' Plot multiple graphs at the same time
#'
#' Plot multiple graphs aligned by rows and columns on a single \code{grid} page.
#'
#' @param ... plot objects to draw. Each one is rendered by calling \code{print}
#'   on it; when more than one plot is supplied, \code{print} is called with a
#'   \code{vp} (viewport) argument, so the objects' print method must accept it.
#' @param cols number of columns
#' @return \code{NULL}, invisibly. The function is called for its side effect
#'   of drawing on the current graphics device.
multiplot <- function(..., cols = 1) {
  plots <- list(...)
  num.plots <- length(plots)

  # Nothing to draw: return before building a zero-row layout, which would
  # otherwise make the drawing loop fail.
  if (num.plots == 0) {
    return(invisible(NULL))
  }

  # A single plot needs no layout at all.
  if (num.plots == 1) {
    print(plots[[1]])
    return(invisible(NULL))
  }

  # Cell numbers are laid out column-wise: plot i goes in the cell holding i.
  rows <- ceiling(num.plots / cols)
  layout <- matrix(seq_len(cols * rows), ncol = cols, nrow = rows)

  grid::grid.newpage()
  grid::pushViewport(grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout))))

  for (i in seq_len(num.plots)) {
    # Get the (row, col) matrix position of the region that contains this
    # subplot; which(arr.ind = TRUE) returns a matrix with "row"/"col" columns.
    pos <- which(layout == i, arr.ind = TRUE)

    print(
      plots[[i]],
      vp = grid::viewport(
        layout.pos.row = pos[, "row"],
        layout.pos.col = pos[, "col"]
      )
    )
  }

  invisible(NULL)
}
|
||||
|
||||
#' Parse the graph to extract vector of edges
#'
#' Summarise one root-to-leaf path by its final element and its length.
#'
#' @param element igraph object containing the path from the root to the leaf.
#' @return An unnamed list of two elements: the last identifier returned by
#'   \code{igraph::as_ids} for \code{element}, and the number of identifiers
#'   on the path.
edge.parser <- function(element) {
  path.ids <- igraph::as_ids(element)
  # tail() lives in utils; qualify it so the package does not depend on utils
  # being attached. Locals are named so they do not shadow base::t.
  last.id <- utils::tail(path.ids, n = 1)
  path.length <- length(path.ids)
  list(last.id, path.length)
}
|
||||
|
||||
#' Extract path from root to leaf from data.table
#'
#' For every tree, build a graph from the (parent, child) pairs of its
#' non-leaf nodes and compute the shortest path from the tree root
#' (\code{"<Tree>-0"}) to each of its leaves.
#'
#' @param dt.tree data.table containing the nodes and edges of the trees. The
#'   columns \code{ID}, \code{Yes}, \code{No}, \code{Tree} and \code{Feature}
#'   are used; leaves are the rows where \code{Feature == "Leaf"}.
#' @return A list with one element per leaf (over all trees): the
#'   \code{vpath} entries returned by \code{igraph::shortest_paths}.
get.paths.to.leaf <- function(dt.tree) {
  split.nodes <- dt.tree[Feature != "Leaf"]

  # One edge per (node, Yes child) and per (node, No child). Columns are bound
  # by position, so the child column keeps the name "Yes" for both kinds.
  dt.not.leaf.edges <- data.table::rbindlist(
    list(split.nodes[, .(ID, Yes, Tree)], split.nodes[, .(ID, No, Tree)]),
    use.names = FALSE
  )

  trees <- dt.tree[, unique(Tree)]

  # Collect the paths tree by tree, then flatten once at the end instead of
  # growing the result list on every iteration.
  paths.by.tree <- vector("list", length(trees))
  for (i in seq_along(trees)) {
    tree <- trees[i]
    tree.nodes <- dt.tree[Tree == tree]

    # The vertex table is given explicitly so that every node of the tree is in
    # the graph, even when the tree has no edge at all (root is a leaf).
    graph <- igraph::graph_from_data_frame(
      as.data.frame(dt.not.leaf.edges[Tree == tree]),
      vertices = as.data.frame(tree.nodes[, .(ID)])
    )

    paths.tmp <- igraph::shortest_paths(
      graph,
      from = paste0(tree, "-0"),
      to = tree.nodes[Feature == "Leaf", ID]
    )
    paths.by.tree[[i]] <- paths.tmp$vpath
  }

  # Leading empty list keeps the result a list() when there is no tree.
  do.call(c, c(list(list()), paths.by.tree))
}
|
||||
|
||||
#' Plot model trees deepness
#'
#' Generate a graph to plot the distribution of deepness among trees.
#'
#' @importFrom data.table data.table
#' @importFrom data.table rbindlist
#' @importFrom data.table setnames
#' @importFrom data.table :=
#' @importFrom magrittr %>%
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}).
#' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.
#'
#' @return Called for its side effect: two graphs showing the distribution of
#'   the model deepness are drawn on the current graphics device.
#'
#' @details
#' Display both the number of \code{leaf} and the distribution of \code{weighted observations}
#' by tree deepness level.
#' The purpose of this function is to help the user to find the best trade-off to set
#' the \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.
#'
#' See \link{xgb.train} for more information about these parameters.
#'
#' The graph is made of two parts:
#'
#' \itemize{
#'  \item Count: number of leaves per level of deepness;
#'  \item Weighted cover: normalized weighted cover per Leaf (weighted number of instances).
#' }
#'
#' At least one of \code{filename_dump} and \code{model} has to be provided;
#' when both are given, \code{model} is used.
#'
#' This function is heavily inspired by this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15,
#'                  eta = 1, nthread = 2, nround = 30, objective = "binary:logistic",
#'                  min_child_weight = 50)
#'
#' xgb.plot.deepness(model = bst)
#'
#' @export
xgb.plot.deepness <- function(filename_dump = NULL, model = NULL) {
  # Validate the arguments first, so that a wrong call is reported as such
  # regardless of which optional packages are installed.
  # inherits() is used because class() may return several classes.
  if (!is.null(model) && !inherits(model, "xgb.Booster")) {
    stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
  }

  if (!is.null(filename_dump) &&
      !(is.character(filename_dump) && length(filename_dump) == 1)) {
    stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
  }

  if (!is.null(filename_dump) && !file.exists(filename_dump)) {
    stop("filename_dump: path to the model doesn't exist.")
  }

  if (is.null(filename_dump) && is.null(model)) {
    stop("filename_dump & model: no path to dump model, no model, have been provided.")
  }

  # Optional dependencies, only needed for this plot.
  for (pkg in c("ggplot2", "igraph", "grid")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      stop(pkg, " package is required for plotting the graph deepness.",
           call. = FALSE)
    }
  }

  # The in-memory model takes precedence over the dump file.
  if (!is.null(model)) {
    dt.tree <- xgb.model.dt.tree(model = model)
  } else {
    dt.tree <- xgb.model.dt.tree(filename_dump = filename_dump)
  }

  paths <- get.paths.to.leaf(dt.tree)

  # One row per root-to-leaf path: the leaf id ("last.edge") and the path
  # length ("size"), joined with the leaf's row of dt.tree.
  dt.edge.elements <-
    lapply(paths, edge.parser) %>%
    rbindlist %>%
    setnames(c("last.edge", "size")) %>%
    merge(dt.tree, by.x = "last.edge", by.y = "ID")

  # Per path length: number of leaves and share of the total cover.
  dt.edge.summary <-
    dt.edge.elements[, .(.N, Cover = sum(Cover)), size][, Cover := Cover / sum(Cover)]

  p1 <-
    ggplot2::ggplot(dt.edge.summary) +
    ggplot2::geom_line(ggplot2::aes(x = size, y = N, group = 1)) +
    ggplot2::xlab("") +
    ggplot2::ylab("Count") +
    ggplot2::ggtitle("Model complexity") +
    ggplot2::theme(
      plot.title = ggplot2::element_text(lineheight = 0.9, face = "bold"),
      panel.grid.major.y = ggplot2::element_blank(),
      axis.ticks = ggplot2::element_blank(),
      axis.text.x = ggplot2::element_blank()
    )

  p2 <-
    ggplot2::ggplot(dt.edge.summary) +
    ggplot2::geom_line(ggplot2::aes(x = size, y = Cover, group = 1)) +
    ggplot2::xlab("From root to leaf path length") +
    ggplot2::ylab("Weighted cover")

  multiplot(p1, p2, cols = 1)
}
|
||||
|
||||
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared:
# they are mainly column names resolved by data.table (or ggplot2::aes)
# through non-standard evaluation.
utils::globalVariables(
  c(
    "Feature", "Count", "ggplot", "aes", "geom_bar", "xlab", "ylab", "ggtitle",
    "theme", "element_blank", "element_text", "ID", "Yes", "No", "Tree",
    # Also used unquoted in this file: the data.table `.()` alias and the
    # columns of the per-depth summary table.
    ".", "size", "N", "Cover"
  )
)
|
||||
@ -29,7 +29,8 @@
|
||||
#' xgb.plot.importance(importance_matrix)
|
||||
#'
|
||||
#' @export
|
||||
xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1:10)){
|
||||
xgb.plot.importance <-
|
||||
function(importance_matrix = NULL, numberOfClusters = c(1:10)) {
|
||||
if (!"data.table" %in% class(importance_matrix)) {
|
||||
stop("importance_matrix: Should be a data.table.")
|
||||
}
|
||||
@ -41,12 +42,22 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1
|
||||
}
|
||||
|
||||
# To avoid issues in clustering when co-occurences are used
|
||||
importance_matrix <- importance_matrix[, .(Gain = sum(Gain)), by = Feature]
|
||||
importance_matrix <-
|
||||
importance_matrix[, .(Gain = sum(Gain)), by = Feature]
|
||||
|
||||
clusters <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters))
|
||||
clusters <-
|
||||
suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters))
|
||||
importance_matrix[,"Cluster":= clusters$cluster %>% as.character]
|
||||
|
||||
plot <- ggplot2::ggplot(importance_matrix, ggplot2::aes(x=stats::reorder(Feature, Gain), y = Gain, width = 0.05), environment = environment()) + ggplot2::geom_bar(ggplot2::aes(fill=Cluster), stat="identity", position="identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme(plot.title = ggplot2::element_text(lineheight=.9, face="bold"), panel.grid.major.y = ggplot2::element_blank() )
|
||||
plot <-
|
||||
ggplot2::ggplot(
|
||||
importance_matrix, ggplot2::aes(
|
||||
x = stats::reorder(Feature, Gain), y = Gain, width = 0.05
|
||||
), environment = environment()
|
||||
) + ggplot2::geom_bar(ggplot2::aes(fill = Cluster), stat = "identity", position =
|
||||
"identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme(
|
||||
plot.title = ggplot2::element_text(lineheight = .9, face = "bold"), panel.grid.major.y = ggplot2::element_blank()
|
||||
)
|
||||
|
||||
return(plot)
|
||||
}
|
||||
@ -54,4 +65,8 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1
|
||||
# Avoid error messages during CRAN check.
|
||||
# The reason is that these variables are never declared
|
||||
# They are mainly column names inferred by Data.table...
|
||||
globalVariables(c("Feature", "Gain", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text"))
|
||||
globalVariables(
|
||||
c(
|
||||
"Feature", "Gain", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text"
|
||||
)
|
||||
)
|
||||
|
||||
64
R-package/demo/xgb.plot.multi.tree.R
Normal file
64
R-package/demo/xgb.plot.multi.tree.R
Normal file
@ -0,0 +1,64 @@
|
||||
library(stringr)
|
||||
library(data.table)
|
||||
library(xgboost)
|
||||
|
||||
|
||||
data(agaricus.train, package='xgboost')
|
||||
|
||||
# The dataset is a list with two items, a sparse matrix and labels
|
||||
#(labels = outcome column which will be learned).
|
||||
#Each column of the sparse Matrix is a feature in one hot encoding format.
|
||||
train <- agaricus.train
|
||||
|
||||
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
|
||||
eta = 1, nthread = 2, nround = 4, objective = "binary:logistic")
|
||||
|
||||
# agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
|
||||
tree.matrix <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst)
|
||||
|
||||
|
||||
# first number of the path represents the tree, then the following numbers are related to the path to follow
|
||||
|
||||
# root init
|
||||
root.nodes <- tree.matrix[str_detect(ID, "\\d+-0"), ID]
|
||||
tree.matrix[ID == root.nodes, Abs.Position:=root.nodes]
|
||||
|
||||
precedent.nodes <- root.nodes
|
||||
|
||||
while(tree.matrix[,sum(is.na(Abs.Position))] > 0) {
|
||||
yes.row.nodes <- tree.matrix[Abs.Position %in% precedent.nodes & !is.na(Yes)]
|
||||
no.row.nodes <- tree.matrix[Abs.Position %in% precedent.nodes & !is.na(No)]
|
||||
yes.nodes.abs.pos <- yes.row.nodes[, Abs.Position] %>% paste0("_0")
|
||||
no.nodes.abs.pos <- no.row.nodes[, Abs.Position] %>% paste0("_1")
|
||||
|
||||
tree.matrix[ID == yes.row.nodes[, Yes], Abs.Position := yes.nodes.abs.pos]
|
||||
tree.matrix[ID == no.row.nodes[, No], Abs.Position := no.nodes.abs.pos]
|
||||
precedent.nodes <- c(yes.nodes.abs.pos, no.nodes.abs.pos)
|
||||
}
|
||||
|
||||
tree.matrix[!is.na(Yes),Yes:= paste0(Abs.Position, "_0")]
|
||||
tree.matrix[!is.na(No),No:= paste0(Abs.Position, "_1")]
|
||||
tree.matrix[,ID:= Abs.Position]
|
||||
|
||||
tree.matrix[,Abs.Position:=substr(Abs.Position, nchar(Tree)+2, nchar(Abs.Position))]
|
||||
keepN <- 3
|
||||
tree.matrix <- tree.matrix[,sum(Quality),by = .(Abs.Position, Feature)][order(-V1)][,.(paste0(Feature[1:min(length(Feature), keepN)], " (", V1[1:min(length(V1), keepN)], ")") %>% paste0(collapse = "\n")), by=Abs.Position]
|
||||
|
||||
tree.matrix[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "<br/>Cover: ", Cover, "<br/>Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")]
|
||||
|
||||
tree.matrix[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")]
|
||||
|
||||
tree.matrix[, Yes:= Abs.Position %>% paste0("_0")][, No:= Abs.Position %>% paste0("_1")]
|
||||
|
||||
CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px\nclassDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px"
|
||||
|
||||
|
||||
yes <- tree.matrix[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "")
|
||||
|
||||
no <- tree.matrix[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "")
|
||||
|
||||
path <- tree.matrix[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = "\n") %>% paste("graph LR", .,collapse = "", sep = "\n") %>% paste(CSSstyle, yes, no, sep = "\n")
|
||||
DiagrammeR::mermaid(path)
|
||||
|
||||
# path <- "graph LR;0-0-0(spore-print-color=green)-->|>= 2.00001|0-0-0-1>Leaf"
|
||||
# setnames(tree.matrix, old = c("ID", "Yes", "No"), c("nodes", "edge_from", "edge_to"))
|
||||
15
R-package/man/edge.parser.Rd
Normal file
15
R-package/man/edge.parser.Rd
Normal file
@ -0,0 +1,15 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/xgb.plot.deepness.R
|
||||
\name{edge.parser}
|
||||
\alias{edge.parser}
|
||||
\title{Parse the graph to extract vector of edges}
|
||||
\usage{
|
||||
edge.parser(element)
|
||||
}
|
||||
\arguments{
|
||||
\item{element}{igraph object containing the path from the root to the leaf.}
|
||||
}
|
||||
\description{
|
||||
Parse the graph to extract vector of edges
|
||||
}
|
||||
|
||||
15
R-package/man/get.paths.to.leaf.Rd
Normal file
15
R-package/man/get.paths.to.leaf.Rd
Normal file
@ -0,0 +1,15 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/xgb.plot.deepness.R
|
||||
\name{get.paths.to.leaf}
|
||||
\alias{get.paths.to.leaf}
|
||||
\title{Extract path from root to leaf from data.table}
|
||||
\usage{
|
||||
get.paths.to.leaf(dt.tree)
|
||||
}
|
||||
\arguments{
|
||||
\item{dt.tree}{data.table containing the nodes and edges of the trees}
|
||||
}
|
||||
\description{
|
||||
Extract path from root to leaf from data.table
|
||||
}
|
||||
|
||||
15
R-package/man/multiplot.Rd
Normal file
15
R-package/man/multiplot.Rd
Normal file
@ -0,0 +1,15 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/xgb.plot.deepness.R
|
||||
\name{multiplot}
|
||||
\alias{multiplot}
|
||||
\title{Plot multiple graphs at the same time}
|
||||
\usage{
|
||||
multiplot(..., cols = 1)
|
||||
}
|
||||
\arguments{
|
||||
\item{cols}{number of columns}
|
||||
}
|
||||
\description{
|
||||
Plot multiple graph aligned by rows and columns.
|
||||
}
|
||||
|
||||
47
R-package/man/xgb.plot.deepness.Rd
Normal file
47
R-package/man/xgb.plot.deepness.Rd
Normal file
@ -0,0 +1,47 @@
|
||||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/xgb.plot.deepness.R
|
||||
\name{xgb.plot.deepness}
|
||||
\alias{xgb.plot.deepness}
|
||||
\title{Plot model trees deepness}
|
||||
\usage{
|
||||
xgb.plot.deepness(filename_dump = NULL, model = NULL)
|
||||
}
|
||||
\arguments{
|
||||
\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
|
||||
|
||||
\item{model}{dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.}
|
||||
}
|
||||
\value{
|
||||
Two graphs showing the distribution of the model deepness.
|
||||
}
|
||||
\description{
|
||||
Generate a graph to plot the distribution of deepness among trees.
|
||||
}
|
||||
\details{
|
||||
Display both the number of \code{leaf} and the distribution of \code{weighted observations}
|
||||
by tree deepness level.
|
||||
The purpose of this function is to help the user to find the best trade-off to set
|
||||
the \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.
|
||||
|
||||
See \link{xgb.train} for more information about these parameters.
|
||||
|
||||
The graph is made of two parts:
|
||||
|
||||
\itemize{
|
||||
\item Count: number of leaves per level of deepness;
|
||||
\item Weighted cover: normalized weighted cover per Leaf (weighted number of instances).
|
||||
}
|
||||
|
||||
This function is heavily inspired by this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
|
||||
}
|
||||
\examples{
|
||||
data(agaricus.train, package='xgboost')
|
||||
|
||||
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15,
|
||||
eta = 1, nthread = 2, nround = 30, objective = "binary:logistic",
|
||||
min_child_weight = 50)
|
||||
|
||||
xgb.plot.deepness(model = bst)
|
||||
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user