Plot model deepness

New function to explore the model by plotting the way splits are done.
This commit is contained in:
pommedeterresautee 2015-11-24 11:45:32 +01:00
parent fe7cdcefb4
commit d9fe9c5d8a
12 changed files with 383 additions and 39 deletions

View File

@ -33,4 +33,4 @@ Imports:
data.table (>= 1.9.6),
magrittr (>= 1.5),
stringr (>= 0.6.2)
RoxygenNote: 5.0.0
RoxygenNote: 5.0.1

View File

@ -10,6 +10,7 @@ export(xgb.dump)
export(xgb.importance)
export(xgb.load)
export(xgb.model.dt.tree)
export(xgb.plot.deepness)
export(xgb.plot.importance)
export(xgb.plot.tree)
export(xgb.save)

View File

@ -0,0 +1,172 @@
#' Plot multiple graphs at the same time
#'
#' Plot multiple graph aligned by rows and columns.
#'
#' @importFrom data.table data.table
#' @param ... plot objects to draw; each one is displayed through \code{print}.
#' @param cols number of columns
#' @return Called for its side effect of drawing the plots. \code{NULL}
#'   (invisibly) when no plot or several plots are given.
multiplot <- function(..., cols = 1) {
  plots <- list(...)
  num.plots <- length(plots)

  # Nothing to draw: return early instead of indexing into an empty list.
  if (num.plots == 0) {
    return(invisible(NULL))
  }

  # Cell k of the layout matrix hosts the k-th plot (cells are numbered
  # column by column, as matrix() fills its data).
  n.rows <- ceiling(num.plots / cols)
  layout <- matrix(seq_len(cols * n.rows), ncol = cols, nrow = n.rows)

  if (num.plots == 1) {
    print(plots[[1]])
  } else {
    grid::grid.newpage()
    grid::pushViewport(
      grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout)))
    )
    for (i in seq_len(num.plots)) {
      # Row / column of the layout cell that contains this subplot.
      # which(arr.ind = TRUE) returns a matrix with "row" and "col" columns.
      cell <- which(layout == i, arr.ind = TRUE)
      print(
        plots[[i]],
        vp = grid::viewport(
          layout.pos.row = cell[, "row"],
          layout.pos.col = cell[, "col"]
        )
      )
    }
  }
}
#' Parse the graph to extract vector of edges
#' @param element igraph object containing the path from the root to the leaf.
#' @return A two-element unnamed list: the last identifier of the path, and the
#'   number of identifiers the path contains.
edge.parser <- function(element) {
  path.ids <- igraph::as_ids(element)
  list(tail(path.ids, n = 1), length(path.ids))
}
#' Extract path from root to leaf from data.table
#' @param dt.tree data.table containing the nodes and edges of the trees
#' @return A list with one element per leaf: the vertex path returned by
#'   \code{igraph::shortest_paths} from the node \code{"<tree>-0"} to that leaf.
get.paths.to.leaf <- function(dt.tree) {
  # Every non-leaf node contributes two edges: ID -> Yes and ID -> No.
  # Columns are bound by position, so the "No" targets land in the "Yes" column.
  internal.nodes <- dt.tree[Feature != "Leaf"]
  dt.not.leaf.edges <- rbindlist(
    list(
      internal.nodes[, .(ID, Yes, Tree)],
      internal.nodes[, .(ID, No, Tree)]
    ),
    use.names = FALSE
  )

  trees <- dt.tree[, unique(Tree)]

  # One slot per tree, filled in place rather than growing a list with c().
  paths.per.tree <- vector("list", length(trees))
  for (i in seq_along(trees)) {
    tree <- trees[i]
    graph <- igraph::graph_from_data_frame(dt.not.leaf.edges[Tree == tree])
    leaves <- dt.tree[Tree == tree & Feature == "Leaf", c(ID)]
    paths.per.tree[[i]] <- igraph::shortest_paths(
      graph,
      from = paste0(tree, "-0"),
      to = leaves
    )$vpath
  }

  # Concatenate the per-tree path lists; the leading empty list keeps the
  # result a list even when there is no tree at all.
  do.call(c, c(list(list()), paths.per.tree))
}
#' Plot model trees deepness
#'
#' Generate a graph to plot the distribution of deepness among trees.
#'
#' @importFrom data.table data.table
#' @importFrom data.table rbindlist
#' @importFrom data.table setnames
#' @importFrom data.table :=
#' @importFrom magrittr %>%
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}).
#' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.
#'
#' @return Two graphs showing the distribution of the model deepness.
#'
#' @details
#' Display both the number of \code{leaf} and the distribution of \code{weighted observations}
#' by tree deepness level.
#' The purpose of this function is to help the user to find the best trade-off to set
#' the \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.
#'
#' See \link{xgb.train} for more information about these parameters.
#'
#' The graph is made of two parts:
#'
#' \itemize{
#'  \item Count: number of leaves per level of deepness;
#'  \item Weighted cover: normalized weighted cover per Leaf (weighted number of instances).
#' }
#'
#' At least one of \code{filename_dump} and \code{model} must be provided; when
#' both are given, \code{model} is used.
#'
#' This function is very inspired from this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15,
#'                  eta = 1, nthread = 2, nround = 30, objective = "binary:logistic",
#'                  min_child_weight = 50)
#'
#' xgb.plot.deepness(model = bst)
#'
#' @export
xgb.plot.deepness <- function(filename_dump = NULL, model = NULL) {
  # Validate the arguments first, so that a wrong call is reported as such.
  if (!(is.null(model) || inherits(model, "xgb.Booster"))) {
    stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
  }

  if (!(is.null(filename_dump) ||
        (is.character(filename_dump) && length(filename_dump) == 1))) {
    stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
  } else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
    stop("filename_dump: path to the model doesn't exist.")
  } else if (is.null(filename_dump) && is.null(model)) {
    # This function has no `text` argument: only these two inputs exist.
    stop("filename_dump & model: no path to dump model, no model, have been provided.")
  }

  # Optional plotting dependencies, checked in the same order as before.
  for (pkg in c("ggplot2", "igraph", "grid")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      stop(pkg, " package is required for plotting the graph deepness.",
           call. = FALSE)
    }
  }

  # The model takes precedence over the dump file when both are supplied.
  if (!is.null(model)) {
    dt.tree <- xgb.model.dt.tree(model = model)
  } else {
    dt.tree <- xgb.model.dt.tree(filename_dump = filename_dump)
  }

  # One row per root-to-leaf path: the leaf ID ("last.edge") and the number of
  # nodes on the path ("size"), joined with the leaf's row of dt.tree.
  paths <- get.paths.to.leaf(dt.tree)
  dt.edge.elements <-
    lapply(paths, edge.parser) %>%
    rbindlist %>%
    setnames(c("last.edge", "size")) %>%
    merge(dt.tree, by.x = "last.edge", by.y = "ID")

  # Per path length: number of leaves and their share of the total cover.
  dt.edge.summary <-
    dt.edge.elements[, .(.N, Cover = sum(Cover)), size][, Cover := Cover / sum(Cover)]

  p1 <-
    ggplot2::ggplot(dt.edge.summary) +
    ggplot2::geom_line(ggplot2::aes(x = size, y = N, group = 1)) +
    ggplot2::xlab("") +
    ggplot2::ylab("Count") +
    ggplot2::ggtitle("Model complexity") +
    ggplot2::theme(
      plot.title = ggplot2::element_text(lineheight = 0.9, face = "bold"),
      panel.grid.major.y = ggplot2::element_blank(),
      axis.ticks = ggplot2::element_blank(),
      axis.text.x = ggplot2::element_blank()
    )

  p2 <-
    ggplot2::ggplot(dt.edge.summary) +
    ggplot2::geom_line(ggplot2::aes(x = size, y = Cover, group = 1)) +
    ggplot2::xlab("From root to leaf path length") +
    ggplot2::ylab("Weighted cover")

  multiplot(p1, p2, cols = 1)
}
# Declare the names below as known globals so that the CRAN check does not
# report them as undefined: they are never assigned as variables, most of them
# being column names that data.table resolves on its own.
globalVariables(c(
  "Feature", "Count", "ggplot", "aes", "geom_bar", "xlab", "ylab", "ggtitle",
  "theme", "element_blank", "element_text", "ID", "Yes", "No", "Tree"
))

View File

@ -29,7 +29,8 @@
#' xgb.plot.importance(importance_matrix)
#'
#' @export
xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1:10)){
xgb.plot.importance <-
function(importance_matrix = NULL, numberOfClusters = c(1:10)) {
if (!"data.table" %in% class(importance_matrix)) {
stop("importance_matrix: Should be a data.table.")
}
@ -41,12 +42,22 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1
}
# To avoid issues in clustering when co-occurences are used
importance_matrix <- importance_matrix[, .(Gain = sum(Gain)), by = Feature]
importance_matrix <-
importance_matrix[, .(Gain = sum(Gain)), by = Feature]
clusters <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters))
clusters <-
suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters))
importance_matrix[,"Cluster":= clusters$cluster %>% as.character]
plot <- ggplot2::ggplot(importance_matrix, ggplot2::aes(x=stats::reorder(Feature, Gain), y = Gain, width = 0.05), environment = environment()) + ggplot2::geom_bar(ggplot2::aes(fill=Cluster), stat="identity", position="identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme(plot.title = ggplot2::element_text(lineheight=.9, face="bold"), panel.grid.major.y = ggplot2::element_blank() )
plot <-
ggplot2::ggplot(
importance_matrix, ggplot2::aes(
x = stats::reorder(Feature, Gain), y = Gain, width = 0.05
), environment = environment()
) + ggplot2::geom_bar(ggplot2::aes(fill = Cluster), stat = "identity", position =
"identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme(
plot.title = ggplot2::element_text(lineheight = .9, face = "bold"), panel.grid.major.y = ggplot2::element_blank()
)
return(plot)
}
@ -54,4 +65,8 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(c("Feature", "Gain", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text"))
globalVariables(
c(
"Feature", "Gain", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text"
)
)

View File

@ -0,0 +1,64 @@
library(stringr)
library(data.table)
library(xgboost)
# Exploratory script: train a small model, give every tree node a position
# string built from its root, then try to render the result with mermaid.
# NOTE(review): this looks like work in progress — see the notes further down.
data(agaricus.train, package='xgboost')
# The dataset is a list with two items, a sparse matrix and labels
# (labels = outcome column which will be learned).
# Each column of the sparse matrix is a feature in one hot encoding format.
train <- agaricus.train
# Small model: 4 boosting rounds, trees of depth at most 2.
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 4, objective = "binary:logistic")
# agaricus.train$data@Dimnames[[2]] holds the column names of the sparse matrix.
tree.matrix <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst)
# Abs.Position encodes a node as its root ID followed by the path to follow:
# "_0" is appended for a Yes child and "_1" for a No child.
# Root init: IDs matching "<digits>-0" are taken as roots and are their own position.
# NOTE(review): `ID == root.nodes` compares against a vector and recycles it;
# `%in%` may be what is intended — confirm.
root.nodes <- tree.matrix[str_detect(ID, "\\d+-0"), ID]
tree.matrix[ID == root.nodes, Abs.Position:=root.nodes]
precedent.nodes <- root.nodes
# Walk down level by level until every node has an Abs.Position.
while(tree.matrix[,sum(is.na(Abs.Position))] > 0) {
# Rows positioned in the previous pass that have a Yes (resp. No) child.
yes.row.nodes <- tree.matrix[Abs.Position %in% precedent.nodes & !is.na(Yes)]
no.row.nodes <- tree.matrix[Abs.Position %in% precedent.nodes & !is.na(No)]
# Child position = parent position + "_0" (Yes) or "_1" (No).
yes.nodes.abs.pos <- yes.row.nodes[, Abs.Position] %>% paste0("_0")
no.nodes.abs.pos <- no.row.nodes[, Abs.Position] %>% paste0("_1")
# NOTE(review): same vector `==` comparison as above — confirm rows are matched as intended.
tree.matrix[ID == yes.row.nodes[, Yes], Abs.Position := yes.nodes.abs.pos]
tree.matrix[ID == no.row.nodes[, No], Abs.Position := no.nodes.abs.pos]
precedent.nodes <- c(yes.nodes.abs.pos, no.nodes.abs.pos)
}
# Re-express Yes / No / ID in terms of the position strings.
tree.matrix[!is.na(Yes),Yes:= paste0(Abs.Position, "_0")]
tree.matrix[!is.na(No),No:= paste0(Abs.Position, "_1")]
tree.matrix[,ID:= Abs.Position]
# Drop the first nchar(Tree) + 1 characters of the position (presumably the tree number and its separator).
tree.matrix[,Abs.Position:=substr(Abs.Position, nchar(Tree)+2, nchar(Abs.Position))]
# Per position: sum Quality by feature, sort descending, and keep at most
# keepN "feature (quality)" labels joined by newlines.
keepN <- 3
# NOTE(review): after this reassignment the table appears to hold only
# Abs.Position and V1, while the lines below read Feature, ID, Cover, Split,
# Yes.Feature, ... — confirm the script runs as written.
tree.matrix <- tree.matrix[,sum(Quality),by = .(Abs.Position, Feature)][order(-V1)][,.(paste0(Feature[1:min(length(Feature), keepN)], " (", V1[1:min(length(V1), keepN)], ")") %>% paste0(collapse = "\n")), by=Abs.Position]
# Mermaid edge definitions for the Yes ("<" split) and No (">=" split) branches of non-leaf nodes.
tree.matrix[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "<br/>Cover: ", Cover, "<br/>Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")]
tree.matrix[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")]
tree.matrix[, Yes:= Abs.Position %>% paste0("_0")][, No:= Abs.Position %>% paste0("_1")]
# Mermaid style classes: Yes targets are assigned greenNode, No targets redNode.
CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px\nclassDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px"
yes <- tree.matrix[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "")
no <- tree.matrix[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "")
# Full diagram source: "graph LR" header, sorted edge lines, then the style statements.
path <- tree.matrix[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = "\n") %>% paste("graph LR", .,collapse = "", sep = "\n") %>% paste(CSSstyle, yes, no, sep = "\n")
DiagrammeR::mermaid(path)
# Commented-out leftovers: a hand-written sample edge and a column renaming.
# path <- "graph LR;0-0-0(spore-print-color=green)-->|>= 2.00001|0-0-0-1>Leaf"
# setnames(tree.matrix, old = c("ID", "Yes", "No"), c("nodes", "edge_from", "edge_to"))

View File

@ -0,0 +1,15 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.deepness.R
\name{edge.parser}
\alias{edge.parser}
\title{Parse the graph to extract vector of edges}
\usage{
edge.parser(element)
}
\arguments{
\item{element}{igraph object containing the path from the root to the leaf.}
}
\description{
Parse the graph to extract vector of edges
}

View File

@ -0,0 +1,15 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.deepness.R
\name{get.paths.to.leaf}
\alias{get.paths.to.leaf}
\title{Extract path from root to leaf from data.table}
\usage{
get.paths.to.leaf(dt.tree)
}
\arguments{
\item{dt.tree}{data.table containing the nodes and edges of the trees}
}
\description{
Extract path from root to leaf from data.table
}

View File

@ -0,0 +1,15 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.deepness.R
\name{multiplot}
\alias{multiplot}
\title{Plot multiple graphs at the same time}
\usage{
multiplot(..., cols = 1)
}
\arguments{
\item{cols}{number of columns}
}
\description{
Plot multiple graph aligned by rows and columns.
}

View File

@ -0,0 +1,47 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.deepness.R
\name{xgb.plot.deepness}
\alias{xgb.plot.deepness}
\title{Plot model trees deepness}
\usage{
xgb.plot.deepness(filename_dump = NULL, model = NULL)
}
\arguments{
\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
\item{model}{dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.}
}
\value{
Two graphs showing the distribution of the model deepness.
}
\description{
Generate a graph to plot the distribution of deepness among trees.
}
\details{
Display both the number of \code{leaf} and the distribution of \code{weighted observations}
by tree deepness level.
The purpose of this function is to help the user to find the best trade-off to set
the \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.
See \link{xgb.train} for more information about these parameters.
The graph is made of two parts:
\itemize{
\item Count: number of leaves per level of deepness;
\item Weighted cover: normalized weighted cover per Leaf (weighted number of instances).
}
This function is very inspired from this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
}
\examples{
data(agaricus.train, package='xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15,
eta = 1, nthread = 2, nround = 30, objective = "binary:logistic",
min_child_weight = 50)
xgb.plot.deepness(model = bst)
}