Plot model deepness

New function to explore the model by plotting the way splits are done.
2015-11-24 11:45:32 +01:00 · 2015-11-24 11:45:32 +01:00 · d9fe9c5d8a
commit d9fe9c5d8a
parent fe7cdcefb4
12 changed files with 383 additions and 39 deletions
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@ -33,4 +33,4 @@ Imports:
    data.table (>= 1.9.6),
    magrittr (>= 1.5),
    stringr (>= 0.6.2)
-RoxygenNote: 5.0.0
+RoxygenNote: 5.0.1
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@ -10,6 +10,7 @@ export(xgb.dump)
 export(xgb.importance)
 export(xgb.load)
 export(xgb.model.dt.tree)
+export(xgb.plot.deepness)
 export(xgb.plot.importance)
 export(xgb.plot.tree)
 export(xgb.save)
--- a/R-package/R/xgb.plot.deepness.R
+++ b/R-package/R/xgb.plot.deepness.R
@ -0,0 +1,172 @@
+#' Plot multiple graphs at the same time
+#'
+#' Plot multiple graphs aligned by rows and columns on a single page.
+#'
+#' @importFrom data.table data.table
+#' @param ... plot objects to draw; each one is rendered with \code{print}.
+#' @param cols number of columns
+#' @return Used for its drawing side effect. Nothing is drawn when no plot is supplied.
+multiplot <- function(..., cols = 1) {
+  plots <- list(...)
+  numPlots <- length(plots)
+
+  # Nothing to draw: the layout below cannot be built for zero plots.
+  if (numPlots == 0) {
+    return(invisible(NULL))
+  }
+
+  numRows <- ceiling(numPlots / cols)
+  # Cell k of the layout hosts plot k (cells are numbered column by column).
+  layout <- matrix(seq_len(cols * numRows), ncol = cols, nrow = numRows)
+
+  if (numPlots == 1) {
+    print(plots[[1]])
+  } else {
+    grid::grid.newpage()
+    grid::pushViewport(grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout))))
+    for (i in seq_len(numPlots)) {
+      # Get the i,j matrix positions of the region that contains this subplot.
+      # which(arr.ind = TRUE) returns a matrix with "row" and "col" columns,
+      # so no conversion to data.table is needed.
+      matchidx <- which(layout == i, arr.ind = TRUE)
+
+      print(
+        plots[[i]], vp = grid::viewport(
+          layout.pos.row = unname(matchidx[, "row"]),
+          layout.pos.col = unname(matchidx[, "col"])
+        )
+      )
+    }
+  }
+}
+
+#' Summarise a root-to-leaf path
+#'
+#' @param element igraph object containing the path from the root to the leaf.
+#' @return An unnamed list of two items: the last id of the path and the number of ids in the path.
+edge.parser <- function(element) {
+  path.ids <- igraph::as_ids(element)
+  list(tail(path.ids, n = 1), length(path.ids))
+}
+
+#' Extract path from root to leaf from data.table
+#'
+#' For every tree, the non-leaf rows are turned into a graph (one edge towards
+#' the \code{Yes} child and one towards the \code{No} child) and the shortest
+#' path from the root node (\code{"<tree>-0"}) to each leaf is computed.
+#'
+#' @param dt.tree data.table containing the nodes and edges of the trees
+#' @return A list of igraph vertex paths, one per leaf, for all trees.
+get.paths.to.leaf <- function(dt.tree) {
+  # Stack the (ID -> Yes) and (ID -> No) edges; columns are matched by position.
+  dt.not.leaf.edges <-
+    dt.tree[Feature != "Leaf",.(ID, Yes, Tree)] %>% list(dt.tree[Feature != "Leaf",.(ID, No, Tree)]) %>% rbindlist(use.names = FALSE)
+
+  trees <- dt.tree[,unique(Tree)]
+
+  # Collect the paths tree by tree in a preallocated list instead of growing
+  # the result with c() on every iteration.
+  paths.by.tree <- vector("list", length(trees))
+  tree.idx <- 0L
+  for (tree in trees) {
+    tree.idx <- tree.idx + 1L
+    graph <-
+      igraph::graph_from_data_frame(dt.not.leaf.edges[Tree == tree])
+    paths.tmp <-
+      igraph::shortest_paths(graph, from = paste0(tree, "-0"), to = dt.tree[Tree == tree &
+                                                                              Feature == "Leaf", c(ID)])
+    paths.by.tree[[tree.idx]] <- paths.tmp$vpath
+  }
+
+  # Flatten one level; the leading empty list keeps the result a list even
+  # when there is no tree at all.
+  do.call(c, c(list(list()), paths.by.tree))
+}
+
+#' Plot model trees deepness
+#'
+#' Generate a graph to plot the distribution of deepness among trees.
+#'
+#' @importFrom data.table data.table
+#' @importFrom data.table rbindlist
+#' @importFrom data.table setnames
+#' @importFrom data.table :=
+#' @importFrom magrittr %>%
+#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}). Ignored when \code{model} is provided.
+#' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.
+#'
+#' @return Two graphs showing the distribution of the model deepness.
+#'
+#' @details
+#' Display both the number of \code{leaf} and the distribution of \code{weighted observations}
+#' by tree deepness level.
+#' The purpose of this function is to help the user to find the best trade-off to set
+#' the \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.
+#'
+#' See \link{xgb.train} for more information about these parameters.
+#'
+#' The graph is made of two parts:
+#'
+#' \itemize{
+#'  \item Count: number of leaves per level of deepness;
+#'  \item Weighted cover: normalized weighted cover per Leaf (weighted number of instances).
+#' }
+#'
+#' At least one of \code{filename_dump} and \code{model} has to be provided.
+#'
+#' This function is very inspired from this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
+#'
+#' @examples
+#' data(agaricus.train, package='xgboost')
+#'
+#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15,
+#'                  eta = 1, nthread = 2, nround = 30, objective = "binary:logistic",
+#'                  min_child_weight = 50)
+#'
+#' xgb.plot.deepness(model = bst)
+#'
+#' @export
+xgb.plot.deepness <- function(filename_dump = NULL, model = NULL) {
+  # The plotting dependencies are optional: fail early with a clear message.
+  for (pkg in c("ggplot2", "igraph", "grid")) {
+    if (!requireNamespace(pkg, quietly = TRUE)) {
+      stop(pkg, " package is required for plotting the graph deepness.",
+           call. = FALSE)
+    }
+  }
+
+  # inherits() also copes with objects carrying more than one class.
+  if (!(is.null(model) || inherits(model, "xgb.Booster"))) {
+    stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
+  }
+
+  if (!(is.null(filename_dump) ||
+        (is.character(filename_dump) && length(filename_dump) == 1))) {
+    stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
+  } else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
+    stop("filename_dump: path to the model doesn't exist.")
+  } else if (is.null(filename_dump) && is.null(model)) {
+    # The former test also checked a non-existent `text` argument, which
+    # resolved to a function and therefore made this branch unreachable.
+    stop("filename_dump & model: no path to dump model, no model, have been provided.")
+  }
+
+  # The model takes precedence over the dump file when both are given.
+  if (!is.null(model)) {
+    dt.tree <- xgb.model.dt.tree(model = model)
+  } else {
+    dt.tree <- xgb.model.dt.tree(filename_dump = filename_dump)
+  }
+
+  paths <- get.paths.to.leaf(dt.tree)
+
+  # One row per leaf: the leaf id, the length of its root-to-leaf path and
+  # the leaf's columns from dt.tree (joined on the leaf id).
+  dt.edge.elements <-
+    lapply(paths, edge.parser) %>% rbindlist %>% setnames(c("last.edge", "size")) %>%
+    merge(dt.tree, by.x = "last.edge", by.y = "ID")
+
+  # Per path length: number of leaves and share of the total cover.
+  dt.edge.summary <-
+    dt.edge.elements[, .(.N, Cover = sum(Cover)), size][,Cover:= Cover / sum(Cover)]
+
+  p1 <-
+    ggplot2::ggplot(dt.edge.summary) + ggplot2::geom_line(ggplot2::aes(x = size, y = N, group = 1)) +
+    ggplot2::xlab("") + ggplot2::ylab("Count") + ggplot2::ggtitle("Model complexity") +
+    ggplot2::theme(
+      plot.title = ggplot2::element_text(lineheight = 0.9, face = "bold"),
+      panel.grid.major.y = ggplot2::element_blank(),
+      axis.ticks = ggplot2::element_blank(),
+      axis.text.x = ggplot2::element_blank()
+    )
+
+  p2 <-
+    ggplot2::ggplot(dt.edge.summary) + ggplot2::geom_line(ggplot2::aes(x = size, y = Cover, group = 1)) +
+    ggplot2::xlab("From root to leaf path length") + ggplot2::ylab("Weighted cover")
+
+  multiplot(p1,p2,cols = 1)
+}
+
+# Avoid error messages during CRAN check.
+# The reason is that these variables are never declared
+# They are mainly column names inferred by Data.table...
+# "Cover", "size" and "N" are used in the data.table / ggplot2 expressions of
+# xgb.plot.deepness.
+globalVariables(
+  c(
+    "Feature", "Count", "ggplot", "aes", "geom_bar", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text", "ID", "Yes", "No", "Tree",
+    "Cover", "size", "N"
+  )
+)
--- a/R-package/R/xgb.plot.importance.R
+++ b/R-package/R/xgb.plot.importance.R
@ -29,7 +29,8 @@
 #' xgb.plot.importance(importance_matrix)
 #'
 #' @export
-xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1:10)){
+xgb.plot.importance <-
+  function(importance_matrix = NULL, numberOfClusters = c(1:10)) {
    if (!"data.table" %in% class(importance_matrix))  {
      stop("importance_matrix: Should be a data.table.")
    }
@ -41,12 +42,22 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1
    }
    
    # To avoid issues in clustering when co-occurences are used
-  importance_matrix <- importance_matrix[, .(Gain = sum(Gain)), by = Feature]
+    importance_matrix <-
+      importance_matrix[, .(Gain = sum(Gain)), by = Feature]
    
-  clusters <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters))
+    clusters <-
+      suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters))
    importance_matrix[,"Cluster":= clusters$cluster %>% as.character]
    
-  plot <- ggplot2::ggplot(importance_matrix, ggplot2::aes(x=stats::reorder(Feature, Gain), y = Gain, width = 0.05), environment = environment()) + ggplot2::geom_bar(ggplot2::aes(fill=Cluster), stat="identity", position="identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme(plot.title = ggplot2::element_text(lineheight=.9, face="bold"), panel.grid.major.y = ggplot2::element_blank() )
+    plot <-
+      ggplot2::ggplot(
+        importance_matrix, ggplot2::aes(
+          x = stats::reorder(Feature, Gain), y = Gain, width = 0.05
+        ), environment = environment()
+      ) + ggplot2::geom_bar(ggplot2::aes(fill = Cluster), stat = "identity", position =
+                              "identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme(
+                                plot.title = ggplot2::element_text(lineheight = .9, face = "bold"), panel.grid.major.y = ggplot2::element_blank()
+                              )
    
    return(plot)
  }
@ -54,4 +65,8 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1
 # Avoid error messages during CRAN check.
 # The reason is that these variables are never declared
 # They are mainly column names inferred by Data.table...
-globalVariables(c("Feature", "Gain", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text"))
+globalVariables(
+  c(
+    "Feature", "Gain", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text"
+  )
+)
--- a/R-package/demo/xgb.plot.multi.tree.R
+++ b/R-package/demo/xgb.plot.multi.tree.R
@ -0,0 +1,64 @@
+library(stringr)
+library(data.table)
+library(xgboost)
+
+
+data(agaricus.train, package='xgboost')
+
+# agaricus.train is a list with two items: a sparse matrix (data) and labels
+# (labels = outcome column which will be learned).
+# Each column of the sparse matrix is a feature in one-hot encoding format.
+train <- agaricus.train
+
+bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
+               eta = 1, nthread = 2, nround = 4, objective = "binary:logistic")
+
+# agaricus.train$data@Dimnames[[2]] holds the column names of the sparse matrix.
+tree.matrix <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst)
+
+
+# Abs.Position encodes the path to a node: the root ID (<tree>-0) followed by "_0" for each Yes branch and "_1" for each No branch.
+
+# Initialise the root nodes: their Abs.Position is their own ID.
+root.nodes <- tree.matrix[str_detect(ID, "\\d+-0"), ID]
+tree.matrix[ID == root.nodes, Abs.Position:=root.nodes]
+
+precedent.nodes <- root.nodes
+# Walk down from the previously positioned nodes until every node has an Abs.Position.
+while(tree.matrix[,sum(is.na(Abs.Position))] > 0) {
+  yes.row.nodes <- tree.matrix[Abs.Position %in% precedent.nodes & !is.na(Yes)]
+  no.row.nodes <- tree.matrix[Abs.Position %in% precedent.nodes & !is.na(No)]
+  yes.nodes.abs.pos <- yes.row.nodes[, Abs.Position] %>% paste0("_0")
+  no.nodes.abs.pos <- no.row.nodes[, Abs.Position] %>% paste0("_1")
+  
+  tree.matrix[ID == yes.row.nodes[, Yes], Abs.Position := yes.nodes.abs.pos]
+  tree.matrix[ID == no.row.nodes[, No], Abs.Position := no.nodes.abs.pos]
+  precedent.nodes <- c(yes.nodes.abs.pos, no.nodes.abs.pos)
+}
+# Re-key the table on positions: Yes/No become the children's positions and ID becomes the node's own position.
+tree.matrix[!is.na(Yes),Yes:= paste0(Abs.Position, "_0")]
+tree.matrix[!is.na(No),No:= paste0(Abs.Position, "_1")]
+tree.matrix[,ID:= Abs.Position]
+# Strip the leading tree number and its separator - presumably so that the same position in different trees shares one key (TODO confirm).
+tree.matrix[,Abs.Position:=substr(Abs.Position, nchar(Tree)+2, nchar(Abs.Position))]
+keepN <- 3 # at most keepN features (ranked by summed Quality) are listed per position
+tree.matrix <- tree.matrix[,sum(Quality),by = .(Abs.Position, Feature)][order(-V1)][,.(paste0(Feature[1:min(length(Feature), keepN)], " (", V1[1:min(length(V1), keepN)], ")") %>% paste0(collapse = "\n")), by=Abs.Position]
+# NOTE(review): the aggregation above appears to keep only Abs.Position and one text column, yet the lines below still read Feature, ID, Cover, Quality, ... - confirm this demo runs end to end.
+tree.matrix[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "<br/>Cover: ", Cover, "<br/>Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")]
+
+tree.matrix[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")]
+
+tree.matrix[, Yes:= Abs.Position %>% paste0("_0")][, No:= Abs.Position %>% paste0("_1")]
+# mermaid class definitions: greenNode is applied to the Yes targets and redNode to the No targets below.
+CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px\nclassDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px"  
+
+
+yes <- tree.matrix[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "")
+
+no <- tree.matrix[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "")
+# Assemble the mermaid description ("graph LR" header, sorted edge lines, style and class lines) and render it.
+path <- tree.matrix[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = "\n") %>% paste("graph LR", .,collapse = "", sep = "\n") %>% paste(CSSstyle, yes, no, sep = "\n")
+DiagrammeR::mermaid(path)
+
+# path <- "graph LR;0-0-0(spore-print-color=green)-->|>= 2.00001|0-0-0-1>Leaf"
+# setnames(tree.matrix, old = c("ID", "Yes", "No"), c("nodes", "edge_from", "edge_to"))
--- a/R-package/man/edge.parser.Rd
+++ b/R-package/man/edge.parser.Rd
@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.plot.deepness.R
+\name{edge.parser}
+\alias{edge.parser}
+\title{Parse the graph to extract vector of edges}
+\usage{
+edge.parser(element)
+}
+\arguments{
+\item{element}{igraph object containing the path from the root to the leaf.}
+}
+\description{
+Parse the graph to extract vector of edges
+}
+
--- a/R-package/man/get.paths.to.leaf.Rd
+++ b/R-package/man/get.paths.to.leaf.Rd
@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.plot.deepness.R
+\name{get.paths.to.leaf}
+\alias{get.paths.to.leaf}
+\title{Extract path from root to leaf from data.table}
+\usage{
+get.paths.to.leaf(dt.tree)
+}
+\arguments{
+\item{dt.tree}{data.table containing the nodes and edges of the trees}
+}
+\description{
+Extract path from root to leaf from data.table
+}
+
--- a/R-package/man/multiplot.Rd
+++ b/R-package/man/multiplot.Rd
@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.plot.deepness.R
+\name{multiplot}
+\alias{multiplot}
+\title{Plot multiple graphs at the same time}
+\usage{
+multiplot(..., cols = 1)
+}
+\arguments{
+\item{cols}{number of columns}
+}
+\description{
+Plot multiple graph aligned by rows and columns.
+}
+
--- a/R-package/man/xgb.plot.deepness.Rd
+++ b/R-package/man/xgb.plot.deepness.Rd
@ -0,0 +1,47 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.plot.deepness.R
+\name{xgb.plot.deepness}
+\alias{xgb.plot.deepness}
+\title{Plot model trees deepness}
+\usage{
+xgb.plot.deepness(filename_dump = NULL, model = NULL)
+}
+\arguments{
+\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
+
+\item{model}{dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.}
+}
+\value{
+Two graphs showing the distribution of the model deepness.
+}
+\description{
+Generate a graph to plot the distribution of deepness among trees.
+}
+\details{
+Display both the number of \code{leaf} and the distribution of \code{weighted observations}
+by tree deepness level.
+The purpose of this function is to help the user to find the best trade-off to set
+the \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.
+
+See \link{xgb.train} for more information about these parameters.
+
+The graph is made of two parts:
+
+\itemize{
+ \item Count: number of leaves per level of deepness;
+ \item Weighted cover: normalized weighted cover per Leaf (weighted number of instances).
+}
+
+This function is very inspired from this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
+}
+\examples{
+data(agaricus.train, package='xgboost')
+
+bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15,
+                 eta = 1, nthread = 2, nround = 30, objective = "binary:logistic",
+                 min_child_weight = 50)
+
+xgb.plot.deepness(model = bst)
+
+}
+