Plot model deepness

New function to explore the model by plotting the way splits are done.
This commit is contained in:
pommedeterresautee 2015-11-24 11:45:32 +01:00
parent fe7cdcefb4
commit d9fe9c5d8a
12 changed files with 383 additions and 39 deletions

View File

@ -33,4 +33,4 @@ Imports:
data.table (>= 1.9.6),
magrittr (>= 1.5),
stringr (>= 0.6.2)
RoxygenNote: 5.0.0
RoxygenNote: 5.0.1

View File

@ -10,6 +10,7 @@ export(xgb.dump)
export(xgb.importance)
export(xgb.load)
export(xgb.model.dt.tree)
export(xgb.plot.deepness)
export(xgb.plot.importance)
export(xgb.plot.tree)
export(xgb.save)

View File

@ -0,0 +1,172 @@
#' Plot multiple graphs at the same time
#'
#' Plot multiple graphs aligned by rows and columns.
#'
#' @importFrom data.table data.table
#' @importFrom data.table as.data.table
#' @param ... plot objects to be arranged
#' @param cols number of columns
#' @return NULL
multiplot <- function(..., cols = 1) {
plots <- list(...)
numPlots <- length(plots)
layout <- matrix(seq(1, cols * ceiling(numPlots / cols)),
ncol = cols, nrow = ceiling(numPlots / cols))
if (numPlots == 1) {
print(plots[[1]])
} else {
grid::grid.newpage()
grid::pushViewport(grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout))))
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.table(which(layout == i, arr.ind = TRUE))
print(
plots[[i]], vp = grid::viewport(
layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col
)
)
}
}
}
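# A minimal usage sketch (not run; assumes ggplot2 is installed - the mpg
# dataset and its columns ship with ggplot2, everything else is made up):
# p1 <- ggplot2::ggplot(ggplot2::mpg, ggplot2::aes(displ, hwy)) + ggplot2::geom_point()
# p2 <- ggplot2::ggplot(ggplot2::mpg, ggplot2::aes(class)) + ggplot2::geom_bar()
# multiplot(p1, p2, cols = 1)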
#' Parse the graph to extract a vector of edges
#' @param element igraph object containing the path from the root to the leaf.
edge.parser <- function(element) {
edges.vector <- igraph::as_ids(element)
# keep only the terminal node ID and the path length (number of nodes from root to leaf)
last.edge <- tail(edges.vector, n = 1)
size <- length(edges.vector)
list(last.edge, size)
}
#' Extract path from root to leaf from data.table
#' @param dt.tree data.table containing the nodes and edges of the trees
get.paths.to.leaf <- function(dt.tree) {
dt.not.leaf.edges <-
dt.tree[Feature != "Leaf",.(ID, Yes, Tree)] %>% list(dt.tree[Feature != "Leaf",.(ID, No, Tree)]) %>% rbindlist(use.names = F)
trees <- dt.tree[,unique(Tree)]
paths <- list()
for (tree in trees) {
graph <-
igraph::graph_from_data_frame(dt.not.leaf.edges[Tree == tree])
paths.tmp <-
igraph::shortest_paths(graph, from = paste0(tree, "-0"), to = dt.tree[Tree == tree &
Feature == "Leaf", c(ID)])
paths <- c(paths, paths.tmp$vpath)
}
paths
}
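# A sketch of how the two helpers above combine (not run; assumes `bst` is a
# trained booster): each element of `paths` is an igraph vertex sequence from
# a tree root to one leaf, which edge.parser() reduces to the leaf ID and the
# path length.
# dt.tree <- xgb.model.dt.tree(model = bst)
# paths <- get.paths.to.leaf(dt.tree)
# lapply(paths, edge.parser)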
#' Plot model trees deepness
#'
#' Generate a graph to plot the distribution of deepness among trees.
#'
#' @importFrom data.table data.table
#' @importFrom data.table rbindlist
#' @importFrom data.table setnames
#' @importFrom data.table :=
#' @importFrom magrittr %>%
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param model object generated by the \code{xgb.train} function; avoids the creation of a dump file.
#'
#' @return Two graphs showing the distribution of the model deepness.
#'
#' @details
#' Display both the number of \code{leaf} and the distribution of \code{weighted observations}
#' by tree deepness level.
#' The purpose of this function is to help the user find the best trade-off between
#' the \code{max.depth} and \code{min_child_weight} parameters, according to the bias / variance trade-off.
#'
#' See \link{xgb.train} for more information about these parameters.
#'
#' The graph is made of two parts:
#'
#' \itemize{
#' \item Count: number of leaves per level of deepness;
#' \item Weighted cover: normalized weighted cover per leaf (weighted number of instances).
#' }
#'
#' This function is largely inspired by this blog post: \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15,
#' eta = 1, nthread = 2, nround = 30, objective = "binary:logistic",
#' min_child_weight = 50)
#'
#' xgb.plot.deepness(model = bst)
#'
#' @export
xgb.plot.deepness <- function(filename_dump = NULL, model = NULL) {
if (!requireNamespace("ggplot2", quietly = TRUE)) {
stop("ggplot2 package is required for plotting the graph deepness.",
call. = FALSE)
}
if (!requireNamespace("igraph", quietly = TRUE)) {
stop("igraph package is required for plotting the graph deepness.",
call. = FALSE)
}
if (!requireNamespace("grid", quietly = TRUE)) {
stop("grid package is required for plotting the graph deepness.",
call. = FALSE)
}
if (!class(model) %in% c("xgb.Booster", "NULL")) {
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
}
if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
} else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
stop("filename_dump: path to the model doesn't exist.")
} else if (is.null(filename_dump) && is.null(model)) {
stop("filename_dump & model: neither a path to a model dump file nor a model have been provided.")
}
if(!is.null(model)){
dt.tree <- xgb.model.dt.tree(model = model)
} else if(!is.null(filename_dump)){
dt.tree <- xgb.model.dt.tree(filename_dump = filename_dump)
}
dt.edge.elements <- data.table()
paths <- get.paths.to.leaf(dt.tree)
dt.edge.elements <-
lapply(paths, edge.parser) %>% rbindlist %>% setnames(c("last.edge", "size")) %>%
merge(dt.tree, by.x = "last.edge", by.y = "ID") %>% rbind(dt.edge.elements)
dt.edge.summarize <-
dt.edge.elements[, .(.N, Cover = sum(Cover)), size][, Cover := Cover / sum(Cover)]
p1 <-
ggplot2::ggplot(dt.edge.summarize) + ggplot2::geom_line(ggplot2::aes(x = size, y = N, group = 1)) +
ggplot2::xlab("") + ggplot2::ylab("Count") + ggplot2::ggtitle("Model complexity") +
ggplot2::theme(
plot.title = ggplot2::element_text(lineheight = 0.9, face = "bold"),
panel.grid.major.y = ggplot2::element_blank(),
axis.ticks = ggplot2::element_blank(),
axis.text.x = ggplot2::element_blank()
)
p2 <-
ggplot2::ggplot(dt.edge.summarize) + ggplot2::geom_line(ggplot2::aes(x = size, y = Cover, group = 1)) +
ggplot2::xlab("From root to leaf path length") + ggplot2::ylab("Weighted cover")
multiplot(p1, p2, cols = 1)
}
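# The same plot can also be produced from a dump file instead of an in-memory
# model; a sketch (not run; the file name is made up):
# xgb.dump(bst, "xgb.model.dump", with.stats = TRUE)
# xgb.plot.deepness(filename_dump = "xgb.model.dump")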
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared.
# They are mainly column names inferred by data.table...
globalVariables(
c(
"Feature", "Count", "ggplot", "aes", "geom_bar", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text", "ID", "Yes", "No", "Tree"
)
)

View File

@ -29,29 +29,44 @@
#' xgb.plot.importance(importance_matrix)
#'
#' @export
xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1:10)){
if (!"data.table" %in% class(importance_matrix)) {
stop("importance_matrix: Should be a data.table.")
xgb.plot.importance <-
function(importance_matrix = NULL, numberOfClusters = c(1:10)) {
if (!"data.table" %in% class(importance_matrix)) {
stop("importance_matrix: Should be a data.table.")
}
if (!requireNamespace("ggplot2", quietly = TRUE)) {
stop("ggplot2 package is required for plotting the importance", call. = FALSE)
}
if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) {
stop("Ckmeans.1d.dp package is required for plotting the importance", call. = FALSE)
}
# To avoid issues in clustering when co-occurrences are used
importance_matrix <-
importance_matrix[, .(Gain = sum(Gain)), by = Feature]
clusters <-
suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters))
importance_matrix[,"Cluster":= clusters$cluster %>% as.character]
plot <-
ggplot2::ggplot(
importance_matrix, ggplot2::aes(
x = stats::reorder(Feature, Gain), y = Gain, width = 0.05
), environment = environment()
) + ggplot2::geom_bar(ggplot2::aes(fill = Cluster), stat = "identity", position =
"identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme(
plot.title = ggplot2::element_text(lineheight = .9, face = "bold"), panel.grid.major.y = ggplot2::element_blank()
)
return(plot)
}
if (!requireNamespace("ggplot2", quietly = TRUE)) {
stop("ggplot2 package is required for plotting the importance", call. = FALSE)
}
if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) {
stop("Ckmeans.1d.dp package is required for plotting the importance", call. = FALSE)
}
# To avoid issues in clustering when co-occurrences are used
importance_matrix <- importance_matrix[, .(Gain = sum(Gain)), by = Feature]
clusters <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters))
importance_matrix[,"Cluster" := clusters$cluster %>% as.character]
plot <- ggplot2::ggplot(importance_matrix, ggplot2::aes(x=stats::reorder(Feature, Gain), y = Gain, width = 0.05), environment = environment()) + ggplot2::geom_bar(ggplot2::aes(fill=Cluster), stat="identity", position="identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme(plot.title = ggplot2::element_text(lineheight=.9, face="bold"), panel.grid.major.y = ggplot2::element_blank() )
return(plot)
}
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared.
# They are mainly column names inferred by data.table...
globalVariables(c("Feature", "Gain", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text"))
globalVariables(
c(
"Feature", "Gain", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text"
)
)
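# A small sketch of the clustering step used above (not run; assumes the
# Ckmeans.1d.dp package is installed, the gain values are made up):
# gains <- c(0.50, 0.30, 0.12, 0.05, 0.03)
# clusters <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(gains, c(1:3)))
# clusters$cluster # one cluster id per gain value, used as the fill aesthetic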

View File

@ -19,7 +19,7 @@
#' \item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3
#' \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. The larger, the more conservative the algorithm will be.
#' \item \code{max_depth} maximum depth of a tree. Default: 6
#' \item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
#' \item \code{min_child_weight} minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
#' \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1
#' \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
#' \item \code{num_parallel_tree} Experimental parameter. Number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1} accordingly). Default: 1

View File

@ -0,0 +1,64 @@
library(stringr)
library(data.table)
library(xgboost)
data(agaricus.train, package='xgboost')
# Both datasets are lists with two items, a sparse matrix and labels
# (labels = outcome column which will be learned).
# Each column of the sparse matrix is a feature in one-hot encoding format.
train <- agaricus.train
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 4, objective = "binary:logistic")
#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
tree.matrix <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst)
# The first number of the path represents the tree; the following numbers
# describe the path to follow from the root.
# root init
root.nodes <- tree.matrix[str_detect(ID, "\\d+-0"), ID]
tree.matrix[ID == root.nodes, Abs.Position:=root.nodes]
precedent.nodes <- root.nodes
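# Walk the trees level by level: at each iteration, label the children of the
# nodes labelled at the previous step by appending "_0" (yes branch) or "_1"
# (no branch) to the parent's absolute position.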
while(tree.matrix[,sum(is.na(Abs.Position))] > 0) {
yes.row.nodes <- tree.matrix[Abs.Position %in% precedent.nodes & !is.na(Yes)]
no.row.nodes <- tree.matrix[Abs.Position %in% precedent.nodes & !is.na(No)]
yes.nodes.abs.pos <- yes.row.nodes[, Abs.Position] %>% paste0("_0")
no.nodes.abs.pos <- no.row.nodes[, Abs.Position] %>% paste0("_1")
tree.matrix[ID == yes.row.nodes[, Yes], Abs.Position := yes.nodes.abs.pos]
tree.matrix[ID == no.row.nodes[, No], Abs.Position := no.nodes.abs.pos]
precedent.nodes <- c(yes.nodes.abs.pos, no.nodes.abs.pos)
}
tree.matrix[!is.na(Yes),Yes:= paste0(Abs.Position, "_0")]
tree.matrix[!is.na(No),No:= paste0(Abs.Position, "_1")]
tree.matrix[,ID:= Abs.Position]
tree.matrix[,Abs.Position:=substr(Abs.Position, nchar(Tree)+2, nchar(Abs.Position))]
keepN <- 3
# Keep, for each node position, the top keepN features by summed gain as a label.
# Store it in a separate table so that tree.matrix keeps the columns used below.
node.labels <- tree.matrix[,sum(Quality),by = .(Abs.Position, Feature)][order(-V1)][,.(paste0(Feature[1:min(length(Feature), keepN)], " (", V1[1:min(length(V1), keepN)], ")") %>% paste0(collapse = "\n")), by=Abs.Position]
# Note: Yes.Feature and No.Feature below are placeholders for the child node labels;
# this part of the script is still exploratory.
tree.matrix[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "<br/>Cover: ", Cover, "<br/>Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")]
tree.matrix[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")]
tree.matrix[, Yes:= Abs.Position %>% paste0("_0")][, No:= Abs.Position %>% paste0("_1")]
CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px\nclassDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px"
yes <- tree.matrix[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "")
no <- tree.matrix[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "")
path <- tree.matrix[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = "\n") %>% paste("graph LR", .,collapse = "", sep = "\n") %>% paste(CSSstyle, yes, no, sep = "\n")
DiagrammeR::mermaid(path)
# path <- "graph LR;0-0-0(spore-print-color=green)-->|>= 2.00001|0-0-0-1>Leaf"
# setnames(tree.matrix, old = c("ID", "Yes", "No"), c("nodes", "edge_from", "edge_to"))

View File

@ -0,0 +1,15 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.deepness.R
\name{edge.parser}
\alias{edge.parser}
\title{Parse the graph to extract a vector of edges}
\usage{
edge.parser(element)
}
\arguments{
\item{element}{igraph object containing the path from the root to the leaf.}
}
\description{
Parse the graph to extract a vector of edges
}

View File

@ -0,0 +1,15 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.deepness.R
\name{get.paths.to.leaf}
\alias{get.paths.to.leaf}
\title{Extract path from root to leaf from data.table}
\usage{
get.paths.to.leaf(dt.tree)
}
\arguments{
\item{dt.tree}{data.table containing the nodes and edges of the trees}
}
\description{
Extract path from root to leaf from data.table
}

View File

@ -0,0 +1,15 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.deepness.R
\name{multiplot}
\alias{multiplot}
\title{Plot multiple graphs at the same time}
\usage{
multiplot(..., cols = 1)
}
\arguments{
\item{...}{plot objects to be arranged}
\item{cols}{number of columns}
}
\description{
Plot multiple graphs aligned by rows and columns.
}

View File

@ -0,0 +1,47 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.deepness.R
\name{xgb.plot.deepness}
\alias{xgb.plot.deepness}
\title{Plot model trees deepness}
\usage{
xgb.plot.deepness(filename_dump = NULL, model = NULL)
}
\arguments{
\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
\item{model}{object generated by the \code{xgb.train} function; avoids the creation of a dump file.}
}
\value{
Two graphs showing the distribution of the model deepness.
}
\description{
Generate a graph to plot the distribution of deepness among trees.
}
\details{
Display both the number of \code{leaf} and the distribution of \code{weighted observations}
by tree deepness level.
The purpose of this function is to help the user find the best trade-off between
the \code{max.depth} and \code{min_child_weight} parameters, according to the bias / variance trade-off.
See \link{xgb.train} for more information about these parameters.
The graph is made of two parts:
\itemize{
\item Count: number of leaves per level of deepness;
\item Weighted cover: normalized weighted cover per leaf (weighted number of instances).
}
This function is largely inspired by this blog post: \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
}
\examples{
data(agaricus.train, package='xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15,
eta = 1, nthread = 2, nround = 30, objective = "binary:logistic",
min_child_weight = 50)
xgb.plot.deepness(model = bst)
}

View File

@ -27,7 +27,7 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
\item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3
\item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. The larger, the more conservative the algorithm will be.
\item \code{max_depth} maximum depth of a tree. Default: 6
\item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
\item \code{min_child_weight} minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
\item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1
\item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
\item \code{num_parallel_tree} Experimental parameter. Number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1} accordingly). Default: 1
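
As a hedged illustration of how these parameters are passed (the parameter names above are real; the data object and values below are only a sketch):

# dtrain is assumed to be an xgb.DMatrix built beforehand, e.g.
# dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
params <- list(objective = "binary:logistic", eta = 0.3, max_depth = 6,
min_child_weight = 1, subsample = 0.5, colsample_bytree = 1)
bst <- xgb.train(params = params, data = dtrain, nrounds = 10)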