Merge pull request #648 from pommedeterresautee/master

New function to plot model deepness
Michaël Benesty 2015-11-24 13:52:40 +01:00
commit 1c4ed67779
11 changed files with 322 additions and 41 deletions

View File

@ -20,11 +20,12 @@ BugReports: https://github.com/dmlc/xgboost/issues
VignetteBuilder: knitr
Suggests:
    knitr,
-   ggplot2 (>= 1.0.0),
+   ggplot2 (>= 1.0.1),
    DiagrammeR (>= 0.8.1),
    Ckmeans.1d.dp (>= 3.3.1),
    vcd (>= 1.3),
-   testthat
+   testthat,
+   igraph (>= 1.0.1)
Depends:
    R (>= 2.10)
Imports:
@ -33,4 +34,4 @@ Imports:
    data.table (>= 1.9.6),
    magrittr (>= 1.5),
    stringr (>= 0.6.2)
-RoxygenNote: 5.0.0
+RoxygenNote: 5.0.1

View File

@ -10,6 +10,7 @@ export(xgb.dump)
export(xgb.importance)
export(xgb.load)
export(xgb.model.dt.tree)
+export(xgb.plot.deepness)
export(xgb.plot.importance)
export(xgb.plot.tree)
export(xgb.save)

View File

@ -0,0 +1,172 @@
#' Plot multiple graphs at the same time
#'
#' Plot multiple graphs aligned by rows and columns.
#'
#' @importFrom data.table data.table
#' @param cols number of columns
#' @return NULL
multiplot <- function(..., cols = 1) {
plots <- list(...)
numPlots = length(plots)
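  # Arrange plot indices into a grid with `cols` columns (cells are filled column by column).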
layout <- matrix(seq(1, cols * ceiling(numPlots / cols)),
ncol = cols, nrow = ceiling(numPlots / cols))
if (numPlots == 1) {
print(plots[[1]])
} else {
grid::grid.newpage()
grid::pushViewport(grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout))))
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.table(which(layout == i, arr.ind = TRUE))
print(
plots[[i]], vp = grid::viewport(
layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col
)
)
}
}
}
#' Parse the graph to extract vector of edges
#' @param element igraph object containing the path from the root to the leaf.
edge.parser <- function(element) {
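  # Convert the igraph path to node IDs; keep the terminal node (the leaf ID)
  # and the number of nodes on the path (its depth).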
edges.vector <- igraph::as_ids(element)
t <- tail(edges.vector, n = 1)
l <- length(edges.vector)
list(t,l)
}
#' Extract path from root to leaf from data.table
#' @param dt.tree data.table containing the nodes and edges of the trees
get.paths.to.leaf <- function(dt.tree) {
dt.not.leaf.edges <-
dt.tree[Feature != "Leaf",.(ID, Yes, Tree)] %>% list(dt.tree[Feature != "Leaf",.(ID, No, Tree)]) %>% rbindlist(use.names = F)
trees <- dt.tree[,unique(Tree)]
paths <- list()
for (tree in trees) {
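    # Build a directed graph of this tree's edges and collect the path
    # from the root node ("<tree>-0") to every leaf.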
graph <-
igraph::graph_from_data_frame(dt.not.leaf.edges[Tree == tree])
paths.tmp <-
igraph::shortest_paths(graph, from = paste0(tree, "-0"), to = dt.tree[Tree == tree &
Feature == "Leaf", c(ID)])
paths <- c(paths, paths.tmp$vpath)
}
paths
}
#' Plot model trees deepness
#'
#' Generate a graph to plot the distribution of deepness among trees.
#'
#' @importFrom data.table data.table
#' @importFrom data.table rbindlist
#' @importFrom data.table setnames
#' @importFrom data.table :=
#' @importFrom magrittr %>%
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param model the model generated by the \code{xgb.train} function. Avoids the need to create a dump file.
#'
#' @return Two graphs showing the distribution of the model deepness.
#'
#' @details
#' Display both the number of leaves and the distribution of weighted observations
#' by tree deepness level.
#' The purpose of this function is to help the user set the \code{max.depth}
#' and \code{min_child_weight} parameters according to the bias / variance trade-off.
#'
#' See \link{xgb.train} for more information about these parameters.
#'
#' The graph is made of two parts:
#'
#' \itemize{
#' \item Count: number of leaves per level of deepness;
#' \item Weighted cover: normalized weighted cover per leaf (weighted number of instances).
#' }
#'
#' This function is inspired by this blog post: \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15,
#' eta = 1, nthread = 2, nround = 30, objective = "binary:logistic",
#' min_child_weight = 50)
#'
#' xgb.plot.deepness(model = bst)
#'
#' @export
xgb.plot.deepness <- function(filename_dump = NULL, model = NULL) {
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    stop("ggplot2 package is required for plotting the model deepness.",
         call. = FALSE)
  }
  if (!requireNamespace("igraph", quietly = TRUE)) {
    stop("igraph package is required for plotting the model deepness.",
         call. = FALSE)
  }
  if (!requireNamespace("grid", quietly = TRUE)) {
    stop("grid package is required for plotting the model deepness.",
         call. = FALSE)
  }
  if (!class(model) %in% c("xgb.Booster", "NULL")) {
    stop("model: Has to be an object of class xgb.Booster generated by the xgb.train function.")
  }
  if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
    stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
  } else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
    stop("filename_dump: path to the model dump file doesn't exist.")
  } else if (is.null(filename_dump) && is.null(model)) {
    stop("filename_dump & model: no path to a model dump file and no model have been provided.")
  }
if(!is.null(model)){
dt.tree <- xgb.model.dt.tree(model = model)
} else if(!is.null(filename_dump)){
dt.tree <- xgb.model.dt.tree(filename_dump = filename_dump)
}
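  # Walk every root-to-leaf path, keep the terminal edge (the leaf) and the path length,
  # then join back to the tree table to recover each leaf's Cover statistic.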
dt.edge.elements <- data.table()
paths <- get.paths.to.leaf(dt.tree)
dt.edge.elements <-
lapply(paths, edge.parser) %>% rbindlist %>% setnames(c("last.edge", "size")) %>%
merge(dt.tree, by.x = "last.edge", by.y = "ID") %>% rbind(dt.edge.elements)
  dt.edge.summarize <-
    dt.edge.elements[, .(.N, Cover = sum(Cover)), size][, Cover := Cover / sum(Cover)]
  p1 <-
    ggplot2::ggplot(dt.edge.summarize) + ggplot2::geom_line(ggplot2::aes(x = size, y = N, group = 1)) +
    ggplot2::xlab("") + ggplot2::ylab("Count") + ggplot2::ggtitle("Model complexity") +
    ggplot2::theme(
      plot.title = ggplot2::element_text(lineheight = 0.9, face = "bold"),
      panel.grid.major.y = ggplot2::element_blank(),
      axis.ticks = ggplot2::element_blank(),
      axis.text.x = ggplot2::element_blank()
    )
  p2 <-
    ggplot2::ggplot(dt.edge.summarize) + ggplot2::geom_line(ggplot2::aes(x = size, y = Cover, group = 1)) +
    ggplot2::xlab("From root to leaf path length") + ggplot2::ylab("Weighted cover")
  multiplot(p1, p2, cols = 1)
}
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared:
# they are mainly column names inferred by data.table...
globalVariables(
c(
"Feature", "Count", "ggplot", "aes", "geom_bar", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text", "ID", "Yes", "No", "Tree"
)
)
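For reference, a minimal sketch of the dump-file workflow supported by the new filename_dump argument. The dump file name below is illustrative; the documented requirement is that the dump is written with with.stats = T so that xgb.model.dt.tree can rebuild the Cover statistics.

# Sketch: plot deepness from a dump file instead of an in-memory model.
data(agaricus.train, package = 'xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15,
               eta = 1, nthread = 2, nround = 30, objective = "binary:logistic",
               min_child_weight = 50)
# Write the dump with per-node statistics, then point the plotting function at it.
xgb.dump(bst, 'xgb.model.dump', with.stats = T)
xgb.plot.deepness(filename_dump = 'xgb.model.dump')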

View File

@ -29,7 +29,8 @@
#' xgb.plot.importance(importance_matrix)
#'
#' @export
-xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1:10)){
+xgb.plot.importance <-
+  function(importance_matrix = NULL, numberOfClusters = c(1:10)) {
  if (!"data.table" %in% class(importance_matrix)) {
    stop("importance_matrix: Should be a data.table.")
  }
@ -41,17 +42,31 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1
  }
  # To avoid issues in clustering when co-occurences are used
-  importance_matrix <- importance_matrix[, .(Gain = sum(Gain)), by = Feature]
-  clusters <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters))
-  importance_matrix[,"Cluster" := clusters$cluster %>% as.character]
-  plot <- ggplot2::ggplot(importance_matrix, ggplot2::aes(x=stats::reorder(Feature, Gain), y = Gain, width = 0.05), environment = environment()) + ggplot2::geom_bar(ggplot2::aes(fill=Cluster), stat="identity", position="identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme(plot.title = ggplot2::element_text(lineheight=.9, face="bold"), panel.grid.major.y = ggplot2::element_blank())
+  importance_matrix <-
+    importance_matrix[, .(Gain = sum(Gain)), by = Feature]
+  clusters <-
+    suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters))
+  importance_matrix[,"Cluster":= clusters$cluster %>% as.character]
+  plot <-
+    ggplot2::ggplot(
+      importance_matrix, ggplot2::aes(
+        x = stats::reorder(Feature, Gain), y = Gain, width = 0.05
+      ), environment = environment()
+    ) + ggplot2::geom_bar(ggplot2::aes(fill = Cluster), stat = "identity", position =
+      "identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme(
+      plot.title = ggplot2::element_text(lineheight = .9, face = "bold"), panel.grid.major.y = ggplot2::element_blank()
+    )
  return(plot)
}
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
-globalVariables(c("Feature", "Gain", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text"))
+globalVariables(
+  c(
+    "Feature", "Gain", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text"
+  )
+)
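The reformatted function keeps the same interface. A minimal usage sketch, assuming xgb.importance accepts the trained booster through its model argument as in the package's own examples:

data(agaricus.train, package = 'xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
               eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
importance_matrix <- xgb.importance(colnames(agaricus.train$data), model = bst)
xgb.plot.importance(importance_matrix, numberOfClusters = c(1:10))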

View File

@ -19,7 +19,7 @@
#' \item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3
#' \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.
#' \item \code{max_depth} maximum depth of a tree. Default: 6
-#' \item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
+#' \item \code{min_child_weight} minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
#' \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1
#' \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
#' \item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1
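For context, a minimal sketch of how the tree-booster parameters documented above are passed to xgb.train; the values shown are the documented defaults, not recommendations:

data(agaricus.train, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
params <- list(objective = "binary:logistic", eta = 0.3, max_depth = 6,
               min_child_weight = 1, subsample = 1, colsample_bytree = 1)
bst <- xgb.train(params = params, data = dtrain, nrounds = 10,
                 watchlist = list(train = dtrain))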

View File

@ -0,0 +1,15 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.deepness.R
\name{edge.parser}
\alias{edge.parser}
\title{Parse the graph to extract vector of edges}
\usage{
edge.parser(element)
}
\arguments{
\item{element}{igraph object containing the path from the root to the leaf.}
}
\description{
Parse the graph to extract vector of edges
}

View File

@ -0,0 +1,15 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.deepness.R
\name{get.paths.to.leaf}
\alias{get.paths.to.leaf}
\title{Extract path from root to leaf from data.table}
\usage{
get.paths.to.leaf(dt.tree)
}
\arguments{
\item{dt.tree}{data.table containing the nodes and edges of the trees}
}
\description{
Extract path from root to leaf from data.table
}

View File

@ -0,0 +1,15 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.deepness.R
\name{multiplot}
\alias{multiplot}
\title{Plot multiple graphs at the same time}
\usage{
multiplot(..., cols = 1)
}
\arguments{
\item{cols}{number of columns}
}
\description{
Plot multiple graphs aligned by rows and columns.
}

View File

@ -0,0 +1,47 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.deepness.R
\name{xgb.plot.deepness}
\alias{xgb.plot.deepness}
\title{Plot model trees deepness}
\usage{
xgb.plot.deepness(filename_dump = NULL, model = NULL)
}
\arguments{
\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
\item{model}{the model generated by the \code{xgb.train} function. Avoids the need to create a dump file.}
}
\value{
Two graphs showing the distribution of the model deepness.
}
\description{
Generate a graph to plot the distribution of deepness among trees.
}
\details{
Display both the number of leaves and the distribution of weighted observations
by tree deepness level.
The purpose of this function is to help the user set the \code{max.depth}
and \code{min_child_weight} parameters according to the bias / variance trade-off.
See \link{xgb.train} for more information about these parameters.
The graph is made of two parts:
\itemize{
  \item Count: number of leaves per level of deepness;
  \item Weighted cover: normalized weighted cover per leaf (weighted number of instances).
}
This function is inspired by this blog post: \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
}
\examples{
data(agaricus.train, package='xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15,
eta = 1, nthread = 2, nround = 30, objective = "binary:logistic",
min_child_weight = 50)
xgb.plot.deepness(model = bst)
}

View File

@ -27,7 +27,7 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
\item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3
\item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.
\item \code{max_depth} maximum depth of a tree. Default: 6
-\item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
+\item \code{min_child_weight} minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
\item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1
\item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
\item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1