[R] xgb.plot.tree fixes (#1939)

* [R] a few fixes and improvements to xgb.plot.tree
* [R] deprecate n_first_tree, replace with trees; fix types in xgb.model.dt.tree
2017-01-06 13:09:51 -06:00
parent d23ea5ca7d
commit d7406e07f3
7 changed files with 225 additions and 116 deletions
--- a/R-package/R/xgb.plot.multi.trees.R
+++ b/R-package/R/xgb.plot.multi.trees.R
@@ -2,8 +2,8 @@
 #' 
 #' Visualization of the ensemble of trees as a single collective unit.
 #'
-#' @param model dump generated by the \code{xgb.train} function.
-#' @param feature_names names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
+#' @param model produced by the \code{xgb.train} function.
+#' @param feature_names names of each feature as a \code{character} vector.
 #' @param features_keep number of features to keep in each position of the multi trees.
 #' @param plot_width width in pixels of the graph to produce
 #' @param plot_height height in pixels of the graph to produce
@@ -13,21 +13,19 @@
 #' 
 #' @details
 #' 
-#' This function tries to capture the complexity of gradient boosted tree ensemble 
-#' in a cohesive way. 
+#' This function tries to capture the complexity of a gradient boosted tree model 
+#' in a cohesive way by compressing an ensemble of trees into a single tree-graph representation.
+#' The goal is to improve the interpretability of a model generally seen as a black box.
 #' 
-#' The goal is to improve the interpretability of the model generally seen as black box.
-#' The function is dedicated to boosting applied to decision trees only.
-#' 
-#' The purpose is to move from an ensemble of trees to a single tree only.
+#' Note: this function is applicable to tree booster-based models only.
 #' 
 #' It takes advantage of the fact that the shape of a binary tree is only defined by 
-#' its deepness (therefore in a boosting model, all trees have the same shape). 
+#' its depth (therefore, in a boosting model, all trees have a similar shape). 
 #' 
 #' Moreover, the trees tend to reuse the same features.
 #' 
-#' The function will project each tree on one, and keep for each position the 
-#' \code{features_keep} first features (based on Gain per feature measure).
+#' The function projects each tree onto a single one, and keeps for each position the 
+#' first \code{features_keep} features (based on the Gain per feature measure).
 #' 
 #' This function is inspired by this blog post:
 #' \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/}
@@ -70,39 +68,61 @@ xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5,
  tree.matrix[!is.na(No),No:= paste0(abs.node.position, "_1")]
  
  
-  
  remove.tree <- . %>% stri_replace_first_regex(pattern = "^\\d+-", replacement = "")
  
-  tree.matrix[,`:=`(abs.node.position=remove.tree(abs.node.position), Yes=remove.tree(Yes), No=remove.tree(No))]
+  tree.matrix[,`:=`(abs.node.position = remove.tree(abs.node.position),
+                    Yes = remove.tree(Yes),
+                    No = remove.tree(No))]
  
-  nodes.dt <- tree.matrix[,.(Quality = sum(Quality)),by = .(abs.node.position, Feature)][,.(Text =paste0(Feature[1:min(length(Feature), features_keep)], " (", Quality[1:min(length(Quality), features_keep)], ")") %>% paste0(collapse = "\n")), by=abs.node.position]
-  edges.dt <- tree.matrix[Feature != "Leaf",.(abs.node.position, Yes)] %>% list(tree.matrix[Feature != "Leaf",.(abs.node.position, No)]) %>% rbindlist() %>% setnames(c("From", "To")) %>% .[,.N,.(From, To)] %>% .[,N:=NULL]
+  nodes.dt <- tree.matrix[
+        , .(Quality = sum(Quality))
+        , by = .(abs.node.position, Feature)
+      ][, .(Text = paste0(Feature[1:min(length(Feature), features_keep)],
+                          " (",
+                          format(Quality[1:min(length(Quality), features_keep)], digits=5),
+                          ")") %>%
+                   paste0(collapse = "\n"))
+        , by = abs.node.position]
  
-  nodes <- DiagrammeR::create_node_df(n = nrow(nodes.dt),
-                                    label = nodes.dt[,Text],
-                                    style = "filled",
-                                    color = "DimGray",
-                                    fillcolor= "Beige",
-                                    shape = "oval",
-                                    fontname = "Helvetica"
+  edges.dt <- tree.matrix[Feature != "Leaf", .(abs.node.position, Yes)] %>%
+    list(tree.matrix[Feature != "Leaf",.(abs.node.position, No)]) %>%
+    rbindlist() %>%
+    setnames(c("From", "To")) %>%
+    .[, .N, .(From, To)] %>%
+    .[, N:=NULL]
+  
+  nodes <- DiagrammeR::create_node_df(
+    n = nrow(nodes.dt),
+    label = nodes.dt[,Text]
  )
  
-  edges <- DiagrammeR::create_edge_df(from = match(edges.dt[,From], nodes.dt[,abs.node.position]),
-                                    to = match(edges.dt[,To], nodes.dt[,abs.node.position]),
-                                    color = "DimGray", 
-                                    arrowsize = "1.5", 
-                                    arrowhead = "vee",
-                                    fontname = "Helvetica",
-                                    rel = "leading_to")
+  edges <- DiagrammeR::create_edge_df(
+    from = match(edges.dt[,From], nodes.dt[,abs.node.position]),
+    to = match(edges.dt[,To], nodes.dt[,abs.node.position]),
+    rel = "leading_to")
  
-  graph <- DiagrammeR::create_graph(nodes_df = nodes,
-                                    edges_df = edges)
+  graph <- DiagrammeR::create_graph(
+      nodes_df = nodes,
+      edges_df = edges,
+      attr_theme = NULL
+      ) %>%
+    DiagrammeR::add_global_graph_attrs(
+      attr_type = "graph",
+      attr  = c("layout", "rankdir"),
+      value = c("dot", "LR")
+      ) %>%
+    DiagrammeR::add_global_graph_attrs(
+      attr_type = "node",
+      attr  = c("color", "fillcolor", "style", "shape", "fontname"),
+      value = c("DimGray", "beige", "filled", "rectangle", "Helvetica")
+      ) %>%
+    DiagrammeR::add_global_graph_attrs(
+      attr_type = "edge",
+      attr  = c("color", "arrowsize", "arrowhead", "fontname"),
+      value = c("DimGray", "1.5", "vee", "Helvetica"))
  
  DiagrammeR::render_graph(graph, width = plot_width, height = plot_height)  
 }

-globalVariables(
-  c(
-    ".N", "N", "From", "To", "Text", "Feature", "no.nodes.abs.pos", "ID", "Yes", "No", "Tree", "yes.nodes.abs.pos", "abs.node.position"
-  )
-)
+globalVariables(c(".N", "N", "From", "To", "Text", "Feature", "no.nodes.abs.pos",
+                  "ID", "Yes", "No", "Tree", "yes.nodes.abs.pos", "abs.node.position"))