add new function to read model and use it in the plot function

2015-01-07 17:47:50 +01:00 · 2015-01-07 17:47:50 +01:00 · d532f04394
commit d532f04394
parent e380e4facf
5 changed files with 173 additions and 60 deletions
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@ -9,6 +9,7 @@ export(xgb.cv)
 export(xgb.dump)
 export(xgb.importance)
 export(xgb.load)
+export(xgb.model.dt.tree)
 export(xgb.plot.tree)
 export(xgb.save)
 export(xgb.train)
--- a/R-package/R/xgb.model.dt.tree.R
+++ b/R-package/R/xgb.model.dt.tree.R
@ -0,0 +1,109 @@
+#' Convert tree model dump to data.table
+#'
+#' Read a tree model text dump and return a data.table.
+#'
+#' @importFrom data.table data.table
+#' @importFrom data.table set
+#' @importFrom data.table rbindlist
+#' @importFrom data.table :=
+#' @importFrom magrittr %>%
+#' @importFrom magrittr not
+#' @importFrom magrittr add
+#' @importFrom stringr str_extract
+#' @importFrom stringr str_split
+#' @importFrom stringr str_replace
+#' @importFrom stringr str_trim
+#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
+#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}).
+#' @param n_first_tree limit the conversion to the n first trees. If \code{NULL}, all trees of the model are converted. Performance can be low for huge models.
+#'
+#' @return A \code{data.table} with one row per node of the model (split or leaf), with its gain, cover and a few other attributes. The table is empty when no tree is read.
+#'
+#' @details
+#' General function to convert a text dump of a tree model to a \code{data.table}. The purpose is to help the user explore the model and get a better understanding of it.
+#'
+#' The content of the \code{data.table} is organised that way:
+#'
+#' \itemize{
+#'  \item \code{ID}: unique identifier of a node, built as \code{<tree>-<node>} ;
+#'  \item \code{Feature}: feature used in the tree to operate a split. When Leaf is indicated, it is the end of a branch ;
+#'  \item \code{Split}: value of the chosen feature where the split is operated (kept as text, \code{NA} for a leaf) ;
+#'  \item \code{Yes}: ID of the next node in the branch when the split condition is met ;
+#'  \item \code{No}: ID of the next node in the branch when the split condition is not met ;
+#'  \item \code{Missing}: ID of the next node in the branch for observations where the feature used for the split is not provided ;
+#'  \item \code{Quality}: gain related to the split in this specific node (for a leaf, the value of the leaf) ;
+#'  \item \code{Cover}: metric to measure the number of observations affected by the split ;
+#'  \item \code{Tree}: ID of the tree. It is included in the main ID ;
+#' }
+#'
+#' @examples
+#' data(agaricus.train, package='xgboost')
+#'
+#' #Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
+#' #Each column of the sparse Matrix is a feature in one hot encoding format.
+#' train <- agaricus.train
+#'
+#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
+#'                eta = 1, nround = 2,objective = "binary:logistic")
+#' xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)
+#'
+#' #agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
+#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], 'xgb.model.dump')
+#'
+#' @export
+xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tree = NULL){
+
+  # Argument checks. is.numeric() also accepts integer input such as 2L,
+  # which the previous class()-based test rejected.
+  if (!(is.null(feature_names) || is.character(feature_names))) {
+    stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
+  }
+  if (!is.character(filename_dump) || length(filename_dump) != 1L || !file.exists(filename_dump)) {
+    stop("filename_dump: Has to be a path to the model dump file.")
+  }
+  if (!(is.null(n_first_tree) || is.numeric(n_first_tree)) || length(n_first_tree) > 1) {
+    stop("n_first_tree: Has to be a numeric vector of size 1.")
+  }
+
+  text <- readLines(filename_dump) %>% str_trim(side = "both")
+
+  # Every tree starts with a "booster" header line. A sentinel one past the
+  # last line closes the final tree.
+  position <- c(grep("booster", text, fixed = TRUE), length(text) + 1)
+
+  # Number as written in the dump: optional sign, decimals and exponent
+  # (e.g. "4000.53", "-0.4", "-9.53674e-07").
+  number <- "-?\\d*\\.?\\d*([eE][-+]?\\d+)?"
+
+  # Text value of the first "key=value" pair found on each line (NA when the
+  # key is absent). Kept as text so node ids are never reformatted.
+  field <- function(x, key, value = number) {
+    str_extract(x, paste0(key, "=", value)) %>% str_replace(paste0(key, "="), "")
+  }
+
+  # Prefix node ids with the tree index. paste() would turn an empty input
+  # into a single "i-" element, hence the explicit guard.
+  addTreeId <- function(x, i) {
+    if (length(x) == 0L) character(0) else paste(i, x, sep = "-")
+  }
+
+  # min() ignores a NULL n_first_tree. Clamp to zero so that an empty dump or
+  # n_first_tree = 0 yields an empty table instead of a broken 1:0 loop.
+  n_round <- max(0L, as.integer(min(length(position) - 1, n_first_tree)))
+
+  # One table per tree, bound once at the end rather than grown in the loop.
+  trees <- vector("list", n_round)
+
+  for (i in seq_len(n_round)) {
+
+    # Header immediately followed by another header (or end of file): no node.
+    if (position[i + 1] - position[i] < 2) next
+
+    tree <- text[(position[i] + 1):(position[i + 1] - 1)]
+
+    isLeaf <- grepl("leaf", tree, fixed = TRUE)
+    leaf <- tree[isLeaf]
+    branch <- tree[!isLeaf]
+    naLeaf <- rep(NA_character_, length(leaf))
+
+    idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i)
+    idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i)
+
+    # Features appear in the dump as zero-based "f<index>".
+    featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric
+    if (!is.null(feature_names)) {
+      featureBranch <- feature_names[featureBranch + 1]
+    }
+    featureLeaf <- rep("Leaf", length(leaf))
+
+    # Split threshold sits between "<" and "]"; it may be negative or in
+    # scientific notation.
+    splitBranch <- str_extract(branch, paste0("<", number, "\\]")) %>% str_replace("<", "") %>% str_replace("\\]", "")
+
+    yesBranch <- field(branch, "yes", "\\d+") %>% addTreeId(i)
+    noBranch <- field(branch, "no", "\\d+") %>% addTreeId(i)
+    missingBranch <- field(branch, "missing", "\\d+") %>% addTreeId(i)
+
+    # Quality is the gain for a split and the leaf value for a leaf.
+    qualityBranch <- as.numeric(field(branch, "gain"))
+    qualityLeaf <- as.numeric(field(leaf, "leaf"))
+    coverBranch <- as.numeric(field(branch, "cover"))
+    coverLeaf <- as.numeric(field(leaf, "cover"))
+
+    trees[[i]] <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, naLeaf), Yes = c(yesBranch, naLeaf), No = c(noBranch, naLeaf), Missing = c(missingBranch, naLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][, Tree := i]
+  }
+
+  # NULL entries (skipped trees) are ignored by rbindlist.
+  rbindlist(trees, use.names = TRUE, fill = FALSE)
+}
--- a/R-package/R/xgb.plot.tree.R
+++ b/R-package/R/xgb.plot.tree.R
@ -1,6 +1,6 @@
 #' Plot a boosted tree model
 #' 
-#' Read a xgboost model text dump. 
+#' Read a tree model text dump. 
 #' Plotting only works for boosted tree model (not linear model).
 #' 
 #' @importFrom data.table data.table
@ -21,7 +21,7 @@
 #' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
 #' @param style a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.
 #'
-#' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
+#' @return A \code{DiagrammeR} of the model.
 #'
 #' @details 
 #' 
@ -34,7 +34,7 @@
 #' } 
 #' 
 #' Each branch finishes with a leaf. For each leaf, only the \code{cover} is indicated.
-#' It uses Mermaid JS library for that purpose.
+#' It uses the \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpose.
 #'  
 #' @examples
 #' data(agaricus.train, package='xgboost')
@ -53,61 +53,11 @@
 #' @export
 xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tree = NULL, styles = NULL){  
  
-  if (!class(feature_names) %in% c("character", "NULL")) {     
-    stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
-  }
-  if (class(filename_dump) != "character" || !file.exists(filename_dump)) {
-    stop("filename_dump: Has to be a path to the model dump file.")
-  }
-  if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) {
-    stop("n_first_tree: Has to be a numeric vector of size 1.")
-  }
-  
  if (!class(styles) %in% c("character", "NULL") | length(styles) > 1) {
    stop("style: Has to be a character vector of size 1.")
  }
    
-  text <- readLines(filename_dump) %>% str_trim(side = "both")
-  position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text)+1)
-  
-  extract <- function(x, pattern)  str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist
-  
-  n_round <- min(length(position) - 1, n_first_tree)
-  
-  addTreeId <- function(x, i) paste(i,x,sep = "-")
-  
-  allTrees <- data.table()
-  
-  for(i in 1:n_round){
-    
-    tree <- text[(position[i]+1):(position[i+1]-1)]
-    
-    notLeaf <- str_match(tree, "leaf") %>% is.na
-    leaf <- notLeaf %>% not %>% tree[.]
-    branch <- notLeaf %>% tree[.]
-    idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i)
-    idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i)
-    featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric 
-    if(!is.null(feature_names)){
-      featureBranch <- feature_names[featureBranch + 1]
-    }
-    featureLeaf <- rep("Leaf", length(leaf))
-    splitBranch <- str_extract(branch, "<\\d*\\.*\\d*\\]") %>% str_replace("<", "") %>% str_replace("\\]", "") 
-    splitLeaf <- rep(NA, length(leaf)) 
-    yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(i)
-    yesLeaf <- rep(NA, length(leaf)) 
-    noBranch <- extract(branch, "no=\\d*") %>% addTreeId(i)
-    noLeaf <- rep(NA, length(leaf))
-    missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(i)
-    missingLeaf <- rep(NA, length(leaf))
-    qualityBranch <- extract(branch, "gain=\\d*\\.*\\d*")
-    qualityLeaf <- extract(leaf, "leaf=\\-*\\d*\\.*\\d*")
-    coverBranch <- extract(branch, "cover=\\d*\\.*\\d*")
-    coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*")
-    dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=i]
-        
-    allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F)
-  }
+  allTrees <- xgb.model.dt.tree(feature_names, filename_dump, n_first_tree)
  
  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), j = "YesFeature", value = merge(copy(allTrees)[,ID:=Yes][, .(ID)], allTrees[,.(ID, Feature, Quality, Cover)], by = "ID")[,paste(Feature, "<br/>Cover: ", Cover, sep = "")])
  
--- a/R-package/man/xgb.model.dt.tree.Rd
+++ b/R-package/man/xgb.model.dt.tree.Rd
@ -0,0 +1,54 @@
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/xgb.model.dt.tree.R
+\name{xgb.model.dt.tree}
+\alias{xgb.model.dt.tree}
+\title{Convert tree model dump to data.table}
+\usage{
+xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL,
+  n_first_tree = NULL)
+}
+\arguments{
+\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
+
+\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
+
+\item{n_first_tree}{limit the conversion to the n first trees. If \code{NULL}, all trees of the model are converted. Performance can be low for huge models.}
+}
+\value{
+A \code{data.table} with one row per node of the model (split or leaf), with its gain, cover and a few other attributes.
+}
+\description{
+Read a tree model text dump and return a data.table.
+}
+\details{
+General function to convert a text dump of a tree model to a \code{data.table}. The purpose is to help the user explore the model and get a better understanding of it.
+
+The content of the \code{data.table} is organised that way:
+
+\itemize{
+\item \code{ID}: unique identifier of a node ;
+ \item \code{Feature}: feature used in the tree to operate a split. When Leaf is indicated, it is the end of a branch ;
+ \item \code{Split}: value of the chosen feature where is operated the split ;
+ \item \code{Yes}: ID of the next node in the branch when the split condition is met ;
+ \item \code{No}: ID of the next node in the branch when the split condition is not met ;
+ \item \code{Missing}: ID of the next node in the branch for observations where the feature used for the split is not provided ;
+ \item \code{Quality}: gain related to the split in this specific node (for a leaf, the value of the leaf) ;
+ \item \code{Cover}: metric to measure the number of observation affected by the split ;
+ \item \code{Tree}: ID of the tree. It is included in the main ID ;
+}
+}
+\examples{
+data(agaricus.train, package='xgboost')
+
+#Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
+#Each column of the sparse Matrix is a feature in one hot encoding format.
+train <- agaricus.train
+
+bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
+               eta = 1, nround = 2,objective = "binary:logistic")
+xgb.dump(bst, 'xgb.model.dump', with.stats = T)
+
+#agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
+xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], 'xgb.model.dump')
+}
+
--- a/R-package/man/xgb.plot.tree.Rd
+++ b/R-package/man/xgb.plot.tree.Rd
@ -17,14 +17,13 @@ xgb.plot.tree(feature_names = NULL, filename_dump = NULL,
 \item{style}{a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.}
 }
 \value{
-A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
+A \code{DiagrammeR} of the model.
 }
 \description{
-Read a xgboost model text dump.
+Read a tree model text dump.
+Plotting only works for boosted tree model (not linear model).
 }
 \details{
-Plotting only works for boosted tree model (not linear model).
-
 The content of each node is organised that way:

 \itemize{
@ -34,7 +33,7 @@ The content of each node is organised that way:
 }

 Each branch finishes with a leaf. For each leaf, only the \code{cover} is indicated.
-It uses Mermaid JS library for that purpose.
+It uses the \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpose.
 }
 \examples{
 data(agaricus.train, package='xgboost')