Merge pull request #1118 from khotilov/parsing_speedup

[R-package] xgb.model.dt.tree up to x100 faster
2016-05-17 17:48:11 +02:00
parent 49bbd72d08 611b317057
commit 51154f42fe
4 changed files with 126 additions and 152 deletions
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -43,19 +43,14 @@ importFrom(Matrix,sparse.model.matrix)
 importFrom(Matrix,sparseVector)
 importFrom(data.table,":=")
 importFrom(data.table,as.data.table)
-importFrom(data.table,copy)
 importFrom(data.table,data.table)
 importFrom(data.table,fread)
 importFrom(data.table,rbindlist)
-importFrom(data.table,set)
 importFrom(data.table,setnames)
 importFrom(magrittr,"%>%")
-importFrom(magrittr,add)
-importFrom(magrittr,not)
 importFrom(stringr,str_detect)
 importFrom(stringr,str_extract)
 importFrom(stringr,str_extract_all)
 importFrom(stringr,str_match)
 importFrom(stringr,str_replace)
 importFrom(stringr,str_split)
-importFrom(stringr,str_trim)
--- a/R-package/R/xgb.model.dt.tree.R
+++ b/R-package/R/xgb.model.dt.tree.R
@@ -1,153 +1,127 @@
-#' Parse boosted tree model text dump
+#' Parse a boosted tree model text dump
 #' 
-#' Parse a boosted tree model text dump and return a \code{data.table}.
+#' Parse a boosted tree model text dump into a \code{data.table} structure.
 #' 
 #' @importFrom data.table data.table
-#' @importFrom data.table set
-#' @importFrom data.table rbindlist
-#' @importFrom data.table copy
 #' @importFrom data.table :=
 #' @importFrom magrittr %>%
-#' @importFrom magrittr not
-#' @importFrom magrittr add
-#' @importFrom stringr str_extract
-#' @importFrom stringr str_split
-#' @importFrom stringr str_trim
-#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model already contains feature names, this argument should be \code{NULL} (default value).
-#' @param model object created by the \code{xgb.train} function.
-#' @param text \code{character} vector generated by the \code{xgb.dump} function. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}).
-#' @param n_first_tree limit the plot to the \code{n} first trees. If set to \code{NULL}, all trees of the model are plotted. Performance can be low depending of the size of the model.
-#'
-#' @return A \code{data.table} of the features used in the model with their gain, cover and few other information.
-#'
-#' @details 
-#' General function to convert a text dump of tree model to a \code{data.table}. 
-#' 
-#' The purpose is to help user to explore the model and get a better understanding of it.
+#' @importFrom stringr str_match
 #' 
+#' @param feature_names character vector of feature names. If the model already
+#'          contains feature names, this argument should be \code{NULL} (default value)
+#' @param model object of class \code{xgb.Booster}
+#' @param text \code{character} vector previously generated by the \code{xgb.dump} 
+#'          function  (where parameter \code{with.stats = TRUE} should have been set).
+#' @param n_first_tree limit the parsing to the \code{n} first trees. 
+#'          If set to \code{NULL}, all trees of the model are parsed.
+#'
+#' @return 
+#' A \code{data.table} with detailed information about model trees' nodes.
+#'
 #' The columns of the \code{data.table} are:
 #' 
 #' \itemize{
-#' \item \code{ID}: unique identifier of a node ;
-#'  \item \code{Feature}: feature used in the tree to operate a split. When Leaf is indicated, it is the end of a branch ;
-#'  \item \code{Split}: value of the chosen feature where is operated the split ;
-#'  \item \code{Yes}: ID of the feature for the next node in the branch when the split condition is met ;
-#'  \item \code{No}: ID of the feature for the next node in the branch when the split condition is not met ;
-#'  \item \code{Missing}: ID of the feature for the next node in the branch for observation where the feature used for the split are not provided ;
-#'  \item \code{Quality}: it's the gain related to the split in this specific node ;
-#'  \item \code{Cover}: metric to measure the number of observation affected by the split ;
-#'  \item \code{Tree}: ID of the tree. It is included in the main ID ;
-#'  \item \code{Yes.Feature}, \code{No.Feature}, \code{Yes.Cover}, \code{No.Cover}, \code{Yes.Quality} and \code{No.Quality}: data related to the pointer in \code{Yes} or \code{No} column ;
+#'  \item \code{Tree}: ID of a tree in a model
+#'  \item \code{Node}: ID of a node in a tree
+#'  \item \code{ID}: unique identifier of a node in a model
+#'  \item \code{Feature}: for a branch node, it's a feature id or name (when available);
+#'              for a leaf node, it simply labels it as \code{'Leaf'}
+#'  \item \code{Split}: location of the split for a branch node (split condition is always "less than")
+#'  \item \code{Yes}: ID of the next node when the split condition is met
+#'  \item \code{No}: ID of the next node when the split condition is not met
+#'  \item \code{Missing}: ID of the next node when the value of the split feature is missing
+#'  \item \code{Quality}: either the split gain (change in loss) or the leaf value
+#'  \item \code{Cover}: metric related to the number of observations either seen by a split
+#'                      or collected by a leaf during training.
 #' } 
-#'   
+#' 
 #' @examples
+#' # Basic use:
+#' 
 #' data(agaricus.train, package='xgboost')
 #' 
 #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, 
 #'                eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 #' 
-#' # agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
-#' xgb.model.dt.tree(feature_names = agaricus.train$data@@Dimnames[[2]], model = bst)
+#' (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))
 #' 
+#' 
+#' # How to look up the feature name of the split that follows each node's 'Yes' branch:
+#' 
+#' merge(dt, dt[, .(ID, Y.Feature=Feature)], by.x='Yes', by.y='ID', all.x=TRUE)[order(Tree,Node)]
+#'  
 #' @export
-xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, n_first_tree = NULL){
-
+xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
+                              n_first_tree = NULL){
+  
  if (!class(feature_names) %in% c("character", "NULL")) {
-    stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
+    stop("feature_names: Has to be a vector of character\n",
+         "  or NULL if the model dump already contains feature names.\n",
+         "  Look at this function documentation to see where to get feature names.")
  }
-
+  
  if (class(model) != "xgb.Booster" & class(text) != "character") {
-    "model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.\n" %>%
-      paste0("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.") %>%
-      stop()
+    stop("Either 'model' has to be an object of class xgb.Booster\n",
+         "  or 'text' has to be a character vector with the result of xgb.dump\n",
+         "  (or NULL if the model was provided).")
  }
-
+  
  if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) {
    stop("n_first_tree: Has to be a numeric vector of size 1.")
  }
-
-  if(is.null(text)){		
+  
+  if(is.null(text)){
    text <- xgb.dump(model = model, with.stats = T)
  }
  
-  position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text) + 1)
-
-  extract <- function(x, pattern)  str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist
-
-  n_round <- min(length(position) - 1, n_first_tree)
-
+  position <- which(!is.na(str_match(text, "booster")))
+  
  addTreeId <- function(x, i) paste(i,x,sep = "-")
-
-  allTrees <- data.table()
-
+  
  anynumber_regex <- "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?"
-  for (i in 1:n_round){
+  
+  td <- data.table(t=text)
+  td[position, Tree := 1L]
+  td[, Tree := cumsum(ifelse(is.na(Tree), 0L, Tree)) - 1L]
+  
+  n_first_tree <- min(max(td$Tree), n_first_tree)
+  td <- td[Tree <= n_first_tree & !grepl('^booster', t)]
+  
+  td[, Node := str_match(t, "(\\d+):")[,2] %>% as.numeric ]
+  td[, ID := addTreeId(Node, Tree)]
+  td[, isLeaf := !is.na(str_match(t, "leaf"))]

-    tree <- text[(position[i] + 1):(position[i + 1] - 1)]
-
-    # avoid tree made of a leaf only (no split)
-    if(length(tree) < 2) next
-
-    treeID <- i - 1
-
-    notLeaf <- str_match(tree, "leaf") %>% is.na
-    leaf <- notLeaf %>% not %>% tree[.]
-    branch <- notLeaf %>% tree[.]
-    idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID)
-    idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID)
-    featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric
-    if(!is.null(feature_names)){
-      featureBranch <- feature_names[featureBranch + 1]
-    }
-    featureLeaf <- rep("Leaf", length(leaf))
-    splitBranch <- str_extract(branch, paste0("<",anynumber_regex,"\\]")) %>% str_replace("<", "") %>% str_replace("\\]", "")
-    splitLeaf <- rep(NA, length(leaf))
-    yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(treeID)
-    yesLeaf <- rep(NA, length(leaf))
-    noBranch <- extract(branch, "no=\\d*") %>% addTreeId(treeID)
-    noLeaf <- rep(NA, length(leaf))
-    missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(treeID)
-    missingLeaf <- rep(NA, length(leaf))
-    qualityBranch <- extract(branch, paste0("gain=",anynumber_regex))
-    qualityLeaf <- extract(leaf, paste0("leaf=",anynumber_regex))
-    coverBranch <- extract(branch, "cover=\\d*\\.*\\d*")
-    coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*")
-    dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree := treeID]
-
-    allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F)
-  }
-
-  yes <- allTrees[!is.na(Yes), Yes]
-
-  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
-      j = "Yes.Feature",
-      value = allTrees[ID %in% yes, Feature])
-
-  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
-      j = "Yes.Cover",
-      value = allTrees[ID %in% yes, Cover])
-
-  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
-      j = "Yes.Quality",
-      value = allTrees[ID %in% yes, Quality])
-  no <- allTrees[!is.na(No), No]
-
-  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
-      j = "No.Feature",
-      value = allTrees[ID %in% no, Feature])
-
-  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
-      j = "No.Cover",
-      value = allTrees[ID %in% no, Cover])
-
-  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
-      j = "No.Quality",
-      value = allTrees[ID %in% no, Quality])
-
-  allTrees
+  # parse branch lines
+  td[isLeaf==FALSE, c("Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover") := {
+    rx <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
+                 "gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
+    # skip some indices with spurious capture groups from anynumber_regex
+    xtr <- str_match(t, rx)[, c(2,3,5,6,7,8,10)]
+    xtr[, 3:5] <- addTreeId(xtr[, 3:5], Tree)
+    lapply(1:ncol(xtr), function(i) xtr[,i])
+  }]
+  # assign feature_names when available
+  td[isLeaf==FALSE & !is.null(feature_names), 
+     Feature := feature_names[as.numeric(Feature) + 1] ]
+  
+  # parse leaf lines
+  td[isLeaf==TRUE, c("Feature", "Quality", "Cover") := {
+    rx <- paste0("leaf=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
+    xtr <- str_match(t, rx)[, c(2,4)]
+    c("Leaf", lapply(1:ncol(xtr), function(i) xtr[,i]))
+  }]
+  
+  # convert some columns to numeric
+  numeric_cols <- c("Quality", "Cover")
+  td[, (numeric_cols) := lapply(.SD, as.numeric), .SDcols=numeric_cols]
+  
+  td[, t := NULL]
+  td[, isLeaf := NULL]
+  
+  td[order(Tree, Node)]
 }

 # Avoid error messages during CRAN check.
 # The reason is that these variables are never declared
 # They are mainly column names inferred by Data.table...
-globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequency"))
+globalVariables(c("Tree", "Node", "ID", "Feature", "t", "isLeaf",".SD", ".SDcols"))
--- a/R-package/man/xgb.model.dt.tree.Rd
+++ b/R-package/man/xgb.model.dt.tree.Rd
@@ -2,54 +2,60 @@
 % Please edit documentation in R/xgb.model.dt.tree.R
 \name{xgb.model.dt.tree}
 \alias{xgb.model.dt.tree}
-\title{Parse boosted tree model text dump}
+\title{Parse a boosted tree model text dump}
 \usage{
 xgb.model.dt.tree(feature_names = NULL, model = NULL, text = NULL,
  n_first_tree = NULL)
 }
 \arguments{
-\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model already contains feature names, this argument should be \code{NULL} (default value).}
+\item{feature_names}{character vector of feature names. If the model already
+contains feature names, this argument should be \code{NULL} (default value)}

-\item{model}{object created by the \code{xgb.train} function.}
+\item{model}{object of class \code{xgb.Booster}}

-\item{text}{\code{character} vector generated by the \code{xgb.dump} function. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}).}
+\item{text}{\code{character} vector previously generated by the \code{xgb.dump} 
+function  (where parameter \code{with.stats = TRUE} should have been set).}

-\item{n_first_tree}{limit the plot to the \code{n} first trees. If set to \code{NULL}, all trees of the model are plotted. Performance can be low depending of the size of the model.}
+\item{n_first_tree}{limit the parsing to the \code{n} first trees. 
+If set to \code{NULL}, all trees of the model are parsed.}
 }
 \value{
-A \code{data.table} of the features used in the model with their gain, cover and few other information.
-}
-\description{
-Parse a boosted tree model text dump and return a \code{data.table}.
-}
-\details{
-General function to convert a text dump of tree model to a \code{data.table}. 
-
-The purpose is to help user to explore the model and get a better understanding of it.
+A \code{data.table} with detailed information about model trees' nodes.

 The columns of the \code{data.table} are:

 \itemize{
-\item \code{ID}: unique identifier of a node ;
- \item \code{Feature}: feature used in the tree to operate a split. When Leaf is indicated, it is the end of a branch ;
- \item \code{Split}: value of the chosen feature where is operated the split ;
- \item \code{Yes}: ID of the feature for the next node in the branch when the split condition is met ;
- \item \code{No}: ID of the feature for the next node in the branch when the split condition is not met ;
- \item \code{Missing}: ID of the feature for the next node in the branch for observation where the feature used for the split are not provided ;
- \item \code{Quality}: it's the gain related to the split in this specific node ;
- \item \code{Cover}: metric to measure the number of observation affected by the split ;
- \item \code{Tree}: ID of the tree. It is included in the main ID ;
- \item \code{Yes.Feature}, \code{No.Feature}, \code{Yes.Cover}, \code{No.Cover}, \code{Yes.Quality} and \code{No.Quality}: data related to the pointer in \code{Yes} or \code{No} column ;
+ \item \code{Tree}: ID of a tree in a model
+ \item \code{Node}: ID of a node in a tree
+ \item \code{ID}: unique identifier of a node in a model
+ \item \code{Feature}: for a branch node, it's a feature id or name (when available);
+             for a leaf node, it simply labels it as \code{'Leaf'}
+ \item \code{Split}: location of the split for a branch node (split condition is always "less than")
+ \item \code{Yes}: ID of the next node when the split condition is met
+ \item \code{No}: ID of the next node when the split condition is not met
+ \item \code{Missing}: ID of the next node when the value of the split feature is missing
+ \item \code{Quality}: either the split gain (change in loss) or the leaf value
+ \item \code{Cover}: metric related to the number of observations either seen by a split
+                     or collected by a leaf during training.
 }
 }
+\description{
+Parse a boosted tree model text dump into a \code{data.table} structure.
+}
 \examples{
+# Basic use:
+
 data(agaricus.train, package='xgboost')

 bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, 
               eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")

-# agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
-xgb.model.dt.tree(feature_names = agaricus.train$data@Dimnames[[2]], model = bst)
+(dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))

+
+# How to look up the feature name of the split that follows each node's 'Yes' branch:
+
+merge(dt, dt[, .(ID, Y.Feature=Feature)], by.x='Yes', by.y='ID', all.x=TRUE)[order(Tree,Node)]
+ 
 }

--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@@ -20,7 +20,7 @@ bst.Tree <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
 bst.GLM <- xgboost(data = sparse_matrix, label = output_vector,
                   eta = 1, nthread = 2, nround = 10, objective = "binary:logistic", booster = "gblinear")

-feature.names <- agaricus.train$data@Dimnames[[2]]
+feature.names <- colnames(agaricus.train$data)

 test_that("xgb.dump works", {
  capture.output(print(xgb.dump(bst.Tree)))
@@ -57,11 +57,10 @@ test_that("xgb-attribute functionality", {
 })

 test_that("xgb.model.dt.tree works with and without feature names", {
-  names.dt.trees <- c("ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover",
-   "Tree", "Yes.Feature", "Yes.Cover", "Yes.Quality", "No.Feature", "No.Cover", "No.Quality")
+  names.dt.trees <- c("Tree", "Node", "ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover")
  dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst.Tree)
  expect_equal(names.dt.trees, names(dt.tree))
-  expect_equal(dim(dt.tree), c(162, 15))
+  expect_equal(dim(dt.tree), c(162, 10))
  xgb.model.dt.tree(model = bst.Tree)
 })