|
|
|
|
@@ -1,153 +1,106 @@
|
|
|
|
|
#' Parse boosted tree model text dump
|
|
|
|
|
#'
|
|
|
|
|
#' Parse a boosted tree model text dump and return a \code{data.table}.
|
|
|
|
|
#' Parse a boosted tree model text dump into a \code{data.table} structure.
|
|
|
|
|
#'
|
|
|
|
|
#' @importFrom data.table data.table
|
|
|
|
|
#' @importFrom data.table set
|
|
|
|
|
#' @importFrom data.table rbindlist
|
|
|
|
|
#' @importFrom data.table copy
|
|
|
|
|
#' @importFrom data.table :=
|
|
|
|
|
#' @importFrom magrittr %>%
|
|
|
|
|
#' @importFrom magrittr not
|
|
|
|
|
#' @importFrom magrittr add
|
|
|
|
|
#' @importFrom stringr str_extract
|
|
|
|
|
#' @importFrom stringr str_split
|
|
|
|
|
#' @importFrom stringr str_trim
|
|
|
|
|
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model already contains feature names, this argument should be \code{NULL} (default value).
|
|
|
|
|
#' @param model object created by the \code{xgb.train} function.
|
|
|
|
|
#' @param text \code{character} vector generated by the \code{xgb.dump} function. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}).
|
|
|
|
|
#' @param n_first_tree limit the plot to the \code{n} first trees. If set to \code{NULL}, all trees of the model are plotted. Performance can be low depending of the size of the model.
|
|
|
|
|
#'
|
|
|
|
|
#' @return A \code{data.table} of the features used in the model with their gain, cover and few other information.
|
|
|
|
|
#'
|
|
|
|
|
#' @details
|
|
|
|
|
#' General function to convert a text dump of tree model to a \code{data.table}.
|
|
|
|
|
#'
|
|
|
|
|
#' The purpose is to help user to explore the model and get a better understanding of it.
|
|
|
|
|
#' @importFrom stringr str_match
|
|
|
|
|
#'
|
|
|
|
|
#' @param feature_names character vector of feature names. If the model already
|
|
|
|
|
#' contains feature names, this argument should be \code{NULL} (default value)
|
|
|
|
|
#' @param model object of class \code{xgb.Booster}
|
|
|
|
|
#' @param text \code{character} vector previously generated by the \code{xgb.dump}
|
|
|
|
|
#' function (where parameter \code{with.stats = TRUE} should have been set).
|
|
|
|
|
#' @param n_first_tree limit the parsing to the \code{n} first trees.
|
|
|
|
|
#' If set to \code{NULL}, all trees of the model are parsed.
|
|
|
|
|
#'
|
|
|
|
|
#' @return
|
|
|
|
|
#' A \code{data.table} with detailed information about model trees' nodes.
|
|
|
|
|
#'
|
|
|
|
|
#' The columns of the \code{data.table} are:
|
|
|
|
|
#'
|
|
|
|
|
#' \itemize{
|
|
|
|
|
#' \item \code{ID}: unique identifier of a node ;
|
|
|
|
|
#' \item \code{Feature}: feature used in the tree to operate a split. When Leaf is indicated, it is the end of a branch ;
|
|
|
|
|
#' \item \code{Split}: value of the chosen feature where is operated the split ;
|
|
|
|
|
#' \item \code{Yes}: ID of the feature for the next node in the branch when the split condition is met ;
|
|
|
|
|
#' \item \code{No}: ID of the feature for the next node in the branch when the split condition is not met ;
|
|
|
|
|
#' \item \code{Missing}: ID of the feature for the next node in the branch for observation where the feature used for the split are not provided ;
|
|
|
|
|
#' \item \code{Quality}: it's the gain related to the split in this specific node ;
|
|
|
|
|
#' \item \code{Cover}: metric to measure the number of observation affected by the split ;
|
|
|
|
|
#' \item \code{Tree}: ID of the tree. It is included in the main ID ;
|
|
|
|
|
#' \item \code{Yes.Feature}, \code{No.Feature}, \code{Yes.Cover}, \code{No.Cover}, \code{Yes.Quality} and \code{No.Quality}: data related to the pointer in \code{Yes} or \code{No} column ;
|
|
|
|
|
#' \item \code{Tree}: ID of a tree in a model
|
|
|
|
|
#' \item \code{Node}: ID of a node in a tree
|
|
|
|
|
#' \item \code{ID}: unique identifier of a node in a model
|
|
|
|
|
#' \item \code{Feature}: for a branch node, it's a feature id or name (when available);
|
|
|
|
|
#' for a leaf note, it simply labels it as \code{'Leaf'}
|
|
|
|
|
#' \item \code{Split}: location of the split for a branch node (split condition is always "less than")
|
|
|
|
|
#' \item \code{Yes}: ID of the next node when the split condition is met
|
|
|
|
|
#' \item \code{No}: ID of the next node when the split condition is not met
|
|
|
|
|
#' \item \code{Missing}: ID of the next node when branch value is missing
|
|
|
|
|
#' \item \code{Quality}: either the split gain or the leaf value
|
|
|
|
|
#' \item \code{Cover}: metric related to the number of observation either seen by a split split
|
|
|
|
|
#' or collected by a leaf during training.
|
|
|
|
|
#' }
|
|
|
|
|
#'
|
|
|
|
|
#'
|
|
|
|
|
#' @examples
|
|
|
|
|
#' data(agaricus.train, package='xgboost')
|
|
|
|
|
#'
|
|
|
|
|
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
|
|
|
|
|
#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
|
|
|
|
|
#'
|
|
|
|
|
#' # agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
|
|
|
|
|
#' xgb.model.dt.tree(feature_names = agaricus.train$data@@Dimnames[[2]], model = bst)
|
|
|
|
|
#' xgb.model.dt.tree(colnames(agaricus.train$data), bst)
|
|
|
|
|
#'
|
|
|
|
|
#' @export
|
|
|
|
|
xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, n_first_tree = NULL){
|
|
|
|
|
|
|
|
|
|
xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
|
|
|
|
|
n_first_tree = NULL){
|
|
|
|
|
|
|
|
|
|
if (!class(feature_names) %in% c("character", "NULL")) {
|
|
|
|
|
stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
|
|
|
|
|
stop("feature_names: Has to be a vector of character\n",
|
|
|
|
|
" or NULL if the model dump already contains feature names.\n",
|
|
|
|
|
" Look at this function documentation to see where to get feature names.")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (class(model) != "xgb.Booster" & class(text) != "character") {
|
|
|
|
|
"model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.\n" %>%
|
|
|
|
|
paste0("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.") %>%
|
|
|
|
|
stop()
|
|
|
|
|
stop("Either 'model' has to be an object of class xgb.Booster\n",
|
|
|
|
|
" or 'text' has to be a character vector with the result of xgb.dump\n",
|
|
|
|
|
" (or NULL if the model was provided).")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) {
|
|
|
|
|
stop("n_first_tree: Has to be a numeric vector of size 1.")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if(is.null(text)){
|
|
|
|
|
|
|
|
|
|
if(is.null(text)){
|
|
|
|
|
text <- xgb.dump(model = model, with.stats = T)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text) + 1)
|
|
|
|
|
|
|
|
|
|
extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist
|
|
|
|
|
|
|
|
|
|
n_round <- min(length(position) - 1, n_first_tree)
|
|
|
|
|
|
|
|
|
|
position <- which(!is.na(str_match(text, "booster")))
|
|
|
|
|
|
|
|
|
|
addTreeId <- function(x, i) paste(i,x,sep = "-")
|
|
|
|
|
|
|
|
|
|
allTrees <- data.table()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
anynumber_regex <- "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?"
|
|
|
|
|
for (i in 1:n_round){
|
|
|
|
|
|
|
|
|
|
td <- data.table(t=text)
|
|
|
|
|
td[position, Tree := 1L]
|
|
|
|
|
td[, Tree := cumsum(ifelse(is.na(Tree), 0L, Tree)) - 1L]
|
|
|
|
|
|
|
|
|
|
n_first_tree <- min(max(td$Tree), n_first_tree)
|
|
|
|
|
td <- td[Tree <= n_first_tree & !grepl('^booster', t)]
|
|
|
|
|
|
|
|
|
|
td[, Node := str_match(t, "(\\d+):")[,2] %>% as.numeric ]
|
|
|
|
|
td[, ID := addTreeId(Node, Tree)]
|
|
|
|
|
td[, isLeaf := !is.na(str_match(t, "leaf"))]
|
|
|
|
|
td[isLeaf==TRUE, Feature := "Leaf"]
|
|
|
|
|
td[isLeaf==FALSE, Feature := str_match(t, "f(\\d+)<")[,2] ]
|
|
|
|
|
td[isLeaf==FALSE & !is.null(feature_names), Feature := feature_names[as.numeric(Feature) + 1] ]
|
|
|
|
|
td[isLeaf==FALSE, Split := str_match(t, paste0("<(",anynumber_regex,")\\]"))[,2] ]
|
|
|
|
|
td[isLeaf==FALSE, Yes := str_match(t, "yes=(\\d+)")[,2] %>% addTreeId(Tree) ]
|
|
|
|
|
td[isLeaf==FALSE, No := str_match(t, "no=(\\d+)")[,2] %>% addTreeId(Tree) ]
|
|
|
|
|
td[isLeaf==FALSE, Missing := str_match(t, "missing=(\\d+)")[,2] %>% addTreeId(Tree) ]
|
|
|
|
|
td[isLeaf==FALSE, Quality := str_match(t, paste0("gain=(",anynumber_regex,")"))[,2] %>% as.numeric ]
|
|
|
|
|
td[isLeaf==TRUE, Quality := str_match(t, paste0("leaf=(",anynumber_regex,")"))[,2] %>% as.numeric ]
|
|
|
|
|
td[, Cover := str_match(t, paste0("cover=(\\d*\\.*\\d*)"))[,2] %>% as.numeric ]
|
|
|
|
|
|
|
|
|
|
td[, t := NULL]
|
|
|
|
|
td[, isLeaf := NULL]
|
|
|
|
|
|
|
|
|
|
tree <- text[(position[i] + 1):(position[i + 1] - 1)]
|
|
|
|
|
|
|
|
|
|
# avoid tree made of a leaf only (no split)
|
|
|
|
|
if(length(tree) < 2) next
|
|
|
|
|
|
|
|
|
|
treeID <- i - 1
|
|
|
|
|
|
|
|
|
|
notLeaf <- str_match(tree, "leaf") %>% is.na
|
|
|
|
|
leaf <- notLeaf %>% not %>% tree[.]
|
|
|
|
|
branch <- notLeaf %>% tree[.]
|
|
|
|
|
idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID)
|
|
|
|
|
idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID)
|
|
|
|
|
featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric
|
|
|
|
|
if(!is.null(feature_names)){
|
|
|
|
|
featureBranch <- feature_names[featureBranch + 1]
|
|
|
|
|
}
|
|
|
|
|
featureLeaf <- rep("Leaf", length(leaf))
|
|
|
|
|
splitBranch <- str_extract(branch, paste0("<",anynumber_regex,"\\]")) %>% str_replace("<", "") %>% str_replace("\\]", "")
|
|
|
|
|
splitLeaf <- rep(NA, length(leaf))
|
|
|
|
|
yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(treeID)
|
|
|
|
|
yesLeaf <- rep(NA, length(leaf))
|
|
|
|
|
noBranch <- extract(branch, "no=\\d*") %>% addTreeId(treeID)
|
|
|
|
|
noLeaf <- rep(NA, length(leaf))
|
|
|
|
|
missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(treeID)
|
|
|
|
|
missingLeaf <- rep(NA, length(leaf))
|
|
|
|
|
qualityBranch <- extract(branch, paste0("gain=",anynumber_regex))
|
|
|
|
|
qualityLeaf <- extract(leaf, paste0("leaf=",anynumber_regex))
|
|
|
|
|
coverBranch <- extract(branch, "cover=\\d*\\.*\\d*")
|
|
|
|
|
coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*")
|
|
|
|
|
dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree := treeID]
|
|
|
|
|
|
|
|
|
|
allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
yes <- allTrees[!is.na(Yes), Yes]
|
|
|
|
|
|
|
|
|
|
set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
|
|
|
|
|
j = "Yes.Feature",
|
|
|
|
|
value = allTrees[ID %in% yes, Feature])
|
|
|
|
|
|
|
|
|
|
set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
|
|
|
|
|
j = "Yes.Cover",
|
|
|
|
|
value = allTrees[ID %in% yes, Cover])
|
|
|
|
|
|
|
|
|
|
set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
|
|
|
|
|
j = "Yes.Quality",
|
|
|
|
|
value = allTrees[ID %in% yes, Quality])
|
|
|
|
|
no <- allTrees[!is.na(No), No]
|
|
|
|
|
|
|
|
|
|
set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
|
|
|
|
|
j = "No.Feature",
|
|
|
|
|
value = allTrees[ID %in% no, Feature])
|
|
|
|
|
|
|
|
|
|
set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
|
|
|
|
|
j = "No.Cover",
|
|
|
|
|
value = allTrees[ID %in% no, Cover])
|
|
|
|
|
|
|
|
|
|
set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
|
|
|
|
|
j = "No.Quality",
|
|
|
|
|
value = allTrees[ID %in% no, Quality])
|
|
|
|
|
|
|
|
|
|
allTrees
|
|
|
|
|
td
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# Avoid error messages during CRAN check.
|
|
|
|
|
# The reason is that these variables are never declared
|
|
|
|
|
# They are mainly column names inferred by Data.table...
|
|
|
|
|
globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequency"))
|
|
|
|
|
globalVariables(c("Tree", "Node", "ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover"))
|