#' Parse boosted tree model text dump #' #' Parse a boosted tree model text dump and return a \code{data.table}. #' #' @importFrom data.table data.table #' @importFrom data.table set #' @importFrom data.table rbindlist #' @importFrom data.table copy #' @importFrom data.table := #' @importFrom magrittr %>% #' @importFrom magrittr not #' @importFrom magrittr add #' @importFrom stringr str_extract #' @importFrom stringr str_split #' @importFrom stringr str_trim #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model already contains feature names, this argument should be \code{NULL} (default value). #' @param model object created by the \code{xgb.train} function. #' @param text \code{character} vector generated by the \code{xgb.dump} function. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}). #' @param n_first_tree limit the plot to the \code{n} first trees. If set to \code{NULL}, all trees of the model are plotted. Performance can be low depending of the size of the model. #' #' @return A \code{data.table} of the features used in the model with their gain, cover and few other information. #' #' @details #' General function to convert a text dump of tree model to a \code{data.table}. #' #' The purpose is to help user to explore the model and get a better understanding of it. #' #' The columns of the \code{data.table} are: #' #' \itemize{ #' \item \code{ID}: unique identifier of a node ; #' \item \code{Feature}: feature used in the tree to operate a split. When Leaf is indicated, it is the end of a branch ; #' \item \code{Split}: value of the chosen feature where is operated the split ; #' \item \code{Yes}: ID of the feature for the next node in the branch when the split condition is met ; #' \item \code{No}: ID of the feature for the next node in the branch when the split condition is not met ; #' \item \code{Missing}: ID of the feature for the next node in the branch for observation where the feature used for the split are not provided ; #' \item \code{Quality}: it's the gain related to the split in this specific node ; #' \item \code{Cover}: metric to measure the number of observation affected by the split ; #' \item \code{Tree}: ID of the tree. It is included in the main ID ; #' \item \code{Yes.Feature}, \code{No.Feature}, \code{Yes.Cover}, \code{No.Cover}, \code{Yes.Quality} and \code{No.Quality}: data related to the pointer in \code{Yes} or \code{No} column ; #' } #' #' @examples #' data(agaricus.train, package='xgboost') #' #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, #' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") #' #' # agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix. #' xgb.model.dt.tree(feature_names = agaricus.train$data@@Dimnames[[2]], model = bst) #' #' @export xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, n_first_tree = NULL){ if (!class(feature_names) %in% c("character", "NULL")) { stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.") } if (class(model) != "xgb.Booster" & class(text) != "character") { "model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.\n" %>% paste0("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.") %>% stop() } if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) { stop("n_first_tree: Has to be a numeric vector of size 1.") } if(is.null(text)){ text <- xgb.dump(model = model, with.stats = T) } position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text) + 1) extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist n_round <- min(length(position) - 1, n_first_tree) addTreeId <- function(x, i) paste(i,x,sep = "-") allTrees <- data.table() anynumber_regex <- "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?" for (i in 1:n_round){ tree <- text[(position[i] + 1):(position[i + 1] - 1)] # avoid tree made of a leaf only (no split) if(length(tree) < 2) next treeID <- i - 1 notLeaf <- str_match(tree, "leaf") %>% is.na leaf <- notLeaf %>% not %>% tree[.] branch <- notLeaf %>% tree[.] idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID) idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID) featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric if(!is.null(feature_names)){ featureBranch <- feature_names[featureBranch + 1] } featureLeaf <- rep("Leaf", length(leaf)) splitBranch <- str_extract(branch, paste0("<",anynumber_regex,"\\]")) %>% str_replace("<", "") %>% str_replace("\\]", "") splitLeaf <- rep(NA, length(leaf)) yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(treeID) yesLeaf <- rep(NA, length(leaf)) noBranch <- extract(branch, "no=\\d*") %>% addTreeId(treeID) noLeaf <- rep(NA, length(leaf)) missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(treeID) missingLeaf <- rep(NA, length(leaf)) qualityBranch <- extract(branch, paste0("gain=",anynumber_regex)) qualityLeaf <- extract(leaf, paste0("leaf=",anynumber_regex)) coverBranch <- extract(branch, "cover=\\d*\\.*\\d*") coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*") dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree := treeID] allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F) } yes <- allTrees[!is.na(Yes), Yes] set(allTrees, i = which(allTrees[, Feature] != "Leaf"), j = "Yes.Feature", value = allTrees[ID %in% yes, Feature]) set(allTrees, i = which(allTrees[, Feature] != "Leaf"), j = "Yes.Cover", value = allTrees[ID %in% yes, Cover]) set(allTrees, i = which(allTrees[, Feature] != "Leaf"), j = "Yes.Quality", value = allTrees[ID %in% yes, Quality]) no <- allTrees[!is.na(No), No] set(allTrees, i = which(allTrees[, Feature] != "Leaf"), j = "No.Feature", value = allTrees[ID %in% no, Feature]) set(allTrees, i = which(allTrees[, Feature] != "Leaf"), j = "No.Cover", value = allTrees[ID %in% no, Cover]) set(allTrees, i = which(allTrees[, Feature] != "Leaf"), j = "No.Quality", value = allTrees[ID %in% no, Quality]) allTrees } # Avoid error messages during CRAN check. # The reason is that these variables are never declared # They are mainly column names inferred by Data.table... globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequency"))