From be65949ba276dd12de3b2e3f858f74bcf6ee0a6d Mon Sep 17 00:00:00 2001
From: Vadim Khotilovich
Date: Wed, 13 Apr 2016 02:32:00 -0500
Subject: [PATCH] xgb.model.dt.tree up to x100 faster

---
 R-package/NAMESPACE                     |   5 -
 R-package/R/xgb.model.dt.tree.R         | 185 +++++++++---------
 R-package/man/xgb.model.dt.tree.Rd      |  49 +++----
 R-package/tests/testthat/test_helpers.R |   7 +-
 4 files changed, 96 insertions(+), 150 deletions(-)

diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index f9f602ce7..d61e9ddae 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -43,19 +43,14 @@ importFrom(Matrix,sparse.model.matrix)
 importFrom(Matrix,sparseVector)
 importFrom(data.table,":=")
 importFrom(data.table,as.data.table)
-importFrom(data.table,copy)
 importFrom(data.table,data.table)
 importFrom(data.table,fread)
 importFrom(data.table,rbindlist)
-importFrom(data.table,set)
 importFrom(data.table,setnames)
 importFrom(magrittr,"%>%")
-importFrom(magrittr,add)
-importFrom(magrittr,not)
 importFrom(stringr,str_detect)
 importFrom(stringr,str_extract)
 importFrom(stringr,str_extract_all)
 importFrom(stringr,str_match)
 importFrom(stringr,str_replace)
 importFrom(stringr,str_split)
-importFrom(stringr,str_trim)
diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R
index 0083dae93..052312836 100644
--- a/R-package/R/xgb.model.dt.tree.R
+++ b/R-package/R/xgb.model.dt.tree.R
@@ -1,153 +1,106 @@
 #' Parse boosted tree model text dump
 #'
-#' Parse a boosted tree model text dump and return a \code{data.table}.
+#' Parse a boosted tree model text dump into a \code{data.table} structure.
 #'
 #' @importFrom data.table data.table
-#' @importFrom data.table set
-#' @importFrom data.table rbindlist
-#' @importFrom data.table copy
 #' @importFrom data.table :=
 #' @importFrom magrittr %>%
-#' @importFrom magrittr not
-#' @importFrom magrittr add
-#' @importFrom stringr str_extract
-#' @importFrom stringr str_split
-#' @importFrom stringr str_trim
-#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model already contains feature names, this argument should be \code{NULL} (default value).
-#' @param model object created by the \code{xgb.train} function.
-#' @param text \code{character} vector generated by the \code{xgb.dump} function. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}).
-#' @param n_first_tree limit the plot to the \code{n} first trees. If set to \code{NULL}, all trees of the model are plotted. Performance can be low depending of the size of the model.
-#'
-#' @return A \code{data.table} of the features used in the model with their gain, cover and few other information.
-#'
-#' @details
-#' General function to convert a text dump of tree model to a \code{data.table}.
-#'
-#' The purpose is to help user to explore the model and get a better understanding of it.
+#' @importFrom stringr str_match
 #'
+#' @param feature_names character vector of feature names. If the model already
+#' contains feature names, this argument should be \code{NULL} (default value)
+#' @param model object of class \code{xgb.Booster}
+#' @param text \code{character} vector previously generated by the \code{xgb.dump}
+#' function (where parameter \code{with.stats = TRUE} should have been set).
+#' @param n_first_tree limit the parsing to the \code{n} first trees.
+#' If set to \code{NULL}, all trees of the model are parsed.
+#'
+#' @return
+#' A \code{data.table} with detailed information about model trees' nodes.
+#'
 #' The columns of the \code{data.table} are:
 #'
 #' \itemize{
-#'  \item \code{ID}: unique identifier of a node ;
-#'  \item \code{Feature}: feature used in the tree to operate a split. When Leaf is indicated, it is the end of a branch ;
-#'  \item \code{Split}: value of the chosen feature where is operated the split ;
-#'  \item \code{Yes}: ID of the feature for the next node in the branch when the split condition is met ;
-#'  \item \code{No}: ID of the feature for the next node in the branch when the split condition is not met ;
-#'  \item \code{Missing}: ID of the feature for the next node in the branch for observation where the feature used for the split are not provided ;
-#'  \item \code{Quality}: it's the gain related to the split in this specific node ;
-#'  \item \code{Cover}: metric to measure the number of observation affected by the split ;
-#'  \item \code{Tree}: ID of the tree. It is included in the main ID ;
-#'  \item \code{Yes.Feature}, \code{No.Feature}, \code{Yes.Cover}, \code{No.Cover}, \code{Yes.Quality} and \code{No.Quality}: data related to the pointer in \code{Yes} or \code{No} column ;
+#'  \item \code{Tree}: ID of a tree in a model
+#'  \item \code{Node}: ID of a node in a tree
+#'  \item \code{ID}: unique identifier of a node in a model
+#'  \item \code{Feature}: for a branch node, it's a feature id or name (when available);
+#'        for a leaf node, it simply labels it as \code{'Leaf'}
+#'  \item \code{Split}: location of the split for a branch node (split condition is always "less than")
+#'  \item \code{Yes}: ID of the next node when the split condition is met
+#'  \item \code{No}: ID of the next node when the split condition is not met
+#'  \item \code{Missing}: ID of the next node when the branch value is missing
+#'  \item \code{Quality}: either the split gain or the leaf value
+#'  \item \code{Cover}: metric related to the number of observations either seen by a split
+#'        or collected by a leaf during training.
 #' }
-#'
+#'
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #'
 #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
 #'                eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 #'
-#' # agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
-#' xgb.model.dt.tree(feature_names = agaricus.train$data@@Dimnames[[2]], model = bst)
+#' xgb.model.dt.tree(colnames(agaricus.train$data), bst)
 #'
 #' @export
-xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, n_first_tree = NULL){
-  
+xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
+                              n_first_tree = NULL){
+  
   if (!class(feature_names) %in% c("character", "NULL")) {
-    stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
+    stop("feature_names: Has to be a vector of character\n",
+         "  or NULL if the model dump already contains feature names.\n",
+         "  Look at this function documentation to see where to get feature names.")
   }
-  
+  
   if (class(model) != "xgb.Booster" & class(text) != "character") {
-    "model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.\n" %>%
-      paste0("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.") %>%
-      stop()
+    stop("Either 'model' has to be an object of class xgb.Booster\n",
+         "  or 'text' has to be a character vector with the result of xgb.dump\n",
+         "  (or NULL if the model was provided).")
   }
-  
+  
   if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) {
     stop("n_first_tree: Has to be a numeric vector of size 1.")
  }
-  
-  if(is.null(text)){
+  
+  if(is.null(text)){
     text <- xgb.dump(model = model, with.stats = T)
   }
 
-  position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text) + 1)
-  
-  extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist
-  
-  n_round <- min(length(position) - 1, n_first_tree)
-  
+  position <- which(!is.na(str_match(text, "booster")))
 
   addTreeId <- function(x, i) paste(i,x,sep = "-")
-  
-  allTrees <- data.table()
-  
+  
   anynumber_regex <- "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?"
-  for (i in 1:n_round){
+  
+  td <- data.table(t=text)
+  td[position, Tree := 1L]
+  td[, Tree := cumsum(ifelse(is.na(Tree), 0L, Tree)) - 1L]
+  
+  n_first_tree <- min(max(td$Tree), n_first_tree)
+  td <- td[Tree <= n_first_tree & !grepl('^booster', t)]
+  
+  td[, Node := str_match(t, "(\\d+):")[,2] %>% as.numeric ]
+  td[, ID := addTreeId(Node, Tree)]
+  td[, isLeaf := !is.na(str_match(t, "leaf"))]
+  td[isLeaf==TRUE, Feature := "Leaf"]
+  td[isLeaf==FALSE, Feature := str_match(t, "f(\\d+)<")[,2] ]
+  td[isLeaf==FALSE & !is.null(feature_names), Feature := feature_names[as.numeric(Feature) + 1] ]
+  td[isLeaf==FALSE, Split := str_match(t, paste0("<(",anynumber_regex,")\\]"))[,2] ]
+  td[isLeaf==FALSE, Yes := str_match(t, "yes=(\\d+)")[,2] %>% addTreeId(Tree) ]
+  td[isLeaf==FALSE, No := str_match(t, "no=(\\d+)")[,2] %>% addTreeId(Tree) ]
+  td[isLeaf==FALSE, Missing := str_match(t, "missing=(\\d+)")[,2] %>% addTreeId(Tree) ]
+  td[isLeaf==FALSE, Quality := str_match(t, paste0("gain=(",anynumber_regex,")"))[,2] %>% as.numeric ]
+  td[isLeaf==TRUE, Quality := str_match(t, paste0("leaf=(",anynumber_regex,")"))[,2] %>% as.numeric ]
+  td[, Cover := str_match(t, paste0("cover=(\\d*\\.*\\d*)"))[,2] %>% as.numeric ]
+  
+  td[, t := NULL]
+  td[, isLeaf := NULL]
 
-    tree <- text[(position[i] + 1):(position[i + 1] - 1)]
-    
-    # avoid tree made of a leaf only (no split)
-    if(length(tree) < 2) next
-    
-    treeID <- i - 1
-    
-    notLeaf <- str_match(tree, "leaf") %>% is.na
-    leaf <- notLeaf %>% not %>% tree[.]
-    branch <- notLeaf %>% tree[.]
-    idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID)
-    idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID)
-    featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric
-    if(!is.null(feature_names)){
-      featureBranch <- feature_names[featureBranch + 1]
-    }
-    featureLeaf <- rep("Leaf", length(leaf))
-    splitBranch <- str_extract(branch, paste0("<",anynumber_regex,"\\]")) %>% str_replace("<", "") %>% str_replace("\\]", "")
-    splitLeaf <- rep(NA, length(leaf))
-    yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(treeID)
-    yesLeaf <- rep(NA, length(leaf))
-    noBranch <- extract(branch, "no=\\d*") %>% addTreeId(treeID)
-    noLeaf <- rep(NA, length(leaf))
-    missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(treeID)
-    missingLeaf <- rep(NA, length(leaf))
-    qualityBranch <- extract(branch, paste0("gain=",anynumber_regex))
-    qualityLeaf <- extract(leaf, paste0("leaf=",anynumber_regex))
-    coverBranch <- extract(branch, "cover=\\d*\\.*\\d*")
-    coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*")
-    dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree := treeID]
-    
-    allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F)
-  }
-  
-  yes <- allTrees[!is.na(Yes), Yes]
-  
-  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
-      j = "Yes.Feature",
-      value = allTrees[ID %in% yes, Feature])
-  
-  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
-      j = "Yes.Cover",
-      value = allTrees[ID %in% yes, Cover])
-  
-  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
-      j = "Yes.Quality",
-      value = allTrees[ID %in% yes, Quality])
-  no <- allTrees[!is.na(No), No]
-  
-  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
-      j = "No.Feature",
-      value = allTrees[ID %in% no, Feature])
-  
-  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
-      j = "No.Cover",
-      value = allTrees[ID %in% no, Cover])
-  
-  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
-      j = "No.Quality",
-      value = allTrees[ID %in% no, Quality])
-  
-  allTrees
+  td
 }
 
 # Avoid error messages during CRAN check.
 # The reason is that these variables are never declared
 # They are mainly column names inferred by Data.table...
-globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequency"))
\ No newline at end of file
+globalVariables(c("Tree", "Node", "ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover"))
\ No newline at end of file
diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd
index c82ba3cf4..91bd03a3d 100644
--- a/R-package/man/xgb.model.dt.tree.Rd
+++ b/R-package/man/xgb.model.dt.tree.Rd
@@ -8,48 +8,47 @@ xgb.model.dt.tree(feature_names = NULL, model = NULL, text = NULL,
   n_first_tree = NULL)
 }
 \arguments{
-\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model already contains feature names, this argument should be \code{NULL} (default value).}
+\item{feature_names}{character vector of feature names. If the model already
+contains feature names, this argument should be \code{NULL} (default value)}
 
-\item{model}{object created by the \code{xgb.train} function.}
+\item{model}{object of class \code{xgb.Booster}}
 
-\item{text}{\code{character} vector generated by the \code{xgb.dump} function. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}).}
+\item{text}{\code{character} vector previously generated by the \code{xgb.dump}
+function (where parameter \code{with.stats = TRUE} should have been set).}
 
-\item{n_first_tree}{limit the plot to the \code{n} first trees. If set to \code{NULL}, all trees of the model are plotted. Performance can be low depending of the size of the model.}
+\item{n_first_tree}{limit the parsing to the \code{n} first trees.
+If set to \code{NULL}, all trees of the model are parsed.}
 }
 \value{
-A \code{data.table} of the features used in the model with their gain, cover and few other information.
-}
-\description{
-Parse a boosted tree model text dump and return a \code{data.table}.
-}
-\details{
-General function to convert a text dump of tree model to a \code{data.table}.
-
-The purpose is to help user to explore the model and get a better understanding of it.
+A \code{data.table} with detailed information about model trees' nodes.
 
 The columns of the \code{data.table} are:
 
 \itemize{
-\item \code{ID}: unique identifier of a node ;
- \item \code{Feature}: feature used in the tree to operate a split. When Leaf is indicated, it is the end of a branch ;
- \item \code{Split}: value of the chosen feature where is operated the split ;
- \item \code{Yes}: ID of the feature for the next node in the branch when the split condition is met ;
- \item \code{No}: ID of the feature for the next node in the branch when the split condition is not met ;
- \item \code{Missing}: ID of the feature for the next node in the branch for observation where the feature used for the split are not provided ;
- \item \code{Quality}: it's the gain related to the split in this specific node ;
- \item \code{Cover}: metric to measure the number of observation affected by the split ;
- \item \code{Tree}: ID of the tree. It is included in the main ID ;
- \item \code{Yes.Feature}, \code{No.Feature}, \code{Yes.Cover}, \code{No.Cover}, \code{Yes.Quality} and \code{No.Quality}: data related to the pointer in \code{Yes} or \code{No} column ;
+  \item \code{Tree}: ID of a tree in a model
+  \item \code{Node}: ID of a node in a tree
+  \item \code{ID}: unique identifier of a node in a model
+  \item \code{Feature}: for a branch node, it's a feature id or name (when available);
+        for a leaf node, it simply labels it as \code{'Leaf'}
+  \item \code{Split}: location of the split for a branch node (split condition is always "less than")
+  \item \code{Yes}: ID of the next node when the split condition is met
+  \item \code{No}: ID of the next node when the split condition is not met
+  \item \code{Missing}: ID of the next node when the branch value is missing
+  \item \code{Quality}: either the split gain or the leaf value
+  \item \code{Cover}: metric related to the number of observations either seen by a split
+        or collected by a leaf during training.
 }
 }
+\description{
+Parse a boosted tree model text dump into a \code{data.table} structure.
+}
 \examples{
 data(agaricus.train, package='xgboost')
 
 bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
                eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
 
-# agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
-xgb.model.dt.tree(feature_names = agaricus.train$data@Dimnames[[2]], model = bst)
+xgb.model.dt.tree(colnames(agaricus.train$data), bst)
 
 }
diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R
index b5fb264e3..10af643b8 100644
--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@@ -20,7 +20,7 @@ bst.Tree <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
 bst.GLM <- xgboost(data = sparse_matrix, label = output_vector, eta = 1, nthread = 2, nround = 10,
                    objective = "binary:logistic", booster = "gblinear")
 
-feature.names <- agaricus.train$data@Dimnames[[2]]
+feature.names <- colnames(agaricus.train$data)
 
 test_that("xgb.dump works", {
   capture.output(print(xgb.dump(bst.Tree)))
@@ -57,11 +57,10 @@ test_that("xgb-attribute functionality", {
 })
 
 test_that("xgb.model.dt.tree works with and without feature names", {
-  names.dt.trees <- c("ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover",
-                      "Tree", "Yes.Feature", "Yes.Cover", "Yes.Quality", "No.Feature", "No.Cover", "No.Quality")
+  names.dt.trees <- c("Tree", "Node", "ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover")
  dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst.Tree)
   expect_equal(names.dt.trees, names(dt.tree))
-  expect_equal(dim(dt.tree), c(162, 15))
+  expect_equal(dim(dt.tree), c(162, 10))
   xgb.model.dt.tree(model = bst.Tree)
 })
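
Reviewer note (not part of the patch): the speedup comes from dropping the per-tree loop and parsing the whole dump at once with vectorized data.table and stringr operations. Below is a minimal sketch of the cumulative-sum trick the new code uses to label every dump line with its tree index; the toy dump text is made up for illustration and is not output from a real model.

library(data.table)

# toy dump in the same general shape as xgb.dump() output (illustrative only)
text <- c("booster[0]",
          "0:[f2<2.5] yes=1,no=2,missing=1,gain=10.1,cover=50",
          "1:leaf=0.4,cover=30",
          "2:leaf=-0.4,cover=20",
          "booster[1]",
          "0:leaf=0.05,cover=50")

td <- data.table(t = text)
# mark the 'booster' header lines, then a cumulative sum propagates a
# 0-based tree index to every following line of that tree
td[grepl("^booster", t), Tree := 1L]
td[, Tree := cumsum(ifelse(is.na(Tree), 0L, Tree)) - 1L]
print(td)

Field extraction then proceeds with str_match over the whole column (as in the patch), so no per-tree rbindlist or set() calls are needed.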