diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index f68eafbc5..23de90d28 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -9,6 +9,7 @@ export(xgb.cv)
 export(xgb.dump)
 export(xgb.importance)
 export(xgb.load)
+export(xgb.model.dt.tree)
 export(xgb.plot.tree)
 export(xgb.save)
 export(xgb.train)
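The new NAMESPACE entry above is generated from the roxygen @export tag in the file added below. A minimal sanity check after reinstalling the package, shown as an illustrative session rather than part of the patch:

    library(xgboost)
    # the new helper should be exported alongside xgb.dump and xgb.plot.tree
    "xgb.model.dt.tree" %in% getNamespaceExports("xgboost")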
diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R
new file mode 100644
index 000000000..2a65c30f7
--- /dev/null
+++ b/R-package/R/xgb.model.dt.tree.R
@@ -0,0 +1,109 @@
+#' Convert tree model dump to data.table
+#'
+#' Read a tree model text dump and return a data.table.
+#'
+#' @importFrom data.table data.table
+#' @importFrom data.table set
+#' @importFrom data.table rbindlist
+#' @importFrom data.table :=
+#' @importFrom magrittr %>%
+#' @importFrom magrittr not
+#' @importFrom stringr str_extract
+#' @importFrom stringr str_split
+#' @importFrom stringr str_match
+#' @importFrom stringr str_replace
+#' @importFrom stringr str_trim
+#' @param feature_names names of each feature as a character vector. They can be extracted from a sparse matrix (see example). If the model dump already contains feature names, this argument should be \code{NULL}.
+#' @param filename_dump the path to the text file storing the model. The dump must include the gain and cover statistics (parameter \code{with.stats = TRUE} in function \code{xgb.dump}).
+#' @param n_first_tree limit the parsing to the first n trees. If \code{NULL}, all trees of the model are parsed. Parsing can be slow for huge models.
+#'
+#' @return A \code{data.table} describing each node of each tree in the model, with its gain, cover and a few other metrics.
+#'
+#' @details
+#' General function to convert a text dump of a tree model to a \code{data.table}. The purpose is to help the user explore the model and get a better understanding of it.
+#'
+#' The content of the \code{data.table} is organised as follows:
+#'
+#' \itemize{
+#'   \item \code{ID}: unique identifier of a node;
+#'   \item \code{Feature}: feature used for the split; \code{Leaf} indicates the end of a branch;
+#'   \item \code{Split}: value of the chosen feature at which the split is made;
+#'   \item \code{Yes}: ID of the next node in the branch when the split condition is met;
+#'   \item \code{No}: ID of the next node in the branch when the split condition is not met;
+#'   \item \code{Missing}: ID of the next node in the branch for observations where the split feature is not provided;
+#'   \item \code{Quality}: gain related to the split at this node (for a leaf, its predicted value);
+#'   \item \code{Cover}: metric measuring the number of observations affected by the split;
+#'   \item \code{Tree}: ID of the tree; it is included in the node \code{ID}.
+#' }
+#'
+#' @examples
+#' data(agaricus.train, package='xgboost')
+#'
+#' #The dataset is a list with two items: a sparse matrix and labels (labels = the outcome column to be learned).
+#' #Each column of the sparse matrix is a feature in one-hot encoding format.
+#' train <- agaricus.train
+#'
+#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
+#'                eta = 1, nround = 2, objective = "binary:logistic")
+#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
+#'
+#' #agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
+#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], 'xgb.model.dump')
+#'
+#' @export
+xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tree = NULL){
+
+  if (!class(feature_names) %in% c("character", "NULL")) {
+    stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature names. Look at this function documentation to see where to get feature names.")
+  }
+  if (class(filename_dump) != "character" || !file.exists(filename_dump)) {
+    stop("filename_dump: Has to be a path to the model dump file.")
+  }
+  if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) {
+    stop("n_first_tree: Has to be a numeric vector of size 1.")
+  }
+
+  text <- readLines(filename_dump) %>% str_trim(side = "both")
+  position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text)+1)
+
+  extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist
+
+  n_round <- min(length(position) - 1, n_first_tree)
+
+  addTreeId <- function(x, i) paste(i,x,sep = "-")
+
+  allTrees <- data.table()
+
+  for(i in 1:n_round){
+
+    tree <- text[(position[i]+1):(position[i+1]-1)]
+
+    notLeaf <- str_match(tree, "leaf") %>% is.na
+    leaf <- notLeaf %>% not %>% tree[.]
+    branch <- notLeaf %>% tree[.]
+    idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i)
+    idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i)
+    featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric
+    if(!is.null(feature_names)){
+      featureBranch <- feature_names[featureBranch + 1]
+    }
+    featureLeaf <- rep("Leaf", length(leaf))
+    splitBranch <- str_extract(branch, "<\\d*\\.*\\d*\\]") %>% str_replace("<", "") %>% str_replace("\\]", "")
+    splitLeaf <- rep(NA, length(leaf))
+    yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(i)
+    yesLeaf <- rep(NA, length(leaf))
+    noBranch <- extract(branch, "no=\\d*") %>% addTreeId(i)
+    noLeaf <- rep(NA, length(leaf))
+    missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(i)
+    missingLeaf <- rep(NA, length(leaf))
+    qualityBranch <- extract(branch, "gain=\\d*\\.*\\d*")
+    qualityLeaf <- extract(leaf, "leaf=\\-*\\d*\\.*\\d*")
+    coverBranch <- extract(branch, "cover=\\d*\\.*\\d*")
+    coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*")
+    dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=i]
+
+    allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F)
+  }
+
+  allTrees
+}
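A short sketch, not part of the patch, of how the returned table can be explored with plain data.table syntax using the columns documented above. It assumes the 'xgb.model.dump' file written in the roxygen example; the actual values depend on the trained model:

    library(xgboost)
    library(data.table)
    data(agaricus.train, package = 'xgboost')

    dt <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], 'xgb.model.dump')

    # split nodes only, ranked by the gain stored in Quality
    dt[Feature != "Leaf"][order(-Quality)]

    # total gain per feature across all trees, a rough importance measure
    dt[Feature != "Leaf", .(Gain = sum(Quality)), by = Feature][order(-Gain)]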
diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R
index 1efcbf813..b980671b0 100644
--- a/R-package/R/xgb.plot.tree.R
+++ b/R-package/R/xgb.plot.tree.R
@@ -1,6 +1,6 @@
 #' Plot a boosted tree model
 #'
-#' Read a xgboost model text dump.
+#' Read a tree model text dump.
 #' Plotting only works for boosted tree model (not linear model).
 #'
 #' @importFrom data.table data.table
@@ -21,7 +21,7 @@
 #' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
 #' @param style a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.
 #'
-#' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
+#' @return A \code{DiagrammeR} object of the model.
 #'
 #' @details
 #'
@@ -34,7 +34,7 @@
 #' }
 #'
 #' Each branch finishes with a leaf. For each leaf, only the \code{cover} is indicated.
-#' It uses Mermaid JS library for that purpose.
+#' It uses the \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpose.
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
@@ -51,63 +51,13 @@
 #' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], 'xgb.model.dump')
 #'
 #' @export
-xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tree = NULL, styles = NULL){
-
-  if (!class(feature_names) %in% c("character", "NULL")) {
-    stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
-  }
-  if (class(filename_dump) != "character" || !file.exists(filename_dump)) {
-    stop("filename_dump: Has to be a path to the model dump file.")
-  }
-  if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) {
-    stop("n_first_tree: Has to be a numeric vector of size 1.")
-  }
+xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tree = NULL, styles = NULL){
 
   if (!class(styles) %in% c("character", "NULL") | length(styles) > 1) {
     stop("style: Has to be a character vector of size 1.")
   }
-
-  text <- readLines(filename_dump) %>% str_trim(side = "both")
-  position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text)+1)
-
-  extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist
-
-  n_round <- min(length(position) - 1, n_first_tree)
-
-  addTreeId <- function(x, i) paste(i,x,sep = "-")
-
-  allTrees <- data.table()
-
-  for(i in 1:n_round){
-    tree <- text[(position[i]+1):(position[i+1]-1)]
-
-    notLeaf <- str_match(tree, "leaf") %>% is.na
-    leaf <- notLeaf %>% not %>% tree[.]
-    branch <- notLeaf %>% tree[.]
-    idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i)
-    idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i)
-    featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric
-    if(!is.null(feature_names)){
-      featureBranch <- feature_names[featureBranch + 1]
-    }
-    featureLeaf <- rep("Leaf", length(leaf))
-    splitBranch <- str_extract(branch, "<\\d*\\.*\\d*\\]") %>% str_replace("<", "") %>% str_replace("\\]", "")
-    splitLeaf <- rep(NA, length(leaf))
-    yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(i)
-    yesLeaf <- rep(NA, length(leaf))
-    noBranch <- extract(branch, "no=\\d*") %>% addTreeId(i)
-    noLeaf <- rep(NA, length(leaf))
-    missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(i)
-    missingLeaf <- rep(NA, length(leaf))
-    qualityBranch <- extract(branch, "gain=\\d*\\.*\\d*")
-    qualityLeaf <- extract(leaf, "leaf=\\-*\\d*\\.*\\d*")
-    coverBranch <- extract(branch, "cover=\\d*\\.*\\d*")
-    coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*")
-    dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=i]
-
-    allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F)
-  }
+  allTrees <- xgb.model.dt.tree(feature_names, filename_dump, n_first_tree)
 
   set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), j = "YesFeature", value = merge(copy(allTrees)[,ID:=Yes][, .(ID)], allTrees[,.(ID, Feature, Quality, Cover)], by = "ID")[,paste(Feature, "
Cover: ", Cover, sep = "")])
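With the refactor above, xgb.plot.tree delegates all dump parsing to the new exported helper, so the duplicated regex logic in the removed block disappears and both functions stay in sync. Roughly, and not part of the patch (reusing the dump file from the examples):

    # the table the plot is built on is exactly what the helper returns
    dt <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], 'xgb.model.dump')
    xgb.plot.tree(agaricus.train$data@Dimnames[[2]], 'xgb.model.dump')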
diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd
new file mode 100644
index 000000000..8c46ffe4f
--- /dev/null
+++ b/R-package/man/xgb.model.dt.tree.Rd
@@ -0,0 +1,54 @@
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/xgb.model.dt.tree.R
+\name{xgb.model.dt.tree}
+\alias{xgb.model.dt.tree}
+\title{Convert tree model dump to data.table}
+\usage{
+xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL,
+  n_first_tree = NULL)
+}
+\arguments{
+\item{feature_names}{names of each feature as a character vector. They can be extracted from a sparse matrix (see example). If the model dump already contains feature names, this argument should be \code{NULL}.}
+
+\item{filename_dump}{the path to the text file storing the model. The dump must include the gain and cover statistics (parameter \code{with.stats = TRUE} in function \code{xgb.dump}).}
+
+\item{n_first_tree}{limit the parsing to the first n trees. If \code{NULL}, all trees of the model are parsed. Parsing can be slow for huge models.}
+}
+\value{
+A \code{data.table} describing each node of each tree in the model, with its gain, cover and a few other metrics.
+}
+\description{
+Read a tree model text dump and return a data.table.
+}
+\details{
+General function to convert a text dump of a tree model to a \code{data.table}. The purpose is to help the user explore the model and get a better understanding of it.
+
+The content of the \code{data.table} is organised as follows:
+
+\itemize{
+\item \code{ID}: unique identifier of a node;
+\item \code{Feature}: feature used for the split; \code{Leaf} indicates the end of a branch;
+\item \code{Split}: value of the chosen feature at which the split is made;
+\item \code{Yes}: ID of the next node in the branch when the split condition is met;
+\item \code{No}: ID of the next node in the branch when the split condition is not met;
+\item \code{Missing}: ID of the next node in the branch for observations where the split feature is not provided;
+\item \code{Quality}: gain related to the split at this node (for a leaf, its predicted value);
+\item \code{Cover}: metric measuring the number of observations affected by the split;
+\item \code{Tree}: ID of the tree; it is included in the node \code{ID}.
+}
+}
+\examples{
+data(agaricus.train, package='xgboost')
+
+#The dataset is a list with two items: a sparse matrix and labels (labels = the outcome column to be learned).
+#Each column of the sparse matrix is a feature in one-hot encoding format.
+train <- agaricus.train
+
+bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
+               eta = 1, nround = 2, objective = "binary:logistic")
+xgb.dump(bst, 'xgb.model.dump', with.stats = T)
+
+#agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
+xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], 'xgb.model.dump')
+}
+
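The .Rd files in this patch are generated from the roxygen comments (roxygen2 4.1.0, per the header above), so later wording changes are best made in R/xgb.model.dt.tree.R and regenerated rather than edited by hand. A typical workflow, with devtools as a development-time assumption only:

    # run from the R-package/ directory
    devtools::document()   # or roxygen2::roxygenise()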
diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd
index 17ef49ced..ba65cdd7c 100644
--- a/R-package/man/xgb.plot.tree.Rd
+++ b/R-package/man/xgb.plot.tree.Rd
@@ -17,14 +17,13 @@ xgb.plot.tree(feature_names = NULL, filename_dump = NULL,
 \item{style}{a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.}
 }
 \value{
-A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
+A \code{DiagrammeR} object of the model.
 }
 \description{
-Read a xgboost model text dump.
+Read a tree model text dump.
+Plotting only works for boosted tree model (not linear model).
 }
 \details{
-Plotting only works for boosted tree model (not linear model).
-
 The content of each node is organised that way:
 
 \itemize{
@@ -34,7 +33,7 @@ The content of each node is organised that way:
 }
 
 Each branch finishes with a leaf. For each leaf, only the \code{cover} is indicated.
-It uses Mermaid JS library for that purpose.
+It uses the \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpose.
 }
 \examples{
 data(agaricus.train, package='xgboost')