diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index 4d8e10e3b..0083dae93 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -1,6 +1,6 @@ -#' Convert tree model dump to data.table +#' Parse boosted tree model text dump #' -#' Read a tree model text dump and return a data.table. +#' Parse a boosted tree model text dump and return a \code{data.table}. #' #' @importFrom data.table data.table #' @importFrom data.table set @@ -13,17 +13,19 @@ #' @importFrom stringr str_extract #' @importFrom stringr str_split #' @importFrom stringr str_trim -#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. -#' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file. -#' @param text dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). -#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models. +#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model already contains feature names, this argument should be \code{NULL} (default value). +#' @param model object created by the \code{xgb.train} function. +#' @param text \code{character} vector generated by the \code{xgb.dump} function. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}). +#' @param n_first_tree limit the plot to the \code{n} first trees. If set to \code{NULL}, all trees of the model are plotted. Performance can be low depending on the size of the model. 
#' -#' @return A \code{data.table} of the features used in the model with their gain, cover and few other thing. +#' @return A \code{data.table} of the features used in the model with their gain, cover and a few other pieces of information. #' #' @details -#' General function to convert a text dump of tree model to a Matrix. The purpose is to help user to explore the model and get a better understanding of it. +#' General function to convert a text dump of tree model to a \code{data.table}. #' -#' The content of the \code{data.table} is organised that way: +#' The purpose is to help the user to explore the model and get a better understanding of it. +#' +#' The columns of the \code{data.table} are: #' #' \itemize{ #' \item \code{ID}: unique identifier of a node ; @@ -35,21 +37,16 @@ #' \item \code{Quality}: it's the gain related to the split in this specific node ; #' \item \code{Cover}: metric to measure the number of observation affected by the split ; #' \item \code{Tree}: ID of the tree. It is included in the main ID ; -#' \item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ; +#' \item \code{Yes.Feature}, \code{No.Feature}, \code{Yes.Cover}, \code{No.Cover}, \code{Yes.Quality} and \code{No.Quality}: data related to the pointer in \code{Yes} or \code{No} column ; #' } #' #' @examples #' data(agaricus.train, package='xgboost') #' -#' #Both dataset are list with two items, a sparse matrix and labels -#' #(labels = outcome column which will be learned). -#' #Each column of the sparse Matrix is a feature in one hot encoding format. -#' train <- agaricus.train -#' -#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, #' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") #' -#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix. 
+#' # agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix. #' xgb.model.dt.tree(feature_names = agaricus.train$data@@Dimnames[[2]], model = bst) #' #' @export diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index 7dadb20aa..c82ba3cf4 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -2,30 +2,32 @@ % Please edit documentation in R/xgb.model.dt.tree.R \name{xgb.model.dt.tree} \alias{xgb.model.dt.tree} -\title{Convert tree model dump to data.table} +\title{Parse boosted tree model text dump} \usage{ xgb.model.dt.tree(feature_names = NULL, model = NULL, text = NULL, n_first_tree = NULL) } \arguments{ -\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.} +\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model already contains feature names, this argument should be \code{NULL} (default value).} -\item{model}{dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.} +\item{model}{object created by the \code{xgb.train} function.} -\item{text}{dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).} +\item{text}{\code{character} vector generated by the \code{xgb.dump} function. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}).} -\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.} +\item{n_first_tree}{limit the plot to the \code{n} first trees. If set to \code{NULL}, all trees of the model are plotted. 
Performance can be low depending on the size of the model.} } \value{ -A \code{data.table} of the features used in the model with their gain, cover and few other thing. +A \code{data.table} of the features used in the model with their gain, cover and a few other pieces of information. } \description{ -Read a tree model text dump and return a data.table. +Parse a boosted tree model text dump and return a \code{data.table}. } \details{ -General function to convert a text dump of tree model to a Matrix. The purpose is to help user to explore the model and get a better understanding of it. +General function to convert a text dump of tree model to a \code{data.table}. -The content of the \code{data.table} is organised that way: +The purpose is to help the user to explore the model and get a better understanding of it. + +The columns of the \code{data.table} are: \itemize{ \item \code{ID}: unique identifier of a node ; @@ -37,21 +39,16 @@ The content of the \code{data.table} is organised that way: \item \code{Quality}: it's the gain related to the split in this specific node ; \item \code{Cover}: metric to measure the number of observation affected by the split ; \item \code{Tree}: ID of the tree. It is included in the main ID ; - \item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ; + \item \code{Yes.Feature}, \code{No.Feature}, \code{Yes.Cover}, \code{No.Cover}, \code{Yes.Quality} and \code{No.Quality}: data related to the pointer in \code{Yes} or \code{No} column ; } } \examples{ data(agaricus.train, package='xgboost') -#Both dataset are list with two items, a sparse matrix and labels -#(labels = outcome column which will be learned). -#Each column of the sparse Matrix is a feature in one hot encoding format. 
-train <- agaricus.train - -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") -#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix. +# agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix. xgb.model.dt.tree(feature_names = agaricus.train$data@Dimnames[[2]], model = bst) }