model dt tree function documentation improvement

This commit is contained in:
pommedeterresautee 2015-12-08 11:21:25 +01:00
parent c1b2d9cb86
commit 855be97011
2 changed files with 28 additions and 34 deletions

View File

@ -1,6 +1,6 @@
#' Convert tree model dump to data.table #' Parse boosted tree model text dump
#' #'
#' Read a tree model text dump and return a data.table. #' Parse a boosted tree model text dump and return a \code{data.table}.
#' #'
#' @importFrom data.table data.table #' @importFrom data.table data.table
#' @importFrom data.table set #' @importFrom data.table set
@ -13,17 +13,19 @@
#' @importFrom stringr str_extract #' @importFrom stringr str_extract
#' @importFrom stringr str_split #' @importFrom stringr str_split
#' @importFrom stringr str_trim #' @importFrom stringr str_trim
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model already contains feature names, this argument should be \code{NULL} (default value).
#' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file. #' @param model object created by the \code{xgb.train} function.
#' @param text dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). #' @param text \code{character} vector generated by the \code{xgb.dump} function. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}).
#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models. #' @param n_first_tree limit the plot to the \code{n} first trees. If set to \code{NULL}, all trees of the model are plotted. Performance can be low depending on the size of the model.
#' #'
#' @return A \code{data.table} of the features used in the model with their gain, cover and few other thing. #' @return A \code{data.table} of the features used in the model with their gain, cover and a few other pieces of information.
#' #'
#' @details #' @details
#' General function to convert a text dump of tree model to a Matrix. The purpose is to help user to explore the model and get a better understanding of it. #' General function to convert a text dump of tree model to a \code{data.table}.
#' #'
#' The content of the \code{data.table} is organised that way: #' The purpose is to help the user to explore the model and get a better understanding of it.
#'
#' The columns of the \code{data.table} are:
#' #'
#' \itemize{ #' \itemize{
#' \item \code{ID}: unique identifier of a node ; #' \item \code{ID}: unique identifier of a node ;
@ -35,21 +37,16 @@
#' \item \code{Quality}: it's the gain related to the split in this specific node ; #' \item \code{Quality}: it's the gain related to the split in this specific node ;
#' \item \code{Cover}: metric to measure the number of observations affected by the split ; #' \item \code{Cover}: metric to measure the number of observations affected by the split ;
#' \item \code{Tree}: ID of the tree. It is included in the main ID ; #' \item \code{Tree}: ID of the tree. It is included in the main ID ;
#' \item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ; #' \item \code{Yes.Feature}, \code{No.Feature}, \code{Yes.Cover}, \code{No.Cover}, \code{Yes.Quality} and \code{No.Quality}: data related to the pointer in \code{Yes} or \code{No} column ;
#' } #' }
#' #'
#' @examples #' @examples
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
#' #'
#' #Both dataset are list with two items, a sparse matrix and labels #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
#' #(labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2, objective = "binary:logistic") #' eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
#' #'
#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix. #' # agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.model.dt.tree(feature_names = agaricus.train$data@@Dimnames[[2]], model = bst) #' xgb.model.dt.tree(feature_names = agaricus.train$data@@Dimnames[[2]], model = bst)
#' #'
#' @export #' @export

View File

@ -2,30 +2,32 @@
% Please edit documentation in R/xgb.model.dt.tree.R % Please edit documentation in R/xgb.model.dt.tree.R
\name{xgb.model.dt.tree} \name{xgb.model.dt.tree}
\alias{xgb.model.dt.tree} \alias{xgb.model.dt.tree}
\title{Convert tree model dump to data.table} \title{Parse boosted tree model text dump}
\usage{ \usage{
xgb.model.dt.tree(feature_names = NULL, model = NULL, text = NULL, xgb.model.dt.tree(feature_names = NULL, model = NULL, text = NULL,
n_first_tree = NULL) n_first_tree = NULL)
} }
\arguments{ \arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.} \item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model already contains feature names, this argument should be \code{NULL} (default value).}
\item{model}{dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.} \item{model}{object created by the \code{xgb.train} function.}
\item{text}{dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).} \item{text}{\code{character} vector generated by the \code{xgb.dump} function. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}).}
\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.} \item{n_first_tree}{limit the plot to the \code{n} first trees. If set to \code{NULL}, all trees of the model are plotted. Performance can be low depending on the size of the model.}
} }
\value{ \value{
A \code{data.table} of the features used in the model with their gain, cover and few other thing. A \code{data.table} of the features used in the model with their gain, cover and a few other pieces of information.
} }
\description{ \description{
Read a tree model text dump and return a data.table. Parse a boosted tree model text dump and return a \code{data.table}.
} }
\details{ \details{
General function to convert a text dump of tree model to a Matrix. The purpose is to help user to explore the model and get a better understanding of it. General function to convert a text dump of a tree model to a \code{data.table}.
The content of the \code{data.table} is organised that way: The purpose is to help the user to explore the model and get a better understanding of it.
The columns of the \code{data.table} are:
\itemize{ \itemize{
\item \code{ID}: unique identifier of a node ; \item \code{ID}: unique identifier of a node ;
@ -37,21 +39,16 @@ The content of the \code{data.table} is organised that way:
\item \code{Quality}: it's the gain related to the split in this specific node ; \item \code{Quality}: it's the gain related to the split in this specific node ;
\item \code{Cover}: metric to measure the number of observations affected by the split ; \item \code{Cover}: metric to measure the number of observations affected by the split ;
\item \code{Tree}: ID of the tree. It is included in the main ID ; \item \code{Tree}: ID of the tree. It is included in the main ID ;
\item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ; \item \code{Yes.Feature}, \code{No.Feature}, \code{Yes.Cover}, \code{No.Cover}, \code{Yes.Quality} and \code{No.Quality}: data related to the pointer in \code{Yes} or \code{No} column ;
} }
} }
\examples{ \examples{
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')
#Both dataset are list with two items, a sparse matrix and labels bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
#(labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2, objective = "binary:logistic") eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix. # agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
xgb.model.dt.tree(feature_names = agaricus.train$data@Dimnames[[2]], model = bst) xgb.model.dt.tree(feature_names = agaricus.train$data@Dimnames[[2]], model = bst)
} }