new included feature in dt.tree function

This commit is contained in:
El Potaeto 2015-02-04 23:59:53 +01:00
parent 451944c52b
commit 9f5889f1e3
3 changed files with 9 additions and 1 deletions

View File

@ -41,6 +41,7 @@ importFrom(ggplot2,ylab)
importFrom(magrittr,"%>%") importFrom(magrittr,"%>%")
importFrom(magrittr,add) importFrom(magrittr,add)
importFrom(magrittr,not) importFrom(magrittr,not)
importFrom(stringr,str_detect)
importFrom(stringr,str_extract) importFrom(stringr,str_extract)
importFrom(stringr,str_extract_all) importFrom(stringr,str_extract_all)
importFrom(stringr,str_match) importFrom(stringr,str_match)

View File

@ -14,6 +14,7 @@
#' @importFrom stringr str_split #' @importFrom stringr str_split
#' @importFrom stringr str_extract #' @importFrom stringr str_extract
#' @importFrom stringr str_trim #' @importFrom stringr str_trim
#' @importFrom stringr str_detect
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file. #' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.
@ -37,6 +38,8 @@
#' \item \code{Quality}: it's the gain related to the split in this specific node ; #' \item \code{Quality}: it's the gain related to the split in this specific node ;
#' \item \code{Cover}: metric to measure the number of observation affected by the split ; #' \item \code{Cover}: metric to measure the number of observation affected by the split ;
#' \item \code{Tree}: ID of the tree. It is included in the main ID ; #' \item \code{Tree}: ID of the tree. It is included in the main ID ;
#' \item \code{Yes.X} or \code{No.X}: attributes (e.g. \code{No.Quality}) of the node pointed to by the \code{Yes} or \code{No} column ;
#' \item \code{Included}: \code{logical} value which indicates whether this node is the root of a tree or is pointed to by a Yes branch (\code{TRUE}), or is pointed to by a No branch (\code{FALSE}) ;
#' } #' }
#' #'
#' @examples #' @examples
@ -158,6 +161,8 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), set(allTrees, i = which(allTrees[,Feature]!= "Leaf"),
j = "No.Quality", j = "No.Quality",
value = allTrees[ID == no,Quality]) value = allTrees[ID == no,Quality])
allTrees[,"Included":=F][ID == allTrees[!is.na(Yes), Yes], Included:=T][str_detect(ID, "-0$"), Included:=T]
allTrees allTrees
} }
@ -165,4 +170,4 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
# Avoid error messages during CRAN check. # Avoid error messages during CRAN check.
# The reason is that these variables are never declared # The reason is that these variables are never declared
# They are mainly column names inferred by Data.table... # They are mainly column names inferred by Data.table...
globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequence")) globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequence", "Included"))

View File

@ -39,6 +39,8 @@ The content of the \code{data.table} is organised that way:
\item \code{Quality}: it's the gain related to the split in this specific node ; \item \code{Quality}: it's the gain related to the split in this specific node ;
\item \code{Cover}: metric to measure the number of observation affected by the split ; \item \code{Cover}: metric to measure the number of observation affected by the split ;
\item \code{Tree}: ID of the tree. It is included in the main ID ; \item \code{Tree}: ID of the tree. It is included in the main ID ;
\item \code{Yes.X} or \code{No.X}: attributes (e.g. \code{No.Quality}) of the node pointed to by the \code{Yes} or \code{No} column ;
\item \code{Included}: \code{logical} value which indicates whether this node is the root of a tree or is pointed to by a Yes branch (\code{TRUE}), or is pointed to by a No branch (\code{FALSE}) ;
} }
} }
\examples{ \examples{