From 9f5889f1e385623d473fa9b7cd588fd69cb9c584 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Wed, 4 Feb 2015 23:59:53 +0100 Subject: [PATCH] new included feature in dt.tree function --- R-package/NAMESPACE | 1 + R-package/R/xgb.model.dt.tree.R | 7 ++++++- R-package/man/xgb.model.dt.tree.Rd | 2 ++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index fab1546a2..7d9c64563 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -41,6 +41,7 @@ importFrom(ggplot2,ylab) importFrom(magrittr,"%>%") importFrom(magrittr,add) importFrom(magrittr,not) +importFrom(stringr,str_detect) importFrom(stringr,str_extract) importFrom(stringr,str_extract_all) importFrom(stringr,str_match) diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index 373f29403..42ca8237b 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -14,6 +14,7 @@ #' @importFrom stringr str_split #' @importFrom stringr str_extract #' @importFrom stringr str_trim +#' @importFrom stringr str_detect #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). #' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file. @@ -37,6 +38,8 @@ #' \item \code{Quality}: it's the gain related to the split in this specific node ; #' \item \code{Cover}: metric to measure the number of observation affected by the split ; #' \item \code{Tree}: ID of the tree. 
It is included in the main ID ; +#' \item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ; +#' \item \code{Included}: \code{logical} value which indicates whether this node is pointed to by a Yes branch (\code{TRUE}) or a No branch (\code{FALSE}) ; #' } #' #' @examples @@ -158,6 +161,8 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), j = "No.Quality", value = allTrees[ID == no,Quality]) + + allTrees[,"Included":=F][ID == allTrees[!is.na(Yes), Yes], Included:=T][str_detect(ID, "-0$"), Included:=T] allTrees } @@ -165,4 +170,4 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model # Avoid error messages during CRAN check. # The reason is that these variables are never declared # They are mainly column names inferred by Data.table... -globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequence")) \ No newline at end of file +globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequence", "Included")) \ No newline at end of file diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index 069e7ad77..31910cc49 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -39,6 +39,8 @@ The content of the \code{data.table} is organised that way: \item \code{Quality}: it's the gain related to the split in this specific node ; \item \code{Cover}: metric to measure the number of observation affected by the split ; \item \code{Tree}: ID of the tree. It is included in the main ID ; + \item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ; + \item \code{Included}: \code{logical} value which indicates whether this node is pointed to by a Yes branch (\code{TRUE}) or a No branch (\code{FALSE}) ; } } \examples{