From 9f5889f1e385623d473fa9b7cd588fd69cb9c584 Mon Sep 17 00:00:00 2001
From: El Potaeto <pommedeterresautee@msn.com>
Date: Wed, 4 Feb 2015 23:59:53 +0100
Subject: [PATCH] new included feature in dt.tree function

---
 R-package/NAMESPACE                | 1 +
 R-package/R/xgb.model.dt.tree.R    | 7 ++++++-
 R-package/man/xgb.model.dt.tree.Rd | 2 ++
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index fab1546a2..7d9c64563 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -41,6 +41,7 @@ importFrom(ggplot2,ylab)
 importFrom(magrittr,"%>%")
 importFrom(magrittr,add)
 importFrom(magrittr,not)
+importFrom(stringr,str_detect)
 importFrom(stringr,str_extract)
 importFrom(stringr,str_extract_all)
 importFrom(stringr,str_match)
diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R
index 373f29403..42ca8237b 100644
--- a/R-package/R/xgb.model.dt.tree.R
+++ b/R-package/R/xgb.model.dt.tree.R
@@ -14,6 +14,7 @@
 #' @importFrom stringr str_split
 #' @importFrom stringr str_extract
 #' @importFrom stringr str_trim
+#' @importFrom stringr str_detect
 #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
 #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
 #' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.
@@ -37,6 +38,8 @@
 #'  \item \code{Quality}: it's the gain related to the split in this specific node ;
 #'  \item \code{Cover}: metric to measure the number of observation affected by the split ;
 #'  \item \code{Tree}: ID of the tree. It is included in the main ID ;
+#'  \item \code{Yes.X} or \code{No.X}: value of column \code{X} (e.g. \code{No.Quality}) taken from the node pointed to by the \code{Yes} or \code{No} column ;
+#'  \item \code{Included}: \code{logical} value, \code{TRUE} if this node is pointed to by a \code{Yes} branch or has an ID ending in \code{-0}, \code{FALSE} otherwise (e.g. a node pointed to by a \code{No} branch) ;
 #' } 
 #'   
 #' @examples
@@ -158,6 +161,8 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
   set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), 
       j = "No.Quality", 
       value = allTrees[ID == no,Quality])
+  
+  allTrees[,"Included":=F][ID == allTrees[!is.na(Yes), Yes], Included:=T][str_detect(ID, "-0$"), Included:=T]
       
   allTrees
 }
@@ -165,4 +170,4 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
 # Avoid error messages during CRAN check.
 # The reason is that these variables are never declared
 # They are mainly column names inferred by Data.table...
-globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequence"))
\ No newline at end of file
+globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequence", "Included"))
\ No newline at end of file
diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd
index 069e7ad77..31910cc49 100644
--- a/R-package/man/xgb.model.dt.tree.Rd
+++ b/R-package/man/xgb.model.dt.tree.Rd
@@ -39,6 +39,8 @@ The content of the \code{data.table} is organised that way:
  \item \code{Quality}: it's the gain related to the split in this specific node ;
  \item \code{Cover}: metric to measure the number of observation affected by the split ;
  \item \code{Tree}: ID of the tree. It is included in the main ID ;
+ \item \code{Yes.X} or \code{No.X}: value of column \code{X} (e.g. \code{No.Quality}) taken from the node pointed to by the \code{Yes} or \code{No} column ;
+ \item \code{Included}: \code{logical} value, \code{TRUE} if this node is pointed to by a \code{Yes} branch or has an ID ending in \code{-0}, \code{FALSE} otherwise (e.g. a node pointed to by a \code{No} branch) ;
 }
 }
 \examples{