Merge pull request #138 from pommedeterresautee/master

new parameters, refactoring...
Tianqi Chen 2015-01-11 14:27:38 -08:00
commit 9a2ad91b48
8 changed files with 88 additions and 46 deletions

@@ -37,11 +37,15 @@
#'
xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) {
if (class(model) != "xgb.Booster") {
stop("xgb.dump: first argument must be type xgb.Booster")
stop("model: argument must be type xgb.Booster")
}
if (!class(fname) %in% c("character", "NULL")) {
stop("xgb.dump: second argument must be type character when provided")
if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1)) {
stop("fname: argument must be type character (when provided)")
}
if (!(class(fmap) %in% c("character", "NULL") && length(fmap) <= 1)) {
stop("fmap: argument must be type character (when provided)")
}
result <- .Call("XGBoosterDumpModel_R", model, fmap, as.integer(with.stats), PACKAGE = "xgboost")
if(is.null(fname)) {

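A minimal usage sketch of the tightened checks above (not part of the commit): fname and fmap must now be length-one character vectors, or NULL. It reuses the agaricus data and xgboost() call already shown in the package examples.

library(xgboost)
data(agaricus.train, package = "xgboost")   # example data shipped with the package
train <- agaricus.train
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
               eta = 1, nround = 2, objective = "binary:logistic")

xgb.dump(bst, fname = "xgb.model.dump", with.stats = TRUE)  # write the dump to disk
dump <- xgb.dump(bst, with.stats = TRUE)                    # fname = NULL: keep the dump in memory
# xgb.dump(bst, fname = c("a", "b"))                        # now rejected: fname must have length <= 1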
@@ -9,6 +9,7 @@
#' @importFrom magrittr %>%
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).
#' @param model generated by the \code{xgb.train} function. Avoids the creation of a dump file.
#'
#' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
#'
@@ -38,24 +39,34 @@
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#'
#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.importance(agaricus.test$data@@Dimnames[[2]], 'xgb.model.dump')
#' xgb.importance(agaricus.test$data@@Dimnames[[2]], model = bst)
#'
#' @export
xgb.importance <- function(feature_names = NULL, filename_dump = NULL){
xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = NULL){
if (!class(feature_names) %in% c("character", "NULL")) {
stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
}
if (class(filename_dump) != "character" || !file.exists(filename_dump)) {
if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
stop("filename_dump: Has to be a path to the model dump file.")
}
text <- readLines(filename_dump)
if (!class(model) %in% c("xgb.Booster", "NULL")) {
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
}
if(is.null(model)){
text <- readLines(filename_dump)
} else {
text <- xgb.dump(model = model, with.stats = T)
}
if(text[2] == "bias:"){
result <- linearDump(feature_names, text)
result <- linearDump(feature_names, text = text)
} else {
result <- treeDump(feature_names, text)
result <- treeDump(feature_names, text = text)
}
result
}

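A short sketch of the two input paths xgb.importance now supports (the model= call is the one added in this commit); it assumes the booster bst and the data objects from the sketch above.

data(agaricus.test, package = "xgboost")
# importance straight from the in-memory booster (no dump file needed)
imp_from_model <- xgb.importance(agaricus.test$data@Dimnames[[2]], model = bst)

# importance from a dump file written with the gain statistics included
xgb.dump(bst, fname = "xgb.model.dump", with.stats = TRUE)
imp_from_file <- xgb.importance(agaricus.test$data@Dimnames[[2]], filename_dump = "xgb.model.dump")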
@@ -16,6 +16,8 @@
#' @importFrom stringr str_trim
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param model generated by the \code{xgb.train} function. Avoids the creation of a dump file.
#' @param text dump generated by the \code{xgb.dump} function. Avoids the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
#'
#' @return A \code{data.table} of the features used in the model with their gain, cover and a few other attributes.
@@ -49,29 +51,37 @@
#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#'
#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], 'xgb.model.dump')
#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], filename_dump = 'xgb.model.dump')
#'
#' @export
xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, text = NULL, n_first_tree = NULL){
xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, text = NULL, n_first_tree = NULL){
if (!class(feature_names) %in% c("character", "NULL")) {
stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
}
if (!class(filename_dump) %in% c("character", "NULL")) {
stop("filename_dump: Has to be a character vector representing the path to the model dump file.")
} else if (class(filename_dump) == "character" && !file.exists(filename_dump)) {
if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
} else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
stop("filename_dump: path to the model doesn't exist.")
} else if(is.null(filename_dump) & is.null(text)){
stop("filename_dump: no path and no string version of the model dump have been provided.")
} else if(is.null(filename_dump) && is.null(model) && is.null(text)){
stop("filename_dump & model & text: no path to dump model, no model, no text dump, have been provided.")
}
if (!class(text) %in% c("character", "NULL")) {
if (!class(model) %in% c("xgb.Booster", "NULL")) {
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
}
if (!class(text) %in% c("character", "NULL")) {
stop("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.")
}
if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) {
stop("n_first_tree: Has to be a numeric vector of size 1.")
}
if(is.null(text)){
if(!is.null(model)){
text = xgb.dump(model = model, with.stats = T)
} else if(!is.null(filename_dump)){
text <- readLines(filename_dump) %>% str_trim(side = "both")
}

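A hedged sketch of the three alternative inputs handled by the checks above (dump file, model object, or in-memory text dump), again assuming bst and the dump file from the earlier sketches.

# 1) from a dump file on disk
dt_file  <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], filename_dump = "xgb.model.dump")
# 2) directly from the booster object (dumped internally via xgb.dump)
dt_model <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst)
# 3) from a text dump already held in memory
dump     <- xgb.dump(bst, with.stats = TRUE)
dt_text  <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], text = dump)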
@@ -17,9 +17,12 @@
#' @importFrom stringr str_trim
#' @importFrom DiagrammeR DiagrammeR
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).
#' @param model generated by the \code{xgb.train} function. Avoids the creation of a dump file.
#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
#' @param style a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.
#' @param CSSstyle a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.
#' @param width the width of the diagram in pixels.
#' @param height the height of the diagram in pixels.
#'
#' @return A \code{DiagrammeR} of the model.
#'
@@ -45,33 +48,41 @@
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#'
#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], 'xgb.model.dump')
#' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], model = bst)
#'
#' @export
xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tree = NULL, styles = NULL){
#'
xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, n_first_tree = NULL, CSSstyle = NULL, width = NULL, height = NULL){
if (!class(styles) %in% c("character", "NULL") | length(styles) > 1) {
if (!(class(CSSstyle) %in% c("character", "NULL") && length(CSSstyle) <= 1)) {
stop("style: Has to be a character vector of size 1.")
}
if (!class(model) %in% c("xgb.Booster", "NULL")) {
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
}
allTrees <- xgb.model.dt.tree(feature_names, filename_dump, n_first_tree)
if(is.null(model)){
allTrees <- xgb.model.dt.tree(feature_names = feature_names, filename_dump = filename_dump, n_first_tree = n_first_tree)
} else {
allTrees <- xgb.model.dt.tree(feature_names = feature_names, model = model, n_first_tree = n_first_tree)
}
allTrees[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "<br/>Cover: ", Cover, "<br/>Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")]
allTrees[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")]
if(is.null(styles)){
styles <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px"
if(is.null(CSSstyle)){
CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px"
}
yes <- allTrees[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "")
no <- allTrees[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "")
path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(styles, yes, no, sep = ";")
DiagrammeR(path)
path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(CSSstyle, yes, no, sep = ";")
DiagrammeR(path, width, height)
}

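A sketch of the new plotting entry points (not part of the commit). Note that a custom CSSstyle must still define the greenNode and redNode classes, since the generated graph references them (see the class statements above); width and height are in pixels, and bst is assumed from the earlier sketch.

# default style, plotting directly from the booster
xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst)

# custom node style and an explicit widget size
my_css <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB;classDef redNode fill:#FFA070, stroke:#FF5E5E"
xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst,
              CSSstyle = my_css, width = 800, height = 600)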
@@ -4,12 +4,14 @@
\alias{xgb.importance}
\title{Show importance of features in a model}
\usage{
xgb.importance(feature_names = NULL, filename_dump = NULL)
xgb.importance(feature_names = NULL, filename_dump = NULL, model = NULL)
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).}
\item{model}{generated by the \code{xgb.train} function. Avoids the creation of a dump file.}
}
\value{
A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
@@ -43,9 +45,8 @@ test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")
xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
xgb.importance(agaricus.test$data@Dimnames[[2]], 'xgb.model.dump')
xgb.importance(agaricus.test$data@Dimnames[[2]], model = bst)
}

@@ -4,14 +4,18 @@
\alias{xgb.model.dt.tree}
\title{Convert tree model dump to data.table}
\usage{
xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL, text = NULL,
n_first_tree = NULL)
xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL,
model = NULL, text = NULL, n_first_tree = NULL)
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
\item{model}{generated by the \code{xgb.train} function. Avoids the creation of a dump file.}
\item{text}{dump generated by the \code{xgb.dump} function. Avoids the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}
}
\value{
@@ -49,6 +53,6 @@ bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], 'xgb.model.dump')
xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], filename_dump = 'xgb.model.dump')
}

@@ -4,17 +4,19 @@
\alias{xgb.plot.tree}
\title{Plot a boosted tree model}
\usage{
xgb.plot.tree(feature_names = NULL, filename_dump = NULL,
n_first_tree = NULL, styles = NULL)
xgb.plot.tree(feature_names = NULL, filename_dump = NULL, model = NULL,
n_first_tree = NULL, CSSstyle = NULL)
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).}
\item{model}{generated by the \code{xgb.train} function. Avoids the creation of a dump file.}
\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}
\item{style}{a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.}
\item{CSSstyle}{a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.}
}
\value{
A \code{DiagrammeR} of the model.
@@ -44,9 +46,8 @@ train <- agaricus.train
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")
xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
xgb.plot.tree(agaricus.train$data@Dimnames[[2]], 'xgb.model.dump')
xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst)
}

@@ -3,6 +3,7 @@
#include <utility>
#include <cstring>
#include <cstdio>
#include <sstream>
#include "xgboost_R.h"
#include "wrapper/xgboost_wrapper.h"
#include "src/utils/utils.h"
@@ -280,11 +281,10 @@ extern "C" {
asInteger(with_stats),
&olen);
SEXP out = PROTECT(allocVector(STRSXP, olen));
char buffer [2000];
for (size_t i = 0; i < olen; ++i) {
memset(buffer, 0, sizeof buffer);
sprintf (buffer, "booster[%u]:\n%s", static_cast<unsigned>(i), res[i]);
SET_STRING_ELT(out, i, mkChar(buffer));
stringstream stream;
stream << "booster["<<i<<"]\n" << res[i];
SET_STRING_ELT(out, i, mkChar(stream.str().c_str()));
}
_WrapperEnd();
UNPROTECT(1);
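The C++ change above replaces a fixed 2000-byte sprintf buffer with a std::stringstream, so long tree dumps can no longer overflow the buffer. Seen from R the dump is still consumed line by line, as the importance code earlier in this diff does; a hedged illustration, assuming the booster bst from the first sketch:

dump <- xgb.dump(bst, with.stats = TRUE)  # character vector of dump lines
if (dump[2] == "bias:") {
  print("linear booster dump: bias and per-feature weights follow")        # gblinear layout
} else {
  print("tree booster dump: per-node splits with cover/gain statistics")   # gbtree layout
}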