From 70df2276890d08240391adee07408f9495c78db5 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Sun, 11 Jan 2015 01:04:54 +0100 Subject: [PATCH 1/3] dump function is now memory safe --- R-package/src/xgboost_R.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp index 9320547df..5a8ddbf52 100644 --- a/R-package/src/xgboost_R.cpp +++ b/R-package/src/xgboost_R.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include "xgboost_R.h" #include "wrapper/xgboost_wrapper.h" #include "src/utils/utils.h" @@ -280,11 +281,10 @@ extern "C" { asInteger(with_stats), &olen); SEXP out = PROTECT(allocVector(STRSXP, olen)); - char buffer [2000]; for (size_t i = 0; i < olen; ++i) { - memset(buffer, 0, sizeof buffer); - sprintf (buffer, "booster[%u]:\n%s", static_cast(i), res[i]); - SET_STRING_ELT(out, i, mkChar(buffer)); + stringstream stream; + stream << "booster["< Date: Sun, 11 Jan 2015 03:06:41 +0100 Subject: [PATCH 2/3] add new parameters to several functions avoid the need of a text dump --- R-package/R/xgb.dump.R | 10 +++++++--- R-package/R/xgb.importance.R | 21 ++++++++++++++++----- R-package/R/xgb.plot.tree.R | 22 ++++++++++++++++------ R-package/man/xgb.importance.Rd | 7 ++++--- R-package/man/xgb.plot.tree.Rd | 9 +++++---- 5 files changed, 48 insertions(+), 21 deletions(-) diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index 3df8c9605..b6c829663 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -37,11 +37,15 @@ #' xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) { if (class(model) != "xgb.Booster") { - stop("xgb.dump: first argument must be type xgb.Booster") + stop("model: argument must be type xgb.Booster") } - if (!class(fname) %in% c("character", "NULL")) { - stop("xgb.dump: second argument must be type character when provided") + if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1)) { + stop("fname: 
argument must be type character (when provided)") } + if (!(class(fmap) %in% c("character", "NULL") && length(fname) <= 1)) { + stop("fmap: argument must be type character (when provided)") + } + result <- .Call("XGBoosterDumpModel_R", model, fmap, as.integer(with.stats), PACKAGE = "xgboost") if(is.null(fname)) { diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index 189ee03b4..7dd3a8ca3 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -9,6 +9,7 @@ #' @importFrom magrittr %>% #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}). +#' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file. #' #' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. #' @@ -38,20 +39,30 @@ #' #' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, #' eta = 1, nround = 2,objective = "binary:logistic") -#' xgb.dump(bst, 'xgb.model.dump', with.stats = T) #' #' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix. -#' xgb.importance(agaricus.test$data@@Dimnames[[2]], 'xgb.model.dump') +#' xgb.importance(agaricus.test$data@@Dimnames[[2]], model = bst) #' #' @export -xgb.importance <- function(feature_names = NULL, filename_dump = NULL){ +xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = NULL){ if (!class(feature_names) %in% c("character", "NULL")) { stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. 
Look at this function documentation to see where to get feature names.") } - if (class(filename_dump) != "character" || !file.exists(filename_dump)) { + + if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) { stop("filename_dump: Has to be a path to the model dump file.") } - text <- readLines(filename_dump) + + if (!class(model) %in% c("xgb.Booster", "NULL")) { + stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.") + } + + if(is.null(model)){ + text <- readLines(filename_dump) + } else { + text <- xgb.dump(model = model, with.stats = T) + } + if(text[2] == "bias:"){ result <- linearDump(feature_names, text) } else { diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 1a8a04e8a..7fb23c88a 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -17,7 +17,8 @@ #' @importFrom stringr str_trim #' @importFrom DiagrammeR DiagrammeR #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. -#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). +#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument). +#' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file. #' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models. #' @param style a \code{character} vector storing a css style to customize the appearance of nodes. 
Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information. #' @@ -45,19 +46,28 @@ #' #' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, #' eta = 1, nround = 2,objective = "binary:logistic") -#' xgb.dump(bst, 'xgb.model.dump', with.stats = T) #' #' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix. -#' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], 'xgb.model.dump') +#' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], model = bst) #' #' @export -xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tree = NULL, styles = NULL){ +#' +xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, n_first_tree = NULL, styles = NULL){ - if (!class(styles) %in% c("character", "NULL") | length(styles) > 1) { + if (!(class(styles) %in% c("character", "NULL") && length(styles) == 1)) { stop("style: Has to be a character vector of size 1.") } + + if (!class(model) %in% c("xgb.Booster", "NULL")) { + stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.") + } - allTrees <- xgb.model.dt.tree(feature_names, filename_dump, n_first_tree) + if(is.null(model)){ + allTrees <- xgb.model.dt.tree(feature_names = feature_names, filename_dump = filename_dump, n_first_tree = n_first_tree) + } else { + text = xgb.dump(model = model, with.stats = T) + allTrees <- xgb.model.dt.tree(feature_names = feature_names, text = text, n_first_tree = n_first_tree) + } allTrees[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "
Cover: ", Cover, "
Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")] diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd index 78be4b91b..c173b1e8e 100644 --- a/R-package/man/xgb.importance.Rd +++ b/R-package/man/xgb.importance.Rd @@ -4,12 +4,14 @@ \alias{xgb.importance} \title{Show importance of features in a model} \usage{ -xgb.importance(feature_names = NULL, filename_dump = NULL) +xgb.importance(feature_names = NULL, filename_dump = NULL, model = NULL) } \arguments{ \item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.} \item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).} + +\item{model}{generated by the \code{xgb.train} function. Avoid the creation of a dump file.} } \value{ A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. @@ -43,9 +45,8 @@ test <- agaricus.test bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2,objective = "binary:logistic") -xgb.dump(bst, 'xgb.model.dump', with.stats = T) #agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix. 
-xgb.importance(agaricus.test$data@Dimnames[[2]], 'xgb.model.dump') +xgb.importance(agaricus.test$data@Dimnames[[2]], model = bst) } diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd index ba65cdd7c..c1b8418cd 100644 --- a/R-package/man/xgb.plot.tree.Rd +++ b/R-package/man/xgb.plot.tree.Rd @@ -4,13 +4,15 @@ \alias{xgb.plot.tree} \title{Plot a boosted tree model} \usage{ -xgb.plot.tree(feature_names = NULL, filename_dump = NULL, +xgb.plot.tree(feature_names = NULL, filename_dump = NULL, model = NULL, n_first_tree = NULL, styles = NULL) } \arguments{ \item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.} -\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).} +\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).} + +\item{model}{generated by the \code{xgb.train} function. Avoid the creation of a dump file.} \item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.} @@ -44,9 +46,8 @@ train <- agaricus.train bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2,objective = "binary:logistic") -xgb.dump(bst, 'xgb.model.dump', with.stats = T) #agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix. 
-xgb.plot.tree(agaricus.train$data@Dimnames[[2]], 'xgb.model.dump') +xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst) } From 31a3b38ef85dd19daacd68429aa885546925ff25 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Sun, 11 Jan 2015 09:40:55 +0100 Subject: [PATCH 3/3] add new parameters model to avoid the use of dump file for functions plot, dt.tree, importance add new size parameter for plot function --- R-package/R/xgb.importance.R | 4 ++-- R-package/R/xgb.model.dt.tree.R | 28 +++++++++++++++++++--------- R-package/R/xgb.plot.tree.R | 19 ++++++++++--------- R-package/man/xgb.model.dt.tree.Rd | 10 +++++++--- R-package/man/xgb.plot.tree.Rd | 4 ++-- 5 files changed, 40 insertions(+), 25 deletions(-) diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index 7dd3a8ca3..174d92704 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -64,9 +64,9 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N } if(text[2] == "bias:"){ - result <- linearDump(feature_names, text) + result <- readLines(filename_dump) %>% linearDump(feature_names, .) } else { - result <- treeDump(feature_names, text) + result <- treeDump(feature_names, text = text) } result } diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index 3e0723c61..5ad6c6b3d 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -16,6 +16,8 @@ #' @importFrom stringr str_trim #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). +#' @param model a model of class \code{xgb.Booster} generated by the \code{xgb.train} function. Avoids the creation of a dump file. 
+#' @param text dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). #' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models. #' #' @return A \code{data.table} of the features used in the model with their gain, cover and few other thing. @@ -49,29 +51,37 @@ #' xgb.dump(bst, 'xgb.model.dump', with.stats = T) #' #' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix. -#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], 'xgb.model.dump') +#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], filename_dump = 'xgb.model.dump') #' #' @export -xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, text = NULL, n_first_tree = NULL){ +xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, text = NULL, n_first_tree = NULL){ if (!class(feature_names) %in% c("character", "NULL")) { stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. 
Look at this function documentation to see where to get feature names.") } - if (!class(filename_dump) %in% c("character", "NULL")) { - stop("filename_dump: Has to be a character vector representing the path to the model dump file.") - } else if (class(filename_dump) == "character" && !file.exists(filename_dump)) { + if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) { + stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.") + } else if (!is.null(filename_dump) && !file.exists(filename_dump)) { stop("filename_dump: path to the model doesn't exist.") - } else if(is.null(filename_dump) & is.null(text)){ - stop("filename_dump: no path and no string version of the model dump have been provided.") + } else if(is.null(filename_dump) && is.null(model) && is.null(text)){ + stop("filename_dump & model & text: no path to dump model, no model, no text dump, have been provided.") } - if (!class(text) %in% c("character", "NULL")) { + + if (!class(model) %in% c("xgb.Booster", "NULL")) { + stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.") + } + + if (!class(text) %in% c("character", "NULL")) { stop("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.") } + if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) { stop("n_first_tree: Has to be a numeric vector of size 1.") } - if(is.null(text)){ + if(!is.null(model)){ + text = xgb.dump(model = model, with.stats = T) + } else if(!is.null(filename_dump)){ text <- readLines(filename_dump) %>% str_trim(side = "both") } diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 7fb23c88a..01261fab3 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -20,7 +20,9 @@ #' @param filename_dump the path to the text file storing the model. 
Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument). #' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file. #' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models. -#' @param style a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information. +#' @param CSSstyle a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information. +#' @param width the width of the diagram in pixels. +#' @param height the height of the diagram in pixels. #' #' @return A \code{DiagrammeR} of the model. #' @@ -52,9 +54,9 @@ #' #' @export #' -xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, n_first_tree = NULL, styles = NULL){ +xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, n_first_tree = NULL, CSSstyle = NULL, width = NULL, height = NULL){ - if (!(class(styles) %in% c("character", "NULL") && length(styles) == 1)) { + if (!(class(CSSstyle) %in% c("character", "NULL") && length(CSSstyle) <= 1)) { stop("style: Has to be a character vector of size 1.") } @@ -65,8 +67,7 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NU if(is.null(model)){ allTrees <- xgb.model.dt.tree(feature_names = feature_names, filename_dump = filename_dump, n_first_tree = n_first_tree) } else { - text = xgb.dump(model = model, with.stats = T) - allTrees <- xgb.model.dt.tree(feature_names = feature_names, text = text, n_first_tree = n_first_tree) + allTrees <- xgb.model.dt.tree(feature_names = feature_names, model = model, 
n_first_tree = n_first_tree) } allTrees[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "
Cover: ", Cover, "
Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")] @@ -74,14 +75,14 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NU allTrees[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")] - if(is.null(styles)){ - styles <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px" + if(is.null(CSSstyle)){ + CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px" } yes <- allTrees[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "") no <- allTrees[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "") - path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(styles, yes, no, sep = ";") - DiagrammeR(path) + path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(CSSstyle, yes, no, sep = ";") + DiagrammeR(path, width, height) } diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index 2bc48c4d0..fb5bd94bd 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -4,14 +4,18 @@ \alias{xgb.model.dt.tree} \title{Convert tree model dump to data.table} \usage{ -xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL, text = NULL, - n_first_tree = NULL) +xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL, + model = NULL, text = NULL, n_first_tree = NULL) } \arguments{ \item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). 
If model dump already contains feature names, this argument should be \code{NULL}.} \item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).} +\item{model}{a model of class \code{xgb.Booster} generated by the \code{xgb.train} function. Avoids the creation of a dump file.} + +\item{text}{dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).} + \item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.} } \value{ @@ -49,6 +53,6 @@ bst <- xgboost(data = train$data, label = train$label, max.depth = 2, xgb.dump(bst, 'xgb.model.dump', with.stats = T) #agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix. -xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], 'xgb.model.dump') +xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], filename_dump = 'xgb.model.dump') } diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd index c1b8418cd..ce69d4431 100644 --- a/R-package/man/xgb.plot.tree.Rd +++ b/R-package/man/xgb.plot.tree.Rd @@ -5,7 +5,7 @@ \title{Plot a boosted tree model} \usage{ xgb.plot.tree(feature_names = NULL, filename_dump = NULL, model = NULL, - n_first_tree = NULL, styles = NULL) + n_first_tree = NULL, CSSstyle = NULL) } \arguments{ \item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.} @@ -16,7 +16,7 @@ xgb.plot.tree(feature_names = NULL, filename_dump = NULL, model = NULL, \item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. 
Performance can be low for huge models.} -\item{style}{a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.} +\item{CSSstyle}{a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.} } \value{ A \code{DiagrammeR} of the model.