Polishing API + wording in function description #Rstat

2015-11-30 10:22:14 +01:00
parent 28060d5595
commit 07d62a4b89
8 changed files with 41 additions and 77 deletions
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -66,16 +66,12 @@
 #' xgb.importance(train$data@@Dimnames[[2]], model = bst, data = train$data, label = train$label)
 #' 
 #' @export
-xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ( (x + label) == 2)){
+xgb.importance <- function(feature_names = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ( (x + label) == 2)){
  if (!class(feature_names) %in% c("character", "NULL")) {
-    stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
+    stop("feature_names: Has to be a vector of character or NULL if the model already contains feature name. Look at this function documentation to see where to get feature names.")
  }

-  if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
-    stop("filename_dump: Has to be a path to the model dump file.")
-  }
-
-  if (!class(model) %in% c("xgb.Booster", "NULL")) {
+  if (class(model) != "xgb.Booster") {
    stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
  }

@@ -87,12 +83,8 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N
    if(sum(label == 0) / length(label) > 0.5) label <- as(label, "sparseVector")
  }

-  if(is.null(model)){
-    text <- readLines(filename_dump)
-  } else {
-    text <- xgb.dump(model = model, with.stats = T)
-  }
-
+  text <- xgb.dump(model = model, with.stats = T)
+  
  if(text[2] == "bias:"){
    result <- readLines(filename_dump) %>% linearDump(feature_names, .)
    if(!is.null(data) | !is.null(label)) warning("data/label: these parameters should only be provided with decision tree based models.")
--- a/R-package/R/xgb.model.dt.tree.R
+++ b/R-package/R/xgb.model.dt.tree.R
@@ -14,7 +14,6 @@
 #' @importFrom stringr str_split
 #' @importFrom stringr str_trim
 #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
-#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
 #' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.
 #' @param text dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
 #' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
@@ -54,20 +53,13 @@
 #' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], model = bst)
 #' 
 #' @export
-xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, text = NULL, n_first_tree = NULL){
+xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, n_first_tree = NULL){

  if (!class(feature_names) %in% c("character", "NULL")) {
    stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
  }
-  if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
-    stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
-  } else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
-    stop("filename_dump: path to the model doesn't exist.")
-  } else if(is.null(filename_dump) && is.null(model) && is.null(text)){
-    stop("filename_dump & model & text: no path to dump model, no model, no text dump, have been provided.")
-  }

-  if (!class(model) %in% c("xgb.Booster", "NULL")) {
+  if (class(model) != "xgb.Booster") {
    stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
  }

@@ -79,12 +71,8 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
    stop("n_first_tree: Has to be a numeric vector of size 1.")
  }

-  if(!is.null(model)){
-    text <- xgb.dump(model = model, with.stats = T)
-  } else if(!is.null(filename_dump)){
-    text <- readLines(filename_dump) %>% str_trim(side = "both")
-  }
-
+  text <- xgb.dump(model = model, with.stats = T)
+  
  position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text) + 1)

  extract <- function(x, pattern)  str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist
--- a/R-package/R/xgb.plot.deepness.R
+++ b/R-package/R/xgb.plot.deepness.R
@@ -69,7 +69,6 @@ get.paths.to.leaf <- function(dt.tree) {
 #' @importFrom data.table setnames
 #' @importFrom data.table :=
 #' @importFrom magrittr %>%
-#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
 #' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.
 #'
 #' @return Two graphs showing the distribution of the model deepness.
@@ -77,7 +76,7 @@ get.paths.to.leaf <- function(dt.tree) {
 #' @details
 #' Display both the number of \code{leaf} and the distribution of \code{weighted observations}
 #' by tree deepness level.
-#' The purpose of this function is to help the user to find the best trad-off to set
+#' The purpose of this function is to help the user to find the best trade-off to set
 #' the \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.
 #' 
 #' See \link{xgb.train} for more information about these parameters.
@@ -89,7 +88,7 @@ get.paths.to.leaf <- function(dt.tree) {
 #'  \item Weighted cover: noramlized weighted cover per Leaf (weighted number of instances).
 #' }
 #'
-#' This function is very inspired from this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
+#' This function is inspired by this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
@@ -101,7 +100,7 @@ get.paths.to.leaf <- function(dt.tree) {
 #' xgb.plot.deepness(model = bst)
 #'
 #' @export
-xgb.plot.deepness <- function(filename_dump = NULL, model = NULL) {
+xgb.plot.deepness <- function(model = NULL) {
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    stop("ggplot2 package is required for plotting the graph deepness.",
         call. = FALSE)
@@ -117,23 +116,11 @@ xgb.plot.deepness <- function(filename_dump = NULL, model = NULL) {
         call. = FALSE)
  }
  
-  if (!class(model) %in% c("xgb.Booster", "NULL")) {
+  if (class(model) != "xgb.Booster") {
    stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
  }
  
-  if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
-    stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
-  } else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
-    stop("filename_dump: path to the model doesn't exist.")
-  } else if(is.null(filename_dump) && is.null(model) && is.null(text)){
-    stop("filename_dump & model & text: no path to dump model, no model, no text dump, have been provided.")
-  }
-  
-  if(!is.null(model)){
-    dt.tree <- xgb.model.dt.tree(model = model)
-  } else if(!is.null(filename_dump)){
-    dt.tree <- xgb.model.dt.tree(filename_dump = filename_dump)
-  }
+  dt.tree <- xgb.model.dt.tree(model = model)
  
  dt.edge.elements <- data.table()
  paths <- get.paths.to.leaf(dt.tree)
--- a/R-package/R/xgb.plot.multi.trees.R
+++ b/R-package/R/xgb.plot.multi.trees.R
@@ -10,9 +10,8 @@
 #' @importFrom stringr str_detect
 #' @importFrom stringr str_extract
 #' 
-#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
 #' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.
-#' @param features.keep number of features to keep in each position of the multi tree.
+#' @param features.keep number of features to keep in each position of the multi trees.
 #' @param plot.width width in pixels of the graph to produce
 #' @param plot.height height in pixels of the graph to produce
 #' 
@@ -20,21 +19,23 @@
 #' 
 #' @details
 #' 
-#' This function tries to capture the complexity of gradient boosted tree ensembles 
+#' This function tries to capture the complexity of a gradient boosted tree ensemble 
 #' in a cohesive way. 
+#' 
 #' The goal is to improve the interpretability of the model generally seen as black box.
 #' The function is dedicated to boosting applied to decision trees only.
 #' 
 #' The purpose is to move from an ensemble of trees to a single tree only.
+#' 
 #' It takes advantage of the fact that the shape of a binary tree is only defined by 
-#' its deepness.
-#' Therefore in a boosting model, all trees have the same shape. 
+#' its deepness (therefore in a boosting model, all trees have the same shape). 
+#' 
 #' Moreover, the trees tend to reuse the same features.
 #' 
-#' The function will project each trees on one, and keep for each position the 
-#' \code{features.keep} first features (based on Gain per feature).
+#' The function will project each tree onto a single one, and keep for each position the 
+#' first \code{features.keep} features (based on the Gain per feature measure).
 #' 
-#' This function is inspired from this blog post:
+#' This function is inspired by this blog post:
 #' \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/}
 #'
 #' @examples
--- a/R-package/man/xgb.importance.Rd
+++ b/R-package/man/xgb.importance.Rd
@@ -4,14 +4,12 @@
 \alias{xgb.importance}
 \title{Show importance of features in a model}
 \usage{
-xgb.importance(feature_names = NULL, filename_dump = NULL, model = NULL,
-  data = NULL, label = NULL, target = function(x) ((x + label) == 2))
+xgb.importance(feature_names = NULL, model = NULL, data = NULL,
+  label = NULL, target = function(x) ((x + label) == 2))
 }
 \arguments{
 \item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}

-\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).}
-
 \item{model}{generated by the \code{xgb.train} function. Avoid the creation of a dump file.}

 \item{data}{the dataset used for the training step. Will be used with \code{label} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.}
@@ -19,6 +17,8 @@ xgb.importance(feature_names = NULL, filename_dump = NULL, model = NULL,
 \item{label}{the label vetor used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.}

 \item{target}{a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurences in a binary classification. The \code{target} function should have only one parameter. This parameter will be used to provide each important feature vector after having applied the split condition, therefore these vector will be only made of 0 and 1 only, whatever was the information before. More information in \code{Detail} part. This parameter is optional.}
+
+\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).}
 }
 \value{
 A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
--- a/R-package/man/xgb.model.dt.tree.Rd
+++ b/R-package/man/xgb.model.dt.tree.Rd
@@ -4,14 +4,12 @@
 \alias{xgb.model.dt.tree}
 \title{Convert tree model dump to data.table}
 \usage{
-xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL,
-  model = NULL, text = NULL, n_first_tree = NULL)
+xgb.model.dt.tree(feature_names = NULL, model = NULL, text = NULL,
+  n_first_tree = NULL)
 }
 \arguments{
 \item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}

-\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
-
 \item{model}{dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.}

 \item{text}{dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
--- a/R-package/man/xgb.plot.deepness.Rd
+++ b/R-package/man/xgb.plot.deepness.Rd
@@ -4,11 +4,9 @@
 \alias{xgb.plot.deepness}
 \title{Plot model trees deepness}
 \usage{
-xgb.plot.deepness(filename_dump = NULL, model = NULL)
+xgb.plot.deepness(model = NULL)
 }
 \arguments{
-\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
-
 \item{model}{dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.}
 }
 \value{
@@ -20,7 +18,7 @@ Generate a graph to plot the distribution of deepness among trees.
 \details{
 Display both the number of \code{leaf} and the distribution of \code{weighted observations}
 by tree deepness level.
-The purpose of this function is to help the user to find the best trad-off to set
+The purpose of this function is to help the user to find the best trade-off to set
 the \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.

 See \link{xgb.train} for more information about these parameters.
@@ -32,7 +30,7 @@ The graph is made of two parts:
 \item Weighted cover: noramlized weighted cover per Leaf (weighted number of instances).
 }

-This function is very inspired from this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
+This function is inspired by this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
 }
 \examples{
 data(agaricus.train, package='xgboost')
--- a/R-package/man/xgb.plot.multi.trees.Rd
+++ b/R-package/man/xgb.plot.multi.trees.Rd
@@ -10,13 +10,11 @@ xgb.plot.multi.trees(model, names, features.keep = 5, plot.width = NULL,
 \arguments{
 \item{model}{dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.}

-\item{features.keep}{number of features to keep in each position of the multi tree.}
+\item{features.keep}{number of features to keep in each position of the multi trees.}

 \item{plot.width}{width in pixels of the graph to produce}

 \item{plot.height}{height in pixels of the graph to produce}
-
-\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
 }
 \value{
 Two graphs showing the distribution of the model deepness.
@@ -25,21 +23,23 @@ Two graphs showing the distribution of the model deepness.
 Visualization of the ensemble of trees as a single collective unit.
 }
 \details{
-This function tries to capture the complexity of gradient boosted tree ensembles 
+This function tries to capture the complexity of a gradient boosted tree ensemble 
 in a cohesive way. 
+
 The goal is to improve the interpretability of the model generally seen as black box.
 The function is dedicated to boosting applied to decision trees only.

 The purpose is to move from an ensemble of trees to a single tree only.
+
 It takes advantage of the fact that the shape of a binary tree is only defined by 
-its deepness.
-Therefore in a boosting model, all trees have the same shape. 
+its deepness (therefore in a boosting model, all trees have the same shape). 
+
 Moreover, the trees tend to reuse the same features.

-The function will project each trees on one, and keep for each position the 
-\code{features.keep} first features (based on Gain per feature).
+The function will project each tree onto a single one, and keep for each position the 
+first \code{features.keep} features (based on the Gain per feature measure).

-This function is inspired from this blog post:
+This function is inspired by this blog post:
 \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/}
 }
 \examples{