Polishing API + wording in function description #Rstat

2015-11-30 10:22:14 +01:00
parent 28060d5595
commit 07d62a4b89
8 changed files with 41 additions and 77 deletions
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -66,16 +66,12 @@
 #' xgb.importance(train$data@@Dimnames[[2]], model = bst, data = train$data, label = train$label)
 #' 
 #' @export
-xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ( (x + label) == 2)){
+xgb.importance <- function(feature_names = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ( (x + label) == 2)){
  if (!class(feature_names) %in% c("character", "NULL")) {
-    stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
+    stop("feature_names: Has to be a vector of character or NULL if the model already contains feature name. Look at this function documentation to see where to get feature names.")
  }

-  if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
-    stop("filename_dump: Has to be a path to the model dump file.")
-  }
-
-  if (!class(model) %in% c("xgb.Booster", "NULL")) {
+  if (class(model) != "xgb.Booster") {
    stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
  }

@@ -87,12 +83,8 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N
    if(sum(label == 0) / length(label) > 0.5) label <- as(label, "sparseVector")
  }

-  if(is.null(model)){
-    text <- readLines(filename_dump)
-  } else {
-    text <- xgb.dump(model = model, with.stats = T)
-  }
-
+  text <- xgb.dump(model = model, with.stats = T)
+  
  if(text[2] == "bias:"){
    result <- readLines(filename_dump) %>% linearDump(feature_names, .)
    if(!is.null(data) | !is.null(label)) warning("data/label: these parameters should only be provided with decision tree based models.")
--- a/R-package/R/xgb.model.dt.tree.R
+++ b/R-package/R/xgb.model.dt.tree.R
@@ -14,7 +14,6 @@
 #' @importFrom stringr str_split
 #' @importFrom stringr str_trim
 #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
-#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
 #' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.
 #' @param text dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
 #' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
@@ -54,20 +53,13 @@
 #' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], model = bst)
 #' 
 #' @export
-xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, text = NULL, n_first_tree = NULL){
+xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, n_first_tree = NULL){

  if (!class(feature_names) %in% c("character", "NULL")) {
    stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
  }
-  if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
-    stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
-  } else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
-    stop("filename_dump: path to the model doesn't exist.")
-  } else if(is.null(filename_dump) && is.null(model) && is.null(text)){
-    stop("filename_dump & model & text: no path to dump model, no model, no text dump, have been provided.")
-  }

-  if (!class(model) %in% c("xgb.Booster", "NULL")) {
+  if (class(model) != "xgb.Booster") {
    stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
  }

@@ -79,12 +71,8 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
    stop("n_first_tree: Has to be a numeric vector of size 1.")
  }

-  if(!is.null(model)){
-    text <- xgb.dump(model = model, with.stats = T)
-  } else if(!is.null(filename_dump)){
-    text <- readLines(filename_dump) %>% str_trim(side = "both")
-  }
-
+  text <- xgb.dump(model = model, with.stats = T)
+  
  position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text) + 1)

  extract <- function(x, pattern)  str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist
--- a/R-package/R/xgb.plot.deepness.R
+++ b/R-package/R/xgb.plot.deepness.R
@@ -69,7 +69,6 @@ get.paths.to.leaf <- function(dt.tree) {
 #' @importFrom data.table setnames
 #' @importFrom data.table :=
 #' @importFrom magrittr %>%
-#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
 #' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.
 #'
 #' @return Two graphs showing the distribution of the model deepness.
@@ -77,7 +76,7 @@ get.paths.to.leaf <- function(dt.tree) {
 #' @details
 #' Display both the number of \code{leaf} and the distribution of \code{weighted observations}
 #' by tree deepness level.
-#' The purpose of this function is to help the user to find the best trad-off to set
+#' The purpose of this function is to help the user to find the best trade-off to set
 #' the \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.
 #' 
 #' See \link{xgb.train} for more information about these parameters.
@@ -89,7 +88,7 @@ get.paths.to.leaf <- function(dt.tree) {
 #'  \item Weighted cover: noramlized weighted cover per Leaf (weighted number of instances).
 #' }
 #'
-#' This function is very inspired from this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
+#' This function is inspired by this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
@@ -101,7 +100,7 @@ get.paths.to.leaf <- function(dt.tree) {
 #' xgb.plot.deepness(model = bst)
 #'
 #' @export
-xgb.plot.deepness <- function(filename_dump = NULL, model = NULL) {
+xgb.plot.deepness <- function(model = NULL) {
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    stop("ggplot2 package is required for plotting the graph deepness.",
         call. = FALSE)
@@ -117,23 +116,11 @@ xgb.plot.deepness <- function(filename_dump = NULL, model = NULL) {
         call. = FALSE)
  }
  
-  if (!class(model) %in% c("xgb.Booster", "NULL")) {
+  if (class(model) != "xgb.Booster") {
    stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
  }
  
-  if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
-    stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
-  } else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
-    stop("filename_dump: path to the model doesn't exist.")
-  } else if(is.null(filename_dump) && is.null(model) && is.null(text)){
-    stop("filename_dump & model & text: no path to dump model, no model, no text dump, have been provided.")
-  }
-  
-  if(!is.null(model)){
-    dt.tree <- xgb.model.dt.tree(model = model)
-  } else if(!is.null(filename_dump)){
-    dt.tree <- xgb.model.dt.tree(filename_dump = filename_dump)
-  }
+  dt.tree <- xgb.model.dt.tree(model = model)
  
  dt.edge.elements <- data.table()
  paths <- get.paths.to.leaf(dt.tree)
--- a/R-package/R/xgb.plot.multi.trees.R
+++ b/R-package/R/xgb.plot.multi.trees.R
@@ -10,9 +10,8 @@
 #' @importFrom stringr str_detect
 #' @importFrom stringr str_extract
 #' 
-#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
 #' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.
-#' @param features.keep number of features to keep in each position of the multi tree.
+#' @param features.keep number of features to keep in each position of the multi trees.
 #' @param plot.width width in pixels of the graph to produce
 #' @param plot.height height in pixels of the graph to produce
 #' 
@@ -20,21 +19,23 @@
 #' 
 #' @details
 #' 
-#' This function tries to capture the complexity of gradient boosted tree ensembles 
+#' This function tries to capture the complexity of a gradient boosted tree ensemble 
 #' in a cohesive way. 
+#' 
 #' The goal is to improve the interpretability of the model generally seen as black box.
 #' The function is dedicated to boosting applied to decision trees only.
 #' 
 #' The purpose is to move from an ensemble of trees to a single tree only.
+#' 
 #' It takes advantage of the fact that the shape of a binary tree is only defined by 
-#' its deepness.
-#' Therefore in a boosting model, all trees have the same shape. 
+#' its deepness (therefore in a boosting model, all trees have the same shape). 
+#' 
 #' Moreover, the trees tend to reuse the same features.
 #' 
-#' The function will project each trees on one, and keep for each position the 
-#' \code{features.keep} first features (based on Gain per feature).
+#' The function will project each tree onto a single one, and keep for each position the 
+#' \code{features.keep} first features (based on the Gain per feature measure).
 #' 
-#' This function is inspired from this blog post:
+#' This function is inspired by this blog post:
 #' \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/}
 #'
 #' @examples