Polishing API + wording in function description #Rstat

This commit is contained in:
pommedeterresautee
2015-11-30 10:22:14 +01:00
parent 28060d5595
commit 07d62a4b89
8 changed files with 41 additions and 77 deletions

View File

@@ -66,16 +66,12 @@
#' xgb.importance(train$data@@Dimnames[[2]], model = bst, data = train$data, label = train$label)
#'
#' @export
xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ( (x + label) == 2)){
xgb.importance <- function(feature_names = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ( (x + label) == 2)){
if (!class(feature_names) %in% c("character", "NULL")) {
stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
stop("feature_names: Has to be a vector of character or NULL if the model already contains feature name. Look at this function documentation to see where to get feature names.")
}
if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
stop("filename_dump: Has to be a path to the model dump file.")
}
if (!class(model) %in% c("xgb.Booster", "NULL")) {
if (class(model) != "xgb.Booster") {
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
}
@@ -87,12 +83,8 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N
if(sum(label == 0) / length(label) > 0.5) label <- as(label, "sparseVector")
}
if(is.null(model)){
text <- readLines(filename_dump)
} else {
text <- xgb.dump(model = model, with.stats = T)
}
text <- xgb.dump(model = model, with.stats = T)
if(text[2] == "bias:"){
result <- readLines(filename_dump) %>% linearDump(feature_names, .)
if(!is.null(data) | !is.null(label)) warning("data/label: these parameters should only be provided with decision tree based models.")

View File

@@ -14,7 +14,6 @@
#' @importFrom stringr str_split
#' @importFrom stringr str_trim
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param model object of class \code{xgb.Booster} generated by the \code{xgb.train} function. Avoids the creation of a dump file.
#' @param text dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
@@ -54,20 +53,13 @@
#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], model = bst)
#'
#' @export
xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, text = NULL, n_first_tree = NULL){
xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, n_first_tree = NULL){
if (!class(feature_names) %in% c("character", "NULL")) {
stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
}
if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
} else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
stop("filename_dump: path to the model doesn't exist.")
} else if(is.null(filename_dump) && is.null(model) && is.null(text)){
stop("filename_dump & model & text: no path to dump model, no model, no text dump, have been provided.")
}
if (!class(model) %in% c("xgb.Booster", "NULL")) {
if (class(model) != "xgb.Booster") {
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
}
@@ -79,12 +71,8 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
stop("n_first_tree: Has to be a numeric vector of size 1.")
}
if(!is.null(model)){
text <- xgb.dump(model = model, with.stats = T)
} else if(!is.null(filename_dump)){
text <- readLines(filename_dump) %>% str_trim(side = "both")
}
text <- xgb.dump(model = model, with.stats = T)
position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text) + 1)
extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist

View File

@@ -69,7 +69,6 @@ get.paths.to.leaf <- function(dt.tree) {
#' @importFrom data.table setnames
#' @importFrom data.table :=
#' @importFrom magrittr %>%
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param model object of class \code{xgb.Booster} generated by the \code{xgb.train} function. Avoids the creation of a dump file.
#'
#' @return Two graphs showing the distribution of the model deepness.
@@ -77,7 +76,7 @@ get.paths.to.leaf <- function(dt.tree) {
#' @details
#' Display both the number of \code{leaf} and the distribution of \code{weighted observations}
#' by tree deepness level.
#' The purpose of this function is to help the user to find the best trad-off to set
#' The purpose of this function is to help the user to find the best trade-off to set
#' the \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.
#'
#' See \link{xgb.train} for more information about these parameters.
@@ -89,7 +88,7 @@ get.paths.to.leaf <- function(dt.tree) {
#' \item Weighted cover: normalized weighted cover per Leaf (weighted number of instances).
#' }
#'
#' This function is very inspired from this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
#' This function is inspired by this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
#'
#' @examples
#' data(agaricus.train, package='xgboost')
@@ -101,7 +100,7 @@ get.paths.to.leaf <- function(dt.tree) {
#' xgb.plot.deepness(model = bst)
#'
#' @export
xgb.plot.deepness <- function(filename_dump = NULL, model = NULL) {
xgb.plot.deepness <- function(model = NULL) {
if (!requireNamespace("ggplot2", quietly = TRUE)) {
stop("ggplot2 package is required for plotting the graph deepness.",
call. = FALSE)
@@ -117,23 +116,11 @@ xgb.plot.deepness <- function(filename_dump = NULL, model = NULL) {
call. = FALSE)
}
if (!class(model) %in% c("xgb.Booster", "NULL")) {
if (class(model) != "xgb.Booster") {
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
}
if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
} else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
stop("filename_dump: path to the model doesn't exist.")
} else if(is.null(filename_dump) && is.null(model) && is.null(text)){
stop("filename_dump & model & text: no path to dump model, no model, no text dump, have been provided.")
}
if(!is.null(model)){
dt.tree <- xgb.model.dt.tree(model = model)
} else if(!is.null(filename_dump)){
dt.tree <- xgb.model.dt.tree(filename_dump = filename_dump)
}
dt.tree <- xgb.model.dt.tree(model = model)
dt.edge.elements <- data.table()
paths <- get.paths.to.leaf(dt.tree)

View File

@@ -10,9 +10,8 @@
#' @importFrom stringr str_detect
#' @importFrom stringr str_extract
#'
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param model object of class \code{xgb.Booster} generated by the \code{xgb.train} function. Avoids the creation of a dump file.
#' @param features.keep number of features to keep in each position of the multi tree.
#' @param features.keep number of features to keep in each position of the multi trees.
#' @param plot.width width in pixels of the graph to produce
#' @param plot.height height in pixels of the graph to produce
#'
@@ -20,21 +19,23 @@
#'
#' @details
#'
#' This function tries to capture the complexity of gradient boosted tree ensembles
#' This function tries to capture the complexity of a gradient boosted tree ensemble
#' in a cohesive way.
#'
#' The goal is to improve the interpretability of a model generally seen as a black box.
#' The function is dedicated to boosting applied to decision trees only.
#'
#' The purpose is to move from an ensemble of trees to a single tree only.
#'
#' It takes advantage of the fact that the shape of a binary tree is only defined by
#' its deepness.
#' Therefore in a boosting model, all trees have the same shape.
#' its deepness (therefore in a boosting model, all trees have the same shape).
#'
#' Moreover, the trees tend to reuse the same features.
#'
#' The function will project each trees on one, and keep for each position the
#' \code{features.keep} first features (based on Gain per feature).
#' The function will project each tree onto a single one, and keep for each position the
#' first \code{features.keep} features (based on the Gain per feature measure).
#'
#' This function is inspired from this blog post:
#' This function is inspired by this blog post:
#' \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/}
#'
#' @examples