Polishing API + wording in function description #Rstat
This commit is contained in:
@@ -66,16 +66,12 @@
|
||||
#' xgb.importance(train$data@@Dimnames[[2]], model = bst, data = train$data, label = train$label)
|
||||
#'
|
||||
#' @export
|
||||
xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ( (x + label) == 2)){
|
||||
xgb.importance <- function(feature_names = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ( (x + label) == 2)){
|
||||
if (!class(feature_names) %in% c("character", "NULL")) {
|
||||
stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
|
||||
stop("feature_names: Has to be a vector of character or NULL if the model already contains feature name. Look at this function documentation to see where to get feature names.")
|
||||
}
|
||||
|
||||
if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
|
||||
stop("filename_dump: Has to be a path to the model dump file.")
|
||||
}
|
||||
|
||||
if (!class(model) %in% c("xgb.Booster", "NULL")) {
|
||||
if (class(model) != "xgb.Booster") {
|
||||
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
|
||||
}
|
||||
|
||||
@@ -87,12 +83,8 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N
|
||||
if(sum(label == 0) / length(label) > 0.5) label <- as(label, "sparseVector")
|
||||
}
|
||||
|
||||
if(is.null(model)){
|
||||
text <- readLines(filename_dump)
|
||||
} else {
|
||||
text <- xgb.dump(model = model, with.stats = T)
|
||||
}
|
||||
|
||||
text <- xgb.dump(model = model, with.stats = T)
|
||||
|
||||
if(text[2] == "bias:"){
|
||||
result <- readLines(filename_dump) %>% linearDump(feature_names, .)
|
||||
if(!is.null(data) | !is.null(label)) warning("data/label: these parameters should only be provided with decision tree based models.")
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
#' @importFrom stringr str_split
|
||||
#' @importFrom stringr str_trim
|
||||
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
|
||||
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
|
||||
#' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.
|
||||
#' @param text dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
|
||||
#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
|
||||
@@ -54,20 +53,13 @@
|
||||
#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], model = bst)
|
||||
#'
|
||||
#' @export
|
||||
xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, text = NULL, n_first_tree = NULL){
|
||||
xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, n_first_tree = NULL){
|
||||
|
||||
if (!class(feature_names) %in% c("character", "NULL")) {
|
||||
stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
|
||||
}
|
||||
if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
|
||||
stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
|
||||
} else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
|
||||
stop("filename_dump: path to the model doesn't exist.")
|
||||
} else if(is.null(filename_dump) && is.null(model) && is.null(text)){
|
||||
stop("filename_dump & model & text: no path to dump model, no model, no text dump, have been provided.")
|
||||
}
|
||||
|
||||
if (!class(model) %in% c("xgb.Booster", "NULL")) {
|
||||
if (class(model) != "xgb.Booster") {
|
||||
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
|
||||
}
|
||||
|
||||
@@ -79,12 +71,8 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
|
||||
stop("n_first_tree: Has to be a numeric vector of size 1.")
|
||||
}
|
||||
|
||||
if(!is.null(model)){
|
||||
text <- xgb.dump(model = model, with.stats = T)
|
||||
} else if(!is.null(filename_dump)){
|
||||
text <- readLines(filename_dump) %>% str_trim(side = "both")
|
||||
}
|
||||
|
||||
text <- xgb.dump(model = model, with.stats = T)
|
||||
|
||||
position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text) + 1)
|
||||
|
||||
extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist
|
||||
|
||||
@@ -69,7 +69,6 @@ get.paths.to.leaf <- function(dt.tree) {
|
||||
#' @importFrom data.table setnames
|
||||
#' @importFrom data.table :=
|
||||
#' @importFrom magrittr %>%
|
||||
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
|
||||
#' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.
|
||||
#'
|
||||
#' @return Two graphs showing the distribution of the model deepness.
|
||||
@@ -77,7 +76,7 @@ get.paths.to.leaf <- function(dt.tree) {
|
||||
#' @details
|
||||
#' Display both the number of \code{leaf} and the distribution of \code{weighted observations}
|
||||
#' by tree deepness level.
|
||||
#' The purpose of this function is to help the user to find the best trad-off to set
|
||||
#' The purpose of this function is to help the user to find the best trade-off to set
|
||||
#' the \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.
|
||||
#'
|
||||
#' See \link{xgb.train} for more information about these parameters.
|
||||
@@ -89,7 +88,7 @@ get.paths.to.leaf <- function(dt.tree) {
|
||||
#' \item Weighted cover: noramlized weighted cover per Leaf (weighted number of instances).
|
||||
#' }
|
||||
#'
|
||||
#' This function is very inspired from this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
|
||||
#' This function is inspired by this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
|
||||
#'
|
||||
#' @examples
|
||||
#' data(agaricus.train, package='xgboost')
|
||||
@@ -101,7 +100,7 @@ get.paths.to.leaf <- function(dt.tree) {
|
||||
#' xgb.plot.deepness(model = bst)
|
||||
#'
|
||||
#' @export
|
||||
xgb.plot.deepness <- function(filename_dump = NULL, model = NULL) {
|
||||
xgb.plot.deepness <- function(model = NULL) {
|
||||
if (!requireNamespace("ggplot2", quietly = TRUE)) {
|
||||
stop("ggplot2 package is required for plotting the graph deepness.",
|
||||
call. = FALSE)
|
||||
@@ -117,23 +116,11 @@ xgb.plot.deepness <- function(filename_dump = NULL, model = NULL) {
|
||||
call. = FALSE)
|
||||
}
|
||||
|
||||
if (!class(model) %in% c("xgb.Booster", "NULL")) {
|
||||
if (class(model) != "xgb.Booster") {
|
||||
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
|
||||
}
|
||||
|
||||
if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
|
||||
stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
|
||||
} else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
|
||||
stop("filename_dump: path to the model doesn't exist.")
|
||||
} else if(is.null(filename_dump) && is.null(model) && is.null(text)){
|
||||
stop("filename_dump & model & text: no path to dump model, no model, no text dump, have been provided.")
|
||||
}
|
||||
|
||||
if(!is.null(model)){
|
||||
dt.tree <- xgb.model.dt.tree(model = model)
|
||||
} else if(!is.null(filename_dump)){
|
||||
dt.tree <- xgb.model.dt.tree(filename_dump = filename_dump)
|
||||
}
|
||||
dt.tree <- xgb.model.dt.tree(model = model)
|
||||
|
||||
dt.edge.elements <- data.table()
|
||||
paths <- get.paths.to.leaf(dt.tree)
|
||||
|
||||
@@ -10,9 +10,8 @@
|
||||
#' @importFrom stringr str_detect
|
||||
#' @importFrom stringr str_extract
|
||||
#'
|
||||
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
|
||||
#' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.
|
||||
#' @param features.keep number of features to keep in each position of the multi tree.
|
||||
#' @param features.keep number of features to keep in each position of the multi trees.
|
||||
#' @param plot.width width in pixels of the graph to produce
|
||||
#' @param plot.height height in pixels of the graph to produce
|
||||
#'
|
||||
@@ -20,21 +19,23 @@
|
||||
#'
|
||||
#' @details
|
||||
#'
|
||||
#' This function tries to capture the complexity of gradient boosted tree ensembles
|
||||
#' This function tries to capture the complexity of gradient boosted tree ensemble
|
||||
#' in a cohesive way.
|
||||
#'
|
||||
#' The goal is to improve the interpretability of the model generally seen as black box.
|
||||
#' The function is dedicated to boosting applied to decision trees only.
|
||||
#'
|
||||
#' The purpose is to move from an ensemble of trees to a single tree only.
|
||||
#'
|
||||
#' It takes advantage of the fact that the shape of a binary tree is only defined by
|
||||
#' its deepness.
|
||||
#' Therefore in a boosting model, all trees have the same shape.
|
||||
#' its deepness (therefore in a boosting model, all trees have the same shape).
|
||||
#'
|
||||
#' Moreover, the trees tend to reuse the same features.
|
||||
#'
|
||||
#' The function will project each trees on one, and keep for each position the
|
||||
#' \code{features.keep} first features (based on Gain per feature).
|
||||
#' The function will project each tree on one, and keep for each position the
|
||||
#' \code{features.keep} first features (based on Gain per feature measure).
|
||||
#'
|
||||
#' This function is inspired from this blog post:
|
||||
#' This function is inspired by this blog post:
|
||||
#' \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/}
|
||||
#'
|
||||
#' @examples
|
||||
|
||||
Reference in New Issue
Block a user