Merge pull request #660 from pommedeterresautee/master
Polishing API + wording in function description #Rstat
commit 423764ca2e
@@ -12,15 +12,9 @@
#' @importFrom Matrix sparseVector
#'
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model dump already contains feature names, this argument should be \code{NULL}.
#'
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).
#'
#' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file.
#'
#' @param data the dataset used for the training step. Will be used with the \code{label} parameter for co-occurrence computation. More information in the \code{Detail} part. This parameter is optional.
#'
#' @param label the label vector used for the training step. Will be used with the \code{data} parameter for co-occurrence computation. More information in the \code{Detail} part. This parameter is optional.
#'
#' @param target a function which returns \code{TRUE} or \code{1} when an observation should be counted as a co-occurrence and \code{FALSE} or \code{0} otherwise. A default function is provided for computing co-occurrences in a binary classification. The \code{target} function should have only one parameter. This parameter will be used to provide each important feature vector after the split condition has been applied, therefore these vectors will be made of 0 and 1 only, whatever the information was before. More information in the \code{Detail} part. This parameter is optional.
#'
#' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree models) in the model.
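For illustration only (an assumption, not part of this commit): a sketch of a custom target function that counts a co-occurrence when the split condition is met while the label belongs to the negative class, assuming the same binary labels as the default in the signature below.

# Hypothetical custom target: split condition met (x == 1) and label == 0.
# Like the default, it reads `label` from the calling environment.
negative_target <- function(x) x == 1 & label == 0
# xgb.importance(feature_names, model = bst, data = train$data,
#                label = train$label, target = negative_target)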
@@ -54,28 +48,23 @@
#' # Both datasets are lists with two items, a sparse matrix and labels
#' # (labels = outcome column which will be learned).
#' # Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
#'                eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
#'
#' # train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.importance(train$data@@Dimnames[[2]], model = bst)
#' # agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.importance(agaricus.train$data@@Dimnames[[2]], model = bst)
#'
#' # Same thing with co-occurrence computation this time
#' xgb.importance(train$data@@Dimnames[[2]], model = bst, data = train$data, label = train$label)
#' xgb.importance(agaricus.train$data@@Dimnames[[2]], model = bst, data = agaricus.train$data, label = agaricus.train$label)
#'
#' @export
xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ( (x + label) == 2)){
xgb.importance <- function(feature_names = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ( (x + label) == 2)){
  if (!class(feature_names) %in% c("character", "NULL")) {
    stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature names. Look at this function documentation to see where to get feature names.")
    stop("feature_names: Has to be a vector of character or NULL if the model already contains feature names. Look at this function documentation to see where to get feature names.")
  }

  if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
    stop("filename_dump: Has to be a path to the model dump file.")
  }

  if (!class(model) %in% c("xgb.Booster", "NULL")) {
  if (class(model) != "xgb.Booster") {
    stop("model: Has to be an object of class xgb.Booster, as generated by the xgb.train function.")
  }

@@ -86,18 +75,23 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N
  if(class(label) == "numeric"){
    if(sum(label == 0) / length(label) > 0.5) label <- as(label, "sparseVector")
  }

  if(is.null(model)){
    text <- readLines(filename_dump)
  } else {
    text <- xgb.dump(model = model, with.stats = T)

  treeDump <- function(feature_names, text, keepDetail){
    if(keepDetail) groupBy <- c("Feature", "Split", "MissingNo") else groupBy <- "Feature"
    xgb.model.dt.tree(feature_names = feature_names, text = text)[, "MissingNo" := Missing == No][Feature != "Leaf", .(Gain = sum(Quality), Cover = sum(Cover), Frequency = .N), by = groupBy, with = T][, `:=`(Gain = Gain / sum(Gain), Cover = Cover / sum(Cover), Frequency = Frequency / sum(Frequency))][order(Gain, decreasing = T)]
  }

  linearDump <- function(feature_names, text){
    which(text == "weight:") %>% {a = . + 1; text[a:length(text)]} %>% as.numeric %>% data.table(Feature = feature_names, Weight = .)
  }

  if(text[2] == "bias:"){
    result <- readLines(filename_dump) %>% linearDump(feature_names, .)
  model.text.dump <- xgb.dump(model = model, with.stats = T)

  if(model.text.dump[2] == "bias:"){
    result <- model.text.dump %>% linearDump(feature_names, .)
    if(!is.null(data) | !is.null(label)) warning("data/label: these parameters should only be provided with decision tree based models.")
  } else {
    result <- treeDump(feature_names, text = text, keepDetail = !is.null(data))
    result <- treeDump(feature_names, text = model.text.dump, keepDetail = !is.null(data))

    # Co-occurrence computation
    if(!is.null(data) & !is.null(label) & nrow(result) > 0) {
@@ -116,17 +110,7 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N
  result
}

treeDump <- function(feature_names, text, keepDetail){
  if(keepDetail) groupBy <- c("Feature", "Split", "MissingNo") else groupBy <- "Feature"

  result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[, "MissingNo" := Missing == No][Feature != "Leaf", .(Gain = sum(Quality), Cover = sum(Cover), Frequency = .N), by = groupBy, with = T][, `:=`(Gain = Gain / sum(Gain), Cover = Cover / sum(Cover), Frequency = Frequency / sum(Frequency))][order(Gain, decreasing = T)]

  result
}

linearDump <- function(feature_names, text){
  which(text == "weight:") %>% {a = . + 1; text[a:length(text)]} %>% as.numeric %>% data.table(Feature = feature_names, Weight = .)
}

# Avoid error messages during CRAN check.
# The reason is that these variables are never declared

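A usage sketch of the polished API (mirroring the updated tests at the end of this diff; bst is the booster from the examples above): filename_dump is gone, and feature_names may be omitted when the dump already carries the names.

importance <- xgb.importance(feature_names = agaricus.train$data@Dimnames[[2]], model = bst)
importance.no.names <- xgb.importance(model = bst)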
@@ -14,7 +14,6 @@
#' @importFrom stringr str_split
#' @importFrom stringr str_trim
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file.
#' @param text dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
@@ -51,40 +50,29 @@
#'                eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
#'
#' #agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], model = bst)
#' xgb.model.dt.tree(feature_names = agaricus.train$data@@Dimnames[[2]], model = bst)
#'
#' @export
xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, text = NULL, n_first_tree = NULL){
xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, n_first_tree = NULL){

  if (!class(feature_names) %in% c("character", "NULL")) {
    stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature names. Look at this function documentation to see where to get feature names.")
  }
  if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
    stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
  } else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
    stop("filename_dump: path to the model doesn't exist.")
  } else if(is.null(filename_dump) && is.null(model) && is.null(text)){
    stop("filename_dump & model & text: no path to dump model, no model, no text dump, have been provided.")
  }

  if (!class(model) %in% c("xgb.Booster", "NULL")) {
    stop("model: Has to be an object of class xgb.Booster, as generated by the xgb.train function.")
  }

  if (!class(text) %in% c("character", "NULL")) {
    stop("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.")
  if (class(model) != "xgb.Booster" & class(text) != "character") {
    "model: Has to be an object of class xgb.Booster, as generated by the xgb.train function.\n" %>%
      paste0("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.") %>%
      stop()
  }

  if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) {
    stop("n_first_tree: Has to be a numeric vector of size 1.")
  }

  if(!is.null(model)){
  if(is.null(text)){
    text <- xgb.dump(model = model, with.stats = T)
  } else if(!is.null(filename_dump)){
    text <- readLines(filename_dump) %>% str_trim(side = "both")
  }


  position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text) + 1)

  extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist

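A short sketch of the text argument documented above (assuming bst trained as in the examples): the in-memory dump can be passed directly, avoiding any file on disk.

dump.text <- xgb.dump(bst, with.stats = TRUE)
dt <- xgb.model.dt.tree(feature_names = agaricus.train$data@Dimnames[[2]], text = dump.text)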
@@ -69,7 +69,6 @@ get.paths.to.leaf <- function(dt.tree) {
#' @importFrom data.table setnames
#' @importFrom data.table :=
#' @importFrom magrittr %>%
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file.
#'
#' @return Two graphs showing the distribution of the model deepness.
@@ -77,7 +76,7 @@ get.paths.to.leaf <- function(dt.tree) {
#' @details
#' Display both the number of \code{leaf} and the distribution of \code{weighted observations}
#' by tree deepness level.
#' The purpose of this function is to help the user to find the best trad-off to set
#' The purpose of this function is to help the user to find the best trade-off to set
#' the \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.
#'
#' See \link{xgb.train} for more information about these parameters.
@@ -89,7 +88,7 @@ get.paths.to.leaf <- function(dt.tree) {
#' \item Weighted cover: normalized weighted cover per Leaf (weighted number of instances).
#' }
#'
#' This function is very inspired from this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
#' This function is inspired by this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
#'
#' @examples
#' data(agaricus.train, package='xgboost')
@@ -101,7 +100,7 @@ get.paths.to.leaf <- function(dt.tree) {
#' xgb.plot.deepness(model = bst)
#'
#' @export
xgb.plot.deepness <- function(filename_dump = NULL, model = NULL) {
xgb.plot.deepness <- function(model = NULL) {
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    stop("ggplot2 package is required for plotting the graph deepness.",
         call. = FALSE)
@@ -117,23 +116,11 @@ xgb.plot.deepness <- function(filename_dump = NULL, model = NULL) {
         call. = FALSE)
  }

  if (!class(model) %in% c("xgb.Booster", "NULL")) {
  if (class(model) != "xgb.Booster") {
    stop("model: Has to be an object of class xgb.Booster, as generated by the xgb.train function.")
  }

  if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
    stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
  } else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
    stop("filename_dump: path to the model doesn't exist.")
  } else if(is.null(filename_dump) && is.null(model) && is.null(text)){
    stop("filename_dump & model & text: no path to dump model, no model, no text dump, have been provided.")
  }

  if(!is.null(model)){
    dt.tree <- xgb.model.dt.tree(model = model)
  } else if(!is.null(filename_dump)){
    dt.tree <- xgb.model.dt.tree(filename_dump = filename_dump)
  }
  dt.tree <- xgb.model.dt.tree(model = model)

  dt.edge.elements <- data.table()
  paths <- get.paths.to.leaf(dt.tree)

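An illustrative sketch (not from this commit) of the trade-off described in the details above: train boosters with contrasting max.depth / min_child_weight settings and compare their deepness plots. bst.deep and bst.shallow are hypothetical names.

bst.deep <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
                    max.depth = 9, eta = 1, nthread = 2, nround = 10,
                    objective = "binary:logistic")
bst.shallow <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
                       max.depth = 2, min_child_weight = 50, eta = 1, nthread = 2,
                       nround = 10, objective = "binary:logistic")
xgb.plot.deepness(model = bst.deep)
xgb.plot.deepness(model = bst.shallow)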
@@ -19,13 +19,12 @@
#' #Both datasets are lists with two items, a sparse matrix and labels
#' #(labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
#'                eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
#'
#' #train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' importance_matrix <- xgb.importance(train$data@@Dimnames[[2]], model = bst)
#' #agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' importance_matrix <- xgb.importance(agaricus.train$data@@Dimnames[[2]], model = bst)
#' xgb.plot.importance(importance_matrix)
#'
#' @export

@@ -10,9 +10,9 @@
#' @importFrom stringr str_detect
#' @importFrom stringr str_extract
#'
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file.
#' @param features.keep number of features to keep in each position of the multi tree.
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model dump already contains feature names, this argument should be \code{NULL}.
#' @param features.keep number of features to keep in each position of the multi trees.
#' @param plot.width width in pixels of the graph to produce
#' @param plot.height height in pixels of the graph to produce
#'
@@ -20,21 +20,23 @@
#'
#' @details
#'
#' This function tries to capture the complexity of gradient boosted tree ensembles
#' This function tries to capture the complexity of a gradient boosted tree ensemble
#' in a cohesive way.
#'
#' The goal is to improve the interpretability of a model generally seen as a black box.
#' The function is dedicated to boosting applied to decision trees only.
#'
#' The purpose is to move from an ensemble of trees to a single tree only.
#'
#' It takes advantage of the fact that the shape of a binary tree is only defined by
#' its deepness.
#' Therefore in a boosting model, all trees have the same shape.
#' its deepness (therefore in a boosting model, all trees have the same shape).
#'
#' Moreover, the trees tend to reuse the same features.
#'
#' The function will project each trees on one, and keep for each position the
#' \code{features.keep} first features (based on Gain per feature).
#' The function will project each tree on one, and keep for each position the
#' \code{features.keep} first features (based on the Gain per feature measure).
#'
#' This function is inspired from this blog post:
#' This function is inspired by this blog post:
#' \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/}
#'
#' @examples
@@ -44,12 +46,12 @@
#'                eta = 1, nthread = 2, nround = 30, objective = "binary:logistic",
#'                min_child_weight = 50)
#'
#' p <- xgb.plot.multi.trees(bst, agaricus.train$data@@Dimnames[[2]], 3)
#' p <- xgb.plot.multi.trees(model = bst, feature_names = agaricus.train$data@@Dimnames[[2]], features.keep = 3)
#' print(p)
#'
#' @export
xgb.plot.multi.trees <- function(model, names, features.keep = 5, plot.width = NULL, plot.height = NULL){
  tree.matrix <- xgb.model.dt.tree(names, model = model)
xgb.plot.multi.trees <- function(model, feature_names = NULL, features.keep = 5, plot.width = NULL, plot.height = NULL){
  tree.matrix <- xgb.model.dt.tree(feature_names = feature_names, model = model)

  # first number of the path represents the tree, then the following numbers are related to the path to follow
  # root init

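A usage sketch of the sizing parameters documented above (assuming bst as in the examples; the pixel values are arbitrary):

p <- xgb.plot.multi.trees(model = bst, feature_names = agaricus.train$data@Dimnames[[2]],
                          features.keep = 3, plot.width = 800, plot.height = 600)
print(p)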
@@ -7,7 +7,6 @@
#' @importFrom data.table :=
#' @importFrom magrittr %>%
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).
#' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file.
#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
#' @param plot.width the width of the diagram in pixels.
@@ -34,31 +33,26 @@
#' #Both datasets are lists with two items, a sparse matrix and labels
#' #(labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
#'                eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
#'
#' #agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], model = bst)
#' xgb.plot.tree(feature_names = agaricus.train$data@@Dimnames[[2]], model = bst)
#'
#' @export
xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, n_first_tree = NULL, plot.width = NULL, plot.height = NULL){
xgb.plot.tree <- function(feature_names = NULL, model = NULL, n_first_tree = NULL, plot.width = NULL, plot.height = NULL){

  if (!class(model) %in% c("xgb.Booster", "NULL")) {
  if (class(model) != "xgb.Booster") {
    stop("model: Has to be an object of class xgb.Booster, as generated by the xgb.train function.")
  }

  if (!requireNamespace("DiagrammeR", quietly = TRUE)) {
    stop("DiagrammeR package is required for xgb.plot.tree", call. = FALSE)
  }

  if(is.null(model)){
    allTrees <- xgb.model.dt.tree(feature_names = feature_names, filename_dump = filename_dump, n_first_tree = n_first_tree)
  } else {
    allTrees <- xgb.model.dt.tree(feature_names = feature_names, model = model, n_first_tree = n_first_tree)
  }


  allTrees <- xgb.model.dt.tree(feature_names = feature_names, model = model, n_first_tree = n_first_tree)

  allTrees[, label:= paste0(Feature, "\nCover: ", Cover, "\nGain: ", Quality)]
  allTrees[, shape:= "rectangle"][Feature == "Leaf", shape:= "oval"]
  allTrees[, filledcolor:= "Beige"][Feature == "Leaf", filledcolor:= "Khaki"]

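A usage sketch of n_first_tree (assuming bst as in the examples): rendering only the first tree keeps the diagram readable for large ensembles.

xgb.plot.tree(feature_names = agaricus.train$data@Dimnames[[2]], model = bst, n_first_tree = 1)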
@@ -4,14 +4,12 @@
\alias{xgb.importance}
\title{Show importance of features in a model}
\usage{
xgb.importance(feature_names = NULL, filename_dump = NULL, model = NULL,
  data = NULL, label = NULL, target = function(x) ((x + label) == 2))
xgb.importance(feature_names = NULL, model = NULL, data = NULL,
  label = NULL, target = function(x) ((x + label) == 2))
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model dump already contains feature names, this argument should be \code{NULL}.}

\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).}

\item{model}{generated by the \code{xgb.train} function. Avoid the creation of a dump file.}

\item{data}{the dataset used for the training step. Will be used with the \code{label} parameter for co-occurrence computation. More information in the \code{Detail} part. This parameter is optional.}
@@ -56,16 +54,15 @@ data(agaricus.train, package='xgboost')
# Both datasets are lists with two items, a sparse matrix and labels
# (labels = outcome column which will be learned).
# Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train

bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
               eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")

# train$data@Dimnames[[2]] represents the column names of the sparse matrix.
xgb.importance(train$data@Dimnames[[2]], model = bst)
# agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
xgb.importance(agaricus.train$data@Dimnames[[2]], model = bst)

# Same thing with co-occurrence computation this time
xgb.importance(train$data@Dimnames[[2]], model = bst, data = train$data, label = train$label)
xgb.importance(agaricus.train$data@Dimnames[[2]], model = bst, data = agaricus.train$data, label = agaricus.train$label)

}


@@ -4,14 +4,12 @@
\alias{xgb.model.dt.tree}
\title{Convert tree model dump to data.table}
\usage{
xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL,
  model = NULL, text = NULL, n_first_tree = NULL)
xgb.model.dt.tree(feature_names = NULL, model = NULL, text = NULL,
  n_first_tree = NULL)
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model dump already contains feature names, this argument should be \code{NULL}.}

\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}

\item{model}{generated by the \code{xgb.train} function. Avoid the creation of a dump file.}

\item{text}{dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
@@ -54,7 +52,7 @@ bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
               eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")

#agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst)
xgb.model.dt.tree(feature_names = agaricus.train$data@Dimnames[[2]], model = bst)

}


@@ -4,11 +4,9 @@
\alias{xgb.plot.deepness}
\title{Plot model trees deepness}
\usage{
xgb.plot.deepness(filename_dump = NULL, model = NULL)
xgb.plot.deepness(model = NULL)
}
\arguments{
\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}

\item{model}{generated by the \code{xgb.train} function. Avoid the creation of a dump file.}
}
\value{
@@ -20,7 +18,7 @@ Generate a graph to plot the distribution of deepness among trees.
\details{
Display both the number of \code{leaf} and the distribution of \code{weighted observations}
by tree deepness level.
The purpose of this function is to help the user to find the best trad-off to set
The purpose of this function is to help the user to find the best trade-off to set
the \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.

See \link{xgb.train} for more information about these parameters.
@@ -32,7 +30,7 @@ The graph is made of two parts:
\item Weighted cover: normalized weighted cover per Leaf (weighted number of instances).
}

This function is very inspired from this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
This function is inspired by this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
}
\examples{
data(agaricus.train, package='xgboost')

@@ -28,13 +28,12 @@ data(agaricus.train, package='xgboost')
#Both datasets are lists with two items, a sparse matrix and labels
#(labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train

bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
               eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")

#train$data@Dimnames[[2]] represents the column names of the sparse matrix.
importance_matrix <- xgb.importance(train$data@Dimnames[[2]], model = bst)
#agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
importance_matrix <- xgb.importance(agaricus.train$data@Dimnames[[2]], model = bst)
xgb.plot.importance(importance_matrix)

}

@@ -4,19 +4,19 @@
\alias{xgb.plot.multi.trees}
\title{Project all trees on one tree and plot it}
\usage{
xgb.plot.multi.trees(model, names, features.keep = 5, plot.width = NULL,
  plot.height = NULL)
xgb.plot.multi.trees(model, feature_names = NULL, features.keep = 5,
  plot.width = NULL, plot.height = NULL)
}
\arguments{
\item{model}{generated by the \code{xgb.train} function. Avoid the creation of a dump file.}

\item{features.keep}{number of features to keep in each position of the multi tree.}
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model dump already contains feature names, this argument should be \code{NULL}.}

\item{features.keep}{number of features to keep in each position of the multi trees.}

\item{plot.width}{width in pixels of the graph to produce}

\item{plot.height}{height in pixels of the graph to produce}

\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
}
\value{
Two graphs showing the distribution of the model deepness.
@@ -25,21 +25,23 @@ Two graphs showing the distribution of the model deepness.
Visualization of the ensemble of trees as a single collective unit.
}
\details{
This function tries to capture the complexity of gradient boosted tree ensembles
This function tries to capture the complexity of a gradient boosted tree ensemble
in a cohesive way.

The goal is to improve the interpretability of a model generally seen as a black box.
The function is dedicated to boosting applied to decision trees only.

The purpose is to move from an ensemble of trees to a single tree only.

It takes advantage of the fact that the shape of a binary tree is only defined by
its deepness.
Therefore in a boosting model, all trees have the same shape.
its deepness (therefore in a boosting model, all trees have the same shape).

Moreover, the trees tend to reuse the same features.

The function will project each trees on one, and keep for each position the
\code{features.keep} first features (based on Gain per feature).
The function will project each tree on one, and keep for each position the
\code{features.keep} first features (based on the Gain per feature measure).

This function is inspired from this blog post:
This function is inspired by this blog post:
\url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/}
}
\examples{
@@ -49,7 +51,7 @@ bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.dep
               eta = 1, nthread = 2, nround = 30, objective = "binary:logistic",
               min_child_weight = 50)

p <- xgb.plot.multi.trees(bst, agaricus.train$data@Dimnames[[2]], 3)
p <- xgb.plot.multi.trees(model = bst, feature_names = agaricus.train$data@Dimnames[[2]], features.keep = 3)
print(p)

}

@@ -4,14 +4,12 @@
\alias{xgb.plot.tree}
\title{Plot a boosted tree model}
\usage{
xgb.plot.tree(feature_names = NULL, filename_dump = NULL, model = NULL,
  n_first_tree = NULL, plot.width = NULL, plot.height = NULL)
xgb.plot.tree(feature_names = NULL, model = NULL, n_first_tree = NULL,
  plot.width = NULL, plot.height = NULL)
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model dump already contains feature names, this argument should be \code{NULL}.}

\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).}

\item{model}{generated by the \code{xgb.train} function. Avoid the creation of a dump file.}

\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}
@@ -45,13 +43,12 @@ data(agaricus.train, package='xgboost')
#Both datasets are lists with two items, a sparse matrix and labels
#(labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train

bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
               eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")

#agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst)
xgb.plot.tree(feature_names = agaricus.train$data@Dimnames[[2]], model = bst)

}


@@ -5,7 +5,7 @@ require(data.table)
require(Matrix)
require(vcd)

set.seed(1994)
set.seed(1982)
data(Arthritis)
data(agaricus.train, package='xgboost')
df <- data.table(Arthritis, keep.rownames = F)
@@ -17,17 +17,38 @@ output_vector <- df[,Y := 0][Improved == "Marked",Y := 1][,Y]
bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
               eta = 1, nthread = 2, nround = 10, objective = "binary:logistic")

feature.names <- agaricus.train$data@Dimnames[[2]]

test_that("xgb.dump works", {
  capture.output(print(xgb.dump(bst)))
  expect_true(xgb.dump(bst, 'xgb.model.dump', with.stats = T))
})

test_that("xgb.importance works", {
  expect_true(xgb.dump(bst, 'xgb.model.dump', with.stats = T))
  importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump')
test_that("xgb.model.dt.tree works with and without feature names", {
  names.dt.trees <- c("ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover",
                      "Tree", "Yes.Feature", "Yes.Cover", "Yes.Quality", "No.Feature", "No.Cover", "No.Quality")
  dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst)
  expect_equal(names.dt.trees, names(dt.tree))
  expect_equal(dim(dt.tree), c(162, 15))
  xgb.model.dt.tree(model = bst)
})

test_that("xgb.importance works with and without feature names", {
  importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst)
  expect_equal(dim(importance), c(7, 4))
  expect_equal(colnames(importance), c("Feature", "Gain", "Cover", "Frequency"))
  xgb.importance(model = bst)
})

test_that("xgb.plot.tree works", {
  xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst)
})
test_that("xgb.plot.tree works with and without feature names", {
  xgb.plot.tree(feature_names = feature.names, model = bst)
  xgb.plot.tree(model = bst)
})

test_that("xgb.plot.multi.trees works with and without feature names", {
  xgb.plot.multi.trees(model = bst, feature_names = feature.names, features.keep = 3)
  xgb.plot.multi.trees(model = bst, features.keep = 3)
})
test_that("xgb.plot.deepness works", {
  xgb.plot.deepness(model = bst)
})

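A possible extra regression test (an assumption, not part of this commit): with filename_dump removed from the signature, passing it should now fail as an unused argument.

test_that("filename_dump is no longer accepted by xgb.importance", {
  expect_error(xgb.importance(feature_names = feature.names, filename_dump = 'xgb.model.dump'))
})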