From 60dd75745f1410ad16658c662c8a5077e0efee4e Mon Sep 17 00:00:00 2001
From: pommedeterresautee
Date: Mon, 23 Nov 2015 18:19:59 +0100
Subject: [PATCH 01/25] Implement #431 PR

---
 R-package/demo/basic_walkthrough.R | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/R-package/demo/basic_walkthrough.R b/R-package/demo/basic_walkthrough.R
index 532c5d873..0b1e5b817 100644
--- a/R-package/demo/basic_walkthrough.R
+++ b/R-package/demo/basic_walkthrough.R
@@ -14,28 +14,28 @@ class(train$data)
 # this is the basic usage of xgboost you can put matrix in data field
 # note: we are putting in sparse matrix here, xgboost naturally handles sparse input
 # use sparse matrix when your feature is sparse(e.g. when you are using one-hot encoding vector)
-print("training xgboost with sparseMatrix")
+print("Training xgboost with sparseMatrix")
 bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2,
                nthread = 2, objective = "binary:logistic")
 
 # alternatively, you can put in dense matrix, i.e. basic R-matrix
-print("training xgboost with Matrix")
+print("Training xgboost with Matrix")
 bst <- xgboost(data = as.matrix(train$data), label = train$label, max.depth = 2, eta = 1, nround = 2,
                nthread = 2, objective = "binary:logistic")
 
 # you can also put in xgb.DMatrix object, which stores label, data and other meta datas needed for advanced features
-print("training xgboost with xgb.DMatrix")
+print("Training xgboost with xgb.DMatrix")
 dtrain <- xgb.DMatrix(data = train$data, label = train$label)
 bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2, nthread = 2,
                objective = "binary:logistic")
 
 # Verbose = 0,1,2
-print ('train xgboost with verbose 0, no message')
+print("Train xgboost with verbose 0, no message")
 bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
                nthread = 2, objective = "binary:logistic", verbose = 0)
-print ('train xgboost with verbose 1, print evaluation metric')
+print("Train xgboost with verbose 1, print evaluation metric")
 bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
                nthread = 2, objective = "binary:logistic", verbose = 1)
-print ('train xgboost with verbose 2, also print information about tree')
+print("Train xgboost with verbose 2, also print information about tree")
 bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
                nthread = 2, objective = "binary:logistic", verbose = 2)
 
@@ -76,11 +76,11 @@ dtest <- xgb.DMatrix(data = test$data, label=test$label)
 watchlist <- list(train=dtrain, test=dtest)
 # to train with watchlist, use xgb.train, which contains more advanced features
 # watchlist allows us to monitor the evaluation result on all data in the list
-print ('train xgboost using xgb.train with watchlist')
+print("Train xgboost using xgb.train with watchlist")
 bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nround=2, watchlist=watchlist,
                  nthread = 2, objective = "binary:logistic")
 # we can change evaluation metrics, or use multiple evaluation metrics
-print ('train xgboost using xgb.train with watchlist, watch logloss and error')
+print("Train xgboost using xgb.train with watchlist, watch logloss and error")
 bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nround=2, watchlist=watchlist,
                  eval.metric = "error", eval.metric = "logloss",
                  nthread = 2, objective = "binary:logistic")
 
@@ -102,4 +102,9 @@ xgb.dump(bst, "dump.raw.txt", with.stats = T)
 # Finally, you can check which features are the most important.
 print("Most important features (look at column Gain):")
-print(xgb.importance(feature_names = train$data@Dimnames[[2]], filename_dump = "dump.raw.txt"))
+imp_matrix <- xgb.importance(feature_names = train$data@Dimnames[[2]], filename_dump = "dump.raw.txt")
+print(imp_matrix)
+
+# Feature importance bar plot by gain
+print("Feature importance plot:")
+print(xgb.plot.importance(imp_matrix))
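For reference, a minimal standalone sketch of the importance workflow this patch adds to the demo; it assumes the booster bst and the "dump.raw.txt" file produced by the demo code above:

    imp_matrix <- xgb.importance(feature_names = train$data@Dimnames[[2]],
                                 filename_dump = "dump.raw.txt")
    head(imp_matrix)                        # a data.table with one row per feature (Gain, Cover, ...)
    print(xgb.plot.importance(imp_matrix))  # horizontal bars, length = Gain, clustered by importance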
print("Most important features (look at column Gain):") -print(xgb.importance(feature_names = train$data@Dimnames[[2]], filename_dump = "dump.raw.txt")) +imp_matrix <- xgb.importance(feature_names = train$data@Dimnames[[2]], filename_dump = "dump.raw.txt") +print(imp_matrix) + +# Feature importance bar plot by gain +print("Feature importance Plot : ") +print(xgb.plot.importance(imp_matrix)) From 485b30027f45d01c58a2c502ca39f72c2ccc34d3 Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Tue, 24 Nov 2015 11:45:32 +0100 Subject: [PATCH 02/25] Plot model deepness New function to explore the model by ploting the way splits are done. --- R-package/DESCRIPTION | 2 +- R-package/NAMESPACE | 1 + R-package/R/xgb.plot.deepness.R | 172 +++++++++++++++++++++++++++ R-package/R/xgb.plot.importance.R | 81 ++++++++----- R-package/R/xgb.train.R | 2 +- R-package/demo/xgb.plot.multi.tree.R | 64 ++++++++++ R-package/man/edge.parser.Rd | 15 +++ R-package/man/get.paths.to.leaf.Rd | 15 +++ R-package/man/multiplot.Rd | 15 +++ R-package/man/xgb.plot.deepness.Rd | 47 ++++++++ R-package/man/xgb.plot.importance.Rd | 6 +- R-package/man/xgb.train.Rd | 2 +- 12 files changed, 383 insertions(+), 39 deletions(-) create mode 100644 R-package/R/xgb.plot.deepness.R create mode 100644 R-package/demo/xgb.plot.multi.tree.R create mode 100644 R-package/man/edge.parser.Rd create mode 100644 R-package/man/get.paths.to.leaf.Rd create mode 100644 R-package/man/multiplot.Rd create mode 100644 R-package/man/xgb.plot.deepness.Rd diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index b4201e793..f36e34274 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -33,4 +33,4 @@ Imports: data.table (>= 1.9.6), magrittr (>= 1.5), stringr (>= 0.6.2) -RoxygenNote: 5.0.0 +RoxygenNote: 5.0.1 diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 3fb05b7d8..7f6fa5817 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -10,6 +10,7 @@ export(xgb.dump) export(xgb.importance) export(xgb.load) export(xgb.model.dt.tree) +export(xgb.plot.deepness) export(xgb.plot.importance) export(xgb.plot.tree) export(xgb.save) diff --git a/R-package/R/xgb.plot.deepness.R b/R-package/R/xgb.plot.deepness.R new file mode 100644 index 000000000..30aea46b8 --- /dev/null +++ b/R-package/R/xgb.plot.deepness.R @@ -0,0 +1,172 @@ +#' Plot multiple graphs at the same time +#' +#' Plot multiple graph aligned by rows and columns. +#' +#' @importFrom data.table data.table +#' @param cols number of columns +#' @return NULL +multiplot <- function(..., cols = 1) { + plots <- list(...) + numPlots = length(plots) + + layout <- matrix(seq(1, cols * ceiling(numPlots / cols)), + ncol = cols, nrow = ceiling(numPlots / cols)) + + if (numPlots == 1) { + print(plots[[1]]) + } else { + grid::grid.newpage() + grid::pushViewport(grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout)))) + for (i in 1:numPlots) { + # Get the i,j matrix positions of the regions that contain this subplot + matchidx <- as.data.table(which(layout == i, arr.ind = TRUE)) + + print( + plots[[i]], vp = grid::viewport( + layout.pos.row = matchidx$row, + layout.pos.col = matchidx$col + ) + ) + } + } +} + +#' Parse the graph to extract vector of edges +#' @param element igraph object containing the path from the root to the leaf. 
+edge.parser <- function(element) { + edges.vector <- igraph::as_ids(element) + t <- tail(edges.vector, n = 1) + l <- length(edges.vector) + list(t,l) +} + +#' Extract path from root to leaf from data.table +#' @param dt.tree data.table containing the nodes and edges of the trees +get.paths.to.leaf <- function(dt.tree) { + dt.not.leaf.edges <- + dt.tree[Feature != "Leaf",.(ID, Yes, Tree)] %>% list(dt.tree[Feature != "Leaf",.(ID, No, Tree)]) %>% rbindlist(use.names = F) + + trees <- dt.tree[,unique(Tree)] + + paths <- list() + for (tree in trees) { + graph <- + igraph::graph_from_data_frame(dt.not.leaf.edges[Tree == tree]) + paths.tmp <- + igraph::shortest_paths(graph, from = paste0(tree, "-0"), to = dt.tree[Tree == tree & + Feature == "Leaf", c(ID)]) + paths <- c(paths, paths.tmp$vpath) + } + paths +} + +#' Plot model trees deepness +#' +#' Generate a graph to plot the distribution of deepness among trees. +#' +#' @importFrom data.table data.table +#' @importFrom data.table rbindlist +#' @importFrom data.table setnames +#' @importFrom data.table := +#' @importFrom magrittr %>% +#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). +#' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file. +#' +#' @return Two graphs showing the distribution of the model deepness. +#' +#' @details +#' Display both the number of \code{leaf} and the distribution of \code{weighted observations} +#' by tree deepness level. +#' The purpose of this function is to help the user to find the best trad-off to set +#' the \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off. +#' +#' See \link{xgb.train} for more information about these parameters. +#' +#' The graph is made of two parts: +#' +#' \itemize{ +#' \item Count: number of leaf per level of deepness; +#' \item Weighted cover: noramlized weighted cover per Leaf (weighted number of instances). +#' } +#' +#' This function is very inspired from this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html} +#' +#' @examples +#' data(agaricus.train, package='xgboost') +#' +#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15, +#' eta = 1, nthread = 2, nround = 30, objective = "binary:logistic", +#' min_child_weight = 50) +#' +#' xgb.plot.deepness(model = bst) +#' +#' @export +xgb.plot.deepness <- function(filename_dump = NULL, model = NULL) { + if (!requireNamespace("ggplot2", quietly = TRUE)) { + stop("ggplot2 package is required for plotting the graph deepness.", + call. = FALSE) + } + + if (!requireNamespace("igraph", quietly = TRUE)) { + stop("igraph package is required for plotting the graph deepness.", + call. = FALSE) + } + + if (!requireNamespace("grid", quietly = TRUE)) { + stop("grid package is required for plotting the graph deepness.", + call. 
= FALSE) + } + + if (!class(model) %in% c("xgb.Booster", "NULL")) { + stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.") + } + + if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) { + stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.") + } else if (!is.null(filename_dump) && !file.exists(filename_dump)) { + stop("filename_dump: path to the model doesn't exist.") + } else if(is.null(filename_dump) && is.null(model) && is.null(text)){ + stop("filename_dump & model & text: no path to dump model, no model, no text dump, have been provided.") + } + + if(!is.null(model)){ + dt.tree <- xgb.model.dt.tree(model = model) + } else if(!is.null(filename_dump)){ + dt.tree <- xgb.model.dt.tree(filename_dump = filename_dump) + } + + dt.edge.elements <- data.table() + paths <- get.paths.to.leaf(dt.tree) + + dt.edge.elements <- + lapply(paths, edge.parser) %>% rbindlist %>% setnames(c("last.edge", "size")) %>% + merge(dt.tree, by.x = "last.edge", by.y = "ID") %>% rbind(dt.edge.elements) + + dt.edge.summuize <- + dt.edge.elements[, .(.N, Cover = sum(Cover)), size][,Cover:= Cover / sum(Cover)] + + p1 <- + ggplot2::ggplot(dt.edge.summuize) + ggplot2::geom_line(ggplot2::aes(x = size, y = N, group = 1)) + + ggplot2::xlab("") + ggplot2::ylab("Count") + ggplot2::ggtitle("Model complexity") + + ggplot2::theme( + plot.title = ggplot2::element_text(lineheight = 0.9, face = "bold"), + panel.grid.major.y = ggplot2::element_blank(), + axis.ticks = ggplot2::element_blank(), + axis.text.x = ggplot2::element_blank() + ) + + p2 <- + ggplot2::ggplot(dt.edge.summuize) + ggplot2::geom_line(ggplot2::aes(x =size, y = Cover, group = 1)) + + ggplot2::xlab("From root to leaf path length") + ggplot2::ylab("Weighted cover") + + multiplot(p1,p2,cols = 1) +} + +# Avoid error messages during CRAN check. +# The reason is that these variables are never declared +# They are mainly column names inferred by Data.table... +globalVariables( + c( + "Feature", "Count", "ggplot", "aes", "geom_bar", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text", "ID", "Yes", "No", "Tree" + ) +) diff --git a/R-package/R/xgb.plot.importance.R b/R-package/R/xgb.plot.importance.R index 92399516d..ea3e17892 100644 --- a/R-package/R/xgb.plot.importance.R +++ b/R-package/R/xgb.plot.importance.R @@ -1,57 +1,72 @@ #' Plot feature importance bar graph -#' +#' #' Read a data.table containing feature importance details and plot it. -#' +#' #' @importFrom magrittr %>% #' @param importance_matrix a \code{data.table} returned by the \code{xgb.importance} function. #' @param numberOfClusters a \code{numeric} vector containing the min and the max range of the possible number of clusters of bars. #' #' @return A \code{ggplot2} bar graph representing each feature by a horizontal bar. Longer is the bar, more important is the feature. Features are classified by importance and clustered by importance. The group is represented through the color of the bar. #' -#' @details +#' @details #' The purpose of this function is to easily represent the importance of each feature of a model. #' The function return a ggplot graph, therefore each of its characteristic can be overriden (to customize it). -#' In particular you may want to override the title of the graph. To do so, add \code{+ ggtitle("A GRAPH NAME")} next to the value returned by this function. -#' +#' In particular you may want to override the title of the graph. 
To do so, add \code{+ ggtitle("A GRAPH NAME")} next to the value returned by this function. +#' #' @examples #' data(agaricus.train, package='xgboost') -#' -#' #Both dataset are list with two items, a sparse matrix and labels -#' #(labels = outcome column which will be learned). +#' +#' #Both dataset are list with two items, a sparse matrix and labels +#' #(labels = outcome column which will be learned). #' #Each column of the sparse Matrix is a feature in one hot encoding format. #' train <- agaricus.train -#' -#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +#' +#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, #' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") -#' +#' #' #train$data@@Dimnames[[2]] represents the column names of the sparse matrix. #' importance_matrix <- xgb.importance(train$data@@Dimnames[[2]], model = bst) #' xgb.plot.importance(importance_matrix) -#' +#' #' @export -xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1:10)){ - if (!"data.table" %in% class(importance_matrix)) { - stop("importance_matrix: Should be a data.table.") +xgb.plot.importance <- + function(importance_matrix = NULL, numberOfClusters = c(1:10)) { + if (!"data.table" %in% class(importance_matrix)) { + stop("importance_matrix: Should be a data.table.") + } + if (!requireNamespace("ggplot2", quietly = TRUE)) { + stop("ggplot2 package is required for plotting the importance", call. = FALSE) + } + if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) { + stop("Ckmeans.1d.dp package is required for plotting the importance", call. = FALSE) + } + + # To avoid issues in clustering when co-occurences are used + importance_matrix <- + importance_matrix[, .(Gain = sum(Gain)), by = Feature] + + clusters <- + suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters)) + importance_matrix[,"Cluster":= clusters$cluster %>% as.character] + + plot <- + ggplot2::ggplot( + importance_matrix, ggplot2::aes( + x = stats::reorder(Feature, Gain), y = Gain, width = 0.05 + ), environment = environment() + ) + ggplot2::geom_bar(ggplot2::aes(fill = Cluster), stat = "identity", position = + "identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme( + plot.title = ggplot2::element_text(lineheight = .9, face = "bold"), panel.grid.major.y = ggplot2::element_blank() + ) + + return(plot) } - if (!requireNamespace("ggplot2", quietly = TRUE)) { - stop("ggplot2 package is required for plotting the importance", call. = FALSE) - } - if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) { - stop("Ckmeans.1d.dp package is required for plotting the importance", call. 
= FALSE) - } - - # To avoid issues in clustering when co-occurences are used - importance_matrix <- importance_matrix[, .(Gain = sum(Gain)), by = Feature] - - clusters <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters)) - importance_matrix[,"Cluster" := clusters$cluster %>% as.character] - - plot <- ggplot2::ggplot(importance_matrix, ggplot2::aes(x=stats::reorder(Feature, Gain), y = Gain, width = 0.05), environment = environment()) + ggplot2::geom_bar(ggplot2::aes(fill=Cluster), stat="identity", position="identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme(plot.title = ggplot2::element_text(lineheight=.9, face="bold"), panel.grid.major.y = ggplot2::element_blank() ) - - return(plot) -} # Avoid error messages during CRAN check. # The reason is that these variables are never declared # They are mainly column names inferred by Data.table... -globalVariables(c("Feature", "Gain", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text")) +globalVariables( + c( + "Feature", "Gain", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text" + ) +) diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index ffc94e34f..768bed27b 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -19,7 +19,7 @@ #' \item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3 #' \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be. #' \item \code{max_depth} maximum depth of a tree. Default: 6 -#' \item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1 +#' \item \code{min_child_weight} minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1 #' \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1 #' \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1 #' \item \code{num_parallel_tree} Experimental parameter. 
number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1
diff --git a/R-package/demo/xgb.plot.multi.tree.R b/R-package/demo/xgb.plot.multi.tree.R
new file mode 100644
index 000000000..feb7e667e
--- /dev/null
+++ b/R-package/demo/xgb.plot.multi.tree.R
@@ -0,0 +1,64 @@
+library(stringr)
+library(data.table)
+library(xgboost)
+
+
+data(agaricus.train, package='xgboost')
+
+#Both dataset are list with two items, a sparse matrix and labels
+#(labels = outcome column which will be learned).
+#Each column of the sparse Matrix is a feature in one hot encoding format.
+train <- agaricus.train
+
+bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
+               eta = 1, nthread = 2, nround = 4, objective = "binary:logistic")
+
+#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
+tree.matrix <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst)
+
+
+# first number of the path represents the tree, then the following numbers are related to the path to follow
+
+# root init
+root.nodes <- tree.matrix[str_detect(ID, "\\d+-0"), ID]
+tree.matrix[ID == root.nodes, Abs.Position:=root.nodes]
+
+precedent.nodes <- root.nodes
+
+while(tree.matrix[,sum(is.na(Abs.Position))] > 0) {
+  yes.row.nodes <- tree.matrix[Abs.Position %in% precedent.nodes & !is.na(Yes)]
+  no.row.nodes <- tree.matrix[Abs.Position %in% precedent.nodes & !is.na(No)]
+  yes.nodes.abs.pos <- yes.row.nodes[, Abs.Position] %>% paste0("_0")
+  no.nodes.abs.pos <- no.row.nodes[, Abs.Position] %>% paste0("_1")
+
+  tree.matrix[ID == yes.row.nodes[, Yes], Abs.Position := yes.nodes.abs.pos]
+  tree.matrix[ID == no.row.nodes[, No], Abs.Position := no.nodes.abs.pos]
+  precedent.nodes <- c(yes.nodes.abs.pos, no.nodes.abs.pos)
+}
+
+tree.matrix[!is.na(Yes),Yes:= paste0(Abs.Position, "_0")]
+tree.matrix[!is.na(No),No:= paste0(Abs.Position, "_1")]
+tree.matrix[,ID:= Abs.Position]
+
+tree.matrix[,Abs.Position:=substr(Abs.Position, nchar(Tree)+2, nchar(Abs.Position))]
+keepN <- 3
+tree.matrix <- tree.matrix[,sum(Quality),by = .(Abs.Position, Feature)][order(-V1)][,.(paste0(Feature[1:min(length(Feature), keepN)], " (", V1[1:min(length(V1), keepN)], ")") %>% paste0(collapse = "\n")), by=Abs.Position]
+
+tree.matrix[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "<br/>Cover: ", Cover, "<br/>Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")]
+
+tree.matrix[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")]
+
+tree.matrix[, Yes:= Abs.Position %>% paste0("_0")][, No:= Abs.Position %>% paste0("_1")]
+
+CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px\nclassDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px"
+
+
+yes <- tree.matrix[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "")
+
+no <- tree.matrix[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "")
+
+path <- tree.matrix[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = "\n") %>% paste("graph LR", .,collapse = "", sep = "\n") %>% paste(CSSstyle, yes, no, sep = "\n")
+DiagrammeR::mermaid(path)
+
+# path <- "graph LR;0-0-0(spore-print-color=green)-->|>= 2.00001|0-0-0-1>Leaf"
+# setnames(tree.matrix, old = c("ID", "Yes", "No"), c("nodes", "edge_from", "edge_to"))
diff --git a/R-package/man/edge.parser.Rd b/R-package/man/edge.parser.Rd
new file mode 100644
index 000000000..25ee4a30a
--- /dev/null
+++ b/R-package/man/edge.parser.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.plot.deepness.R
+\name{edge.parser}
+\alias{edge.parser}
+\title{Parse the graph to extract a vector of edges}
+\usage{
+edge.parser(element)
+}
+\arguments{
+\item{element}{igraph object containing the path from the root to the leaf.}
+}
+\description{
+Parse the graph to extract a vector of edges
+}
+
diff --git a/R-package/man/get.paths.to.leaf.Rd b/R-package/man/get.paths.to.leaf.Rd
new file mode 100644
index 000000000..1fdcfd5d7
--- /dev/null
+++ b/R-package/man/get.paths.to.leaf.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.plot.deepness.R
+\name{get.paths.to.leaf}
+\alias{get.paths.to.leaf}
+\title{Extract path from root to leaf from data.table}
+\usage{
+get.paths.to.leaf(dt.tree)
+}
+\arguments{
+\item{dt.tree}{data.table containing the nodes and edges of the trees}
+}
+\description{
+Extract path from root to leaf from data.table
+}
+
diff --git a/R-package/man/multiplot.Rd b/R-package/man/multiplot.Rd
new file mode 100644
index 000000000..a2fef7d99
--- /dev/null
+++ b/R-package/man/multiplot.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.plot.deepness.R
+\name{multiplot}
+\alias{multiplot}
+\title{Plot multiple graphs at the same time}
+\usage{
+multiplot(..., cols = 1)
+}
+\arguments{
+\item{cols}{number of columns}
+}
+\description{
+Plot multiple graphs aligned by rows and columns.
+}
+
diff --git a/R-package/man/xgb.plot.deepness.Rd b/R-package/man/xgb.plot.deepness.Rd
new file mode 100644
index 000000000..e54d5141b
--- /dev/null
+++ b/R-package/man/xgb.plot.deepness.Rd
@@ -0,0 +1,47 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.plot.deepness.R
+\name{xgb.plot.deepness}
+\alias{xgb.plot.deepness}
+\title{Plot model trees deepness}
+\usage{
+xgb.plot.deepness(filename_dump = NULL, model = NULL)
+}
+\arguments{
+\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
+
+\item{model}{dump generated by the \code{xgb.train} function.
Avoid the creation of a dump file.} +} +\value{ +Two graphs showing the distribution of the model deepness. +} +\description{ +Generate a graph to plot the distribution of deepness among trees. +} +\details{ +Display both the number of \code{leaf} and the distribution of \code{weighted observations} +by tree deepness level. +The purpose of this function is to help the user to find the best trad-off to set +the \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off. + +See \link{xgb.train} for more information about these parameters. + +The graph is made of two parts: + +\itemize{ + \item Count: number of leaf per level of deepness; + \item Weighted cover: noramlized weighted cover per Leaf (weighted number of instances). +} + +This function is very inspired from this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html} +} +\examples{ +data(agaricus.train, package='xgboost') + +bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15, + eta = 1, nthread = 2, nround = 30, objective = "binary:logistic", + min_child_weight = 50) + +xgb.plot.deepness(model = bst) + +} + diff --git a/R-package/man/xgb.plot.importance.Rd b/R-package/man/xgb.plot.importance.Rd index de70624cb..4ade2cda3 100644 --- a/R-package/man/xgb.plot.importance.Rd +++ b/R-package/man/xgb.plot.importance.Rd @@ -25,12 +25,12 @@ In particular you may want to override the title of the graph. To do so, add \co \examples{ data(agaricus.train, package='xgboost') -#Both dataset are list with two items, a sparse matrix and labels -#(labels = outcome column which will be learned). +#Both dataset are list with two items, a sparse matrix and labels +#(labels = outcome column which will be learned). #Each column of the sparse Matrix is a feature in one hot encoding format. train <- agaricus.train -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") #train$data@Dimnames[[2]] represents the column names of the sparse matrix. diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 50bfb46d0..7f7ae4962 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -27,7 +27,7 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL, \item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3 \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be. \item \code{max_depth} maximum depth of a tree. Default: 6 - \item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. 
Default: 1 + \item \code{min_child_weight} minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1 \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1 \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1 \item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1 From 6e9017c47439d4aa6e855f5c6fe2ded93c077db9 Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Tue, 24 Nov 2015 13:12:35 +0100 Subject: [PATCH 03/25] fix for Travis --- R-package/DESCRIPTION | 5 ++- R-package/demo/xgb.plot.multi.tree.R | 64 ---------------------------- 2 files changed, 3 insertions(+), 66 deletions(-) delete mode 100644 R-package/demo/xgb.plot.multi.tree.R diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index f36e34274..6594954f3 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -20,11 +20,12 @@ BugReports: https://github.com/dmlc/xgboost/issues VignetteBuilder: knitr Suggests: knitr, - ggplot2 (>= 1.0.0), + ggplot2 (>= 1.0.1), DiagrammeR (>= 0.8.1), Ckmeans.1d.dp (>= 3.3.1), vcd (>= 1.3), - testthat + testthat, + igraph (>= 1.0.1) Depends: R (>= 2.10) Imports: diff --git a/R-package/demo/xgb.plot.multi.tree.R b/R-package/demo/xgb.plot.multi.tree.R deleted file mode 100644 index feb7e667e..000000000 --- a/R-package/demo/xgb.plot.multi.tree.R +++ /dev/null @@ -1,64 +0,0 @@ -library(stringr) -library(data.table) -library(xgboost) - - -data(agaricus.train, package='xgboost') - -#Both dataset are list with two items, a sparse matrix and labels -#(labels = outcome column which will be learned). -#Each column of the sparse Matrix is a feature in one hot encoding format. -train <- agaricus.train - -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, - eta = 1, nthread = 2, nround = 4, objective = "binary:logistic") - -#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix. 
-tree.matrix <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst)
-
-
-# first number of the path represents the tree, then the following numbers are related to the path to follow
-
-# root init
-root.nodes <- tree.matrix[str_detect(ID, "\\d+-0"), ID]
-tree.matrix[ID == root.nodes, Abs.Position:=root.nodes]
-
-precedent.nodes <- root.nodes
-
-while(tree.matrix[,sum(is.na(Abs.Position))] > 0) {
-  yes.row.nodes <- tree.matrix[Abs.Position %in% precedent.nodes & !is.na(Yes)]
-  no.row.nodes <- tree.matrix[Abs.Position %in% precedent.nodes & !is.na(No)]
-  yes.nodes.abs.pos <- yes.row.nodes[, Abs.Position] %>% paste0("_0")
-  no.nodes.abs.pos <- no.row.nodes[, Abs.Position] %>% paste0("_1")
-
-  tree.matrix[ID == yes.row.nodes[, Yes], Abs.Position := yes.nodes.abs.pos]
-  tree.matrix[ID == no.row.nodes[, No], Abs.Position := no.nodes.abs.pos]
-  precedent.nodes <- c(yes.nodes.abs.pos, no.nodes.abs.pos)
-}
-
-tree.matrix[!is.na(Yes),Yes:= paste0(Abs.Position, "_0")]
-tree.matrix[!is.na(No),No:= paste0(Abs.Position, "_1")]
-tree.matrix[,ID:= Abs.Position]
-
-tree.matrix[,Abs.Position:=substr(Abs.Position, nchar(Tree)+2, nchar(Abs.Position))]
-keepN <- 3
-tree.matrix <- tree.matrix[,sum(Quality),by = .(Abs.Position, Feature)][order(-V1)][,.(paste0(Feature[1:min(length(Feature), keepN)], " (", V1[1:min(length(V1), keepN)], ")") %>% paste0(collapse = "\n")), by=Abs.Position]
-
-tree.matrix[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "<br/>Cover: ", Cover, "<br/>Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")]
-
-tree.matrix[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")]
-
-tree.matrix[, Yes:= Abs.Position %>% paste0("_0")][, No:= Abs.Position %>% paste0("_1")]
-
-CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px\nclassDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px"
-
-
-yes <- tree.matrix[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "")
-
-no <- tree.matrix[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "")
-
-path <- tree.matrix[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = "\n") %>% paste("graph LR", .,collapse = "", sep = "\n") %>% paste(CSSstyle, yes, no, sep = "\n")
-DiagrammeR::mermaid(path)
-
-# path <- "graph LR;0-0-0(spore-print-color=green)-->|>= 2.00001|0-0-0-1>Leaf"
-# setnames(tree.matrix, old = c("ID", "Yes", "No"), c("nodes", "edge_from", "edge_to"))

From e43830955fcbbb086556e8cdf2e778f3101b0de5 Mon Sep 17 00:00:00 2001
From: pommedeterresautee
Date: Fri, 27 Nov 2015 14:48:54 +0100
Subject: [PATCH 04/25] parameter names change in R function

---
 R-package/R/xgb.plot.tree.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R
index 63bebf6cf..2976f1b07 100644
--- a/R-package/R/xgb.plot.tree.R
+++ b/R-package/R/xgb.plot.tree.R
@@ -10,8 +10,8 @@
 #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).
 #' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file.
 #' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
-#' @param width the width of the diagram in pixels.
-#' @param height the height of the diagram in pixels.
+#' @param plot.width the width of the diagram in pixels.
+#' @param plot.height the height of the diagram in pixels.
 #'
 #' @return A \code{DiagrammeR} of the model.
 #'
@@ -43,7 +43,7 @@
 #' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], model = bst)
 #'
 #' @export
-xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, n_first_tree = NULL, width = NULL, height = NULL){
+xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, n_first_tree = NULL, plot.width = NULL, plot.height = NULL){
 
   if (!class(model) %in% c("xgb.Booster", "NULL")) {
     stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
@@ -87,7 +87,7 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NU
         edges_df = edges,
         graph_attrs = "rankdir = LR")
 
-  DiagrammeR::render_graph(graph, width = width, height = height)
+  DiagrammeR::render_graph(graph, width = plot.width, height = plot.height)
 }
 
 # Avoid error messages during CRAN check.
From 5169d087353f23d90d5b3cf4439179ed6d03ff3e Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Fri, 27 Nov 2015 14:49:06 +0100 Subject: [PATCH 05/25] Add new multi.tree function to R package --- R-package/R/xgb.plot.multi.trees.R | 100 +++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 R-package/R/xgb.plot.multi.trees.R diff --git a/R-package/R/xgb.plot.multi.trees.R b/R-package/R/xgb.plot.multi.trees.R new file mode 100644 index 000000000..037b66e70 --- /dev/null +++ b/R-package/R/xgb.plot.multi.trees.R @@ -0,0 +1,100 @@ +library(stringr) +library(data.table) +library(xgboost) + +#' Project all trees on one and plot it +#' +#' Provide a way to display on one tree all trees of the model. +#' +#' @importFrom data.table data.table +#' @importFrom data.table rbindlist +#' @importFrom data.table setnames +#' @importFrom data.table := +#' @importFrom magrittr %>% +#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). +#' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file. +#' +#' @return Two graphs showing the distribution of the model deepness. +#' +#' @details +#' +#' This function tries to capture the complexity of gradient boosted tree ensembles in a cohesive way. +#' The goal is to improve the interpretability of the model generally seen as black box. +#' The function is dedicated to boosting applied to trees only. It won't work on GLM. +#' +#' The purpose is to move from an ensemble of trees to a single tree only. +#' It leverages the fact that the shape of a binary tree is only defined by its deepness. +#' The second fact which is leverage is that all trees in a boosting model tend to share the features they use. +#' +#' The function will project each trees on one tree, and keep the \code{keepN} first feature for each position. 
+#' This function is inspired from this blog post: +#' \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/} +#' +#' @examples +#' data(agaricus.train, package='xgboost') +#' +#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15, +#' eta = 1, nthread = 2, nround = 30, objective = "binary:logistic", +#' min_child_weight = 50) +#' +#' p <- xgb.plot.multi.trees(bst, agaricus.train$data@Dimnames[[2]], 3) +#' print(p) +#' +#' @export +xgb.plot.multi.trees <- function(model, names, keepN = 5, plot.width = NULL, plot.height = NULL){ + tree.matrix <- xgb.model.dt.tree(names, model = model) + + # first number of the path represents the tree, then the following numbers are related to the path to follow + # root init + root.nodes <- tree.matrix[str_detect(ID, "\\d+-0"), ID] + tree.matrix[ID %in% root.nodes, abs.node.position:=root.nodes] + + precedent.nodes <- root.nodes + + while(tree.matrix[,sum(is.na(abs.node.position))] > 0) { + yes.row.nodes <- tree.matrix[abs.node.position %in% precedent.nodes & !is.na(Yes)] + no.row.nodes <- tree.matrix[abs.node.position %in% precedent.nodes & !is.na(No)] + yes.nodes.abs.pos <- yes.row.nodes[, abs.node.position] %>% paste0("_0") + no.nodes.abs.pos <- no.row.nodes[, abs.node.position] %>% paste0("_1") + + tree.matrix[ID %in% yes.row.nodes[, Yes], abs.node.position := yes.nodes.abs.pos] + tree.matrix[ID %in% no.row.nodes[, No], abs.node.position := no.nodes.abs.pos] + precedent.nodes <- c(yes.nodes.abs.pos, no.nodes.abs.pos) + } + + tree.matrix[!is.na(Yes),Yes:= paste0(abs.node.position, "_0")] + tree.matrix[!is.na(No),No:= paste0(abs.node.position, "_1")] + + + + remove.tree <- . %>% str_replace(pattern = "^\\d+-", replacement = "") + + tree.matrix[,`:=`(abs.node.position=remove.tree(abs.node.position), Yes=remove.tree(Yes), No=remove.tree(No))] + + nodes.dt <- tree.matrix[,.(Quality = sum(Quality)),by = .(abs.node.position, Feature)][,.(Text =paste0(Feature[1:min(length(Feature), keepN)], " (", Quality[1:min(length(Quality), keepN)], ")") %>% paste0(collapse = "\n")), by=abs.node.position] + edges.dt <- tree.matrix[Feature != "Leaf",.(abs.node.position, Yes)] %>% list(tree.matrix[Feature != "Leaf",.(abs.node.position, No)]) %>% rbindlist() %>% setnames(c("From", "To")) %>% .[,.N,.(From, To)] %>% .[,N:=NULL] + + nodes <- DiagrammeR::create_nodes(nodes = nodes.dt[,abs.node.position], + label = nodes.dt[,Text], + style = "filled", + color = "DimGray", + fillcolor= "Blue", + shape = "oval", + #data = allTrees[,Feature] + fontname = "Helvetica" + ) + + edges <- DiagrammeR::create_edges(from = edges.dt[,From], + to = edges.dt[,To], + color = "DimGray", + arrowsize = "1.5", + arrowhead = "vee", + fontname = "Helvetica", + rel = "leading_to") + + graph <- DiagrammeR::create_graph(nodes_df = nodes, + edges_df = edges, + graph_attrs = "rankdir = LR") + + DiagrammeR::render_graph(graph, width = plot.width, height = plot.height) +} From 2fc9dcc54943baf543e2d4f57cf809df932700c8 Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Fri, 27 Nov 2015 17:34:26 +0100 Subject: [PATCH 06/25] Improve description wording --- R-package/R/xgb.plot.multi.trees.R | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/R-package/R/xgb.plot.multi.trees.R b/R-package/R/xgb.plot.multi.trees.R index 037b66e70..2f0fb1d3f 100644 --- a/R-package/R/xgb.plot.multi.trees.R +++ b/R-package/R/xgb.plot.multi.trees.R @@ -1,32 +1,33 @@ -library(stringr) -library(data.table) 
-library(xgboost) - -#' Project all trees on one and plot it -#' -#' Provide a way to display on one tree all trees of the model. +#' Project all trees on one tree and plot it +#' +#' visualization to view the ensemble of trees as a single collective unit. #' #' @importFrom data.table data.table #' @importFrom data.table rbindlist #' @importFrom data.table setnames #' @importFrom data.table := #' @importFrom magrittr %>% +#' #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). #' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file. -#' +#' @param features.keep number of features to keep in each position of the multi tree. +#' @param plot.width width in pixels of the graph to produce +#' @param plot.height height in pixels of the graph to produce +#' #' @return Two graphs showing the distribution of the model deepness. #' #' @details #' #' This function tries to capture the complexity of gradient boosted tree ensembles in a cohesive way. #' The goal is to improve the interpretability of the model generally seen as black box. -#' The function is dedicated to boosting applied to trees only. It won't work on GLM. +#' The function is dedicated to boosting applied to decision trees only. #' #' The purpose is to move from an ensemble of trees to a single tree only. -#' It leverages the fact that the shape of a binary tree is only defined by its deepness. -#' The second fact which is leverage is that all trees in a boosting model tend to share the features they use. +#' It takes advantage of the fact that the shape of a binary tree is only defined by its deepness. +#' Therefore in a boosting model, all trees have the same shape. +#' Moreover, the trees tend to reuse the same features. #' -#' The function will project each trees on one tree, and keep the \code{keepN} first feature for each position. +#' The function will project each trees on one tree, and keep the \code{features.keep} first feature for each position. 
#' This function is inspired from this blog post: #' \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/} #' @@ -41,7 +42,7 @@ library(xgboost) #' print(p) #' #' @export -xgb.plot.multi.trees <- function(model, names, keepN = 5, plot.width = NULL, plot.height = NULL){ +xgb.plot.multi.trees <- function(model, names, features.keep = 5, plot.width = NULL, plot.height = NULL){ tree.matrix <- xgb.model.dt.tree(names, model = model) # first number of the path represents the tree, then the following numbers are related to the path to follow @@ -71,7 +72,7 @@ xgb.plot.multi.trees <- function(model, names, keepN = 5, plot.width = NULL, plo tree.matrix[,`:=`(abs.node.position=remove.tree(abs.node.position), Yes=remove.tree(Yes), No=remove.tree(No))] - nodes.dt <- tree.matrix[,.(Quality = sum(Quality)),by = .(abs.node.position, Feature)][,.(Text =paste0(Feature[1:min(length(Feature), keepN)], " (", Quality[1:min(length(Quality), keepN)], ")") %>% paste0(collapse = "\n")), by=abs.node.position] + nodes.dt <- tree.matrix[,.(Quality = sum(Quality)),by = .(abs.node.position, Feature)][,.(Text =paste0(Feature[1:min(length(Feature), features.keep)], " (", Quality[1:min(length(Quality), features.keep)], ")") %>% paste0(collapse = "\n")), by=abs.node.position] edges.dt <- tree.matrix[Feature != "Leaf",.(abs.node.position, Yes)] %>% list(tree.matrix[Feature != "Leaf",.(abs.node.position, No)]) %>% rbindlist() %>% setnames(c("From", "To")) %>% .[,.N,.(From, To)] %>% .[,N:=NULL] nodes <- DiagrammeR::create_nodes(nodes = nodes.dt[,abs.node.position], From 92e904dec981c3ffe8fc67976e0e2d49b41f7021 Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Fri, 27 Nov 2015 17:58:50 +0100 Subject: [PATCH 07/25] add exclusion of global variables + generate Roxygen doc --- R-package/NAMESPACE | 1 + R-package/R/xgb.plot.multi.trees.R | 18 +++++++-- R-package/man/xgb.plot.multi.trees.Rd | 56 +++++++++++++++++++++++++++ R-package/man/xgb.plot.tree.Rd | 6 +-- 4 files changed, 74 insertions(+), 7 deletions(-) create mode 100644 R-package/man/xgb.plot.multi.trees.Rd diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 7f6fa5817..3a590f27a 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -12,6 +12,7 @@ export(xgb.load) export(xgb.model.dt.tree) export(xgb.plot.deepness) export(xgb.plot.importance) +export(xgb.plot.multi.trees) export(xgb.plot.tree) export(xgb.save) export(xgb.save.raw) diff --git a/R-package/R/xgb.plot.multi.trees.R b/R-package/R/xgb.plot.multi.trees.R index 2f0fb1d3f..d30d86332 100644 --- a/R-package/R/xgb.plot.multi.trees.R +++ b/R-package/R/xgb.plot.multi.trees.R @@ -1,6 +1,6 @@ #' Project all trees on one tree and plot it #' -#' visualization to view the ensemble of trees as a single collective unit. +#' Visualization of the ensemble of trees as a single collective unit. #' #' @importFrom data.table data.table #' @importFrom data.table rbindlist @@ -18,16 +18,20 @@ #' #' @details #' -#' This function tries to capture the complexity of gradient boosted tree ensembles in a cohesive way. +#' This function tries to capture the complexity of gradient boosted tree ensembles +#' in a cohesive way. #' The goal is to improve the interpretability of the model generally seen as black box. #' The function is dedicated to boosting applied to decision trees only. #' #' The purpose is to move from an ensemble of trees to a single tree only. -#' It takes advantage of the fact that the shape of a binary tree is only defined by its deepness. 
+#' It takes advantage of the fact that the shape of a binary tree is only defined by +#' its deepness. #' Therefore in a boosting model, all trees have the same shape. #' Moreover, the trees tend to reuse the same features. #' -#' The function will project each trees on one tree, and keep the \code{features.keep} first feature for each position. +#' The function will project each trees on one, and keep for each position the +#' \code{features.keep} first features (based on Gain per feature). +#' #' This function is inspired from this blog post: #' \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/} #' @@ -99,3 +103,9 @@ xgb.plot.multi.trees <- function(model, names, features.keep = 5, plot.width = N DiagrammeR::render_graph(graph, width = plot.width, height = plot.height) } + +globalVariables( + c( + "Feature", "no.nodes.abs.pos", "ID", "Yes", "No", "Tree", "yes.nodes.abs.pos", "abs.node.position" + ) +) \ No newline at end of file diff --git a/R-package/man/xgb.plot.multi.trees.Rd b/R-package/man/xgb.plot.multi.trees.Rd new file mode 100644 index 000000000..2bbe29ca5 --- /dev/null +++ b/R-package/man/xgb.plot.multi.trees.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.plot.multi.trees.R +\name{xgb.plot.multi.trees} +\alias{xgb.plot.multi.trees} +\title{Project all trees on one tree and plot it} +\usage{ +xgb.plot.multi.trees(model, names, features.keep = 5, plot.width = NULL, + plot.height = NULL) +} +\arguments{ +\item{model}{dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.} + +\item{features.keep}{number of features to keep in each position of the multi tree.} + +\item{plot.width}{width in pixels of the graph to produce} + +\item{plot.height}{height in pixels of the graph to produce} + +\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).} +} +\value{ +Two graphs showing the distribution of the model deepness. +} +\description{ +Visualization of the ensemble of trees as a single collective unit. +} +\details{ +This function tries to capture the complexity of gradient boosted tree ensembles +in a cohesive way. +The goal is to improve the interpretability of the model generally seen as black box. +The function is dedicated to boosting applied to decision trees only. + +The purpose is to move from an ensemble of trees to a single tree only. +It takes advantage of the fact that the shape of a binary tree is only defined by +its deepness. +Therefore in a boosting model, all trees have the same shape. +Moreover, the trees tend to reuse the same features. + +The function will project each trees on one, and keep for each position the +\code{features.keep} first features (based on Gain per feature). 
+ +This function is inspired from this blog post: +\url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/} +} +\examples{ +data(agaricus.train, package='xgboost') + +bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15, + eta = 1, nthread = 2, nround = 30, objective = "binary:logistic", + min_child_weight = 50) + +p <- xgb.plot.multi.trees(bst, agaricus.train$data@Dimnames[[2]], 3) +print(p) + +} + diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd index f34e75bf9..2008014cf 100644 --- a/R-package/man/xgb.plot.tree.Rd +++ b/R-package/man/xgb.plot.tree.Rd @@ -5,7 +5,7 @@ \title{Plot a boosted tree model} \usage{ xgb.plot.tree(feature_names = NULL, filename_dump = NULL, model = NULL, - n_first_tree = NULL, width = NULL, height = NULL) + n_first_tree = NULL, plot.width = NULL, plot.height = NULL) } \arguments{ \item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.} @@ -16,9 +16,9 @@ xgb.plot.tree(feature_names = NULL, filename_dump = NULL, model = NULL, \item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.} -\item{width}{the width of the diagram in pixels.} +\item{plot.width}{the width of the diagram in pixels.} -\item{height}{the height of the diagram in pixels.} +\item{plot.height}{the height of the diagram in pixels.} } \value{ A \code{DiagrammeR} of the model. From 28060d5595f599f1eaa1725c9e68337b3ee3242d Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Fri, 27 Nov 2015 18:19:51 +0100 Subject: [PATCH 08/25] Fix missing dependencies --- R-package/NAMESPACE | 1 + R-package/R/xgb.model.dt.tree.R | 1 - R-package/R/xgb.plot.multi.trees.R | 7 ++++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 3a590f27a..a9ae672a3 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -37,6 +37,7 @@ importFrom(data.table,setnames) importFrom(magrittr,"%>%") importFrom(magrittr,add) importFrom(magrittr,not) +importFrom(stringr,str_detect) importFrom(stringr,str_extract) importFrom(stringr,str_extract_all) importFrom(stringr,str_match) diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index 5833389e2..13d3ecc5b 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -12,7 +12,6 @@ #' @importFrom magrittr add #' @importFrom stringr str_extract #' @importFrom stringr str_split -#' @importFrom stringr str_extract #' @importFrom stringr str_trim #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). 
diff --git a/R-package/R/xgb.plot.multi.trees.R b/R-package/R/xgb.plot.multi.trees.R index d30d86332..f53d1a13f 100644 --- a/R-package/R/xgb.plot.multi.trees.R +++ b/R-package/R/xgb.plot.multi.trees.R @@ -7,6 +7,8 @@ #' @importFrom data.table setnames #' @importFrom data.table := #' @importFrom magrittr %>% +#' @importFrom stringr str_detect +#' @importFrom stringr str_extract #' #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). #' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file. @@ -83,9 +85,8 @@ xgb.plot.multi.trees <- function(model, names, features.keep = 5, plot.width = N label = nodes.dt[,Text], style = "filled", color = "DimGray", - fillcolor= "Blue", - shape = "oval", - #data = allTrees[,Feature] + fillcolor= "Beige", + shape = "oval", fontname = "Helvetica" ) From 07d62a4b89235c6972d1e3a3440428fe088a45aa Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Mon, 30 Nov 2015 10:22:14 +0100 Subject: [PATCH 09/25] Polishing API + wording in function description #Rstat --- R-package/R/xgb.importance.R | 18 +++++------------- R-package/R/xgb.model.dt.tree.R | 20 ++++---------------- R-package/R/xgb.plot.deepness.R | 23 +++++------------------ R-package/R/xgb.plot.multi.trees.R | 17 +++++++++-------- R-package/man/xgb.importance.Rd | 8 ++++---- R-package/man/xgb.model.dt.tree.Rd | 6 ++---- R-package/man/xgb.plot.deepness.Rd | 8 +++----- R-package/man/xgb.plot.multi.trees.Rd | 18 +++++++++--------- 8 files changed, 41 insertions(+), 77 deletions(-) diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index 029c3725b..54c94245c 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -66,16 +66,12 @@ #' xgb.importance(train$data@@Dimnames[[2]], model = bst, data = train$data, label = train$label) #' #' @export -xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ( (x + label) == 2)){ +xgb.importance <- function(feature_names = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ( (x + label) == 2)){ if (!class(feature_names) %in% c("character", "NULL")) { - stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.") + stop("feature_names: Has to be a vector of character or NULL if the model already contains feature name. Look at this function documentation to see where to get feature names.") } - if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) { - stop("filename_dump: Has to be a path to the model dump file.") - } - - if (!class(model) %in% c("xgb.Booster", "NULL")) { + if (class(model) != "xgb.Booster") { stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.") } @@ -87,12 +83,8 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N if(sum(label == 0) / length(label) > 0.5) label <- as(label, "sparseVector") } - if(is.null(model)){ - text <- readLines(filename_dump) - } else { - text <- xgb.dump(model = model, with.stats = T) - } - + text <- xgb.dump(model = model, with.stats = T) + if(text[2] == "bias:"){ result <- readLines(filename_dump) %>% linearDump(feature_names, .) 
if(!is.null(data) | !is.null(label)) warning("data/label: these parameters should only be provided with decision tree based models.") diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index 13d3ecc5b..a70c344cc 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -14,7 +14,6 @@ #' @importFrom stringr str_split #' @importFrom stringr str_trim #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. -#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). #' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file. #' @param text dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). #' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models. @@ -54,20 +53,13 @@ #' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], model = bst) #' #' @export -xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, text = NULL, n_first_tree = NULL){ +xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, n_first_tree = NULL){ if (!class(feature_names) %in% c("character", "NULL")) { stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.") } - if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) { - stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.") - } else if (!is.null(filename_dump) && !file.exists(filename_dump)) { - stop("filename_dump: path to the model doesn't exist.") - } else if(is.null(filename_dump) && is.null(model) && is.null(text)){ - stop("filename_dump & model & text: no path to dump model, no model, no text dump, have been provided.") - } - if (!class(model) %in% c("xgb.Booster", "NULL")) { + if (class(model) != "xgb.Booster") { stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.") } @@ -79,12 +71,8 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model stop("n_first_tree: Has to be a numeric vector of size 1.") } - if(!is.null(model)){ - text <- xgb.dump(model = model, with.stats = T) - } else if(!is.null(filename_dump)){ - text <- readLines(filename_dump) %>% str_trim(side = "both") - } - + text <- xgb.dump(model = model, with.stats = T) + position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text) + 1) extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist diff --git a/R-package/R/xgb.plot.deepness.R b/R-package/R/xgb.plot.deepness.R index 30aea46b8..b6c05f727 100644 --- a/R-package/R/xgb.plot.deepness.R +++ b/R-package/R/xgb.plot.deepness.R @@ -69,7 +69,6 @@ get.paths.to.leaf <- function(dt.tree) { #' @importFrom data.table setnames #' @importFrom data.table := #' @importFrom magrittr %>% -#' @param filename_dump the path to the text 
file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). #' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file. #' #' @return Two graphs showing the distribution of the model deepness. @@ -77,7 +76,7 @@ get.paths.to.leaf <- function(dt.tree) { #' @details #' Display both the number of \code{leaf} and the distribution of \code{weighted observations} #' by tree deepness level. -#' The purpose of this function is to help the user to find the best trad-off to set +#' The purpose of this function is to help the user to find the best trade-off to set #' the \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off. #' #' See \link{xgb.train} for more information about these parameters. @@ -89,7 +88,7 @@ get.paths.to.leaf <- function(dt.tree) { #' \item Weighted cover: noramlized weighted cover per Leaf (weighted number of instances). #' } #' -#' This function is very inspired from this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html} +#' This function is inspired by this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html} #' #' @examples #' data(agaricus.train, package='xgboost') @@ -101,7 +100,7 @@ get.paths.to.leaf <- function(dt.tree) { #' xgb.plot.deepness(model = bst) #' #' @export -xgb.plot.deepness <- function(filename_dump = NULL, model = NULL) { +xgb.plot.deepness <- function(model = NULL) { if (!requireNamespace("ggplot2", quietly = TRUE)) { stop("ggplot2 package is required for plotting the graph deepness.", call. = FALSE) @@ -117,23 +116,11 @@ xgb.plot.deepness <- function(filename_dump = NULL, model = NULL) { call. = FALSE) } - if (!class(model) %in% c("xgb.Booster", "NULL")) { + if (class(model) != "xgb.Booster") { stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.") } - if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) { - stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.") - } else if (!is.null(filename_dump) && !file.exists(filename_dump)) { - stop("filename_dump: path to the model doesn't exist.") - } else if(is.null(filename_dump) && is.null(model) && is.null(text)){ - stop("filename_dump & model & text: no path to dump model, no model, no text dump, have been provided.") - } - - if(!is.null(model)){ - dt.tree <- xgb.model.dt.tree(model = model) - } else if(!is.null(filename_dump)){ - dt.tree <- xgb.model.dt.tree(filename_dump = filename_dump) - } + dt.tree <- xgb.model.dt.tree(model = model) dt.edge.elements <- data.table() paths <- get.paths.to.leaf(dt.tree) diff --git a/R-package/R/xgb.plot.multi.trees.R b/R-package/R/xgb.plot.multi.trees.R index f53d1a13f..13416b480 100644 --- a/R-package/R/xgb.plot.multi.trees.R +++ b/R-package/R/xgb.plot.multi.trees.R @@ -10,9 +10,8 @@ #' @importFrom stringr str_detect #' @importFrom stringr str_extract #' -#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). #' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file. -#' @param features.keep number of features to keep in each position of the multi tree. 
+#' @param features.keep number of features to keep in each position of the multi trees. #' @param plot.width width in pixels of the graph to produce #' @param plot.height height in pixels of the graph to produce #' @@ -20,21 +19,23 @@ #' #' @details #' -#' This function tries to capture the complexity of gradient boosted tree ensembles +#' This function tries to capture the complexity of gradient boosted tree ensemble #' in a cohesive way. +#' #' The goal is to improve the interpretability of the model generally seen as black box. #' The function is dedicated to boosting applied to decision trees only. #' #' The purpose is to move from an ensemble of trees to a single tree only. +#' #' It takes advantage of the fact that the shape of a binary tree is only defined by -#' its deepness. -#' Therefore in a boosting model, all trees have the same shape. +#' its deepness (therefore in a boosting model, all trees have the same shape). +#' #' Moreover, the trees tend to reuse the same features. #' -#' The function will project each trees on one, and keep for each position the -#' \code{features.keep} first features (based on Gain per feature). +#' The function will project each tree on one, and keep for each position the +#' \code{features.keep} first features (based on Gain per feature measure). #' -#' This function is inspired from this blog post: +#' This function is inspired by this blog post: #' \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/} #' #' @examples diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd index a1ce89d4f..eac2da657 100644 --- a/R-package/man/xgb.importance.Rd +++ b/R-package/man/xgb.importance.Rd @@ -4,14 +4,12 @@ \alias{xgb.importance} \title{Show importance of features in a model} \usage{ -xgb.importance(feature_names = NULL, filename_dump = NULL, model = NULL, - data = NULL, label = NULL, target = function(x) ((x + label) == 2)) +xgb.importance(feature_names = NULL, model = NULL, data = NULL, + label = NULL, target = function(x) ((x + label) == 2)) } \arguments{ \item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.} -\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).} - \item{model}{generated by the \code{xgb.train} function. Avoid the creation of a dump file.} \item{data}{the dataset used for the training step. Will be used with \code{label} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.} @@ -19,6 +17,8 @@ xgb.importance(feature_names = NULL, filename_dump = NULL, model = NULL, \item{label}{the label vetor used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.} \item{target}{a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurences in a binary classification. The \code{target} function should have only one parameter. 
This parameter will be used to provide each important feature vector after having applied the split condition, therefore these vector will be only made of 0 and 1 only, whatever was the information before. More information in \code{Detail} part. This parameter is optional.} + +\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).} } \value{ A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index 9a3efc39f..8d88f60f5 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -4,14 +4,12 @@ \alias{xgb.model.dt.tree} \title{Convert tree model dump to data.table} \usage{ -xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL, - model = NULL, text = NULL, n_first_tree = NULL) +xgb.model.dt.tree(feature_names = NULL, model = NULL, text = NULL, + n_first_tree = NULL) } \arguments{ \item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.} -\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).} - \item{model}{dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.} \item{text}{dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).} diff --git a/R-package/man/xgb.plot.deepness.Rd b/R-package/man/xgb.plot.deepness.Rd index e54d5141b..6488514dd 100644 --- a/R-package/man/xgb.plot.deepness.Rd +++ b/R-package/man/xgb.plot.deepness.Rd @@ -4,11 +4,9 @@ \alias{xgb.plot.deepness} \title{Plot model trees deepness} \usage{ -xgb.plot.deepness(filename_dump = NULL, model = NULL) +xgb.plot.deepness(model = NULL) } \arguments{ -\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).} - \item{model}{dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.} } \value{ @@ -20,7 +18,7 @@ Generate a graph to plot the distribution of deepness among trees. \details{ Display both the number of \code{leaf} and the distribution of \code{weighted observations} by tree deepness level. -The purpose of this function is to help the user to find the best trad-off to set +The purpose of this function is to help the user to find the best trade-off to set the \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off. See \link{xgb.train} for more information about these parameters. @@ -32,7 +30,7 @@ The graph is made of two parts: \item Weighted cover: noramlized weighted cover per Leaf (weighted number of instances). 
} -This function is very inspired from this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html} +This function is inspired by this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html} } \examples{ data(agaricus.train, package='xgboost') diff --git a/R-package/man/xgb.plot.multi.trees.Rd b/R-package/man/xgb.plot.multi.trees.Rd index 2bbe29ca5..b3cacc122 100644 --- a/R-package/man/xgb.plot.multi.trees.Rd +++ b/R-package/man/xgb.plot.multi.trees.Rd @@ -10,13 +10,11 @@ xgb.plot.multi.trees(model, names, features.keep = 5, plot.width = NULL, \arguments{ \item{model}{dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.} -\item{features.keep}{number of features to keep in each position of the multi tree.} +\item{features.keep}{number of features to keep in each position of the multi trees.} \item{plot.width}{width in pixels of the graph to produce} \item{plot.height}{height in pixels of the graph to produce} - -\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).} } \value{ Two graphs showing the distribution of the model deepness. @@ -25,21 +23,23 @@ Two graphs showing the distribution of the model deepness. Visualization of the ensemble of trees as a single collective unit. } \details{ -This function tries to capture the complexity of gradient boosted tree ensembles +This function tries to capture the complexity of gradient boosted tree ensemble in a cohesive way. + The goal is to improve the interpretability of the model generally seen as black box. The function is dedicated to boosting applied to decision trees only. The purpose is to move from an ensemble of trees to a single tree only. + It takes advantage of the fact that the shape of a binary tree is only defined by -its deepness. -Therefore in a boosting model, all trees have the same shape. +its deepness (therefore in a boosting model, all trees have the same shape). + Moreover, the trees tend to reuse the same features. -The function will project each trees on one, and keep for each position the -\code{features.keep} first features (based on Gain per feature). +The function will project each tree on one, and keep for each position the +\code{features.keep} first features (based on Gain per feature measure). -This function is inspired from this blog post: +This function is inspired by this blog post: \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/} } \examples{ From 476a6842eaaecc9ecf1295a0ace2bda18098b3b9 Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Mon, 30 Nov 2015 10:26:23 +0100 Subject: [PATCH 10/25] Fix Rstat --- R-package/R/xgb.importance.R | 6 ------ R-package/man/xgb.importance.Rd | 2 -- 2 files changed, 8 deletions(-) diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index 54c94245c..74151b1c4 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -12,15 +12,9 @@ #' @importFrom Matrix sparseVector #' #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. -#' -#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}). 
-#' #' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file. -#' #' @param data the dataset used for the training step. Will be used with \code{label} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional. -#' #' @param label the label vetor used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional. -#' #' @param target a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurences in a binary classification. The \code{target} function should have only one parameter. This parameter will be used to provide each important feature vector after having applied the split condition, therefore these vector will be only made of 0 and 1 only, whatever was the information before. More information in \code{Detail} part. This parameter is optional. #' #' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd index eac2da657..1f8498deb 100644 --- a/R-package/man/xgb.importance.Rd +++ b/R-package/man/xgb.importance.Rd @@ -17,8 +17,6 @@ xgb.importance(feature_names = NULL, model = NULL, data = NULL, \item{label}{the label vetor used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.} \item{target}{a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurences in a binary classification. The \code{target} function should have only one parameter. This parameter will be used to provide each important feature vector after having applied the split condition, therefore these vector will be only made of 0 and 1 only, whatever was the information before. More information in \code{Detail} part. This parameter is optional.} - -\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).} } \value{ A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. 
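For reference, a minimal sketch of the dump-free workflow this API polishing converges on, assuming the post-patch signature of xgb.importance (the hyper-parameter values are illustrative only):

require(xgboost)
data(agaricus.train, package = 'xgboost')

bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
               eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")

# the importance matrix is now computed straight from the booster object,
# without writing a model dump to disk first
imp <- xgb.importance(feature_names = agaricus.train$data@Dimnames[[2]], model = bst)
print(imp)

Internally this is equivalent to parsing the output of xgb.dump(model = bst, with.stats = T), which is what the refactored function body does.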
From 376ba6912ec1cf058862d7ebe7ea6de836ff4d8f Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Mon, 30 Nov 2015 14:08:27 +0100 Subject: [PATCH 11/25] Update test to take care of API change --- R-package/tests/testthat/test_helpers.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index c51fef1bd..11368216b 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -23,7 +23,7 @@ test_that("xgb.dump works", { test_that("xgb.importance works", { expect_true(xgb.dump(bst, 'xgb.model.dump', with.stats = T)) - importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump') + importance <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst) expect_equal(dim(importance), c(7, 4)) expect_equal(colnames(importance), c("Feature", "Gain", "Cover", "Frequency")) }) From c09c02300a01844dd3a933de0f3dde6677581b10 Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Mon, 30 Nov 2015 15:04:17 +0100 Subject: [PATCH 12/25] Add new tests for new functions --- R-package/R/xgb.importance.R | 29 ++++++++++++------------- R-package/R/xgb.model.dt.tree.R | 14 ++++++------ R-package/R/xgb.plot.multi.trees.R | 2 +- R-package/man/xgb.plot.multi.trees.Rd | 2 +- R-package/tests/testthat/test_helpers.R | 12 ++++++++-- 5 files changed, 33 insertions(+), 26 deletions(-) diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index 74151b1c4..d3a5910b4 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -76,14 +76,23 @@ xgb.importance <- function(feature_names = NULL, model = NULL, data = NULL, labe if(class(label) == "numeric"){ if(sum(label == 0) / length(label) > 0.5) label <- as(label, "sparseVector") } - - text <- xgb.dump(model = model, with.stats = T) - if(text[2] == "bias:"){ - result <- readLines(filename_dump) %>% linearDump(feature_names, .) + treeDump <- function(feature_names, text, keepDetail){ + if(keepDetail) groupBy <- c("Feature", "Split", "MissingNo") else groupBy <- "Feature" + xgb.model.dt.tree(feature_names = feature_names, text = text)[,"MissingNo" := Missing == No ][Feature != "Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequency = .N), by = groupBy, with = T][,`:=`(Gain = Gain / sum(Gain), Cover = Cover / sum(Cover), Frequency = Frequency / sum(Frequency))][order(Gain, decreasing = T)] + } + + linearDump <- function(feature_names, text){ + which(text == "weight:") %>% {a =. + 1; text[a:length(text)]} %>% as.numeric %>% data.table(Feature = feature_names, Weight = .) + } + + model.text.dump <- xgb.dump(model = model, with.stats = T) + + if(model.text.dump[2] == "bias:"){ + result <- model.text.dump %>% linearDump(feature_names, .) 
    if(!is.null(data) | !is.null(label)) warning("data/label: these parameters should only be provided with decision tree based models.")
   } else {
-    result <- treeDump(feature_names, text = text, keepDetail = !is.null(data))
+    result <- treeDump(feature_names, text = model.text.dump, keepDetail = !is.null(data))
     
     # Co-occurence computation
     if(!is.null(data) & !is.null(label) & nrow(result) > 0) {
@@ -102,17 +111,7 @@ xgb.importance <- function(feature_names = NULL, model = NULL, data = NULL, labe
   result
 }
 
-treeDump <- function(feature_names, text, keepDetail){
-  if(keepDetail) groupBy <- c("Feature", "Split", "MissingNo") else groupBy <- "Feature"
-  result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[,"MissingNo" := Missing == No ][Feature != "Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequency = .N), by = groupBy, with = T][,`:=`(Gain = Gain / sum(Gain), Cover = Cover / sum(Cover), Frequency = Frequency / sum(Frequency))][order(Gain, decreasing = T)]
-  
-  result
-}
-
-linearDump <- function(feature_names, text){
-  which(text == "weight:") %>% {a =. + 1; text[a:length(text)]} %>% as.numeric %>% data.table(Feature = feature_names, Weight = .)
-}
 
 # Avoid error messages during CRAN check.
 # The reason is that these variables are never declared
diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R
index a70c344cc..29ef2e1df 100644
--- a/R-package/R/xgb.model.dt.tree.R
+++ b/R-package/R/xgb.model.dt.tree.R
@@ -59,19 +59,19 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, n
     stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
   }
 
-  if (class(model) != "xgb.Booster") {
-    stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
-  }
-
-  if (!class(text) %in% c("character", "NULL")) {
-    stop("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.")
+  if (class(model) != "xgb.Booster" & class(text) != "character") {
+    "model: Has to be an object of class xgb.Booster model generated by the xgb.train function.\n" %>%
+      paste0("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.") %>%
+      stop()
   }
 
   if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) {
     stop("n_first_tree: Has to be a numeric vector of size 1.")
   }
 
-  text <- xgb.dump(model = model, with.stats = T)
+  if(is.null(text)){
+    text <- xgb.dump(model = model, with.stats = T)
+  }
 
   position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text) + 1)
 
diff --git a/R-package/R/xgb.plot.multi.trees.R b/R-package/R/xgb.plot.multi.trees.R
index 13416b480..1efa375a4 100644
--- a/R-package/R/xgb.plot.multi.trees.R
+++ b/R-package/R/xgb.plot.multi.trees.R
@@ -45,7 +45,7 @@
 #' eta = 1, nthread = 2, nround = 30, objective = "binary:logistic",
 #' min_child_weight = 50)
 #'
-#' p <- xgb.plot.multi.trees(bst, agaricus.train$data@Dimnames[[2]], 3)
+#' p <- xgb.plot.multi.trees(model = bst, names = agaricus.train$data@Dimnames[[2]], 3)
 #' print(p)
 #'
 #' @export
diff --git a/R-package/man/xgb.plot.multi.trees.Rd b/R-package/man/xgb.plot.multi.trees.Rd
index b3cacc122..6e59915e2 100644
--- a/R-package/man/xgb.plot.multi.trees.Rd
+++ b/R-package/man/xgb.plot.multi.trees.Rd
@@ -49,7 +49,7 @@ bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.dep
 eta = 1, nthread
= 2, nround = 30, objective = "binary:logistic", min_child_weight = 50) -p <- xgb.plot.multi.trees(bst, agaricus.train$data@Dimnames[[2]], 3) +p <- xgb.plot.multi.trees(model = bst, names = agaricus.train$data@Dimnames[[2]], 3) print(p) } diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index 11368216b..490b6b867 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -19,15 +19,23 @@ bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9, test_that("xgb.dump works", { capture.output(print(xgb.dump(bst))) + expect_true(xgb.dump(bst, 'xgb.model.dump', with.stats = T)) }) test_that("xgb.importance works", { - expect_true(xgb.dump(bst, 'xgb.model.dump', with.stats = T)) importance <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst) expect_equal(dim(importance), c(7, 4)) expect_equal(colnames(importance), c("Feature", "Gain", "Cover", "Frequency")) }) test_that("xgb.plot.tree works", { - xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst) + xgb.plot.tree(names = agaricus.train$data@Dimnames[[2]], model = bst) +}) + +test_that("xgb.plot.deepness works", { + xgb.plot.deepness(model = bst) +}) + +test_that("xgb.plot.multi.trees works", { + xgb.plot.multi.trees(model = bst, names = agaricus.train$data@Dimnames[[2]], 3) }) \ No newline at end of file From 730bd72056d05d3f6144c1bf5ea2aa25109cdaf9 Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Mon, 30 Nov 2015 15:47:10 +0100 Subject: [PATCH 13/25] some fixes for Travis #Rstat --- R-package/R/xgb.model.dt.tree.R | 2 +- R-package/R/xgb.plot.deepness.R | 2 +- R-package/R/xgb.plot.multi.trees.R | 7 ++++--- R-package/R/xgb.plot.tree.R | 17 ++++++----------- R-package/man/xgb.model.dt.tree.Rd | 2 +- R-package/man/xgb.plot.deepness.Rd | 2 +- R-package/man/xgb.plot.multi.trees.Rd | 8 +++++--- R-package/man/xgb.plot.tree.Rd | 8 +++----- R-package/tests/testthat/test_helpers.R | 6 +++--- 9 files changed, 25 insertions(+), 29 deletions(-) diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index 29ef2e1df..4d8e10e3b 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -50,7 +50,7 @@ #' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") #' #' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix. 
-#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], model = bst) +#' xgb.model.dt.tree(feature_names = agaricus.train$data@@Dimnames[[2]], model = bst) #' #' @export xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, n_first_tree = NULL){ diff --git a/R-package/R/xgb.plot.deepness.R b/R-package/R/xgb.plot.deepness.R index b6c05f727..bebb7605a 100644 --- a/R-package/R/xgb.plot.deepness.R +++ b/R-package/R/xgb.plot.deepness.R @@ -93,7 +93,7 @@ get.paths.to.leaf <- function(dt.tree) { #' @examples #' data(agaricus.train, package='xgboost') #' -#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15, +#' bst <- xgboost(data = agaricus.train$data, max.depth = 15, #' eta = 1, nthread = 2, nround = 30, objective = "binary:logistic", #' min_child_weight = 50) #' diff --git a/R-package/R/xgb.plot.multi.trees.R b/R-package/R/xgb.plot.multi.trees.R index 1efa375a4..f140a959f 100644 --- a/R-package/R/xgb.plot.multi.trees.R +++ b/R-package/R/xgb.plot.multi.trees.R @@ -11,6 +11,7 @@ #' @importFrom stringr str_extract #' #' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file. +#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. #' @param features.keep number of features to keep in each position of the multi trees. #' @param plot.width width in pixels of the graph to produce #' @param plot.height height in pixels of the graph to produce @@ -45,12 +46,12 @@ #' eta = 1, nthread = 2, nround = 30, objective = "binary:logistic", #' min_child_weight = 50) #' -#' p <- xgb.plot.multi.trees(model = bst, names = agaricus.train$data@Dimnames[[2]], 3) +#' p <- xgb.plot.multi.trees(model = bst, feature_names = agaricus.train$data@Dimnames[[2]], features.keep = 3) #' print(p) #' #' @export -xgb.plot.multi.trees <- function(model, names, features.keep = 5, plot.width = NULL, plot.height = NULL){ - tree.matrix <- xgb.model.dt.tree(names, model = model) +xgb.plot.multi.trees <- function(model, feature_names = NULL, features.keep = 5, plot.width = NULL, plot.height = NULL){ + tree.matrix <- xgb.model.dt.tree(feature_names = feature_names, model = model) # first number of the path represents the tree, then the following numbers are related to the path to follow # root init diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 2976f1b07..ea7fabef7 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -7,7 +7,6 @@ #' @importFrom data.table := #' @importFrom magrittr %>% #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. -#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument). #' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file. #' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models. #' @param plot.width the width of the diagram in pixels. 
@@ -40,25 +39,21 @@ #' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") #' #' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix. -#' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], model = bst) +#' xgb.plot.tree(feature_names = agaricus.train$data@@Dimnames[[2]], model = bst) #' #' @export -xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, n_first_tree = NULL, plot.width = NULL, plot.height = NULL){ +xgb.plot.tree <- function(feature_names = NULL, model = NULL, n_first_tree = NULL, plot.width = NULL, plot.height = NULL){ - if (!class(model) %in% c("xgb.Booster", "NULL")) { + if (class(model) != "xgb.Booster") { stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.") } if (!requireNamespace("DiagrammeR", quietly = TRUE)) { stop("DiagrammeR package is required for xgb.plot.tree", call. = FALSE) } - - if(is.null(model)){ - allTrees <- xgb.model.dt.tree(feature_names = feature_names, filename_dump = filename_dump, n_first_tree = n_first_tree) - } else { - allTrees <- xgb.model.dt.tree(feature_names = feature_names, model = model, n_first_tree = n_first_tree) - } - + + allTrees <- xgb.model.dt.tree(feature_names = feature_names, model = model, n_first_tree = n_first_tree) + allTrees[, label:= paste0(Feature, "\nCover: ", Cover, "\nGain: ", Quality)] allTrees[, shape:= "rectangle"][Feature == "Leaf", shape:= "oval"] allTrees[, filledcolor:= "Beige"][Feature == "Leaf", filledcolor:= "Khaki"] diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index 8d88f60f5..7dadb20aa 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -52,7 +52,7 @@ bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") #agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix. -xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst) +xgb.model.dt.tree(feature_names = agaricus.train$data@Dimnames[[2]], model = bst) } diff --git a/R-package/man/xgb.plot.deepness.Rd b/R-package/man/xgb.plot.deepness.Rd index 6488514dd..d011a4dc5 100644 --- a/R-package/man/xgb.plot.deepness.Rd +++ b/R-package/man/xgb.plot.deepness.Rd @@ -35,7 +35,7 @@ This function is inspired by this blog post \url{http://aysent.github.io/2015/11 \examples{ data(agaricus.train, package='xgboost') -bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15, +bst <- xgboost(data = agaricus.train$data, max.depth = 15, eta = 1, nthread = 2, nround = 30, objective = "binary:logistic", min_child_weight = 50) diff --git a/R-package/man/xgb.plot.multi.trees.Rd b/R-package/man/xgb.plot.multi.trees.Rd index 6e59915e2..2d0a1d3e8 100644 --- a/R-package/man/xgb.plot.multi.trees.Rd +++ b/R-package/man/xgb.plot.multi.trees.Rd @@ -4,12 +4,14 @@ \alias{xgb.plot.multi.trees} \title{Project all trees on one tree and plot it} \usage{ -xgb.plot.multi.trees(model, names, features.keep = 5, plot.width = NULL, - plot.height = NULL) +xgb.plot.multi.trees(model, feature_names = NULL, features.keep = 5, + plot.width = NULL, plot.height = NULL) } \arguments{ \item{model}{dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.} +\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). 
If model dump already contains feature names, this argument should be \code{NULL}.} + \item{features.keep}{number of features to keep in each position of the multi trees.} \item{plot.width}{width in pixels of the graph to produce} @@ -49,7 +51,7 @@ bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.dep eta = 1, nthread = 2, nround = 30, objective = "binary:logistic", min_child_weight = 50) -p <- xgb.plot.multi.trees(model = bst, names = agaricus.train$data@Dimnames[[2]], 3) +p <- xgb.plot.multi.trees(model = bst, feature_names = agaricus.train$data@Dimnames[[2]], features.keep = 3) print(p) } diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd index 2008014cf..16e80f9ee 100644 --- a/R-package/man/xgb.plot.tree.Rd +++ b/R-package/man/xgb.plot.tree.Rd @@ -4,14 +4,12 @@ \alias{xgb.plot.tree} \title{Plot a boosted tree model} \usage{ -xgb.plot.tree(feature_names = NULL, filename_dump = NULL, model = NULL, - n_first_tree = NULL, plot.width = NULL, plot.height = NULL) +xgb.plot.tree(feature_names = NULL, model = NULL, n_first_tree = NULL, + plot.width = NULL, plot.height = NULL) } \arguments{ \item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.} -\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).} - \item{model}{generated by the \code{xgb.train} function. Avoid the creation of a dump file.} \item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.} @@ -51,7 +49,7 @@ bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") #agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix. 
-xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst) +xgb.plot.tree(feature_names = agaricus.train$data@Dimnames[[2]], model = bst) } diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index 490b6b867..d4e547de5 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -23,13 +23,13 @@ test_that("xgb.dump works", { }) test_that("xgb.importance works", { - importance <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst) + importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst) expect_equal(dim(importance), c(7, 4)) expect_equal(colnames(importance), c("Feature", "Gain", "Cover", "Frequency")) }) test_that("xgb.plot.tree works", { - xgb.plot.tree(names = agaricus.train$data@Dimnames[[2]], model = bst) + xgb.plot.tree(feature_names = agaricus.train$data@Dimnames[[2]], model = bst) }) test_that("xgb.plot.deepness works", { @@ -37,5 +37,5 @@ test_that("xgb.plot.deepness works", { }) test_that("xgb.plot.multi.trees works", { - xgb.plot.multi.trees(model = bst, names = agaricus.train$data@Dimnames[[2]], 3) + xgb.plot.multi.trees(model = bst, feature_names = agaricus.train$data@Dimnames[[2]], 3) }) \ No newline at end of file From 2ca4016a1fdd7b62bd821b34ee7985024f74d1c0 Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Mon, 30 Nov 2015 16:21:43 +0100 Subject: [PATCH 14/25] fix relative to examples #Rstat --- R-package/R/xgb.importance.R | 3 +-- R-package/R/xgb.plot.deepness.R | 2 +- R-package/R/xgb.plot.tree.R | 3 +-- R-package/man/xgb.importance.Rd | 3 +-- R-package/man/xgb.plot.deepness.Rd | 2 +- R-package/man/xgb.plot.tree.Rd | 3 +-- 6 files changed, 6 insertions(+), 10 deletions(-) diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index d3a5910b4..78fcaf3ac 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -48,9 +48,8 @@ #' # Both dataset are list with two items, a sparse matrix and labels #' # (labels = outcome column which will be learned). #' # Each column of the sparse Matrix is a feature in one hot encoding format. -#' train <- agaricus.train #' -#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, #' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") #' #' # train$data@@Dimnames[[2]] represents the column names of the sparse matrix. diff --git a/R-package/R/xgb.plot.deepness.R b/R-package/R/xgb.plot.deepness.R index bebb7605a..b6c05f727 100644 --- a/R-package/R/xgb.plot.deepness.R +++ b/R-package/R/xgb.plot.deepness.R @@ -93,7 +93,7 @@ get.paths.to.leaf <- function(dt.tree) { #' @examples #' data(agaricus.train, package='xgboost') #' -#' bst <- xgboost(data = agaricus.train$data, max.depth = 15, +#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15, #' eta = 1, nthread = 2, nround = 30, objective = "binary:logistic", #' min_child_weight = 50) #' diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index ea7fabef7..59822ec83 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -33,9 +33,8 @@ #' #Both dataset are list with two items, a sparse matrix and labels #' #(labels = outcome column which will be learned). #' #Each column of the sparse Matrix is a feature in one hot encoding format. 
-#' train <- agaricus.train #' -#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, #' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") #' #' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix. diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd index 1f8498deb..14604312e 100644 --- a/R-package/man/xgb.importance.Rd +++ b/R-package/man/xgb.importance.Rd @@ -54,9 +54,8 @@ data(agaricus.train, package='xgboost') # Both dataset are list with two items, a sparse matrix and labels # (labels = outcome column which will be learned). # Each column of the sparse Matrix is a feature in one hot encoding format. -train <- agaricus.train -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") # train$data@Dimnames[[2]] represents the column names of the sparse matrix. diff --git a/R-package/man/xgb.plot.deepness.Rd b/R-package/man/xgb.plot.deepness.Rd index d011a4dc5..6488514dd 100644 --- a/R-package/man/xgb.plot.deepness.Rd +++ b/R-package/man/xgb.plot.deepness.Rd @@ -35,7 +35,7 @@ This function is inspired by this blog post \url{http://aysent.github.io/2015/11 \examples{ data(agaricus.train, package='xgboost') -bst <- xgboost(data = agaricus.train$data, max.depth = 15, +bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15, eta = 1, nthread = 2, nround = 30, objective = "binary:logistic", min_child_weight = 50) diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd index 16e80f9ee..164b013c1 100644 --- a/R-package/man/xgb.plot.tree.Rd +++ b/R-package/man/xgb.plot.tree.Rd @@ -43,9 +43,8 @@ data(agaricus.train, package='xgboost') #Both dataset are list with two items, a sparse matrix and labels #(labels = outcome column which will be learned). #Each column of the sparse Matrix is a feature in one hot encoding format. -train <- agaricus.train -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") #agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix. From 8252d0d9f52d34e5fe182dcb667db520b5249060 Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Mon, 30 Nov 2015 16:33:33 +0100 Subject: [PATCH 15/25] fix example --- R-package/R/xgb.importance.R | 6 +++--- R-package/R/xgb.plot.importance.R | 7 +++---- R-package/man/xgb.importance.Rd | 6 +++--- R-package/man/xgb.plot.importance.Rd | 7 +++---- 4 files changed, 12 insertions(+), 14 deletions(-) diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index 78fcaf3ac..55f680c42 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -52,11 +52,11 @@ #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, #' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") #' -#' # train$data@@Dimnames[[2]] represents the column names of the sparse matrix. -#' xgb.importance(train$data@@Dimnames[[2]], model = bst) +#' # agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix. 
+#' xgb.importance(agaricus.train$data@@Dimnames[[2]], model = bst) #' #' # Same thing with co-occurence computation this time -#' xgb.importance(train$data@@Dimnames[[2]], model = bst, data = train$data, label = train$label) +#' xgb.importance(agaricus.train$data@@Dimnames[[2]], model = bst, data = agaricus.train$data, label = agaricus.train$label) #' #' @export xgb.importance <- function(feature_names = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ( (x + label) == 2)){ diff --git a/R-package/R/xgb.plot.importance.R b/R-package/R/xgb.plot.importance.R index ea3e17892..96b576ee3 100644 --- a/R-package/R/xgb.plot.importance.R +++ b/R-package/R/xgb.plot.importance.R @@ -19,13 +19,12 @@ #' #Both dataset are list with two items, a sparse matrix and labels #' #(labels = outcome column which will be learned). #' #Each column of the sparse Matrix is a feature in one hot encoding format. -#' train <- agaricus.train #' -#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, #' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") #' -#' #train$data@@Dimnames[[2]] represents the column names of the sparse matrix. -#' importance_matrix <- xgb.importance(train$data@@Dimnames[[2]], model = bst) +#' #agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix. +#' importance_matrix <- xgb.importance(agaricus.train$data@@Dimnames[[2]], model = bst) #' xgb.plot.importance(importance_matrix) #' #' @export diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd index 14604312e..c144bb85f 100644 --- a/R-package/man/xgb.importance.Rd +++ b/R-package/man/xgb.importance.Rd @@ -58,11 +58,11 @@ data(agaricus.train, package='xgboost') bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") -# train$data@Dimnames[[2]] represents the column names of the sparse matrix. -xgb.importance(train$data@Dimnames[[2]], model = bst) +# agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix. +xgb.importance(agaricus.train$data@Dimnames[[2]], model = bst) # Same thing with co-occurence computation this time -xgb.importance(train$data@Dimnames[[2]], model = bst, data = train$data, label = train$label) +xgb.importance(agaricus.train$data@Dimnames[[2]], model = bst, data = agaricus.train$data, label = agaricus.train$label) } diff --git a/R-package/man/xgb.plot.importance.Rd b/R-package/man/xgb.plot.importance.Rd index 4ade2cda3..f49f57027 100644 --- a/R-package/man/xgb.plot.importance.Rd +++ b/R-package/man/xgb.plot.importance.Rd @@ -28,13 +28,12 @@ data(agaricus.train, package='xgboost') #Both dataset are list with two items, a sparse matrix and labels #(labels = outcome column which will be learned). #Each column of the sparse Matrix is a feature in one hot encoding format. -train <- agaricus.train -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") -#train$data@Dimnames[[2]] represents the column names of the sparse matrix. -importance_matrix <- xgb.importance(train$data@Dimnames[[2]], model = bst) +#agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix. 
+importance_matrix <- xgb.importance(agaricus.train$data@Dimnames[[2]], model = bst) xgb.plot.importance(importance_matrix) } From 0ab719b59b9880a253b03aa91c8e5fdbea8ea25b Mon Sep 17 00:00:00 2001 From: "Yuan (Terry) Tang" Date: Tue, 1 Dec 2015 08:39:25 -0600 Subject: [PATCH 16/25] Disable Python lint test temporarily --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index c7049be94..c96c4b742 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ os: env: matrix: - TASK=lint LINT_LANG=cpp - - TASK=lint LINT_LANG=python + #- TASK=lint LINT_LANG=python - TASK=R-package CXX=g++ - TASK=python-package CXX=g++ - TASK=python-package3 CXX=g++ From 6ce57d9cf8cfe2bbb7756ff2c853f68abd56a5d5 Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Tue, 1 Dec 2015 15:44:27 +0100 Subject: [PATCH 17/25] Add new tests for helper functions --- R-package/tests/testthat/test_helpers.R | 29 ++++++++++++++++++------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index d4e547de5..2fec51bef 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -5,7 +5,7 @@ require(data.table) require(Matrix) require(vcd) -set.seed(1994) +set.seed(1982) data(Arthritis) data(agaricus.train, package='xgboost') df <- data.table(Arthritis, keep.rownames = F) @@ -17,25 +17,38 @@ output_vector <- df[,Y := 0][Improved == "Marked",Y := 1][,Y] bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9, eta = 1, nthread = 2, nround = 10,objective = "binary:logistic") +feature.names <- agaricus.train$data@Dimnames[[2]] + test_that("xgb.dump works", { capture.output(print(xgb.dump(bst))) expect_true(xgb.dump(bst, 'xgb.model.dump', with.stats = T)) }) -test_that("xgb.importance works", { +test_that("xgb.model.dt.tree works with and without feature names", { + names.dt.trees <- c("ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover", + "Tree", "Yes.Feature", "Yes.Cover", "Yes.Quality", "No.Feature", "No.Cover", "No.Quality") + dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst) + expect_equal(names.dt.trees, names(dt.tree)) + expect_equal(dim(dt.tree), c(162, 15)) + xgb.model.dt.tree(model = bst) +}) + +test_that("xgb.importance works with and without feature names", { importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst) expect_equal(dim(importance), c(7, 4)) expect_equal(colnames(importance), c("Feature", "Gain", "Cover", "Frequency")) + xgb.importance(model = bst) }) -test_that("xgb.plot.tree works", { - xgb.plot.tree(feature_names = agaricus.train$data@Dimnames[[2]], model = bst) +test_that("xgb.plot.tree works with and without feature names", { + xgb.plot.tree(feature_names = feature.names, model = bst) + xgb.plot.tree(model = bst) }) +test_that("xgb.plot.multi.trees works with and without feature names", { + xgb.plot.multi.trees(model = bst, feature_names = feature.names, features.keep = 3) + xgb.plot.multi.trees(model = bst, features.keep = 3) +}) test_that("xgb.plot.deepness works", { xgb.plot.deepness(model = bst) }) - -test_that("xgb.plot.multi.trees works", { - xgb.plot.multi.trees(model = bst, feature_names = agaricus.train$data@Dimnames[[2]], 3) -}) \ No newline at end of file From b05d5d3f243973e0921eca24ce8683e447eaea8f Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Tue, 1 Dec 2015 18:44:25 +0100 Subject: [PATCH 18/25] Improve 
feature importance on GLM model

---
 R-package/R/xgb.importance.R            |  4 +++-
 R-package/tests/testthat/test_helpers.R | 11 ++++++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R
index 55f680c42..07211ff59 100644
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -82,7 +82,9 @@ xgb.importance <- function(feature_names = NULL, model = NULL, data = NULL, labe
   }
 
   linearDump <- function(feature_names, text){
-    which(text == "weight:") %>% {a =. + 1; text[a:length(text)]} %>% as.numeric %>% data.table(Feature = feature_names, Weight = .)
+    weights <- which(text == "weight:") %>% {a = . + 1; text[a:length(text)]} %>% as.numeric
+    if(is.null(feature_names)) feature_names <- seq(to = length(weights))
+    data.table(Feature = feature_names, Weight = weights)
   }
 
   model.text.dump <- xgb.dump(model = model, with.stats = T)
diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R
index 2fec51bef..262ec1cd6 100644
--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@@ -15,7 +15,7 @@ df[,ID := NULL]
 sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
 output_vector <- df[,Y := 0][Improved == "Marked",Y := 1][,Y]
 bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
-               eta = 1, nthread = 2, nround = 10,objective = "binary:logistic")
+               eta = 1, nthread = 2, nround = 10, objective = "binary:logistic")
 
 feature.names <- agaricus.train$data@Dimnames[[2]]
 
@@ -40,6 +40,15 @@ test_that("xgb.importance works with and without feature names", {
   xgb.importance(model = bst)
 })
 
+test_that("xgb.importance works with GLM model", {
+  bst.GLM <- xgboost(data = sparse_matrix, label = output_vector,
+                     eta = 1, nthread = 2, nround = 10, objective = "binary:logistic", booster = "gblinear")
+  importance.GLM <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst.GLM)
+  expect_equal(dim(importance.GLM), c(10, 2))
+  expect_equal(colnames(importance.GLM), c("Feature", "Weight"))
+  xgb.importance(model = bst.GLM)
+})
+
 test_that("xgb.plot.tree works with and without feature names", {
   xgb.plot.tree(feature_names = feature.names, model = bst)
   xgb.plot.tree(model = bst)

From 5575257b086accf9df0300a7ffbb8c0b97d6132f Mon Sep 17 00:00:00 2001
From: Bing Xu
Date: Wed, 2 Dec 2015 01:28:23 -0700
Subject: [PATCH 19/25] Update README.md

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index 363799b23..b68b309b5 100644
--- a/README.md
+++ b/README.md
@@ -30,6 +30,9 @@ Contents
 What's New
 ----------
 
+* XGBoost helps Vlad Mironov, Alexander Guschin to win the [CERN LHCb experiment Flavour of Physics competition](https://www.kaggle.com/c/flavours-of-physics). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/11/30/flavour-of-physics-technical-write-up-1st-place-go-polar-bears/).
+* XGBoost helps Mario Filho, Josef Feigl, Lucas, Gilberto to win the [Caterpillar Tube Pricing competition](https://www.kaggle.com/c/caterpillar-tube-pricing). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/09/22/caterpillar-winners-interview-1st-place-gilberto-josef-leustagos-mario/).
+* XGBoost helps Halla Yang to win the [Recruit Coupon Purchase Prediction Challenge](https://www.kaggle.com/c/coupon-purchase-prediction). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/10/21/recruit-coupon-purchase-winners-interview-2nd-place-halla-yang/).
* XGBoost helps Owen Zhang to win the [Avito Context Ad Click competition](https://www.kaggle.com/c/avito-context-ad-clicks). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/08/26/avito-winners-interview-1st-place-owen-zhang/).
* XGBoost helps Chenglong Chen to win [Kaggle CrowdFlower Competition](https://www.kaggle.com/c/crowdflower-search-relevance) Check out the [winning solution](https://github.com/ChenglongChen/Kaggle_CrowdFlower)

From d04f7005deef7a4fa126ec691f2b8dabb7dfb770 Mon Sep 17 00:00:00 2001
From: pommedeterresautee
Date: Wed, 2 Dec 2015 10:39:57 +0100
Subject: [PATCH 20/25] Add support of GLM model in importance plot function

---
 R-package/R/xgb.plot.importance.R    | 22 +++++++++++++++-------
 R-package/man/xgb.plot.importance.Rd |  4 ++--
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/R-package/R/xgb.plot.importance.R b/R-package/R/xgb.plot.importance.R
index 96b576ee3..1fcd7c014 100644
--- a/R-package/R/xgb.plot.importance.R
+++ b/R-package/R/xgb.plot.importance.R
@@ -1,6 +1,6 @@
 #' Plot feature importance bar graph
 #'
-#' Read a data.table containing feature importance details and plot it.
+#' Read a data.table containing feature importance details and plot it (for both GLM and Trees).
 #'
 #' @importFrom magrittr %>%
 #' @param importance_matrix a \code{data.table} returned by the \code{xgb.importance} function.
@@ -10,7 +10,7 @@
 #'
 #' @details
 #' The purpose of this function is to easily represent the importance of each feature of a model.
-#' The function return a ggplot graph, therefore each of its characteristic can be overriden (to customize it).
+#' The function returns a ggplot graph, therefore each of its characteristics can be overridden (to customize it).
 #' In particular you may want to override the title of the graph. To do so, add \code{+ ggtitle("A GRAPH NAME")} next to the value returned by this function.
 #'
 #' @examples
@@ -40,21 +40,29 @@ xgb.plot.importance <-
     stop("Ckmeans.1d.dp package is required for plotting the importance", call.
         = FALSE)
   }

+  if(isTRUE(all.equal(colnames(importance_matrix), c("Feature", "Gain", "Cover", "Frequency")))){
+    y.axe.name <- "Gain"
+  } else if(isTRUE(all.equal(colnames(importance_matrix), c("Feature", "Weight")))){
+    y.axe.name <- "Weight"
+  } else {
+    stop("Importance matrix is not correct (column names issue)")
+  }
+
   # To avoid issues in clustering when co-occurrences are used
   importance_matrix <-
-    importance_matrix[, .(Gain = sum(Gain)), by = Feature]
+    importance_matrix[, .(Gain.or.Weight = sum(get(y.axe.name))), by = Feature]

   clusters <-
-    suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters))
+    suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain.or.Weight], numberOfClusters))
   importance_matrix[,"Cluster":= clusters$cluster %>% as.character]

   plot <-
     ggplot2::ggplot(
       importance_matrix, ggplot2::aes(
-        x = stats::reorder(Feature, Gain), y = Gain, width = 0.05
+        x = stats::reorder(Feature, Gain.or.Weight), y = Gain.or.Weight, width = 0.05
       ), environment = environment()
     ) + ggplot2::geom_bar(ggplot2::aes(fill = Cluster), stat = "identity", position =
-          "identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme(
+          "identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab(y.axe.name) + ggplot2::ggtitle("Feature importance") + ggplot2::theme(
       plot.title = ggplot2::element_text(lineheight = .9, face = "bold"),
       panel.grid.major.y = ggplot2::element_blank()
     )
@@ -66,6 +74,6 @@ xgb.plot.importance <-
 # They are mainly column names inferred by Data.table...
 globalVariables(
   c(
-    "Feature", "Gain", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text"
+    "Feature", "Gain.or.Weight", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text"
   )
 )
diff --git a/R-package/man/xgb.plot.importance.Rd b/R-package/man/xgb.plot.importance.Rd
index f49f57027..2f9d5651d 100644
--- a/R-package/man/xgb.plot.importance.Rd
+++ b/R-package/man/xgb.plot.importance.Rd
@@ -15,11 +15,11 @@ xgb.plot.importance(importance_matrix = NULL, numberOfClusters = c(1:10))
 A \code{ggplot2} bar graph representing each feature by a horizontal bar. The longer the bar, the more important the feature. Features are classified by importance and clustered by importance. The group is represented through the color of the bar.
 }
 \description{
-Read a data.table containing feature importance details and plot it.
+Read a data.table containing feature importance details and plot it (for both GLM and Trees).
 }
 \details{
 The purpose of this function is to easily represent the importance of each feature of a model.
-The function return a ggplot graph, therefore each of its characteristic can be overriden (to customize it).
+The function returns a ggplot graph, therefore each of its characteristics can be overridden (to customize it).
 In particular you may want to override the title of the graph. To do so, add \code{+ ggtitle("A GRAPH NAME")} next to the value returned by this function.
 }
 \examples{
From 1678a6fbdb4c32e3e21beace264e9869654e7a88 Mon Sep 17 00:00:00 2001
From: pommedeterresautee
Date: Wed, 2 Dec 2015 10:40:15 +0100
Subject: [PATCH 21/25] Increase coverage of tests #Rstat

---
 R-package/tests/testthat/test_helpers.R | 39 ++++++++++++++-----------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R
index 262ec1cd6..efc22f0b9 100644
--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@@ -14,50 +14,55 @@ df[,AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]
 df[,ID := NULL]
 sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
 output_vector <- df[,Y := 0][Improved == "Marked",Y := 1][,Y]
-bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
-               eta = 1, nthread = 2, nround = 10, objective = "binary:logistic")
+bst.Tree <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
+                    eta = 1, nthread = 2, nround = 10, objective = "binary:logistic", booster = "gbtree")
+
+bst.GLM <- xgboost(data = sparse_matrix, label = output_vector,
+                   eta = 1, nthread = 2, nround = 10, objective = "binary:logistic", booster = "gblinear")

 feature.names <- agaricus.train$data@Dimnames[[2]]

 test_that("xgb.dump works", {
-  capture.output(print(xgb.dump(bst)))
-  expect_true(xgb.dump(bst, 'xgb.model.dump', with.stats = T))
+  capture.output(print(xgb.dump(bst.Tree)))
+  capture.output(print(xgb.dump(bst.GLM)))
+  expect_true(xgb.dump(bst.Tree, 'xgb.model.dump', with.stats = T))
 })

 test_that("xgb.model.dt.tree works with and without feature names", {
   names.dt.trees <- c("ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover", "Tree", "Yes.Feature", "Yes.Cover", "Yes.Quality", "No.Feature", "No.Cover", "No.Quality")
-  dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst)
+  dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst.Tree)
   expect_equal(names.dt.trees, names(dt.tree))
   expect_equal(dim(dt.tree), c(162, 15))
-  xgb.model.dt.tree(model = bst)
+  xgb.model.dt.tree(model = bst.Tree)
 })

 test_that("xgb.importance works with and without feature names", {
-  importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst)
-  expect_equal(dim(importance), c(7, 4))
-  expect_equal(colnames(importance), c("Feature", "Gain", "Cover", "Frequency"))
-  xgb.importance(model = bst)
+  importance.Tree <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst.Tree)
+  expect_equal(dim(importance.Tree), c(7, 4))
+  expect_equal(colnames(importance.Tree), c("Feature", "Gain", "Cover", "Frequency"))
+  xgb.importance(model = bst.Tree)
+  xgb.plot.importance(importance_matrix = importance.Tree)
 })

 test_that("xgb.importance works with GLM model", {
-  bst.GLM <- xgboost(data = sparse_matrix, label = output_vector,
-                     eta = 1, nthread = 2, nround = 10, objective = "binary:logistic", booster = "gblinear")
   importance.GLM <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst.GLM)
   expect_equal(dim(importance.GLM), c(10, 2))
   expect_equal(colnames(importance.GLM), c("Feature", "Weight"))
   xgb.importance(model = bst.GLM)
+  xgb.plot.importance(importance.GLM)
 })

 test_that("xgb.plot.tree works with and without feature names", {
-  xgb.plot.tree(feature_names = feature.names, model = bst)
-  xgb.plot.tree(model = bst)
+  xgb.plot.tree(feature_names = feature.names, model = bst.Tree)
+  xgb.plot.tree(model = bst.Tree)
 })

 test_that("xgb.plot.multi.trees works with and without feature names", {
-  xgb.plot.multi.trees(model = bst, feature_names = feature.names, features.keep = 3)
-  xgb.plot.multi.trees(model = bst, features.keep = 3)
+  xgb.plot.multi.trees(model = bst.Tree, feature_names = feature.names, features.keep = 3)
+  xgb.plot.multi.trees(model = bst.Tree, features.keep = 3)
 })
+
 test_that("xgb.plot.deepness works", {
-  xgb.plot.deepness(model = bst)
+  xgb.plot.deepness(model = bst.Tree)
 })
From 8233d589b64a7c487d8413cc032ce921789cc7f7 Mon Sep 17 00:00:00 2001
From: pommedeterresautee
Date: Wed, 2 Dec 2015 15:47:12 +0100
Subject: [PATCH 22/25] Improve predict function documentation

---
 R-package/R/predict.xgb.Booster.R           | 11 +++++++++++
 R-package/man/predict-xgb.Booster-method.Rd | 10 ++++++++++
 2 files changed, 21 insertions(+)

diff --git a/R-package/R/predict.xgb.Booster.R b/R-package/R/predict.xgb.Booster.R
index abdb94e75..d608f3465 100644
--- a/R-package/R/predict.xgb.Booster.R
+++ b/R-package/R/predict.xgb.Booster.R
@@ -20,6 +20,17 @@ setClass("xgb.Booster",
 #' only valid for gbtree, but not for gblinear. set it to be value bigger
 #' than 0. It will use all trees by default.
 #' @param predleaf whether predict leaf index instead. If set to TRUE, the output will be a matrix object.
+#'
+#' @details
+#' The purpose of the option \code{ntreelimit} is to let the user train a model with lots
+#' of trees but use only the first trees for prediction, to avoid overfitting
+#' (without having to train a new model with fewer trees).
+#'
+#' The option \code{predleaf} is inspired by §3.1 of the paper
+#' \code{Practical Lessons from Predicting Clicks on Ads at Facebook}.
+#' The idea is to use the model as a generator of new features which capture non-linear links
+#' from the original features.
+#'
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
diff --git a/R-package/man/predict-xgb.Booster-method.Rd b/R-package/man/predict-xgb.Booster-method.Rd
index 13f37802e..341ced8c6 100644
--- a/R-package/man/predict-xgb.Booster-method.Rd
+++ b/R-package/man/predict-xgb.Booster-method.Rd
@@ -31,6 +31,16 @@ than 0. It will use all trees by default.}
 \description{
 Predicted values based on xgboost model object.
 }
+\details{
+The purpose of the option \code{ntreelimit} is to let the user train a model with lots
+of trees but use only the first trees for prediction, to avoid overfitting
+(without having to train a new model with fewer trees).
+
+The option \code{predleaf} is inspired by §3.1 of the paper
+\code{Practical Lessons from Predicting Clicks on Ads at Facebook}.
+The idea is to use the model as a generator of new features which capture non-linear links
+from the original features.
+}
 \examples{
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
From e384f549f4148f3daf650a8c1ccc701478d1b636 Mon Sep 17 00:00:00 2001
From: pommedeterresautee
Date: Wed, 2 Dec 2015 15:47:45 +0100
Subject: [PATCH 23/25] Cleaning of demo

---
 R-package/demo/basic_walkthrough.R     | 4 ++--
 R-package/demo/boost_from_prediction.R | 2 +-
 R-package/demo/create_sparse_matrix.R  | 3 +--
 R-package/demo/cross_validation.R      | 4 ++--
 R-package/demo/predict_leaf_indices.R  | 8 ++++----
 5 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/R-package/demo/basic_walkthrough.R b/R-package/demo/basic_walkthrough.R
index 0b1e5b817..193618be3 100644
--- a/R-package/demo/basic_walkthrough.R
+++ b/R-package/demo/basic_walkthrough.R
@@ -102,9 +102,9 @@ xgb.dump(bst, "dump.raw.txt", with.stats = T)

 # Finally, you can check which features are the most important.
 print("Most important features (look at column Gain):")
-imp_matrix <- xgb.importance(feature_names = train$data@Dimnames[[2]], filename_dump = "dump.raw.txt")
+imp_matrix <- xgb.importance(feature_names = train$data@Dimnames[[2]], model = bst)
 print(imp_matrix)

 # Feature importance bar plot by gain
 print("Feature importance Plot : ")
-print(xgb.plot.importance(imp_matrix))
+print(xgb.plot.importance(importance_matrix = imp_matrix))
diff --git a/R-package/demo/boost_from_prediction.R b/R-package/demo/boost_from_prediction.R
index 9d7db806b..7fa7d8545 100644
--- a/R-package/demo/boost_from_prediction.R
+++ b/R-package/demo/boost_from_prediction.R
@@ -23,4 +23,4 @@ setinfo(dtrain, "base_margin", ptrain)
 setinfo(dtest, "base_margin", ptest)

 print('this is result of boost from initial prediction')
-bst <- xgb.train( param, dtrain, 1, watchlist )
+bst <- xgb.train(params = param, data = dtrain, nrounds = 1, watchlist = watchlist)
diff --git a/R-package/demo/create_sparse_matrix.R b/R-package/demo/create_sparse_matrix.R
index 2fbf41772..7a8dfaa82 100644
--- a/R-package/demo/create_sparse_matrix.R
+++ b/R-package/demo/create_sparse_matrix.R
@@ -67,10 +67,9 @@ output_vector = df[,Y:=0][Improved == "Marked",Y:=1][,Y]
 cat("Learning...\n")
 bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
                eta = 1, nthread = 2, nround = 10,objective = "binary:logistic")
-xgb.dump(bst, 'xgb.model.dump', with.stats = T)

 # sparse_matrix@Dimnames[[2]] represents the column names of the sparse matrix.
-importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump')
+importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst)
 print(importance)

 # According to the matrix below, the most important feature in this dataset to predict if the treatment will work is the Age. The second most important feature is having received a placebo or not. The sex is third. Then we see our generated features (AgeDiscret). We can see that their contribution is very low (Gain column).
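
A minimal sketch of the \code{predleaf} idea documented in patch 22/25 above, assuming the agaricus data shipped with the package (the parameter values here are illustrative and not part of the patch series):

require(xgboost)
data(agaricus.train, package = 'xgboost')
dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
param <- list(max.depth = 2, eta = 1, silent = 1, objective = 'binary:logistic')
bst <- xgb.train(params = param, data = dtrain, nrounds = 4)
# With predleaf = TRUE, predict returns one column per tree, holding the index
# of the leaf each observation falls into; one-hot encoding these indices gives
# new features that capture non-linear combinations of the original ones.
leaf.index <- predict(bst, agaricus.train$data, predleaf = TRUE)
dim(leaf.index)  # one row per observation, one column per tree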
diff --git a/R-package/demo/cross_validation.R b/R-package/demo/cross_validation.R
index c3148ae21..5d748f679 100644
--- a/R-package/demo/cross_validation.R
+++ b/R-package/demo/cross_validation.R
@@ -43,9 +43,9 @@ evalerror <- function(preds, dtrain) {
 param <- list(max.depth=2,eta=1,silent=1, objective = logregobj, eval_metric = evalerror)

 # train with customized objective
-xgb.cv(param, dtrain, nround, nfold = 5)
+xgb.cv(params = param, data = dtrain, nrounds = nround, nfold = 5)

 # do cross validation with prediction values for each fold
-res <- xgb.cv(param, dtrain, nround, nfold=5, prediction = TRUE)
+res <- xgb.cv(params = param, data = dtrain, nrounds = nround, nfold = 5, prediction = TRUE)
 res$dt
 length(res$pred)
diff --git a/R-package/demo/predict_leaf_indices.R b/R-package/demo/predict_leaf_indices.R
index c03a17955..110bf9602 100644
--- a/R-package/demo/predict_leaf_indices.R
+++ b/R-package/demo/predict_leaf_indices.R
@@ -2,15 +2,15 @@ require(xgboost)
 # load in the agaricus dataset
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
-dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
-dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
+dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)

-param <- list(max.depth=2,eta=1,silent=1,objective='binary:logistic')
+param <- list(max.depth=2, eta=1, silent=1, objective='binary:logistic')
 watchlist <- list(eval = dtest, train = dtrain)
 nround = 5

 # training the model for nround rounds
-bst = xgb.train(param, dtrain, nround, nthread = 2, watchlist)
+bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2, watchlist = watchlist)

 cat('start testing prediction from first n trees\n')
 ### predict using first 2 trees
From 0abb4338a9b01310dbabefb572fe04acee613b81 Mon Sep 17 00:00:00 2001
From: pommedeterresautee
Date: Wed, 2 Dec 2015 15:48:01 +0100
Subject: [PATCH 24/25] Cleaning up documentation

---
 R-package/vignettes/discoverYourData.Rmd    | 4 ++--
 R-package/vignettes/xgboostPresentation.Rmd | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/R-package/vignettes/discoverYourData.Rmd b/R-package/vignettes/discoverYourData.Rmd
index 22d996b08..08d6bfdf5 100644
--- a/R-package/vignettes/discoverYourData.Rmd
+++ b/R-package/vignettes/discoverYourData.Rmd
@@ -190,7 +190,7 @@ Measure feature importance
 In the code below, `sparse_matrix@Dimnames[[2]]` represents the column names of the sparse matrix. These names are the original values of the features (remember, each binary column == one value of one *categorical* feature).

 ```{r}
-importance <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst)
+importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst)
 head(importance)
 ```

@@ -213,7 +213,7 @@ One simple solution is to count the co-occurrences of a feature and a class of t
 For that purpose we will execute the same function as above but using two more parameters, `data` and `label`.
 ```{r}
-importanceRaw <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst, data = sparse_matrix, label = output_vector)
+importanceRaw <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst, data = sparse_matrix, label = output_vector)

 # Cleaning for better display
 importanceClean <- importanceRaw[,`:=`(Cover=NULL, Frequency=NULL)]
diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd
index 45d2e8b8e..7534240ac 100644
--- a/R-package/vignettes/xgboostPresentation.Rmd
+++ b/R-package/vignettes/xgboostPresentation.Rmd
@@ -345,7 +345,7 @@ Feature importance is similar to R gbm package's relative influence (rel.inf).
 ```
 importance_matrix <- xgb.importance(model = bst)
 print(importance_matrix)
-xgb.plot.importance(importance_matrix)
+xgb.plot.importance(importance_matrix = importance_matrix)
 ```

 View the trees from a model
From db922e8c88ff69413af288fbfb3586f5ca784874 Mon Sep 17 00:00:00 2001
From: pommedeterresautee
Date: Wed, 2 Dec 2015 15:48:22 +0100
Subject: [PATCH 25/25] Small rewording of function xgb.importance

---
 R-package/R/xgb.importance.R    | 13 ++++++++-----
 R-package/man/xgb.importance.Rd | 13 ++++++++-----
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R
index 07211ff59..e003277f0 100644
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -25,14 +25,17 @@
 #' Results are returned for both linear and tree models.
 #'
 #' \code{data.table} is returned by the function.
-#' There are 3 columns :
+#' The columns are:
 #' \itemize{
-#'   \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump.
-#'   \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training ;
-#'   \item \code{Cover} metric of the number of observation related to this feature (only available for tree models) ;
-#'   \item \code{Weight} percentage representing the relative number of times a feature have been taken into trees. \code{Gain} should be prefered to search the most important feature. For boosted linear model, this column has no meaning.
+#'   \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump;
+#'   \item \code{Gain} contribution of each feature to the model. For a boosted tree model, each gain of each feature of each tree is taken into account, then averaged per feature to give a vision of the entire model. A higher percentage means a more important feature for predicting the \code{label} used for the training (only available for tree models);
+#'   \item \code{Cover} metric of the number of observations related to this feature (only available for tree models);
+#'   \item \code{Weight} percentage representing the relative number of times a feature has been used in the trees.
 #' }
 #'
+#' If you don't provide feature names, the feature indices are used instead.
+#' They are extracted from the model dump (made on the C++ side); the indices start at 0 (as in C++) instead of 1 (as in R).
+#'
 #' Co-occurrence count
 #' -------------------
 #'
diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd
index c144bb85f..0d59ba556 100644
--- a/R-package/man/xgb.importance.Rd
+++ b/R-package/man/xgb.importance.Rd
@@ -31,14 +31,17 @@ This is the function to understand the model trained (and through your model, yo
 Results are returned for both linear and tree models.

 \code{data.table} is returned by the function.
-There are 3 columns :
+The columns are:
 \itemize{
-  \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump.
-  \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training ;
-  \item \code{Cover} metric of the number of observation related to this feature (only available for tree models) ;
-  \item \code{Weight} percentage representing the relative number of times a feature have been taken into trees. \code{Gain} should be prefered to search the most important feature. For boosted linear model, this column has no meaning.
+  \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump;
+  \item \code{Gain} contribution of each feature to the model. For a boosted tree model, each gain of each feature of each tree is taken into account, then averaged per feature to give a vision of the entire model. A higher percentage means a more important feature for predicting the \code{label} used for the training (only available for tree models);
+  \item \code{Cover} metric of the number of observations related to this feature (only available for tree models);
+  \item \code{Weight} percentage representing the relative number of times a feature has been used in the trees.
 }

+If you don't provide feature names, the feature indices are used instead.
+They are extracted from the model dump (made on the C++ side); the indices start at 0 (as in C++) instead of 1 (as in R).
+
 Co-occurrence count
 -------------------
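
A minimal sketch of the \code{Feature}/\code{Weight} output described above for linear boosters, assuming the agaricus data shipped with the package (parameter values are illustrative):

require(xgboost)
data(agaricus.train, package = 'xgboost')
# With booster = "gblinear" the importance table has two columns, Feature and Weight.
bst.GLM <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
                   eta = 1, nthread = 2, nround = 10,
                   objective = "binary:logistic", booster = "gblinear")
# Omitting feature_names would fall back to the 0-based indices from the C++ dump.
importance.GLM <- xgb.importance(feature_names = agaricus.train$data@Dimnames[[2]],
                                 model = bst.GLM)
print(head(importance.GLM))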