add new parameters to several functions to avoid the need for a text dump

El Potaeto 2015-01-11 03:06:41 +01:00
parent 70df227689
commit c8c5789efd
5 changed files with 48 additions and 21 deletions

R-package/R/xgb.dump.R

@@ -37,11 +37,15 @@
 #'
 xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) {
   if (class(model) != "xgb.Booster") {
-    stop("xgb.dump: first argument must be type xgb.Booster")
+    stop("model: argument must be type xgb.Booster")
   }
-  if (!class(fname) %in% c("character", "NULL")) {
-    stop("xgb.dump: second argument must be type character when provided")
+  if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1)) {
+    stop("fname: argument must be type character (when provided)")
   }
+  if (!(class(fmap) %in% c("character", "NULL") && length(fmap) <= 1)) {
+    stop("fmap: argument must be type character (when provided)")
+  }
   result <- .Call("XGBoosterDumpModel_R", model, fmap, as.integer(with.stats), PACKAGE = "xgboost")
   if(is.null(fname)) {
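
The reworked checks above keep the existing in-memory path: when fname is left NULL, xgb.dump returns the dump as a character vector instead of writing it to disk, which is what the new model arguments below build on. A minimal usage sketch, assuming the agaricus toy data shipped with the package (as in the examples further down):

require(xgboost)
data(agaricus.train, package = "xgboost")

# train a tiny booster, as in the package examples
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nround = 2, objective = "binary:logistic")

# fname is left NULL, so the dump stays in memory as a character vector
dump_lines <- xgb.dump(bst, with.stats = TRUE)
head(dump_lines)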

R-package/R/xgb.importance.R

@@ -9,6 +9,7 @@
 #' @importFrom magrittr %>%
 #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
 #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).
+#' @param model generated by the \code{xgb.train} function. Avoids the creation of a dump file.
 #'
 #' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
 #'
@@ -38,20 +39,30 @@
 #'
 #' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
 #'                eta = 1, nround = 2,objective = "binary:logistic")
-#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
 #'
 #' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix.
-#' xgb.importance(agaricus.test$data@@Dimnames[[2]], 'xgb.model.dump')
+#' xgb.importance(agaricus.test$data@@Dimnames[[2]], model = bst)
 #'
 #' @export
-xgb.importance <- function(feature_names = NULL, filename_dump = NULL){
+xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = NULL){
   if (!class(feature_names) %in% c("character", "NULL")) {
     stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
   }
-  if (class(filename_dump) != "character" || !file.exists(filename_dump)) {
+  if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
     stop("filename_dump: Has to be a path to the model dump file.")
   }
-  text <- readLines(filename_dump)
+  if (!class(model) %in% c("xgb.Booster", "NULL")) {
+    stop("model: Has to be an object of class xgb.Booster, generated by the xgb.train function.")
+  }
+  if(is.null(model)){
+    text <- readLines(filename_dump)
+  } else {
+    text <- xgb.dump(model = model, with.stats = T)
+  }
   if(text[2] == "bias:"){
     result <- linearDump(feature_names, text)
   } else {
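
With this branch in place, the dump file becomes optional: passing the booster through the new model argument makes xgb.importance dump it in memory via xgb.dump. A hedged sketch of the new call, reusing the toy agaricus model from the package examples:

require(xgboost)
data(agaricus.train, package = "xgboost")
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nround = 2, objective = "binary:logistic")

# old workflow: write 'xgb.model.dump' with xgb.dump(), then point filename_dump at it
# new workflow: pass the booster directly, no intermediate file
importance <- xgb.importance(feature_names = agaricus.train$data@Dimnames[[2]],
                             model = bst)
print(importance)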

R-package/R/xgb.plot.tree.R

@@ -17,7 +17,8 @@
 #' @importFrom stringr str_trim
 #' @importFrom DiagrammeR DiagrammeR
 #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
-#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
+#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). A model can also be provided directly (see the \code{model} argument).
+#' @param model generated by the \code{xgb.train} function. Avoids the creation of a dump file.
 #' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
 #' @param style a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.
 #'
@@ -45,19 +46,28 @@
 #'
 #' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
 #'                eta = 1, nround = 2,objective = "binary:logistic")
-#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
 #'
 #' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix.
-#' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], 'xgb.model.dump')
+#' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], model = bst)
 #'
 #' @export
-xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tree = NULL, styles = NULL){
-  if (!class(styles) %in% c("character", "NULL") | length(styles) > 1) {
+#'
+xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, n_first_tree = NULL, styles = NULL){
+  if (!(class(styles) %in% c("character", "NULL") && length(styles) <= 1)) {
     stop("style: Has to be a character vector of size 1.")
   }
-  allTrees <- xgb.model.dt.tree(feature_names, filename_dump, n_first_tree)
+  if (!class(model) %in% c("xgb.Booster", "NULL")) {
+    stop("model: Has to be an object of class xgb.Booster, generated by the xgb.train function.")
+  }
+  if(is.null(model)){
+    allTrees <- xgb.model.dt.tree(feature_names = feature_names, filename_dump = filename_dump, n_first_tree = n_first_tree)
+  } else {
+    text <- xgb.dump(model = model, with.stats = T)
+    allTrees <- xgb.model.dt.tree(feature_names = feature_names, text = text, n_first_tree = n_first_tree)
+  }
   allTrees[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "<br/>Cover: ", Cover, "<br/>Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")]
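
The plotting function follows the same pattern: when model is provided, the dump is produced in memory and handed to xgb.model.dt.tree through its text argument, so filename_dump can stay NULL. A usage sketch under the same assumptions (agaricus toy model, DiagrammeR installed for rendering):

require(xgboost)
data(agaricus.train, package = "xgboost")
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nround = 2, objective = "binary:logistic")

# plot the model directly from the booster; limit to the first tree(s) to keep the graph small
xgb.plot.tree(feature_names = agaricus.train$data@Dimnames[[2]],
              model = bst, n_first_tree = 1)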

R-package/man/xgb.importance.Rd

@@ -4,12 +4,14 @@
 \alias{xgb.importance}
 \title{Show importance of features in a model}
 \usage{
-xgb.importance(feature_names = NULL, filename_dump = NULL)
+xgb.importance(feature_names = NULL, filename_dump = NULL, model = NULL)
 }
 \arguments{
 \item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
 \item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).}
+\item{model}{generated by the \code{xgb.train} function. Avoids the creation of a dump file.}
 }
 \value{
 A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
@@ -43,9 +45,8 @@ test <- agaricus.test
 bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
                eta = 1, nround = 2,objective = "binary:logistic")
-xgb.dump(bst, 'xgb.model.dump', with.stats = T)
 #agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
-xgb.importance(agaricus.test$data@Dimnames[[2]], 'xgb.model.dump')
+xgb.importance(agaricus.test$data@Dimnames[[2]], model = bst)
 }

R-package/man/xgb.plot.tree.Rd

@@ -4,13 +4,15 @@
 \alias{xgb.plot.tree}
 \title{Plot a boosted tree model}
 \usage{
-xgb.plot.tree(feature_names = NULL, filename_dump = NULL,
+xgb.plot.tree(feature_names = NULL, filename_dump = NULL, model = NULL,
   n_first_tree = NULL, styles = NULL)
 }
 \arguments{
 \item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
-\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
+\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). A model can also be provided directly (see the \code{model} argument).}
+\item{model}{generated by the \code{xgb.train} function. Avoids the creation of a dump file.}
 \item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}
@@ -44,9 +46,8 @@ train <- agaricus.train
 bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
                eta = 1, nround = 2,objective = "binary:logistic")
-xgb.dump(bst, 'xgb.model.dump', with.stats = T)
 #agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
-xgb.plot.tree(agaricus.train$data@Dimnames[[2]], 'xgb.model.dump')
+xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst)
 }