From cdea1685e581c081b72a7107bb6fb899fa6c5c2f Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Fri, 2 Jan 2015 11:21:53 +0100 Subject: [PATCH 01/28] Add a new verbose parameter to print progress during the process (set to true by default to not change behavior of existing code) + source code refactoring --- R-package/NAMESPACE | 1 - R-package/R/xgb.cv.R | 26 ++++++++++++-------------- R-package/man/xgb.cv.Rd | 9 ++++++--- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index bd12fc7ec..6e74d9ac2 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -25,5 +25,4 @@ importFrom(stringr,str_extract) importFrom(stringr,str_extract_all) importFrom(stringr,str_match) importFrom(stringr,str_replace) -importFrom(stringr,str_replace_all) importFrom(stringr,str_split) diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index c2e73e202..7256980c6 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -8,8 +8,8 @@ #' @importFrom data.table := #' @importFrom data.table rbindlist #' @importFrom stringr str_extract_all +#' @importFrom stringr str_extract #' @importFrom stringr str_split -#' @importFrom stringr str_replace_all #' @importFrom stringr str_replace #' @importFrom stringr str_match #' @@ -31,7 +31,7 @@ #' @param nrounds the max number of iterations #' @param nfold number of folds used #' @param label option field, when data is Matrix -#' @param showsd boolean, whether show standard deviation of cross validation +#' @param showsd \code{boolean}, whether show standard deviation of cross validation #' @param metrics, list of evaluation metrics to be used in corss validation, #' when it is not specified, the evaluation metric is chosen according to objective function. #' Possible options are: @@ -49,9 +49,10 @@ #' prediction and dtrain, #' @param missing Missing is only used when input is dense matrix, pick a float # value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. +#' @param verbose \code{boolean}, print the statistics during the process. #' @param ... other parameters to pass to \code{params}. #' -#' @return a \code{data.table} with each mean and standard deviation stat for training set and test set. +#' @return A \code{data.table} with each mean and standard deviation stat for training set and test set. #' #' @details #' This is the cross validation function for xgboost @@ -66,10 +67,11 @@ #' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) #' history <- xgb.cv(data = dtrain, nround=3, nfold = 5, metrics=list("rmse","auc"), #' "max.depth"=3, "eta"=1, "objective"="binary:logistic") +#' print(history) #' @export #' xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL, - showsd = TRUE, metrics=list(), obj = NULL, feval = NULL, ...) { + showsd = TRUE, metrics=list(), obj = NULL, feval = NULL, verbose = T,...) { if (typeof(params) != "list") { stop("xgb.cv: first argument params must be list") } @@ -94,28 +96,24 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = for (k in 1:nfold) { fd <- folds[[k]] succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj) - msg[[k]] <- strsplit(xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval), - "\t")[[1]] + msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]] } ret <- xgb.cv.aggcv(msg, showsd) history <- c(history, ret) - cat(paste(ret, "\n", sep="")) + if(verbose) paste(ret, "\n", sep="") %>% cat } - colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace_all("-", ".") - + colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace("-", ".") colnamesMean <- paste(colnames, "mean") colnamesStd <- paste(colnames, "std") + colnames <- c() for(i in 1:length(colnamesMean)) colnames <- c(colnames, colnamesMean[i], colnamesStd[i]) type <- rep(x = "numeric", times = length(colnames)) - dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table - split = str_split(string = history, pattern = "\t") - for(line in split){ - dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d.\\d*") %>% unlist %>% as.list %>% {vec <- .;rbindlist(list(dt, vec), use.names = F, fill = F)} - } + + for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d.\\d*") %>% unlist %>% as.list %>% {vec <- .; rbindlist(list(dt, vec), use.names = F, fill = F)} dt } \ No newline at end of file diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index 19f04ee79..7ba5eb727 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -6,7 +6,7 @@ \usage{ xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NULL, showsd = TRUE, metrics = list(), obj = NULL, - feval = NULL, ...) + feval = NULL, verbose = T, ...) } \arguments{ \item{params}{the list of parameters. Commonly used ones are: @@ -34,7 +34,7 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL, \item{missing}{Missing is only used when input is dense matrix, pick a float} -\item{showsd}{boolean, whether show standard deviation of cross validation} +\item{showsd}{\code{boolean}, whether show standard deviation of cross validation} \item{metrics,}{list of evaluation metrics to be used in corss validation, when it is not specified, the evaluation metric is chosen according to objective function. @@ -54,10 +54,12 @@ gradient with given prediction and dtrain,} \code{list(metric='metric-name', value='metric-value')} with given prediction and dtrain,} +\item{verbose}{\code{boolean}, print the statistics during the process.} + \item{...}{other parameters to pass to \code{params}.} } \value{ -a \code{data.table} with each mean and standard deviation stat for training set and test set. +A \code{data.table} with each mean and standard deviation stat for training set and test set. } \description{ The cross valudation function of xgboost @@ -75,5 +77,6 @@ data(agaricus.train, package='xgboost') dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) history <- xgb.cv(data = dtrain, nround=3, nfold = 5, metrics=list("rmse","auc"), "max.depth"=3, "eta"=1, "objective"="binary:logistic") +print(history) } From cfe5015e5485eaabc6877087f57d9a88d10edaa4 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Sun, 4 Jan 2015 11:21:03 +0100 Subject: [PATCH 02/28] small fix in parsing --- R-package/R/xgb.cv.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 7256980c6..eb14b2e1e 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -114,6 +114,6 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table split = str_split(string = history, pattern = "\t") - for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d.\\d*") %>% unlist %>% as.list %>% {vec <- .; rbindlist(list(dt, vec), use.names = F, fill = F)} + for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.*\\d*") %>% unlist %>% as.list %>% {vec <- .; rbindlist(list(dt, vec), use.names = F, fill = F)} dt } \ No newline at end of file From 8b45ef07ca9b2f1b593b8f93279fe20f15f283d8 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Sun, 4 Jan 2015 11:21:39 +0100 Subject: [PATCH 03/28] build data.table from raw model data --- R-package/R/xgb.plot.tree.R | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 R-package/R/xgb.plot.tree.R diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R new file mode 100644 index 000000000..3f60d598a --- /dev/null +++ b/R-package/R/xgb.plot.tree.R @@ -0,0 +1,24 @@ +require(DiagrammeR) +require(stringr) +require(data.table) +require(magrittr) +text <- readLines('xgb.model.dump') %>% str_trim(side = "both") +position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text)+1) + +extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2]) %>% unlist %>% as.numeric + +#for(i in 1:(length(position)-1)){ +i=1 + cat(paste("\n",i,"\n")) + tree <- text[(position[i]+1):(position[i+1]-1)] + paste(tree, collapse = "\n") %>% cat +branch <- str_match(tree, "leaf") %>% is.na %>% tree[.] +id <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% as.numeric +feature <- str_extract(branch, "\\[.*\\]") +yes <- extract(branch, "yes=\\d*") +no <- extract(branch, "no=\\d*") +missing <- extract(branch, "missing=\\d+") +gain <- extract(branch, "gain=\\d*\\.*\\d*") +cover <- extract(branch, "cover=\\d*\\.*\\d*") +dt <- data.table(ID = id, Feature = feature, Yes = yes, No = no, Missing = missing, Gain = gain, Cover = cover) +#} From 33bb1685742b40e7b34e4408a6228f88a88a6c81 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Sun, 4 Jan 2015 17:23:53 +0100 Subject: [PATCH 04/28] basis to plot --- R-package/R/xgb.plot.tree.R | 43 ++++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 3f60d598a..1a6791055 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -5,20 +5,43 @@ require(magrittr) text <- readLines('xgb.model.dump') %>% str_trim(side = "both") position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text)+1) -extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2]) %>% unlist %>% as.numeric +extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist #for(i in 1:(length(position)-1)){ i=1 cat(paste("\n",i,"\n")) tree <- text[(position[i]+1):(position[i+1]-1)] paste(tree, collapse = "\n") %>% cat -branch <- str_match(tree, "leaf") %>% is.na %>% tree[.] -id <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% as.numeric -feature <- str_extract(branch, "\\[.*\\]") -yes <- extract(branch, "yes=\\d*") -no <- extract(branch, "no=\\d*") -missing <- extract(branch, "missing=\\d+") -gain <- extract(branch, "gain=\\d*\\.*\\d*") -cover <- extract(branch, "cover=\\d*\\.*\\d*") -dt <- data.table(ID = id, Feature = feature, Yes = yes, No = no, Missing = missing, Gain = gain, Cover = cover) +notLeaf <- str_match(tree, "leaf") %>% is.na +leaf <- notLeaf %>% not %>% tree[.] +branch <- notLeaf %>% tree[.] +idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% as.numeric +idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% as.numeric +featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") #%>% as.numeric +featureLeaf <- rep("Leaf", length(leaf)) +yesBranch <- extract(branch, "yes=\\d*") +yesLeaf <- rep(NA, length(leaf)) +noBranch <- extract(branch, "no=\\d*") +noLeaf <- rep(NA, length(leaf)) +missingBranch <- extract(branch, "missing=\\d+") +missingLeaf <- rep(NA, length(leaf)) +qualityBranch <- extract(branch, "gain=\\d*\\.*\\d*") +qualityLeaf <- extract(leaf, "leaf=\\-*\\d*\\.*\\d*") +coverBranch <- extract(branch, "cover=\\d*\\.*\\d*") +coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*") +dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=i] + +set(dt, j = "YesFeature", value = ifelse(is.na(dt[,Yes]),NA,dt[ID == dt[,Yes], ID])) +set(dt, j = "NoFeature", value = ifelse(is.na(dt[,No]),NA,dt[ID == dt[,No], ID])) +dtBranch <- dt[Feature!="Leaf"] + +yesPath <- paste(dtBranch[,ID], "-->", dtBranch[,Yes], sep = "") +noPath <- paste(dtBranch[,ID], "-->", dtBranch[,No], sep = "") +missingPath <- paste(dtBranch[,ID], "-->|Missing|", dtBranch[,Missing], sep = "") +yesPathStyle <- paste("style ", dtBranch[,Yes], " fill:#A2EB86, stroke:#04C4AB, stroke-width:2px", sep = "") +noPathStyle <- paste("style ", dtBranch[,No], " fill:#FFA070, stroke:#FF5E5E, stroke-width:2px", sep = "") + +path <- c(yesPath, noPath, yesPathStyle, noPathStyle) %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "",sep = ";") + +DiagrammeR(path, height = 400) #} From f6290ad792e4f398ab721e330d9d9f219166da43 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Sun, 4 Jan 2015 21:56:41 +0100 Subject: [PATCH 05/28] plot all trees --- R-package/R/xgb.plot.tree.R | 54 ++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 1a6791055..35579b9cd 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -7,41 +7,51 @@ position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length( extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist -#for(i in 1:(length(position)-1)){ -i=1 - cat(paste("\n",i,"\n")) +addTreeId <- function(x, i) paste(i,x,sep = "-") + +allTrees <- data.table() + +for(i in 1:(length(position)-1)){ + tree <- text[(position[i]+1):(position[i+1]-1)] - paste(tree, collapse = "\n") %>% cat + notLeaf <- str_match(tree, "leaf") %>% is.na leaf <- notLeaf %>% not %>% tree[.] branch <- notLeaf %>% tree[.] -idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% as.numeric -idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% as.numeric -featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") #%>% as.numeric +idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i) +idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i) +featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric featureLeaf <- rep("Leaf", length(leaf)) -yesBranch <- extract(branch, "yes=\\d*") -yesLeaf <- rep(NA, length(leaf)) -noBranch <- extract(branch, "no=\\d*") +splitBranch <- str_extract(branch, "<\\d*\\.*\\d*\\]") %>% str_replace("<", "") %>% str_replace("\\]", "") +splitLeaf <- rep(NA, length(leaf)) +yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(i) +yesLeaf <- rep(NA, length(leaf)) +noBranch <- extract(branch, "no=\\d*") %>% addTreeId(i) noLeaf <- rep(NA, length(leaf)) -missingBranch <- extract(branch, "missing=\\d+") +missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(i) missingLeaf <- rep(NA, length(leaf)) qualityBranch <- extract(branch, "gain=\\d*\\.*\\d*") qualityLeaf <- extract(leaf, "leaf=\\-*\\d*\\.*\\d*") coverBranch <- extract(branch, "cover=\\d*\\.*\\d*") coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*") -dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=i] +dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=i] -set(dt, j = "YesFeature", value = ifelse(is.na(dt[,Yes]),NA,dt[ID == dt[,Yes], ID])) -set(dt, j = "NoFeature", value = ifelse(is.na(dt[,No]),NA,dt[ID == dt[,No], ID])) -dtBranch <- dt[Feature!="Leaf"] +set(dt, i = which(dt[,Feature]!= "Leaf"), j = "YesFeature", value = dt[ID == dt[,Yes], Feature]) +set(dt, i = which(dt[,Feature]!= "Leaf"), j = "NoFeature", value = dt[ID == dt[,No], Feature]) -yesPath <- paste(dtBranch[,ID], "-->", dtBranch[,Yes], sep = "") -noPath <- paste(dtBranch[,ID], "-->", dtBranch[,No], sep = "") -missingPath <- paste(dtBranch[,ID], "-->|Missing|", dtBranch[,Missing], sep = "") -yesPathStyle <- paste("style ", dtBranch[,Yes], " fill:#A2EB86, stroke:#04C4AB, stroke-width:2px", sep = "") -noPathStyle <- paste("style ", dtBranch[,No], " fill:#FFA070, stroke:#FF5E5E, stroke-width:2px", sep = "") +dt[Feature!="Leaf" ,yesPath:= paste(ID,"[", Feature, "]-->|< ", Split, "|", Yes, "[", YesFeature, "]", sep = "")] -path <- c(yesPath, noPath, yesPathStyle, noPathStyle) %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "",sep = ";") +dt[Feature!="Leaf" ,noPath:= paste(ID,"[", Feature, "]-->|> ", Split, "|", No, "[", NoFeature, "]", sep = "")] -DiagrammeR(path, height = 400) +#missingPath <- paste(dtBranch[,ID], "-->|Missing|", dtBranch[,Missing], sep = "") + +dt[Feature!="Leaf", yesPathStyle := paste("style ", Yes, " fill:#A2EB86, stroke:#04C4AB, stroke-width:2px", sep = "")] + +dt[Feature!="Leaf", noPathStyle := paste("style ", No, " fill:#FFA070, stroke:#FF5E5E, stroke-width:2px", sep = "")] + +allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F) +} +path <- dt[Feature!="Leaf", c(yesPath, noPath, yesPathStyle, noPathStyle)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "",sep = ";") + +DiagrammeR(path, height =700) #} From ffbd78fce4de0e5088ec86a9490af7c137ef1b22 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Sun, 4 Jan 2015 22:40:31 +0100 Subject: [PATCH 06/28] use style CSS class instead of q style per item --- R-package/R/xgb.plot.tree.R | 79 +++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 38 deletions(-) diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 35579b9cd..788154afb 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -12,46 +12,49 @@ addTreeId <- function(x, i) paste(i,x,sep = "-") allTrees <- data.table() for(i in 1:(length(position)-1)){ - + tree <- text[(position[i]+1):(position[i+1]-1)] - -notLeaf <- str_match(tree, "leaf") %>% is.na -leaf <- notLeaf %>% not %>% tree[.] -branch <- notLeaf %>% tree[.] -idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i) -idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i) -featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric -featureLeaf <- rep("Leaf", length(leaf)) -splitBranch <- str_extract(branch, "<\\d*\\.*\\d*\\]") %>% str_replace("<", "") %>% str_replace("\\]", "") -splitLeaf <- rep(NA, length(leaf)) -yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(i) -yesLeaf <- rep(NA, length(leaf)) -noBranch <- extract(branch, "no=\\d*") %>% addTreeId(i) -noLeaf <- rep(NA, length(leaf)) -missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(i) -missingLeaf <- rep(NA, length(leaf)) -qualityBranch <- extract(branch, "gain=\\d*\\.*\\d*") -qualityLeaf <- extract(leaf, "leaf=\\-*\\d*\\.*\\d*") -coverBranch <- extract(branch, "cover=\\d*\\.*\\d*") -coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*") -dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=i] - -set(dt, i = which(dt[,Feature]!= "Leaf"), j = "YesFeature", value = dt[ID == dt[,Yes], Feature]) -set(dt, i = which(dt[,Feature]!= "Leaf"), j = "NoFeature", value = dt[ID == dt[,No], Feature]) - -dt[Feature!="Leaf" ,yesPath:= paste(ID,"[", Feature, "]-->|< ", Split, "|", Yes, "[", YesFeature, "]", sep = "")] - -dt[Feature!="Leaf" ,noPath:= paste(ID,"[", Feature, "]-->|> ", Split, "|", No, "[", NoFeature, "]", sep = "")] - -#missingPath <- paste(dtBranch[,ID], "-->|Missing|", dtBranch[,Missing], sep = "") - -dt[Feature!="Leaf", yesPathStyle := paste("style ", Yes, " fill:#A2EB86, stroke:#04C4AB, stroke-width:2px", sep = "")] - -dt[Feature!="Leaf", noPathStyle := paste("style ", No, " fill:#FFA070, stroke:#FF5E5E, stroke-width:2px", sep = "")] - -allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F) + + notLeaf <- str_match(tree, "leaf") %>% is.na + leaf <- notLeaf %>% not %>% tree[.] + branch <- notLeaf %>% tree[.] + idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i) + idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i) + featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric + featureLeaf <- rep("Leaf", length(leaf)) + splitBranch <- str_extract(branch, "<\\d*\\.*\\d*\\]") %>% str_replace("<", "") %>% str_replace("\\]", "") + splitLeaf <- rep(NA, length(leaf)) + yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(i) + yesLeaf <- rep(NA, length(leaf)) + noBranch <- extract(branch, "no=\\d*") %>% addTreeId(i) + noLeaf <- rep(NA, length(leaf)) + missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(i) + missingLeaf <- rep(NA, length(leaf)) + qualityBranch <- extract(branch, "gain=\\d*\\.*\\d*") + qualityLeaf <- extract(leaf, "leaf=\\-*\\d*\\.*\\d*") + coverBranch <- extract(branch, "cover=\\d*\\.*\\d*") + coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*") + dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=i] + + set(dt, i = which(dt[,Feature]!= "Leaf"), j = "YesFeature", value = dt[ID == dt[,Yes], Feature]) + set(dt, i = which(dt[,Feature]!= "Leaf"), j = "NoFeature", value = dt[ID == dt[,No], Feature]) + + dt[Feature!="Leaf" ,yesPath:= paste(ID,"[", Feature, "]-->|< ", Split, "|", Yes, "[", YesFeature, "]", sep = "")] + + dt[Feature!="Leaf" ,noPath:= paste(ID,"[", Feature, "]-->|> ", Split, "|", No, "[", NoFeature, "]", sep = "")] + + #missingPath <- paste(dtBranch[,ID], "-->|Missing|", dtBranch[,Missing], sep = "") + + allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F) } -path <- dt[Feature!="Leaf", c(yesPath, noPath, yesPathStyle, noPathStyle)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "",sep = ";") + +styles <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px;" + +yes <- allTrees[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode;", sep = "") + +no <- allTrees[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode;", sep = "") + +path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(";", styles, yes, no, collapse = ";", sep = "") DiagrammeR(path, height =700) #} From b9799c6ac489687761ae76655ea39183c0018c74 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Sun, 4 Jan 2015 22:42:17 +0100 Subject: [PATCH 07/28] refactor plot function --- R-package/R/xgb.plot.tree.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 788154afb..4031b6055 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -48,13 +48,13 @@ for(i in 1:(length(position)-1)){ allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F) } -styles <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px;" +styles <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px" -yes <- allTrees[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode;", sep = "") +yes <- allTrees[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "") -no <- allTrees[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode;", sep = "") +no <- allTrees[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "") -path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(";", styles, yes, no, collapse = ";", sep = "") +path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(styles, yes, no, sep = ";") DiagrammeR(path, height =700) #} From 3d068b4e1ade70768f223d5993835932cd1091bb Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Mon, 5 Jan 2015 19:26:09 +0100 Subject: [PATCH 08/28] new documentation new import --- R-package/NAMESPACE | 4 ++++ R-package/man/xgb.plot.tree.Rd | 40 ++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 R-package/man/xgb.plot.tree.Rd diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 6e74d9ac2..a20057e25 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -9,6 +9,7 @@ export(xgb.cv) export(xgb.dump) export(xgb.importance) export(xgb.load) +export(xgb.plot.tree) export(xgb.save) export(xgb.train) export(xgboost) @@ -16,13 +17,16 @@ exportMethods(predict) import(methods) importClassesFrom(Matrix,dgCMatrix) importClassesFrom(Matrix,dgeMatrix) +importFrom(DiagrammeR,DiagrammeR) importFrom(data.table,":=") importFrom(data.table,as.data.table) importFrom(data.table,data.table) importFrom(data.table,rbindlist) +importFrom(data.table,set) importFrom(magrittr,"%>%") importFrom(stringr,str_extract) importFrom(stringr,str_extract_all) importFrom(stringr,str_match) importFrom(stringr,str_replace) importFrom(stringr,str_split) +importFrom(stringr,str_trim) diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd new file mode 100644 index 000000000..08f8b9c94 --- /dev/null +++ b/R-package/man/xgb.plot.tree.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2 (4.1.0): do not edit by hand +% Please edit documentation in R/xgb.plot.tree.R +\name{xgb.plot.tree} +\alias{xgb.plot.tree} +\title{Plot a boosted tree model} +\usage{ +xgb.plot.tree(feature_names = NULL, filename_dump = NULL) +} +\arguments{ +\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.} + +\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).} +} +\value{ +A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. +} +\description{ +Read a xgboost model text dump. +Only works for boosted tree model (not linear model). +} +\details{ +This is the function to plot the trees growned. +It uses Mermaid JS library for that purpose. +Performance can be low for huge models. +} +\examples{ +data(agaricus.train, package='xgboost') + +#Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned). +#Each column of the sparse Matrix is a feature in one hot encoding format. +train <- agaricus.train + +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, + eta = 1, nround = 2,objective = "binary:logistic") +xgb.dump(bst, 'xgb.model.dump', with.stats = T) + +#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix. +xgb.plot.tree(agaricus.train$data@Dimnames[[2]], 'xgb.model.dump') +} + From f793df671bdbb35b9a17a2a9a7c1b3a7018b6537 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Mon, 5 Jan 2015 19:26:26 +0100 Subject: [PATCH 09/28] Change code to look like a function --- R-package/R/xgb.plot.tree.R | 153 ++++++++++++++++++++++++------------ 1 file changed, 101 insertions(+), 52 deletions(-) diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 4031b6055..7770bff1a 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -1,60 +1,109 @@ -require(DiagrammeR) -require(stringr) -require(data.table) -require(magrittr) -text <- readLines('xgb.model.dump') %>% str_trim(side = "both") -position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text)+1) - -extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist - -addTreeId <- function(x, i) paste(i,x,sep = "-") - -allTrees <- data.table() - -for(i in 1:(length(position)-1)){ +#' Plot a boosted tree model +#' +#' Read a xgboost model text dump. +#' Only works for boosted tree model (not linear model). +#' +#' @importFrom data.table data.table +#' @importFrom data.table set +#' @importFrom data.table rbindlist +#' @importFrom magrittr %>% +#' @importFrom data.table := +#' @importFrom stringr str_extract +#' @importFrom stringr str_split +#' @importFrom stringr str_extract +#' @importFrom stringr str_trim +#' @importFrom DiagrammeR DiagrammeR +#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. +#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}). +#' +#' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. +#' +#' @details +#' This is the function to plot the trees growned. +#' It uses Mermaid JS library for that purpose. +#' Performance can be low for huge models. +#' +#' +#' @examples +#' data(agaricus.train, package='xgboost') +#' +#' #Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned). +#' #Each column of the sparse Matrix is a feature in one hot encoding format. +#' train <- agaricus.train +#' +#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +#' eta = 1, nround = 2,objective = "binary:logistic") +#' xgb.dump(bst, 'xgb.model.dump', with.stats = T) +#' +#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix. +#' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], 'xgb.model.dump') +#' +#' @export +xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL){ - tree <- text[(position[i]+1):(position[i+1]-1)] + if (!class(feature_names) %in% c("character", "NULL")) { + stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.") + } + if (class(filename_dump) != "character" & file.exists(filename_dump)) { + stop("filename_dump: Has to be a path to the model dump file.") + } - notLeaf <- str_match(tree, "leaf") %>% is.na - leaf <- notLeaf %>% not %>% tree[.] - branch <- notLeaf %>% tree[.] - idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i) - idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i) - featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric - featureLeaf <- rep("Leaf", length(leaf)) - splitBranch <- str_extract(branch, "<\\d*\\.*\\d*\\]") %>% str_replace("<", "") %>% str_replace("\\]", "") - splitLeaf <- rep(NA, length(leaf)) - yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(i) - yesLeaf <- rep(NA, length(leaf)) - noBranch <- extract(branch, "no=\\d*") %>% addTreeId(i) - noLeaf <- rep(NA, length(leaf)) - missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(i) - missingLeaf <- rep(NA, length(leaf)) - qualityBranch <- extract(branch, "gain=\\d*\\.*\\d*") - qualityLeaf <- extract(leaf, "leaf=\\-*\\d*\\.*\\d*") - coverBranch <- extract(branch, "cover=\\d*\\.*\\d*") - coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*") - dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=i] + text <- readLines(filename_dump) %>% str_trim(side = "both") + position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text)+1) - set(dt, i = which(dt[,Feature]!= "Leaf"), j = "YesFeature", value = dt[ID == dt[,Yes], Feature]) - set(dt, i = which(dt[,Feature]!= "Leaf"), j = "NoFeature", value = dt[ID == dt[,No], Feature]) + extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist - dt[Feature!="Leaf" ,yesPath:= paste(ID,"[", Feature, "]-->|< ", Split, "|", Yes, "[", YesFeature, "]", sep = "")] + addTreeId <- function(x, i) paste(i,x,sep = "-") - dt[Feature!="Leaf" ,noPath:= paste(ID,"[", Feature, "]-->|> ", Split, "|", No, "[", NoFeature, "]", sep = "")] + allTrees <- data.table() - #missingPath <- paste(dtBranch[,ID], "-->|Missing|", dtBranch[,Missing], sep = "") + for(i in 1:(length(position)-1)){ + + tree <- text[(position[i]+1):(position[i+1]-1)] + + notLeaf <- str_match(tree, "leaf") %>% is.na + leaf <- notLeaf %>% not %>% tree[.] + branch <- notLeaf %>% tree[.] + idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i) + idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i) + featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric + if(!is.null(feature_names)){ + featureBranch <- feature_names[featureBranch + 1] + } + featureLeaf <- rep("Leaf", length(leaf)) + splitBranch <- str_extract(branch, "<\\d*\\.*\\d*\\]") %>% str_replace("<", "") %>% str_replace("\\]", "") + splitLeaf <- rep(NA, length(leaf)) + yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(i) + yesLeaf <- rep(NA, length(leaf)) + noBranch <- extract(branch, "no=\\d*") %>% addTreeId(i) + noLeaf <- rep(NA, length(leaf)) + missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(i) + missingLeaf <- rep(NA, length(leaf)) + qualityBranch <- extract(branch, "gain=\\d*\\.*\\d*") + qualityLeaf <- extract(leaf, "leaf=\\-*\\d*\\.*\\d*") + coverBranch <- extract(branch, "cover=\\d*\\.*\\d*") + coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*") + dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=i] + + set(dt, i = which(dt[,Feature]!= "Leaf"), j = "YesFeature", value = dt[ID == dt[,Yes], Feature]) + set(dt, i = which(dt[,Feature]!= "Leaf"), j = "NoFeature", value = dt[ID == dt[,No], Feature]) + + dt[Feature!="Leaf" ,yesPath:= paste(ID,"[", Feature, "]-->|< ", Split, "|", Yes, "[", YesFeature, "]", sep = "")] + + dt[Feature!="Leaf" ,noPath:= paste(ID,"[", Feature, "]-->|>= ", Split, "|", No, "[", NoFeature, "]", sep = "")] + + #missingPath <- paste(dtBranch[,ID], "-->|Missing|", dtBranch[,Missing], sep = "") + + allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F) + } - allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F) + styles <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px" + + yes <- allTrees[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "") + + no <- allTrees[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "") + + path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(styles, yes, no, sep = ";") + + DiagrammeR(path) } - -styles <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px" - -yes <- allTrees[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "") - -no <- allTrees[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "") - -path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(styles, yes, no, sep = ";") - -DiagrammeR(path, height =700) -#} From c64bfad5bbcd5d8ea32fb572ab530379c6b9e00d Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Mon, 5 Jan 2015 19:35:33 +0100 Subject: [PATCH 10/28] fix import issue --- R-package/NAMESPACE | 2 ++ R-package/R/xgb.plot.tree.R | 2 ++ 2 files changed, 4 insertions(+) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index a20057e25..f2baee12b 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -24,6 +24,8 @@ importFrom(data.table,data.table) importFrom(data.table,rbindlist) importFrom(data.table,set) importFrom(magrittr,"%>%") +importFrom(magrittr,add) +importFrom(magrittr,not) importFrom(stringr,str_extract) importFrom(stringr,str_extract_all) importFrom(stringr,str_match) diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 7770bff1a..7eb267298 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -7,6 +7,8 @@ #' @importFrom data.table set #' @importFrom data.table rbindlist #' @importFrom magrittr %>% +#' @importFrom magrittr not +#' @importFrom magrittr add #' @importFrom data.table := #' @importFrom stringr str_extract #' @importFrom stringr str_split From a6c588f90d1e115518cac9492e35068d62ebea6d Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Tue, 6 Jan 2015 13:59:14 +0100 Subject: [PATCH 11/28] fix arg check --- R-package/R/xgb.importance.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index 2071680d3..d5860e8a4 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -47,7 +47,7 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL){ if (!class(feature_names) %in% c("character", "NULL")) { stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.") } - if (class(filename_dump) != "character" & file.exists(filename_dump)) { + if (class(filename_dump) != "character" || !file.exists(filename_dump)) { stop("filename_dump: Has to be a path to the model dump file.") } text <- readLines(filename_dump) From 94d070da601379a41d32e10ddb37c9c8e1e42bdc Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Tue, 6 Jan 2015 13:59:29 +0100 Subject: [PATCH 12/28] add limit number of trees option --- R-package/R/xgb.plot.tree.R | 12 +++++++++--- R-package/man/xgb.plot.tree.Rd | 5 ++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 7eb267298..4863fd7ca 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -17,6 +17,7 @@ #' @importFrom DiagrammeR DiagrammeR #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}). +#' @param n_first_tree limit the plot to the n first trees. #' #' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. #' @@ -41,25 +42,30 @@ #' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], 'xgb.model.dump') #' #' @export -xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL){ +xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tree = NULL){ if (!class(feature_names) %in% c("character", "NULL")) { stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.") } - if (class(filename_dump) != "character" & file.exists(filename_dump)) { + if (class(filename_dump) != "character" || !file.exists(filename_dump)) { stop("filename_dump: Has to be a path to the model dump file.") } + if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) { + stop("n_first_tree: Has to be a numeric vector of size 1.") + } text <- readLines(filename_dump) %>% str_trim(side = "both") position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text)+1) extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist + n_round <- min(length(position) - 1, n_first_tree) + addTreeId <- function(x, i) paste(i,x,sep = "-") allTrees <- data.table() - for(i in 1:(length(position)-1)){ + for(i in 1:n_round){ tree <- text[(position[i]+1):(position[i+1]-1)] diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd index 08f8b9c94..eeec2f111 100644 --- a/R-package/man/xgb.plot.tree.Rd +++ b/R-package/man/xgb.plot.tree.Rd @@ -4,12 +4,15 @@ \alias{xgb.plot.tree} \title{Plot a boosted tree model} \usage{ -xgb.plot.tree(feature_names = NULL, filename_dump = NULL) +xgb.plot.tree(feature_names = NULL, filename_dump = NULL, + n_first_tree = NULL) } \arguments{ \item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.} \item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).} + +\item{n_first_tree}{limit the plot to the n first trees.} } \value{ A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. From 3dd202a19eb318a16bfa0b98fd0f70fb78a927b4 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Tue, 6 Jan 2015 18:18:55 +0100 Subject: [PATCH 13/28] Add stat indicators in plot --- R-package/NAMESPACE | 1 + R-package/R/xgb.plot.tree.R | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index f2baee12b..f68eafbc5 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -20,6 +20,7 @@ importClassesFrom(Matrix,dgeMatrix) importFrom(DiagrammeR,DiagrammeR) importFrom(data.table,":=") importFrom(data.table,as.data.table) +importFrom(data.table,copy) importFrom(data.table,data.table) importFrom(data.table,rbindlist) importFrom(data.table,set) diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 4863fd7ca..a263fe989 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -6,10 +6,11 @@ #' @importFrom data.table data.table #' @importFrom data.table set #' @importFrom data.table rbindlist +#' @importFrom data.table := +#' @importFrom data.table copy #' @importFrom magrittr %>% #' @importFrom magrittr not #' @importFrom magrittr add -#' @importFrom data.table := #' @importFrom stringr str_extract #' @importFrom stringr str_split #' @importFrom stringr str_extract @@ -93,10 +94,11 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tr coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*") dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=i] - set(dt, i = which(dt[,Feature]!= "Leaf"), j = "YesFeature", value = dt[ID == dt[,Yes], Feature]) - set(dt, i = which(dt[,Feature]!= "Leaf"), j = "NoFeature", value = dt[ID == dt[,No], Feature]) + set(dt, i = which(dt[,Feature]!= "Leaf"), j = "YesFeature", value = merge(copy(dt)[,ID:=Yes][, .(ID)], dt[,.(ID, Feature, Quality, Cover)], by = "ID")[,paste(Feature, "
Cover: ", Cover, sep = "")]) - dt[Feature!="Leaf" ,yesPath:= paste(ID,"[", Feature, "]-->|< ", Split, "|", Yes, "[", YesFeature, "]", sep = "")] + set(dt, i = which(dt[,Feature]!= "Leaf"), j = "NoFeature", value = merge(copy(dt)[,ID:=No][, .(ID)], dt[,.(ID, Feature, Quality, Cover)], by = "ID")[,paste(Feature, "
Cover: ", Cover, sep = "")]) + + dt[Feature!="Leaf" ,yesPath:= paste(ID,"[", Feature, "
Cover: ", Cover, "
Gain: ", Quality, "]-->|< ", Split, "|", Yes, "[", YesFeature, "]", sep = "")] dt[Feature!="Leaf" ,noPath:= paste(ID,"[", Feature, "]-->|>= ", Split, "|", No, "[", NoFeature, "]", sep = "")] @@ -112,6 +114,5 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tr no <- allTrees[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "") path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(styles, yes, no, sep = ";") - DiagrammeR(path) } From 9e20893d352cf5578dd45b6e22082bf0f99208a4 Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Tue, 6 Jan 2015 23:57:33 +0100 Subject: [PATCH 14/28] Change in aesthetic Improve documentation --- R-package/R/xgb.plot.tree.R | 23 +++++++++++++++-------- R-package/man/xgb.plot.tree.Rd | 15 +++++++++++---- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index a263fe989..f3aa1fe65 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -17,17 +17,24 @@ #' @importFrom stringr str_trim #' @importFrom DiagrammeR DiagrammeR #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. -#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}). -#' @param n_first_tree limit the plot to the n first trees. +#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). +#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models. #' #' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. #' #' @details -#' This is the function to plot the trees growned. +#' +#' The content of each node is organised that way: +#' +#' \itemize{ +#' \item{\code{feature} value}{ ;} +#' \item{\code{cover}}{: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. Deeper in the tree a node is, lower this metric will be ;} +#' \item{\code{gain}}{: metric the importance of the node in the model.} +#' } +#' +#' Each branch finished with a leaf. For each leaf, only the \code{cover} is indicated. #' It uses Mermaid JS library for that purpose. -#' Performance can be low for huge models. -#' -#' +#' #' @examples #' data(agaricus.train, package='xgboost') #' @@ -98,9 +105,9 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tr set(dt, i = which(dt[,Feature]!= "Leaf"), j = "NoFeature", value = merge(copy(dt)[,ID:=No][, .(ID)], dt[,.(ID, Feature, Quality, Cover)], by = "ID")[,paste(Feature, "
Cover: ", Cover, sep = "")]) - dt[Feature!="Leaf" ,yesPath:= paste(ID,"[", Feature, "
Cover: ", Cover, "
Gain: ", Quality, "]-->|< ", Split, "|", Yes, "[", YesFeature, "]", sep = "")] + dt[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "
Cover: ", Cover, "
Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", YesFeature, "]", sep = "")] - dt[Feature!="Leaf" ,noPath:= paste(ID,"[", Feature, "]-->|>= ", Split, "|", No, "[", NoFeature, "]", sep = "")] + dt[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", NoFeature, "]", sep = "")] #missingPath <- paste(dtBranch[,ID], "-->|Missing|", dtBranch[,Missing], sep = "") diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd index eeec2f111..099092cc7 100644 --- a/R-package/man/xgb.plot.tree.Rd +++ b/R-package/man/xgb.plot.tree.Rd @@ -10,9 +10,9 @@ xgb.plot.tree(feature_names = NULL, filename_dump = NULL, \arguments{ \item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.} -\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).} +\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).} -\item{n_first_tree}{limit the plot to the n first trees.} +\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.} } \value{ A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. @@ -22,9 +22,16 @@ Read a xgboost model text dump. Only works for boosted tree model (not linear model). } \details{ -This is the function to plot the trees growned. +The content of each node is organised that way: + +\itemize{ + \item{\code{feature} value}{ ;} + \item{\code{cover}}{: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. Deeper in the tree a node is, lower this metric will be ;} + \item{\code{gain}}{: metric the importance of the node in the model.} +} + +Each branch finished with a leaf. For each leaf, only the \code{cover} is indicated. It uses Mermaid JS library for that purpose. -Performance can be low for huge models. } \examples{ data(agaricus.train, package='xgboost') From cce26756bfa219c550035b0ca43fbb2d84ce5577 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Wed, 7 Jan 2015 17:05:34 +0100 Subject: [PATCH 15/28] add style option --- R-package/R/xgb.plot.tree.R | 17 ++++++++++++----- R-package/man/xgb.plot.tree.Rd | 10 ++++++---- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index f3aa1fe65..9582a7fe7 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -19,6 +19,7 @@ #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). #' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models. +#' @param style a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information. #' #' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. #' @@ -27,9 +28,9 @@ #' The content of each node is organised that way: #' #' \itemize{ -#' \item{\code{feature} value}{ ;} -#' \item{\code{cover}}{: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. Deeper in the tree a node is, lower this metric will be ;} -#' \item{\code{gain}}{: metric the importance of the node in the model.} +#' \item \code{feature} value ; +#' \item \code{cover}: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. Deeper in the tree a node is, lower this metric will be ; +#' \item \code{gain}: metric the importance of the node in the model. #' } #' #' Each branch finished with a leaf. For each leaf, only the \code{cover} is indicated. @@ -50,7 +51,7 @@ #' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], 'xgb.model.dump') #' #' @export -xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tree = NULL){ +xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tree = NULL, styles = NULL){ if (!class(feature_names) %in% c("character", "NULL")) { stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.") @@ -62,6 +63,10 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tr stop("n_first_tree: Has to be a numeric vector of size 1.") } + if (!class(styles) %in% c("character", "NULL") | length(styles) > 1) { + stop("style: Has to be a character vector of size 1.") + } + text <- readLines(filename_dump) %>% str_trim(side = "both") position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text)+1) @@ -114,7 +119,9 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tr allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F) } - styles <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px" + if(is.null(styles)){ + styles <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px" + } yes <- allTrees[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "") diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd index 099092cc7..21c1ab380 100644 --- a/R-package/man/xgb.plot.tree.Rd +++ b/R-package/man/xgb.plot.tree.Rd @@ -5,7 +5,7 @@ \title{Plot a boosted tree model} \usage{ xgb.plot.tree(feature_names = NULL, filename_dump = NULL, - n_first_tree = NULL) + n_first_tree = NULL, styles = NULL) } \arguments{ \item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.} @@ -13,6 +13,8 @@ xgb.plot.tree(feature_names = NULL, filename_dump = NULL, \item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).} \item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.} + +\item{style}{a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.} } \value{ A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. @@ -25,9 +27,9 @@ Only works for boosted tree model (not linear model). The content of each node is organised that way: \itemize{ - \item{\code{feature} value}{ ;} - \item{\code{cover}}{: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. Deeper in the tree a node is, lower this metric will be ;} - \item{\code{gain}}{: metric the importance of the node in the model.} + \item \code{feature} value ; + \item \code{cover}: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. Deeper in the tree a node is, lower this metric will be ; + \item \code{gain}: metric the importance of the node in the model. } Each branch finished with a leaf. For each leaf, only the \code{cover} is indicated. From e380e4facf240a460cdbca729ef5c0b7a3cbddb7 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Wed, 7 Jan 2015 17:09:56 +0100 Subject: [PATCH 16/28] refactoring for perf --- R-package/R/xgb.plot.tree.R | 25 ++++++++++++------------- R-package/man/xgb.plot.tree.Rd | 5 +++-- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 9582a7fe7..1efcbf813 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -1,7 +1,7 @@ #' Plot a boosted tree model #' #' Read a xgboost model text dump. -#' Only works for boosted tree model (not linear model). +#' Plotting only works for boosted tree model (not linear model). #' #' @importFrom data.table data.table #' @importFrom data.table set @@ -33,7 +33,7 @@ #' \item \code{gain}: metric the importance of the node in the model. #' } #' -#' Each branch finished with a leaf. For each leaf, only the \code{cover} is indicated. +#' Each branch finishes with a leaf. For each leaf, only the \code{cover} is indicated. #' It uses Mermaid JS library for that purpose. #' #' @examples @@ -105,20 +105,19 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tr coverBranch <- extract(branch, "cover=\\d*\\.*\\d*") coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*") dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=i] - - set(dt, i = which(dt[,Feature]!= "Leaf"), j = "YesFeature", value = merge(copy(dt)[,ID:=Yes][, .(ID)], dt[,.(ID, Feature, Quality, Cover)], by = "ID")[,paste(Feature, "
Cover: ", Cover, sep = "")]) - - set(dt, i = which(dt[,Feature]!= "Leaf"), j = "NoFeature", value = merge(copy(dt)[,ID:=No][, .(ID)], dt[,.(ID, Feature, Quality, Cover)], by = "ID")[,paste(Feature, "
Cover: ", Cover, sep = "")]) - - dt[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "
Cover: ", Cover, "
Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", YesFeature, "]", sep = "")] - - dt[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", NoFeature, "]", sep = "")] - - #missingPath <- paste(dtBranch[,ID], "-->|Missing|", dtBranch[,Missing], sep = "") - + allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F) } + set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), j = "YesFeature", value = merge(copy(allTrees)[,ID:=Yes][, .(ID)], allTrees[,.(ID, Feature, Quality, Cover)], by = "ID")[,paste(Feature, "
Cover: ", Cover, sep = "")]) + + set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), j = "NoFeature", value = merge(copy(allTrees)[,ID:=No][, .(ID)], allTrees[,.(ID, Feature, Quality, Cover)], by = "ID")[,paste(Feature, "
Cover: ", Cover, sep = "")]) + + allTrees[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "
Cover: ", Cover, "
Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", YesFeature, "]", sep = "")] + + allTrees[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", NoFeature, "]", sep = "")] + + if(is.null(styles)){ styles <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px" } diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd index 21c1ab380..17ef49ced 100644 --- a/R-package/man/xgb.plot.tree.Rd +++ b/R-package/man/xgb.plot.tree.Rd @@ -21,9 +21,10 @@ A \code{data.table} of the features used in the model with their average gain (a } \description{ Read a xgboost model text dump. -Only works for boosted tree model (not linear model). } \details{ +Plotting only works for boosted tree model (not linear model). + The content of each node is organised that way: \itemize{ @@ -32,7 +33,7 @@ The content of each node is organised that way: \item \code{gain}: metric the importance of the node in the model. } -Each branch finished with a leaf. For each leaf, only the \code{cover} is indicated. +Each branch finishes with a leaf. For each leaf, only the \code{cover} is indicated. It uses Mermaid JS library for that purpose. } \examples{ From d532f04394ccae1161aa2a586e56b3e7df10ac3c Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Wed, 7 Jan 2015 17:47:50 +0100 Subject: [PATCH 17/28] add new function to read model and use it in the plot function --- R-package/NAMESPACE | 1 + R-package/R/xgb.model.dt.tree.R | 109 +++++++++++++++++++++++++++++ R-package/R/xgb.plot.tree.R | 60 ++-------------- R-package/man/xgb.model.dt.tree.Rd | 54 ++++++++++++++ R-package/man/xgb.plot.tree.Rd | 9 ++- 5 files changed, 173 insertions(+), 60 deletions(-) create mode 100644 R-package/R/xgb.model.dt.tree.R create mode 100644 R-package/man/xgb.model.dt.tree.Rd diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index f68eafbc5..23de90d28 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -9,6 +9,7 @@ export(xgb.cv) export(xgb.dump) export(xgb.importance) export(xgb.load) +export(xgb.model.dt.tree) export(xgb.plot.tree) export(xgb.save) export(xgb.train) diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R new file mode 100644 index 000000000..2a65c30f7 --- /dev/null +++ b/R-package/R/xgb.model.dt.tree.R @@ -0,0 +1,109 @@ +#' Convert tree model dump to data.table +#' +#' Read a tree model text dump and return a data.table. +#' +#' @importFrom data.table data.table +#' @importFrom data.table set +#' @importFrom data.table rbindlist +#' @importFrom data.table := +#' @importFrom magrittr %>% +#' @importFrom magrittr not +#' @importFrom magrittr add +#' @importFrom stringr str_extract +#' @importFrom stringr str_split +#' @importFrom stringr str_extract +#' @importFrom stringr str_trim +#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. +#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). +#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models. +#' +#' @return A \code{data.table} of the features used in the model with their gain, cover and few other thing. +#' +#' @details +#' General function to convert a text dump of tree model to a Matrix. The purpose is to help user to explore the model and get a better understanding of it. +#' +#' The content of the \code{data.table} is organised that way: +#' +#' \itemize{ +#' \item \code{ID}: unique identifier of a node ; +#' \item \code{Feature}: feature used in the tree to operate a split. When Leaf is indicated, it is the end of a branch ; +#' \item \code{Split}: value of the chosen feature where is operated the split ; +#' \item \code{Yes}: ID of the feature for the next node in the branch when the split condition is met ; +#' \item \code{No}: ID of the feature for the next node in the branch when the split condition is not met ; +#' \item \code{Missing}: ID of the feature for the next node in the branch for observation where the feature used for the split are not provided ; +#' \item \code{Quality}: it's the gain related to the split in this specific node ; +#' \item \code{Cover}: metric to measure the number of observation affected by the split ; +#' \item \code{Tree}: ID of the tree. It is included in the main ID ; +#' } +#' +#' @examples +#' data(agaricus.train, package='xgboost') +#' +#' #Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned). +#' #Each column of the sparse Matrix is a feature in one hot encoding format. +#' train <- agaricus.train +#' +#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +#' eta = 1, nround = 2,objective = "binary:logistic") +#' xgb.dump(bst, 'xgb.model.dump', with.stats = T) +#' +#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix. +#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], 'xgb.model.dump') +#' +#' @export +xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tree = NULL){ + + if (!class(feature_names) %in% c("character", "NULL")) { + stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.") + } + if (class(filename_dump) != "character" || !file.exists(filename_dump)) { + stop("filename_dump: Has to be a path to the model dump file.") + } + if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) { + stop("n_first_tree: Has to be a numeric vector of size 1.") + } + + text <- readLines(filename_dump) %>% str_trim(side = "both") + position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text)+1) + + extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist + + n_round <- min(length(position) - 1, n_first_tree) + + addTreeId <- function(x, i) paste(i,x,sep = "-") + + allTrees <- data.table() + + for(i in 1:n_round){ + + tree <- text[(position[i]+1):(position[i+1]-1)] + + notLeaf <- str_match(tree, "leaf") %>% is.na + leaf <- notLeaf %>% not %>% tree[.] + branch <- notLeaf %>% tree[.] + idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i) + idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i) + featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric + if(!is.null(feature_names)){ + featureBranch <- feature_names[featureBranch + 1] + } + featureLeaf <- rep("Leaf", length(leaf)) + splitBranch <- str_extract(branch, "<\\d*\\.*\\d*\\]") %>% str_replace("<", "") %>% str_replace("\\]", "") + splitLeaf <- rep(NA, length(leaf)) + yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(i) + yesLeaf <- rep(NA, length(leaf)) + noBranch <- extract(branch, "no=\\d*") %>% addTreeId(i) + noLeaf <- rep(NA, length(leaf)) + missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(i) + missingLeaf <- rep(NA, length(leaf)) + qualityBranch <- extract(branch, "gain=\\d*\\.*\\d*") + qualityLeaf <- extract(leaf, "leaf=\\-*\\d*\\.*\\d*") + coverBranch <- extract(branch, "cover=\\d*\\.*\\d*") + coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*") + dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=i] + + allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F) + } + + allTrees +} diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 1efcbf813..b980671b0 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -1,6 +1,6 @@ #' Plot a boosted tree model #' -#' Read a xgboost model text dump. +#' Read a tree model text dump. #' Plotting only works for boosted tree model (not linear model). #' #' @importFrom data.table data.table @@ -21,7 +21,7 @@ #' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models. #' @param style a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information. #' -#' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. +#' @return A \code{DiagrammeR} of the model. #' #' @details #' @@ -34,7 +34,7 @@ #' } #' #' Each branch finishes with a leaf. For each leaf, only the \code{cover} is indicated. -#' It uses Mermaid JS library for that purpose. +#' It uses \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpose. #' #' @examples #' data(agaricus.train, package='xgboost') @@ -51,63 +51,13 @@ #' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], 'xgb.model.dump') #' #' @export -xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tree = NULL, styles = NULL){ - - if (!class(feature_names) %in% c("character", "NULL")) { - stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.") - } - if (class(filename_dump) != "character" || !file.exists(filename_dump)) { - stop("filename_dump: Has to be a path to the model dump file.") - } - if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) { - stop("n_first_tree: Has to be a numeric vector of size 1.") - } +xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tree = NULL, styles = NULL){ if (!class(styles) %in% c("character", "NULL") | length(styles) > 1) { stop("style: Has to be a character vector of size 1.") } - - text <- readLines(filename_dump) %>% str_trim(side = "both") - position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text)+1) - - extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist - - n_round <- min(length(position) - 1, n_first_tree) - - addTreeId <- function(x, i) paste(i,x,sep = "-") - - allTrees <- data.table() - - for(i in 1:n_round){ - tree <- text[(position[i]+1):(position[i+1]-1)] - - notLeaf <- str_match(tree, "leaf") %>% is.na - leaf <- notLeaf %>% not %>% tree[.] - branch <- notLeaf %>% tree[.] - idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i) - idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i) - featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric - if(!is.null(feature_names)){ - featureBranch <- feature_names[featureBranch + 1] - } - featureLeaf <- rep("Leaf", length(leaf)) - splitBranch <- str_extract(branch, "<\\d*\\.*\\d*\\]") %>% str_replace("<", "") %>% str_replace("\\]", "") - splitLeaf <- rep(NA, length(leaf)) - yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(i) - yesLeaf <- rep(NA, length(leaf)) - noBranch <- extract(branch, "no=\\d*") %>% addTreeId(i) - noLeaf <- rep(NA, length(leaf)) - missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(i) - missingLeaf <- rep(NA, length(leaf)) - qualityBranch <- extract(branch, "gain=\\d*\\.*\\d*") - qualityLeaf <- extract(leaf, "leaf=\\-*\\d*\\.*\\d*") - coverBranch <- extract(branch, "cover=\\d*\\.*\\d*") - coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*") - dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=i] - - allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F) - } + allTrees <- xgb.model.dt.tree(feature_names, filename_dump, n_first_tree) set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), j = "YesFeature", value = merge(copy(allTrees)[,ID:=Yes][, .(ID)], allTrees[,.(ID, Feature, Quality, Cover)], by = "ID")[,paste(Feature, "
Cover: ", Cover, sep = "")]) diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd new file mode 100644 index 000000000..8c46ffe4f --- /dev/null +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2 (4.1.0): do not edit by hand +% Please edit documentation in R/xgb.model.dt.tree.R +\name{xgb.model.dt.tree} +\alias{xgb.model.dt.tree} +\title{Convert tree model dump to data.table} +\usage{ +xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL, + n_first_tree = NULL) +} +\arguments{ +\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.} + +\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).} + +\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.} +} +\value{ +A \code{data.table} of the features used in the model with their gain, cover and few other thing. +} +\description{ +Read a tree model text dump and return a data.table. +} +\details{ +General function to convert a text dump of tree model to a Matrix. The purpose is to help user to explore the model and get a better understanding of it. + +The content of the \code{data.table} is organised that way: + +\itemize{ +\item \code{ID}: unique identifier of a node ; + \item \code{Feature}: feature used in the tree to operate a split. When Leaf is indicated, it is the end of a branch ; + \item \code{Split}: value of the chosen feature where is operated the split ; + \item \code{Yes}: ID of the feature for the next node in the branch when the split condition is met ; + \item \code{No}: ID of the feature for the next node in the branch when the split condition is not met ; + \item \code{Missing}: ID of the feature for the next node in the branch for observation where the feature used for the split are not provided ; + \item \code{Quality}: it's the gain related to the split in this specific node ; + \item \code{Cover}: metric to measure the number of observation affected by the split ; + \item \code{Tree}: ID of the tree. It is included in the main ID ; +} +} +\examples{ +data(agaricus.train, package='xgboost') + +#Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned). +#Each column of the sparse Matrix is a feature in one hot encoding format. +train <- agaricus.train + +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, + eta = 1, nround = 2,objective = "binary:logistic") +xgb.dump(bst, 'xgb.model.dump', with.stats = T) + +#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix. +xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], 'xgb.model.dump') +} + diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd index 17ef49ced..ba65cdd7c 100644 --- a/R-package/man/xgb.plot.tree.Rd +++ b/R-package/man/xgb.plot.tree.Rd @@ -17,14 +17,13 @@ xgb.plot.tree(feature_names = NULL, filename_dump = NULL, \item{style}{a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.} } \value{ -A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. +A \code{DiagrammeR} of the model. } \description{ -Read a xgboost model text dump. +Read a tree model text dump. +Plotting only works for boosted tree model (not linear model). } \details{ -Plotting only works for boosted tree model (not linear model). - The content of each node is organised that way: \itemize{ @@ -34,7 +33,7 @@ The content of each node is organised that way: } Each branch finishes with a leaf. For each leaf, only the \code{cover} is indicated. -It uses Mermaid JS library for that purpose. +It uses \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpose. } \examples{ data(agaricus.train, package='xgboost') From 3d0bbae2c2d84ff3835b1540e291fc80b857200d Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Wed, 7 Jan 2015 18:18:52 +0100 Subject: [PATCH 18/28] refactoring of importance function --- R-package/R/xgb.importance.R | 23 ++++++----------------- R-package/R/xgb.model.dt.tree.R | 18 ++++++++++++++---- R-package/man/xgb.importance.Rd | 3 ++- R-package/man/xgb.model.dt.tree.Rd | 2 +- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index d5860e8a4..eaaad9ab8 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -6,7 +6,6 @@ #' @importFrom data.table data.table #' @importFrom magrittr %>% #' @importFrom data.table := -#' @importFrom stringr str_extract #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}). #' @@ -21,7 +20,8 @@ #' There are 3 columns : #' \itemize{ #' \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump. -#' \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means most important feature regarding the \code{label} used for the training. +#' \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training ; +#' \item \code{Cover} metric of the number of observation related to this feature (only available for tree models) ; #' \item \code{Weight} percentage representing the relative number of times a feature have been taken into trees. \code{Gain} should be prefered to search the most important feature. For boosted linear model, this column has no meaning. #' } #' @@ -59,21 +59,10 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL){ result } -treeDump <- function(feature_names, text){ - featureVec <- c() - gainVec <- c() - for(line in text){ - p <- str_extract(line, "\\[f.*<") - if (!is.na(p)) { - featureVec <- substr(p, 3, nchar(p)-1) %>% c(featureVec) - gainVec <- str_extract(line, "gain.*,") %>% substr(x = ., 6, nchar(.)-1) %>% as.numeric %>% c(gainVec) - } - } - if(!is.null(feature_names)) { - featureVec %<>% as.numeric %>% {c =.+1; feature_names[c]} #+1 because in R indexing start with 1 instead of 0. - } - #1. Reduce, 2. %, 3. reorder - bigger top, 4. remove temp col - data.table(Feature = featureVec, Weight = gainVec)[,list(sum(Weight), .N), by = Feature][, Gain:= V1/sum(V1)][,Weight:= N/sum(N)][order(-rank(Gain))][,-c(2,3), with = F] +treeDump <- function(feature_names, text){ + result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[Feature!="Leaf",][,.(sum(Quality), sum(Cover), .N),by = Feature][,V1:=V1/sum(V1)][,V2:=V2/sum(V2)][,N:=N/sum(N)][order(-rank(V1))] + setnames(result, c("Feature", "Gain", "Cover", "Frequence")) + result } linearDump <- function(feature_names, text){ diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index 2a65c30f7..1fc104cce 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -51,19 +51,29 @@ #' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], 'xgb.model.dump') #' #' @export -xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tree = NULL){ +xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, text = NULL, n_first_tree = NULL){ if (!class(feature_names) %in% c("character", "NULL")) { stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.") } - if (class(filename_dump) != "character" || !file.exists(filename_dump)) { - stop("filename_dump: Has to be a path to the model dump file.") + if (!class(filename_dump) %in% c("character", "NULL")) { + stop("filename_dump: Has to be a character vector representing the path to the model dump file.") + } else if (class(filename_dump) == "character" && !file.exists(filename_dump)) { + stop("filename_dump: path to the model doesn't exist.") + } else if(is.null(filename_dump) & is.null(text)){ + stop("filename_dump: no path and no string version of the model dump have been provided.") + } + if (!class(text) %in% c("character", "NULL")) { + stop("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.") } if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) { stop("n_first_tree: Has to be a numeric vector of size 1.") } - text <- readLines(filename_dump) %>% str_trim(side = "both") + if(is.null(text)){ + text <- readLines(filename_dump) %>% str_trim(side = "both") + } + position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text)+1) extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd index a7a71cefc..8aa58cddd 100644 --- a/R-package/man/xgb.importance.Rd +++ b/R-package/man/xgb.importance.Rd @@ -27,7 +27,8 @@ Results are returned for both linear and tree models. There are 3 columns : \itemize{ \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump. - \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means most important feature regarding the \code{label} used for the training. + \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means most important feature regarding the \code{label} used for the training ; + \item \code{Cover} metric of the number of observation related to this feature (only available for tree models) ; \item \code{Weight} percentage representing the relative number of times a feature have been taken into trees. \code{Gain} should be prefered to search the most important feature. For boosted linear model, this column has no meaning. } } diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index 8c46ffe4f..2bc48c4d0 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -4,7 +4,7 @@ \alias{xgb.model.dt.tree} \title{Convert tree model dump to data.table} \usage{ -xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL, +xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL, text = NULL, n_first_tree = NULL) } \arguments{ From 6fd8bbe71a5d686d4a6a6b7a5cc7fb4e38117fbe Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Thu, 8 Jan 2015 23:47:00 +0100 Subject: [PATCH 19/28] C part export a model dump string --- R-package/R/xgb.dump.R | 4 +++- R-package/src/xgboost_R.cpp | 17 +++++++++-------- R-package/src/xgboost_R.h | 5 ++--- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index b5e66604c..9a6e0ddd0 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -32,6 +32,8 @@ xgb.dump <- function(model, fname, fmap = "", with.stats=FALSE) { if (typeof(fname) != "character") { stop("xgb.dump: second argument must be type character") } - .Call("XGBoosterDumpModel_R", model, fname, fmap, as.integer(with.stats), PACKAGE = "xgboost") + result <- .Call("XGBoosterDumpModel_R", model, fmap, as.integer(with.stats), PACKAGE = "xgboost") + writeLines(result, fname) + #unlist(str_split(a, "\n"))=="" return(TRUE) } diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp index 7cab221fb..c8fe3e23a 100644 --- a/R-package/src/xgboost_R.cpp +++ b/R-package/src/xgboost_R.cpp @@ -272,20 +272,21 @@ extern "C" { XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname))); _WrapperEnd(); } - void XGBoosterDumpModel_R(SEXP handle, SEXP fname, - SEXP fmap, SEXP with_stats) { + SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats) { _WrapperBegin(); bst_ulong olen; const char **res = XGBoosterDumpModel(R_ExternalPtrAddr(handle), CHAR(asChar(fmap)), asInteger(with_stats), &olen); - FILE *fo = utils::FopenCheck(CHAR(asChar(fname)), "w"); - for (size_t i = 0; i < olen; ++i) { - fprintf(fo, "booster[%u]:\n", static_cast(i)); - fprintf(fo, "%s", res[i]); + SEXP out = PROTECT(allocVector(STRSXP, olen)); + char buffer [2000]; + for (size_t i = 0; i < olen; ++i) { + sprintf (buffer, "booster[%u]:\n%s", static_cast(i), res[i]); + SET_STRING_ELT(out, i, mkChar(buffer)); } - fclose(fo); _WrapperEnd(); + UNPROTECT(1); + return out; } -} +} \ No newline at end of file diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h index 04c16ab3e..766152699 100644 --- a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -128,12 +128,11 @@ extern "C" { */ void XGBoosterSaveModel_R(SEXP handle, SEXP fname); /*! - * \brief dump model into text file + * \brief dump model into a string * \param handle handle - * \param fname file name of model that can be dumped into * \param fmap name to fmap can be empty string * \param with_stats whether dump statistics of splits */ - void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap, SEXP with_stats); + SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats); } #endif // XGBOOST_WRAPPER_R_H_ From 3e1eea0eea6ad4be75408624bf71469a3492db84 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Fri, 9 Jan 2015 00:14:01 +0100 Subject: [PATCH 20/28] refactor dump function to adapt to the new possibilities of exporting a String --- R-package/NAMESPACE | 1 + R-package/R/xgb.dump.R | 20 ++++++++++++++------ R-package/man/xgb.dump.Rd | 7 +++++-- R-package/man/xgb.importance.Rd | 2 +- 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 23de90d28..a36e066ef 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -32,5 +32,6 @@ importFrom(stringr,str_extract) importFrom(stringr,str_extract_all) importFrom(stringr,str_match) importFrom(stringr,str_replace) +importFrom(stringr,str_replace_all) importFrom(stringr,str_split) importFrom(stringr,str_trim) diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index 9a6e0ddd0..0d7e79f31 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -2,8 +2,11 @@ #' #' Save a xgboost model to text file. Could be parsed later. #' +#' @importFrom magrittr %>% +#' @importFrom stringr str_split +#' @importFrom stringr str_replace_all #' @param model the model object. -#' @param fname the name of the binary file. +#' @param fname the name of the text file where to save the model. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector. #' @param fmap feature map file representing the type of feature. #' Detailed description could be found at #' \url{https://github.com/tqchen/xgboost/wiki/Binary-Classification#dump-model}. @@ -15,6 +18,9 @@ #' gain is the approximate loss function gain we get in each split; #' cover is the sum of second order gradient in each node. #' +#' @return +#' if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}. +#' #' @examples #' data(agaricus.train, package='xgboost') #' data(agaricus.test, package='xgboost') @@ -25,15 +31,17 @@ #' xgb.dump(bst, 'xgb.model.dump') #' @export #' -xgb.dump <- function(model, fname, fmap = "", with.stats=FALSE) { +xgb.dump <- function(model, fname = NULL, fmap = "", with.stats=FALSE) { if (class(model) != "xgb.Booster") { stop("xgb.dump: first argument must be type xgb.Booster") } - if (typeof(fname) != "character") { - stop("xgb.dump: second argument must be type character") + if (!class(fname) %in% c("character", "NULL")) { + stop("xgb.dump: second argument must be type character if provided") } result <- .Call("XGBoosterDumpModel_R", model, fmap, as.integer(with.stats), PACKAGE = "xgboost") + + if(is.null(fname)) return(str_split(result, "\n") %>% unlist %>% str_replace_all("\t"," ") %>% Filter(function(x) x != "", .)) + writeLines(result, fname) - #unlist(str_split(a, "\n"))=="" - return(TRUE) + TRUE } diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd index bcecc6abd..e779f32b9 100644 --- a/R-package/man/xgb.dump.Rd +++ b/R-package/man/xgb.dump.Rd @@ -4,12 +4,12 @@ \alias{xgb.dump} \title{Save xgboost model to text file} \usage{ -xgb.dump(model, fname, fmap = "", with.stats = FALSE) +xgb.dump(model, fname = NULL, fmap = "", with.stats = FALSE) } \arguments{ \item{model}{the model object.} -\item{fname}{the name of the binary file.} +\item{fname}{the name of the text file where to save the model. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.} \item{fmap}{feature map file representing the type of feature. Detailed description could be found at @@ -23,6 +23,9 @@ for example Format.} gain is the approximate loss function gain we get in each split; cover is the sum of second order gradient in each node.} } +\value{ +if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}. +} \description{ Save a xgboost model to text file. Could be parsed later. } diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd index 8aa58cddd..78be4b91b 100644 --- a/R-package/man/xgb.importance.Rd +++ b/R-package/man/xgb.importance.Rd @@ -27,7 +27,7 @@ Results are returned for both linear and tree models. There are 3 columns : \itemize{ \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump. - \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means most important feature regarding the \code{label} used for the training ; + \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training ; \item \code{Cover} metric of the number of observation related to this feature (only available for tree models) ; \item \code{Weight} percentage representing the relative number of times a feature have been taken into trees. \code{Gain} should be prefered to search the most important feature. For boosted linear model, this column has no meaning. } From 10f755e055133f35c0b46e913d44d31577eafc63 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Fri, 9 Jan 2015 11:06:56 +0100 Subject: [PATCH 21/28] only replace tabulation which begins a line (avoid wrong replacement in feature name) --- R-package/R/xgb.dump.R | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index 0d7e79f31..7049257f8 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -40,8 +40,10 @@ xgb.dump <- function(model, fname = NULL, fmap = "", with.stats=FALSE) { } result <- .Call("XGBoosterDumpModel_R", model, fmap, as.integer(with.stats), PACKAGE = "xgboost") - if(is.null(fname)) return(str_split(result, "\n") %>% unlist %>% str_replace_all("\t"," ") %>% Filter(function(x) x != "", .)) - - writeLines(result, fname) - TRUE -} + if(is.null(fname)) { + return(str_split(result, "\n") %>% unlist %>% str_replace("^\t+","") %>% Filter(function(x) x != "", .)) + } else { + writeLines(result, fname) + return(TRUE) + } +} \ No newline at end of file From 9d6eecf34e88e331c92fd89ef61f042d6cdd9a50 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Fri, 9 Jan 2015 11:07:53 +0100 Subject: [PATCH 22/28] small change in import lib --- R-package/NAMESPACE | 1 - R-package/R/xgb.dump.R | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index a36e066ef..23de90d28 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -32,6 +32,5 @@ importFrom(stringr,str_extract) importFrom(stringr,str_extract_all) importFrom(stringr,str_match) importFrom(stringr,str_replace) -importFrom(stringr,str_replace_all) importFrom(stringr,str_split) importFrom(stringr,str_trim) diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index 7049257f8..ceb68c1a3 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -4,7 +4,7 @@ #' #' @importFrom magrittr %>% #' @importFrom stringr str_split -#' @importFrom stringr str_replace_all +#' @importFrom stringr str_replace #' @param model the model object. #' @param fname the name of the text file where to save the model. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector. #' @param fmap feature map file representing the type of feature. From 31d0e8f65d42c6e65ef0a0eb89b4353a582ef69d Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Fri, 9 Jan 2015 11:14:08 +0100 Subject: [PATCH 23/28] better doc of dump function --- R-package/R/xgb.dump.R | 10 +++++++--- R-package/man/xgb.dump.Rd | 8 ++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index ceb68c1a3..f73850883 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -6,7 +6,7 @@ #' @importFrom stringr str_split #' @importFrom stringr str_replace #' @param model the model object. -#' @param fname the name of the text file where to save the model. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector. +#' @param fname the name of the text file where to save the model text dump. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector. #' @param fmap feature map file representing the type of feature. #' Detailed description could be found at #' \url{https://github.com/tqchen/xgboost/wiki/Binary-Classification#dump-model}. @@ -28,15 +28,19 @@ #' test <- agaricus.test #' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, #' eta = 1, nround = 2,objective = "binary:logistic") +#' # save the model in file 'xgb.model.dump' #' xgb.dump(bst, 'xgb.model.dump') +#' +#' # print the model without saving it to a file +#' print(xgb.dump(bst)) #' @export #' -xgb.dump <- function(model, fname = NULL, fmap = "", with.stats=FALSE) { +xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) { if (class(model) != "xgb.Booster") { stop("xgb.dump: first argument must be type xgb.Booster") } if (!class(fname) %in% c("character", "NULL")) { - stop("xgb.dump: second argument must be type character if provided") + stop("xgb.dump: second argument must be type character when provided") } result <- .Call("XGBoosterDumpModel_R", model, fmap, as.integer(with.stats), PACKAGE = "xgboost") diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd index e779f32b9..6dad9ed7b 100644 --- a/R-package/man/xgb.dump.Rd +++ b/R-package/man/xgb.dump.Rd @@ -4,12 +4,12 @@ \alias{xgb.dump} \title{Save xgboost model to text file} \usage{ -xgb.dump(model, fname = NULL, fmap = "", with.stats = FALSE) +xgb.dump(model = NULL, fname = NULL, fmap = "", with.stats = FALSE) } \arguments{ \item{model}{the model object.} -\item{fname}{the name of the text file where to save the model. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.} +\item{fname}{the name of the text file where to save the model text dump. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.} \item{fmap}{feature map file representing the type of feature. Detailed description could be found at @@ -36,6 +36,10 @@ train <- agaricus.train test <- agaricus.test bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2,objective = "binary:logistic") +# save the model in file 'xgb.model.dump' xgb.dump(bst, 'xgb.model.dump') + +# print the model without saving it to a file +print(xgb.dump(bst)) } From d96bd15b7deeb61d4719f013ed1b34feaa114691 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Fri, 9 Jan 2015 11:52:40 +0100 Subject: [PATCH 24/28] small fix in the C dump code --- R-package/src/xgboost_R.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp index c8fe3e23a..80f77c02b 100644 --- a/R-package/src/xgboost_R.cpp +++ b/R-package/src/xgboost_R.cpp @@ -281,7 +281,8 @@ extern "C" { &olen); SEXP out = PROTECT(allocVector(STRSXP, olen)); char buffer [2000]; - for (size_t i = 0; i < olen; ++i) { + for (size_t i = 0; i < olen; ++i) { + memset(buffer, 0, sizeof buffer); sprintf (buffer, "booster[%u]:\n%s", static_cast(i), res[i]); SET_STRING_ELT(out, i, mkChar(buffer)); } From b656ca1554c3024ddb93fc900103360305190784 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Fri, 9 Jan 2015 11:54:23 +0100 Subject: [PATCH 25/28] reindent --- R-package/src/xgboost_R.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp index 80f77c02b..9320547df 100644 --- a/R-package/src/xgboost_R.cpp +++ b/R-package/src/xgboost_R.cpp @@ -276,9 +276,9 @@ extern "C" { _WrapperBegin(); bst_ulong olen; const char **res = XGBoosterDumpModel(R_ExternalPtrAddr(handle), - CHAR(asChar(fmap)), - asInteger(with_stats), - &olen); + CHAR(asChar(fmap)), + asInteger(with_stats), + &olen); SEXP out = PROTECT(allocVector(STRSXP, olen)); char buffer [2000]; for (size_t i = 0; i < olen; ++i) { @@ -287,7 +287,7 @@ extern "C" { SET_STRING_ELT(out, i, mkChar(buffer)); } _WrapperEnd(); - UNPROTECT(1); - return out; + UNPROTECT(1); + return out; } } \ No newline at end of file From 51935851bdf1a15630c6e4053b523753e04cf6e5 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Fri, 9 Jan 2015 18:24:12 +0100 Subject: [PATCH 26/28] fix plenty of small bugs --- R-package/NAMESPACE | 1 + R-package/R/xgb.dump.R | 2 +- R-package/R/xgb.importance.R | 3 ++- R-package/R/xgb.model.dt.tree.R | 43 ++++++++++++++++++++++++++++----- R-package/R/xgb.plot.tree.R | 8 ++---- 5 files changed, 43 insertions(+), 14 deletions(-) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 23de90d28..d29ad7a18 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -25,6 +25,7 @@ importFrom(data.table,copy) importFrom(data.table,data.table) importFrom(data.table,rbindlist) importFrom(data.table,set) +importFrom(data.table,setnames) importFrom(magrittr,"%>%") importFrom(magrittr,add) importFrom(magrittr,not) diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index f73850883..61bfe412e 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -47,7 +47,7 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) { if(is.null(fname)) { return(str_split(result, "\n") %>% unlist %>% str_replace("^\t+","") %>% Filter(function(x) x != "", .)) } else { - writeLines(result, fname) + result %>% str_split("\n") %>% unlist %>% Filter(function(x) x != "", .) %>% writeLines(fname) return(TRUE) } } \ No newline at end of file diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index eaaad9ab8..189ee03b4 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -4,8 +4,9 @@ #' Can be tree or linear model (text dump of linear model are only supported in dev version of \code{Xgboost} for now). #' #' @importFrom data.table data.table -#' @importFrom magrittr %>% +#' @importFrom data.table setnames #' @importFrom data.table := +#' @importFrom magrittr %>% #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}). #' diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index 1fc104cce..3e0723c61 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -5,6 +5,7 @@ #' @importFrom data.table data.table #' @importFrom data.table set #' @importFrom data.table rbindlist +#' @importFrom data.table copy #' @importFrom data.table := #' @importFrom magrittr %>% #' @importFrom magrittr not @@ -88,11 +89,13 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, text = tree <- text[(position[i]+1):(position[i+1]-1)] + treeID <- i-1 + notLeaf <- str_match(tree, "leaf") %>% is.na leaf <- notLeaf %>% not %>% tree[.] branch <- notLeaf %>% tree[.] - idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i) - idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(i) + idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID) + idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID) featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric if(!is.null(feature_names)){ featureBranch <- feature_names[featureBranch + 1] @@ -100,20 +103,48 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, text = featureLeaf <- rep("Leaf", length(leaf)) splitBranch <- str_extract(branch, "<\\d*\\.*\\d*\\]") %>% str_replace("<", "") %>% str_replace("\\]", "") splitLeaf <- rep(NA, length(leaf)) - yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(i) + yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(treeID) yesLeaf <- rep(NA, length(leaf)) - noBranch <- extract(branch, "no=\\d*") %>% addTreeId(i) + noBranch <- extract(branch, "no=\\d*") %>% addTreeId(treeID) noLeaf <- rep(NA, length(leaf)) - missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(i) + missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(treeID) missingLeaf <- rep(NA, length(leaf)) qualityBranch <- extract(branch, "gain=\\d*\\.*\\d*") qualityLeaf <- extract(leaf, "leaf=\\-*\\d*\\.*\\d*") coverBranch <- extract(branch, "cover=\\d*\\.*\\d*") coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*") - dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=i] + dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=treeID] allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F) } + yes <- allTrees[!is.na(Yes),Yes] + + set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), + j = "Yes.Feature", + value = allTrees[ID == yes,Feature]) + + set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), + j = "Yes.Cover", + value = allTrees[ID == yes,Cover]) + + set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), + j = "Yes.Quality", + value = allTrees[ID == yes,Quality]) + + no <- allTrees[!is.na(No),No] + + set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), + j = "No.Feature", + value = allTrees[ID == no,Feature]) + + set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), + j = "No.Cover", + value = allTrees[ID == no,Cover]) + + set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), + j = "No.Quality", + value = allTrees[ID == no,Quality]) + allTrees } diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index b980671b0..1a8a04e8a 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -59,13 +59,9 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tr allTrees <- xgb.model.dt.tree(feature_names, filename_dump, n_first_tree) - set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), j = "YesFeature", value = merge(copy(allTrees)[,ID:=Yes][, .(ID)], allTrees[,.(ID, Feature, Quality, Cover)], by = "ID")[,paste(Feature, "
Cover: ", Cover, sep = "")]) + allTrees[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "
Cover: ", Cover, "
Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")] - set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), j = "NoFeature", value = merge(copy(allTrees)[,ID:=No][, .(ID)], allTrees[,.(ID, Feature, Quality, Cover)], by = "ID")[,paste(Feature, "
Cover: ", Cover, sep = "")]) - - allTrees[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "
Cover: ", Cover, "
Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", YesFeature, "]", sep = "")] - - allTrees[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", NoFeature, "]", sep = "")] + allTrees[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")] if(is.null(styles)){ From a3493934d1e964240b8ff428d14c52a1d76abfa3 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Fri, 9 Jan 2015 18:26:56 +0100 Subject: [PATCH 27/28] documentation example change --- R-package/R/xgb.dump.R | 2 +- R-package/man/xgb.dump.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index 61bfe412e..3df8c9605 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -29,7 +29,7 @@ #' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, #' eta = 1, nround = 2,objective = "binary:logistic") #' # save the model in file 'xgb.model.dump' -#' xgb.dump(bst, 'xgb.model.dump') +#' xgb.dump(bst, 'xgb.model.dump', with.stats = T) #' #' # print the model without saving it to a file #' print(xgb.dump(bst)) diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd index 6dad9ed7b..473227357 100644 --- a/R-package/man/xgb.dump.Rd +++ b/R-package/man/xgb.dump.Rd @@ -37,7 +37,7 @@ test <- agaricus.test bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2,objective = "binary:logistic") # save the model in file 'xgb.model.dump' -xgb.dump(bst, 'xgb.model.dump') +xgb.dump(bst, 'xgb.model.dump', with.stats = T) # print the model without saving it to a file print(xgb.dump(bst)) From 99b4ead9370f91a6d45eae8a7c2944678896f128 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Fri, 9 Jan 2015 18:28:10 +0100 Subject: [PATCH 28/28] add new dependency on DiagrammeR --- R-package/DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 6f73766fa..cc1c22087 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -24,4 +24,5 @@ Imports: methods, data.table (>= 1.9), magrittr (>= 1.5), - stringr \ No newline at end of file + stringr, + DiagrammeR \ No newline at end of file