Merge pull request #134 from pommedeterresautee/master
nice work! merged to master.
This commit is contained in:
commit
75a75bc1e9
@ -24,4 +24,5 @@ Imports:
|
||||
methods,
|
||||
data.table (>= 1.9),
|
||||
magrittr (>= 1.5),
|
||||
stringr
|
||||
stringr,
|
||||
DiagrammeR
|
||||
@ -9,6 +9,8 @@ export(xgb.cv)
|
||||
export(xgb.dump)
|
||||
export(xgb.importance)
|
||||
export(xgb.load)
|
||||
export(xgb.model.dt.tree)
|
||||
export(xgb.plot.tree)
|
||||
export(xgb.save)
|
||||
export(xgb.train)
|
||||
export(xgboost)
|
||||
@ -16,14 +18,20 @@ exportMethods(predict)
|
||||
import(methods)
|
||||
importClassesFrom(Matrix,dgCMatrix)
|
||||
importClassesFrom(Matrix,dgeMatrix)
|
||||
importFrom(DiagrammeR,DiagrammeR)
|
||||
importFrom(data.table,":=")
|
||||
importFrom(data.table,as.data.table)
|
||||
importFrom(data.table,copy)
|
||||
importFrom(data.table,data.table)
|
||||
importFrom(data.table,rbindlist)
|
||||
importFrom(data.table,set)
|
||||
importFrom(data.table,setnames)
|
||||
importFrom(magrittr,"%>%")
|
||||
importFrom(magrittr,add)
|
||||
importFrom(magrittr,not)
|
||||
importFrom(stringr,str_extract)
|
||||
importFrom(stringr,str_extract_all)
|
||||
importFrom(stringr,str_match)
|
||||
importFrom(stringr,str_replace)
|
||||
importFrom(stringr,str_replace_all)
|
||||
importFrom(stringr,str_split)
|
||||
importFrom(stringr,str_trim)
|
||||
|
||||
@ -8,8 +8,8 @@
|
||||
#' @importFrom data.table :=
|
||||
#' @importFrom data.table rbindlist
|
||||
#' @importFrom stringr str_extract_all
|
||||
#' @importFrom stringr str_extract
|
||||
#' @importFrom stringr str_split
|
||||
#' @importFrom stringr str_replace_all
|
||||
#' @importFrom stringr str_replace
|
||||
#' @importFrom stringr str_match
|
||||
#'
|
||||
@ -31,7 +31,7 @@
|
||||
#' @param nrounds the max number of iterations
|
||||
#' @param nfold number of folds used
|
||||
#' @param label option field, when data is Matrix
|
||||
#' @param showsd boolean, whether show standard deviation of cross validation
|
||||
#' @param showsd \code{boolean}, whether show standard deviation of cross validation
|
||||
#' @param metrics list of evaluation metrics to be used in cross validation,
|
||||
#' when it is not specified, the evaluation metric is chosen according to objective function.
|
||||
#' Possible options are:
|
||||
@ -49,9 +49,10 @@
|
||||
#' prediction and dtrain,
|
||||
#' @param missing Missing is only used when input is dense matrix, pick a float
|
||||
#'   value that represents missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.
|
||||
#' @param verbose \code{boolean}, print the statistics during the process.
|
||||
#' @param ... other parameters to pass to \code{params}.
|
||||
#'
|
||||
#' @return a \code{data.table} with each mean and standard deviation stat for training set and test set.
|
||||
#' @return A \code{data.table} with each mean and standard deviation stat for training set and test set.
|
||||
#'
|
||||
#' @details
|
||||
#' This is the cross validation function for xgboost
|
||||
@ -66,10 +67,11 @@
|
||||
#' dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
|
||||
#' history <- xgb.cv(data = dtrain, nround=3, nfold = 5, metrics=list("rmse","auc"),
|
||||
#' "max.depth"=3, "eta"=1, "objective"="binary:logistic")
|
||||
#' print(history)
|
||||
#' @export
|
||||
#'
|
||||
xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL,
|
||||
showsd = TRUE, metrics=list(), obj = NULL, feval = NULL, ...) {
|
||||
showsd = TRUE, metrics=list(), obj = NULL, feval = NULL, verbose = T,...) {
|
||||
if (typeof(params) != "list") {
|
||||
stop("xgb.cv: first argument params must be list")
|
||||
}
|
||||
@ -94,28 +96,24 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
|
||||
for (k in 1:nfold) {
|
||||
fd <- folds[[k]]
|
||||
succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
|
||||
msg[[k]] <- strsplit(xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval),
|
||||
"\t")[[1]]
|
||||
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
|
||||
}
|
||||
ret <- xgb.cv.aggcv(msg, showsd)
|
||||
history <- c(history, ret)
|
||||
cat(paste(ret, "\n", sep=""))
|
||||
if(verbose) paste(ret, "\n", sep="") %>% cat
|
||||
}
|
||||
|
||||
colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace_all("-", ".")
|
||||
|
||||
colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace("-", ".")
|
||||
colnamesMean <- paste(colnames, "mean")
|
||||
colnamesStd <- paste(colnames, "std")
|
||||
|
||||
colnames <- c()
|
||||
for(i in 1:length(colnamesMean)) colnames <- c(colnames, colnamesMean[i], colnamesStd[i])
|
||||
|
||||
type <- rep(x = "numeric", times = length(colnames))
|
||||
|
||||
dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table
|
||||
|
||||
split = str_split(string = history, pattern = "\t")
|
||||
for(line in split){
|
||||
dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d.\\d*") %>% unlist %>% as.list %>% {vec <- .;rbindlist(list(dt, vec), use.names = F, fill = F)}
|
||||
}
|
||||
|
||||
for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.*\\d*") %>% unlist %>% as.list %>% {vec <- .; rbindlist(list(dt, vec), use.names = F, fill = F)}
|
||||
dt
|
||||
}
|
||||
@ -2,8 +2,11 @@
|
||||
#'
|
||||
#' Save a xgboost model to text file. Could be parsed later.
|
||||
#'
|
||||
#' @importFrom magrittr %>%
|
||||
#' @importFrom stringr str_split
|
||||
#' @importFrom stringr str_replace
|
||||
#' @param model the model object.
|
||||
#' @param fname the name of the binary file.
|
||||
#' @param fname the name of the text file where to save the model text dump. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.
|
||||
#' @param fmap feature map file representing the type of feature.
|
||||
#' Detailed description could be found at
|
||||
#' \url{https://github.com/tqchen/xgboost/wiki/Binary-Classification#dump-model}.
|
||||
@ -15,6 +18,9 @@
|
||||
#' gain is the approximate loss function gain we get in each split;
|
||||
#' cover is the sum of second order gradient in each node.
|
||||
#'
|
||||
#' @return
|
||||
#' if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}.
|
||||
#'
|
||||
#' @examples
|
||||
#' data(agaricus.train, package='xgboost')
|
||||
#' data(agaricus.test, package='xgboost')
|
||||
@ -22,16 +28,26 @@
|
||||
#' test <- agaricus.test
|
||||
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
|
||||
#' eta = 1, nround = 2,objective = "binary:logistic")
|
||||
#' xgb.dump(bst, 'xgb.model.dump')
|
||||
#' # save the model in file 'xgb.model.dump'
|
||||
#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
|
||||
#'
|
||||
#' # print the model without saving it to a file
|
||||
#' print(xgb.dump(bst))
|
||||
#' @export
|
||||
#'
|
||||
xgb.dump <- function(model, fname, fmap = "", with.stats=FALSE) {
|
||||
xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) {
|
||||
if (class(model) != "xgb.Booster") {
|
||||
stop("xgb.dump: first argument must be type xgb.Booster")
|
||||
}
|
||||
if (typeof(fname) != "character") {
|
||||
stop("xgb.dump: second argument must be type character")
|
||||
if (!class(fname) %in% c("character", "NULL")) {
|
||||
stop("xgb.dump: second argument must be type character when provided")
|
||||
}
|
||||
.Call("XGBoosterDumpModel_R", model, fname, fmap, as.integer(with.stats), PACKAGE = "xgboost")
|
||||
return(TRUE)
|
||||
}
|
||||
result <- .Call("XGBoosterDumpModel_R", model, fmap, as.integer(with.stats), PACKAGE = "xgboost")
|
||||
|
||||
if(is.null(fname)) {
|
||||
return(str_split(result, "\n") %>% unlist %>% str_replace("^\t+","") %>% Filter(function(x) x != "", .))
|
||||
} else {
|
||||
result %>% str_split("\n") %>% unlist %>% Filter(function(x) x != "", .) %>% writeLines(fname)
|
||||
return(TRUE)
|
||||
}
|
||||
}
|
||||
@ -4,9 +4,9 @@
|
||||
#' Can be tree or linear model (text dump of linear model are only supported in dev version of \code{Xgboost} for now).
|
||||
#'
|
||||
#' @importFrom data.table data.table
|
||||
#' @importFrom magrittr %>%
|
||||
#' @importFrom data.table setnames
|
||||
#' @importFrom data.table :=
|
||||
#' @importFrom stringr str_extract
|
||||
#' @importFrom magrittr %>%
|
||||
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
|
||||
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).
|
||||
#'
|
||||
@ -21,7 +21,8 @@
|
||||
#' There are 4 columns :
|
||||
#' \itemize{
|
||||
#' \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump.
|
||||
#' \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means most important feature regarding the \code{label} used for the training.
|
||||
#' \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training ;
|
||||
#' \item \code{Cover} metric of the number of observation related to this feature (only available for tree models) ;
|
||||
#' \item \code{Weight} percentage representing the relative number of times a feature has been taken into trees. \code{Gain} should be preferred to search the most important feature. For boosted linear model, this column has no meaning.
|
||||
#' }
|
||||
#'
|
||||
@ -47,7 +48,7 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL){
|
||||
if (!class(feature_names) %in% c("character", "NULL")) {
|
||||
stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
|
||||
}
|
||||
if (class(filename_dump) != "character" & file.exists(filename_dump)) {
|
||||
if (class(filename_dump) != "character" || !file.exists(filename_dump)) {
|
||||
stop("filename_dump: Has to be a path to the model dump file.")
|
||||
}
|
||||
text <- readLines(filename_dump)
|
||||
@ -59,21 +60,10 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL){
|
||||
result
|
||||
}
|
||||
|
||||
treeDump <- function(feature_names, text){
|
||||
featureVec <- c()
|
||||
gainVec <- c()
|
||||
for(line in text){
|
||||
p <- str_extract(line, "\\[f.*<")
|
||||
if (!is.na(p)) {
|
||||
featureVec <- substr(p, 3, nchar(p)-1) %>% c(featureVec)
|
||||
gainVec <- str_extract(line, "gain.*,") %>% substr(x = ., 6, nchar(.)-1) %>% as.numeric %>% c(gainVec)
|
||||
}
|
||||
}
|
||||
if(!is.null(feature_names)) {
|
||||
featureVec %<>% as.numeric %>% {c =.+1; feature_names[c]} #+1 because in R indexing start with 1 instead of 0.
|
||||
}
|
||||
#1. Reduce, 2. %, 3. reorder - bigger top, 4. remove temp col
|
||||
data.table(Feature = featureVec, Weight = gainVec)[,list(sum(Weight), .N), by = Feature][, Gain:= V1/sum(V1)][,Weight:= N/sum(N)][order(-rank(Gain))][,-c(2,3), with = F]
|
||||
treeDump <- function(feature_names, text){
|
||||
result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[Feature!="Leaf",][,.(sum(Quality), sum(Cover), .N),by = Feature][,V1:=V1/sum(V1)][,V2:=V2/sum(V2)][,N:=N/sum(N)][order(-rank(V1))]
|
||||
setnames(result, c("Feature", "Gain", "Cover", "Frequence"))
|
||||
result
|
||||
}
|
||||
|
||||
linearDump <- function(feature_names, text){
|
||||
|
||||
150
R-package/R/xgb.model.dt.tree.R
Normal file
150
R-package/R/xgb.model.dt.tree.R
Normal file
@ -0,0 +1,150 @@
|
||||
#' Convert tree model dump to data.table
|
||||
#'
|
||||
#' Read a tree model text dump and return a data.table.
|
||||
#'
|
||||
#' @importFrom data.table data.table
|
||||
#' @importFrom data.table set
|
||||
#' @importFrom data.table rbindlist
|
||||
#' @importFrom data.table copy
|
||||
#' @importFrom data.table :=
|
||||
#' @importFrom magrittr %>%
|
||||
#' @importFrom magrittr not
|
||||
#' @importFrom magrittr add
|
||||
#' @importFrom stringr str_extract
|
||||
#' @importFrom stringr str_split
|
||||
#' @importFrom stringr str_extract
|
||||
#' @importFrom stringr str_trim
|
||||
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
|
||||
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
|
||||
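#' @param text the model dump as a \code{character} vector (one element per line of the dump). If provided, it is used instead of reading \code{filename_dump}.
#' @param n_first_tree limit the result to the n first trees. If \code{NULL}, all trees of the model are parsed. Performance can be low for huge models.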
|
||||
#'
|
||||
#' @return A \code{data.table} of the features used in the model with their gain, cover and a few other things.
|
||||
#'
|
||||
#' @details
|
||||
#' General function to convert a text dump of a tree model to a \code{data.table}. The purpose is to help the user explore the model and get a better understanding of it.
|
||||
#'
|
||||
#' The content of the \code{data.table} is organised that way:
|
||||
#'
|
||||
#' \itemize{
|
||||
#' \item \code{ID}: unique identifier of a node ;
|
||||
#' \item \code{Feature}: feature used in the tree to operate a split. When Leaf is indicated, it is the end of a branch ;
|
||||
#' \item \code{Split}: value of the chosen feature where is operated the split ;
|
||||
#' \item \code{Yes}: ID of the feature for the next node in the branch when the split condition is met ;
|
||||
#' \item \code{No}: ID of the feature for the next node in the branch when the split condition is not met ;
|
||||
#' \item \code{Missing}: ID of the feature for the next node in the branch for observation where the feature used for the split are not provided ;
|
||||
#' \item \code{Quality}: it's the gain related to the split in this specific node ;
|
||||
#' \item \code{Cover}: metric to measure the number of observation affected by the split ;
|
||||
#' \item \code{Tree}: ID of the tree. It is included in the main ID ;
|
||||
#' }
|
||||
#'
|
||||
#' @examples
|
||||
#' data(agaricus.train, package='xgboost')
|
||||
#'
|
||||
#' #Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
|
||||
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
|
||||
#' train <- agaricus.train
|
||||
#'
|
||||
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
|
||||
#' eta = 1, nround = 2,objective = "binary:logistic")
|
||||
#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
|
||||
#'
|
||||
#' #agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
|
||||
#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], 'xgb.model.dump')
|
||||
#'
|
||||
#' @export
|
||||
xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, text = NULL, n_first_tree = NULL){
  # Parse the text dump of a boosted tree model into a data.table holding one
  # row per node (split nodes and leaves), then attach to every split node the
  # Feature / Cover / Quality of its "yes" and "no" children.
  #
  # feature_names: character vector of feature names, or NULL to keep the
  #                numeric feature indices found in the dump.
  # filename_dump: path to the dump file; only read when `text` is NULL.
  # text:          the dump as a character vector (one element per line).
  # n_first_tree:  optional single number limiting how many trees are parsed.
  #
  # Returns a data.table with columns ID, Feature, Split, Yes, No, Missing,
  # Quality, Cover, Tree, Yes.Feature, Yes.Cover, Yes.Quality, No.Feature,
  # No.Cover and No.Quality.

  # ---- Argument validation --------------------------------------------------
  if (!(is.null(feature_names) || is.character(feature_names))) {
    stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
  }
  if (!(is.null(filename_dump) || is.character(filename_dump))) {
    stop("filename_dump: Has to be a character vector representing the path to the model dump file.")
  } else if (is.character(filename_dump) && !file.exists(filename_dump)) {
    stop("filename_dump: path to the model doesn't exist.")
  } else if (is.null(filename_dump) && is.null(text)) {
    stop("filename_dump: no path and no string version of the model dump have been provided.")
  }
  if (!(is.null(text) || is.character(text))) {
    stop("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.")
  }
  # is.numeric() accepts both doubles and integers (e.g. 2L).
  if (!(is.null(n_first_tree) || is.numeric(n_first_tree)) || length(n_first_tree) > 1) {
    stop("n_first_tree: Has to be a numeric vector of size 1.")
  }

  if (is.null(text)) {
    text <- str_trim(readLines(filename_dump), side = "both")
  }

  # Every tree starts with a line containing "booster"; the extra sentinel
  # (length(text) + 1) marks the end of the last tree.
  position <- c(which(grepl("booster", text, fixed = TRUE)), length(text) + 1)

  # min() ignores a NULL n_first_tree, so all trees are kept in that case.
  n_round <- min(length(position) - 1, n_first_tree)
  if (n_round < 1) {
    stop("xgb.model.dt.tree: no tree to parse (no 'booster' line found in the dump, or n_first_tree lower than 1).")
  }

  # Pull the numeric value out of a "key=value" token matched by `pattern`.
  extract <- function(x, pattern) {
    tokens <- str_split(str_extract(x, pattern), "=")
    unlist(lapply(tokens, function(token) as.numeric(token[2])))
  }

  # Node IDs are only unique inside a tree, so they are prefixed by the tree ID.
  addTreeId <- function(x, i) paste(i, x, sep = "-")

  # Turn the lines of the i-th tree into a data.table (one row per node).
  parseTree <- function(i) {
    tree <- text[(position[i] + 1):(position[i + 1] - 1)]
    treeID <- i - 1 # tree IDs are 0-based, like in the dump

    isLeaf <- grepl("leaf", tree, fixed = TRUE)
    leaf <- tree[isLeaf]
    branch <- tree[!isLeaf]
    naLeaf <- rep(NA, length(leaf)) # leaves have no split / children

    idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID)
    idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID)

    featureBranch <- str_extract(branch, "f\\d*<") %>%
      str_replace("<", "") %>%
      str_replace("f", "") %>%
      as.numeric
    if (!is.null(feature_names)) {
      # +1 because feature indices in the dump are 0-based.
      featureBranch <- feature_names[featureBranch + 1]
    }
    featureLeaf <- rep("Leaf", length(leaf))

    # NOTE(review): this pattern matches neither negative nor scientific
    # notation split values; Split is NA for such nodes -- confirm whether
    # that is acceptable.
    splitBranch <- str_extract(branch, "<\\d*\\.*\\d*\\]") %>%
      str_replace("<", "") %>%
      str_replace("\\]", "")

    yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(treeID)
    noBranch <- extract(branch, "no=\\d*") %>% addTreeId(treeID)
    missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(treeID)

    qualityBranch <- extract(branch, "gain=\\d*\\.*\\d*")
    qualityLeaf <- extract(leaf, "leaf=\\-*\\d*\\.*\\d*")
    coverBranch <- extract(branch, "cover=\\d*\\.*\\d*")
    coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*")

    data.table(ID = c(idBranch, idLeaf),
               Feature = c(featureBranch, featureLeaf),
               Split = c(splitBranch, naLeaf),
               Yes = c(yesBranch, naLeaf),
               No = c(noBranch, naLeaf),
               Missing = c(missingBranch, naLeaf),
               Quality = c(qualityBranch, qualityLeaf),
               Cover = c(coverBranch, coverLeaf))[order(ID)][, Tree := treeID]
  }

  # Bind all the trees at once rather than growing the table inside a loop.
  allTrees <- rbindlist(lapply(seq_len(n_round), parseTree), use.names = TRUE, fill = FALSE)

  # ---- Attach the description of the children to each split node -----------
  # match() finds, for each split node, the row of its child; an `ID == yes`
  # comparison would recycle two vectors of different lengths.
  branchRows <- which(allTrees[["Feature"]] != "Leaf")
  ids <- allTrees[["ID"]]
  for (side in c("Yes", "No")) {
    child <- match(allTrees[[side]][branchRows], ids)
    for (column in c("Feature", "Cover", "Quality")) {
      set(allTrees, i = branchRows,
          j = paste(side, column, sep = "."),
          value = allTrees[[column]][child])
    }
  }

  allTrees
}
|
||||
77
R-package/R/xgb.plot.tree.R
Normal file
77
R-package/R/xgb.plot.tree.R
Normal file
@ -0,0 +1,77 @@
|
||||
#' Plot a boosted tree model
|
||||
#'
|
||||
#' Read a tree model text dump.
|
||||
#' Plotting only works for boosted tree model (not linear model).
|
||||
#'
|
||||
#' @importFrom data.table data.table
|
||||
#' @importFrom data.table set
|
||||
#' @importFrom data.table rbindlist
|
||||
#' @importFrom data.table :=
|
||||
#' @importFrom data.table copy
|
||||
#' @importFrom magrittr %>%
|
||||
#' @importFrom magrittr not
|
||||
#' @importFrom magrittr add
|
||||
#' @importFrom stringr str_extract
|
||||
#' @importFrom stringr str_split
|
||||
#' @importFrom stringr str_extract
|
||||
#' @importFrom stringr str_trim
|
||||
#' @importFrom DiagrammeR DiagrammeR
|
||||
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
|
||||
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
|
||||
#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
|
||||
#' @param styles a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.
|
||||
#'
|
||||
#' @return A \code{DiagrammeR} of the model.
|
||||
#'
|
||||
#' @details
|
||||
#'
|
||||
#' The content of each node is organised that way:
|
||||
#'
|
||||
#' \itemize{
|
||||
#' \item \code{feature} value ;
|
||||
#' \item \code{cover}: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. Deeper in the tree a node is, lower this metric will be ;
|
||||
#' \item \code{gain}: metric of the importance of the node in the model.
|
||||
#' }
|
||||
#'
|
||||
#' Each branch finishes with a leaf. For each leaf, only the \code{cover} is indicated.
|
||||
#' It uses \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpose.
|
||||
#'
|
||||
#' @examples
|
||||
#' data(agaricus.train, package='xgboost')
|
||||
#'
|
||||
#' #Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
|
||||
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
|
||||
#' train <- agaricus.train
|
||||
#'
|
||||
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
|
||||
#' eta = 1, nround = 2,objective = "binary:logistic")
|
||||
#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
|
||||
#'
|
||||
#' #agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
|
||||
#' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], 'xgb.model.dump')
|
||||
#'
|
||||
#' @export
|
||||
xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tree = NULL, styles = NULL){
  # Build a Mermaid flowchart description of a boosted tree model dump and
  # render it with DiagrammeR.
  #
  # feature_names, filename_dump, n_first_tree: forwarded to xgb.model.dt.tree.
  # styles: optional single string of Mermaid class definitions; when NULL a
  #         default green ("yes" child) / red ("no" child) style is used.
  #
  # Returns the value of DiagrammeR() called on the generated description.

  if (!(is.null(styles) || is.character(styles)) || length(styles) > 1) {
    stop("styles: Has to be a character vector of size 1.")
  }

  # Arguments are passed by name: the third positional parameter of
  # xgb.model.dt.tree is `text`, not `n_first_tree`.
  allTrees <- xgb.model.dt.tree(feature_names = feature_names,
                                filename_dump = filename_dump,
                                n_first_tree = n_first_tree)

  # One Mermaid edge per child of each split node. The parent label (feature,
  # cover, gain) is only spelled out on the "yes" edge.
  allTrees[Feature != "Leaf", yesPath := paste0(ID, "(", Feature, "<br/>Cover: ", Cover, "<br/>Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]")]
  allTrees[Feature != "Leaf", noPath := paste0(ID, "(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]")]

  if (is.null(styles)) {
    styles <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px"
  }

  branches <- allTrees[Feature != "Leaf"]

  # Assign the "yes" children to the green class and the "no" children to the
  # red class.
  yesClass <- paste0("class ", paste(branches[["Yes"]], collapse = ","), " greenNode")
  noClass <- paste0("class ", paste(branches[["No"]], collapse = ","), " redNode")

  edges <- c(branches[["yesPath"]], branches[["noPath"]])
  edges <- edges[order(edges)]

  # Statements of the Mermaid description are separated by ";".
  graph <- paste("graph LR", paste(edges, collapse = ";"), sep = ";")
  path <- paste(graph, styles, yesClass, noClass, sep = ";")

  DiagrammeR(path)
}
|
||||
@ -6,7 +6,7 @@
|
||||
\usage{
|
||||
xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
|
||||
missing = NULL, showsd = TRUE, metrics = list(), obj = NULL,
|
||||
feval = NULL, ...)
|
||||
feval = NULL, verbose = T, ...)
|
||||
}
|
||||
\arguments{
|
||||
\item{params}{the list of parameters. Commonly used ones are:
|
||||
@ -34,7 +34,7 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
|
||||
|
||||
\item{missing}{Missing is only used when input is dense matrix, pick a float}
|
||||
|
||||
\item{showsd}{boolean, whether show standard deviation of cross validation}
|
||||
\item{showsd}{\code{boolean}, whether show standard deviation of cross validation}
|
||||
|
||||
\item{metrics}{list of evaluation metrics to be used in cross validation,
|
||||
when it is not specified, the evaluation metric is chosen according to objective function.
|
||||
@ -54,10 +54,12 @@ gradient with given prediction and dtrain,}
|
||||
\code{list(metric='metric-name', value='metric-value')} with given
|
||||
prediction and dtrain,}
|
||||
|
||||
\item{verbose}{\code{boolean}, print the statistics during the process.}
|
||||
|
||||
\item{...}{other parameters to pass to \code{params}.}
|
||||
}
|
||||
\value{
|
||||
a \code{data.table} with each mean and standard deviation stat for training set and test set.
|
||||
A \code{data.table} with each mean and standard deviation stat for training set and test set.
|
||||
}
|
||||
\description{
|
||||
The cross validation function of xgboost
|
||||
@ -75,5 +77,6 @@ data(agaricus.train, package='xgboost')
|
||||
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
|
||||
history <- xgb.cv(data = dtrain, nround=3, nfold = 5, metrics=list("rmse","auc"),
|
||||
"max.depth"=3, "eta"=1, "objective"="binary:logistic")
|
||||
print(history)
|
||||
}
|
||||
|
||||
|
||||
@ -4,12 +4,12 @@
|
||||
\alias{xgb.dump}
|
||||
\title{Save xgboost model to text file}
|
||||
\usage{
|
||||
xgb.dump(model, fname, fmap = "", with.stats = FALSE)
|
||||
xgb.dump(model = NULL, fname = NULL, fmap = "", with.stats = FALSE)
|
||||
}
|
||||
\arguments{
|
||||
\item{model}{the model object.}
|
||||
|
||||
\item{fname}{the name of the binary file.}
|
||||
\item{fname}{the name of the text file where to save the model text dump. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.}
|
||||
|
||||
\item{fmap}{feature map file representing the type of feature.
|
||||
Detailed description could be found at
|
||||
@ -23,6 +23,9 @@ for example Format.}
|
||||
gain is the approximate loss function gain we get in each split;
|
||||
cover is the sum of second order gradient in each node.}
|
||||
}
|
||||
\value{
|
||||
if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}.
|
||||
}
|
||||
\description{
|
||||
Save a xgboost model to text file. Could be parsed later.
|
||||
}
|
||||
@ -33,6 +36,10 @@ train <- agaricus.train
|
||||
test <- agaricus.test
|
||||
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
|
||||
eta = 1, nround = 2,objective = "binary:logistic")
|
||||
xgb.dump(bst, 'xgb.model.dump')
|
||||
# save the model in file 'xgb.model.dump'
|
||||
xgb.dump(bst, 'xgb.model.dump', with.stats = T)
|
||||
|
||||
# print the model without saving it to a file
|
||||
print(xgb.dump(bst))
|
||||
}
|
||||
|
||||
|
||||
@ -27,7 +27,8 @@ Results are returned for both linear and tree models.
|
||||
There are 3 columns :
|
||||
\itemize{
|
||||
\item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump.
|
||||
\item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means most important feature regarding the \code{label} used for the training.
|
||||
\item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training ;
|
||||
\item \code{Cover} metric of the number of observation related to this feature (only available for tree models) ;
|
||||
\item \code{Weight} percentage representing the relative number of times a feature have been taken into trees. \code{Gain} should be prefered to search the most important feature. For boosted linear model, this column has no meaning.
|
||||
}
|
||||
}
|
||||
|
||||
54
R-package/man/xgb.model.dt.tree.Rd
Normal file
54
R-package/man/xgb.model.dt.tree.Rd
Normal file
@ -0,0 +1,54 @@
|
||||
% Generated by roxygen2 (4.1.0): do not edit by hand
|
||||
% Please edit documentation in R/xgb.model.dt.tree.R
|
||||
\name{xgb.model.dt.tree}
|
||||
\alias{xgb.model.dt.tree}
|
||||
\title{Convert tree model dump to data.table}
|
||||
\usage{
|
||||
xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL, text = NULL,
|
||||
n_first_tree = NULL)
|
||||
}
|
||||
\arguments{
|
||||
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
|
||||
|
||||
\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
|
||||
|
||||
\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}
|
||||
}
|
||||
\value{
|
||||
A \code{data.table} of the features used in the model with their gain, cover and a few other things.
|
||||
}
|
||||
\description{
|
||||
Read a tree model text dump and return a data.table.
|
||||
}
|
||||
\details{
|
||||
General function to convert a text dump of a tree model to a \code{data.table}. The purpose is to help the user explore the model and get a better understanding of it.
|
||||
|
||||
The content of the \code{data.table} is organised that way:
|
||||
|
||||
\itemize{
|
||||
\item \code{ID}: unique identifier of a node ;
|
||||
\item \code{Feature}: feature used in the tree to operate a split. When Leaf is indicated, it is the end of a branch ;
|
||||
\item \code{Split}: value of the chosen feature where is operated the split ;
|
||||
\item \code{Yes}: ID of the feature for the next node in the branch when the split condition is met ;
|
||||
\item \code{No}: ID of the feature for the next node in the branch when the split condition is not met ;
|
||||
\item \code{Missing}: ID of the feature for the next node in the branch for observation where the feature used for the split are not provided ;
|
||||
\item \code{Quality}: it's the gain related to the split in this specific node ;
|
||||
\item \code{Cover}: metric to measure the number of observation affected by the split ;
|
||||
\item \code{Tree}: ID of the tree. It is included in the main ID ;
|
||||
}
|
||||
}
|
||||
\examples{
|
||||
data(agaricus.train, package='xgboost')
|
||||
|
||||
#Both datasets are lists with two items: a sparse matrix and labels (labels = outcome column which will be learned).
|
||||
#Each column of the sparse Matrix is a feature in one hot encoding format.
|
||||
train <- agaricus.train
|
||||
|
||||
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
|
||||
eta = 1, nround = 2,objective = "binary:logistic")
|
||||
xgb.dump(bst, 'xgb.model.dump', with.stats = T)
|
||||
|
||||
#agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
|
||||
xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], 'xgb.model.dump')
|
||||
}
|
||||
|
||||
52
R-package/man/xgb.plot.tree.Rd
Normal file
52
R-package/man/xgb.plot.tree.Rd
Normal file
@ -0,0 +1,52 @@
|
||||
% Generated by roxygen2 (4.1.0): do not edit by hand
|
||||
% Please edit documentation in R/xgb.plot.tree.R
|
||||
\name{xgb.plot.tree}
|
||||
\alias{xgb.plot.tree}
|
||||
\title{Plot a boosted tree model}
|
||||
\usage{
|
||||
xgb.plot.tree(feature_names = NULL, filename_dump = NULL,
|
||||
n_first_tree = NULL, styles = NULL)
|
||||
}
|
||||
\arguments{
|
||||
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
|
||||
|
||||
\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}).}
|
||||
|
||||
\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}
|
||||
|
||||
\item{styles}{a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.}
|
||||
}
|
||||
\value{
|
||||
A \code{DiagrammeR} of the model.
|
||||
}
|
||||
\description{
|
||||
Read a tree model text dump.
|
||||
Plotting only works for boosted tree model (not linear model).
|
||||
}
|
||||
\details{
|
||||
The content of each node is organised as follows:
|
||||
|
||||
\itemize{
|
||||
\item \code{feature} value ;
|
||||
\item \code{cover}: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. The deeper in the tree a node is, the lower this metric will be ;
|
||||
\item \code{gain}: metric measuring the importance of the node in the model.
|
||||
}
|
||||
|
||||
Each branch finishes with a leaf. For each leaf, only the \code{cover} is indicated.
|
||||
It uses \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpose.
|
||||
}
|
||||
\examples{
|
||||
data(agaricus.train, package='xgboost')
|
||||
|
||||
#Both datasets are lists with two items: a sparse matrix and labels (labels = outcome column which will be learned).
|
||||
#Each column of the sparse Matrix is a feature in one hot encoding format.
|
||||
train <- agaricus.train
|
||||
|
||||
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
|
||||
eta = 1, nround = 2,objective = "binary:logistic")
|
||||
xgb.dump(bst, 'xgb.model.dump', with.stats = T)
|
||||
|
||||
#agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
|
||||
xgb.plot.tree(agaricus.train$data@Dimnames[[2]], 'xgb.model.dump')
|
||||
}
|
||||
|
||||
@ -272,20 +272,22 @@ extern "C" {
|
||||
XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
|
||||
_WrapperEnd();
|
||||
}
|
||||
void XGBoosterDumpModel_R(SEXP handle, SEXP fname,
|
||||
SEXP fmap, SEXP with_stats) {
|
||||
SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats) {
|
||||
_WrapperBegin();
|
||||
bst_ulong olen;
|
||||
const char **res = XGBoosterDumpModel(R_ExternalPtrAddr(handle),
|
||||
CHAR(asChar(fmap)),
|
||||
asInteger(with_stats),
|
||||
&olen);
|
||||
FILE *fo = utils::FopenCheck(CHAR(asChar(fname)), "w");
|
||||
for (size_t i = 0; i < olen; ++i) {
|
||||
fprintf(fo, "booster[%u]:\n", static_cast<unsigned>(i));
|
||||
fprintf(fo, "%s", res[i]);
|
||||
CHAR(asChar(fmap)),
|
||||
asInteger(with_stats),
|
||||
&olen);
|
||||
SEXP out = PROTECT(allocVector(STRSXP, olen));
|
||||
char buffer [2000];
|
||||
for (size_t i = 0; i < olen; ++i) {
|
||||
memset(buffer, 0, sizeof buffer);
|
||||
sprintf (buffer, "booster[%u]:\n%s", static_cast<unsigned>(i), res[i]);
|
||||
SET_STRING_ELT(out, i, mkChar(buffer));
|
||||
}
|
||||
fclose(fo);
|
||||
_WrapperEnd();
|
||||
UNPROTECT(1);
|
||||
return out;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -128,12 +128,11 @@ extern "C" {
|
||||
*/
|
||||
void XGBoosterSaveModel_R(SEXP handle, SEXP fname);
|
||||
/*!
|
||||
* \brief dump model into text file
|
||||
* \brief dump model into a string
|
||||
* \param handle handle
|
||||
* \param fname file name of model that can be dumped into
|
||||
* \param fmap name to fmap can be empty string
|
||||
* \param with_stats whether dump statistics of splits
|
||||
*/
|
||||
void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap, SEXP with_stats);
|
||||
SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats);
|
||||
}
|
||||
#endif // XGBOOST_WRAPPER_R_H_
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user