From b8711226e2f121a7cb76200716421ee840c0f900 Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Thu, 2 Apr 2015 19:48:23 -0500 Subject: [PATCH 1/6] added an option for stratified CV to xgb.cv --- R-package/R/utils.R | 77 ++++++++++++++++++++++++++++++++++++++++---- R-package/R/xgb.cv.R | 11 ++++--- 2 files changed, 76 insertions(+), 12 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 7336ed213..b0b565a3d 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -214,18 +214,30 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = F #------------------------------------------ # helper functions for cross validation # -xgb.cv.mknfold <- function(dall, nfold, param) { +xgb.cv.mknfold <- function(dall, nfold, param, stratified) { if (nfold <= 1) { stop("nfold must be bigger than 1") } randidx <- sample(1 : xgb.numrow(dall)) - kstep <- length(randidx) %/% nfold - idset <- list() - for (i in 1:(nfold-1)) { - idset[[i]] = randidx[1:kstep] - randidx = setdiff(randidx,idset[[i]]) + y <- getinfo(dall, 'label') + if (stratified & length(y) == length(randidx)) { + y <- y[randidx] + # by default assume that y is a classification label, and only + # leave it numeric for the reg:linear objective + # WARNING: if there would be any objectives with truly numerical + # they would not currently be treated correctly. + if (param[['objective']] != 'reg:linear') y <- factor(y) + idset <- xgb.createFolds(y, nfold) + } else { + # make simple non-stratified folds + kstep <- length(randidx) %/% nfold + idset <- list() + for (i in 1:(nfold-1)) { + idset[[i]] = randidx[1:kstep] + randidx = setdiff(randidx,idset[[i]]) + } + idset[[nfold]] = randidx } - idset[[nfold]] = randidx ret <- list() for (k in 1:nfold) { dtest <- slice(dall, idset[[k]]) @@ -242,6 +254,7 @@ xgb.cv.mknfold <- function(dall, nfold, param) { } return (ret) } + xgb.cv.aggcv <- function(res, showsd = TRUE) { header <- res[[1]] ret <- header[1] @@ -261,3 +274,53 @@ xgb.cv.aggcv <- function(res, showsd = TRUE) { } return (ret) } + +# Shamelessly copied from caret::createFolds +# and simplified by always returning an unnamed list of test indices +xgb.createFolds <- function(y, k = 10) +{ + if(is.numeric(y)) { + ## Group the numeric data based on their magnitudes + ## and sample within those groups. + + ## When the number of samples is low, we may have + ## issues further slicing the numeric data into + ## groups. The number of groups will depend on the + ## ratio of the number of folds to the sample size. + ## At most, we will use quantiles. If the sample + ## is too small, we just do regular unstratified + ## CV + cuts <- floor(length(y)/k) + if(cuts < 2) cuts <- 2 + if(cuts > 5) cuts <- 5 + y <- cut(y, + unique(quantile(y, probs = seq(0, 1, length = cuts))), + include.lowest = TRUE) + } + + if(k < length(y)) { + ## reset levels so that the possible levels and + ## the levels in the vector are the same + y <- factor(as.character(y)) + numInClass <- table(y) + foldVector <- vector(mode = "integer", length(y)) + + ## For each class, balance the fold allocation as far + ## as possible, then resample the remainder. + ## The final assignment of folds is also randomized. + for(i in 1:length(numInClass)) { + ## create a vector of integers from 1:k as many times as possible without + ## going over the number of samples in the class. Note that if the number + ## of samples in a class is less than k, nothing is producd here. + seqVector <- rep(1:k, numInClass[i] %/% k) + ## add enough random integers to get length(seqVector) == numInClass[i] + if(numInClass[i] %% k > 0) seqVector <- c(seqVector, sample(1:k, numInClass[i] %% k)) + ## shuffle the integers for fold assignment and assign to this classes's data + foldVector[which(y == dimnames(numInClass)$y[i])] <- sample(seqVector) + } + } else foldVector <- seq(along = y) + + out <- split(seq(along = y), foldVector) + names(out) <- NULL + out +} diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 3472ead4d..38f70b973 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -46,11 +46,12 @@ #' \item \code{merror} Exact matching error, used to evaluate multi-class classification #' } #' @param obj customized objective function. Returns gradient and second order -#' gradient with given prediction and dtrain, +#' gradient with given prediction and dtrain. #' @param feval custimized evaluation function. Returns #' \code{list(metric='metric-name', value='metric-value')} with given -#' prediction and dtrain, -#' @param verbose \code{boolean}, print the statistics during the process. +#' prediction and dtrain. +#' @param stratified \code{boolean}, whether the sampling of folds should be stratified by the values of labels in \code{data} +#' @param verbose \code{boolean}, print the statistics during the process #' @param ... other parameters to pass to \code{params}. #' #' @return A \code{data.table} with each mean and standard deviation stat for training set and test set. @@ -76,7 +77,7 @@ #' xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL, prediction = FALSE, showsd = TRUE, metrics=list(), - obj = NULL, feval = NULL, verbose = T,...) { + obj = NULL, feval = NULL, stratified = TRUE, verbose = T,...) { if (typeof(params) != "list") { stop("xgb.cv: first argument params must be list") } @@ -94,7 +95,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = params <- append(params, list("eval_metric"=mc)) } - folds <- xgb.cv.mknfold(dtrain, nfold, params) + folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified) obj_type = params[['objective']] mat_pred = FALSE if (!is.null(obj_type) && obj_type=='multi:softprob') From 611d69c771f5fad6eedec598839fc5175f83957e Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Thu, 2 Apr 2015 19:59:06 -0500 Subject: [PATCH 2/6] fix some wording --- R-package/R/utils.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index b0b565a3d..ca32901dc 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -222,10 +222,10 @@ xgb.cv.mknfold <- function(dall, nfold, param, stratified) { y <- getinfo(dall, 'label') if (stratified & length(y) == length(randidx)) { y <- y[randidx] - # by default assume that y is a classification label, and only - # leave it numeric for the reg:linear objective - # WARNING: if there would be any objectives with truly numerical - # they would not currently be treated correctly. + # By default assume that y is a classification label, + # and only leave it numeric for the reg:linear objective. + # WARNING: if there would be any other objectives with truly + # numerical labels, they currently would not be treated correctly! if (param[['objective']] != 'reg:linear') y <- factor(y) idset <- xgb.createFolds(y, nfold) } else { From b04920d8e7e1bda51077c07e7feb7a187627b209 Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Fri, 3 Apr 2015 11:14:09 -0500 Subject: [PATCH 3/6] update documentation for xgb.cv --- R-package/man/xgb.cv.Rd | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index 8b65d9d4d..6f8ee06cb 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -6,7 +6,7 @@ \usage{ xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(), - obj = NULL, feval = NULL, verbose = T, ...) + obj = NULL, feval = NULL, stratified = TRUE, verbose = T, ...) } \arguments{ \item{params}{the list of parameters. Commonly used ones are: @@ -51,13 +51,15 @@ value that represents missing value. Sometime a data use 0 or other extreme valu }} \item{obj}{customized objective function. Returns gradient and second order -gradient with given prediction and dtrain,} +gradient with given prediction and dtrain.} \item{feval}{custimized evaluation function. Returns \code{list(metric='metric-name', value='metric-value')} with given -prediction and dtrain,} +prediction and dtrain.} -\item{verbose}{\code{boolean}, print the statistics during the process.} +\item{stratified}{\code{boolean}, whether the sampling of folds should be stratified by the values of labels in \code{data}} + +\item{verbose}{\code{boolean}, print the statistics during the process} \item{...}{other parameters to pass to \code{params}.} } From 271e8202a7b077f9a5afab25eca3d67471303878 Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Fri, 3 Apr 2015 12:20:34 -0500 Subject: [PATCH 4/6] force xgb.cv to return numeric performance values instead of character; update its docs --- R-package/R/xgb.cv.R | 15 +++++++++++---- R-package/man/xgb.cv.Rd | 10 ++++++++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 38f70b973..db2ecf103 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -50,12 +50,19 @@ #' @param feval custimized evaluation function. Returns #' \code{list(metric='metric-name', value='metric-value')} with given #' prediction and dtrain. -#' @param stratified \code{boolean}, whether the sampling of folds should be stratified by the values of labels in \code{data} +#' @param stratified \code{boolean}, whether sampling of folds should be stratified by the values of labels in \code{data} #' @param verbose \code{boolean}, print the statistics during the process #' @param ... other parameters to pass to \code{params}. #' -#' @return A \code{data.table} with each mean and standard deviation stat for training set and test set. -#' +#' @return +#' If \code{prediction = TRUE}, a list with the following elements is returned: +#' \itemize{ +#' \item \code{dt} a \code{data.table} with each mean and standard deviation stat for training set and test set +#' \item \code{pred} an array or matrix (for multiclass classification) with predictions for each CV-fold for the model having been trained on the data in all other folds. +#' } +#' +#' If \code{prediction = FALSE}, just a \code{data.table} with each mean and standard deviation stat for training set and test set is returned. +#' #' @details #' The original sample is randomly partitioned into \code{nfold} equal size subsamples. #' @@ -148,7 +155,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table split <- str_split(string = history, pattern = "\t") - for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.list %>% {vec <- .; rbindlist(list(dt, vec), use.names = F, fill = F)} + for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.numeric %>% as.list %>% {rbindlist(list(dt, .), use.names = F, fill = F)} if (prediction) { return(list(dt = dt,pred = predictValues)) diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index 6f8ee06cb..20423f76a 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -57,14 +57,20 @@ gradient with given prediction and dtrain.} \code{list(metric='metric-name', value='metric-value')} with given prediction and dtrain.} -\item{stratified}{\code{boolean}, whether the sampling of folds should be stratified by the values of labels in \code{data}} +\item{stratified}{\code{boolean}, whether sampling of folds should be stratified by the values of labels in \code{data}} \item{verbose}{\code{boolean}, print the statistics during the process} \item{...}{other parameters to pass to \code{params}.} } \value{ -A \code{data.table} with each mean and standard deviation stat for training set and test set. +If \code{prediction = TRUE}, a list with the following elements is returned: +\itemize{ + \item \code{dt} a \code{data.table} with each mean and standard deviation stat for training set and test set + \item \code{pred} an array or matrix (for multiclass classification) with predictions for each CV-fold for the model having been trained on the data in all other folds. +} + +If \code{prediction = FALSE}, just a \code{data.table} with each mean and standard deviation stat for training set and test set is returned. } \description{ The cross valudation function of xgboost From 31b0e53cd462460ff1b80e281872dd3c3903f3fd Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Fri, 3 Apr 2015 13:24:04 -0500 Subject: [PATCH 5/6] make it possible to use a list of pre-defined CV folds in xgb.cv --- R-package/R/utils.R | 46 +++++++++++++++++++++-------------------- R-package/R/xgb.cv.R | 16 ++++++++++---- R-package/man/xgb.cv.Rd | 8 +++++-- 3 files changed, 42 insertions(+), 28 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index ca32901dc..a9ea767a8 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -214,43 +214,45 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = F #------------------------------------------ # helper functions for cross validation # -xgb.cv.mknfold <- function(dall, nfold, param, stratified) { +xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) { if (nfold <= 1) { stop("nfold must be bigger than 1") } - randidx <- sample(1 : xgb.numrow(dall)) - y <- getinfo(dall, 'label') - if (stratified & length(y) == length(randidx)) { - y <- y[randidx] - # By default assume that y is a classification label, - # and only leave it numeric for the reg:linear objective. - # WARNING: if there would be any other objectives with truly - # numerical labels, they currently would not be treated correctly! - if (param[['objective']] != 'reg:linear') y <- factor(y) - idset <- xgb.createFolds(y, nfold) - } else { - # make simple non-stratified folds - kstep <- length(randidx) %/% nfold - idset <- list() - for (i in 1:(nfold-1)) { - idset[[i]] = randidx[1:kstep] - randidx = setdiff(randidx,idset[[i]]) + if(is.null(folds)) { + y <- getinfo(dall, 'label') + randidx <- sample(1 : xgb.numrow(dall)) + if (stratified & length(y) == length(randidx)) { + y <- y[randidx] + # By default assume that y is a classification label, + # and only leave it numeric for the reg:linear objective. + # WARNING: if there would be any other objectives with truly + # numerical labels, they currently would not be treated correctly! + if (param[['objective']] != 'reg:linear') y <- factor(y) + folds <- xgb.createFolds(y, nfold) + } else { + # make simple non-stratified folds + kstep <- length(randidx) %/% nfold + folds <- list() + for (i in 1:(nfold-1)) { + folds[[i]] = randidx[1:kstep] + randidx = setdiff(randidx, folds[[i]]) + } + folds[[nfold]] = randidx } - idset[[nfold]] = randidx } ret <- list() for (k in 1:nfold) { - dtest <- slice(dall, idset[[k]]) + dtest <- slice(dall, folds[[k]]) didx = c() for (i in 1:nfold) { if (i != k) { - didx <- append(didx, idset[[i]]) + didx <- append(didx, folds[[i]]) } } dtrain <- slice(dall, didx) bst <- xgb.Booster(param, list(dtrain, dtest)) watchlist = list(train=dtrain, test=dtest) - ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=idset[[k]]) + ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=folds[[k]]) } return (ret) } diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index db2ecf103..e5f5c7b72 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -50,7 +50,9 @@ #' @param feval custimized evaluation function. Returns #' \code{list(metric='metric-name', value='metric-value')} with given #' prediction and dtrain. -#' @param stratified \code{boolean}, whether sampling of folds should be stratified by the values of labels in \code{data} +#' @param stratified \code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data} +#' @param folds \code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices). +#' If folds are supplied, the nfold and stratified parameters would be ignored. #' @param verbose \code{boolean}, print the statistics during the process #' @param ... other parameters to pass to \code{params}. #' @@ -84,10 +86,16 @@ #' xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL, prediction = FALSE, showsd = TRUE, metrics=list(), - obj = NULL, feval = NULL, stratified = TRUE, verbose = T,...) { + obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T,...) { if (typeof(params) != "list") { stop("xgb.cv: first argument params must be list") } + if(!is.null(folds)) { + if(class(folds)!="list" | length(folds) < 2) { + stop("folds must be a list with 2 or more elements that are vectors of indices for each CV-fold") + } + nfold <- length(folds) + } if (nfold <= 1) { stop("nfold must be bigger than 1") } @@ -102,7 +110,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = params <- append(params, list("eval_metric"=mc)) } - folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified) + xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds) obj_type = params[['objective']] mat_pred = FALSE if (!is.null(obj_type) && obj_type=='multi:softprob') @@ -119,7 +127,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = for (i in 1:nrounds) { msg <- list() for (k in 1:nfold) { - fd <- folds[[k]] + fd <- xgb_folds[[k]] succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj) if (i% str_split("\t") %>% .[[1]] diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index 20423f76a..feee4e18f 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -6,7 +6,8 @@ \usage{ xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(), - obj = NULL, feval = NULL, stratified = TRUE, verbose = T, ...) + obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, + verbose = T, ...) } \arguments{ \item{params}{the list of parameters. Commonly used ones are: @@ -57,7 +58,10 @@ gradient with given prediction and dtrain.} \code{list(metric='metric-name', value='metric-value')} with given prediction and dtrain.} -\item{stratified}{\code{boolean}, whether sampling of folds should be stratified by the values of labels in \code{data}} +\item{stratified}{\code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data}} + +\item{folds}{\code{list} provides a possibility of using a list of pre-defined CV folds (each element must be a vector of fold's indices). +If folds are supplied, the nfold and stratified parameters would be ignored.} \item{verbose}{\code{boolean}, print the statistics during the process} From aefd234da3ed64164259254282991a251185c061 Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Tue, 7 Apr 2015 17:43:53 -0500 Subject: [PATCH 6/6] moved the external graphing packages to Suggested in order to trim the dependencies --- R-package/DESCRIPTION | 13 +++++++------ R-package/NAMESPACE | 12 ------------ R-package/R/xgb.plot.importance.R | 21 ++++++++------------- R-package/R/xgb.plot.tree.R | 9 ++++++--- 4 files changed, 21 insertions(+), 34 deletions(-) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 6de75f930..3d5319411 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -18,7 +18,12 @@ License: Apache License (== 2.0) | file LICENSE URL: https://github.com/dmlc/xgboost BugReports: https://github.com/dmlc/xgboost/issues VignetteBuilder: knitr -Suggests: knitr +Suggests: + knitr, + ggplot2 (>= 1.0.0), + DiagrammeR (>= 0.4), + Ckmeans.1d.dp (>= 3.3.1), + vcd (>= 1.3) Depends: R (>= 2.10) Imports: @@ -26,8 +31,4 @@ Imports: methods, data.table (>= 1.9.4), magrittr (>= 1.5), - stringr (>= 0.6.2), - DiagrammeR (>= 0.4), - ggplot2 (>= 1.0.0), - Ckmeans.1d.dp (>= 3.3.1), - vcd (>= 1.3) + stringr (>= 0.6.2) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index a1d296d80..d7f9e455c 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -21,8 +21,6 @@ exportMethods(predict) import(methods) importClassesFrom(Matrix,dgCMatrix) importClassesFrom(Matrix,dgeMatrix) -importFrom(Ckmeans.1d.dp,Ckmeans.1d.dp) -importFrom(DiagrammeR,mermaid) importFrom(Matrix,cBind) importFrom(Matrix,colSums) importFrom(Matrix,sparseVector) @@ -34,16 +32,6 @@ importFrom(data.table,fread) importFrom(data.table,rbindlist) importFrom(data.table,set) importFrom(data.table,setnames) -importFrom(ggplot2,aes) -importFrom(ggplot2,coord_flip) -importFrom(ggplot2,element_blank) -importFrom(ggplot2,element_text) -importFrom(ggplot2,geom_bar) -importFrom(ggplot2,ggplot) -importFrom(ggplot2,ggtitle) -importFrom(ggplot2,theme) -importFrom(ggplot2,xlab) -importFrom(ggplot2,ylab) importFrom(magrittr,"%>%") importFrom(magrittr,add) importFrom(magrittr,not) diff --git a/R-package/R/xgb.plot.importance.R b/R-package/R/xgb.plot.importance.R index 66dcadaec..3efc3fda9 100644 --- a/R-package/R/xgb.plot.importance.R +++ b/R-package/R/xgb.plot.importance.R @@ -2,17 +2,6 @@ #' #' Read a data.table containing feature importance details and plot it. #' -#' @importFrom ggplot2 ggplot -#' @importFrom ggplot2 aes -#' @importFrom ggplot2 geom_bar -#' @importFrom ggplot2 coord_flip -#' @importFrom ggplot2 xlab -#' @importFrom ggplot2 ylab -#' @importFrom ggplot2 ggtitle -#' @importFrom ggplot2 theme -#' @importFrom ggplot2 element_text -#' @importFrom ggplot2 element_blank -#' @importFrom Ckmeans.1d.dp Ckmeans.1d.dp #' @importFrom magrittr %>% #' @param importance_matrix a \code{data.table} returned by the \code{xgb.importance} function. #' @param numberOfClusters a \code{numeric} vector containing the min and the max range of the possible number of clusters of bars. @@ -44,11 +33,17 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1 if (!"data.table" %in% class(importance_matrix)) { stop("importance_matrix: Should be a data.table.") } - + if (!require(ggplot2, quietly = TRUE)) { + stop("ggplot2 package is required for plotting the importance", call. = FALSE) + } + if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) { + stop("Ckmeans.1d.dp package is required for plotting the importance", call. = FALSE) + } + # To avoid issues in clustering when co-occurences are used importance_matrix <- importance_matrix[, .(Gain = sum(Gain)), by = Feature] - clusters <- suppressWarnings(Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters)) + clusters <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters)) importance_matrix[,"Cluster":=clusters$cluster %>% as.character] plot <- ggplot(importance_matrix, aes(x=reorder(Feature, Gain), y = Gain, width= 0.05), environment = environment())+ geom_bar(aes(fill=Cluster), stat="identity", position="identity") + coord_flip() + xlab("Features") + ylab("Gain") + ggtitle("Feature importance") + theme(plot.title = element_text(lineheight=.9, face="bold"), panel.grid.major.y = element_blank() ) diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 7f7d74d57..edcd5f47f 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -15,7 +15,6 @@ #' @importFrom stringr str_split #' @importFrom stringr str_extract #' @importFrom stringr str_trim -#' @importFrom DiagrammeR mermaid #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument). #' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file. @@ -64,7 +63,11 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NU if (!class(model) %in% c("xgb.Booster", "NULL")) { stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.") } - + + if (!requireNamespace("DiagrammeR", quietly = TRUE)) { + stop("DiagrammeR package is required for xgb.plot.tree", call. = FALSE) + } + if(is.null(model)){ allTrees <- xgb.model.dt.tree(feature_names = feature_names, filename_dump = filename_dump, n_first_tree = n_first_tree) } else { @@ -85,7 +88,7 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NU no <- allTrees[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "") path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(CSSstyle, yes, no, sep = ";") - mermaid(path, width, height) + DiagrammeR::mermaid(path, width, height) } # Avoid error messages during CRAN check.