Merge remote-tracking branch 'dmlc/master'
This commit is contained in:
@@ -18,7 +18,12 @@ License: Apache License (== 2.0) | file LICENSE
|
||||
URL: https://github.com/dmlc/xgboost
|
||||
BugReports: https://github.com/dmlc/xgboost/issues
|
||||
VignetteBuilder: knitr
|
||||
Suggests: knitr
|
||||
Suggests:
|
||||
knitr,
|
||||
ggplot2 (>= 1.0.0),
|
||||
DiagrammeR (>= 0.4),
|
||||
Ckmeans.1d.dp (>= 3.3.1),
|
||||
vcd (>= 1.3)
|
||||
Depends:
|
||||
R (>= 2.10)
|
||||
Imports:
|
||||
@@ -26,8 +31,4 @@ Imports:
|
||||
methods,
|
||||
data.table (>= 1.9.4),
|
||||
magrittr (>= 1.5),
|
||||
stringr (>= 0.6.2),
|
||||
DiagrammeR (>= 0.4),
|
||||
ggplot2 (>= 1.0.0),
|
||||
Ckmeans.1d.dp (>= 3.3.1),
|
||||
vcd (>= 1.3)
|
||||
stringr (>= 0.6.2)
|
||||
|
||||
@@ -21,8 +21,6 @@ exportMethods(predict)
|
||||
import(methods)
|
||||
importClassesFrom(Matrix,dgCMatrix)
|
||||
importClassesFrom(Matrix,dgeMatrix)
|
||||
importFrom(Ckmeans.1d.dp,Ckmeans.1d.dp)
|
||||
importFrom(DiagrammeR,mermaid)
|
||||
importFrom(Matrix,cBind)
|
||||
importFrom(Matrix,colSums)
|
||||
importFrom(Matrix,sparseVector)
|
||||
@@ -34,16 +32,6 @@ importFrom(data.table,fread)
|
||||
importFrom(data.table,rbindlist)
|
||||
importFrom(data.table,set)
|
||||
importFrom(data.table,setnames)
|
||||
importFrom(ggplot2,aes)
|
||||
importFrom(ggplot2,coord_flip)
|
||||
importFrom(ggplot2,element_blank)
|
||||
importFrom(ggplot2,element_text)
|
||||
importFrom(ggplot2,geom_bar)
|
||||
importFrom(ggplot2,ggplot)
|
||||
importFrom(ggplot2,ggtitle)
|
||||
importFrom(ggplot2,theme)
|
||||
importFrom(ggplot2,xlab)
|
||||
importFrom(ggplot2,ylab)
|
||||
importFrom(magrittr,"%>%")
|
||||
importFrom(magrittr,add)
|
||||
importFrom(magrittr,not)
|
||||
|
||||
@@ -36,8 +36,8 @@ xgb.setinfo <- function(dmat, name, info) {
|
||||
return(TRUE)
|
||||
}
|
||||
if (name == "group") {
|
||||
if (length(info)!=xgb.numrow(dmat))
|
||||
stop("The length of groups must equal to the number of rows in the input data")
|
||||
if (sum(info)!=xgb.numrow(dmat))
|
||||
stop("The sum of groups must equal to the number of rows in the input data")
|
||||
.Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info),
|
||||
PACKAGE = "xgboost")
|
||||
return(TRUE)
|
||||
@@ -77,9 +77,9 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
|
||||
}
|
||||
|
||||
# convert xgb.Booster.handle to xgb.Booster
|
||||
xgb.handleToBooster <- function(handle)
|
||||
xgb.handleToBooster <- function(handle, raw = NULL)
|
||||
{
|
||||
bst <- list(handle = handle, raw = NULL)
|
||||
bst <- list(handle = handle, raw = raw)
|
||||
class(bst) <- "xgb.Booster"
|
||||
return(bst)
|
||||
}
|
||||
@@ -87,8 +87,12 @@ xgb.handleToBooster <- function(handle)
|
||||
# Check whether an xgb.Booster object is complete
|
||||
xgb.Booster.check <- function(bst, saveraw = TRUE)
|
||||
{
|
||||
if (is.null(bst$handle)) {
|
||||
bst$handle <- xgb.load(bst$raw)
|
||||
isnull <- is.null(bst$handle)
|
||||
if (!isnull) {
|
||||
isnull <- .Call("XGCheckNullPtr_R", bst$handle, PACKAGE="xgboost")
|
||||
}
|
||||
if (isnull) {
|
||||
bst$handle <- xgb.Booster(modelfile = bst$raw)
|
||||
} else {
|
||||
if (is.null(bst$raw) && saveraw)
|
||||
bst$raw <- xgb.save.raw(bst$handle)
|
||||
|
||||
@@ -95,19 +95,39 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
|
||||
}
|
||||
|
||||
folds <- xgb.cv.mknfold(dtrain, nfold, params)
|
||||
predictValues <- rep(0,xgb.numrow(dtrain))
|
||||
obj_type = params[['objective']]
|
||||
mat_pred = FALSE
|
||||
if (!is.null(obj_type) && obj_type=='multi:softprob')
|
||||
{
|
||||
num_class = params[['num_class']]
|
||||
if (is.null(num_class))
|
||||
stop('must set num_class to use softmax')
|
||||
predictValues <- matrix(0,xgb.numrow(dtrain),num_class)
|
||||
mat_pred = TRUE
|
||||
}
|
||||
else
|
||||
predictValues <- rep(0,xgb.numrow(dtrain))
|
||||
history <- c()
|
||||
for (i in 1:nrounds) {
|
||||
msg <- list()
|
||||
for (k in 1:nfold) {
|
||||
fd <- folds[[k]]
|
||||
succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
|
||||
if (!prediction){
|
||||
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
|
||||
if (i<nrounds) {
|
||||
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
|
||||
} else {
|
||||
res <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval, prediction)
|
||||
predictValues[fd$index] <- res[[2]]
|
||||
msg[[k]] <- res[[1]] %>% str_split("\t") %>% .[[1]]
|
||||
if (!prediction) {
|
||||
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
|
||||
} else {
|
||||
res <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval, prediction)
|
||||
if (mat_pred) {
|
||||
pred_mat = matrix(res[[2]],num_class,length(fd$index))
|
||||
predictValues[fd$index,] <- t(pred_mat)
|
||||
} else {
|
||||
predictValues[fd$index] <- res[[2]]
|
||||
}
|
||||
msg[[k]] <- res[[1]] %>% str_split("\t") %>% .[[1]]
|
||||
}
|
||||
}
|
||||
}
|
||||
ret <- xgb.cv.aggcv(msg, showsd)
|
||||
|
||||
@@ -21,7 +21,12 @@ xgb.load <- function(modelfile) {
|
||||
stop("xgb.load: modelfile cannot be NULL")
|
||||
|
||||
handle <- xgb.Booster(modelfile = modelfile)
|
||||
bst <- xgb.handleToBooster(handle)
|
||||
# re-use modelfile if it is raw so we donot need to serialize
|
||||
if (typeof(modelfile) == "raw") {
|
||||
bst <- xgb.handleToBooster(handle, modelfile)
|
||||
} else {
|
||||
bst <- xgb.handleToBooster(handle, NULL)
|
||||
}
|
||||
bst <- xgb.Booster.check(bst)
|
||||
return(bst)
|
||||
}
|
||||
|
||||
@@ -95,7 +95,8 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
|
||||
addTreeId <- function(x, i) paste(i,x,sep = "-")
|
||||
|
||||
allTrees <- data.table()
|
||||
|
||||
|
||||
anynumber_regex<-"[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?"
|
||||
for(i in 1:n_round){
|
||||
|
||||
tree <- text[(position[i]+1):(position[i+1]-1)]
|
||||
@@ -115,7 +116,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
|
||||
featureBranch <- feature_names[featureBranch + 1]
|
||||
}
|
||||
featureLeaf <- rep("Leaf", length(leaf))
|
||||
splitBranch <- str_extract(branch, "<\\d*\\.*\\d*\\]") %>% str_replace("<", "") %>% str_replace("\\]", "")
|
||||
splitBranch <- str_extract(branch, paste0("<",anynumber_regex,"\\]")) %>% str_replace("<", "") %>% str_replace("\\]", "")
|
||||
splitLeaf <- rep(NA, length(leaf))
|
||||
yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(treeID)
|
||||
yesLeaf <- rep(NA, length(leaf))
|
||||
@@ -123,8 +124,8 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
|
||||
noLeaf <- rep(NA, length(leaf))
|
||||
missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(treeID)
|
||||
missingLeaf <- rep(NA, length(leaf))
|
||||
qualityBranch <- extract(branch, "gain=\\d*\\.*\\d*")
|
||||
qualityLeaf <- extract(leaf, "leaf=\\-*\\d*\\.*\\d*")
|
||||
qualityBranch <- extract(branch, paste0("gain=",anynumber_regex))
|
||||
qualityLeaf <- extract(leaf, paste0("leaf=",anynumber_regex))
|
||||
coverBranch <- extract(branch, "cover=\\d*\\.*\\d*")
|
||||
coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*")
|
||||
dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=treeID]
|
||||
|
||||
@@ -2,17 +2,6 @@
|
||||
#'
|
||||
#' Read a data.table containing feature importance details and plot it.
|
||||
#'
|
||||
#' @importFrom ggplot2 ggplot
|
||||
#' @importFrom ggplot2 aes
|
||||
#' @importFrom ggplot2 geom_bar
|
||||
#' @importFrom ggplot2 coord_flip
|
||||
#' @importFrom ggplot2 xlab
|
||||
#' @importFrom ggplot2 ylab
|
||||
#' @importFrom ggplot2 ggtitle
|
||||
#' @importFrom ggplot2 theme
|
||||
#' @importFrom ggplot2 element_text
|
||||
#' @importFrom ggplot2 element_blank
|
||||
#' @importFrom Ckmeans.1d.dp Ckmeans.1d.dp
|
||||
#' @importFrom magrittr %>%
|
||||
#' @param importance_matrix a \code{data.table} returned by the \code{xgb.importance} function.
|
||||
#' @param numberOfClusters a \code{numeric} vector containing the min and the max range of the possible number of clusters of bars.
|
||||
@@ -44,11 +33,17 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1
|
||||
if (!"data.table" %in% class(importance_matrix)) {
|
||||
stop("importance_matrix: Should be a data.table.")
|
||||
}
|
||||
|
||||
if (!require(ggplot2, quietly = TRUE)) {
|
||||
stop("ggplot2 package is required for plotting the importance", call. = FALSE)
|
||||
}
|
||||
if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) {
|
||||
stop("Ckmeans.1d.dp package is required for plotting the importance", call. = FALSE)
|
||||
}
|
||||
|
||||
# To avoid issues in clustering when co-occurences are used
|
||||
importance_matrix <- importance_matrix[, .(Gain = sum(Gain)), by = Feature]
|
||||
|
||||
clusters <- suppressWarnings(Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters))
|
||||
clusters <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters))
|
||||
importance_matrix[,"Cluster":=clusters$cluster %>% as.character]
|
||||
|
||||
plot <- ggplot(importance_matrix, aes(x=reorder(Feature, Gain), y = Gain, width= 0.05), environment = environment())+ geom_bar(aes(fill=Cluster), stat="identity", position="identity") + coord_flip() + xlab("Features") + ylab("Gain") + ggtitle("Feature importance") + theme(plot.title = element_text(lineheight=.9, face="bold"), panel.grid.major.y = element_blank() )
|
||||
|
||||
@@ -15,7 +15,6 @@
|
||||
#' @importFrom stringr str_split
|
||||
#' @importFrom stringr str_extract
|
||||
#' @importFrom stringr str_trim
|
||||
#' @importFrom DiagrammeR mermaid
|
||||
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
|
||||
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).
|
||||
#' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file.
|
||||
@@ -64,7 +63,11 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NU
|
||||
if (!class(model) %in% c("xgb.Booster", "NULL")) {
|
||||
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
|
||||
}
|
||||
|
||||
|
||||
if (!requireNamespace("DiagrammeR", quietly = TRUE)) {
|
||||
stop("DiagrammeR package is required for xgb.plot.tree", call. = FALSE)
|
||||
}
|
||||
|
||||
if(is.null(model)){
|
||||
allTrees <- xgb.model.dt.tree(feature_names = feature_names, filename_dump = filename_dump, n_first_tree = n_first_tree)
|
||||
} else {
|
||||
@@ -85,7 +88,7 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NU
|
||||
no <- allTrees[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "")
|
||||
|
||||
path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(CSSstyle, yes, no, sep = ";")
|
||||
mermaid(path, width, height)
|
||||
DiagrammeR::mermaid(path, width, height)
|
||||
}
|
||||
|
||||
# Avoid error messages during CRAN check.
|
||||
|
||||
@@ -4,4 +4,5 @@ PKGROOT=../../
|
||||
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT)
|
||||
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
|
||||
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
|
||||
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o
|
||||
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o $(PKGROOT)/src/io/dmlc_simple.o
|
||||
|
||||
|
||||
@@ -15,5 +15,5 @@ xgblib:
|
||||
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT) -I../..
|
||||
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
|
||||
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
|
||||
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o
|
||||
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o $(PKGROOT)/src/io/dmlc_simple.o
|
||||
$(OBJECTS) : xgblib
|
||||
|
||||
@@ -59,6 +59,9 @@ inline void _WrapperEnd(void) {
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
SEXP XGCheckNullPtr_R(SEXP handle) {
|
||||
return ScalarLogical(R_ExternalPtrAddr(handle) == NULL);
|
||||
}
|
||||
void _DMatrixFinalizer(SEXP ext) {
|
||||
if (R_ExternalPtrAddr(ext) == NULL) return;
|
||||
XGDMatrixFree(R_ExternalPtrAddr(ext));
|
||||
|
||||
@@ -11,6 +11,12 @@ extern "C" {
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
/*!
|
||||
* \brief check whether a handle is NULL
|
||||
* \param handle
|
||||
* \return whether it is null ptr
|
||||
*/
|
||||
SEXP XGCheckNullPtr_R(SEXP handle);
|
||||
/*!
|
||||
* \brief load a data matrix
|
||||
* \param fname name of the content
|
||||
|
||||
Reference in New Issue
Block a user