Merge remote-tracking branch 'dmlc/master'

This commit is contained in:
El Potaeto
2015-04-15 18:48:26 +02:00
118 changed files with 1305 additions and 11320 deletions

View File

@@ -18,7 +18,12 @@ License: Apache License (== 2.0) | file LICENSE
URL: https://github.com/dmlc/xgboost
BugReports: https://github.com/dmlc/xgboost/issues
VignetteBuilder: knitr
Suggests: knitr
Suggests:
knitr,
ggplot2 (>= 1.0.0),
DiagrammeR (>= 0.4),
Ckmeans.1d.dp (>= 3.3.1),
vcd (>= 1.3)
Depends:
R (>= 2.10)
Imports:
@@ -26,8 +31,4 @@ Imports:
methods,
data.table (>= 1.9.4),
magrittr (>= 1.5),
stringr (>= 0.6.2),
DiagrammeR (>= 0.4),
ggplot2 (>= 1.0.0),
Ckmeans.1d.dp (>= 3.3.1),
vcd (>= 1.3)
stringr (>= 0.6.2)

View File

@@ -21,8 +21,6 @@ exportMethods(predict)
import(methods)
importClassesFrom(Matrix,dgCMatrix)
importClassesFrom(Matrix,dgeMatrix)
importFrom(Ckmeans.1d.dp,Ckmeans.1d.dp)
importFrom(DiagrammeR,mermaid)
importFrom(Matrix,cBind)
importFrom(Matrix,colSums)
importFrom(Matrix,sparseVector)
@@ -34,16 +32,6 @@ importFrom(data.table,fread)
importFrom(data.table,rbindlist)
importFrom(data.table,set)
importFrom(data.table,setnames)
importFrom(ggplot2,aes)
importFrom(ggplot2,coord_flip)
importFrom(ggplot2,element_blank)
importFrom(ggplot2,element_text)
importFrom(ggplot2,geom_bar)
importFrom(ggplot2,ggplot)
importFrom(ggplot2,ggtitle)
importFrom(ggplot2,theme)
importFrom(ggplot2,xlab)
importFrom(ggplot2,ylab)
importFrom(magrittr,"%>%")
importFrom(magrittr,add)
importFrom(magrittr,not)

View File

@@ -36,8 +36,8 @@ xgb.setinfo <- function(dmat, name, info) {
return(TRUE)
}
if (name == "group") {
if (length(info)!=xgb.numrow(dmat))
stop("The length of groups must equal to the number of rows in the input data")
if (sum(info)!=xgb.numrow(dmat))
stop("The sum of groups must equal to the number of rows in the input data")
.Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info),
PACKAGE = "xgboost")
return(TRUE)
@@ -77,9 +77,9 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
}
# convert xgb.Booster.handle to xgb.Booster
xgb.handleToBooster <- function(handle)
xgb.handleToBooster <- function(handle, raw = NULL)
{
bst <- list(handle = handle, raw = NULL)
bst <- list(handle = handle, raw = raw)
class(bst) <- "xgb.Booster"
return(bst)
}
@@ -87,8 +87,12 @@ xgb.handleToBooster <- function(handle)
# Check whether an xgb.Booster object is complete
xgb.Booster.check <- function(bst, saveraw = TRUE)
{
if (is.null(bst$handle)) {
bst$handle <- xgb.load(bst$raw)
isnull <- is.null(bst$handle)
if (!isnull) {
isnull <- .Call("XGCheckNullPtr_R", bst$handle, PACKAGE="xgboost")
}
if (isnull) {
bst$handle <- xgb.Booster(modelfile = bst$raw)
} else {
if (is.null(bst$raw) && saveraw)
bst$raw <- xgb.save.raw(bst$handle)

View File

@@ -95,19 +95,39 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
}
folds <- xgb.cv.mknfold(dtrain, nfold, params)
predictValues <- rep(0,xgb.numrow(dtrain))
obj_type = params[['objective']]
mat_pred = FALSE
if (!is.null(obj_type) && obj_type=='multi:softprob')
{
num_class = params[['num_class']]
if (is.null(num_class))
stop('must set num_class to use softmax')
predictValues <- matrix(0,xgb.numrow(dtrain),num_class)
mat_pred = TRUE
}
else
predictValues <- rep(0,xgb.numrow(dtrain))
history <- c()
for (i in 1:nrounds) {
msg <- list()
for (k in 1:nfold) {
fd <- folds[[k]]
succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
if (!prediction){
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
if (i<nrounds) {
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
} else {
res <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval, prediction)
predictValues[fd$index] <- res[[2]]
msg[[k]] <- res[[1]] %>% str_split("\t") %>% .[[1]]
if (!prediction) {
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
} else {
res <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval, prediction)
if (mat_pred) {
pred_mat = matrix(res[[2]],num_class,length(fd$index))
predictValues[fd$index,] <- t(pred_mat)
} else {
predictValues[fd$index] <- res[[2]]
}
msg[[k]] <- res[[1]] %>% str_split("\t") %>% .[[1]]
}
}
}
ret <- xgb.cv.aggcv(msg, showsd)

View File

@@ -21,7 +21,12 @@ xgb.load <- function(modelfile) {
stop("xgb.load: modelfile cannot be NULL")
handle <- xgb.Booster(modelfile = modelfile)
bst <- xgb.handleToBooster(handle)
# re-use modelfile if it is raw so we donot need to serialize
if (typeof(modelfile) == "raw") {
bst <- xgb.handleToBooster(handle, modelfile)
} else {
bst <- xgb.handleToBooster(handle, NULL)
}
bst <- xgb.Booster.check(bst)
return(bst)
}

View File

@@ -95,7 +95,8 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
addTreeId <- function(x, i) paste(i,x,sep = "-")
allTrees <- data.table()
anynumber_regex<-"[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?"
for(i in 1:n_round){
tree <- text[(position[i]+1):(position[i+1]-1)]
@@ -115,7 +116,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
featureBranch <- feature_names[featureBranch + 1]
}
featureLeaf <- rep("Leaf", length(leaf))
splitBranch <- str_extract(branch, "<\\d*\\.*\\d*\\]") %>% str_replace("<", "") %>% str_replace("\\]", "")
splitBranch <- str_extract(branch, paste0("<",anynumber_regex,"\\]")) %>% str_replace("<", "") %>% str_replace("\\]", "")
splitLeaf <- rep(NA, length(leaf))
yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(treeID)
yesLeaf <- rep(NA, length(leaf))
@@ -123,8 +124,8 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
noLeaf <- rep(NA, length(leaf))
missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(treeID)
missingLeaf <- rep(NA, length(leaf))
qualityBranch <- extract(branch, "gain=\\d*\\.*\\d*")
qualityLeaf <- extract(leaf, "leaf=\\-*\\d*\\.*\\d*")
qualityBranch <- extract(branch, paste0("gain=",anynumber_regex))
qualityLeaf <- extract(leaf, paste0("leaf=",anynumber_regex))
coverBranch <- extract(branch, "cover=\\d*\\.*\\d*")
coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*")
dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=treeID]

View File

@@ -2,17 +2,6 @@
#'
#' Read a data.table containing feature importance details and plot it.
#'
#' @importFrom ggplot2 ggplot
#' @importFrom ggplot2 aes
#' @importFrom ggplot2 geom_bar
#' @importFrom ggplot2 coord_flip
#' @importFrom ggplot2 xlab
#' @importFrom ggplot2 ylab
#' @importFrom ggplot2 ggtitle
#' @importFrom ggplot2 theme
#' @importFrom ggplot2 element_text
#' @importFrom ggplot2 element_blank
#' @importFrom Ckmeans.1d.dp Ckmeans.1d.dp
#' @importFrom magrittr %>%
#' @param importance_matrix a \code{data.table} returned by the \code{xgb.importance} function.
#' @param numberOfClusters a \code{numeric} vector containing the min and the max range of the possible number of clusters of bars.
@@ -44,11 +33,17 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1
if (!"data.table" %in% class(importance_matrix)) {
stop("importance_matrix: Should be a data.table.")
}
if (!require(ggplot2, quietly = TRUE)) {
stop("ggplot2 package is required for plotting the importance", call. = FALSE)
}
if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) {
stop("Ckmeans.1d.dp package is required for plotting the importance", call. = FALSE)
}
# To avoid issues in clustering when co-occurences are used
importance_matrix <- importance_matrix[, .(Gain = sum(Gain)), by = Feature]
clusters <- suppressWarnings(Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters))
clusters <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters))
importance_matrix[,"Cluster":=clusters$cluster %>% as.character]
plot <- ggplot(importance_matrix, aes(x=reorder(Feature, Gain), y = Gain, width= 0.05), environment = environment())+ geom_bar(aes(fill=Cluster), stat="identity", position="identity") + coord_flip() + xlab("Features") + ylab("Gain") + ggtitle("Feature importance") + theme(plot.title = element_text(lineheight=.9, face="bold"), panel.grid.major.y = element_blank() )

View File

@@ -15,7 +15,6 @@
#' @importFrom stringr str_split
#' @importFrom stringr str_extract
#' @importFrom stringr str_trim
#' @importFrom DiagrammeR mermaid
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).
#' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file.
@@ -64,7 +63,11 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NU
if (!class(model) %in% c("xgb.Booster", "NULL")) {
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
}
if (!requireNamespace("DiagrammeR", quietly = TRUE)) {
stop("DiagrammeR package is required for xgb.plot.tree", call. = FALSE)
}
if(is.null(model)){
allTrees <- xgb.model.dt.tree(feature_names = feature_names, filename_dump = filename_dump, n_first_tree = n_first_tree)
} else {
@@ -85,7 +88,7 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NU
no <- allTrees[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "")
path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(CSSstyle, yes, no, sep = ";")
mermaid(path, width, height)
DiagrammeR::mermaid(path, width, height)
}
# Avoid error messages during CRAN check.

View File

@@ -4,4 +4,5 @@ PKGROOT=../../
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT)
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o $(PKGROOT)/src/io/dmlc_simple.o

View File

@@ -15,5 +15,5 @@ xgblib:
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT) -I../..
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o $(PKGROOT)/src/io/dmlc_simple.o
$(OBJECTS) : xgblib

View File

@@ -59,6 +59,9 @@ inline void _WrapperEnd(void) {
}
extern "C" {
SEXP XGCheckNullPtr_R(SEXP handle) {
return ScalarLogical(R_ExternalPtrAddr(handle) == NULL);
}
void _DMatrixFinalizer(SEXP ext) {
if (R_ExternalPtrAddr(ext) == NULL) return;
XGDMatrixFree(R_ExternalPtrAddr(ext));

View File

@@ -11,6 +11,12 @@ extern "C" {
}
extern "C" {
/*!
* \brief check whether a handle is NULL
* \param handle
* \return whether it is null ptr
*/
SEXP XGCheckNullPtr_R(SEXP handle);
/*!
* \brief load a data matrix
* \param fname name of the content