Merge branch 'master' into unity

Commit c639efc71b

.gitignore (vendored): 2 changes
@@ -52,4 +52,6 @@ xgboost.mpi
xgboost.mock
train*
rabit
.Rbuildignore
R-package.Rproj
Makefile: 12 changes

@@ -75,15 +75,17 @@ Rpack:
cd subtree/rabit;make clean;cd ..
rm -rf xgboost xgboost*.tar.gz
cp -r R-package xgboost
rm -rf xgboost/inst/examples/*.buffer
rm -rf xgboost/inst/examples/*.model
rm -rf xgboost/inst/examples/dump*
rm -rf xgboost/src/*.o xgboost/src/*.so xgboost/src/*.dll
rm -rf xgboost/src/*/*.o
rm -rf subtree/rabit/src/*.o
rm -rf xgboost/demo/*.model xgboost/demo/*.buffer xgboost/demo/*.txt
rm -rf xgboost/demo/runall.R
cp -r src xgboost/src/src
cp -r subtree xgboost/src/subtree
mkdir xgboost/src/subtree
mkdir xgboost/src/subtree/rabit
cp -r subtree/rabit/include xgboost/src/subtree/rabit/include
cp -r subtree/rabit/src xgboost/src/subtree/rabit/src
rm -rf xgboost/src/subtree/rabit/src/*.o
mkdir xgboost/src/wrapper
cp wrapper/xgboost_wrapper.h xgboost/src/wrapper
cp wrapper/xgboost_wrapper.cpp xgboost/src/wrapper

@@ -95,5 +97,5 @@ Rpack:
R CMD check --as-cran xgboost*.tar.gz

clean:
$(RM) $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~
$(RM) -rf $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~
cd subtree/rabit; make clean; cd ..
@@ -3,7 +3,7 @@ Type: Package
Title: eXtreme Gradient Boosting
Version: 0.3-3
Date: 2014-12-28
Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>
Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>, Michaël Benesty <michael@benesty.fr>
Maintainer: Tong He <hetong007@gmail.com>
Description: This package is a R wrapper of xgboost, which is short for eXtreme
Gradient Boosting. It is an efficient and scalable implementation of

@@ -17,13 +17,16 @@ Description: This package is a R wrapper of xgboost, which is short for eXtreme
License: Apache License (== 2.0) | file LICENSE
URL: https://github.com/tqchen/xgboost
BugReports: https://github.com/tqchen/xgboost/issues
VignetteBuilder: knitr
Suggests: knitr
Depends:
R (>= 2.10)
Imports:
Matrix (>= 1.1-0),
methods,
data.table (>= 1.9),
data.table (>= 1.9.4),
magrittr (>= 1.5),
stringr,
DiagrammeR,
vcd
stringr (>= 0.6.2),
DiagrammeR (>= 0.4),
ggplot2 (>= 1.0.0),
Ckmeans.1d.dp (>= 3.3.0)
@@ -10,22 +10,36 @@ export(xgb.dump)
export(xgb.importance)
export(xgb.load)
export(xgb.model.dt.tree)
export(xgb.plot.importance)
export(xgb.plot.tree)
export(xgb.save)
export(xgb.save.raw)
export(xgb.train)
export(xgboost)
exportMethods(predict)
import(methods)
importClassesFrom(Matrix,dgCMatrix)
importClassesFrom(Matrix,dgeMatrix)
importFrom(DiagrammeR,DiagrammeR)
importFrom(Ckmeans.1d.dp,Ckmeans.1d.dp)
importFrom(DiagrammeR,mermaid)
importFrom(data.table,":=")
importFrom(data.table,as.data.table)
importFrom(data.table,copy)
importFrom(data.table,data.table)
importFrom(data.table,fread)
importFrom(data.table,rbindlist)
importFrom(data.table,set)
importFrom(data.table,setnames)
importFrom(ggplot2,aes)
importFrom(ggplot2,coord_flip)
importFrom(ggplot2,element_blank)
importFrom(ggplot2,element_text)
importFrom(ggplot2,geom_bar)
importFrom(ggplot2,ggplot)
importFrom(ggplot2,ggtitle)
importFrom(ggplot2,theme)
importFrom(ggplot2,xlab)
importFrom(ggplot2,ylab)
importFrom(magrittr,"%>%")
importFrom(magrittr,add)
importFrom(magrittr,not)
@@ -32,10 +32,15 @@ setMethod("getinfo", signature = "xgb.DMatrix",
if (class(object) != "xgb.DMatrix") {
stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix")
}
if (name != "label" && name != "weight" && name != "base_margin") {
if (name != "label" && name != "weight" &&
name != "base_margin" && name != "nrow") {
stop(paste("xgb.getinfo: unknown info name", name))
}
ret <- .Call("XGDMatrixGetInfo_R", object, name, PACKAGE = "xgboost")
if (name != "nrow"){
ret <- .Call("XGDMatrixGetInfo_R", object, name, PACKAGE = "xgboost")
} else {
ret <- xgb.numrow(object)
}
return(ret)
})
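A minimal usage sketch of the change above, assuming the exported `getinfo()` generic: "nrow" is now an accepted info name and is answered through `xgb.numrow()` rather than the C-level getter.

```r
library(xgboost)
data(agaricus.train, package = "xgboost")
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)

getinfo(dtrain, "label")[1:5]  # existing info fields behave as before
getinfo(dtrain, "nrow")        # new: returns the number of rows of the DMatrix
```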
@@ -1,4 +1,7 @@
setClass("xgb.Booster")
setClass("xgb.Booster.handle")
setClass("xgb.Booster",
slots = c(handle = "xgb.Booster.handle",
raw = "raw"))

#' Predict method for eXtreme Gradient Boosting model
#'

@@ -7,14 +10,16 @@ setClass("xgb.Booster")
#' @param object Object of class "xgb.Boost"
#' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or
#' \code{xgb.DMatrix}.
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' @param outputmargin whether the prediction should be shown in the original
#' value of sum of functions, when outputmargin=TRUE, the prediction is
#' untransformed margin value. In logistic regression, outputmargin=T will
#' output value before logistic transformation.
#' @param predleaf whether predict leaf index instead
#' @param ntreelimit limit number of trees used in prediction, this parameter is
#' only valid for gbtree, but not for gblinear. set it to be value bigger
#' than 0. It will use all trees by default.
#' @param predleaf whether predict leaf index instead. If set to TRUE, the output will be a matrix object.
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')

@@ -26,7 +31,13 @@ setClass("xgb.Booster")
#' @export
#'
setMethod("predict", signature = "xgb.Booster",
definition = function(object, newdata, missing = NULL, outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE) {
definition = function(object, newdata, missing = NULL,
outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE) {
if (class(object) != "xgb.Booster"){
stop("predict: model in prediction must be of class xgb.Booster")
} else {
object <- xgb.Booster.check(object, saveraw = FALSE)
}
if (class(newdata) != "xgb.DMatrix") {
if (is.null(missing)) {
newdata <- xgb.DMatrix(newdata)

@@ -48,7 +59,17 @@ setMethod("predict", signature = "xgb.Booster",
if (predleaf) {
option <- option + 2
}
ret <- .Call("XGBoosterPredict_R", object, newdata, as.integer(option), as.integer(ntreelimit), PACKAGE = "xgboost")
ret <- .Call("XGBoosterPredict_R", object$handle, newdata, as.integer(option),
as.integer(ntreelimit), PACKAGE = "xgboost")
if (predleaf){
len <- getinfo(newdata, "nrow")
if (length(ret) == len){
ret <- matrix(ret,ncol = 1)
} else {
ret <- matrix(ret, ncol = len)
ret <- t(ret)
}
}
return(ret)
})
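A small usage sketch of the reshaping logic added above: with `predleaf = TRUE` the raw prediction vector is folded into a matrix with one row per observation and one column per tree. Parameter names follow the old-style API used throughout this diff.

```r
library(xgboost)
data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")

bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nround = 2, objective = "binary:logistic")

# leaf indices instead of probabilities or margins
leaf_idx <- predict(bst, agaricus.test$data, predleaf = TRUE)
dim(leaf_idx)  # one row per test observation, one column per tree used
```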
R-package/R/predict.xgb.Booster.handle.R (new file, 19 lines)

@@ -0,0 +1,19 @@
#' Predict method for eXtreme Gradient Boosting model handle
#'
#' Predicted values based on xgb.Booster.handle object.
#'
#' @param object Object of class "xgb.Boost.handle"
#' @param ... Parameters pass to \code{predict.xgb.Booster}
#'
setMethod("predict", signature = "xgb.Booster.handle",
definition = function(object, ...) {
if (class(object) != "xgb.Booster.handle"){
stop("predict: model in prediction must be of class xgb.Booster.handle")
}

bst <- xgb.handleToBooster(object)

ret = predict(bst, ...)
return(ret)
})
@@ -28,6 +28,18 @@ setMethod("slice", signature = "xgb.DMatrix",
if (class(object) != "xgb.DMatrix") {
stop("slice: first argument dtrain must be xgb.DMatrix")
}
ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, PACKAGE = "xgboost")
ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset,
PACKAGE = "xgboost")

attr_list <- attributes(object)
nr <- xgb.numrow(object)
len <- sapply(attr_list,length)
ind <- which(len==nr)
if (length(ind)>0) {
nms <- names(attr_list)[ind]
for (i in 1:length(ind)) {
attr(ret,nms[i]) <- attr(object,nms[i])[idxset]
}
}
return(structure(ret, class = "xgb.DMatrix"))
})
@@ -57,12 +57,35 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
}
}
if (!is.null(modelfile)) {
if (typeof(modelfile) != "character") {
stop("xgb.Booster: modelfile must be character")
if (typeof(modelfile) == "character") {
.Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE = "xgboost")
} else if (typeof(modelfile) == "raw") {
.Call("XGBoosterLoadModelFromRaw_R", handle, modelfile, PACKAGE = "xgboost")
} else {
stop("xgb.Booster: modelfile must be character or raw vector")
}
.Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE = "xgboost")
}
return(structure(handle, class = "xgb.Booster"))
return(structure(handle, class = "xgb.Booster.handle"))
}

# convert xgb.Booster.handle to xgb.Booster
xgb.handleToBooster <- function(handle)
{
bst <- list(handle = handle, raw = NULL)
class(bst) <- "xgb.Booster"
return(bst)
}

# Check whether an xgb.Booster object is complete
xgb.Booster.check <- function(bst, saveraw = TRUE)
{
if (is.null(bst$handle)) {
bst$handle <- xgb.load(bst$raw)
} else {
if (is.null(bst$raw) && saveraw)
bst$raw <- xgb.save.raw(bst$handle)
}
return(bst)
}

## ----the following are low level iteratively function, not needed if
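A hedged sketch of the new object split, using the non-exported helpers shown above (so the calls go through the package namespace): `xgb.Booster()` now hands back a bare `xgb.Booster.handle`, `xgb.handleToBooster()` wraps it into the list-based `xgb.Booster`, and `xgb.Booster.check()` fills the `$raw` slot so the model survives serialization.

```r
library(xgboost)
data(agaricus.train, package = "xgboost")
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)

handle <- xgboost:::xgb.Booster(params = list(), cachelist = list(dtrain))
class(handle)                                  # "xgb.Booster.handle"

bst <- xgboost:::xgb.handleToBooster(handle)   # list(handle = ..., raw = NULL)
bst <- xgboost:::xgb.Booster.check(bst)        # caches the raw model bytes in bst$raw
class(bst)                                     # "xgb.Booster"
```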
@@ -99,8 +122,8 @@ xgb.numrow <- function(dmat) {
}
# iteratively update booster with customized statistics
xgb.iter.boost <- function(booster, dtrain, gpair) {
if (class(booster) != "xgb.Booster") {
stop("xgb.iter.update: first argument must be type xgb.Booster")
if (class(booster) != "xgb.Booster.handle") {
stop("xgb.iter.update: first argument must be type xgb.Booster.handle")
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.iter.update: second argument must be type xgb.DMatrix")

@@ -112,8 +135,8 @@ xgb.iter.boost <- function(booster, dtrain, gpair) {

# iteratively update booster with dtrain
xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) {
if (class(booster) != "xgb.Booster") {
stop("xgb.iter.update: first argument must be type xgb.Booster")
if (class(booster) != "xgb.Booster.handle") {
stop("xgb.iter.update: first argument must be type xgb.Booster.handle")
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.iter.update: second argument must be type xgb.DMatrix")

@@ -131,8 +154,8 @@ xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) {
}

# iteratively evaluate one iteration
xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
if (class(booster) != "xgb.Booster") {
xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = FALSE) {
if (class(booster) != "xgb.Booster.handle") {
stop("xgb.eval: first argument must be type xgb.Booster")
}
if (typeof(watchlist) != "list") {
@@ -169,18 +192,27 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
} else {
msg <- ""
}
if (prediction){
preds <- predict(booster,watchlist[[2]])
return(list(msg,preds))
}
return(msg)
}
}
#------------------------------------------
# helper functions for cross validation
#
xgb.cv.mknfold <- function(dall, nfold, param) {
randidx <- sample(1 : xgb.numrow(dall))
kstep <- length(randidx) / nfold
idset <- list()
for (i in 1:nfold) {
idset[[i]] <- randidx[ ((i-1) * kstep + 1) : min(i * kstep, length(randidx)) ]
if (nfold <= 1) {
stop("nfold must be bigger than 1")
}
randidx <- sample(1 : xgb.numrow(dall))
kstep <- length(randidx) %/% nfold
idset <- list()
for (i in 1:(nfold-1)) {
idset[[i]] = randidx[1:kstep]
randidx = setdiff(randidx,idset[[i]])
}
idset[[nfold]] = randidx
ret <- list()
for (k in 1:nfold) {
dtest <- slice(dall, idset[[k]])

@@ -193,7 +225,7 @@ xgb.cv.mknfold <- function(dall, nfold, param) {
dtrain <- slice(dall, didx)
bst <- xgb.Booster(param, list(dtrain, dtest))
watchlist = list(train=dtrain, test=dtest)
ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist)
ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=idset[[k]])
}
return (ret)
}
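A plain-R illustration of the revised fold construction above (no xgboost calls, hypothetical sizes): integer division plus `setdiff()` yields disjoint folds that together cover every row, with the remainder absorbed by the last fold.

```r
n <- 103; nfold <- 5
randidx <- sample(1:n)
kstep <- length(randidx) %/% nfold

idset <- list()
for (i in 1:(nfold - 1)) {
  idset[[i]] <- randidx[1:kstep]           # take the next kstep indices
  randidx <- setdiff(randidx, idset[[i]])  # and remove them from the pool
}
idset[[nfold]] <- randidx                  # last fold gets the remainder

stopifnot(sum(lengths(idset)) == n,                # every row is used
          anyDuplicated(unlist(idset)) == 0)       # and used exactly once
```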
@@ -6,7 +6,7 @@
#' indicating the data file.
#' @param info a list of information of the xgb.DMatrix object
#' @param missing Missing is only used when input is dense matrix, pick a float
# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#
#' @param ... other information to pass to \code{info}.
#'

@@ -31,6 +31,9 @@
#' @param nrounds the max number of iterations
#' @param nfold number of folds used
#' @param label option field, when data is Matrix
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' @param prediction A logical value indicating whether to return the prediction vector.
#' @param showsd \code{boolean}, whether show standard deviation of cross validation
#' @param metrics, list of evaluation metrics to be used in corss validation,
#' when it is not specified, the evaluation metric is chosen according to objective function.

@@ -47,8 +50,6 @@
#' @param feval custimized evaluation function. Returns
#' \code{list(metric='metric-name', value='metric-value')} with given
#' prediction and dtrain,
#' @param missing Missing is only used when input is dense matrix, pick a float
# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' @param verbose \code{boolean}, print the statistics during the process.
#' @param ... other parameters to pass to \code{params}.
#'

@@ -71,7 +72,8 @@
#' @export
#'
xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL,
showsd = TRUE, metrics=list(), obj = NULL, feval = NULL, verbose = T,...) {
prediction = FALSE, showsd = TRUE, metrics=list(),
obj = NULL, feval = NULL, verbose = T,...) {
if (typeof(params) != "list") {
stop("xgb.cv: first argument params must be list")
}

@@ -90,13 +92,20 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
}

folds <- xgb.cv.mknfold(dtrain, nfold, params)
predictValues <- rep(0,xgb.numrow(dtrain))
history <- c()
for (i in 1:nrounds) {
msg <- list()
for (k in 1:nfold) {
fd <- folds[[k]]
succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
if (!prediction){
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
} else {
res <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval, prediction)
predictValues[fd$index] <- res[[2]]
msg[[k]] <- res[[1]] %>% str_split("\t") %>% .[[1]]
}
}
ret <- xgb.cv.aggcv(msg, showsd)
history <- c(history, ret)

@@ -105,15 +114,25 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =

colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace("-", ".")
colnamesMean <- paste(colnames, "mean")
colnamesStd <- paste(colnames, "std")
if(showsd) colnamesStd <- paste(colnames, "std")

colnames <- c()
for(i in 1:length(colnamesMean)) colnames <- c(colnames, colnamesMean[i], colnamesStd[i])
if(showsd) for(i in 1:length(colnamesMean)) colnames <- c(colnames, colnamesMean[i], colnamesStd[i])
else colnames <- colnamesMean

type <- rep(x = "numeric", times = length(colnames))
dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table
split <- str_split(string = history, pattern = "\t")

for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.list %>% {vec <- .; rbindlist(list(dt, vec), use.names = F, fill = F)}
dt
}

if (prediction) {
return(list(dt = dt,pred = predictValues))
}
return(dt)
}

# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(".")
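A short sketch of the new return shape (the cross_validation demo further down exercises the same option): with `prediction = TRUE`, `xgb.cv()` returns the evaluation data.table together with one out-of-fold prediction per training row, stitched together via the per-fold `index` recorded in `xgb.cv.mknfold()`.

```r
library(xgboost)
data(agaricus.train, package = "xgboost")
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)

param <- list(max.depth = 2, eta = 1, silent = 1, objective = "binary:logistic")
res <- xgb.cv(param, dtrain, nrounds = 5, nfold = 5, prediction = TRUE)

res$dt            # per-round evaluation metrics (mean, plus std when showsd = TRUE)
length(res$pred)  # one out-of-fold prediction per row of dtrain
```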
@@ -3,8 +3,10 @@
#' Save a xgboost model to text file. Could be parsed later.
#'
#' @importFrom magrittr %>%
#' @importFrom stringr str_split
#' @importFrom stringr str_replace
#' @importFrom data.table fread
#' @importFrom data.table :=
#' @importFrom data.table setnames
#' @param model the model object.
#' @param fname the name of the text file where to save the model text dump. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.
#' @param fmap feature map file representing the type of feature.

@@ -29,7 +31,7 @@
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
#' # save the model in file 'xgb.model.dump'
#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#' xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)
#'
#' # print the model without saving it to a file
#' print(xgb.dump(bst))

@@ -38,6 +40,8 @@
xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) {
if (class(model) != "xgb.Booster") {
stop("model: argument must be type xgb.Booster")
} else {
model <- xgb.Booster.check(model)
}
if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1)) {
stop("fname: argument must be type character (when provided)")

@@ -46,12 +50,22 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) {
stop("fmap: argument must be type character (when provided)")
}

result <- .Call("XGBoosterDumpModel_R", model, fmap, as.integer(with.stats), PACKAGE = "xgboost")
longString <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with.stats), PACKAGE = "xgboost")

dt <- fread(paste(longString, collapse = ""), sep = "\n", header = F)

setnames(dt, "Lines")

if(is.null(fname)) {
return(str_split(result, "\n") %>% unlist %>% str_replace("^\t+","") %>% Filter(function(x) x != "", .))
result <- dt[Lines != "0"][, Lines := str_replace(Lines, "^\t+", "")][Lines != ""][, paste(Lines)]
return(result)
} else {
result %>% str_split("\n") %>% unlist %>% Filter(function(x) x != "", .) %>% writeLines(fname)
result <- dt[Lines != "0"][Lines != ""][, paste(Lines)] %>% writeLines(fname)
return(TRUE)
}
}
}

# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(c("Lines", "."))
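A quick sketch of the two calling modes after the rewrite above: with `fname = NULL` the dump comes back as a character vector (now assembled through data.table), otherwise it is written to disk and `TRUE` is returned.

```r
library(xgboost)
data(agaricus.train, package = "xgboost")
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nround = 2, objective = "binary:logistic")

dump_lines <- xgb.dump(bst, with.stats = TRUE)  # character vector, one entry per node
head(dump_lines)

xgb.dump(bst, fname = "xgb.model.dump", with.stats = TRUE)  # writes the file, returns TRUE
```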
@@ -32,7 +32,8 @@
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#'
#' #Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
#' #Both dataset are list with two items, a sparse matrix and labels
#' #(labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#' test <- agaricus.test

@@ -72,11 +73,16 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N
}

treeDump <- function(feature_names, text){
result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[Feature!="Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N), by = Feature][,`:=`(Gain=Gain/sum(Gain),Cover=Cover/sum(Cover),Frequence=Frequence/sum(Frequence))][order(-Gain)]
result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[Feature!="Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N), by = Feature][,`:=`(Gain = Gain/sum(Gain), Cover = Cover/sum(Cover), Frequence = Frequence/sum(Frequence))]

result
}

linearDump <- function(feature_names, text){
which(text == "weight:") %>% {a=.+1;text[a:length(text)]} %>% as.numeric %>% data.table(Feature = feature_names, Weight = .)
}
}

# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(".")

@@ -19,5 +19,9 @@
xgb.load <- function(modelfile) {
if (is.null(modelfile))
stop("xgb.load: modelfile cannot be NULL")
xgb.Booster(modelfile = modelfile)

handle <- xgb.Booster(modelfile = modelfile)
bst <- xgb.handleToBooster(handle)
bst <- xgb.Booster.check(bst)
return(bst)
}

@@ -37,21 +37,22 @@
#' \item \code{Quality}: it's the gain related to the split in this specific node ;
#' \item \code{Cover}: metric to measure the number of observation affected by the split ;
#' \item \code{Tree}: ID of the tree. It is included in the main ID ;
#' \item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ;
#' }
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' #Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
#' #Both dataset are list with two items, a sparse matrix and labels
#' #(labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#'
#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], filename_dump = 'xgb.model.dump')
#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], model = bst)
#'
#' @export
xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, text = NULL, n_first_tree = NULL){

@@ -127,7 +128,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
coverBranch <- extract(branch, "cover=\\d*\\.*\\d*")
coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*")
dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=treeID]

allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F)
}

@@ -158,6 +159,11 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
set(allTrees, i = which(allTrees[,Feature]!= "Leaf"),
j = "No.Quality",
value = allTrees[ID == no,Quality])

allTrees
}

# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequence"))
R-package/R/xgb.plot.importance.R (new file, 59 lines)

@@ -0,0 +1,59 @@
#' Plot feature importance bar graph
#'
#' Read a data.table containing feature importance details and plot it.
#'
#' @importFrom ggplot2 ggplot
#' @importFrom ggplot2 aes
#' @importFrom ggplot2 geom_bar
#' @importFrom ggplot2 coord_flip
#' @importFrom ggplot2 xlab
#' @importFrom ggplot2 ylab
#' @importFrom ggplot2 ggtitle
#' @importFrom ggplot2 theme
#' @importFrom ggplot2 element_text
#' @importFrom ggplot2 element_blank
#' @importFrom Ckmeans.1d.dp Ckmeans.1d.dp
#' @importFrom magrittr %>%
#' @param importance_matrix a \code{data.table} returned by the \code{xgb.importance} function.
#' @param numberOfClusters a \code{numeric} vector containing the min and the max range of the possible number of clusters of bars.
#'
#' @return A \code{ggplot2} bar graph representing each feature by a horizontal bar. Longer is the bar, more important is the feature. Features are classified by importance and clustered by importance. The group is represented through the color of the bar.
#'
#' @details
#' The purpose of this function is to easily represent the importance of each feature of a model.
#' The function return a ggplot graph, therefore each of its characteristic can be overriden (to customize it).
#' In particular you may want to override the title of the graph. To do so, add \code{+ ggtitle("A GRAPH NAME")} next to the value returned by this function.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' #Both dataset are list with two items, a sparse matrix and labels
#' #(labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
#'
#' #train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' importance_matrix <- xgb.importance(train$data@@Dimnames[[2]], model = bst)
#' xgb.plot.importance(importance_matrix)
#'
#' @export
xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1:10)){
if (!"data.table" %in% class(importance_matrix)) {
stop("importance_matrix: Should be a data.table.")
}

clusters <- suppressWarnings(Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters))
importance_matrix[,"Cluster":=clusters$cluster %>% as.character]

plot <- ggplot(importance_matrix, aes(x=reorder(Feature, Gain), y = Gain, width= 0.05), environment = environment())+ geom_bar(aes(fill=Cluster), stat="identity", position="identity") + coord_flip() + xlab("Features") + ylab("Gain") + ggtitle("Feature importance") + theme(plot.title = element_text(lineheight=.9, face="bold"), panel.grid.major.y = element_blank() )

return(plot)
}

# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(c("Feature","Gain", "Cluster"))
@@ -15,7 +15,7 @@
#' @importFrom stringr str_split
#' @importFrom stringr str_extract
#' @importFrom stringr str_trim
#' @importFrom DiagrammeR DiagrammeR
#' @importFrom DiagrammeR mermaid
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).
#' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file.

@@ -42,7 +42,8 @@
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' #Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
#' #Both dataset are list with two items, a sparse matrix and labels
#' #(labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#'

@@ -84,5 +85,10 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NU
no <- allTrees[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "")

path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(CSSstyle, yes, no, sep = ";")
DiagrammeR(path, width, height)
mermaid(path, width, height)
}

# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(c("Feature", "yesPath", "ID", "Cover", "Quality", "Split", "Yes", "Yes.Feature", "noPath", "No", "No.Feature", "."))
@@ -22,7 +22,8 @@ xgb.save <- function(model, fname) {
stop("xgb.save: fname must be character")
}
if (class(model) == "xgb.Booster") {
.Call("XGBoosterSaveModel_R", model, fname, PACKAGE = "xgboost")
model <- xgb.Booster.check(model)
.Call("XGBoosterSaveModel_R", model$handle, fname, PACKAGE = "xgboost")
return(TRUE)
}
stop("xgb.save: the input must be xgb.Booster. Use xgb.DMatrix.save to save
R-package/R/xgb.save.raw.R (new file, 30 lines)

@@ -0,0 +1,30 @@
#' Save xgboost model to R's raw vector,
#' user can call xgb.load to load the model back from raw vector
#'
#' Save xgboost model from xgboost or xgb.train
#'
#' @param model the model object.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
#' raw <- xgb.save.raw(bst)
#' bst <- xgb.load(raw)
#' pred <- predict(bst, test$data)
#' @export
#'
xgb.save.raw <- function(model) {
if (class(model) == "xgb.Booster"){
model <- model$handle
}
if (class(model) == "xgb.Booster.handle") {
raw <- .Call("XGBoosterModelToRaw_R", model, PACKAGE = "xgboost")
return(raw)
}
stop("xgb.raw: the input must be xgb.Booster.handle. Use xgb.DMatrix.save to save
xgb.DMatrix object.")
}
@@ -86,13 +86,15 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
}
params = append(params, list(...))

bst <- xgb.Booster(params, append(watchlist, dtrain))
handle <- xgb.Booster(params, append(watchlist, dtrain))
bst <- xgb.handleToBooster(handle)
for (i in 1:nrounds) {
succ <- xgb.iter.update(bst, dtrain, i - 1, obj)
succ <- xgb.iter.update(bst$handle, dtrain, i - 1, obj)
if (length(watchlist) != 0) {
msg <- xgb.iter.eval(bst, watchlist, i - 1, feval)
msg <- xgb.iter.eval(bst$handle, watchlist, i - 1, feval)
cat(paste(msg, "\n", sep=""))
}
}
bst <- xgb.Booster.check(bst)
return(bst)
}
@@ -5,34 +5,79 @@
#' @param data takes \code{matrix}, \code{dgCMatrix}, local data file or
#' \code{xgb.DMatrix}.
#' @param label the response variable. User should not set this field,
# if data is local data file or \code{xgb.DMatrix}.
#' @param params the list of parameters. Commonly used ones are:
#' if data is local data file or \code{xgb.DMatrix}.
#' @param params the list of parameters.
#'
#' 1. General Parameters
#'
#' \itemize{
#'   \item \code{objective} objective function, common ones are
#'   \itemize{
#'     \item \code{reg:linear} linear regression
#'     \item \code{binary:logistic} logistic regression for classification
#'   }
#'   \item \code{eta} step size of each boosting step
#'   \item \code{max.depth} maximum depth of the tree
#'   \item \code{nthread} number of thread used in training, if not set, all threads are used
#'   \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}
#'   \item \code{silent} 0 means printing running messages, 1 means silent mode. Default: 0
#' }
#'
#' See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
#' further details. See also demo/ for walkthrough example in R.
#'
#' 2. Booster Parameters
#'
#' 2.1. Parameter for Tree Booster
#'
#' \itemize{
#'   \item \code{eta} step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features. and eta actually shrinkage the feature weights to make the boosting process more conservative. Default: 0.3
#'   \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.
#'   \item \code{max_depth} maximum depth of a tree. Default: 6
#'   \item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
#'   \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. Default: 1
#'   \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
#' }
#'
#' 2.2. Parameter for Linear Booster
#'
#' \itemize{
#'   \item \code{lambda} L2 regularization term on weights. Default: 0
#'   \item \code{lambda_bias} L2 regularization term on bias. Default: 0
#'   \item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0
#' }
#'
#' 3. Task Parameters
#'
#' \itemize{
#'   \item \code{objective} specify the learning task and the corresponding learning objective, and the objective options are below:
#'   \itemize{
#'     \item \code{reg:linear} linear regression (Default).
#'     \item \code{reg:logistic} logistic regression.
#'     \item \code{binary:logistic} logistic regression for binary classification. Output probability.
#'     \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
#'     \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective, you also need to set num_class(number of classes).
#'     \item \code{multi:softprob} same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probability of each data point belonging to each class.
#'     \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
#'   }
#'   \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
#'   \item \code{eval_metric} evaluation metrics for validation data. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
#' }
#'
#' @param nrounds the max number of iterations
#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
#' information of performance. If 2, xgboost will print information of both
#' performance and construction progress information
#' @param missing Missing is only used when input is dense matrix, pick a float
# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing value. Sometimes a data use 0 or other extreme value to represents missing values.
#' @param ... other parameters to pass to \code{params}.
#'
#' @details
#' This is the modeling function for xgboost.
#'
#' Parallelization is automatically enabled if OpenMP is present.
#' Number of threads can also be manually specified via "nthread" parameter
#' Number of threads can also be manually specified via "nthread" parameter.
#'
#' \code{eval_metric} is set automatically by xgboost but can be overriden by parameter. Below is provided the list of different metric optimized by xgboost to help you to understand how it works inside. It should not be overriden until you have a real reason to do so.
#' \itemize{
#'   \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
#'   \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
#'   \item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances.
#'   \item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}.
#'   \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
#'   \item \code{ndcg} Normalized Discounted Cumulative Gain. \url{http://en.wikipedia.org/wiki/NDCG}
#' }
#'
#' More parameters are available in the Wiki \url{https://github.com/tqchen/xgboost/wiki/Parameters}.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
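A compact sketch tying the documented parameter groups together: general, tree-booster and task parameters all travel in the same params list (or as `...` arguments) to `xgboost()`/`xgb.train()`; the values below simply restate the documented defaults.

```r
library(xgboost)
data(agaricus.train, package = "xgboost")

param <- list(
  booster = "gbtree", silent = 1, nthread = 2,               # general parameters
  eta = 0.3, gamma = 0, max_depth = 6,                       # tree booster parameters
  min_child_weight = 1, subsample = 1, colsample_bytree = 1,
  objective = "binary:logistic", eval_metric = "error"       # task parameters
)

bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               params = param, nrounds = 2)
```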
@@ -5,8 +5,7 @@
For up-to-date version(which is recommended), please install from github. Windows user will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first.

```r
require(devtools)
install_github('tqchen/xgboost',subdir='R-package')
devtools::install_github('tqchen/xgboost',subdir='R-package')
```

For stable version on CRAN, please run
Binary file not shown.
@@ -4,4 +4,5 @@ boost_from_prediction Boosting from existing prediction
predict_first_ntree Predicting using first n trees
generalized_linear_model Generalized Linear Model
cross_validation Cross validation
create_sparse_matrix
create_sparse_matrix Create Sparse Matrix
predict_leaf_indices Predicting the corresponding leaves

@@ -14,5 +14,5 @@ Benchmarks

Notes
====
* Contribution of exampls, benchmarks is more than welcomed!
* Contribution of examples, benchmarks is more than welcomed!
* If you like to share how you use xgboost to solve your problem, send a pull request:)
@@ -58,6 +58,14 @@ pred2 <- predict(bst2, test$data)
# pred2 should be identical to pred
print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred))))

# save model to R's raw vector
raw = xgb.save.raw(bst)
# load binary model to R
bst3 <- xgb.load(raw)
pred3 <- predict(bst3, test$data)
# pred2 should be identical to pred
print(paste("sum(abs(pred3-pred))=", sum(abs(pred2-pred))))

#----------------Advanced features --------------
# to use advanced features, we need to put data in xgb.DMatrix
dtrain <- xgb.DMatrix(data = train$data, label=train$label)
@@ -1,7 +1,7 @@
require(xgboost)
require(Matrix)
require(data.table)
require(vcd) #Available in Cran. Used for its dataset with categorical values.
if (!require(vcd)) install.packages('vcd') #Available in Cran. Used for its dataset with categorical values.

# According to its documentation, Xgboost works only on numbers.
# Sometimes the dataset we have to work on have categorical data.

@@ -86,4 +86,4 @@ print(chisq.test(df$AgeCat, df$Y))

# As you can see, in general destroying information by simplying it won't improve your model. Chi2 just demonstrates that. But in more complex cases, creating a new feature based on existing one which makes link with the outcome more obvious may help the algorithm and improve the model. The case studied here is not enough complex to show that. Check Kaggle forum for some challenging datasets.
# However it's almost always worse when you add some arbitrary rules.
# Moreover, you can notice that even if we have added some not useful new features highly correlated with other features, the boosting tree algorithm have been able to choose the best one, which in this case is the Age. Linear model may not be that strong in these scenario.
# Moreover, you can notice that even if we have added some not useful new features highly correlated with other features, the boosting tree algorithm have been able to choose the best one, which in this case is the Age. Linear model may not be that strong in these scenario.

@@ -19,7 +19,7 @@ cat('running cross validation, disable standard deviation display\n')
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, nround, nfold=5,
metrics={'error'}, , showsd = FALSE)
metrics={'error'}, showsd = FALSE)

###
# you can also do cross validation with cutomized loss function

@@ -45,3 +45,7 @@ param <- list(max.depth=2,eta=1,silent=1)
xgb.cv(param, dtrain, nround, nfold = 5,
obj = logregobj, feval=evalerror)

# do cross validation with prediction values for each fold
res <- xgb.cv(param, dtrain, nround, nfold=5, prediction = TRUE)
res$dt
length(res$pred)
R-package/demo/predict_leaf_indices.R (new file, 21 lines)

@@ -0,0 +1,21 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)

param <- list(max.depth=2,eta=1,silent=1,objective='binary:logistic')
watchlist <- list(eval = dtest, train = dtrain)
nround = 5

# training the model for two rounds
bst = xgb.train(param, dtrain, nround, watchlist)
cat('start testing prediction from first n trees\n')

### predict using first 2 tree
pred_with_leaf = predict(bst, dtest, ntreelimit = 2, predleaf = TRUE)
head(pred_with_leaf)
# by default, we predict using all the trees
pred_with_leaf = predict(bst, dtest, predleaf = TRUE)
head(pred_with_leaf)

@@ -5,4 +5,5 @@ demo(boost_from_prediction)
demo(predict_first_ntree)
demo(generalized_linear_model)
demo(cross_validation)

demo(create_sparse_matrix)
demo(predict_leaf_indices)
@@ -6,7 +6,7 @@
\title{Predict method for eXtreme Gradient Boosting model}
\usage{
\S4method{predict}{xgb.Booster}(object, newdata, missing = NULL,
outputmargin = FALSE, ntreelimit = NULL)
outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE)
}
\arguments{
\item{object}{Object of class "xgb.Boost"}

@@ -14,6 +14,9 @@
\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or
\code{xgb.DMatrix}.}

\item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}

\item{outputmargin}{whether the prediction should be shown in the original
value of sum of functions, when outputmargin=TRUE, the prediction is
untransformed margin value. In logistic regression, outputmargin=T will

@@ -22,6 +25,8 @@ output value before logistic transformation.}
\item{ntreelimit}{limit number of trees used in prediction, this parameter is
only valid for gbtree, but not for gblinear. set it to be value bigger
than 0. It will use all trees by default.}

\item{predleaf}{whether predict leaf index instead. If set to TRUE, the output will be a matrix object.}
}
\description{
Predicted values based on xgboost model object.

R-package/man/predict-xgb.Booster.handle-method.Rd (new file, 18 lines)

@@ -0,0 +1,18 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/predict.xgb.Booster.handle.R
\docType{methods}
\name{predict,xgb.Booster.handle-method}
\alias{predict,xgb.Booster.handle-method}
\title{Predict method for eXtreme Gradient Boosting model handle}
\usage{
\S4method{predict}{xgb.Booster.handle}(object, ...)
}
\arguments{
\item{object}{Object of class "xgb.Boost.handle"}

\item{...}{Parameters pass to \code{predict.xgb.Booster}}
}
\description{
Predicted values based on xgb.Booster.handle object.
}
@@ -12,7 +12,8 @@ indicating the data file.}

\item{info}{a list of information of the xgb.DMatrix object}

\item{missing}{Missing is only used when input is dense matrix, pick a float}
\item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}

\item{...}{other information to pass to \code{info}.}
}

@@ -5,8 +5,8 @@
\title{Cross Validation}
\usage{
xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
missing = NULL, showsd = TRUE, metrics = list(), obj = NULL,
feval = NULL, verbose = T, ...)
missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(),
obj = NULL, feval = NULL, verbose = T, ...)
}
\arguments{
\item{params}{the list of parameters. Commonly used ones are:

@@ -32,7 +32,10 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL,

\item{label}{option field, when data is Matrix}

\item{missing}{Missing is only used when input is dense matrix, pick a float}
\item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}

\item{prediction}{A logical value indicating whether to return the prediction vector.}

\item{showsd}{\code{boolean}, whether show standard deviation of cross validation}
@@ -37,7 +37,7 @@ test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")
# save the model in file 'xgb.model.dump'
xgb.dump(bst, 'xgb.model.dump', with.stats = T)
xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)

# print the model without saving it to a file
print(xgb.dump(bst))

@@ -38,7 +38,8 @@ There are 3 columns :
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')

#Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
#Both dataset are list with two items, a sparse matrix and labels
#(labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train
test <- agaricus.test

@@ -39,20 +39,21 @@ The content of the \code{data.table} is organised that way:
\item \code{Quality}: it's the gain related to the split in this specific node ;
\item \code{Cover}: metric to measure the number of observation affected by the split ;
\item \code{Tree}: ID of the tree. It is included in the main ID ;
\item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ;
}
}
\examples{
data(agaricus.train, package='xgboost')

#Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
#Both dataset are list with two items, a sparse matrix and labels
#(labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train

bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")
xgb.dump(bst, 'xgb.model.dump', with.stats = T)

#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], filename_dump = 'xgb.model.dump')
xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst)
}
R-package/man/xgb.plot.importance.Rd (new file, 40 lines)

@@ -0,0 +1,40 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.plot.importance.R
\name{xgb.plot.importance}
\alias{xgb.plot.importance}
\title{Plot feature importance bar graph}
\usage{
xgb.plot.importance(importance_matrix = NULL, numberOfClusters = c(1:10))
}
\arguments{
\item{importance_matrix}{a \code{data.table} returned by the \code{xgb.importance} function.}

\item{numberOfClusters}{a \code{numeric} vector containing the min and the max range of the possible number of clusters of bars.}
}
\value{
A \code{ggplot2} bar graph representing each feature by a horizontal bar. Longer is the bar, more important is the feature. Features are classified by importance and clustered by importance. The group is represented through the color of the bar.
}
\description{
Read a data.table containing feature importance details and plot it.
}
\details{
The purpose of this function is to easily represent the importance of each feature of a model.
The function return a ggplot graph, therefore each of its characteristic can be overriden (to customize it).
In particular you may want to override the title of the graph. To do so, add \code{+ ggtitle("A GRAPH NAME")} next to the value returned by this function.
}
\examples{
data(agaricus.train, package='xgboost')

#Both dataset are list with two items, a sparse matrix and labels
#(labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train

bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")

#train$data@Dimnames[[2]] represents the column names of the sparse matrix.
importance_matrix <- xgb.importance(train$data@Dimnames[[2]], model = bst)
xgb.plot.importance(importance_matrix)
}
@ -5,7 +5,7 @@
|
||||
\title{Plot a boosted tree model}
|
||||
\usage{
|
||||
xgb.plot.tree(feature_names = NULL, filename_dump = NULL, model = NULL,
|
||||
n_first_tree = NULL, CSSstyle = NULL)
|
||||
n_first_tree = NULL, CSSstyle = NULL, width = NULL, height = NULL)
|
||||
}
|
||||
\arguments{
|
||||
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
|
||||
@ -17,6 +17,10 @@ xgb.plot.tree(feature_names = NULL, filename_dump = NULL, model = NULL,
|
||||
\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}
|
||||
|
||||
\item{CSSstyle}{a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.}
|
||||
|
||||
\item{width}{the width of the diagram in pixels.}
|
||||
|
||||
\item{height}{the height of the diagram in pixels.}
|
||||
}
|
||||
\value{
|
||||
A \code{DiagrammeR} of the model.
|
||||
@ -40,7 +44,8 @@ It uses \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpos
|
||||
\examples{
|
||||
data(agaricus.train, package='xgboost')
|
||||
|
||||
#Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
|
||||
#Both dataset are list with two items, a sparse matrix and labels
|
||||
#(labels = outcome column which will be learned).
|
||||
#Each column of the sparse Matrix is a feature in one hot encoding format.
|
||||
train <- agaricus.train
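
#Illustrative sketch only (not part of the original example): the new width and
#height arguments control the size of the rendered diagram in pixels, e.g.
#bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#               eta = 1, nround = 2, objective = "binary:logistic")
#xgb.plot.tree(feature_names = train$data@Dimnames[[2]], model = bst, width = 1200, height = 800)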
|
||||
|
||||
|
||||
27
R-package/man/xgb.save.raw.Rd
Normal file
27
R-package/man/xgb.save.raw.Rd
Normal file
@ -0,0 +1,27 @@
|
||||
% Generated by roxygen2 (4.1.0): do not edit by hand
|
||||
% Please edit documentation in R/xgb.save.raw.R
|
||||
\name{xgb.save.raw}
|
||||
\alias{xgb.save.raw}
|
||||
\title{Save xgboost model to R's raw vector,
|
||||
user can call xgb.load to load the model back from raw vector}
|
||||
\usage{
|
||||
xgb.save.raw(model)
|
||||
}
|
||||
\arguments{
|
||||
\item{model}{the model object.}
|
||||
}
|
||||
\description{
|
||||
Save an xgboost model (built with \code{xgboost} or \code{xgb.train}) to R's raw vector.
|
||||
}
|
||||
\examples{
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.test, package='xgboost')
|
||||
train <- agaricus.train
|
||||
test <- agaricus.test
|
||||
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
|
||||
eta = 1, nround = 2,objective = "binary:logistic")
|
||||
raw <- xgb.save.raw(bst)
|
||||
bst <- xgb.load(raw)
|
||||
pred <- predict(bst, test$data)
|
||||
}
|
||||
|
||||
@ -11,24 +11,58 @@ xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
|
||||
\item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or
|
||||
\code{xgb.DMatrix}.}
|
||||
|
||||
\item{label}{the response variable. User should not set this field,}
|
||||
\item{label}{the response variable. User should not set this field,
|
||||
if data is local data file or \code{xgb.DMatrix}.}
|
||||
|
||||
\item{missing}{Missing is only used when input is dense matrix, pick a float}
|
||||
\item{missing}{Missing is only used when the input is a dense matrix; pick a float
value that represents the missing value. Some datasets use 0 or another extreme value to represent missing values.}
|
||||
|
||||
\item{params}{the list of parameters.
|
||||
|
||||
1. General Parameters
|
||||
|
||||
\item{params}{the list of parameters. Commonly used ones are:
|
||||
\itemize{
|
||||
\item \code{objective} objective function, common ones are
|
||||
\itemize{
|
||||
\item \code{reg:linear} linear regression
|
||||
\item \code{binary:logistic} logistic regression for classification
|
||||
}
|
||||
\item \code{eta} step size of each boosting step
|
||||
\item \code{max.depth} maximum depth of the tree
|
||||
\item \code{nthread} number of thread used in training, if not set, all threads are used
|
||||
\item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}
|
||||
\item \code{silent} 0 means printing running messages, 1 means silent mode. Default: 0
|
||||
}
|
||||
|
||||
See \url{https://github.com/tqchen/xgboost/wiki/Parameters} for
|
||||
further details. See also demo/ for walkthrough example in R.}
|
||||
2. Booster Parameters
|
||||
|
||||
2.1. Parameter for Tree Booster
|
||||
|
||||
\itemize{
|
||||
\item \code{eta} step size shrinkage used in update to prevent overfitting. After each boosting step, we can directly get the weights of new features, and eta actually shrinks the feature weights to make the boosting process more conservative. Default: 0.3
|
||||
\item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.
|
||||
\item \code{max_depth} maximum depth of a tree. Default: 6
|
||||
\item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1
|
||||
\item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. Default: 1
|
||||
\item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1
|
||||
}
|
||||
|
||||
2.2. Parameter for Linear Booster
|
||||
|
||||
\itemize{
|
||||
\item \code{lambda} L2 regularization term on weights. Default: 0
|
||||
\item \code{lambda_bias} L2 regularization term on bias. Default: 0
|
||||
\item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0
|
||||
}
|
||||
|
||||
3. Task Parameters
|
||||
|
||||
\itemize{
|
||||
\item \code{objective} specify the learning task and the corresponding learning objective, and the objective options are below:
|
||||
\itemize{
|
||||
\item \code{reg:linear} linear regression (Default).
|
||||
\item \code{reg:logistic} logistic regression.
|
||||
\item \code{binary:logistic} logistic regression for binary classification. Output probability.
|
||||
\item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation.
|
||||
\item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective, you also need to set num_class(number of classes).
|
||||
\item \code{multi:softprob} same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probability of each data point belonging to each class.
|
||||
\item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss.
|
||||
}
|
||||
\item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5
|
||||
\item \code{eval_metric} evaluation metrics for validation data. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section.
|
||||
}}
|
||||
|
||||
\item{nrounds}{the max number of iterations}
|
||||
|
||||
@ -45,7 +79,19 @@ A simple interface for xgboost in R
|
||||
This is the modeling function for xgboost.
|
||||
|
||||
Parallelization is automatically enabled if OpenMP is present.
|
||||
Number of threads can also be manually specified via "nthread" parameter
|
||||
Number of threads can also be manually specified via "nthread" parameter.
|
||||
|
||||
\code{eval_metric} is set automatically by xgboost but can be overridden by parameter. The list of the different metrics optimized by xgboost is provided below to help you understand how it works internally. It should not be overridden unless you have a real reason to do so.
|
||||
\itemize{
|
||||
\item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
|
||||
\item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
|
||||
\item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances.
|
||||
\item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}.
|
||||
\item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
|
||||
\item \code{ndcg} Normalized Discounted Cumulative Gain. \url{http://en.wikipedia.org/wiki/NDCG}
|
||||
}
|
||||
|
||||
More parameters are available in the Wiki \url{https://github.com/tqchen/xgboost/wiki/Parameters}.
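
For illustration only, a minimal sketch (reusing the agaricus data from the examples below) of overriding the evaluation metric through the \code{params} list documented above:

xgboost(data = train$data, label = train$label, nrounds = 2,
        params = list(objective = "binary:logistic", eval_metric = "auc"))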
|
||||
}
|
||||
\examples{
|
||||
data(agaricus.train, package='xgboost')
|
||||
|
||||
@ -71,13 +71,13 @@ extern "C" {
|
||||
SEXP missing) {
|
||||
_WrapperBegin();
|
||||
SEXP dim = getAttrib(mat, R_DimSymbol);
|
||||
int nrow = INTEGER(dim)[0];
|
||||
int ncol = INTEGER(dim)[1];
|
||||
size_t nrow = static_cast<size_t>(INTEGER(dim)[0]);
|
||||
size_t ncol = static_cast<size_t>(INTEGER(dim)[1]);
|
||||
double *din = REAL(mat);
|
||||
std::vector<float> data(nrow * ncol);
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (int i = 0; i < nrow; ++i) {
|
||||
for (int j = 0; j < ncol; ++j) {
|
||||
for (bst_omp_uint i = 0; i < nrow; ++i) {
|
||||
for (size_t j = 0; j < ncol; ++j) {
|
||||
data[i * ncol +j] = din[i + nrow * j];
|
||||
}
|
||||
}
|
||||
@ -274,6 +274,23 @@ extern "C" {
|
||||
XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
|
||||
_WrapperEnd();
|
||||
}
|
||||
void XGBoosterLoadModelFromRaw_R(SEXP handle, SEXP raw) {
|
||||
_WrapperBegin();
|
||||
XGBoosterLoadModelFromBuffer(R_ExternalPtrAddr(handle),
|
||||
RAW(raw),
|
||||
length(raw));
|
||||
_WrapperEnd();
|
||||
}
|
||||
SEXP XGBoosterModelToRaw_R(SEXP handle) {
|
||||
bst_ulong olen;
|
||||
_WrapperBegin();
|
||||
const char *raw = XGBoosterGetModelRaw(R_ExternalPtrAddr(handle), &olen);
|
||||
_WrapperEnd();
|
||||
SEXP ret = PROTECT(allocVector(RAWSXP, olen));
|
||||
memcpy(RAW(ret), raw, olen);
|
||||
UNPROTECT(1);
|
||||
return ret;
|
||||
}
|
||||
SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats) {
|
||||
_WrapperBegin();
|
||||
bst_ulong olen;
|
||||
|
||||
@ -127,6 +127,17 @@ extern "C" {
|
||||
* \param fname file name
|
||||
*/
|
||||
void XGBoosterSaveModel_R(SEXP handle, SEXP fname);
|
||||
/*!
|
||||
* \brief load model from raw array
|
||||
* \param handle handle
|
||||
*/
|
||||
void XGBoosterLoadModelFromRaw_R(SEXP handle, SEXP raw);
|
||||
/*!
|
||||
* \brief save model into R's raw array
|
||||
* \param handle handle
|
||||
* \return raw array
|
||||
*/
|
||||
SEXP XGBoosterModelToRaw_R(SEXP handle);
|
||||
/*!
|
||||
* \brief dump model into a string
|
||||
* \param handle handle
|
||||
|
||||
243
R-package/vignettes/discoverYourData.Rmd
Normal file
243
R-package/vignettes/discoverYourData.Rmd
Normal file
@ -0,0 +1,243 @@
|
||||
---
title: "Understand your dataset with Xgboost"
output:
  html_document:
    css: vignette.css
    number_sections: yes
    toc: yes
---

Introduction
============

The purpose of this vignette is to show you how to use **Xgboost** to discover and better understand your own dataset.

You may know **Xgboost** as a state-of-the-art tool for building machine learning models. It has been [used](https://github.com/tqchen/xgboost) to win several [Kaggle](http://www.kaggle.com) competitions.

In those competitions the goal is to make predictions, but this vignette is not about showing you how to predict anything. Its purpose is to explain *how to use **Xgboost** to understand the link between the features of your data and an outcome*.

For the purpose of this tutorial we will first load the required packages.

```{r libLoading, results='hold', message=F, warning=F}
|
||||
require(xgboost)
|
||||
require(Matrix)
|
||||
require(data.table)
|
||||
if (!require(vcd)) install.packages('vcd')
|
||||
```
|
||||
> **VCD** is used only for one of its embedded datasets (not for its own functions).

Preparation of the dataset
==========================

According to its documentation, **Xgboost** works only on `numeric` variables.

Sometimes the dataset we have to work on contains *categorical* data.

A *categorical* variable is one which has a fixed number of different values. For example, if for each observation a variable called *Colour* can only take *red*, *blue* or *green* as a value, it is a *categorical* variable.

> In **R**, a *categorical* variable is called a `factor`.
> Type `?factor` in the console for more information.

In this demo we will see how to transform a dense dataframe with *categorical* variables into a sparse matrix before analyzing it with **Xgboost**.

The method we are going to see is usually called [one hot encoding](http://en.wikipedia.org/wiki/One-hot).

The first step is to load the Arthritis dataset in memory and wrap it with the `data.table` package (`data.table` is 100% compliant with **R** dataframes, but its syntax is more consistent and its performance is really good).

```{r, results='hide'}
|
||||
data(Arthritis)
|
||||
df <- data.table(Arthritis, keep.rownames = F)
|
||||
```
|
||||
|
||||
Let's have a look at the first 10 lines of the `data.table`:
|
||||
|
||||
```{r}
|
||||
print(df[1:10])
|
||||
```
|
||||
|
||||
Now we will check the format of each column.
|
||||
|
||||
```{r}
|
||||
str(df)
|
||||
```
|
||||
|
||||
> 2 columns have `factor` type, one has `ordinal` type.
> An `ordinal` variable is a categorical variable whose values can be ordered.
> Here: `None` > `Some` > `Marked`.
|
||||
|
||||
Let's add some new categorical features to see if they help.

Of course these features are highly correlated with the Age feature. Usually that's not a good thing in ML, but tree algorithms (including boosted trees) are able to select the best features, even in the case of highly correlated features.
|
||||
|
||||
```{r}
|
||||
df[,AgeDiscret:= as.factor(round(Age/10,0))][1:10]
|
||||
```
|
||||
|
||||
> For the first feature we create groups of age by rounding the real age.
> Note that we transform it to a `factor` so the algorithm treats the groups as independent values.

Following is an even stronger simplification of the real age, with an arbitrary split at 30 years old. I chose this value **based on nothing**. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!).
|
||||
|
||||
```{r}
|
||||
df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))][1:10]
|
||||
```
|
||||
|
||||
We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
|
||||
|
||||
```{r, results='hide'}
|
||||
df[,ID:=NULL]
|
||||
```
|
||||
|
||||
Let's list the different values for the column Treatment.
|
||||
|
||||
```{r}
|
||||
print(levels(df[,Treatment]))
|
||||
```
|
||||
|
||||
|
||||
Next step: we will transform the categorical data into dummy variables.
This is the [one hot encoding](http://en.wikipedia.org/wiki/One-hot) part.
The purpose is to transform each value of each *categorical* feature into a binary feature.

For example, the column Treatment will be replaced by two columns, Placebo and Treated. Each of them will be *binary*. An observation which had the value Placebo in the column Treatment before the transformation will have, after the transformation, the value 1 in the new column Placebo and the value 0 in the new column Treated.

> The formula `Improved~.-1` used below means: transform all *categorical* features but the column Improved to binary values.

Column Improved is excluded because it will be our output column, the one we want to predict.
|
||||
|
||||
```{r, warning=FALSE,message=FALSE}
|
||||
sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
|
||||
print(sparse_matrix[1:10,])
|
||||
```
|
||||
|
||||
Create the output vector (not as a sparse `Matrix`):

1. set, for all rows, the field in the Y column to 0;
2. set Y to 1 when Improved == Marked;
3. return the Y column.
|
||||
|
||||
```{r}
|
||||
output_vector = df[,Y:=0][Improved == "Marked",Y:=1][,Y]
|
||||
```
|
||||
|
||||
Build the model
|
||||
===============
|
||||
|
||||
The code below is very standard. For more information, you can look at the documentation of the `xgboost()` function.
|
||||
|
||||
```{r}
|
||||
bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 4,
|
||||
eta = 1, nround = 10,objective = "binary:logistic")
|
||||
|
||||
```
|
||||
|
||||
You can see plenty of `train-error: 0.XXXXX` lines. Each one shows how well the model explains your data; lower is better, and the error decreases from round to round.

A model which fits too well may [overfit](http://en.wikipedia.org/wiki/Overfitting) (meaning it copy-pastes the past too much, and is not that good at predicting the future).

> Here you can see the numbers decrease until line 7 and then increase.
> It probably means I am overfitting. To fix that I could reduce the number of rounds to `nround = 4`, as sketched below.
> I will leave things as they are because I don't really care for the purpose of this example :-)

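For illustration, a minimal sketch of the fix mentioned in the note above (nothing else changes, only the number of rounds; not evaluated here):

```{r, eval=FALSE}
bst4 <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 4,
                eta = 1, nround = 4, objective = "binary:logistic")
```
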
Feature importance
|
||||
==================
|
||||
|
||||
Measure feature importance
|
||||
--------------------------
|
||||
|
||||
In the code below, `sparse_matrix@Dimnames[[2]]` represents the column names of the sparse matrix. These names are the values of the features (because one binary column == one value of one *categorical* feature).
|
||||
|
||||
```{r}
|
||||
importance <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst)
|
||||
print(importance)
|
||||
```
|
||||
|
||||
> The column `Gain` provides the information we are looking for.
> As you can see, features are sorted by `Gain`.

`Gain` is the improvement in accuracy brought by a feature to the branches it is on. The idea is that before adding a new split on a feature X to a branch there were some wrongly classified elements; after adding the split on this feature, there are two new branches, and each of these branches is more accurate (one branch saying that if your observation is on this branch then it should be classified as 1, and the other branch saying the exact opposite, both new branches being more accurate than the single branch before the split).

`Cover` measures the relative quantity of observations concerned by a feature.

`Frequence` is a simpler alternative to `Gain`: it just counts the number of times a feature is used in all generated trees. You should not use it (unless you know why you want to use it).
|
||||
|
||||
Plotting the feature importance
|
||||
-------------------------------
|
||||
|
||||
All these things are nice, but it would be even better to plot the results. Fortunately, such a function already exists.
|
||||
|
||||
```{r}
|
||||
xgb.plot.importance(importance_matrix = importance)
|
||||
```
|
||||
|
||||
Features have been automatically divided into 2 clusters: the interesting features... and the others.

> Depending on the case, you may have more than two clusters.
> The default is to limit them to 10, but you can increase this limit; see the sketch below and the function documentation for more information.

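As a purely illustrative sketch (not evaluated here), the clustering range can be narrowed through the `numberOfClusters` argument documented in `?xgb.plot.importance`:

```{r, eval=FALSE}
xgb.plot.importance(importance_matrix = importance, numberOfClusters = c(1:5))
```
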
According to the plot above, the most important features in this dataset to predict if the treatment will work are:

* the Age;
* having received a placebo or not;
* the sex comes third, but it is already in the "not interesting" cluster;
* then come our generated features (AgeDiscret). We can see that their contribution is very low.
|
||||
|
||||
Do these results make sense?
------------------------------
|
||||
|
||||
Let's check the **Chi2** statistic between each of these features and the outcome.

A higher **Chi2** means a stronger association.
|
||||
|
||||
```{r, warning=FALSE, message=FALSE}
|
||||
c2 <- chisq.test(df$Age, df$Y)
|
||||
print(c2)
|
||||
```
|
||||
|
||||
The **Chi2** statistic for the link between Age and illness disappearing is **`r round(c2$statistic, 2 )`**.
|
||||
|
||||
```{r, warning=FALSE, message=FALSE}
|
||||
c2 <- chisq.test(df$AgeDiscret, df$Y)
|
||||
print(c2)
|
||||
```
|
||||
|
||||
Our first simplification of Age gives a **Chi2** statistic of **`r round(c2$statistic, 2)`**.
|
||||
|
||||
```{r, warning=FALSE, message=FALSE}
|
||||
c2 <- chisq.test(df$AgeCat, df$Y)
|
||||
print(c2)
|
||||
```
|
||||
|
||||
The perfectly random split I did between young and old at 30 years old gives a low **Chi2** statistic of **`r round(c2$statistic, 2)`**. It's a result we might expect: maybe in my mind being over 30 is being old (I am 32 and starting to feel old, which may explain it), but for the illness we are studying, the age at which one is vulnerable is not the same. Don't let your *gut* lower the quality of your model. In the expression *data science*, there is the word *science* :-)
|
||||
|
||||
Conclusion
|
||||
==========
|
||||
|
||||
As you can see, in general *destroying information by simplifying it won't improve your model*. The **Chi2** statistics just demonstrate that.

But in more complex cases, creating a new feature from an existing one which makes the link with the outcome more obvious may help the algorithm and improve the model.

The case studied here is not complex enough to show that. Check the Kaggle forum for some challenging datasets. However, it's almost always worse when you add arbitrary rules.

Moreover, you can notice that even though we added some useless new features highly correlated with other features, the boosted tree algorithm was still able to choose the best one, which in this case is the Age.

A linear model may not be that strong in this scenario.
|
||||
|
||||
```{r, fig.align='center', include=FALSE}
|
||||
#xgb.plot.tree(sparse_matrix@Dimnames[[2]], model = bst, n_first_tree = 1, width = 1200, height = 800)
|
||||
```
|
||||
|
||||
Special Note: What about Random forest?
|
||||
=======================================
|
||||
|
||||
As you may know, the [Random Forest](http://en.wikipedia.org/wiki/Random_forest) algorithm is a cousin of boosting, and both are part of the [ensemble learning](http://en.wikipedia.org/wiki/Ensemble_learning) family.

Both train several decision trees on one dataset. The *main* difference is that in Random Forest the trees are independent, while in boosting tree N+1 focuses its learning on what has not been well modeled by tree N (and so on...).

This difference has an impact on a corner case in feature importance analysis: the *correlated features*.

Imagine two features perfectly correlated, feature `A` and feature `B`. For one specific tree, if the algorithm needs one of them, it will choose one randomly (true in both boosting and Random Forest).

However, in Random Forest this choice will be made plenty of times, because the trees are independent. So the **importance** is diluted between features `A` and `B`, and you won't easily know that they are important for predicting what you want to predict.

In boosting, when an aspect of your dataset has been learned by the algorithm, there is no need to refocus on it. Therefore, all the importance will be on `A` or on `B`. You will know that one of them is important; it is up to you to search for correlated features.
|
||||
203
R-package/vignettes/vignette.css
Normal file
203
R-package/vignettes/vignette.css
Normal file
@ -0,0 +1,203 @@
|
||||
body{
|
||||
margin: 0 auto;
|
||||
background-color: white;
|
||||
|
||||
/* --------- FONT FAMILY --------
|
||||
following are some optional font families. Usually a family
|
||||
is safer to choose than a specific font,
|
||||
which may not be on the users computer */
|
||||
/ font-family:Georgia, Palatino, serif;
|
||||
font-family: "Open Sans", "Book Antiqua", Palatino, serif;
|
||||
/ font-family:Arial, Helvetica, sans-serif;
|
||||
/ font-family:Tahoma, Verdana, Geneva, sans-serif;
|
||||
/ font-family:Courier, monospace;
|
||||
/ font-family:"Times New Roman", Times, serif;
|
||||
|
||||
/* -------------- COLOR OPTIONS ------------
|
||||
following are additional color options for base font
|
||||
you could uncomment another one to easily change the base color
|
||||
or add one to a specific element style below */
|
||||
color: #333333; /* dark gray not black */
|
||||
/ color: #000000; /* black */
|
||||
/ color: #666666; /* medium gray black */
|
||||
/ color: #E3E3E3; /* very light gray */
|
||||
/ color: white;
|
||||
|
||||
line-height: 1;
|
||||
max-width: 960px;
|
||||
padding: 20px;
|
||||
font-size: 17px;
|
||||
}
|
||||
|
||||
|
||||
p {
|
||||
line-height: 150%;
|
||||
/ max-width: 540px;
|
||||
max-width: 960px;
|
||||
font-weight: 400;
|
||||
/ color: #333333
|
||||
}
|
||||
|
||||
|
||||
h1, h2, h3, h4 {
|
||||
/ color: #111111;
|
||||
font-weight: 400;
|
||||
}
|
||||
|
||||
h2, h3, h4, h5, p {
|
||||
margin-bottom: 20px;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
h1 {
|
||||
margin-bottom: 10px;
|
||||
font-size:230%;
|
||||
padding: 0px;
|
||||
font-variant:small-caps;
|
||||
}
|
||||
|
||||
h2 {
|
||||
font-size:130%
|
||||
/ margin: 24px 0 6px;
|
||||
}
|
||||
|
||||
h3 {
|
||||
font-size:110%
|
||||
text-decoration: underline;
|
||||
font-style: italic;
|
||||
}
|
||||
h4 {
|
||||
font-size:100%
|
||||
font-variant:small-caps;
|
||||
|
||||
}
|
||||
h5 {
|
||||
font-size:100%
|
||||
font-weight: 100;
|
||||
font-style: italic;
|
||||
}
|
||||
|
||||
h6 {
|
||||
font-size:100%
|
||||
font-weight: 100;
|
||||
color:red;
|
||||
font-variant:small-caps;
|
||||
font-style: italic;
|
||||
}
|
||||
a {
|
||||
color: #606AAA;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
vertical-align: baseline;
|
||||
}
|
||||
a:hover {
|
||||
text-decoration: blink;
|
||||
color: green;
|
||||
}
|
||||
a:visited {
|
||||
color: gray;
|
||||
}
|
||||
ul, ol {
|
||||
padding: 0;
|
||||
margin: 0px 0px 0px 50px;
|
||||
}
|
||||
ul {
|
||||
list-style-type: square;
|
||||
list-style-position: inside;
|
||||
|
||||
}
|
||||
|
||||
li {
|
||||
line-height:150%
|
||||
}
|
||||
li ul, li ul {
|
||||
margin-left: 24px;
|
||||
}
|
||||
|
||||
pre {
|
||||
padding: 0px 24px;
|
||||
max-width: 800px;
|
||||
white-space: pre-wrap;
|
||||
}
|
||||
code {
|
||||
font-family: Consolas, Monaco, Andale Mono, monospace;
|
||||
line-height: 1.5;
|
||||
font-size: 15px;
|
||||
}
|
||||
aside {
|
||||
display: block;
|
||||
float: right;
|
||||
width: 390px;
|
||||
}
|
||||
blockquote {
|
||||
font-size:14px;
|
||||
border-left:.5em solid #606AAA;
|
||||
background: #f5f5f5;
|
||||
color:#bfbfbf;
|
||||
padding: 5px;
|
||||
margin-left:25px;
|
||||
max-width: 500px;
|
||||
}
|
||||
blockquote cite {
|
||||
font-size:14px;
|
||||
line-height:20px;
|
||||
color:#bfbfbf;
|
||||
}
|
||||
blockquote cite:before {
|
||||
content: '\2014 \00A0';
|
||||
}
|
||||
|
||||
blockquote p {
|
||||
color: #666;
|
||||
}
|
||||
hr {
|
||||
/ width: 540px;
|
||||
text-align: left;
|
||||
margin: 0 auto 0 0;
|
||||
color: #999;
|
||||
}
|
||||
|
||||
|
||||
/* table */
|
||||
|
||||
table {
|
||||
width: 100%;
|
||||
border-top: 1px solid #919699;
|
||||
border-left: 1px solid #919699;
|
||||
border-spacing: 0;
|
||||
}
|
||||
|
||||
table th {
|
||||
padding: 4px 8px 4px 8px;
|
||||
text-align: center;
|
||||
color: white;
|
||||
background: #606AAA;
|
||||
border-bottom: 1px solid #919699;
|
||||
border-right: 1px solid #919699;
|
||||
}
|
||||
table th p {
|
||||
font-weight: bold;
|
||||
margin-bottom: 0px;
|
||||
}
|
||||
|
||||
table td {
|
||||
padding: 8px;
|
||||
vertical-align: top;
|
||||
border-bottom: 1px solid #919699;
|
||||
border-right: 1px solid #919699;
|
||||
}
|
||||
|
||||
table td:last-child {
|
||||
/background: lightgray;
|
||||
text-align: right;
|
||||
}
|
||||
|
||||
table td p {
|
||||
margin-bottom: 0px;
|
||||
}
|
||||
table td p + p {
|
||||
margin-top: 5px;
|
||||
}
|
||||
table td p + p + p {
|
||||
margin-top: 5px;
|
||||
}
|
||||
241
R-package/vignettes/xgboostPresentation.Rmd
Normal file
241
R-package/vignettes/xgboostPresentation.Rmd
Normal file
@ -0,0 +1,241 @@
|
||||
---
title: "Xgboost presentation"
output:
  html_document:
    css: vignette.css
    number_sections: yes
    toc: yes
---

Introduction
|
||||
============
|
||||
|
||||
The purpose of this vignette is to show you how to use **Xgboost** to make predictions from a model based on your own dataset.

You may know **Xgboost** as a state-of-the-art tool for building machine learning models. It has been [used](https://github.com/tqchen/xgboost) to win several [Kaggle](http://www.kaggle.com) competitions.

For the purpose of this tutorial we will first load the required packages.

```{r libLoading, results='hold', message=F, warning=F}
|
||||
require(xgboost)
|
||||
require(methods)
|
||||
```
|
||||
|
||||
In this example, we are aiming to predict whether a mushroom can be eaten.
|
||||
|
||||
Learning
|
||||
========
|
||||
|
||||
Dataset loading
|
||||
---------------
|
||||
|
||||
We load the `agaricus` datasets and link them to variables.

The dataset is already separated into `train` and `test` data.

As their names imply, the train part will be used to build the model and the test part to check how well the model works. Without this separation we would be testing the model on data the algorithm has already seen; as you may imagine, that is not the best methodology to check the performance of a prediction (would it even be a prediction?).
|
||||
|
||||
```{r datasetLoading, results='hold', message=F, warning=F}
|
||||
data(agaricus.train, package='xgboost')
|
||||
data(agaricus.test, package='xgboost')
|
||||
train <- agaricus.train
|
||||
test <- agaricus.test
|
||||
```
|
||||
|
||||
> Each variable is an S3 object containing both label and data.

> In the real world, it would be up to you to make this division between `train` and `test` data.

The loaded data is stored in a `dgCMatrix`, which is a **sparse matrix** type.

The label is a `numeric` vector in `{0,1}`.
|
||||
|
||||
```{r dataClass, message=F, warning=F}
|
||||
class(train$data)[1]
|
||||
class(train$label)
|
||||
```
|
||||
|
||||
Basic Training using XGBoost
|
||||
----------------------------
|
||||
|
||||
The most critical part of the process is the training.

We are using the train data. Both `data` and `label` are stored in it (as explained above). To access the fields of an `S3` object we use the `$` character in **R**.

> label is the outcome of our dataset. It is the classification we want to predict. For these data we already have it, but once our model is built, this is the column we will want to guess.

In a sparse matrix, cells containing `0` are not encoded. Therefore, in a dataset with plenty of `0`s, the dataset size is optimized. Such datasets are very common. **Xgboost** can manage both dense and sparse matrices.
|
||||
|
||||
```{r trainingSparse, message=F, warning=F}
|
||||
bstSparse <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2, objective = "binary:logistic")
|
||||
```
|
||||
|
||||
Alternatively, you can put your dataset in a dense matrix, i.e. a basic R-matrix.
|
||||
|
||||
```{r trainingDense, message=F, warning=F}
|
||||
bstDense <- xgboost(data = as.matrix(train$data), label = train$label, max.depth = 2, eta = 1, nround = 2,
|
||||
objective = "binary:logistic")
|
||||
```
|
||||
|
||||
Above, data and label are not stored together.

**Xgboost** offers a way to group them in an `xgb.DMatrix`. You can even add other metadata; it will be useful for the most advanced features.
|
||||
|
||||
```{r trainingDmatrix, message=F, warning=F}
|
||||
dtrain <- xgb.DMatrix(data = train$data, label = train$label)
|
||||
bstDMatrix <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2, objective = "binary:logistic")
|
||||
```
|
||||
|
||||
Below is a demonstration of the effect of the `verbose` parameter.
|
||||
|
||||
```{r trainingVerbose, message=T, warning=F}
|
||||
# verbose 0, no message
|
||||
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
|
||||
objective = "binary:logistic", verbose = 0)
|
||||
|
||||
# verbose 1, print evaluation metric
|
||||
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
|
||||
objective = "binary:logistic", verbose = 1)
|
||||
|
||||
# verbose 2, also print information about tree
|
||||
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
|
||||
objective = "binary:logistic", verbose = 2)
|
||||
```
|
||||
|
||||
Basic prediction using Xgboost
|
||||
------------------------------
|
||||
|
||||
The main use of **Xgboost** is to predict data. For that purpose we will use the test dataset. We remind you that the algorithm has never seen these data.
|
||||
|
||||
```{r predicting, message=F, warning=F}
|
||||
pred <- predict(bst, test$data)
|
||||
err <- mean(as.numeric(pred > 0.5) != test$label)
|
||||
print(paste("test-error=", err))
|
||||
```
|
||||
|
||||
> You can put data in Matrix, sparseMatrix, or xgb.DMatrix
|
||||
|
||||
Save and load models
|
||||
--------------------
|
||||
|
||||
When your dataset is big, it may take time to build a model. Or maybe you are not a big fan of losing time redoing the same thing again and again. In these cases, you will want to save your model and load it back when required.

Fortunately, **Xgboost** implements such functions.
|
||||
|
||||
```{r saveLoadModel, message=F, warning=F}
|
||||
# save model to binary local file
|
||||
xgb.save(bst, "xgboost.model")
|
||||
|
||||
# load binary model to R
|
||||
bst2 <- xgb.load("xgboost.model")
|
||||
pred2 <- predict(bst2, test$data)
|
||||
|
||||
# pred2 should be identical to pred
|
||||
print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred))))
|
||||
```
|
||||
|
||||
In some very specific cases, like when you want to pilot **Xgboost** from `caret`, you will want to save the model as an **R** raw vector. See below how to do it.
|
||||
|
||||
```{r saveLoadRBinVectorModel, message=F, warning=F}
|
||||
# save model to R's raw vector
|
||||
raw = xgb.save.raw(bst)
|
||||
|
||||
# load binary model to R
|
||||
bst3 <- xgb.load(raw)
|
||||
pred3 <- predict(bst3, test$data)
|
||||
|
||||
# pred3 should be identical to pred
print(paste("sum(abs(pred3-pred))=", sum(abs(pred3-pred))))
|
||||
```
|
||||
|
||||
|
||||
|
||||
Advanced features
|
||||
=================
|
||||
|
||||
Most of the features below have been created to help you to improve your model by offering a better understanding of its content.
|
||||
|
||||
|
||||
Dataset preparation
|
||||
-------------------
|
||||
|
||||
For the following advanced features, we need to put data in `xgb.DMatrix` as explained above.
|
||||
|
||||
```{r DMatrix, message=F, warning=F}
|
||||
dtrain <- xgb.DMatrix(data = train$data, label=train$label)
|
||||
dtest <- xgb.DMatrix(data = test$data, label=test$label)
|
||||
```
|
||||
|
||||
Using xgb.train
|
||||
---------------
|
||||
|
||||
`xgb.train` is a powerful way to follow the learning progress on one or more datasets.

One way to measure the learning progress of a model is to provide **Xgboost** with a second, already classified, dataset. It can then learn on the first dataset and test its model on the second one. Some metrics are measured after each round during the learning.

For that purpose, you will use the `watchlist` parameter. It is a list of `xgb.DMatrix` objects, each of them tagged with a name.
|
||||
|
||||
```{r watchlist, message=F, warning=F}
|
||||
watchlist <- list(train=dtrain, test=dtest)
|
||||
|
||||
bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nround=2, watchlist=watchlist,
|
||||
objective = "binary:logistic")
|
||||
```
|
||||
|
||||
> To train with a watchlist, we use `xgb.train`, which offers more advanced features than the `xgboost` function.

For a better understanding, you may want to use a specific metric or even multiple evaluation metrics.

`eval.metric` allows us to monitor the evaluation of several metrics at a time. Hereafter we will watch two new metrics, logloss and error.
|
||||
|
||||
```{r watchlist2, message=F, warning=F}
|
||||
bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nround=2, watchlist=watchlist,
|
||||
eval.metric = "error", eval.metric = "logloss",
|
||||
objective = "binary:logistic")
|
||||
```
|
||||
|
||||
Manipulating xgb.DMatrix
|
||||
------------------------
|
||||
|
||||
### Save / Load
|
||||
|
||||
Like models, an `xgb.DMatrix` object (which groups both dataset and outcome) can also be saved, using the `xgb.DMatrix.save` function.
|
||||
|
||||
```{r DMatrixSave, message=F, warning=F}
|
||||
xgb.DMatrix.save(dtrain, "dtrain.buffer")
|
||||
# to load it in, simply call xgb.DMatrix
|
||||
dtrain2 <- xgb.DMatrix("dtrain.buffer")
|
||||
bst <- xgb.train(data=dtrain2, max.depth=2, eta=1, nround=2, watchlist=watchlist,
|
||||
objective = "binary:logistic")
|
||||
```
|
||||
|
||||
### Information extraction
|
||||
|
||||
Information can be extracted from an `xgb.DMatrix` using the `getinfo` function. Hereafter we will extract the `label` data.
|
||||
|
||||
```{r getinfo, message=F, warning=F}
|
||||
label = getinfo(dtest, "label")
|
||||
pred <- predict(bst, dtest)
|
||||
err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
|
||||
print(paste("test-error=", err))
|
||||
```
|
||||
|
||||
View the trees from a model
|
||||
---------------------------
|
||||
|
||||
You can dump the trees you learned, using `xgb.dump`, to the console or into a text file (see the sketch after the chunk below).
|
||||
|
||||
```{r dump, message=T, warning=F}
|
||||
xgb.dump(bst, with.stats = T)
|
||||
```
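
As a minimal sketch (not evaluated here), the same dump can be written to a file by passing a file name; the path below is purely illustrative:

```{r dumpToFile, eval=FALSE}
xgb.dump(bst, fname = "xgb.model.dump", with.stats = TRUE)
```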
|
||||
|
||||
Feature importance
|
||||
------------------
|
||||
|
||||
Finally, you can check which features are the most important.
|
||||
|
||||
```{r featureImportance, message=T, warning=F}
|
||||
importance_matrix <- xgb.importance(feature_names = train$data@Dimnames[[2]], model = bst)
|
||||
print(importance_matrix)
|
||||
xgb.plot.importance(importance_matrix)
|
||||
```
|
||||
@ -1,7 +1,7 @@
|
||||
xgboost: eXtreme Gradient Boosting
|
||||
======
|
||||
An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version.
|
||||
It implements machine learning algorithm under gradient boosting framework, including generalized linear model and gradient boosted regression tree.
|
||||
It implements machine learning algorithms under the gradient boosting framework, including generalized linear models and gradient boosted regression trees (GBDT). XGBoost can also be distributed and scales to even larger data.
|
||||
|
||||
Contributors: https://github.com/tqchen/xgboost/graphs/contributors
|
||||
|
||||
|
||||
1
demo/.gitignore
vendored
Normal file
1
demo/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
*.libsvm
|
||||
@ -34,6 +34,7 @@ This is a list of short codes introducing different functionalities of xgboost a
|
||||
[Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/cross_validation.jl)
|
||||
* Predicting leaf indices
|
||||
[python](guide-python/predict_leaf_indices.py)
|
||||
[R](../R-package/demo/predict_leaf_indices.R)
|
||||
|
||||
Basic Examples by Tasks
|
||||
====
|
||||
|
||||
9
demo/yearpredMSD/README.md
Normal file
9
demo/yearpredMSD/README.md
Normal file
@ -0,0 +1,9 @@
|
||||
Demonstrating how to use XGBoost on [Year Prediction task of Million Song Dataset](https://archive.ics.uci.edu/ml/datasets/YearPredictionMSD)
|
||||
|
||||
1. Run runexp.sh
|
||||
```bash
|
||||
./runexp.sh
|
||||
```
|
||||
|
||||
You can also use the script to prepare LIBSVM format, and run the [Distributed Version](../../multi-node).
|
||||
Note that normally you only need a single machine for a dataset at this scale; use the distributed version for larger datasets.
|
||||
14
demo/yearpredMSD/csv2libsvm.py
Executable file
14
demo/yearpredMSD/csv2libsvm.py
Executable file
@ -0,0 +1,14 @@
|
||||
#!/usr/bin/python
import sys

if len(sys.argv) < 3:
    print 'Usage: <csv> <libsvm>'
    print 'convert an all numerical csv to libsvm'
    sys.exit(-1)

fo = open(sys.argv[2], 'w')
for l in open(sys.argv[1]):
    arr = l.split(',')
    fo.write('%s' % arr[0])
    for i in xrange(len(arr) - 1):
        fo.write(' %d:%s' % (i, arr[i+1]))
fo.close()
||||
17
demo/yearpredMSD/runexp.sh
Executable file
17
demo/yearpredMSD/runexp.sh
Executable file
@ -0,0 +1,17 @@
|
||||
#!/bin/bash
|
||||
|
||||
if [ -f YearPredictionMSD.txt ]
|
||||
then
|
||||
echo "use existing data to run experiment"
|
||||
else
|
||||
echo "getting data from uci, make sure you are connected to internet"
|
||||
wget https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt.zip
|
||||
unzip YearPredictionMSD.txt.zip
|
||||
fi
|
||||
echo "start making data.."
|
||||
# map feature using indicator encoding, also produce featmap.txt
|
||||
python csv2libsvm.py YearPredictionMSD.txt yearpredMSD.libsvm
|
||||
head -n 463715 yearpredMSD.libsvm > yearpredMSD.libsvm.train
|
||||
tail -n 51630 yearpredMSD.libsvm > yearpredMSD.libsvm.test
|
||||
echo "finish making the data"
|
||||
../../xgboost yearpredMSD.conf
|
||||
@ -1,6 +1,6 @@
|
||||
Distributed XGBoost
|
||||
======
|
||||
This folder contains information of Distributed XGBoost.
|
||||
This folder contains information of Distributed XGBoost (Distributed GBDT).
|
||||
|
||||
* The distributed version is built on Rabit:[Reliable Allreduce and Broadcast Library](https://github.com/tqchen/rabit)
|
||||
- Rabit is a portable library that provides fault-tolerance for Allreduce calls for distributed machine learning
|
||||
|
||||
@ -17,8 +17,8 @@ cd -
|
||||
python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
|
||||
|
||||
# run xgboost mpi
|
||||
../../rabit/tracker/rabit_mpi.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=3 eval_train=1
|
||||
../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=3 eval_train=1
|
||||
|
||||
# run xgboost-mpi save model 0001, continue to run from existing model
|
||||
../../rabit/tracker/rabit_mpi.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=1
|
||||
../../rabit/tracker/rabit_mpi.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=2 model_in=0001.model
|
||||
../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=1
|
||||
../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=2 model_in=0001.model
|
||||
|
||||
@ -159,7 +159,7 @@ class GBLinear : public IGradBooster {
|
||||
}
|
||||
fo << "weight:\n";
|
||||
for (int i = 0; i < model.param.num_output_group; ++i) {
|
||||
for (int j = 0; j <model.param.num_feature; ++j) {
|
||||
for (unsigned j = 0; j <model.param.num_feature; ++j) {
|
||||
fo << model[i][j] << std::endl;
|
||||
}
|
||||
}
|
||||
@ -173,6 +173,7 @@ class GBLinear : public IGradBooster {
|
||||
for (int gid = 0; gid < model.param.num_output_group; ++gid) {
|
||||
float psum = model.bias()[gid];
|
||||
for (bst_uint i = 0; i < inst.length; ++i) {
|
||||
if (inst[i].index >= model.param.num_feature) continue;
|
||||
psum += inst[i].fvalue * model[inst[i].index][gid];
|
||||
}
|
||||
preds[gid] = psum;
|
||||
@ -229,7 +230,7 @@ class GBLinear : public IGradBooster {
|
||||
// model parameter
|
||||
struct Param {
|
||||
// number of feature dimension
|
||||
int num_feature;
|
||||
unsigned num_feature;
|
||||
// number of output group
|
||||
int num_output_group;
|
||||
// reserved field
|
||||
@ -242,7 +243,7 @@ class GBLinear : public IGradBooster {
|
||||
}
|
||||
inline void SetParam(const char *name, const char *val) {
|
||||
using namespace std;
|
||||
if (!strcmp(name, "bst:num_feature")) num_feature = atoi(val);
|
||||
if (!strcmp(name, "bst:num_feature")) num_feature = static_cast<unsigned>(atoi(val));
|
||||
if (!strcmp(name, "num_output_group")) num_output_group = atoi(val);
|
||||
}
|
||||
};
|
||||
|
||||
@ -339,7 +339,7 @@ class FMatrixPage : public IFMatrix {
|
||||
}
|
||||
if (ktop % 100000 == 0) {
|
||||
utils::Printf("\r \r");
|
||||
utils::Printf("InitCol: %lu rows ", static_cast<unsigned long>(ktop));
|
||||
utils::Printf("InitCol: %lu rows ", static_cast<unsigned long>(ktop));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -104,7 +104,7 @@ class BoostLearner : public rabit::ISerializable {
|
||||
}
|
||||
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
|
||||
if (!strcmp("seed", name)) {
|
||||
this->seed = seed; random::Seed(atoi(val));
|
||||
seed = atoi(val); random::Seed(seed);
|
||||
}
|
||||
if (!strcmp("seed_per_iter", name)) seed_per_iteration = atoi(val);
|
||||
if (!strcmp("save_base64", name)) save_base64 = atoi(val);
|
||||
@ -159,7 +159,9 @@ class BoostLearner : public rabit::ISerializable {
|
||||
* \param with_pbuffer whether to load with predict buffer
|
||||
* \param calc_num_feature whether call InitTrainer with calc_num_feature
|
||||
*/
|
||||
inline void LoadModel(utils::IStream &fi, bool with_pbuffer = true, bool calc_num_feature = true) {
|
||||
inline void LoadModel(utils::IStream &fi,
|
||||
bool with_pbuffer = true,
|
||||
bool calc_num_feature = true) {
|
||||
utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
|
||||
"BoostLearner: wrong model format");
|
||||
utils::Check(fi.Read(&name_obj_), "BoostLearner: wrong model format");
|
||||
@ -192,8 +194,8 @@ class BoostLearner : public rabit::ISerializable {
|
||||
*/
|
||||
inline void LoadModel(const char *fname) {
|
||||
FILE *fp = utils::FopenCheck(fname, "rb");
|
||||
std::string header; header.resize(4);
|
||||
utils::FileStream fi(fp);
|
||||
std::string header; header.resize(4);
|
||||
// check header for different binary encode
|
||||
// can be base64 or binary
|
||||
if (fi.Read(&header[0], 4) != 0) {
|
||||
@ -227,14 +229,19 @@ class BoostLearner : public rabit::ISerializable {
|
||||
*/
|
||||
inline void SaveModel(const char *fname) const {
|
||||
FILE *fp;
|
||||
bool use_stdout = false;;
|
||||
#ifndef XGBOOST_STRICT_CXX98_
|
||||
if (!strcmp(fname, "stdout")) {
|
||||
fp = stdout;
|
||||
} else {
|
||||
use_stdout = true;
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
fp = utils::FopenCheck(fname, "wb");
|
||||
}
|
||||
utils::FileStream fo(fp);
|
||||
std::string header;
|
||||
if (save_base64 != 0|| fp == stdout) {
|
||||
if (save_base64 != 0|| use_stdout) {
|
||||
fo.Write("bs64\t", 5);
|
||||
utils::Base64OutStream bout(fp);
|
||||
this->SaveModel(bout);
|
||||
@ -243,7 +250,9 @@ class BoostLearner : public rabit::ISerializable {
|
||||
fo.Write("binf", 4);
|
||||
this->SaveModel(fo);
|
||||
}
|
||||
if (fp != stdout) fclose(fp);
|
||||
if (!use_stdout) {
|
||||
fclose(fp);
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief check if data matrix is ready to be used by training,
|
||||
@ -262,8 +271,8 @@ class BoostLearner : public rabit::ISerializable {
|
||||
* \param p_train pointer to the data matrix
|
||||
*/
|
||||
inline void UpdateOneIter(int iter, const DMatrix &train) {
|
||||
if (seed_per_iteration || rabit::IsDistributed()) {
|
||||
random::Seed(this->seed * kRandSeedMagic);
|
||||
if (seed_per_iteration != 0 || rabit::IsDistributed()) {
|
||||
random::Seed(this->seed * kRandSeedMagic + iter);
|
||||
}
|
||||
this->PredictRaw(train, &preds_);
|
||||
obj_->GetGradient(preds_, train.info, iter, &gpair_);
|
||||
|
||||
@ -206,7 +206,8 @@ class SoftmaxMultiClassObj : public IObjFunction {
|
||||
Softmax(&rec);
|
||||
const unsigned j = i % nstep;
|
||||
int label = static_cast<int>(info.labels[j]);
|
||||
utils::Check(label < nclass, "SoftmaxMultiClassObj: label exceed num_class");
|
||||
utils::Check(label >= 0 && label < nclass,
|
||||
"SoftmaxMultiClassObj: label must be in [0, num_class)");
|
||||
const float wt = info.GetWeight(j);
|
||||
for (int k = 0; k < nclass; ++k) {
|
||||
float p = rec[k];
|
||||
|
||||
@ -503,12 +503,14 @@ class RegTree: public TreeModel<bst_float, RTreeNodeStat>{
|
||||
/*! \brief fill the vector with sparse vector */
|
||||
inline void Fill(const RowBatch::Inst &inst) {
|
||||
for (bst_uint i = 0; i < inst.length; ++i) {
|
||||
if (inst[i].index >= data.size()) continue;
|
||||
data[inst[i].index].fvalue = inst[i].fvalue;
|
||||
}
|
||||
}
|
||||
/*! \brief drop the trace after fill, must be called after fill */
|
||||
inline void Drop(const RowBatch::Inst &inst) {
|
||||
for (bst_uint i = 0; i < inst.length; ++i) {
|
||||
if (inst[i].index >= data.size()) continue;
|
||||
data[inst[i].index].flag = -1;
|
||||
}
|
||||
}
|
||||
|
||||
@ -205,8 +205,8 @@ struct GradStats {
|
||||
this->Add(b.sum_grad, b.sum_hess);
|
||||
}
|
||||
/*! \brief same as add, reduce is used in All Reduce */
|
||||
inline void Reduce(const GradStats &b) {
|
||||
this->Add(b);
|
||||
inline static void Reduce(GradStats &a, const GradStats &b) {
|
||||
a.Add(b);
|
||||
}
|
||||
/*! \brief set current value to a - b */
|
||||
inline void SetSubstract(const GradStats &a, const GradStats &b) {
|
||||
@ -285,8 +285,8 @@ struct CVGradStats : public GradStats {
|
||||
}
|
||||
}
|
||||
/*! \brief same as add, reduce is used in All Reduce */
|
||||
inline void Reduce(const CVGradStats &b) {
|
||||
this->Add(b);
|
||||
inline static void Reduce(CVGradStats &a, const CVGradStats &b) {
|
||||
a.Add(b);
|
||||
}
|
||||
/*! \brief set current value to a - b */
|
||||
inline void SetSubstract(const CVGradStats &a, const CVGradStats &b) {
|
||||
@ -368,8 +368,8 @@ struct SplitEntry{
|
||||
}
|
||||
}
|
||||
/*! \brief same as update, used by AllReduce*/
|
||||
inline void Reduce(const SplitEntry &e) {
|
||||
this->Update(e);
|
||||
inline static void Reduce(SplitEntry &dst, const SplitEntry &src) {
|
||||
dst.Update(src);
|
||||
}
|
||||
/*!\return feature index to split on */
|
||||
inline unsigned split_index(void) const {
|
||||
|
||||
@ -153,7 +153,8 @@ class ColMaker: public IUpdater {
|
||||
}
|
||||
unsigned n = static_cast<unsigned>(param.colsample_bytree * feat_index.size());
|
||||
random::Shuffle(feat_index);
|
||||
utils::Check(n > 0, "colsample_bytree is too small that no feature can be included");
|
||||
//utils::Check(n > 0, "colsample_bytree is too small that no feature can be included");
|
||||
utils::Check(n > 0, "colsample_bytree=%g is too small that no feature can be included", param.colsample_bytree);
|
||||
feat_index.resize(n);
|
||||
}
|
||||
{// setup temp space for each thread
|
||||
|
||||
@ -155,7 +155,7 @@ class DistColMaker : public ColMaker<TStats> {
|
||||
private:
|
||||
utils::BitMap bitmap;
|
||||
std::vector<int> boolmap;
|
||||
rabit::Reducer<SplitEntry> reducer;
|
||||
rabit::Reducer<SplitEntry, SplitEntry::Reduce> reducer;
|
||||
};
|
||||
// we directly introduce pruner here
|
||||
TreePruner pruner;
|
||||
|
||||
@ -117,7 +117,7 @@ class HistMaker: public BaseMaker {
|
||||
// workspace of thread
|
||||
ThreadWSpace wspace;
|
||||
// reducer for histogram
|
||||
rabit::Reducer<TStats> histred;
|
||||
rabit::Reducer<TStats, TStats::Reduce> histred;
|
||||
// set of working features
|
||||
std::vector<bst_uint> fwork_set;
|
||||
// update function implementation
|
||||
|
||||
@ -147,7 +147,7 @@ class TreeRefresher: public IUpdater {
|
||||
// training parameter
|
||||
TrainParam param;
|
||||
// reducer
|
||||
rabit::Reducer<TStats> reducer;
|
||||
rabit::Reducer<TStats, TStats::Reduce> reducer;
|
||||
};
|
||||
|
||||
} // namespace tree
|
||||
|
||||
@ -139,7 +139,7 @@ class Base64InStream: public IStream {
|
||||
|
||||
private:
|
||||
FILE *fp;
|
||||
unsigned char tmp_ch;
|
||||
int tmp_ch;
|
||||
int num_prev;
|
||||
unsigned char buf_prev[2];
|
||||
// whether we need to do strict check
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
#include <cstdio>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include "./utils.h"
|
||||
/*!
|
||||
* \file io.h
|
||||
|
||||
@ -208,7 +208,6 @@ struct SparseCSRFileBuilder {
|
||||
fo->Write(BeginPtr(buffer_data), (rptr[end] - rptr[begin]) * sizeof(IndexType));
|
||||
}
|
||||
}
|
||||
printf("CSV::begin_dat=%lu\n", begin_data);
|
||||
}
|
||||
protected:
|
||||
inline void WriteBuffer(void) {
|
||||
|
||||
@ -1,389 +0,0 @@
|
||||
#ifndef XGBOOST_UTILS_SOCKET_H
|
||||
#define XGBOOST_UTILS_SOCKET_H
|
||||
/*!
|
||||
* \file socket.h
|
||||
* \brief this file aims to provide a wrapper of sockets
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#if defined(_WIN32)
|
||||
#include <winsock2.h>
|
||||
#include <ws2tcpip.h>
|
||||
#else
|
||||
#include <fcntl.h>
|
||||
#include <netdb.h>
|
||||
#include <errno.h>
|
||||
#include <unistd.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <netinet/in.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/select.h>
|
||||
#endif
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include "./utils.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace utils {
|
||||
#if defined(_WIN32)
|
||||
typedef int ssize_t;
|
||||
typedef int sock_size_t;
|
||||
#else
|
||||
typedef int SOCKET;
|
||||
typedef size_t sock_size_t;
|
||||
const int INVALID_SOCKET = -1;
|
||||
#endif
|
||||
|
||||
/*! \brief data structure for network address */
|
||||
struct SockAddr {
|
||||
sockaddr_in addr;
|
||||
// constructor
|
||||
SockAddr(void) {}
|
||||
SockAddr(const char *url, int port) {
|
||||
this->Set(url, port);
|
||||
}
|
||||
inline static std::string GetHostName(void) {
|
||||
std::string buf; buf.resize(256);
|
||||
utils::Check(gethostname(&buf[0], 256) != -1, "fail to get host name");
|
||||
return std::string(buf.c_str());
|
||||
}
|
||||
/*!
|
||||
* \brief set the address
|
||||
* \param url the url of the address
|
||||
* \param port the port of address
|
||||
*/
|
||||
inline void Set(const char *host, int port) {
|
||||
hostent *hp = gethostbyname(host);
|
||||
Check(hp != NULL, "cannot obtain address of %s", host);
|
||||
memset(&addr, 0, sizeof(addr));
|
||||
addr.sin_family = AF_INET;
|
||||
addr.sin_port = htons(port);
|
||||
memcpy(&addr.sin_addr, hp->h_addr_list[0], hp->h_length);
|
||||
}
|
||||
/*! \brief return port of the address*/
|
||||
inline int port(void) const {
|
||||
return ntohs(addr.sin_port);
|
||||
}
|
||||
/*! \return a string representation of the address */
|
||||
inline std::string AddrStr(void) const {
|
||||
std::string buf; buf.resize(256);
|
||||
#ifdef _WIN32
|
||||
const char *s = inet_ntop(AF_INET, (PVOID)&addr.sin_addr, &buf[0], buf.length());
|
||||
#else
|
||||
const char *s = inet_ntop(AF_INET, &addr.sin_addr, &buf[0], buf.length());
|
||||
#endif
|
||||
Assert(s != NULL, "cannot decode address");
|
||||
return std::string(s);
|
||||
}
|
||||
};
|
||||
/*!
 * \brief a wrapper of TCP socket that is hopefully cross platform
 */
class TCPSocket {
 public:
  /*! \brief the file descriptor of the socket */
  SOCKET sockfd;
  // constructor
  TCPSocket(void) : sockfd(INVALID_SOCKET) {
  }
  explicit TCPSocket(SOCKET sockfd) : sockfd(sockfd) {
  }
  ~TCPSocket(void) {
    // do nothing in destructor
    // the user needs to take care of closing the socket
  }
  // default conversion to SOCKET
  inline operator SOCKET() const {
    return sockfd;
  }
  /*!
   * \brief create the socket, call this before using the socket
   * \param af domain
   */
  inline void Create(int af = PF_INET) {
    sockfd = socket(af, SOCK_STREAM, 0);
    if (sockfd == INVALID_SOCKET) {
      SockError("Create");
    }
  }
  /*!
   * \brief start up the socket module,
   *  call this before using the sockets
   */
  inline static void Startup(void) {
#ifdef _WIN32
    WSADATA wsa_data;
    if (WSAStartup(MAKEWORD(2, 2), &wsa_data) != 0) {
      SockError("Startup");
    }
    if (LOBYTE(wsa_data.wVersion) != 2 || HIBYTE(wsa_data.wVersion) != 2) {
      WSACleanup();
      utils::Error("Could not find a usable version of Winsock.dll\n");
    }
#endif
  }
  /*!
   * \brief shutdown the socket module after use, all sockets need to be closed first
   */
  inline static void Finalize(void) {
#ifdef _WIN32
    WSACleanup();
#endif
  }
  /*!
   * \brief set this socket to use non-blocking mode
   * \param non_block whether to set it to non-blocking mode; if it is false,
   *  the socket is set back to blocking mode
   */
  inline void SetNonBlock(bool non_block) {
#ifdef _WIN32
    u_long mode = non_block ? 1 : 0;
    if (ioctlsocket(sockfd, FIONBIO, &mode) != NO_ERROR) {
      SockError("SetNonBlock");
    }
#else
    int flag = fcntl(sockfd, F_GETFL, 0);
    if (flag == -1) {
      SockError("SetNonBlock-1");
    }
    if (non_block) {
      flag |= O_NONBLOCK;
    } else {
      flag &= ~O_NONBLOCK;
    }
    if (fcntl(sockfd, F_SETFL, flag) == -1) {
      SockError("SetNonBlock-2");
    }
#endif
  }
  /*!
   * \brief perform listen on the socket
   * \param backlog backlog parameter
   */
  inline void Listen(int backlog = 16) {
    listen(sockfd, backlog);
  }
  /*! \brief get a new connection */
  TCPSocket Accept(void) {
    SOCKET newfd = accept(sockfd, NULL, NULL);
    if (newfd == INVALID_SOCKET) {
      SockError("Accept");
    }
    return TCPSocket(newfd);
  }
  /*!
   * \brief bind the socket to an address
   * \param addr the address to bind to
   */
  inline void Bind(const SockAddr &addr) {
    if (bind(sockfd, (sockaddr*)&addr.addr, sizeof(addr.addr)) == -1) {
      SockError("Bind");
    }
  }
  /*!
   * \brief try to bind the socket on the local host, trying ports from start_port to end_port
   * \param start_port starting port number to try
   * \param end_port ending port number to try
   * \return the port that was successfully bound, or -1 if no port in the range could be bound
   */
  inline int TryBindHost(int start_port, int end_port) {
    for (int port = start_port; port < end_port; ++port) {
      SockAddr addr("0.0.0.0", port);
      if (bind(sockfd, (sockaddr*)&addr.addr, sizeof(addr.addr)) == 0) {
        return port;
      }
      if (errno != EADDRINUSE) {
        SockError("TryBindHost");
      }
    }
    return -1;
  }
  /*!
   * \brief connect to an address
   * \param addr the address to connect to
   */
  inline void Connect(const SockAddr &addr) {
    if (connect(sockfd, (sockaddr*)&addr.addr, sizeof(addr.addr)) == -1) {
      SockError("Connect");
    }
  }
  /*! \brief close the connection */
  inline void Close(void) {
    if (sockfd != INVALID_SOCKET) {
#ifdef _WIN32
      closesocket(sockfd);
#else
      close(sockfd);
#endif
      sockfd = INVALID_SOCKET;
    } else {
      Error("TCPSocket::Close double close the socket or close without create");
    }
  }
  /*!
   * \brief send data using the socket
   * \param buf_ the pointer to the buffer
   * \param len the size of the buffer
   * \param flag extra flags
   * \return size of data actually sent
   */
  inline size_t Send(const void *buf_, size_t len, int flag = 0) {
    const char *buf = reinterpret_cast<const char*>(buf_);
    if (len == 0) return 0;
    ssize_t ret = send(sockfd, buf, static_cast<sock_size_t>(len), flag);
    if (ret == -1) {
      if (errno == EAGAIN || errno == EWOULDBLOCK) return 0;
      SockError("Send");
    }
    return ret;
  }
  /*!
   * \brief receive data using the socket
   * \param buf_ the pointer to the buffer
   * \param len the size of the buffer
   * \param flags extra flags
   * \return size of data actually received
   */
  inline size_t Recv(void *buf_, size_t len, int flags = 0) {
    char *buf = reinterpret_cast<char*>(buf_);
    if (len == 0) return 0;
    ssize_t ret = recv(sockfd, buf, static_cast<sock_size_t>(len), flags);
    if (ret == -1) {
      if (errno == EAGAIN || errno == EWOULDBLOCK) return 0;
      SockError("Recv");
    }
    return ret;
  }
  /*!
   * \brief perform a blocking write that attempts to send all the data out;
   *  can still return a size smaller than requested when an error occurs
   * \param buf_ the pointer to the buffer
   * \param len the size of the buffer
   * \return size of data actually sent
   */
  inline size_t SendAll(const void *buf_, size_t len) {
    const char *buf = reinterpret_cast<const char*>(buf_);
    size_t ndone = 0;
    while (ndone < len) {
      ssize_t ret = send(sockfd, buf, static_cast<sock_size_t>(len - ndone), 0);
      if (ret == -1) {
        if (errno == EAGAIN || errno == EWOULDBLOCK) return ndone;
        SockError("SendAll");
      }
      buf += ret;
      ndone += ret;
    }
    return ndone;
  }
  /*!
   * \brief perform a blocking read that attempts to read all the data;
   *  can still return a size smaller than requested when an error occurs
   * \param buf_ the buffer pointer
   * \param len length of data to recv
   * \return size of data actually received
   */
  inline size_t RecvAll(void *buf_, size_t len) {
    char *buf = reinterpret_cast<char*>(buf_);
    size_t ndone = 0;
    while (ndone < len) {
      ssize_t ret = recv(sockfd, buf, static_cast<sock_size_t>(len - ndone), MSG_WAITALL);
      if (ret == -1) {
        if (errno == EAGAIN || errno == EWOULDBLOCK) return ndone;
        SockError("RecvAll");
      }
      if (ret == 0) return ndone;
      buf += ret;
      ndone += ret;
    }
    return ndone;
  }

 private:
  // report a socket error
  inline static void SockError(const char *msg) {
    int errsv = errno;
    Error("Socket %s Error:%s", msg, strerror(errsv));
  }
};
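// --- illustrative usage sketch, not part of socket.h ---
// A minimal client-side example of how the wrappers above fit together;
// the function name, host, and message are invented for illustration,
// and error reporting is left to the SockError checks built into the class.
inline void ExampleSendHello(const char *host, int port) {
  TCPSocket::Startup();            // required once per process on Windows, no-op elsewhere
  TCPSocket sock;
  sock.Create();                   // allocate the underlying socket
  sock.Connect(SockAddr(host, port));
  const char msg[] = "hello";
  sock.SendAll(msg, sizeof(msg));  // blocking write of the whole buffer
  sock.Close();                    // sockets are not closed by the destructor
  TCPSocket::Finalize();
}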
/*! \brief helper data structure to perform select */
struct SelectHelper {
 public:
  SelectHelper(void) {
    this->Clear();
  }
  /*!
   * \brief add file descriptor to watch for read
   * \param fd file descriptor to be watched
   */
  inline void WatchRead(SOCKET fd) {
    read_fds.push_back(fd);
    if (fd > maxfd) maxfd = fd;
  }
  /*!
   * \brief add file descriptor to watch for write
   * \param fd file descriptor to be watched
   */
  inline void WatchWrite(SOCKET fd) {
    write_fds.push_back(fd);
    if (fd > maxfd) maxfd = fd;
  }
  /*!
   * \brief check if the descriptor is ready for read
   * \param fd file descriptor to check status
   */
  inline bool CheckRead(SOCKET fd) const {
    return FD_ISSET(fd, &read_set) != 0;
  }
  /*!
   * \brief check if the descriptor is ready for write
   * \param fd file descriptor to check status
   */
  inline bool CheckWrite(SOCKET fd) const {
    return FD_ISSET(fd, &write_set) != 0;
  }
  /*!
   * \brief clear all the monitored descriptors
   */
  inline void Clear(void) {
    read_fds.clear();
    write_fds.clear();
    maxfd = 0;
  }
  /*!
   * \brief perform select on the watched descriptors
   * \param timeout timeout in milliseconds (ms); if it equals 0, select will block until a descriptor is ready
   * \return number of active descriptors selected
   */
  inline int Select(long timeout = 0) {
    FD_ZERO(&read_set);
    FD_ZERO(&write_set);
    for (size_t i = 0; i < read_fds.size(); ++i) {
      FD_SET(read_fds[i], &read_set);
    }
    for (size_t i = 0; i < write_fds.size(); ++i) {
      FD_SET(write_fds[i], &write_set);
    }
    int ret;
    if (timeout == 0) {
      ret = select(static_cast<int>(maxfd + 1), &read_set, &write_set, NULL, NULL);
    } else {
      timeval tm;
      tm.tv_usec = (timeout % 1000) * 1000;
      tm.tv_sec = timeout / 1000;
      ret = select(static_cast<int>(maxfd + 1), &read_set, &write_set, NULL, &tm);
    }
    if (ret == -1) {
      int errsv = errno;
      Error("Select Error: %s", strerror(errsv));
    }
    return ret;
  }

 private:
  SOCKET maxfd;
  fd_set read_set, write_set;
  std::vector<SOCKET> read_fds, write_fds;
};
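// --- illustrative usage sketch, not part of socket.h ---
// A small example of the SelectHelper interface above: wait up to 100ms for a
// listening socket to become readable before calling Accept. The function name
// is invented for illustration.
inline bool WaitReadable(const TCPSocket &listen_sock) {
  SelectHelper sel;
  sel.WatchRead(listen_sock.sockfd);
  int nready = sel.Select(100);  // timeout given in milliseconds
  return nready > 0 && sel.CheckRead(listen_sock.sockfd);
}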
}
}
#endif
@ -1,5 +1,6 @@
export CC = gcc
ifndef CXX
export CXX = g++
endif
export MPICXX = mpicxx
export LDFLAGS= -Llib
export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -pedantic
@ -10,13 +11,13 @@ BPATH=.
# objects that make up the rabit library
MPIOBJ= $(BPATH)/engine_mpi.o
OBJ= $(BPATH)/allreduce_base.o $(BPATH)/allreduce_robust.o $(BPATH)/engine.o $(BPATH)/engine_empty.o $(BPATH)/engine_mock.o\
	$(BPATH)/rabit_wrapper.o
	$(BPATH)/rabit_wrapper.o $(BPATH)/engine_base.o
SLIB= wrapper/librabit_wrapper.so wrapper/librabit_wrapper_mock.so wrapper/librabit_wrapper_mpi.so
ALIB= lib/librabit.a lib/librabit_mpi.a lib/librabit_empty.a lib/librabit_mock.a
ALIB= lib/librabit.a lib/librabit_mpi.a lib/librabit_empty.a lib/librabit_mock.a lib/librabit_base.a
HEADERS=src/*.h include/*.h include/rabit/*.h
.PHONY: clean all install mpi python

all: lib/librabit.a lib/librabit_mock.a wrapper/librabit_wrapper.so wrapper/librabit_wrapper_mock.so
all: lib/librabit.a lib/librabit_mock.a wrapper/librabit_wrapper.so wrapper/librabit_wrapper_mock.so lib/librabit_base.a
mpi: lib/librabit_mpi.a wrapper/librabit_wrapper_mpi.so
python: wrapper/librabit_wrapper.so wrapper/librabit_wrapper_mock.so

@ -26,8 +27,10 @@ $(BPATH)/allreduce_robust.o: src/allreduce_robust.cc $(HEADERS)
$(BPATH)/engine_mpi.o: src/engine_mpi.cc $(HEADERS)
$(BPATH)/engine_empty.o: src/engine_empty.cc $(HEADERS)
$(BPATH)/engine_mock.o: src/engine_mock.cc $(HEADERS)
$(BPATH)/engine_base.o: src/engine_base.cc $(HEADERS)

lib/librabit.a: $(BPATH)/allreduce_base.o $(BPATH)/allreduce_robust.o $(BPATH)/engine.o
lib/librabit_base.a: $(BPATH)/allreduce_base.o $(BPATH)/engine_base.o
lib/librabit_mock.a: $(BPATH)/allreduce_base.o $(BPATH)/allreduce_robust.o $(BPATH)/engine_mock.o
lib/librabit_empty.a: $(BPATH)/engine_empty.o
lib/librabit_mpi.a: $(MPIOBJ)
@ -5,6 +5,7 @@ rabit is a lightweight library that provides a fault tolerant interface of Allreduce
* [Tutorial](guide)
* [API Documentation](http://homes.cs.washington.edu/~tqchen/rabit/doc)
  * You can also directly read the [interface header](include/rabit.h)
* [Machine Learning Tools](rabit-learn)

Features
====
@ -26,3 +27,11 @@ Use Rabit
* Add lib to the library path and include to the include path of the compiler
* Languages: You can use rabit in C++ and python
  - It is also possible to port the library to other languages

Contributing
====
Rabit is an open-source library; contributions are welcome, including:
* The rabit core library.
* Customized tracker scripts for new platforms and interfaces for new languages.
* Toolkits, benchmarks, resources (links to related repos).
* Tutorials and examples about the library.
@ -135,7 +135,6 @@ template<typename OP, typename DType>
inline void Allreduce(DType *sendrecvbuf, size_t count,
                      void (*prepare_fun)(void *arg) = NULL,
                      void *prepare_arg = NULL);

// C++11 support for lambda prepare function
#if __cplusplus >= 201103L
/*!
@ -238,11 +237,13 @@ class ReduceHandle;
}  // namespace engine
/*!
 * \brief template class to make customized reduce and allreduce easy
 * Do not use reducer directly in the function you call Finalize, because the destructor can execute after Finalize
 * Do not use reducer directly in the function you call Finalize,
 * because the destructor can execute after Finalize
 * \tparam DType data type to be reduced
 *   DType must be a struct, with no pointer, and contain a function Reduce(const DType &d);
 * \tparam freduce the customized reduction function
 *   DType must be a struct, with no pointer
 */
template<typename DType>
template<typename DType, void (*freduce)(DType &dst, const DType &src)>
class Reducer {
 public:
  Reducer(void);
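For orientation, a hedged sketch of the new two-parameter interface (not part of the patch; the struct and function names below are invented): the reduction function is now supplied as a template argument instead of being a `Reduce` member of the data type.

// hypothetical example of declaring and using Reducer<DType, freduce>
struct MaxPair {
  float value;   // value to maximize
  int index;     // index where the value was found
};
inline void MaxPairReduce(MaxPair &dst, const MaxPair &src) {
  if (src.value > dst.value) dst = src;  // keep the larger entry
}
// declare the reducer once and reuse it across calls
rabit::Reducer<MaxPair, MaxPairReduce> max_reducer;
// inside the program, after rabit::Init:
//   MaxPair local = {local_value, rabit::GetRank()};
//   max_reducer.Allreduce(&local, 1);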
@ -280,7 +281,8 @@ class Reducer {
 * Do not use reducer directly in the function you call Finalize, because the destructor can execute after Finalize
 *
 * \tparam DType data type to be reduced, DType must contain the following functions:
 *   (1) Save(IStream &fs) (2) Load(IStream &fs) (3) Reduce(const DType &d);
 * \tparam freduce the customized reduction function
 *   (1) Save(IStream &fs) (2) Load(IStream &fs) (3) Reduce(const DType &src, size_t max_nbyte)
 */
template<typename DType>
class SerializeReducer {
@ -195,8 +195,8 @@ inline int VersionNumber(void) {
// Code to handle customized Reduce
// ---------------------------------
// function to perform reduction for Reducer
template<typename DType>
inline void ReducerFunc_(const void *src_, void *dst_, int len_, const MPI::Datatype &dtype) {
template<typename DType, void (*freduce)(DType &dst, const DType &src)>
inline void ReducerSafe_(const void *src_, void *dst_, int len_, const MPI::Datatype &dtype) {
  const size_t kUnit = sizeof(DType);
  const char *psrc = reinterpret_cast<const char*>(src_);
  char *pdst = reinterpret_cast<char*>(dst_);
@ -205,18 +205,32 @@ inline void ReducerFunc_(const void *src_, void *dst_, int len_, const MPI::Data
    // use memcpy to avoid alignment issue
    std::memcpy(&tdst, pdst + i * kUnit, sizeof(tdst));
    std::memcpy(&tsrc, psrc + i * kUnit, sizeof(tsrc));
    tdst.Reduce(tsrc);
    freduce(tdst, tsrc);
    std::memcpy(pdst + i * kUnit, &tdst, sizeof(tdst));
  }
}
template<typename DType>
inline Reducer<DType>::Reducer(void) {
  this->handle_.Init(ReducerFunc_<DType>, sizeof(DType));
// function to perform reduction for Reducer
template<typename DType, void (*freduce)(DType &dst, const DType &src)>
inline void ReducerAlign_(const void *src_, void *dst_, int len_, const MPI::Datatype &dtype) {
  const DType *psrc = reinterpret_cast<const DType*>(src_);
  DType *pdst = reinterpret_cast<DType*>(dst_);
  for (int i = 0; i < len_; ++i) {
    freduce(pdst[i], psrc[i]);
  }
}
template<typename DType>
inline void Reducer<DType>::Allreduce(DType *sendrecvbuf, size_t count,
                                      void (*prepare_fun)(void *arg),
                                      void *prepare_arg) {
template<typename DType, void (*freduce)(DType &dst, const DType &src)>
inline Reducer<DType, freduce>::Reducer(void) {
  // it is safe to directly use handle for aligned data types
  if (sizeof(DType) == 8 || sizeof(DType) == 4 || sizeof(DType) == 1) {
    this->handle_.Init(ReducerAlign_<DType, freduce>, sizeof(DType));
  } else {
    this->handle_.Init(ReducerSafe_<DType, freduce>, sizeof(DType));
  }
}
template<typename DType, void (*freduce)(DType &dst, const DType &src)>
inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
                                               void (*prepare_fun)(void *arg),
                                               void *prepare_arg) {
  handle_.Allreduce(sendrecvbuf, sizeof(DType), count, prepare_fun, prepare_arg);
}
// function to perform reduction for SerializeReducer
@ -280,9 +294,9 @@ inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
}

#if __cplusplus >= 201103L
template<typename DType>
inline void Reducer<DType>::Allreduce(DType *sendrecvbuf, size_t count,
                                      std::function<void()> prepare_fun) {
template<typename DType, void (*freduce)(DType &dst, const DType &src)>
inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
                                               std::function<void()> prepare_fun) {
  this->Allreduce(sendrecvbuf, count, InvokeLambda_, &prepare_fun);
}
template<typename DType>
@ -8,7 +8,7 @@ It also contains links to the Machine Learning packages that use rabit.
Toolkits
====
* [KMeans Clustering](kmeans)
* [XGBoost: eXtreme Gradient Boosting](https://github.com/tqchen/xgboost/tree/unity/multi-node)
* [XGBoost: eXtreme Gradient Boosting](https://github.com/tqchen/xgboost/tree/master/multi-node)
  - xgboost is a very fast boosted tree (also known as GBDT) library that can run more than
    10 times faster than existing packages
  - Rabit carries xgboost to a distributed environment, inheriting all the benefits of xgboost
@ -17,11 +17,15 @@
namespace rabit {
namespace engine {
// singleton sync manager
#ifndef RABIT_USE_BASE
#ifndef RABIT_USE_MOCK
AllreduceRobust manager;
#else
AllreduceMock manager;
#endif
#else
AllreduceBase manager;
#endif

/*! \brief initialize the synchronization module */
void Init(int argc, char *argv[]) {
15
subtree/rabit/src/engine_base.cc
Normal file
@ -0,0 +1,15 @@
/*!
 * Copyright (c) 2014 by Contributors
 * \file engine_base.cc
 * \brief this is an engine implementation that only provides the basic
 *  AllreduceBase functionality, without the fault-tolerant recovery logic
 * \author Tianqi Chen
 */
// define RABIT_USE_BASE, so we will use the base Manager
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#define NOMINMAX
// switch engine to AllreduceBase
#define RABIT_USE_BASE
#include "./engine.cc"
@ -159,12 +159,17 @@ void ReduceHandle::Init(IEngine::ReduceFunction redfunc, size_t type_nbytes) {
  utils::Assert(handle_ == NULL, "cannot initialize reduce handle twice");
  if (type_nbytes != 0) {
    MPI::Datatype *dtype = new MPI::Datatype();
    *dtype = MPI::CHAR.Create_contiguous(type_nbytes);
    if (type_nbytes % 8 == 0) {
      *dtype = MPI::LONG.Create_contiguous(type_nbytes / sizeof(long));
    } else if (type_nbytes % 4 == 0) {
      *dtype = MPI::INT.Create_contiguous(type_nbytes / sizeof(int));
    } else {
      *dtype = MPI::CHAR.Create_contiguous(type_nbytes);
    }
    dtype->Commit();
    created_type_nbytes_ = type_nbytes;
    htype_ = dtype;
  }

  MPI::Op *op = new MPI::Op();
  MPI::User_function *pf = redfunc;
  op->Init(pf, true);
@ -183,7 +188,13 @@ void ReduceHandle::Allreduce(void *sendrecvbuf,
  } else {
    dtype->Free();
  }
  *dtype = MPI::CHAR.Create_contiguous(type_nbytes);
  if (type_nbytes % 8 == 0) {
    *dtype = MPI::LONG.Create_contiguous(type_nbytes / sizeof(long));
  } else if (type_nbytes % 4 == 0) {
    *dtype = MPI::INT.Create_contiguous(type_nbytes / sizeof(int));
  } else {
    *dtype = MPI::CHAR.Create_contiguous(type_nbytes);
  }
  dtype->Commit();
  created_type_nbytes_ = type_nbytes;
}
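The intent of the change above, as a hedged reading rather than part of the patch: when the reduction unit is a multiple of 8 or 4 bytes, the contiguous MPI datatype is built from LONG or INT elements instead of CHAR, which keeps the datatype naturally aligned. A standalone sketch of the same selection rule (helper name invented for illustration):

#include <cstddef>
// mirrors the alignment-based choice made in ReduceHandle::Init above;
// returns the element size that would back the contiguous MPI datatype
inline std::size_t ChooseElemSize(std::size_t type_nbytes) {
  if (type_nbytes % 8 == 0) return sizeof(long);   // MPI::LONG elements
  if (type_nbytes % 4 == 0) return sizeof(int);    // MPI::INT elements
  return sizeof(char);                             // fall back to MPI::CHAR
}
// e.g. a 12-byte reduction type would be described as 3 INT elements (12 / 4)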
@ -35,7 +35,7 @@ def mpi_submit(nslave, worker_args):
    if args.hostfile is None:
        cmd = ' '.join(['mpirun -n %d' % (nslave)] + args.command + worker_args)
    else:
        ' '.join(['mpirun -n %d --hostfile %s' % (nslave, args.hostfile)] + args.command + worker_args)
        cmd = ' '.join(['mpirun -n %d --hostfile %s' % (nslave, args.hostfile)] + args.command + worker_args)
    print cmd
    subprocess.check_call(cmd, shell = True)
2
subtree/rabit/windows/.gitignore
vendored
@ -1,6 +1,6 @@
*.suo
*.exp
*.sdf
*sdf
*.exe
ipch
x64
@ -105,7 +105,7 @@
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
      <AdditionalDependencies>..\x64\Release\rabit.lib;%(AdditionalDependencies)</AdditionalDependencies>
      <AdditionalDependencies>$(OutDir)\rabit.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemGroup>
@ -4,8 +4,14 @@ Microsoft Visual Studio Solution File, Format Version 11.00
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "rabit", "rabit\rabit.vcxproj", "{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "basic", "basic\basic.vcxproj", "{A6A95246-EB0A-46BA-9471-5939CB6B0006}"
	ProjectSection(ProjectDependencies) = postProject
		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F} = {D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}
	EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "rabit_wrapper", "rabit_wrapper\rabit_wrapper.vcxproj", "{2F89A7C5-CA4F-4D77-A728-6702D9F33F9F}"
	ProjectSection(ProjectDependencies) = postProject
		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F} = {D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}
	EndProjectSection
EndProject
Global
	GlobalSection(SolutionConfigurationPlatforms) = preSolution
@ -106,7 +106,7 @@
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
      <AdditionalDependencies>..\x64\Release\rabit.lib;%(AdditionalDependencies)</AdditionalDependencies>
      <AdditionalDependencies>$(OutDir)\rabit.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemGroup>
@ -57,6 +57,22 @@ class Booster: public learner::BoostLearner {
    learner::BoostLearner::LoadModel(fname);
    this->init_model = true;
  }
  inline void LoadModelFromBuffer(const void *buf, size_t size) {
    utils::MemoryFixSizeBuffer fs((void*)buf, size);
    learner::BoostLearner::LoadModel(fs);
    this->init_model = true;
  }
  inline const char *GetModelRaw(bst_ulong *out_len) {
    model_str.resize(0);
    utils::MemoryBufferStream fs(&model_str);
    learner::BoostLearner::SaveModel(fs);
    *out_len = static_cast<bst_ulong>(model_str.length());
    if (*out_len == 0) {
      return NULL;
    } else {
      return &model_str[0];
    }
  }
  inline const char** GetModelDump(const utils::FeatMap& fmap, bool with_stats, bst_ulong *len) {
    model_dump = this->DumpModel(fmap, with_stats);
    model_dump_cptr.resize(model_dump.size());
@ -69,6 +85,8 @@ class Booster: public learner::BoostLearner {
  // temporary fields
  // temporary data to save evaluation dump
  std::string eval_str;
  // temporary data to save model dump
  std::string model_str;
  // temporary space to save model dump
  std::vector<std::string> model_dump;
  std::vector<const char*> model_dump_cptr;
@ -133,7 +151,7 @@ extern "C"{
|
||||
void* XGDMatrixCreateFromMat(const float *data,
|
||||
bst_ulong nrow,
|
||||
bst_ulong ncol,
|
||||
float missing) {
|
||||
float missing) {
|
||||
bool nan_missing = isnan(missing);
|
||||
DMatrixSimple *p_mat = new DMatrixSimple();
|
||||
DMatrixSimple &mat = *p_mat;
|
||||
@ -143,7 +161,8 @@ extern "C"{
|
||||
bst_ulong nelem = 0;
|
||||
for (bst_ulong j = 0; j < ncol; ++j) {
|
||||
if (isnan(data[j])) {
|
||||
utils::Check(nan_missing, "There are NAN in the matrix, however, you did not set missing=NAN");
|
||||
utils::Check(nan_missing,
|
||||
"There are NAN in the matrix, however, you did not set missing=NAN");
|
||||
} else {
|
||||
if (nan_missing || data[j] != missing) {
|
||||
mat.row_data_.push_back(RowBatch::Entry(j, data[j]));
|
||||
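A hedged usage sketch of the dense-matrix entry point shown above (not part of the patch; the helper name and data are invented): pass missing=NAN when the array encodes missing values as NaN, otherwise every NaN entry will trigger the Check message in the loop.

#include <limits>

void* ExampleCreateDenseDMatrix() {
  const float nan_v = std::numeric_limits<float>::quiet_NaN();
  // 2 x 3 row-major matrix with two missing entries
  float data[6] = {1.0f, 0.5f, nan_v,
                   2.0f, nan_v, 3.0f};
  return XGDMatrixCreateFromMat(data, 2, 3, nan_v);
}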
@ -295,6 +314,12 @@ extern "C"{
|
||||
void XGBoosterSaveModel(const void *handle, const char *fname) {
|
||||
static_cast<const Booster*>(handle)->SaveModel(fname);
|
||||
}
|
||||
void XGBoosterLoadModelFromBuffer(void *handle, const void *buf, bst_ulong len) {
|
||||
static_cast<Booster*>(handle)->LoadModelFromBuffer(buf, len);
|
||||
}
|
||||
const char *XGBoosterGetModelRaw(void *handle, bst_ulong *out_len) {
|
||||
return static_cast<Booster*>(handle)->GetModelRaw(out_len);
|
||||
}
|
||||
const char** XGBoosterDumpModel(void *handle, const char *fmap, int with_stats, bst_ulong *len){
|
||||
utils::FeatMap featmap;
|
||||
if (strlen(fmap) != 0) {
|
||||
|
||||
@ -1,7 +1,7 @@
#ifndef XGBOOST_WRAPPER_H_
#define XGBOOST_WRAPPER_H_
/*!
 * \file xgboost_wrapperh
 * \file xgboost_wrapper.h
 * \author Tianqi Chen
 * \brief a C style wrapper of xgboost
 *  can be used to create wrapper of other languages
@ -17,28 +17,6 @@ typedef unsigned long bst_ulong;
#ifdef __cplusplus
extern "C" {
#endif
/*!
 * \brief initialize sync module, this is needed if used in distributed model
 *  normally, argv need to contain master_uri and master_port
 *  if start using submit_job_tcp script, then pass args to this will do
 * \param argc number of arguments
 * \param argv the arguments to be passed in sync module
 */
XGB_DLL void XGSyncInit(int argc, char *argv[]);
/*!
 * \brief finalize sync module, call this when everything is done
 */
XGB_DLL void XGSyncFinalize(void);
/*!
 * \brief get the rank
 * \return return the rank of
 */
XGB_DLL int XGSyncGetRank(void);
/*!
 * \brief get the world size from sync
 * \return return the number of distributed job ran in the group
 */
XGB_DLL int XGSyncGetWorldSize(void);
/*!
 * \brief load a data matrix
 * \return a loaded data matrix
@ -224,6 +202,21 @@ extern "C" {
|
||||
* \param fname file name
|
||||
*/
|
||||
XGB_DLL void XGBoosterSaveModel(const void *handle, const char *fname);
|
||||
/*!
|
||||
* \brief load model from in memory buffer
|
||||
* \param handle handle
|
||||
* \param buf pointer to the buffer
|
||||
* \param len the length of the buffer
|
||||
*/
|
||||
XGB_DLL void XGBoosterLoadModelFromBuffer(void *handle, const void *buf, bst_ulong len);
|
||||
/*!
|
||||
* \brief save model into binary raw bytes, return header of the array
|
||||
* user must copy the result out, before next xgboost call
|
||||
* \param handle handle
|
||||
* \param out_len the argument to hold the output length
|
||||
* \return the pointer to the beginning of binary buffer
|
||||
*/
|
||||
XGB_DLL const char *XGBoosterGetModelRaw(void *handle, bst_ulong *out_len);
|
||||
/*!
|
||||
* \brief dump model, return array of strings representing model dump
|
||||
* \param handle handle
|
||||
|
||||
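To make the intended calling pattern of the two new entry points concrete, here is a hedged sketch (not part of the patch; the helper name is invented, and the booster handles are assumed to have been created elsewhere). The doc comment above requires the caller to copy the raw bytes before the next xgboost call, which is why the buffer is copied into a std::string first.

#include <string>

// copy the model of one booster into another entirely through memory,
// using the buffer-based API declared above
void CopyModelInMemory(void *src_booster, void *dst_booster) {
  bst_ulong len = 0;
  const char *raw = XGBoosterGetModelRaw(src_booster, &len);
  // the returned pointer is only valid until the next xgboost call,
  // so take a private copy of the bytes first
  std::string buf(raw, raw + len);
  XGBoosterLoadModelFromBuffer(dst_booster, buf.data(), len);
}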