Merge branch 'master' of ssh://github.com/tqchen/xgboost

Conflicts:
	.gitignore
This commit is contained in:
tqchen 2015-01-26 10:28:20 -08:00
commit 1f6b8eb344
175 changed files with 15232 additions and 425 deletions

10
.gitignore vendored
View File

@ -2,7 +2,7 @@
*.slo
*.lo
*.o
*.page
# Compiled Dynamic libraries
*.so
*.dylib
@ -45,5 +45,13 @@ Debug
*save
*csv
.Rproj.user
*.cpage.col
*.cpage
xgboost
xgboost.mpi
xgboost.mock
train*
rabit
.Rbuildignore
R-package.Rproj

View File

@ -20,3 +20,9 @@ xgboost-0.3
* Linear booster is now parallelized, using parallel coordinated descent.
* Add [Code Guide](src/README.md) for customizing objective function and evaluation
* Add R module
in progress version
=====
* Distributed version
* Feature importance visualization in R module, thanks to Michael Benesty
* Predict leaf inde

View File

@ -1,8 +1,8 @@
export CC = gcc
export CXX = g++
export MPICXX = mpicxx
export LDFLAGS= -pthread -lm
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -pedantic
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC
ifeq ($(no_omp),1)
CFLAGS += -DDISABLE_OPENMP
@ -10,56 +10,90 @@ else
CFLAGS += -fopenmp
endif
# by default use c++11
ifeq ($(cxx11),1)
CFLAGS += -std=c++11
else
endif
# specify tensor path
BIN = xgboost
OBJ = updater.o gbm.o io.o
MOCKBIN = xgboost.mock
OBJ = updater.o gbm.o io.o main.o
MPIBIN = xgboost.mpi
SLIB = wrapper/libxgboostwrapper.so
.PHONY: clean all python Rpack
.PHONY: clean all mpi python Rpack
all: $(BIN) $(OBJ) $(SLIB)
all: $(BIN) $(OBJ) $(SLIB) $(MOCKBIN)
mpi: $(MPIBIN)
python: wrapper/libxgboostwrapper.so
# now the wrapper takes in two files. io and wrapper part
wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp $(OBJ)
updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h
gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h src/utils/*.h
gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
xgboost: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h
xgboost.mpi: updater.o gbm.o io.o main.o subtree/rabit/lib/librabit_mpi.a
xgboost.mock: updater.o gbm.o io.o main.o subtree/rabit/lib/librabit_mock.a
xgboost: updater.o gbm.o io.o main.o subtree/rabit/lib/librabit.a
wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h updater.o gbm.o io.o subtree/rabit/lib/librabit.a
# dependency on rabit
subtree/rabit/lib/librabit.a: subtree/rabit/src/engine.cc
cd subtree/rabit;make lib/librabit.a; cd ../..
subtree/rabit/lib/librabit_empty.a: subtree/rabit/src/engine_empty.cc
cd subtree/rabit;make lib/librabit_empty.a; cd ../..
subtree/rabit/lib/librabit_mock.a: subtree/rabit/src/engine_mock.cc
cd subtree/rabit;make lib/librabit_mock.a; cd ../..
subtree/rabit/lib/librabit_mpi.a: subtree/rabit/src/engine_mpi.cc
cd subtree/rabit;make lib/librabit_mpi.a; cd ../..
$(BIN) :
$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
$(MOCKBIN) :
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
$(SLIB) :
$(CXX) $(CFLAGS) -fPIC $(LDFLAGS) -shared -o $@ $(filter %.cpp %.o %.c, $^)
$(CXX) $(CFLAGS) -fPIC -shared -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS)
$(OBJ) :
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
$(MPIOBJ) :
$(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
$(MPIBIN) :
$(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
install:
cp -f -r $(BIN) $(INSTALL_PATH)
Rpack:
make clean
cd subtree/rabit;make clean;cd ..
rm -rf xgboost xgboost*.tar.gz
cp -r R-package xgboost
rm -rf xgboost/inst/examples/*.buffer
rm -rf xgboost/inst/examples/*.model
rm -rf xgboost/inst/examples/dump*
rm -rf xgboost/src/*.o xgboost/src/*.so xgboost/src/*.dll
rm -rf subtree/rabit/src/*.o
rm -rf xgboost/demo/*.model xgboost/demo/*.buffer xgboost/demo/*.txt
rm -rf xgboost/demo/runall.R
cp -r src xgboost/src/src
cp -r subtree xgboost/src/subtree
mkdir xgboost/src/wrapper
cp wrapper/xgboost_wrapper.h xgboost/src/wrapper
cp wrapper/xgboost_wrapper.cpp xgboost/src/wrapper
cp ./LICENSE xgboost
cat R-package/src/Makevars|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars
cat R-package/src/Makevars.win|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars.win
cp xgboost/src/Makevars xgboost/src/Makevars.win
R CMD build xgboost
rm -rf xgboost
R CMD check --as-cran xgboost*.tar.gz
clean:
$(RM) $(OBJ) $(BIN) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~
$(RM) $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~
cd subtree/rabit; make clean; cd ..

View File

@ -22,7 +22,7 @@ Depends:
Imports:
Matrix (>= 1.1-0),
methods,
data.table (>= 1.9),
data.table (>= 1.9.4),
magrittr (>= 1.5),
stringr,
DiagrammeR
DiagrammeR

View File

@ -1,4 +1,4 @@
# Generated by roxygen2 (4.1.0): do not edit by hand
# Generated by roxygen2 (4.0.1): do not edit by hand
export(getinfo)
export(setinfo)

View File

@ -32,10 +32,15 @@ setMethod("getinfo", signature = "xgb.DMatrix",
if (class(object) != "xgb.DMatrix") {
stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix")
}
if (name != "label" && name != "weight" && name != "base_margin") {
if (name != "label" && name != "weight" &&
name != "base_margin" && name != "nrow") {
stop(paste("xgb.getinfo: unknown info name", name))
}
ret <- .Call("XGDMatrixGetInfo_R", object, name, PACKAGE = "xgboost")
if (name != "nrow"){
ret <- .Call("XGDMatrixGetInfo_R", object, name, PACKAGE = "xgboost")
} else {
ret <- xgb.numrow(object)
}
return(ret)
})

View File

@ -7,6 +7,8 @@ setClass("xgb.Booster")
#' @param object Object of class "xgb.Boost"
#' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or
#' \code{xgb.DMatrix}.
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' @param outputmargin whether the prediction should be shown in the original
#' value of sum of functions, when outputmargin=TRUE, the prediction is
#' untransformed margin value. In logistic regression, outputmargin=T will
@ -14,6 +16,7 @@ setClass("xgb.Booster")
#' @param ntreelimit limit number of trees used in prediction, this parameter is
#' only valid for gbtree, but not for gblinear. set it to be value bigger
#' than 0. It will use all trees by default.
#' @param predleaf whether predict leaf index instead. If set to TRUE, the output will be a matrix object.
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
@ -25,7 +28,8 @@ setClass("xgb.Booster")
#' @export
#'
setMethod("predict", signature = "xgb.Booster",
definition = function(object, newdata, missing = NULL, outputmargin = FALSE, ntreelimit = NULL) {
definition = function(object, newdata, missing = NULL,
outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE) {
if (class(newdata) != "xgb.DMatrix") {
if (is.null(missing)) {
newdata <- xgb.DMatrix(newdata)
@ -40,7 +44,24 @@ setMethod("predict", signature = "xgb.Booster",
stop("predict: ntreelimit must be equal to or greater than 1")
}
}
ret <- .Call("XGBoosterPredict_R", object, newdata, as.integer(outputmargin), as.integer(ntreelimit), PACKAGE = "xgboost")
option = 0
if (outputmargin) {
option <- option + 1
}
if (predleaf) {
option <- option + 2
}
ret <- .Call("XGBoosterPredict_R", object, newdata, as.integer(option),
as.integer(ntreelimit), PACKAGE = "xgboost")
if (predleaf){
len <- getinfo(newdata, "nrow")
if (length(ret) == len){
ret <- matrix(ret,ncol = 1)
} else {
ret <- matrix(ret, ncol = len)
ret <- t(ret)
}
}
return(ret)
})

View File

@ -28,6 +28,18 @@ setMethod("slice", signature = "xgb.DMatrix",
if (class(object) != "xgb.DMatrix") {
stop("slice: first argument dtrain must be xgb.DMatrix")
}
ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, PACKAGE = "xgboost")
ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset,
PACKAGE = "xgboost")
attr_list <- attributes(object)
nr <- xgb.numrow(object)
len <- sapply(attr_list,length)
ind <- which(len==nr)
if (length(ind)>0) {
nms <- names(attr_list)[ind]
for (i in 1:length(ind)) {
attr(ret,nms[i]) <- attr(object,nms[i])[idxset]
}
}
return(structure(ret, class = "xgb.DMatrix"))
})

View File

@ -131,7 +131,7 @@ xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) {
}
# iteratively evaluate one iteration
xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = FALSE) {
if (class(booster) != "xgb.Booster") {
stop("xgb.eval: first argument must be type xgb.Booster")
}
@ -169,18 +169,27 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
} else {
msg <- ""
}
if (prediction){
preds <- predict(booster,watchlist[[2]])
return(list(msg,preds))
}
return(msg)
}
}
#------------------------------------------
# helper functions for cross validation
#
xgb.cv.mknfold <- function(dall, nfold, param) {
randidx <- sample(1 : xgb.numrow(dall))
kstep <- length(randidx) / nfold
idset <- list()
for (i in 1:nfold) {
idset[[i]] <- randidx[ ((i-1) * kstep + 1) : min(i * kstep, length(randidx)) ]
if (nfold <= 1) {
stop("nfold must be bigger than 1")
}
randidx <- sample(1 : xgb.numrow(dall))
kstep <- length(randidx) %/% nfold
idset <- list()
for (i in 1:(nfold-1)) {
idset[[i]] = randidx[1:kstep]
randidx = setdiff(randidx,idset[[i]])
}
idset[[nfold]] = randidx
ret <- list()
for (k in 1:nfold) {
dtest <- slice(dall, idset[[k]])
@ -193,7 +202,7 @@ xgb.cv.mknfold <- function(dall, nfold, param) {
dtrain <- slice(dall, didx)
bst <- xgb.Booster(param, list(dtrain, dtest))
watchlist = list(train=dtrain, test=dtest)
ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist)
ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=idset[[k]])
}
return (ret)
}

View File

@ -6,7 +6,7 @@
#' indicating the data file.
#' @param info a list of information of the xgb.DMatrix object
#' @param missing Missing is only used when input is dense matrix, pick a float
# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#
#' @param ... other information to pass to \code{info}.
#'

View File

@ -31,6 +31,9 @@
#' @param nrounds the max number of iterations
#' @param nfold number of folds used
#' @param label option field, when data is Matrix
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' @param prediction A logical value indicating whether to return the prediction vector.
#' @param showsd \code{boolean}, whether show standard deviation of cross validation
#' @param metrics, list of evaluation metrics to be used in corss validation,
#' when it is not specified, the evaluation metric is chosen according to objective function.
@ -47,8 +50,6 @@
#' @param feval custimized evaluation function. Returns
#' \code{list(metric='metric-name', value='metric-value')} with given
#' prediction and dtrain,
#' @param missing Missing is only used when input is dense matrix, pick a float
# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' @param verbose \code{boolean}, print the statistics during the process.
#' @param ... other parameters to pass to \code{params}.
#'
@ -71,7 +72,8 @@
#' @export
#'
xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL,
showsd = TRUE, metrics=list(), obj = NULL, feval = NULL, verbose = T,...) {
prediction = FALSE, showsd = TRUE, metrics=list(),
obj = NULL, feval = NULL, verbose = T,...) {
if (typeof(params) != "list") {
stop("xgb.cv: first argument params must be list")
}
@ -90,13 +92,20 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
}
folds <- xgb.cv.mknfold(dtrain, nfold, params)
predictValues <- rep(0,xgb.numrow(dtrain))
history <- c()
for (i in 1:nrounds) {
msg <- list()
for (k in 1:nfold) {
fd <- folds[[k]]
succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
if (!prediction){
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
} else {
res <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval, prediction)
predictValues[fd$index] <- res[[2]]
msg[[k]] <- res[[1]] %>% str_split("\t") %>% .[[1]]
}
}
ret <- xgb.cv.aggcv(msg, showsd)
history <- c(history, ret)
@ -115,5 +124,14 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
split <- str_split(string = history, pattern = "\t")
for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.list %>% {vec <- .; rbindlist(list(dt, vec), use.names = F, fill = F)}
dt
}
if (prediction) {
return(list(dt = dt,pred = predictValues))
}
return(dt)
}
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(".")

View File

@ -29,7 +29,7 @@
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
#' # save the model in file 'xgb.model.dump'
#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#' xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)
#'
#' # print the model without saving it to a file
#' print(xgb.dump(bst))
@ -37,11 +37,15 @@
#'
xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) {
if (class(model) != "xgb.Booster") {
stop("xgb.dump: first argument must be type xgb.Booster")
stop("model: argument must be type xgb.Booster")
}
if (!class(fname) %in% c("character", "NULL")) {
stop("xgb.dump: second argument must be type character when provided")
if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1)) {
stop("fname: argument must be type character (when provided)")
}
if (!(class(fmap) %in% c("character", "NULL") && length(fname) <= 1)) {
stop("fmap: argument must be type character (when provided)")
}
result <- .Call("XGBoosterDumpModel_R", model, fmap, as.integer(with.stats), PACKAGE = "xgboost")
if(is.null(fname)) {
@ -50,4 +54,9 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) {
result %>% str_split("\n") %>% unlist %>% Filter(function(x) x != "", .) %>% writeLines(fname)
return(TRUE)
}
}
}
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(".")

View File

@ -9,6 +9,7 @@
#' @importFrom magrittr %>%
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).
#' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file.
#'
#' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
#'
@ -31,41 +32,57 @@
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#'
#' #Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
#' #Both dataset are list with two items, a sparse matrix and labels
#' #(labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#' test <- agaricus.test
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#'
#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.importance(agaricus.test$data@@Dimnames[[2]], 'xgb.model.dump')
#' xgb.importance(agaricus.test$data@@Dimnames[[2]], model = bst)
#'
#' @export
xgb.importance <- function(feature_names = NULL, filename_dump = NULL){
xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = NULL){
if (!class(feature_names) %in% c("character", "NULL")) {
stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
}
if (class(filename_dump) != "character" || !file.exists(filename_dump)) {
if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
stop("filename_dump: Has to be a path to the model dump file.")
}
text <- readLines(filename_dump)
if (!class(model) %in% c("xgb.Booster", "NULL")) {
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
}
if(is.null(model)){
text <- readLines(filename_dump)
} else {
text <- xgb.dump(model = model, with.stats = T)
}
if(text[2] == "bias:"){
result <- linearDump(feature_names, text)
result <- readLines(filename_dump) %>% linearDump(feature_names, .)
} else {
result <- treeDump(feature_names, text)
result <- treeDump(feature_names, text = text)
}
result
}
treeDump <- function(feature_names, text){
result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[Feature!="Leaf",][,.(sum(Quality), sum(Cover), .N),by = Feature][,V1:=V1/sum(V1)][,V2:=V2/sum(V2)][,N:=N/sum(N)][order(-rank(V1))]
setnames(result, c("Feature", "Gain", "Cover", "Frequence"))
result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[Feature!="Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N), by = Feature][,`:=`(Gain=Gain/sum(Gain),Cover=Cover/sum(Cover),Frequence=Frequence/sum(Frequence))][order(-Gain)]
result
}
linearDump <- function(feature_names, text){
which(text == "weight:") %>% {a=.+1;text[a:length(text)]} %>% as.numeric %>% data.table(Feature = feature_names, Weight = .)
}
}
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(".")

View File

@ -16,6 +16,8 @@
#' @importFrom stringr str_trim
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.
#' @param text dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
#'
#' @return A \code{data.table} of the features used in the model with their gain, cover and few other thing.
@ -40,38 +42,47 @@
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' #Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
#' #Both dataset are list with two items, a sparse matrix and labels
#' #(labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#' xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)
#'
#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], 'xgb.model.dump')
#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], filename_dump = 'xgb.model.dump')
#'
#' @export
xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, text = NULL, n_first_tree = NULL){
xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, text = NULL, n_first_tree = NULL){
if (!class(feature_names) %in% c("character", "NULL")) {
stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
}
if (!class(filename_dump) %in% c("character", "NULL")) {
stop("filename_dump: Has to be a character vector representing the path to the model dump file.")
} else if (class(filename_dump) == "character" && !file.exists(filename_dump)) {
if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
} else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
stop("filename_dump: path to the model doesn't exist.")
} else if(is.null(filename_dump) & is.null(text)){
stop("filename_dump: no path and no string version of the model dump have been provided.")
} else if(is.null(filename_dump) && is.null(model) && is.null(text)){
stop("filename_dump & model & text: no path to dump model, no model, no text dump, have been provided.")
}
if (!class(text) %in% c("character", "NULL")) {
if (!class(model) %in% c("xgb.Booster", "NULL")) {
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
}
if (!class(text) %in% c("character", "NULL")) {
stop("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.")
}
if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) {
stop("n_first_tree: Has to be a numeric vector of size 1.")
}
if(is.null(text)){
if(!is.null(model)){
text = xgb.dump(model = model, with.stats = T)
} else if(!is.null(filename_dump)){
text <- readLines(filename_dump) %>% str_trim(side = "both")
}
@ -89,6 +100,9 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, text =
tree <- text[(position[i]+1):(position[i+1]-1)]
# avoid tree made of a leaf only (no split)
if(length(tree) <2) next
treeID <- i-1
notLeaf <- str_match(tree, "leaf") %>% is.na
@ -119,7 +133,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, text =
}
yes <- allTrees[!is.na(Yes),Yes]
set(allTrees, i = which(allTrees[,Feature]!= "Leaf"),
j = "Yes.Feature",
value = allTrees[ID == yes,Feature])
@ -148,3 +162,8 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, text =
allTrees
}
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequence"))

View File

@ -17,9 +17,12 @@
#' @importFrom stringr str_trim
#' @importFrom DiagrammeR DiagrammeR
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).
#' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file.
#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
#' @param style a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.
#' @param CSSstyle a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.
#' @param width the width of the diagram in pixels.
#' @param height the height of the diagram in pixels.
#'
#' @return A \code{DiagrammeR} of the model.
#'
@ -39,39 +42,53 @@
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' #Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
#' #Both dataset are list with two items, a sparse matrix and labels
#' #(labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#'
#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], 'xgb.model.dump')
#' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], model = bst)
#'
#' @export
xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tree = NULL, styles = NULL){
#'
xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, n_first_tree = NULL, CSSstyle = NULL, width = NULL, height = NULL){
if (!class(styles) %in% c("character", "NULL") | length(styles) > 1) {
if (!(class(CSSstyle) %in% c("character", "NULL") && length(CSSstyle) <= 1)) {
stop("style: Has to be a character vector of size 1.")
}
if (!class(model) %in% c("xgb.Booster", "NULL")) {
stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.")
}
allTrees <- xgb.model.dt.tree(feature_names, filename_dump, n_first_tree)
if(is.null(model)){
allTrees <- xgb.model.dt.tree(feature_names = feature_names, filename_dump = filename_dump, n_first_tree = n_first_tree)
} else {
allTrees <- xgb.model.dt.tree(feature_names = feature_names, model = model, n_first_tree = n_first_tree)
}
allTrees[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "<br/>Cover: ", Cover, "<br/>Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")]
allTrees[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")]
if(is.null(styles)){
styles <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px"
if(is.null(CSSstyle)){
CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px"
}
yes <- allTrees[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "")
no <- allTrees[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "")
path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(styles, yes, no, sep = ";")
DiagrammeR(path)
path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(CSSstyle, yes, no, sep = ";")
DiagrammeR(path, width, height)
}
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(c("Feature", "yesPath", "ID", "Cover", "Quality", "Split", "Yes", "Yes.Feature", "noPath", "No", "No.Feature", "."))

View File

@ -5,7 +5,7 @@
#' @param data takes \code{matrix}, \code{dgCMatrix}, local data file or
#' \code{xgb.DMatrix}.
#' @param label the response variable. User should not set this field,
# if data is local data file or \code{xgb.DMatrix}.
#' if data is local data file or \code{xgb.DMatrix}.
#' @param params the list of parameters. Commonly used ones are:
#' \itemize{
#' \item \code{objective} objective function, common ones are
@ -24,8 +24,8 @@
#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
#' information of performance. If 2, xgboost will print information of both
#' performance and construction progress information
#' @param missing Missing is only used when input is dense matrix, pick a float
# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' @param ... other parameters to pass to \code{params}.
#'
#' @details

Binary file not shown.

Binary file not shown.

View File

@ -4,4 +4,5 @@ boost_from_prediction Boosting from existing prediction
predict_first_ntree Predicting using first n trees
generalized_linear_model Generalized Linear Model
cross_validation Cross validation
create_sparse_matrix
create_sparse_matrix Create Sparse Matrix
predict_leaf_indices Predicting the corresponding leaves

View File

@ -1,7 +1,7 @@
require(xgboost)
require(Matrix)
require(data.table)
require(vcd) #Available in Cran. Used for its dataset with categorical values.
if (!require(vcd)) install.packages('vcd') #Available in Cran. Used for its dataset with categorical values.
# According to its documentation, Xgboost works only on numbers.
# Sometimes the dataset we have to work on have categorical data.
@ -86,4 +86,4 @@ print(chisq.test(df$AgeCat, df$Y))
# As you can see, in general destroying information by simplying it won't improve your model. Chi2 just demonstrates that. But in more complex cases, creating a new feature based on existing one which makes link with the outcome more obvious may help the algorithm and improve the model. The case studied here is not enough complex to show that. Check Kaggle forum for some challenging datasets.
# However it's almost always worse when you add some arbitrary rules.
# Moreover, you can notice that even if we have added some not useful new features highly correlated with other features, the boosting tree algorithm have been able to choose the best one, which in this case is the Age. Linear model may not be that strong in these scenario.
# Moreover, you can notice that even if we have added some not useful new features highly correlated with other features, the boosting tree algorithm have been able to choose the best one, which in this case is the Age. Linear model may not be that strong in these scenario.

View File

@ -45,3 +45,7 @@ param <- list(max.depth=2,eta=1,silent=1)
xgb.cv(param, dtrain, nround, nfold = 5,
obj = logregobj, feval=evalerror)
# do cross validation with prediction values for each fold
res <- xgb.cv(param, dtrain, nround, nfold=5, prediction = TRUE)
res$dt
length(res$pred)

View File

@ -0,0 +1,21 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
param <- list(max.depth=2,eta=1,silent=1,objective='binary:logistic')
watchlist <- list(eval = dtest, train = dtrain)
nround = 5
# training the model for two rounds
bst = xgb.train(param, dtrain, nround, watchlist)
cat('start testing prediction from first n trees\n')
### predict using first 2 tree
pred_with_leaf = predict(bst, dtest, ntreelimit = 2, predleaf = TRUE)
head(pred_with_leaf)
# by default, we predict using all the trees
pred_with_leaf = predict(bst, dtest, predleaf = TRUE)
head(pred_with_leaf)

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgboost.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{data}
\name{agaricus.test}
\alias{agaricus.test}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgboost.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{data}
\name{agaricus.train}
\alias{agaricus.train}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/getinfo.xgb.DMatrix.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{getinfo}
\alias{getinfo}
@ -13,9 +12,9 @@ getinfo(object, ...)
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
\item{...}{other parameters}
\item{name}{the name of the field to get}
\item{...}{other parameters}
}
\description{
Get information of an xgb.DMatrix object

View File

@ -1,12 +1,11 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/predict.xgb.Booster.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{predict,xgb.Booster-method}
\alias{predict,xgb.Booster-method}
\title{Predict method for eXtreme Gradient Boosting model}
\usage{
\S4method{predict}{xgb.Booster}(object, newdata, missing = NULL,
outputmargin = FALSE, ntreelimit = NULL)
outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE)
}
\arguments{
\item{object}{Object of class "xgb.Boost"}
@ -14,6 +13,9 @@
\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or
\code{xgb.DMatrix}.}
\item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}
\item{outputmargin}{whether the prediction should be shown in the original
value of sum of functions, when outputmargin=TRUE, the prediction is
untransformed margin value. In logistic regression, outputmargin=T will
@ -22,6 +24,8 @@ output value before logistic transformation.}
\item{ntreelimit}{limit number of trees used in prediction, this parameter is
only valid for gbtree, but not for gblinear. set it to be value bigger
than 0. It will use all trees by default.}
\item{predleaf}{whether predict leaf index instead. If set to TRUE, the output will be a matrix object.}
}
\description{
Predicted values based on xgboost model object.

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/setinfo.xgb.DMatrix.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{setinfo}
\alias{setinfo}
@ -13,11 +12,11 @@ setinfo(object, ...)
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
\item{...}{other parameters}
\item{name}{the name of the field to get}
\item{info}{the specific field of information to set}
\item{...}{other parameters}
}
\description{
Set information of an xgb.DMatrix object

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/slice.xgb.DMatrix.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{slice}
\alias{slice}
@ -14,9 +13,9 @@ slice(object, ...)
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
\item{...}{other parameters}
\item{idxset}{a integer vector of indices of rows needed}
\item{...}{other parameters}
}
\description{
Get a new DMatrix containing the specified rows of

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.DMatrix.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.DMatrix}
\alias{xgb.DMatrix}
\title{Contruct xgb.DMatrix object}
@ -12,7 +11,8 @@ indicating the data file.}
\item{info}{a list of information of the xgb.DMatrix object}
\item{missing}{Missing is only used when input is dense matrix, pick a float}
\item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}
\item{...}{other information to pass to \code{info}.}
}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.DMatrix.save.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.DMatrix.save}
\alias{xgb.DMatrix.save}
\title{Save xgb.DMatrix object to binary file}

View File

@ -1,12 +1,11 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.cv.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.cv}
\alias{xgb.cv}
\title{Cross Validation}
\usage{
xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
missing = NULL, showsd = TRUE, metrics = list(), obj = NULL,
feval = NULL, verbose = T, ...)
missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(),
obj = NULL, feval = NULL, verbose = T, ...)
}
\arguments{
\item{params}{the list of parameters. Commonly used ones are:
@ -32,7 +31,10 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
\item{label}{option field, when data is Matrix}
\item{missing}{Missing is only used when input is dense matrix, pick a float}
\item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}
\item{prediction}{A logical value indicating whether to return the prediction vector.}
\item{showsd}{\code{boolean}, whether show standard deviation of cross validation}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.dump.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.dump}
\alias{xgb.dump}
\title{Save xgboost model to text file}
@ -37,7 +36,7 @@ test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")
# save the model in file 'xgb.model.dump'
xgb.dump(bst, 'xgb.model.dump', with.stats = T)
xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)
# print the model without saving it to a file
print(xgb.dump(bst))

View File

@ -1,15 +1,16 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.importance.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.importance}
\alias{xgb.importance}
\title{Show importance of features in a model}
\usage{
xgb.importance(feature_names = NULL, filename_dump = NULL)
xgb.importance(feature_names = NULL, filename_dump = NULL, model = NULL)
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).}
\item{model}{generated by the \code{xgb.train} function. Avoid the creation of a dump file.}
}
\value{
A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
@ -36,16 +37,16 @@ There are 3 columns :
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
#Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
#Both dataset are list with two items, a sparse matrix and labels
#(labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")
xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
xgb.importance(agaricus.test$data@Dimnames[[2]], 'xgb.model.dump')
xgb.importance(agaricus.test$data@Dimnames[[2]], model = bst)
}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.load.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.load}
\alias{xgb.load}
\title{Load xgboost model from binary file}

View File

@ -1,17 +1,20 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.model.dt.tree.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.model.dt.tree}
\alias{xgb.model.dt.tree}
\title{Convert tree model dump to data.table}
\usage{
xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL, text = NULL,
n_first_tree = NULL)
xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL,
model = NULL, text = NULL, n_first_tree = NULL)
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
\item{model}{dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.}
\item{text}{dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}
}
\value{
@ -40,15 +43,16 @@ The content of the \code{data.table} is organised that way:
\examples{
data(agaricus.train, package='xgboost')
#Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
#Both dataset are list with two items, a sparse matrix and labels
#(labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")
xgb.dump(bst, 'xgb.model.dump', with.stats = T)
xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)
#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], 'xgb.model.dump')
xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], filename_dump = 'xgb.model.dump')
}

View File

@ -1,20 +1,25 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.plot.tree.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.plot.tree}
\alias{xgb.plot.tree}
\title{Plot a boosted tree model}
\usage{
xgb.plot.tree(feature_names = NULL, filename_dump = NULL,
n_first_tree = NULL, styles = NULL)
xgb.plot.tree(feature_names = NULL, filename_dump = NULL, model = NULL,
n_first_tree = NULL, CSSstyle = NULL, width = NULL, height = NULL)
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).}
\item{model}{generated by the \code{xgb.train} function. Avoid the creation of a dump file.}
\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}
\item{style}{a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.}
\item{CSSstyle}{a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.}
\item{width}{the width of the diagram in pixels.}
\item{height}{the height of the diagram in pixels.}
}
\value{
A \code{DiagrammeR} of the model.
@ -38,15 +43,15 @@ It uses \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpos
\examples{
data(agaricus.train, package='xgboost')
#Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
#Both dataset are list with two items, a sparse matrix and labels
#(labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")
xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
xgb.plot.tree(agaricus.train$data@Dimnames[[2]], 'xgb.model.dump')
xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst)
}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.save.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.save}
\alias{xgb.save}
\title{Save xgboost model to binary file}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgb.train.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.train}
\alias{xgb.train}
\title{eXtreme Gradient Boosting Training}

View File

@ -1,5 +1,4 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/xgboost.R
% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgboost}
\alias{xgboost}
\title{eXtreme Gradient Boosting (Tree) library}
@ -11,9 +10,8 @@ xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
\item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or
\code{xgb.DMatrix}.}
\item{label}{the response variable. User should not set this field,}
\item{missing}{Missing is only used when input is dense matrix, pick a float}
\item{label}{the response variable. User should not set this field,
if data is local data file or \code{xgb.DMatrix}.}
\item{params}{the list of parameters. Commonly used ones are:
\itemize{
@ -36,6 +34,9 @@ xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
information of performance. If 2, xgboost will print information of both
performance and construction progress information}
\item{missing}{Missing is only used when input is dense matrix, pick a float
value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.}
\item{...}{other parameters to pass to \code{params}.}
}
\description{

View File

@ -1,9 +1,7 @@
# package root
PKGROOT=../../
# _*_ mode: Makefile; _*_
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -I$(PKGROOT)
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT)
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o

View File

@ -1,7 +1,19 @@
# package root
PKGROOT=../../
PKGROOT=./
# _*_ mode: Makefile; _*_
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -I$(PKGROOT)
# This file is only used for windows compilation from github
# It will be replaced by Makevars in CRAN version
.PHONY: all xgblib
all: $(SHLIB)
$(SHLIB): xgblib
xgblib:
cp -r ../../src .
cp -r ../../wrapper .
cp -r ../../subtree .
PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT) -I../..
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o
OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o
$(OBJECTS) : xgblib

View File

@ -3,10 +3,12 @@
#include <utility>
#include <cstring>
#include <cstdio>
#include "xgboost_R.h"
#include <sstream>
#include "wrapper/xgboost_wrapper.h"
#include "src/utils/utils.h"
#include "src/utils/omp.h"
#include "xgboost_R.h"
using namespace std;
using namespace xgboost;
@ -246,12 +248,12 @@ extern "C" {
asInteger(iter),
BeginPtr(vec_dmats), BeginPtr(vec_sptr), len));
}
SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin, SEXP ntree_limit) {
SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, SEXP ntree_limit) {
_WrapperBegin();
bst_ulong olen;
const float *res = XGBoosterPredict(R_ExternalPtrAddr(handle),
R_ExternalPtrAddr(dmat),
asInteger(output_margin),
asInteger(option_mask),
asInteger(ntree_limit),
&olen);
SEXP ret = PROTECT(allocVector(REALSXP, olen));
@ -280,14 +282,13 @@ extern "C" {
asInteger(with_stats),
&olen);
SEXP out = PROTECT(allocVector(STRSXP, olen));
char buffer [2000];
for (size_t i = 0; i < olen; ++i) {
memset(buffer, 0, sizeof buffer);
sprintf (buffer, "booster[%u]:\n%s", static_cast<unsigned>(i), res[i]);
SET_STRING_ELT(out, i, mkChar(buffer));
stringstream stream;
stream << "booster["<<i<<"]\n" << res[i];
SET_STRING_ELT(out, i, mkChar(stream.str().c_str()));
}
_WrapperEnd();
UNPROTECT(1);
return out;
}
}
}

View File

@ -111,10 +111,10 @@ extern "C" {
* \brief make prediction based on dmat
* \param handle handle
* \param dmat data matrix
* \param output_margin whether only output raw margin value
* \param option_mask output_margin:1 predict_leaf:2
* \param ntree_limit limit number of trees used in prediction
*/
SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin, SEXP ntree_limit);
SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, SEXP ntree_limit);
/*!
* \brief load model from existing file
* \param handle handle

View File

@ -1,6 +1,7 @@
xgboost: eXtreme Gradient Boosting
======
An optimized general purpose gradient boosting library. The library is parallelized using OpenMP. It implements machine learning algorithm under gradient boosting framework, including generalized linear model and gradient boosted regression tree.
An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version.
It implements machine learning algorithm under gradient boosting framework, including generalized linear model and gradient boosted regression tree (GBDT). XGBoost can also also distributed and scale to even larger data.
Contributors: https://github.com/tqchen/xgboost/graphs/contributors
@ -10,6 +11,8 @@ Questions and Issues: [https://github.com/tqchen/xgboost/issues](https://github.
Examples Code: [Learning to use xgboost by examples](demo)
Distributed Version: [Distributed XGBoost](multi-node)
Notes on the Code: [Code Guide](src)
Learning about the model: [Introduction to Boosted Trees](http://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf)
@ -19,10 +22,14 @@ Learning about the model: [Introduction to Boosted Trees](http://homes.cs.washin
What's New
=====
* [Distributed XGBoost](multi-node) is now available!!
* New features in the lastest changes :)
- Distributed version that scale xgboost to even larger problems with cluster
- Feature importance visualization in R module, thanks to Michael Benesty
- Predict leaf index, see [demo/guide-python/predict_leaf_indices.py](demo/guide-python/predict_leaf_indices.py)
* XGBoost wins [Tradeshift Text Classification](https://kaggle2.blob.core.windows.net/forum-message-attachments/60041/1813/TradeshiftTextClassification.pdf?sv=2012-02-12&se=2015-01-02T13%3A55%3A16Z&sr=b&sp=r&sig=5MHvyjCLESLexYcvbSRFumGQXCS7MVmfdBIY3y01tMk%3D)
* XGBoost wins [HEP meets ML Award in Higgs Boson Challenge](http://atlas.ch/news/2014/machine-learning-wins-the-higgs-challenge.html)
* Thanks to Bing Xu, [XGBoost.jl](https://github.com/antinucleon/XGBoost.jl) allows you to use xgboost from Julia
* See the updated [demo folder](demo) for feature walkthrough
* Thanks to Tong He, the new [R package](R-package) is available
Features
@ -34,10 +41,15 @@ Features
* Speed: XGBoost is very fast
- IN [demo/higgs/speedtest.py](demo/kaggle-higgs/speedtest.py), kaggle higgs data it is faster(on our machine 20 times faster using 4 threads) than sklearn.ensemble.GradientBoostingClassifier
* Layout of gradient boosting algorithm to support user defined objective
* Distributed and portable
- The distributed version of xgboost is highly portable and can be used in different platforms
- It inheritates all the optimizations made in single machine mode, maximumly utilize the resources using both multi-threading and distributed computing.
Build
=====
* Run ```bash build.sh``` (you can also type make)
* If you have C++11 compiler, it is recommended to type ```make cxx11=1```
- C++11 is not used by default
* If your compiler does not come with OpenMP support, it will fire an warning telling you that the code will compile into single thread mode, and you will get single thread xgboost
* You may get a error: -lgomp is not found
- You can type ```make no_omp=1```, this will get you single thread xgboost

View File

@ -3,6 +3,9 @@
# basically, it first try to make with OpenMP, if fails, disable OpenMP and make again
# This will automatically make xgboost for MAC users who do not have openmp support
# In most cases, type make will give what you want
# download rabit
if make; then
echo "Successfully build multi-thread xgboost"
else

View File

@ -32,6 +32,9 @@ This is a list of short codes introducing different functionalities of xgboost a
[python](guide-python/cross_validation.py)
[R](../R-package/demo/cross_validation.R)
[Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/cross_validation.jl)
* Predicting leaf indices
[python](guide-python/predict_leaf_indices.py)
[R](../R-package/demo/predict_leaf_indices.R)
Basic Examples by Tasks
====

View File

@ -6,3 +6,4 @@ XGBoost Python Feature Walkthrough
* [Predicting using first n trees](predict_first_ntree.py)
* [Generalized Linear Model](generalized_linear_model.py)
* [Cross validation](cross_validation.py)
* [Predicting leaf indices](predict_leaf_indices.py)

View File

@ -0,0 +1,22 @@
#!/usr/bin/python
import sys
import numpy as np
sys.path.append('../../wrapper')
import xgboost as xgb
### load data in do training
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 3
bst = xgb.train(param, dtrain, num_round, watchlist)
print ('start testing predict the leaf indices')
### predict using first 2 tree
leafindex = bst.predict(dtest, ntree_limit=2, pred_leaf = True)
print leafindex.shape
print leafindex
### predict all trees
leafindex = bst.predict(dtest, pred_leaf = True)
print leafindex.shape

View File

@ -4,4 +4,5 @@ python custom_objective.py
python boost_from_prediction.py
python generalized_linear_model.py
python cross_validation.py
rm -rf *~ *.model *.buffer
python predict_leaf_indices.py
rm -rf *~ *.model *.buffer

37
multi-node/README.md Normal file
View File

@ -0,0 +1,37 @@
Distributed XGBoost
======
This folder contains information of Distributed XGBoost (Distributed GBDT).
* The distributed version is built on Rabit:[Reliable Allreduce and Broadcast Library](https://github.com/tqchen/rabit)
- Rabit is a portable library that provides fault-tolerance for Allreduce calls for distributed machine learning
- This makes xgboost portable and fault-tolerant against node failures
* You can run Distributed XGBoost on platforms including Hadoop(see [hadoop folder](hadoop)) and MPI
- Rabit only replies a platform to start the programs, so it should be easy to port xgboost to most platforms
Build
=====
* In the root folder, type ```make```
- If you have C++11 compiler, it is recommended to use ```make cxx11=1```
Notes
====
* Rabit handles all the fault tolerant and communications efficiently, we only use platform specific command to start programs
- The Hadoop version does not rely on Mapreduce to do iterations
- You can expect xgboost not suffering the drawbacks of iterative MapReduce program
* The design choice was made because Allreduce is very natural and efficient for distributed tree building
- In current version of xgboost, the distributed version is only adds several lines of Allreduce synchronization code
* The multi-threading nature of xgboost is inheritated in distributed mode
- This means xgboost efficiently use all the threads in one machine, and communicates only between machines
- Remember to run on xgboost process per machine and this will give you maximum speedup
* For more information about rabit and how it works, see the [Rabit's Tutorial](https://github.com/tqchen/rabit/tree/master/guide)
Solvers
=====
There are two solvers in distributed xgboost. You can check for local demo of the two solvers, see [row-split](row-split) and [col-split](col-split)
* Column-based solver split data by column, each node work on subset of columns,
it uses exactly the same algorithm as single node version.
* Row-based solver split data by row, each node work on subset of rows,
it uses an approximate histogram count algorithm, and will only examine subset of
potential split points as opposed to all split points.
- This is the mode used by current hadoop version, since usually data was stored by rows in many industry system

View File

@ -0,0 +1,19 @@
Distributed XGBoost: Column Split Version
====
* run ```bash mushroom-col-rabit.sh <n-process>```
- mushroom-col-rabit.sh starts xgboost job using rabit's allreduce
* run ```bash mushroom-col-rabit-mock.sh <n-process>```
- mushroom-col-rabit-mock.sh starts xgboost job using rabit's allreduce, inserts suicide signal at certain point and test recovery
How to Use
====
* First split the data by column,
* In the config, specify data file as containing a wildcard %d, where %d is the rank of the node, each node will load their part of data
* Enable column split mode by ```dsplit=col```
Notes
====
* The code is multi-threaded, so you want to run one process per node
* The code will work correctly as long as union of each column subset is all the columns we are interested in.
- The column subset can overlap with each other.
* It uses exactly the same algorithm as single node version, to examine all potential split points.

View File

@ -0,0 +1,25 @@
#!/bin/bash
if [[ $# -ne 1 ]]
then
echo "Usage: nprocess"
exit -1
fi
#
# This script is same as mushroom-col except that we will be using xgboost instead of xgboost-mpi
# xgboost used built in tcp-based allreduce module, and can be run on more enviroment, so long as we know how to start job by modifying ../submit_job_tcp.py
#
rm -rf train.col* *.model
k=$1
# split the lib svm file into k subfiles
python splitsvm.py ../../demo/data/agaricus.txt.train train $k
# run xgboost mpi
../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost.mock mushroom-col.conf dsplit=col mock=0,2,0,0 mock=1,2,0,0 mock=2,2,8,0 mock=2,3,0,0
# the model can be directly loaded by single machine xgboost solver, as usuall
#../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
#cat dump.nice.$k.txt

View File

@ -0,0 +1,28 @@
#!/bin/bash
if [[ $# -ne 1 ]]
then
echo "Usage: nprocess"
exit -1
fi
#
# This script is same as mushroom-col except that we will be using xgboost instead of xgboost-mpi
# xgboost used built in tcp-based allreduce module, and can be run on more enviroment, so long as we know how to start job by modifying ../submit_job_tcp.py
#
rm -rf train.col* *.model
k=$1
# split the lib svm file into k subfiles
python splitsvm.py ../../demo/data/agaricus.txt.train train $k
# run xgboost mpi
../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf dsplit=col
# the model can be directly loaded by single machine xgboost solver, as usuall
../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
# run for one round, and continue training
../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf dsplit=col num_round=1
../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf mushroom-col.conf dsplit=col model_in=0001.model
cat dump.nice.$k.txt

View File

@ -0,0 +1,35 @@
# General Parameters, see comment for each definition
# choose the booster, can be gbtree or gblinear
booster = gbtree
# choose logistic regression loss function for binary classification
objective = binary:logistic
# Tree Booster Parameters
# step size shrinkage
eta = 1.0
# minimum loss reduction required to make a further partition
gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
min_child_weight = 1
# maximum depth of a tree
max_depth = 3
# Task Parameters
# the number of round to do boosting
num_round = 2
# 0 means do not save any model except the final round model
save_period = 0
use_buffer = 0
# The path of training data %d is the wildcard for the rank of the data
# The idea is each process take a feature matrix with subset of columns
#
data = "train.col%d"
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
eval[test] = "../../demo/data/agaricus.txt.test"
# evaluate on training data as well each round
eval_train = 1
# The path of test data, need to use full data of test, try not use it, or keep an subsampled version
test:data = "../../demo/data/agaricus.txt.test"

View File

@ -0,0 +1,32 @@
#!/usr/bin/python
import sys
import random
# split libsvm file into different subcolumns
if len(sys.argv) < 4:
print ('Usage:<fin> <fo> k')
exit(0)
random.seed(10)
fmap = {}
k = int(sys.argv[3])
fi = open( sys.argv[1], 'r' )
fos = []
for i in range(k):
fos.append(open( sys.argv[2]+'.col%d' % i, 'w' ))
for l in open(sys.argv[1]):
arr = l.split()
for f in fos:
f.write(arr[0])
for it in arr[1:]:
fid = int(it.split(':')[0])
if fid not in fmap:
fmap[fid] = random.randint(0, k-1)
fos[fmap[fid]].write(' '+it)
for f in fos:
f.write('\n')
for f in fos:
f.close()

View File

@ -0,0 +1,43 @@
Distributed XGBoost: Hadoop Version
====
* The script in this fold shows an example of how to run distributed xgboost on hadoop platform.
* It relies on [Rabit Library](https://github.com/tqchen/rabit) (Reliable Allreduce and Broadcast Interface) and Hadoop Streaming. Rabit provides an interface to aggregate gradient values and split statistics, that allow xgboost to run reliably on hadoop. You do not need to care how to update model in each iteration, just use the script ```rabit_hadoop.py```. For those who want to know how it exactly works, plz refer to the main page of [Rabit](https://github.com/tqchen/rabit).
* Quick start: run ```bash run_mushroom.sh <n_hadoop_workers> <n_thread_per_worker> <path_in_HDFS>```
- This is the hadoop version of binary classification example in the demo folder.
- More info of the usage of xgboost can be refered to [wiki page](https://github.com/tqchen/xgboost/wiki)
Before you run the script
====
* Make sure you have set up the hadoop environment.
* If you want to only use single machine multi-threading, try single machine examples in the [demo folder](../../demo).
* Build: run ```bash build.sh``` in the root folder, it will automatically download rabit and build xgboost.
* Check whether the environment variable $HADOOP_HOME exists (e.g. run ```echo $HADOOP_HOME```). If not, please set up hadoop-streaming.jar path in rabit_hadoop.py.
How to Use
====
* Input data format: LIBSVM format. The example here uses generated data in demo/data folder.
* Put the training data in HDFS (hadoop distributed file system).
* Use rabit ```rabit_hadoop.py``` to submit training task to hadoop, and save the final model file.
* Get the final model file from HDFS, and locally do prediction as well as visualization of model.
Single machine vs Hadoop version
====
If you have used xgboost (single machine version) before, this section will show you how to run xgboost on hadoop with a slight modification on conf file.
* Hadoop version needs to set up how many slave nodes/machines/workers you would like to use at first.
* IO: instead of reading and writing file locally, hadoop version use "stdin" to read training file and use "stdout" to store the final model file. Therefore, you should change the parameters "data" and "model_out" in conf file to ```data=stdin``` and ```model_out=stdout```.
* File cache: ```rabit_hadoop.py``` also provide several ways to cache necesary files, including binary file (xgboost), conf file, small size of dataset which used for eveluation during the training process, and so on.
- Any file used in config file, excluding stdin, should be cached in the script. ```rabit_hadoop.py``` will automatically cache files in the command line. For example, ```rabit_hadoop.py -n 3 -i $hdfsPath/agaricus.txt.train -o $hdfsPath/mushroom.final.model $localPath/xgboost mushroom.hadoop.conf``` will cache "xgboost" and "mushroom.hadoop.conf".
- You could also use "-f" to manually cache one or more files, like ```-f file1 -f file2``` or ```-f file1#file2``` (use "#" to spilt file names).
- The local path of cached files in command is "./".
- Since the cached files will be packaged and delivered to hadoop slave nodes, the cached file should not be large. For instance, trying to cache files of GB size may reduce the performance.
* Hadoop version also support evaluting each training round. You just need to modify parameters "eval_train".
* More details of submission can be referred to the usage of ```rabit_hadoop.py```.
* The model saved by hadoop version is compatible with single machine version.
Notes
====
* The code has been tested on MapReduce 1 (MRv1) and YARN.
- We recommend to run it on MapReduce 2 (MRv2, YARN) so that multi-threading can be enabled.
* The code is optimized with multi-threading, so you will want to run one xgboost per node/worker for best performance.
- You will want to set <n_thread_per_worker> to be number of cores you have on each machine.
- You will need YARN to set specify number of cores of each worker

View File

@ -0,0 +1,39 @@
# General Parameters, see comment for each definition
# choose the booster, can be gbtree or gblinear
booster = gbtree
# choose logistic regression loss function for binary classification
objective = binary:logistic
# Tree Booster Parameters
# step size shrinkage
eta = 1.0
# minimum loss reduction required to make a further partition
gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
min_child_weight = 1
# maximum depth of a tree
max_depth = 3
# Task Parameters
# the number of round to do boosting
num_round = 2
# 0 means do not save any model except the final round model
save_period = 0
# evaluate on training data as well each round
# eval_train = 1
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
# eval[test] = "agaricus.txt.test"
# Plz donot modify the following parameters
# The path of training data
data = stdin
# The path of model file
model_out = stdout
# split pattern of xgboost
dsplit = row
<<<<<<< HEAD
# evaluate on training data as well each round
eval_train = 1
=======
>>>>>>> df3f87c182cc12ccc9ac1f9cafbe01ea7ebf0ac4

View File

@ -0,0 +1,23 @@
#!/bin/bash
if [ "$#" -lt 3 ];
then
echo "Usage: <nworkers> <nthreads> <path_in_HDFS>"
exit -1
fi
# put the local training file to HDFS
hadoop fs -mkdir $3/data
hadoop fs -put ../../demo/data/agaricus.txt.train $3/data
../../subtree/rabit/tracker/rabit_hadoop.py -n $1 -nt $2 -i $3/data/agaricus.txt.train -o $3/mushroom.final.model ../../xgboost mushroom.hadoop.conf nthread=$2
# get the final model file
hadoop fs -get $3/mushroom.final.model/part-00000 ./final.model
# output prediction task=pred
../../xgboost mushroom.hadoop.conf task=pred model_in=final.model test:data=../../demo/data/agaricus.txt.test
# print the boosters of final.model in dump.raw.txt
../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
# use the feature map in printing for better visualization
../../xgboost mushroom.hadoop.conf task=dump model_in=final.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
cat dump.nice.txt

View File

@ -0,0 +1,18 @@
Distributed XGBoost: Row Split Version
====
* You might be interested to checkout the [Hadoop example](../hadoop)
* Machine Rabit: run ```bash machine-row-rabit.sh <n-mpi-process>```
- machine-col-rabit.sh starts xgboost job using rabit
How to Use
====
* First split the data by rows
* In the config, specify data file as containing a wildcard %d, where %d is the rank of the node, each node will load their part of data
* Enable ow split mode by ```dsplit=row```
Notes
====
* The code is multi-threaded, so you want to run one xgboost-mpi per node
* Row-based solver split data by row, each node work on subset of rows, it uses an approximate histogram count algorithm,
and will only examine subset of potential split points as opposed to all split points.

View File

@ -0,0 +1,20 @@
#!/bin/bash
if [[ $# -ne 1 ]]
then
echo "Usage: nprocess"
exit -1
fi
rm -rf train-machine.row* *.model
k=$1
# make machine data
cd ../../demo/regression/
python mapfeat.py
python mknfold.py machine.txt 1
cd -
# split the lib svm file into k subfiles
python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
# run xgboost mpi
../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost.mock machine-row.conf dsplit=row num_round=3 mock=1,1,1,0 mock=0,0,3,0 mock=2,2,3,0

View File

@ -0,0 +1,24 @@
#!/bin/bash
if [[ $# -ne 1 ]]
then
echo "Usage: nprocess"
exit -1
fi
rm -rf train-machine.row* *.model
k=$1
# make machine data
cd ../../demo/regression/
python mapfeat.py
python mknfold.py machine.txt 1
cd -
# split the lib svm file into k subfiles
python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
# run xgboost mpi
../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=3 eval_train=1
# run xgboost-mpi save model 0001, continue to run from existing model
../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=1
../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=2 model_in=0001.model

View File

@ -0,0 +1,30 @@
# General Parameters, see comment for each definition
# choose the tree booster, can also change to gblinear
booster = gbtree
# this is the only difference with classification, use reg:linear to do linear classification
# when labels are in [0,1] we can also use reg:logistic
objective = reg:linear
# Tree Booster Parameters
# step size shrinkage
eta = 1.0
# minimum loss reduction required to make a further partition
gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
min_child_weight = 1
# maximum depth of a tree
max_depth = 3
# Task parameters
# the number of round to do boosting
num_round = 2
# 0 means do not save any model except the final round model
save_period = 0
use_buffer = 0
# The path of training data
data = "train-machine.row%d"
# The path of validation data, used to monitor training process, here [test] sets name of the validation set
eval[test] = "../../demo/regression/machine.txt.test"
# The path of test data
test:data = "../../demo/regression/machine.txt.test"

View File

@ -0,0 +1,24 @@
#!/usr/bin/python
import sys
import random
# split libsvm file into different rows
if len(sys.argv) < 4:
print ('Usage:<fin> <fo> k')
exit(0)
random.seed(10)
k = int(sys.argv[3])
fi = open( sys.argv[1], 'r' )
fos = []
for i in range(k):
fos.append(open( sys.argv[2]+'.row%d' % i, 'w' ))
for l in open(sys.argv[1]):
i = random.randint(0, k-1)
fos[i].write(l)
for f in fos:
f.close()

View File

@ -138,9 +138,10 @@ class IFMatrix {
virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) = 0;
/*!
* \brief check if column access is supported, if not, initialize column access
* \param enabled whether certain feature should be included in column access
* \param subsample subsample ratio when generating column access
*/
virtual void InitColAccess(float subsample) = 0;
virtual void InitColAccess(const std::vector<bool> &enabled, float subsample) = 0;
// the following are column meta data, should be able to answer them fast
/*! \return whether column access is enabled */
virtual bool HaveColAccess(void) const = 0;

View File

@ -33,16 +33,17 @@ class GBLinear : public IGradBooster {
model.param.SetParam(name, val);
}
}
virtual void LoadModel(utils::IStream &fi) {
virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) {
model.LoadModel(fi);
}
virtual void SaveModel(utils::IStream &fo) const {
virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const {
model.SaveModel(fo);
}
virtual void InitModel(void) {
model.InitModel();
}
virtual void DoBoost(IFMatrix *p_fmat,
int64_t buffer_offset,
const BoosterInfo &info,
std::vector<bst_gpair> *in_gpair) {
std::vector<bst_gpair> &gpair = *in_gpair;
@ -135,8 +136,22 @@ class GBLinear : public IGradBooster {
}
}
}
virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
virtual void Predict(const SparseBatch::Inst &inst,
std::vector<float> *out_preds,
unsigned ntree_limit,
unsigned root_index) {
const int ngroup = model.param.num_output_group;
for (int gid = 0; gid < ngroup; ++gid) {
this->Pred(inst, BeginPtr(*out_preds));
}
}
virtual void PredictLeaf(IFMatrix *p_fmat,
const BoosterInfo &info,
std::vector<float> *out_preds,
unsigned ntree_limit = 0) {
utils::Error("gblinear does not support predict leaf index");
}
virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
std::stringstream fo("");
fo << "bias:\n";
for (int i = 0; i < model.param.num_output_group; ++i) {
@ -151,8 +166,8 @@ class GBLinear : public IGradBooster {
std::vector<std::string> v;
v.push_back(fo.str());
return v;
}
}
protected:
inline void Pred(const RowBatch::Inst &inst, float *preds) {
for (int gid = 0; gid < model.param.num_output_group; ++gid) {

View File

@ -1,5 +1,6 @@
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#define NOMINMAX
#include <cstring>
#include "./gbm.h"
#include "./gbtree-inl.hpp"

View File

@ -27,25 +27,44 @@ class IGradBooster {
/*!
* \brief load model from stream
* \param fi input stream
* \param with_pbuffer whether the incoming data contains pbuffer
*/
virtual void LoadModel(utils::IStream &fi) = 0;
virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) = 0;
/*!
* \brief save model to stream
* \param fo output stream
* \param with_pbuffer whether save out pbuffer
*/
virtual void SaveModel(utils::IStream &fo) const = 0;
virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const = 0;
/*!
* \brief initialize the model
*/
virtual void InitModel(void) = 0;
/*!
* \brief reset the predict buffer
* this will invalidate all the previous cached results
* and recalculate from scratch
*/
virtual void ResetPredBuffer(size_t num_pbuffer) {}
/*!
* \brief whether the model allow lazy checkpoint
* return true if model is only updated in DoBoost
* after all Allreduce calls
*/
virtual bool AllowLazyCheckPoint(void) const {
return false;
}
/*!
* \brief peform update to the model(boosting)
* \param p_fmat feature matrix that provide access to features
* \param buffer_offset buffer index offset of these instances, if equals -1
* this means we do not have buffer index allocated to the gbm
* \param info meta information about training
* \param in_gpair address of the gradient pair statistics of the data
* the booster may change content of gpair
*/
virtual void DoBoost(IFMatrix *p_fmat,
int64_t buffer_offset,
const BoosterInfo &info,
std::vector<bst_gpair> *in_gpair) = 0;
/*!
@ -64,7 +83,36 @@ class IGradBooster {
int64_t buffer_offset,
const BoosterInfo &info,
std::vector<float> *out_preds,
unsigned ntree_limit = 0) = 0;
unsigned ntree_limit = 0) = 0;
/*!
* \brief online prediction funciton, predict score for one instance at a time
* NOTE: use the batch prediction interface if possible, batch prediction is usually
* more efficient than online prediction
* This function is NOT threadsafe, make sure you only call from one thread
*
* \param inst the instance you want to predict
* \param out_preds output vector to hold the predictions
* \param ntree_limit limit the number of trees used in prediction
* \param root_index the root index
* \sa Predict
*/
virtual void Predict(const SparseBatch::Inst &inst,
std::vector<float> *out_preds,
unsigned ntree_limit = 0,
unsigned root_index = 0) = 0;
/*!
* \brief predict the leaf index of each tree, the output will be nsample * ntree vector
* this is only valid in gbtree predictor
* \param p_fmat feature matrix
* \param info extra side information that may be needed for prediction
* \param out_preds output vector to hold the predictions
* \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means
* we do not limit number of trees, this parameter is only valid for gbtree, but not for gblinear
*/
virtual void PredictLeaf(IFMatrix *p_fmat,
const BoosterInfo &info,
std::vector<float> *out_preds,
unsigned ntree_limit = 0) = 0;
/*!
* \brief dump the model in text format
* \param fmap feature map that may help give interpretations of feature

View File

@ -19,6 +19,8 @@ namespace gbm {
*/
class GBTree : public IGradBooster {
public:
GBTree(void) {
}
virtual ~GBTree(void) {
this->Clear();
}
@ -37,7 +39,7 @@ class GBTree : public IGradBooster {
tparam.SetParam(name, val);
if (trees.size() == 0) mparam.SetParam(name, val);
}
virtual void LoadModel(utils::IStream &fi) {
virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) {
this->Clear();
utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
"GBTree: invalid model file");
@ -51,7 +53,7 @@ class GBTree : public IGradBooster {
utils::Check(fi.Read(&tree_info[0], sizeof(int) * mparam.num_trees) != 0,
"GBTree: invalid model file");
}
if (mparam.num_pbuffer != 0) {
if (mparam.num_pbuffer != 0 && with_pbuffer) {
pred_buffer.resize(mparam.PredBufferSize());
pred_counter.resize(mparam.PredBufferSize());
utils::Check(fi.Read(&pred_buffer[0], pred_buffer.size() * sizeof(float)) != 0,
@ -60,7 +62,7 @@ class GBTree : public IGradBooster {
"GBTree: invalid model file");
}
}
virtual void SaveModel(utils::IStream &fo) const {
virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const {
utils::Assert(mparam.num_trees == static_cast<int>(trees.size()), "GBTree");
fo.Write(&mparam, sizeof(ModelParam));
for (size_t i = 0; i < trees.size(); ++i) {
@ -69,7 +71,7 @@ class GBTree : public IGradBooster {
if (tree_info.size() != 0) {
fo.Write(&tree_info[0], sizeof(int) * tree_info.size());
}
if (mparam.num_pbuffer != 0) {
if (mparam.num_pbuffer != 0 && with_pbuffer) {
fo.Write(&pred_buffer[0], pred_buffer.size() * sizeof(float));
fo.Write(&pred_counter[0], pred_counter.size() * sizeof(unsigned));
}
@ -82,12 +84,23 @@ class GBTree : public IGradBooster {
utils::Assert(mparam.num_trees == 0, "GBTree: model already initialized");
utils::Assert(trees.size() == 0, "GBTree: model already initialized");
}
virtual void ResetPredBuffer(size_t num_pbuffer) {
mparam.num_pbuffer = static_cast<int64_t>(num_pbuffer);
pred_buffer.clear(); pred_counter.clear();
pred_buffer.resize(mparam.PredBufferSize(), 0.0f);
pred_counter.resize(mparam.PredBufferSize(), 0);
}
virtual bool AllowLazyCheckPoint(void) const {
return !(tparam.distcol_mode != 0 && mparam.num_output_group != 1);
}
virtual void DoBoost(IFMatrix *p_fmat,
int64_t buffer_offset,
const BoosterInfo &info,
std::vector<bst_gpair> *in_gpair) {
const std::vector<bst_gpair> &gpair = *in_gpair;
if (mparam.num_output_group == 1) {
this->BoostNewTrees(gpair, p_fmat, info, 0);
std::vector<std::vector<tree::RegTree*> > new_trees;
if (mparam.num_output_group == 1) {
new_trees.push_back(BoostNewTrees(gpair, p_fmat, buffer_offset, info, 0));
} else {
const int ngroup = mparam.num_output_group;
utils::Check(gpair.size() % ngroup == 0,
@ -99,15 +112,18 @@ class GBTree : public IGradBooster {
for (bst_omp_uint i = 0; i < nsize; ++i) {
tmp[i] = gpair[i * ngroup + gid];
}
this->BoostNewTrees(tmp, p_fmat, info, gid);
new_trees.push_back(BoostNewTrees(tmp, p_fmat, buffer_offset, info, gid));
}
}
for (int gid = 0; gid < mparam.num_output_group; ++gid) {
this->CommitModel(new_trees[gid], gid);
}
}
virtual void Predict(IFMatrix *p_fmat,
int64_t buffer_offset,
const BoosterInfo &info,
std::vector<float> *out_preds,
unsigned ntree_limit = 0) {
unsigned ntree_limit = 0) {
int nthread;
#pragma omp parallel
{
@ -117,7 +133,6 @@ class GBTree : public IGradBooster {
for (int i = 0; i < nthread; ++i) {
thread_temp[i].Init(mparam.num_feature);
}
std::vector<float> &preds = *out_preds;
const size_t stride = info.num_row * mparam.num_output_group;
preds.resize(stride * (mparam.size_leaf_vector+1));
@ -144,6 +159,38 @@ class GBTree : public IGradBooster {
}
}
}
}
virtual void Predict(const SparseBatch::Inst &inst,
std::vector<float> *out_preds,
unsigned ntree_limit,
unsigned root_index) {
if (thread_temp.size() == 0) {
thread_temp.resize(1, tree::RegTree::FVec());
thread_temp[0].Init(mparam.num_feature);
}
out_preds->resize(mparam.num_output_group * (mparam.size_leaf_vector+1));
// loop over output groups
for (int gid = 0; gid < mparam.num_output_group; ++gid) {
this->Pred(inst, -1, gid, root_index, &thread_temp[0],
&(*out_preds)[gid], mparam.num_output_group,
ntree_limit);
}
}
virtual void PredictLeaf(IFMatrix *p_fmat,
const BoosterInfo &info,
std::vector<float> *out_preds,
unsigned ntree_limit) {
int nthread;
#pragma omp parallel
{
nthread = omp_get_num_threads();
}
thread_temp.resize(nthread, tree::RegTree::FVec());
for (int i = 0; i < nthread; ++i) {
thread_temp[i].Init(mparam.num_feature);
}
this->PredPath(p_fmat, info, out_preds, ntree_limit);
}
virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
std::vector<std::string> dump;
@ -184,13 +231,15 @@ class GBTree : public IGradBooster {
tparam.updater_initialized = 1;
}
// do group specific group
inline void BoostNewTrees(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat,
const BoosterInfo &info,
int bst_group) {
inline std::vector<tree::RegTree*>
BoostNewTrees(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat,
int64_t buffer_offset,
const BoosterInfo &info,
int bst_group) {
std::vector<tree::RegTree *> new_trees;
this->InitUpdater();
// create the trees
std::vector<tree::RegTree *> new_trees;
for (int i = 0; i < tparam.num_parallel_tree; ++i) {
new_trees.push_back(new tree::RegTree());
for (size_t j = 0; j < cfg.size(); ++j) {
@ -201,13 +250,52 @@ class GBTree : public IGradBooster {
// update the trees
for (size_t i = 0; i < updaters.size(); ++i) {
updaters[i]->Update(gpair, p_fmat, info, new_trees);
}
// optimization, update buffer, if possible
// this is only under distributed column mode
// for safety check of lazy checkpoint
if (
buffer_offset >= 0 &&
new_trees.size() == 1 && updaters.size() > 0 &&
updaters.back()->GetLeafPosition() != NULL) {
utils::Check(info.num_row == p_fmat->buffered_rowset().size(),
"distributed mode is not compatible with prob_buffer_row");
this->UpdateBufferByPosition(p_fmat,
buffer_offset, bst_group,
*new_trees[0],
updaters.back()->GetLeafPosition());
}
// push back to model
return new_trees;
}
// commit new trees all at once
inline void CommitModel(const std::vector<tree::RegTree*> &new_trees, int bst_group) {
for (size_t i = 0; i < new_trees.size(); ++i) {
trees.push_back(new_trees[i]);
tree_info.push_back(bst_group);
}
mparam.num_trees += tparam.num_parallel_tree;
mparam.num_trees += static_cast<int>(new_trees.size());
}
// update buffer by pre-cached position
inline void UpdateBufferByPosition(IFMatrix *p_fmat,
int64_t buffer_offset,
int bst_group,
const tree::RegTree &new_tree,
const int* leaf_position) {
const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < ndata; ++i) {
const bst_uint ridx = rowset[i];
const int64_t bid = mparam.BufferOffset(buffer_offset + ridx, bst_group);
const int tid = leaf_position[ridx];
utils::Assert(pred_counter[bid] == trees.size(), "cached buffer not up to date");
utils::Assert(tid >= 0, "invalid leaf position");
pred_buffer[bid] += new_tree[tid].leaf_value();
for (int i = 0; i < mparam.size_leaf_vector; ++i) {
pred_buffer[bid + i + 1] += new_tree.leafvec(tid)[i];
}
pred_counter[bid] += tparam.num_parallel_tree;
}
}
// make a prediction for a single instance
inline void Pred(const RowBatch::Inst &inst,
@ -215,7 +303,8 @@ class GBTree : public IGradBooster {
int bst_group,
unsigned root_index,
tree::RegTree::FVec *p_feats,
float *out_pred, size_t stride, unsigned ntree_limit) {
float *out_pred, size_t stride,
unsigned ntree_limit) {
size_t itop = 0;
float psum = 0.0f;
// sum of leaf vector
@ -258,6 +347,39 @@ class GBTree : public IGradBooster {
out_pred[stride * (i + 1)] = vec_psum[i];
}
}
// predict independent leaf index
inline void PredPath(IFMatrix *p_fmat,
const BoosterInfo &info,
std::vector<float> *out_preds,
unsigned ntree_limit) {
// number of valid trees
if (ntree_limit == 0 || ntree_limit > trees.size()) {
ntree_limit = static_cast<unsigned>(trees.size());
}
std::vector<float> &preds = *out_preds;
preds.resize(info.num_row * ntree_limit);
// start collecting the prediction
utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
iter->BeforeFirst();
while (iter->Next()) {
const RowBatch &batch = iter->Value();
// parallel over local batch
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nsize; ++i) {
const int tid = omp_get_thread_num();
int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
tree::RegTree::FVec &feats = thread_temp[tid];
feats.Fill(batch[i]);
for (unsigned j = 0; j < ntree_limit; ++j) {
int tid = trees[j]->GetLeafIndex(feats, info.GetRoot(ridx));
preds[ridx * ntree_limit + j] = static_cast<float>(tid);
}
feats.Drop(batch[i]);
}
}
}
// --- data structure ---
/*! \brief training parameters */
struct TrainParam {
@ -270,6 +392,8 @@ class GBTree : public IGradBooster {
int num_parallel_tree;
/*! \brief whether updater is already initialized */
int updater_initialized;
/*! \brief distributed column mode */
int distcol_mode;
/*! \brief tree updater sequence */
std::string updater_seq;
// construction
@ -278,6 +402,7 @@ class GBTree : public IGradBooster {
updater_seq = "grow_colmaker,prune";
num_parallel_tree = 1;
updater_initialized = 0;
distcol_mode = 0;
}
inline void SetParam(const char *name, const char *val){
using namespace std;
@ -286,6 +411,9 @@ class GBTree : public IGradBooster {
updater_seq = val;
updater_initialized = 0;
}
if (!strcmp(name, "dsplit") && !strcmp(val, "col")) {
distcol_mode = 1;
}
if (!strcmp(name, "nthread")) {
omp_set_num_threads(nthread = atoi(val));
}

View File

@ -1,15 +1,32 @@
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#define NOMINMAX
#include <string>
#include "./io.h"
#include "../utils/io.h"
#include "../utils/utils.h"
#include "simple_dmatrix-inl.hpp"
#include "page_dmatrix-inl.hpp"
#include "page_fmatrix-inl.hpp"
// implements data loads using dmatrix simple for now
namespace xgboost {
namespace io {
DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
if (!strcmp(fname, "stdin")) {
DMatrixSimple *dmat = new DMatrixSimple();
dmat->LoadText(fname, silent);
return dmat;
}
std::string tmp_fname;
const char *fname_ext = NULL;
if (strchr(fname, ';') != NULL) {
tmp_fname = fname;
char *ptr = strchr(&tmp_fname[0], ';');
ptr[0] = '\0'; fname = &tmp_fname[0];
fname_ext = ptr + 1;
}
int magic;
utils::FileStream fs(utils::FopenCheck(fname, "rb"));
utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
@ -20,7 +37,27 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
dmat->LoadBinary(fs, silent, fname);
fs.Close();
return dmat;
}
}
if (magic == DMatrixPage::kMagic) {
if (fname_ext == NULL) {
DMatrixPage *dmat = new DMatrixPage();
dmat->Load(fs, silent, fname);
return dmat;
} else {
DMatrixColPage *dmat = new DMatrixColPage(fname_ext);
dmat->Load(fs, silent, fname, true);
return dmat;
}
}
if (magic == DMatrixColPage::kMagic) {
std::string sfname = fname;
if (fname_ext == NULL) {
sfname += ".col"; fname_ext = sfname.c_str();
}
DMatrixColPage *dmat = new DMatrixColPage(fname_ext);
dmat->Load(fs, silent, fname);
return dmat;
}
fs.Close();
DMatrixSimple *dmat = new DMatrixSimple();
@ -29,11 +66,21 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
}
void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) {
if (!strcmp(fname + strlen(fname) - 5, ".page")) {
DMatrixPage::Save(fname, dmat, silent);
return;
}
if (!strcmp(fname + strlen(fname) - 6, ".cpage")) {
DMatrixColPage::Save(fname, dmat, silent);
return;
}
if (dmat.magic == DMatrixSimple::kMagic) {
const DMatrixSimple *p_dmat = static_cast<const DMatrixSimple*>(&dmat);
p_dmat->SaveBinary(fname, silent);
} else {
utils::Error("not implemented");
DMatrixSimple smat;
smat.CopyFrom(dmat);
smat.SaveBinary(fname, silent);
}
}

278
src/io/page_dmatrix-inl.hpp Normal file
View File

@ -0,0 +1,278 @@
#ifndef XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
#define XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
/*!
* \file page_row_iter-inl.hpp
* row iterator based on sparse page
* \author Tianqi Chen
*/
#include <vector>
#include "../data.h"
#include "../utils/iterator.h"
#include "../utils/thread_buffer.h"
#include "./simple_fmatrix-inl.hpp"
namespace xgboost {
namespace io {
/*! \brief page structure that can be used to store a rowbatch */
struct RowBatchPage {
public:
explicit RowBatchPage(size_t page_size) : kPageSize(page_size) {
data_ = new int[kPageSize];
utils::Assert(data_ != NULL, "fail to allocate row batch page");
this->Clear();
}
~RowBatchPage(void) {
if (data_ != NULL) delete [] data_;
}
/*!
* \brief Push one row into page
* \param row an instance row
* \return false or true to push into
*/
inline bool PushRow(const RowBatch::Inst &row) {
const size_t dsize = row.length * sizeof(RowBatch::Entry);
if (FreeBytes() < dsize+ sizeof(int)) return false;
row_ptr(Size() + 1) = row_ptr(Size()) + row.length;
memcpy(data_ptr(row_ptr(Size())) , row.data, dsize);
++data_[0];
return true;
}
/*!
* \brief get a row batch representation from the page
* \param p_rptr a temporal space that can be used to provide
* ind_ptr storage for RowBatch
* \return a new RowBatch object
*/
inline RowBatch GetRowBatch(std::vector<size_t> *p_rptr, size_t base_rowid) {
RowBatch batch;
batch.base_rowid = base_rowid;
batch.data_ptr = this->data_ptr(0);
batch.size = static_cast<size_t>(this->Size());
std::vector<size_t> &rptr = *p_rptr;
rptr.resize(this->Size() + 1);
for (size_t i = 0; i < rptr.size(); ++i) {
rptr[i] = static_cast<size_t>(this->row_ptr(static_cast<int>(i)));
}
batch.ind_ptr = &rptr[0];
return batch;
}
/*! \brief get i-th row from the batch */
inline RowBatch::Inst operator[](int i) {
return RowBatch::Inst(data_ptr(0) + row_ptr(i),
static_cast<bst_uint>(row_ptr(i+1) - row_ptr(i)));
}
/*!
* \brief clear the page, cleanup the content
*/
inline void Clear(void) {
memset(&data_[0], 0, sizeof(int) * kPageSize);
}
/*!
* \brief load one page form instream
* \return true if loading is successful
*/
inline bool Load(utils::IStream &fi) {
return fi.Read(&data_[0], sizeof(int) * kPageSize) != 0;
}
/*! \brief save one page into outstream */
inline void Save(utils::IStream &fo) {
fo.Write(&data_[0], sizeof(int) * kPageSize);
}
/*! \return number of elements */
inline int Size(void) const {
return data_[0];
}
protected:
/*! \return number of elements */
inline size_t FreeBytes(void) {
return (kPageSize - (Size() + 2)) * sizeof(int) -
row_ptr(Size()) * sizeof(RowBatch::Entry);
}
/*! \brief equivalent row pointer at i */
inline int& row_ptr(int i) {
return data_[kPageSize - i - 1];
}
inline RowBatch::Entry* data_ptr(int i) {
return (RowBatch::Entry*)(&data_[1]) + i;
}
// content of data
int *data_;
// page size
const size_t kPageSize;
};
/*! \brief thread buffer iterator */
class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
public:
ThreadRowPageIterator(void) {
itr.SetParam("buffer_size", "2");
page_ = NULL;
base_rowid_ = 0;
}
virtual ~ThreadRowPageIterator(void) {}
virtual void Init(void) {
}
virtual void BeforeFirst(void) {
itr.BeforeFirst();
base_rowid_ = 0;
}
virtual bool Next(void) {
if (!itr.Next(page_)) return false;
out_ = page_->GetRowBatch(&tmp_ptr_, base_rowid_);
base_rowid_ += out_.size;
return true;
}
virtual const RowBatch &Value(void) const {
return out_;
}
/*! \brief load and initialize the iterator with fi */
inline void Load(const utils::FileStream &fi) {
itr.get_factory().SetFile(fi);
itr.Init();
this->BeforeFirst();
}
/*!
* \brief save a row iterator to output stream, in row iterator format
*/
inline static void Save(utils::IIterator<RowBatch> *iter,
utils::IStream &fo) {
RowBatchPage page(kPageSize);
iter->BeforeFirst();
while (iter->Next()) {
const RowBatch &batch = iter->Value();
for (size_t i = 0; i < batch.size; ++i) {
if (!page.PushRow(batch[i])) {
page.Save(fo);
page.Clear();
utils::Check(page.PushRow(batch[i]), "row is too big");
}
}
}
if (page.Size() != 0) page.Save(fo);
}
/*! \brief page size 64 MB */
static const size_t kPageSize = 64 << 18;
private:
// base row id
size_t base_rowid_;
// temporal ptr
std::vector<size_t> tmp_ptr_;
// output data
RowBatch out_;
// page pointer type
typedef RowBatchPage* PagePtr;
// loader factory for page
struct Factory {
public:
size_t file_begin_;
utils::FileStream fi;
Factory(void) {}
inline void SetFile(const utils::FileStream &fi) {
this->fi = fi;
file_begin_ = this->fi.Tell();
}
inline bool Init(void) {
return true;
}
inline void SetParam(const char *name, const char *val) {}
inline bool LoadNext(PagePtr &val) {
return val->Load(fi);
}
inline PagePtr Create(void) {
PagePtr a = new RowBatchPage(kPageSize);
return a;
}
inline void FreeSpace(PagePtr &a) {
delete a;
}
inline void Destroy(void) {
fi.Close();
}
inline void BeforeFirst(void) {
fi.Seek(file_begin_);
}
};
protected:
PagePtr page_;
utils::ThreadBuffer<PagePtr, Factory> itr;
};
/*! \brief data matrix using page */
template<int TKMagic>
class DMatrixPageBase : public DataMatrix {
public:
DMatrixPageBase(void) : DataMatrix(kMagic) {
iter_ = new ThreadRowPageIterator();
}
// virtual destructor
virtual ~DMatrixPageBase(void) {
// do not delete row iterator, since it is owned by fmat
// to be cleaned up in a more clear way
}
/*! \brief load and initialize the iterator with fi */
inline void Load(utils::FileStream &fi,
bool silent = false,
const char *fname = NULL,
bool skip_magic_check = false) {
int tmagic;
utils::Check(fi.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
if (!skip_magic_check) {
utils::Check(tmagic == magic, "invalid format,magic number mismatch");
}
this->info.LoadBinary(fi);
iter_->Load(fi);
if (!silent) {
utils::Printf("DMatrixPage: %lux%lu matrix is loaded",
static_cast<unsigned long>(info.num_row()),
static_cast<unsigned long>(info.num_col()));
if (fname != NULL) {
utils::Printf(" from %s\n", fname);
} else {
utils::Printf("\n");
}
if (info.group_ptr.size() != 0) {
utils::Printf("data contains %u groups\n", (unsigned)info.group_ptr.size() - 1);
}
}
}
/*! \brief save a DataMatrix as DMatrixPage*/
inline static void Save(const char* fname, const DataMatrix &mat, bool silent) {
utils::FileStream fs(utils::FopenCheck(fname, "wb"));
int magic = kMagic;
fs.Write(&magic, sizeof(magic));
mat.info.SaveBinary(fs);
ThreadRowPageIterator::Save(mat.fmat()->RowIterator(), fs);
fs.Close();
if (!silent) {
utils::Printf("DMatrixPage: %lux%lu is saved to %s\n",
static_cast<unsigned long>(mat.info.num_row()),
static_cast<unsigned long>(mat.info.num_col()), fname);
}
}
/*! \brief magic number used to identify DMatrix */
static const int kMagic = TKMagic;
protected:
/*! \brief row iterator */
ThreadRowPageIterator *iter_;
};
class DMatrixPage : public DMatrixPageBase<0xffffab02> {
public:
DMatrixPage(void) {
fmat_ = new FMatrixS(iter_);
}
virtual ~DMatrixPage(void) {
delete fmat_;
}
virtual IFMatrix *fmat(void) const {
return fmat_;
}
/*! \brief the real fmatrix */
IFMatrix *fmat_;
};
} // namespace io
} // namespace xgboost
#endif // XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_

382
src/io/page_fmatrix-inl.hpp Normal file
View File

@ -0,0 +1,382 @@
#ifndef XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
#define XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
/*!
* \file page_fmatrix-inl.hpp
* sparse page manager for fmatrix
* \author Tianqi Chen
*/
#include <vector>
#include <string>
#include <algorithm>
#include "../data.h"
#include "../utils/iterator.h"
#include "../utils/io.h"
#include "../utils/matrix_csr.h"
#include "../utils/thread_buffer.h"
namespace xgboost {
namespace io {
class CSCMatrixManager {
public:
/*! \brief in memory page */
struct Page {
public:
/*! \brief initialize the page */
explicit Page(size_t size) {
buffer.resize(size);
col_index.reserve(10);
col_data.reserve(10);
}
/*! \brief clear the page */
inline void Clear(void) {
num_entry = 0;
col_index.clear();
col_data.clear();
}
/*! \brief number of used entries */
size_t num_entry;
/*! \brief column index */
std::vector<bst_uint> col_index;
/*! \brief column data */
std::vector<ColBatch::Inst> col_data;
/*! \brief number of free entries */
inline size_t NumFreeEntry(void) const {
return buffer.size() - num_entry;
}
inline ColBatch::Entry* AllocEntry(size_t len) {
ColBatch::Entry *p_data = &buffer[0] + num_entry;
num_entry += len;
return p_data;
}
/*! \brief get underlying batch */
inline ColBatch GetBatch(void) const {
ColBatch batch;
batch.size = col_index.size();
batch.col_index = BeginPtr(col_index);
batch.col_data = BeginPtr(col_data);
return batch;
}
private:
/*! \brief buffer space, not to be changed since ready */
std::vector<ColBatch::Entry> buffer;
};
/*! \brief define type of page pointer */
typedef Page *PagePtr;
// constructor
CSCMatrixManager(void) {
fi_ = NULL;
}
/*! \brief get column pointer */
inline const std::vector<size_t> &col_ptr(void) const {
return col_ptr_;
}
inline void SetParam(const char *name, const char *val) {
}
inline PagePtr Create(void) {
return new Page(page_size_);
}
inline void FreeSpace(PagePtr &a) {
delete a;
}
inline void Destroy(void) {
}
inline void BeforeFirst(void) {
col_index_ = col_todo_;
read_top_ = 0;
}
inline bool LoadNext(PagePtr &val) {
val->Clear();
if (read_top_ >= col_index_.size()) return false;
while (read_top_ < col_index_.size()) {
if (!this->TryFill(col_index_[read_top_], val)) {
return true;
}
++read_top_;
}
return true;
}
inline bool Init(void) {
this->BeforeFirst();
return true;
}
inline void Setup(utils::ISeekStream *fi, double page_ratio) {
fi_ = fi;
fi_->Read(&begin_meta_ , sizeof(begin_meta_));
begin_data_ = static_cast<size_t>(fi->Tell());
fi_->Seek(begin_meta_);
fi_->Read(&col_ptr_);
size_t psmax = 0;
for (size_t i = 0; i < col_ptr_.size() - 1; ++i) {
psmax = std::max(psmax, col_ptr_[i+1] - col_ptr_[i]);
}
utils::Check(page_ratio >= 1.0f, "col_page_ratio must be at least 1");
page_size_ = std::max(static_cast<size_t>(psmax * page_ratio), psmax);
}
inline void SetColSet(const std::vector<bst_uint> &cset, bool setall) {
if (!setall) {
col_todo_.resize(0);
for (size_t i = 0; i < cset.size(); ++i) {
if (col_todo_[i] < static_cast<bst_uint>(col_ptr_.size() - 1)) {
col_todo_.push_back(cset[i]);
}
}
std::sort(col_todo_.begin(), col_todo_.end());
} else {
col_todo_.resize(col_ptr_.size()-1);
for (size_t i = 0; i < col_todo_.size(); ++i) {
col_todo_[i] = static_cast<bst_uint>(i);
}
}
}
private:
/*! \brief fill a page with */
inline bool TryFill(size_t cidx, Page *p_page) {
size_t len = col_ptr_[cidx+1] - col_ptr_[cidx];
if (p_page->NumFreeEntry() < len) return false;
ColBatch::Entry *p_data = p_page->AllocEntry(len);
fi_->Seek(col_ptr_[cidx] * sizeof(ColBatch::Entry) + begin_data_);
utils::Check(fi_->Read(p_data, sizeof(ColBatch::Entry) * len) != 0,
"invalid column buffer format");
p_page->col_data.push_back(ColBatch::Inst(p_data, static_cast<bst_uint>(len)));
p_page->col_index.push_back(static_cast<bst_uint>(cidx));
return true;
}
// the following are in memory auxiliary data structure
/*! \brief top of reader position */
size_t read_top_;
/*! \brief size of page */
size_t page_size_;
/*! \brief column index to be loaded */
std::vector<bst_uint> col_index_;
/*! \brief column index to be after calling before first */
std::vector<bst_uint> col_todo_;
// the following are input content
/*! \brief beginning position of data content */
size_t begin_data_;
/*! \brief size of data content */
size_t begin_meta_;
/*! \brief input stream */
utils::ISeekStream *fi_;
/*! \brief column pointer of CSC format */
std::vector<size_t> col_ptr_;
};
class ThreadColPageIterator : public utils::IIterator<ColBatch> {
public:
explicit ThreadColPageIterator(utils::ISeekStream *fi,
float page_ratio, bool silent) {
itr_.SetParam("buffer_size", "2");
itr_.get_factory().Setup(fi, page_ratio);
itr_.Init();
if (!silent) {
utils::Printf("ThreadColPageIterator: finish initialzing, %u columns\n",
static_cast<unsigned>(col_ptr().size() - 1));
}
}
virtual ~ThreadColPageIterator(void) {
}
virtual void BeforeFirst(void) {
itr_.BeforeFirst();
}
virtual bool Next(void) {
// page to be loaded
CSCMatrixManager::PagePtr page;
if (!itr_.Next(page)) return false;
out_ = page->GetBatch();
return true;
}
virtual const ColBatch &Value(void) const {
return out_;
}
inline const std::vector<size_t> &col_ptr(void) const {
return itr_.get_factory().col_ptr();
}
inline void SetColSet(const std::vector<bst_uint> &cset,
bool setall = false) {
itr_.get_factory().SetColSet(cset, setall);
}
private:
// output data
ColBatch out_;
// internal iterator
utils::ThreadBuffer<CSCMatrixManager::PagePtr, CSCMatrixManager> itr_;
};
/*!
* \brief sparse matrix that support column access
*/
class FMatrixPage : public IFMatrix {
public:
/*! \brief constructor */
FMatrixPage(utils::IIterator<RowBatch> *iter, std::string fname_buffer)
: fname_cbuffer_(fname_buffer) {
this->row_iter_ = iter;
this->col_iter_ = NULL;
this->fi_ = NULL;
}
// destructor
virtual ~FMatrixPage(void) {
if (row_iter_ != NULL) delete row_iter_;
if (col_iter_ != NULL) delete col_iter_;
if (fi_ != NULL) {
fi_->Close(); delete fi_;
}
}
/*! \return whether column access is enabled */
virtual bool HaveColAccess(void) const {
return col_iter_ != NULL;
}
/*! \brief get number of colmuns */
virtual size_t NumCol(void) const {
utils::Check(this->HaveColAccess(), "NumCol:need column access");
return col_iter_->col_ptr().size() - 1;
}
/*! \brief get number of buffered rows */
virtual const std::vector<bst_uint> &buffered_rowset(void) const {
return buffered_rowset_;
}
/*! \brief get column size */
virtual size_t GetColSize(size_t cidx) const {
const std::vector<size_t> &col_ptr = col_iter_->col_ptr();
return col_ptr[cidx+1] - col_ptr[cidx];
}
/*! \brief get column density */
virtual float GetColDensity(size_t cidx) const {
const std::vector<size_t> &col_ptr = col_iter_->col_ptr();
size_t nmiss = buffered_rowset_.size() - (col_ptr[cidx+1] - col_ptr[cidx]);
return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
}
virtual void InitColAccess(const std::vector<bool> &enabled, float pkeep = 1.0f) {
if (this->HaveColAccess()) return;
utils::Printf("start to initialize page col access\n");
if (this->LoadColData()) {
utils::Printf("loading previously saved col data\n");
return;
}
this->InitColData(pkeep, fname_cbuffer_.c_str(),
1 << 30, 5);
utils::Check(this->LoadColData(), "fail to read in column data");
utils::Printf("finish initialize page col access\n");
}
/*!
* \brief get the row iterator associated with FMatrix
*/
virtual utils::IIterator<RowBatch>* RowIterator(void) {
row_iter_->BeforeFirst();
return row_iter_;
}
/*!
* \brief get the column based iterator
*/
virtual utils::IIterator<ColBatch>* ColIterator(void) {
std::vector<bst_uint> cset;
col_iter_->SetColSet(cset, true);
col_iter_->BeforeFirst();
return col_iter_;
}
/*!
* \brief colmun based iterator
*/
virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
col_iter_->SetColSet(fset, false);
col_iter_->BeforeFirst();
return col_iter_;
}
protected:
/*!
* \brief try load column data from file
*/
inline bool LoadColData(void) {
FILE *fp = fopen64(fname_cbuffer_.c_str(), "rb");
if (fp == NULL) return false;
fi_ = new utils::FileStream(fp);
static_cast<utils::IStream*>(fi_)->Read(&buffered_rowset_);
col_iter_ = new ThreadColPageIterator(fi_, 2.0f, false);
return true;
}
/*!
* \brief intialize column data
* \param pkeep probability to keep a row
*/
inline void InitColData(float pkeep, const char *fname,
size_t buffer_size, size_t col_step) {
buffered_rowset_.clear();
utils::FileStream fo(utils::FopenCheck(fname, "wb+"));
// use 64M buffer
utils::SparseCSRFileBuilder<ColBatch::Entry> builder(&fo, buffer_size);
// start working
row_iter_->BeforeFirst();
while (row_iter_->Next()) {
const RowBatch &batch = row_iter_->Value();
for (size_t i = 0; i < batch.size; ++i) {
if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
RowBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) {
builder.AddBudget(inst[j].index);
}
}
}
}
// write buffered rowset
static_cast<utils::IStream*>(&fo)->Write(buffered_rowset_);
builder.InitStorage();
row_iter_->BeforeFirst();
size_t ktop = 0;
while (row_iter_->Next()) {
const RowBatch &batch = row_iter_->Value();
for (size_t i = 0; i < batch.size; ++i) {
if (ktop < buffered_rowset_.size() &&
buffered_rowset_[ktop] == batch.base_rowid + i) {
++ktop;
RowBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) {
builder.PushElem(inst[j].index,
ColBatch::Entry((bst_uint)(batch.base_rowid+i),
inst[j].fvalue));
}
if (ktop % 100000 == 0) {
utils::Printf("\r \r");
utils::Printf("InitCol: %lu rows ", static_cast<unsigned long>(ktop));
}
}
}
}
builder.Finalize();
builder.SortRows(ColBatch::Entry::CmpValue, col_step);
fo.Close();
}
private:
// row iterator
utils::IIterator<RowBatch> *row_iter_;
// column iterator
ThreadColPageIterator *col_iter_;
// file pointer to data
utils::FileStream *fi_;
// file name of column buffer
std::string fname_cbuffer_;
/*! \brief list of row index that are buffered */
std::vector<bst_uint> buffered_rowset_;
};
class DMatrixColPage : public DMatrixPageBase<0xffffab03> {
public:
explicit DMatrixColPage(const char *fname) {
fmat_ = new FMatrixPage(iter_, fname);
}
virtual ~DMatrixColPage(void) {
delete fmat_;
}
virtual IFMatrix *fmat(void) const {
return fmat_;
}
/*! \brief the real fmatrix */
IFMatrix *fmat_;
};
} // namespace io
} // namespace xgboost
#endif // XGBOOST_IO_PAGE_FMATRIX_INL_HPP_

View File

@ -44,8 +44,8 @@ class DMatrixSimple : public DataMatrix {
}
/*! \brief copy content data from source matrix */
inline void CopyFrom(const DataMatrix &src) {
this->info = src.info;
this->Clear();
this->info = src.info;
// clone data content in thos matrix
utils::IIterator<RowBatch> *iter = src.fmat()->RowIterator();
iter->BeforeFirst();
@ -84,7 +84,12 @@ class DMatrixSimple : public DataMatrix {
inline void LoadText(const char* fname, bool silent = false) {
using namespace std;
this->Clear();
FILE* file = utils::FopenCheck(fname, "r");
FILE* file;
if (!strcmp(fname, "stdin")) {
file = stdin;
} else {
file = utils::FopenCheck(fname, "r");
}
float label; bool init = true;
char tmp[1024];
std::vector<RowBatch::Entry> feats;
@ -112,7 +117,9 @@ class DMatrixSimple : public DataMatrix {
static_cast<unsigned long>(info.num_col()),
static_cast<unsigned long>(row_data_.size()), fname);
}
fclose(file);
if (file != stdin) {
fclose(file);
}
// try to load in additional file
std::string name = fname;
std::string gname = name + ".group";
@ -152,7 +159,7 @@ class DMatrixSimple : public DataMatrix {
inline void LoadBinary(utils::IStream &fs, bool silent = false, const char *fname = NULL) {
int tmagic;
utils::Check(fs.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
utils::Check(tmagic == kMagic, "invalid format,magic number mismatch");
utils::Check(tmagic == kMagic, "\"%s\" invalid format, magic number mismatch", fname == NULL ? "" : fname);
info.LoadBinary(fs);
FMatrixS::LoadBinary(fs, &row_ptr_, &row_data_);

View File

@ -48,9 +48,10 @@ class FMatrixS : public IFMatrix{
size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]);
return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
}
virtual void InitColAccess(float pkeep = 1.0f) {
virtual void InitColAccess(const std::vector<bool> &enabled,
float pkeep = 1.0f) {
if (this->HaveColAccess()) return;
this->InitColData(pkeep);
this->InitColData(pkeep, enabled);
}
/*!
* \brief get the row iterator associated with FMatrix
@ -75,7 +76,11 @@ class FMatrixS : public IFMatrix{
* \brief colmun based iterator
*/
virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
col_iter_.col_index_ = fset;
size_t ncol = this->NumCol();
col_iter_.col_index_.resize(0);
for (size_t i = 0; i < fset.size(); ++i) {
if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]);
}
col_iter_.SetBatch(col_ptr_, col_data_);
return &col_iter_;
}
@ -141,7 +146,7 @@ class FMatrixS : public IFMatrix{
* \brief intialize column data
* \param pkeep probability to keep a row
*/
inline void InitColData(float pkeep) {
inline void InitColData(float pkeep, const std::vector<bool> &enabled) {
buffered_rowset_.clear();
// note: this part of code is serial, todo, parallelize this transformer
utils::SparseCSRMBuilder<RowBatch::Entry> builder(col_ptr_, col_data_);
@ -150,12 +155,14 @@ class FMatrixS : public IFMatrix{
iter_->BeforeFirst();
while (iter_->Next()) {
const RowBatch &batch = iter_->Value();
for (size_t i = 0; i < batch.size; ++i) {
for (size_t i = 0; i < batch.size; ++i) {
if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
RowBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) {
builder.AddBudget(inst[j].index);
if (enabled[inst[j].index]){
builder.AddBudget(inst[j].index);
}
}
}
}
@ -172,9 +179,11 @@ class FMatrixS : public IFMatrix{
++ktop;
RowBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) {
builder.PushElem(inst[j].index,
Entry((bst_uint)(batch.base_rowid+i),
inst[j].fvalue));
if (enabled[inst[j].index]) {
builder.PushElem(inst[j].index,
Entry((bst_uint)(batch.base_rowid+i),
inst[j].fvalue));
}
}
}
}

View File

@ -11,6 +11,7 @@
#include <cmath>
#include <climits>
#include <algorithm>
#include "../sync/sync.h"
#include "./evaluation.h"
#include "./helper_utils.h"
@ -23,7 +24,8 @@ namespace learner {
template<typename Derived>
struct EvalEWiseBase : public IEvaluator {
virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const {
const MetaInfo &info,
bool distributed) const {
utils::Check(info.labels.size() != 0, "label set cannot be empty");
utils::Check(preds.size() % info.labels.size() == 0,
"label and prediction size not match");
@ -37,7 +39,11 @@ struct EvalEWiseBase : public IEvaluator {
sum += Derived::EvalRow(info.labels[i], preds[i]) * wt;
wsum += wt;
}
return Derived::GetFinal(sum, wsum);
float dat[2]; dat[0] = sum, dat[1] = wsum;
if (distributed) {
rabit::Allreduce<rabit::op::Sum>(dat, 2);
}
return Derived::GetFinal(dat[0], dat[1]);
}
/*!
* \brief to be implemented by subclass,
@ -113,7 +119,9 @@ struct EvalCTest: public IEvaluator {
return name_.c_str();
}
virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const {
const MetaInfo &info,
bool distributed) const {
utils::Check(!distributed, "metric %s do not support distributed evaluation", name_.c_str());
utils::Check(preds.size() % info.labels.size() == 0,
"label and prediction size not match");
size_t ngroup = preds.size() / info.labels.size() - 1;
@ -150,7 +158,9 @@ struct EvalAMS : public IEvaluator {
utils::Check(std::sscanf(name, "ams@%f", &ratio_) == 1, "invalid ams format");
}
virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const {
const MetaInfo &info,
bool distributed) const {
utils::Check(!distributed, "metric AMS do not support distributed evaluation");
using namespace std;
const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
@ -212,7 +222,9 @@ struct EvalPrecisionRatio : public IEvaluator{
}
}
virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const {
const MetaInfo &info,
bool distributed) const {
utils::Check(!distributed, "metric %s do not support distributed evaluation", Name());
utils::Check(info.labels.size() != 0, "label set cannot be empty");
utils::Assert(preds.size() % info.labels.size() == 0,
"label size predict size not match");
@ -252,7 +264,8 @@ struct EvalPrecisionRatio : public IEvaluator{
/*! \brief Area under curve, for both classification and rank */
struct EvalAuc : public IEvaluator {
virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const {
const MetaInfo &info,
bool distributed) const {
utils::Check(info.labels.size() != 0, "label set cannot be empty");
utils::Check(preds.size() % info.labels.size() == 0,
"label size predict size not match");
@ -299,8 +312,16 @@ struct EvalAuc : public IEvaluator {
sum_auc += sum_pospair / (sum_npos*sum_nneg);
}
}
// return average AUC over list
return static_cast<float>(sum_auc) / ngroup;
if (distributed) {
float dat[2];
dat[0] = static_cast<float>(sum_auc);
dat[1] = static_cast<float>(ngroup);
// approximately estimate auc using mean
rabit::Allreduce<rabit::op::Sum>(dat, 2);
return dat[0] / dat[1];
} else {
return static_cast<float>(sum_auc) / ngroup;
}
}
virtual const char *Name(void) const {
return "auc";
@ -311,7 +332,8 @@ struct EvalAuc : public IEvaluator {
struct EvalRankList : public IEvaluator {
public:
virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const {
const MetaInfo &info,
bool distributed) const {
utils::Check(preds.size() == info.labels.size(),
"label size predict size not match");
// quick consistency when group is not available
@ -336,7 +358,16 @@ struct EvalRankList : public IEvaluator {
sum_metric += this->EvalMetric(rec);
}
}
return static_cast<float>(sum_metric) / ngroup;
if (distributed) {
float dat[2];
dat[0] = static_cast<float>(sum_metric);
dat[1] = static_cast<float>(ngroup);
// approximately estimate auc using mean
rabit::Allreduce<rabit::op::Sum>(dat, 2);
return dat[0] / dat[1];
} else {
return static_cast<float>(sum_metric) / ngroup;
}
}
virtual const char *Name(void) const {
return name_.c_str();

View File

@ -19,9 +19,13 @@ struct IEvaluator{
* \brief evaluate a specific metric
* \param preds prediction
* \param info information, including label etc.
* \param distributed whether a call to Allreduce is needed to gather
* the average statistics across all the node,
* this is only supported by some metrics
*/
virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const = 0;
const MetaInfo &info,
bool distributed = false) const = 0;
/*! \return name of metric */
virtual const char *Name(void) const = 0;
/*! \brief virtual destructor */
@ -70,10 +74,11 @@ class EvalSet{
}
inline std::string Eval(const char *evname,
const std::vector<float> &preds,
const MetaInfo &info) const {
const MetaInfo &info,
bool distributed = false) {
std::string result = "";
for (size_t i = 0; i < evals_.size(); ++i) {
float res = evals_[i]->Eval(preds, info);
float res = evals_[i]->Eval(preds, info, distributed);
char tmp[1024];
utils::SPrintf(tmp, sizeof(tmp), "\t%s-%s:%f", evname, evals_[i]->Name(), res);
result += tmp;

View File

@ -10,6 +10,9 @@
#include <utility>
#include <string>
#include <limits>
#include "../sync/sync.h"
#include "../utils/io.h"
#include "../utils/base64.h"
#include "./objective.h"
#include "./evaluation.h"
#include "../gbm/gbm.h"
@ -21,7 +24,7 @@ namespace learner {
* \brief learner that takes do gradient boosting on specific objective functions
* and do training and prediction
*/
class BoostLearner {
class BoostLearner : public rabit::ISerializable {
public:
BoostLearner(void) {
obj_ = NULL;
@ -30,8 +33,13 @@ class BoostLearner {
name_gbm_ = "gbtree";
silent= 0;
prob_buffer_row = 1.0f;
distributed_mode = 0;
pred_buffer_size = 0;
seed_per_iteration = 0;
seed = 0;
save_base64 = 0;
}
~BoostLearner(void) {
virtual ~BoostLearner(void) {
if (obj_ != NULL) delete obj_;
if (gbm_ != NULL) delete gbm_;
}
@ -44,11 +52,9 @@ class BoostLearner {
* \param mats array of pointers to matrix whose prediction result need to be cached
*/
inline void SetCacheData(const std::vector<DMatrix*>& mats) {
// estimate feature bound
unsigned num_feature = 0;
utils::Assert(cache_.size() == 0, "can only call cache data once");
// assign buffer index
size_t buffer_size = 0;
utils::Assert(cache_.size() == 0, "can only call cache data once");
for (size_t i = 0; i < mats.size(); ++i) {
bool dupilicate = false;
for (size_t j = 0; j < i; ++j) {
@ -59,19 +65,12 @@ class BoostLearner {
mats[i]->cache_learner_ptr_ = this;
cache_.push_back(CacheEntry(mats[i], buffer_size, mats[i]->info.num_row()));
buffer_size += mats[i]->info.num_row();
num_feature = std::max(num_feature, static_cast<unsigned>(mats[i]->info.num_col()));
}
char str_temp[25];
if (num_feature > mparam.num_feature) {
utils::SPrintf(str_temp, sizeof(str_temp), "%u", num_feature);
this->SetParam("bst:num_feature", str_temp);
}
utils::SPrintf(str_temp, sizeof(str_temp), "%lu",
static_cast<unsigned long>(buffer_size));
utils::SPrintf(str_temp, sizeof(str_temp), "%lu",
static_cast<unsigned long>(buffer_size));
this->SetParam("num_pbuffer", str_temp);
if (!silent) {
utils::Printf("buffer_size=%ld\n", static_cast<long>(buffer_size));
}
this->pred_buffer_size = buffer_size;
}
/*!
* \brief set parameters from outside
@ -86,9 +85,29 @@ class BoostLearner {
this->SetParam(n.c_str(), val);
}
if (!strcmp(name, "silent")) silent = atoi(val);
if (!strcmp(name, "prob_buffer_row")) prob_buffer_row = static_cast<float>(atof(val));
if (!strcmp(name, "dsplit")) {
if (!strcmp(val, "col")) {
this->SetParam("updater", "distcol");
distributed_mode = 1;
} else if (!strcmp(val, "row")) {
this->SetParam("updater", "grow_histmaker,prune");
distributed_mode = 2;
} else {
utils::Error("%s is invalid value for dsplit, should be row or col", val);
}
}
if (!strcmp(name, "prob_buffer_row")) {
prob_buffer_row = static_cast<float>(atof(val));
utils::Check(distributed_mode == 0,
"prob_buffer_row can only be used in single node mode so far");
this->SetParam("updater", "grow_colmaker,refresh,prune");
}
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
if (!strcmp("seed", name)) random::Seed(atoi(val));
if (!strcmp("seed", name)) {
this->seed = seed; random::Seed(atoi(val));
}
if (!strcmp("seed_per_iter", name)) seed_per_iteration = atoi(val);
if (!strcmp("save_base64", name)) save_base64 = atoi(val);
if (!strcmp(name, "num_class")) this->SetParam("num_output_group", val);
if (!strcmp(name, "nthread")) {
omp_set_num_threads(atoi(val));
@ -104,10 +123,29 @@ class BoostLearner {
cfg_.push_back(std::make_pair(std::string(name), std::string(val)));
}
}
// this is an internal function
// initialize the trainer, called at InitModel and LoadModel
inline void InitTrainer(bool calc_num_feature = true) {
if (calc_num_feature) {
// estimate feature bound
unsigned num_feature = 0;
for (size_t i = 0; i < cache_.size(); ++i) {
num_feature = std::max(num_feature,
static_cast<unsigned>(cache_[i].mat_->info.num_col()));
}
// run allreduce on num_feature to find the maximum value
rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
if (num_feature > mparam.num_feature) mparam.num_feature = num_feature;
}
char str_temp[25];
utils::SPrintf(str_temp, sizeof(str_temp), "%d", mparam.num_feature);
this->SetParam("bst:num_feature", str_temp);
}
/*!
* \brief initialize the model
*/
inline void InitModel(void) {
this->InitTrainer();
// initialize model
this->InitObjGBM();
// reset the base score
@ -118,8 +156,10 @@ class BoostLearner {
/*!
* \brief load model from stream
* \param fi input stream
* \param with_pbuffer whether to load with predict buffer
* \param calc_num_feature whether call InitTrainer with calc_num_feature
*/
inline void LoadModel(utils::IStream &fi) {
inline void LoadModel(utils::IStream &fi, bool with_pbuffer = true, bool calc_num_feature = true) {
utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
"BoostLearner: wrong model format");
utils::Check(fi.Read(&name_obj_), "BoostLearner: wrong model format");
@ -127,32 +167,90 @@ class BoostLearner {
// delete existing gbm if any
if (obj_ != NULL) delete obj_;
if (gbm_ != NULL) delete gbm_;
this->InitTrainer(calc_num_feature);
this->InitObjGBM();
gbm_->LoadModel(fi);
gbm_->LoadModel(fi, with_pbuffer);
if (!with_pbuffer || distributed_mode == 2) {
gbm_->ResetPredBuffer(pred_buffer_size);
}
}
// rabit load model from rabit checkpoint
virtual void Load(rabit::IStream &fi) {
RabitStreamAdapter fs(fi);
// for row split, we should not keep pbuffer
this->LoadModel(fs, distributed_mode != 2, false);
}
// rabit save model to rabit checkpoint
virtual void Save(rabit::IStream &fo) const {
RabitStreamAdapter fs(fo);
// for row split, we should not keep pbuffer
this->SaveModel(fs, distributed_mode != 2);
}
/*!
* \brief load model from file
* \param fname file name
*/
inline void LoadModel(const char *fname) {
utils::FileStream fi(utils::FopenCheck(fname, "rb"));
FILE *fp = utils::FopenCheck(fname, "rb");
std::string header; header.resize(4);
utils::FileStream fi(fp);
// check header for different binary encode
// can be base64 or binary
if (fi.Read(&header[0], 4) != 0) {
// base64 format
if (header == "bs64") {
utils::Base64InStream bsin(fp);
bsin.InitPosition();
this->LoadModel(bsin);
fclose(fp);
return;
}
if (header == "binf") {
this->LoadModel(fi);
fclose(fp);
return;
}
}
fi.Seek(0);
this->LoadModel(fi);
fi.Close();
fclose(fp);
}
inline void SaveModel(utils::IStream &fo) const {
inline void SaveModel(utils::IStream &fo, bool with_pbuffer = true) const {
fo.Write(&mparam, sizeof(ModelParam));
fo.Write(name_obj_);
fo.Write(name_gbm_);
gbm_->SaveModel(fo);
gbm_->SaveModel(fo, with_pbuffer);
}
/*!
* \brief save model into file
* \param fname file name
*/
inline void SaveModel(const char *fname) const {
utils::FileStream fo(utils::FopenCheck(fname, "wb"));
this->SaveModel(fo);
fo.Close();
FILE *fp;
bool use_stdout = false;;
#ifndef XGBOOST_STRICT_CXX98_
if (!strcmp(fname, "stdout")) {
fp = stdout;
use_stdout = true;
} else
#endif
{
fp = utils::FopenCheck(fname, "wb");
}
utils::FileStream fo(fp);
std::string header;
if (save_base64 != 0|| use_stdout) {
fo.Write("bs64\t", 5);
utils::Base64OutStream bout(fp);
this->SaveModel(bout);
bout.Finish('\n');
} else {
fo.Write("binf", 4);
this->SaveModel(fo);
}
if (!use_stdout) {
fclose(fp);
}
}
/*!
* \brief check if data matrix is ready to be used by training,
@ -160,7 +258,10 @@ class BoostLearner {
* \param p_train pointer to the matrix used by training
*/
inline void CheckInit(DMatrix *p_train) {
p_train->fmat()->InitColAccess(prob_buffer_row);
int ncol = static_cast<int>(p_train->info.info.num_col);
std::vector<bool> enabled(ncol, true);
// initialize column access
p_train->fmat()->InitColAccess(enabled, prob_buffer_row);
}
/*!
* \brief update the model for one iteration
@ -168,9 +269,18 @@ class BoostLearner {
* \param p_train pointer to the data matrix
*/
inline void UpdateOneIter(int iter, const DMatrix &train) {
if (seed_per_iteration || rabit::IsDistributed()) {
random::Seed(this->seed * kRandSeedMagic);
}
this->PredictRaw(train, &preds_);
obj_->GetGradient(preds_, train.info, iter, &gpair_);
gbm_->DoBoost(train.fmat(), train.info.info, &gpair_);
gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_);
}
/*!
* \brief whether model allow lazy checkpoint
*/
inline bool AllowLazyCheckPoint(void) const {
return gbm_->AllowLazyCheckPoint();
}
/*!
* \brief evaluate the model for specific iteration
@ -189,7 +299,7 @@ class BoostLearner {
for (size_t i = 0; i < evals.size(); ++i) {
this->PredictRaw(*evals[i], &preds_);
obj_->EvalTransform(&preds_);
res += evaluator_.Eval(evname[i].c_str(), preds_, evals[i]->info);
res += evaluator_.Eval(evname[i].c_str(), preds_, evals[i]->info, distributed_mode == 2);
}
return res;
}
@ -217,10 +327,41 @@ class BoostLearner {
* predictor, when it equals 0, this means we are using all the trees
*/
inline void Predict(const DMatrix &data,
bool output_margin,
std::vector<float> *out_preds,
unsigned ntree_limit = 0,
bool pred_leaf = false
) const {
if (pred_leaf) {
gbm_->PredictLeaf(data.fmat(), data.info.info, out_preds, ntree_limit);
} else {
this->PredictRaw(data, out_preds, ntree_limit);
if (!output_margin) {
obj_->PredTransform(out_preds);
}
}
}
/*!
* \brief online prediction funciton, predict score for one instance at a time
* NOTE: use the batch prediction interface if possible, batch prediction is usually
* more efficient than online prediction
* This function is NOT threadsafe, make sure you only call from one thread
*
* \param inst the instance you want to predict
* \param output_margin whether to only predict margin value instead of transformed prediction
* \param out_preds output vector to hold the predictions
* \param ntree_limit limit the number of trees used in prediction
* \param root_index the root index
* \sa Predict
*/
inline void Predict(const SparseBatch::Inst &inst,
bool output_margin,
std::vector<float> *out_preds,
unsigned ntree_limit = 0) const {
this->PredictRaw(data, out_preds, ntree_limit);
gbm_->Predict(inst, out_preds, ntree_limit);
if (out_preds->size() == 1) {
(*out_preds)[0] += mparam.base_score;
}
if (!output_margin) {
obj_->PredTransform(out_preds);
}
@ -240,6 +381,7 @@ class BoostLearner {
utils::Assert(gbm_ == NULL, "GBM and obj should be NULL");
obj_ = CreateObjFunction(name_obj_.c_str());
gbm_ = gbm::CreateGradBooster(name_gbm_.c_str());
for (size_t i = 0; i < cfg_.size(); ++i) {
obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
@ -287,7 +429,7 @@ class BoostLearner {
/* \brief number of class, if it is multi-class classification */
int num_class;
/*! \brief reserved field */
int reserved[32];
int reserved[31];
/*! \brief constructor */
ModelParam(void) {
base_score = 0.5f;
@ -308,14 +450,26 @@ class BoostLearner {
}
};
// data fields
// stored random seed
int seed;
// whether seed the PRNG each iteration
// this is important for restart from existing iterations
// default set to no, but will auto switch on in distributed mode
int seed_per_iteration;
// save model in base64 encoding
int save_base64;
// silent during training
int silent;
// distributed learning mode, if any, 0:none, 1:col, 2:row
int distributed_mode;
// cached size of predict buffer
size_t pred_buffer_size;
// maximum buffred row value
float prob_buffer_row;
// evaluation set
EvalSet evaluator_;
// model parameter
ModelParam mparam;
ModelParam mparam;
// gbm model that back everything
gbm::IGradBooster *gbm_;
// name of gbm model used for training
@ -331,7 +485,9 @@ class BoostLearner {
// gradient pairs
std::vector<bst_gpair> gpair_;
private:
protected:
// magic number to transform random seed
const static int kRandSeedMagic = 127;
// cache entry object that helps handle feature caching
struct CacheEntry {
const DMatrix *mat_;
@ -354,6 +510,23 @@ class BoostLearner {
// data structure field
/*! \brief the entries indicates that we have internal prediction cache */
std::vector<CacheEntry> cache_;
private:
// adapt rabit stream to utils stream
struct RabitStreamAdapter : public utils::IStream {
// rabit stream
rabit::IStream &fs;
// constructr
RabitStreamAdapter(rabit::IStream &fs) : fs(fs) {}
// destructor
virtual ~RabitStreamAdapter(void){}
virtual size_t Read(void *ptr, size_t size) {
return fs.Read(ptr, size);
}
virtual void Write(const void *ptr, size_t size) {
fs.Write(ptr, size);
}
};
};
} // namespace learner
} // namespace xgboost

View File

@ -41,6 +41,25 @@ struct LossType {
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief check if label range is valid
*/
inline bool CheckLabel(float x) const {
if (loss_type != kLinearSquare) {
return x >= 0.0f && x <= 1.0f;
}
return true;
}
/*!
* \brief error message displayed when check label fail
*/
inline const char * CheckLabelErrorMsg(void) const {
if (loss_type != kLinearSquare) {
return "label must be in [0,1] for logistic regression";
} else {
return "";
}
}
/*!
* \brief calculate first order gradient of loss, given transformed prediction
* \param predt transformed prediction
@ -115,6 +134,8 @@ class RegLossObj : public IObjFunction{
"labels are not correctly provided");
std::vector<bst_gpair> &gpair = *out_gpair;
gpair.resize(preds.size());
// check if label in range
bool label_correct = true;
// start calculating gradient
const unsigned nstep = static_cast<unsigned>(info.labels.size());
const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
@ -124,9 +145,11 @@ class RegLossObj : public IObjFunction{
float p = loss.PredTransform(preds[i]);
float w = info.GetWeight(j);
if (info.labels[j] == 1.0f) w *= scale_pos_weight;
if (!loss.CheckLabel(info.labels[j])) label_correct = false;
gpair[i] = bst_gpair(loss.FirstOrderGradient(p, info.labels[j]) * w,
loss.SecondOrderGradient(p, info.labels[j]) * w);
}
utils::Check(label_correct, loss.CheckLabelErrorMsg());
}
virtual const char* DefaultEvalMetric(void) const {
return loss.DefaultEvalMetric();
@ -183,7 +206,8 @@ class SoftmaxMultiClassObj : public IObjFunction {
Softmax(&rec);
const unsigned j = i % nstep;
int label = static_cast<int>(info.labels[j]);
utils::Check(label < nclass, "SoftmaxMultiClassObj: label exceed num_class");
utils::Check(label >= 0 && label < nclass,
"SoftmaxMultiClassObj: label must be in [0, num_class)");
const float wt = info.GetWeight(j);
for (int k = 0; k < nclass; ++k) {
float p = rec[k];
@ -325,9 +349,9 @@ class LambdaRankObj : public IObjFunction {
float h = loss.SecondOrderGradient(p, 1.0f);
// accumulate gradient and hessian in both pid, and nid
gpair[pos.rindex].grad += g * w;
gpair[pos.rindex].hess += 2.0f * h;
gpair[pos.rindex].hess += 2.0f * w * h;
gpair[neg.rindex].grad -= g * w;
gpair[neg.rindex].hess += 2.0f * h;
gpair[neg.rindex].hess += 2.0f * w * h;
}
}
}

12
src/sync/sync.h Normal file
View File

@ -0,0 +1,12 @@
#ifndef XGBOOST_SYNC_H_
#define XGBOOST_SYNC_H_
/*!
* \file sync.h
* \brief the synchronization module of rabit
* redirects to subtree rabit header
* \author Tianqi Chen
*/
#include "../../subtree/rabit/include/rabit.h"
#endif // XGBOOST_SYNC_H_

View File

@ -68,8 +68,9 @@ class TreeModel {
}
};
/*! \brief tree node */
class Node{
class Node {
public:
Node(void) : sindex_(0) {}
/*! \brief index of left child */
inline int cleft(void) const {
return this->cleft_;
@ -110,6 +111,10 @@ class TreeModel {
inline bool is_left_child(void) const {
return (parent_ & (1U << 31)) != 0;
}
/*! \brief whether this node is deleted */
inline bool is_deleted(void) const {
return sindex_ == std::numeric_limits<unsigned>::max();
}
/*! \brief whether current node is root */
inline bool is_root(void) const {
return parent_ == -1;
@ -144,7 +149,11 @@ class TreeModel {
this->cleft_ = -1;
this->cright_ = right;
}
/*! \brief mark that this node is deleted */
inline void mark_delete(void) {
this->sindex_ = std::numeric_limits<unsigned>::max();
}
private:
friend class TreeModel<TSplitCond, TNodeStat>;
/*!
@ -197,11 +206,11 @@ class TreeModel {
leaf_vector.resize(param.num_nodes * param.size_leaf_vector);
return nd;
}
// delete a tree node
// delete a tree node, keep the parent field to allow trace back
inline void DeleteNode(int nid) {
utils::Assert(nid >= param.num_roots, "can not delete root");
deleted_nodes.push_back(nid);
nodes[nid].set_parent(-1);
nodes[nid].mark_delete();
++param.num_deleted;
}
@ -296,11 +305,12 @@ class TreeModel {
}
// chg deleted nodes
deleted_nodes.resize(0);
for (int i = param.num_roots; i < param.num_nodes; i ++) {
if (nodes[i].is_root()) deleted_nodes.push_back(i);
for (int i = param.num_roots; i < param.num_nodes; ++i) {
if (nodes[i].is_deleted()) deleted_nodes.push_back(i);
}
utils::Assert(static_cast<int>(deleted_nodes.size()) == param.num_deleted,
"number of deleted nodes do not match");
"number of deleted nodes do not match, num_deleted=%d, dnsize=%lu, num_nodes=%d",
param.num_deleted, deleted_nodes.size(), param.num_nodes);
}
/*!
* \brief save model to stream

View File

@ -36,8 +36,14 @@ struct TrainParam{
float colsample_bytree;
// speed optimization for dense column
float opt_dense_col;
// accuracy of sketch
float sketch_eps;
// accuracy of sketch
float sketch_ratio;
// leaf vector size
int size_leaf_vector;
int size_leaf_vector;
// option for parallelization
int parallel_option;
// number of threads to be used for tree construction,
// if OpenMP is enabled, if equals 0, use system default
int nthread;
@ -55,6 +61,9 @@ struct TrainParam{
opt_dense_col = 1.0f;
nthread = 0;
size_leaf_vector = 0;
parallel_option = 2;
sketch_eps = 0.1f;
sketch_ratio = 2.0f;
}
/*!
* \brief set parameters from outside
@ -76,10 +85,13 @@ struct TrainParam{
if (!strcmp(name, "subsample")) subsample = static_cast<float>(atof(val));
if (!strcmp(name, "colsample_bylevel")) colsample_bylevel = static_cast<float>(atof(val));
if (!strcmp(name, "colsample_bytree")) colsample_bytree = static_cast<float>(atof(val));
if (!strcmp(name, "sketch_eps")) sketch_eps = static_cast<float>(atof(val));
if (!strcmp(name, "sketch_ratio")) sketch_ratio = static_cast<float>(atof(val));
if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val));
if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val);
if (!strcmp(name, "max_depth")) max_depth = atoi(val);
if (!strcmp(name, "nthread")) nthread = atoi(val);
if (!strcmp(name, "parallel_option")) parallel_option = atoi(val);
if (!strcmp(name, "default_direction")) {
if (!strcmp(val, "learn")) default_direction = 0;
if (!strcmp(val, "left")) default_direction = 1;
@ -132,6 +144,12 @@ struct TrainParam{
inline bool cannot_split(double sum_hess, int depth) const {
return sum_hess < this->min_child_weight * 2.0;
}
/*! \brief maximum sketch size */
inline unsigned max_sketch_size(void) const {
unsigned ret = static_cast<unsigned>(sketch_ratio / sketch_eps);
utils::Check(ret > 0, "sketch_ratio/sketch_eps must be bigger than 1");
return ret;
}
protected:
// functions for L1 cost
@ -186,6 +204,10 @@ struct GradStats {
inline void Add(const GradStats &b) {
this->Add(b.sum_grad, b.sum_hess);
}
/*! \brief same as add, reduce is used in All Reduce */
inline static void Reduce(GradStats &a, const GradStats &b) {
a.Add(b);
}
/*! \brief set current value to a - b */
inline void SetSubstract(const GradStats &a, const GradStats &b) {
sum_grad = a.sum_grad - b.sum_grad;
@ -262,6 +284,10 @@ struct CVGradStats : public GradStats {
valid[i].Add(b.valid[i]);
}
}
/*! \brief same as add, reduce is used in All Reduce */
inline static void Reduce(CVGradStats &a, const CVGradStats &b) {
a.Add(b);
}
/*! \brief set current value to a - b */
inline void SetSubstract(const CVGradStats &a, const CVGradStats &b) {
GradStats::SetSubstract(a, b);
@ -341,6 +367,10 @@ struct SplitEntry{
return false;
}
}
/*! \brief same as update, used by AllReduce*/
inline static void Reduce(SplitEntry &dst, const SplitEntry &src) {
dst.Update(src);
}
/*!\return feature index to split on */
inline unsigned split_index(void) const {
return sindex & ((1U << 31) - 1U);

View File

@ -1,10 +1,16 @@
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#define NOMINMAX
#include <cstring>
#include "./updater.h"
#include "./updater_prune-inl.hpp"
#include "./updater_refresh-inl.hpp"
#include "./updater_colmaker-inl.hpp"
#ifndef XGBOOST_STRICT_CXX98_
#include "./updater_sync-inl.hpp"
#include "./updater_distcol-inl.hpp"
#include "./updater_histmaker-inl.hpp"
#endif
namespace xgboost {
namespace tree {
@ -13,6 +19,11 @@ IUpdater* CreateUpdater(const char *name) {
if (!strcmp(name, "prune")) return new TreePruner();
if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
#ifndef XGBOOST_STRICT_CXX98_
if (!strcmp(name, "sync")) return new TreeSyncher();
if (!strcmp(name, "grow_histmaker")) return new CQHistMaker<GradStats>();
if (!strcmp(name, "distcol")) return new DistColMaker<GradStats>();
#endif
utils::Error("unknown updater:%s", name);
return NULL;
}

View File

@ -37,6 +37,16 @@ class IUpdater {
IFMatrix *p_fmat,
const BoosterInfo &info,
const std::vector<RegTree*> &trees) = 0;
/*!
* \brief this is simply a function for optimizing performance
* this function asks the updater to return the leaf position of each instance in the p_fmat,
* if it is cached in the updater, if it is not available, return NULL
* \return array of leaf position of each instance in the last updated tree
*/
virtual const int* GetLeafPosition(void) const {
return NULL;
}
// destructor
virtual ~IUpdater(void) {}
};

View File

@ -0,0 +1,409 @@
#ifndef XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
#define XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
/*!
* \file updater_basemaker-inl.hpp
* \brief implement a common tree constructor
* \author Tianqi Chen
*/
#include <vector>
#include <algorithm>
#include <limits>
#include "../sync/sync.h"
#include "../utils/random.h"
#include "../utils/quantile.h"
namespace xgboost {
namespace tree {
/*!
* \brief base tree maker class that defines common operation
* needed in tree making
*/
class BaseMaker: public IUpdater {
public:
// destructor
virtual ~BaseMaker(void) {}
// set training parameter
virtual void SetParam(const char *name, const char *val) {
param.SetParam(name, val);
}
protected:
// helper to collect and query feature meta information
struct FMetaHelper {
public:
/*! \brief find type of each feature, use column format */
inline void InitByCol(IFMatrix *p_fmat,
const RegTree &tree) {
fminmax.resize(tree.param.num_feature * 2);
std::fill(fminmax.begin(), fminmax.end(),
-std::numeric_limits<bst_float>::max());
// start accumulating statistics
utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
iter->BeforeFirst();
while (iter->Next()) {
const ColBatch &batch = iter->Value();
for (bst_uint i = 0; i < batch.size; ++i) {
const bst_uint fid = batch.col_index[i];
const ColBatch::Inst &c = batch[i];
if (c.length != 0) {
fminmax[fid * 2 + 0] = std::max(-c[0].fvalue, fminmax[fid * 2 + 0]);
fminmax[fid * 2 + 1] = std::max(c[c.length - 1].fvalue, fminmax[fid * 2 + 1]);
}
}
}
rabit::Allreduce<rabit::op::Max>(BeginPtr(fminmax), fminmax.size());
}
// get feature type, 0:empty 1:binary 2:real
inline int Type(bst_uint fid) const {
utils::Assert(fid * 2 + 1 < fminmax.size(),
"FeatHelper fid exceed query bound ");
bst_float a = fminmax[fid * 2];
bst_float b = fminmax[fid * 2 + 1];
if (a == -std::numeric_limits<bst_float>::max()) return 0;
if (-a == b) return 1;
else return 2;
}
inline bst_float MaxValue(bst_uint fid) const {
return fminmax[fid *2 + 1];
}
inline void SampleCol(float p, std::vector<bst_uint> *p_findex) const {
std::vector<bst_uint> &findex = *p_findex;
findex.clear();
for (size_t i = 0; i < fminmax.size(); i += 2) {
const bst_uint fid = static_cast<bst_uint>(i / 2);
if (this->Type(fid) != 0) findex.push_back(fid);
}
unsigned n = static_cast<unsigned>(p * findex.size());
random::Shuffle(findex);
findex.resize(n);
// sync the findex if it is subsample
std::string s_cache;
utils::MemoryBufferStream fc(&s_cache);
utils::IStream &fs = fc;
if (rabit::GetRank() == 0) {
fs.Write(findex);
}
rabit::Broadcast(&s_cache, 0);
fs.Read(&findex);
}
private:
std::vector<bst_float> fminmax;
};
// ------static helper functions ------
// helper function to get to next level of the tree
/*! \brief this is helper function for row based data*/
inline static int NextLevel(const RowBatch::Inst &inst, const RegTree &tree, int nid) {
const RegTree::Node &n = tree[nid];
bst_uint findex = n.split_index();
for (unsigned i = 0; i < inst.length; ++i) {
if (findex == inst[i].index) {
if (inst[i].fvalue < n.split_cond()) {
return n.cleft();
} else {
return n.cright();
}
}
}
return n.cdefault();
}
/*! \brief get number of omp thread in current context */
inline static int get_nthread(void) {
int nthread;
#pragma omp parallel
{
nthread = omp_get_num_threads();
}
return nthread;
}
// ------class member helpers---------
/*! \brief initialize temp data structure */
inline void InitData(const std::vector<bst_gpair> &gpair,
const IFMatrix &fmat,
const std::vector<unsigned> &root_index,
const RegTree &tree) {
utils::Assert(tree.param.num_nodes == tree.param.num_roots,
"TreeMaker: can only grow new tree");
{// setup position
position.resize(gpair.size());
if (root_index.size() == 0) {
std::fill(position.begin(), position.end(), 0);
} else {
for (size_t i = 0; i < position.size(); ++i) {
position[i] = root_index[i];
utils::Assert(root_index[i] < (unsigned)tree.param.num_roots,
"root index exceed setting");
}
}
// mark delete for the deleted datas
for (size_t i = 0; i < position.size(); ++i) {
if (gpair[i].hess < 0.0f) position[i] = ~position[i];
}
// mark subsample
if (param.subsample < 1.0f) {
for (size_t i = 0; i < position.size(); ++i) {
if (gpair[i].hess < 0.0f) continue;
if (random::SampleBinary(param.subsample) == 0) position[i] = ~position[i];
}
}
}
{// expand query
qexpand.reserve(256); qexpand.clear();
for (int i = 0; i < tree.param.num_roots; ++i) {
qexpand.push_back(i);
}
this->UpdateNode2WorkIndex(tree);
}
}
/*! \brief update queue expand add in new leaves */
inline void UpdateQueueExpand(const RegTree &tree) {
std::vector<int> newnodes;
for (size_t i = 0; i < qexpand.size(); ++i) {
const int nid = qexpand[i];
if (!tree[nid].is_leaf()) {
newnodes.push_back(tree[nid].cleft());
newnodes.push_back(tree[nid].cright());
}
}
// use new nodes for qexpand
qexpand = newnodes;
this->UpdateNode2WorkIndex(tree);
}
// return decoded position
inline int DecodePosition(bst_uint ridx) const{
const int pid = position[ridx];
return pid < 0 ? ~pid : pid;
}
// encode the encoded position value for ridx
inline void SetEncodePosition(bst_uint ridx, int nid) {
if (position[ridx] < 0) {
position[ridx] = ~nid;
} else {
position[ridx] = nid;
}
}
/*!
* \brief this is helper function uses column based data structure,
* reset the positions to the lastest one
* \param nodes the set of nodes that contains the split to be used
* \param p_fmat feature matrix needed for tree construction
* \param tree the regression tree structure
*/
inline void ResetPositionCol(const std::vector<int> &nodes, IFMatrix *p_fmat, const RegTree &tree) {
// set the positions in the nondefault
this->SetNonDefaultPositionCol(nodes, p_fmat, tree);
// set rest of instances to default position
const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
// set default direct nodes to default
// for leaf nodes that are not fresh, mark then to ~nid,
// so that they are ignored in future statistics collection
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < ndata; ++i) {
const bst_uint ridx = rowset[i];
const int nid = this->DecodePosition(ridx);
if (tree[nid].is_leaf()) {
// mark finish when it is not a fresh leaf
if (tree[nid].cright() == -1) {
position[ridx] = ~nid;
}
} else {
// push to default branch
if (tree[nid].default_left()) {
this->SetEncodePosition(ridx, tree[nid].cleft());
} else {
this->SetEncodePosition(ridx, tree[nid].cright());
}
}
}
}
/*!
* \brief this is helper function uses column based data structure,
* update all positions into nondefault branch, if any, ignore the default branch
* \param nodes the set of nodes that contains the split to be used
* \param p_fmat feature matrix needed for tree construction
* \param tree the regression tree structure
*/
virtual void SetNonDefaultPositionCol(const std::vector<int> &nodes,
IFMatrix *p_fmat, const RegTree &tree) {
// step 1, classify the non-default data into right places
std::vector<unsigned> fsplits;
for (size_t i = 0; i < nodes.size(); ++i) {
const int nid = nodes[i];
if (!tree[nid].is_leaf()) {
fsplits.push_back(tree[nid].split_index());
}
}
std::sort(fsplits.begin(), fsplits.end());
fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
while (iter->Next()) {
const ColBatch &batch = iter->Value();
for (size_t i = 0; i < batch.size; ++i) {
ColBatch::Inst col = batch[i];
const bst_uint fid = batch.col_index[i];
const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
#pragma omp parallel for schedule(static)
for (bst_omp_uint j = 0; j < ndata; ++j) {
const bst_uint ridx = col[j].index;
const float fvalue = col[j].fvalue;
const int nid = this->DecodePosition(ridx);
// go back to parent, correct those who are not default
if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
if(fvalue < tree[nid].split_cond()) {
this->SetEncodePosition(ridx, tree[nid].cleft());
} else {
this->SetEncodePosition(ridx, tree[nid].cright());
}
}
}
}
}
}
/*! \brief helper function to get statistics from a tree */
template<typename TStats>
inline void GetNodeStats(const std::vector<bst_gpair> &gpair,
const IFMatrix &fmat,
const RegTree &tree,
const BoosterInfo &info,
std::vector< std::vector<TStats> > *p_thread_temp,
std::vector<TStats> *p_node_stats) {
std::vector< std::vector<TStats> > &thread_temp = *p_thread_temp;
thread_temp.resize(this->get_nthread());
p_node_stats->resize(tree.param.num_nodes);
#pragma omp parallel
{
const int tid = omp_get_thread_num();
thread_temp[tid].resize(tree.param.num_nodes, TStats(param));
for (size_t i = 0; i < qexpand.size(); ++i) {
const unsigned nid = qexpand[i];
thread_temp[tid][nid].Clear();
}
}
const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
// setup position
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < ndata; ++i) {
const bst_uint ridx = rowset[i];
const int nid = position[ridx];
const int tid = omp_get_thread_num();
if (nid >= 0) {
thread_temp[tid][nid].Add(gpair, info, ridx);
}
}
// sum the per thread statistics together
for (size_t j = 0; j < qexpand.size(); ++j) {
const int nid = qexpand[j];
TStats &s = (*p_node_stats)[nid];
s.Clear();
for (size_t tid = 0; tid < thread_temp.size(); ++tid) {
s.Add(thread_temp[tid][nid]);
}
}
}
/*! \brief common helper data structure to build sketch*/
struct SketchEntry {
/*! \brief total sum of amount to be met */
bst_float sum_total;
/*! \brief statistics used in the sketch */
bst_float rmin, wmin;
/*! \brief last seen feature value */
bst_float last_fvalue;
/*! \brief current size of sketch */
bst_float next_goal;
// pointer to the sketch to put things in
utils::WXQuantileSketch<bst_float, bst_float> *sketch;
// initialize the space
inline void Init(unsigned max_size) {
next_goal = -1.0f;
rmin = wmin = 0.0f;
sketch->temp.Reserve(max_size + 1);
sketch->temp.size = 0;
}
/*!
* \brief push a new element to sketch
* \param fvalue feature value, comes in sorted ascending order
* \param w weight
* \param max_size
*/
inline void Push(bst_float fvalue, bst_float w, unsigned max_size) {
if (next_goal == -1.0f) {
next_goal = 0.0f;
last_fvalue = fvalue;
wmin = w;
return;
}
if (last_fvalue != fvalue) {
bst_float rmax = rmin + wmin;
if (rmax >= next_goal) {
if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
// push to sketch
sketch->temp.data[sketch->temp.size] =
utils::WXQuantileSketch<bst_float, bst_float>::
Entry(rmin, rmax, wmin, last_fvalue);
utils::Assert(sketch->temp.size < max_size,
"invalid maximum size max_size=%u, stemp.size=%lu\n",
max_size, sketch->temp.size);
++sketch->temp.size;
}
if (sketch->temp.size == max_size) {
next_goal = sum_total * 2.0f + 1e-5f;
} else{
next_goal = static_cast<bst_float>(sketch->temp.size * sum_total / max_size);
}
}
rmin = rmax;
wmin = w;
last_fvalue = fvalue;
} else {
wmin += w;
}
}
/*! \brief push final unfinished value to the sketch */
inline void Finalize(unsigned max_size) {
bst_float rmax = rmin + wmin;
if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
utils::Assert(sketch->temp.size <= max_size,
"Finalize: invalid maximum size, max_size=%u, stemp.size=%lu",
sketch->temp.size, max_size );
// push to sketch
sketch->temp.data[sketch->temp.size] =
utils::WXQuantileSketch<bst_float, bst_float>::
Entry(rmin, rmax, wmin, last_fvalue);
++sketch->temp.size;
}
sketch->PushTemp();
}
};
/*! \brief training parameter of tree grower */
TrainParam param;
/*! \brief queue of nodes to be expanded */
std::vector<int> qexpand;
/*!
* \brief map active node to is working index offset in qexpand,
* can be -1, which means the node is node actively expanding
*/
std::vector<int> node2workindex;
/*!
* \brief position of each instance in the tree
* can be negative, which means this position is no longer expanding
* see also Decode/EncodePosition
*/
std::vector<int> position;
private:
inline void UpdateNode2WorkIndex(const RegTree &tree) {
// update the node2workindex
std::fill(node2workindex.begin(), node2workindex.end(), -1);
node2workindex.resize(tree.param.num_nodes);
for (size_t i = 0; i < qexpand.size(); ++i) {
node2workindex[qexpand[i]] = static_cast<int>(i);
}
}
};
} // namespace tree
} // namespace xgboost
#endif // XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_

View File

@ -14,7 +14,7 @@
namespace xgboost {
namespace tree {
/*! \brief pruner that prunes a tree after growing finishs */
/*! \brief colunwise update to construct a tree */
template<typename TStats>
class ColMaker: public IUpdater {
public:
@ -36,24 +36,29 @@ class ColMaker: public IUpdater {
Builder builder(param);
builder.Update(gpair, p_fmat, info, trees[i]);
}
param.learning_rate = lr;
}
private:
protected:
// training parameter
TrainParam param;
// data structure
/*! \brief per thread x per node entry to store tmp data */
struct ThreadEntry {
/*! \brief statistics of data*/
/*! \brief statistics of data */
TStats stats;
/*! \brief extra statistics of data */
TStats stats_extra;
/*! \brief last feature value scanned */
float last_fvalue;
/*! \brief first feature value scanned */
float first_fvalue;
/*! \brief current best solution */
SplitEntry best;
// constructor
explicit ThreadEntry(const TrainParam &param)
: stats(param) {
: stats(param), stats_extra(param) {
}
};
struct NodeEntry {
@ -104,7 +109,7 @@ class ColMaker: public IUpdater {
}
}
private:
protected:
// initialize temp data structure
inline void InitData(const std::vector<bst_gpair> &gpair,
const IFMatrix &fmat,
@ -127,17 +132,17 @@ class ColMaker: public IUpdater {
// mark delete for the deleted datas
for (size_t i = 0; i < rowset.size(); ++i) {
const bst_uint ridx = rowset[i];
if (gpair[ridx].hess < 0.0f) position[ridx] = -1;
if (gpair[ridx].hess < 0.0f) position[ridx] = ~position[ridx];
}
// mark subsample
if (param.subsample < 1.0f) {
for (size_t i = 0; i < rowset.size(); ++i) {
const bst_uint ridx = rowset[i];
if (gpair[ridx].hess < 0.0f) continue;
if (random::SampleBinary(param.subsample) == 0) position[ridx] = -1;
if (random::SampleBinary(param.subsample) == 0) position[ridx] = ~position[ridx];
}
}
}
}
{
// initialize feature index
unsigned ncol = static_cast<unsigned>(fmat.NumCol());
@ -148,7 +153,8 @@ class ColMaker: public IUpdater {
}
unsigned n = static_cast<unsigned>(param.colsample_bytree * feat_index.size());
random::Shuffle(feat_index);
utils::Check(n > 0, "colsample_bytree is too small that no feature can be included");
//utils::Check(n > 0, "colsample_bytree is too small that no feature can be included");
utils::Check(n > 0, "colsample_bytree=%g is too small that no feature can be included", param.colsample_bytree);
feat_index.resize(n);
}
{// setup temp space for each thread
@ -219,7 +225,138 @@ class ColMaker: public IUpdater {
}
// use new nodes for qexpand
qexpand = newnodes;
}
}
// parallel find the best split of current fid
// this function does not support nested functions
inline void ParallelFindSplit(const ColBatch::Inst &col,
bst_uint fid,
const IFMatrix &fmat,
const std::vector<bst_gpair> &gpair,
const BoosterInfo &info) {
bool need_forward = param.need_forward_search(fmat.GetColDensity(fid));
bool need_backward = param.need_backward_search(fmat.GetColDensity(fid));
const std::vector<int> &qexpand = qexpand_;
int nthread;
#pragma omp parallel
{
const int tid = omp_get_thread_num();
std::vector<ThreadEntry> &temp = stemp[tid];
// cleanup temp statistics
for (size_t j = 0; j < qexpand.size(); ++j) {
temp[qexpand[j]].stats.Clear();
}
nthread = omp_get_num_threads();
bst_uint step = (col.length + nthread - 1) / nthread;
bst_uint end = std::min(col.length, step * (tid + 1));
for (bst_uint i = tid * step; i < end; ++i) {
const bst_uint ridx = col[i].index;
const int nid = position[ridx];
if (nid < 0) continue;
const float fvalue = col[i].fvalue;
if (temp[nid].stats.Empty()) {
temp[nid].first_fvalue = fvalue;
}
temp[nid].stats.Add(gpair, info, ridx);
temp[nid].last_fvalue = fvalue;
}
}
// start collecting the partial sum statistics
bst_omp_uint nnode = static_cast<bst_omp_uint>(qexpand.size());
#pragma omp parallel for schedule(static)
for (bst_omp_uint j = 0; j < nnode; ++j) {
const int nid = qexpand[j];
TStats sum(param), tmp(param), c(param);
for (int tid = 0; tid < nthread; ++tid) {
tmp = stemp[tid][nid].stats;
stemp[tid][nid].stats = sum;
sum.Add(tmp);
if (tid != 0) {
std::swap(stemp[tid - 1][nid].last_fvalue, stemp[tid][nid].first_fvalue);
}
}
for (int tid = 0; tid < nthread; ++tid) {
stemp[tid][nid].stats_extra = sum;
ThreadEntry &e = stemp[tid][nid];
float fsplit;
if (tid != 0) {
if(fabsf(stemp[tid - 1][nid].last_fvalue - e.first_fvalue) > rt_2eps) {
fsplit = (stemp[tid - 1][nid].last_fvalue - e.first_fvalue) * 0.5f;
} else {
continue;
}
} else {
fsplit = e.first_fvalue - rt_eps;
}
if (need_forward && tid != 0) {
c.SetSubstract(snode[nid].stats, e.stats);
if (c.sum_hess >= param.min_child_weight && e.stats.sum_hess >= param.min_child_weight) {
bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
e.best.Update(loss_chg, fid, fsplit, false);
}
}
if (need_backward) {
tmp.SetSubstract(sum, e.stats);
c.SetSubstract(snode[nid].stats, tmp);
if (c.sum_hess >= param.min_child_weight && tmp.sum_hess >= param.min_child_weight) {
bst_float loss_chg = static_cast<bst_float>(tmp.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
e.best.Update(loss_chg, fid, fsplit, true);
}
}
}
if (need_backward) {
tmp = sum;
ThreadEntry &e = stemp[nthread-1][nid];
c.SetSubstract(snode[nid].stats, tmp);
if (c.sum_hess >= param.min_child_weight && tmp.sum_hess >= param.min_child_weight) {
bst_float loss_chg = static_cast<bst_float>(tmp.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
e.best.Update(loss_chg, fid, e.last_fvalue + rt_eps, true);
}
}
}
// rescan, generate candidate split
#pragma omp parallel
{
TStats c(param), cright(param);
const int tid = omp_get_thread_num();
std::vector<ThreadEntry> &temp = stemp[tid];
nthread = static_cast<bst_uint>(omp_get_num_threads());
bst_uint step = (col.length + nthread - 1) / nthread;
bst_uint end = std::min(col.length, step * (tid + 1));
for (bst_uint i = tid * step; i < end; ++i) {
const bst_uint ridx = col[i].index;
const int nid = position[ridx];
if (nid < 0) continue;
const float fvalue = col[i].fvalue;
// get the statistics of nid
ThreadEntry &e = temp[nid];
if (e.stats.Empty()) {
e.stats.Add(gpair, info, ridx);
e.first_fvalue = fvalue;
} else {
// forward default right
if (fabsf(fvalue - e.first_fvalue) > rt_2eps){
if (need_forward) {
c.SetSubstract(snode[nid].stats, e.stats);
if (c.sum_hess >= param.min_child_weight && e.stats.sum_hess >= param.min_child_weight) {
bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
e.best.Update(loss_chg, fid, (fvalue + e.first_fvalue) * 0.5f, false);
}
}
if (need_backward) {
cright.SetSubstract(e.stats_extra, e.stats);
c.SetSubstract(snode[nid].stats, cright);
if (c.sum_hess >= param.min_child_weight && cright.sum_hess >= param.min_child_weight) {
bst_float loss_chg = static_cast<bst_float>(cright.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
e.best.Update(loss_chg, fid, (fvalue + e.first_fvalue) * 0.5f, true);
}
}
}
e.stats.Add(gpair, info, ridx);
e.first_fvalue = fvalue;
}
}
}
}
// enumerate the split values of specific feature
inline void EnumerateSplit(const ColBatch::Entry *begin,
const ColBatch::Entry *end,
@ -273,6 +410,42 @@ class ColMaker: public IUpdater {
}
}
}
// update the solution candidate
virtual void UpdateSolution(const ColBatch &batch,
const std::vector<bst_gpair> &gpair,
const IFMatrix &fmat,
const BoosterInfo &info) {
// start enumeration
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#if defined(_OPENMP)
const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
#endif
int poption = param.parallel_option;
if (poption == 2) {
poption = nsize * 2 < nthread ? 1 : 0;
}
if (poption == 0) {
#pragma omp parallel for schedule(dynamic, batch_size)
for (bst_omp_uint i = 0; i < nsize; ++i) {
const bst_uint fid = batch.col_index[i];
const int tid = omp_get_thread_num();
const ColBatch::Inst c = batch[i];
if (param.need_forward_search(fmat.GetColDensity(fid))) {
this->EnumerateSplit(c.data, c.data + c.length, +1,
fid, gpair, info, stemp[tid]);
}
if (param.need_backward_search(fmat.GetColDensity(fid))) {
this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1,
fid, gpair, info, stemp[tid]);
}
}
} else {
for (bst_omp_uint i = 0; i < nsize; ++i) {
this->ParallelFindSplit(batch[i], batch.col_index[i],
fmat, gpair, info);
}
}
}
// find splits at current level, do split per level
inline void FindSplit(int depth,
const std::vector<int> &qexpand,
@ -289,66 +462,76 @@ class ColMaker: public IUpdater {
}
utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(feat_set);
while (iter->Next()) {
const ColBatch &batch = iter->Value();
// start enumeration
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#if defined(_OPENMP)
const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
#endif
#pragma omp parallel for schedule(dynamic, batch_size)
for (bst_omp_uint i = 0; i < nsize; ++i) {
const bst_uint fid = batch.col_index[i];
const int tid = omp_get_thread_num();
const ColBatch::Inst c = batch[i];
if (param.need_forward_search(p_fmat->GetColDensity(fid))) {
this->EnumerateSplit(c.data, c.data + c.length, +1,
fid, gpair, info, stemp[tid]);
this->UpdateSolution(iter->Value(), gpair, *p_fmat, info);
}
// after this each thread's stemp will get the best candidates, aggregate results
this->SyncBestSolution(qexpand);
// get the best result, we can synchronize the solution
for (size_t i = 0; i < qexpand.size(); ++i) {
const int nid = qexpand[i];
NodeEntry &e = snode[nid];
// now we know the solution in snode[nid], set split
if (e.best.loss_chg > rt_eps) {
p_tree->AddChilds(nid);
(*p_tree)[nid].set_split(e.best.split_index(), e.best.split_value, e.best.default_left());
// mark right child as 0, to indicate fresh leaf
(*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0);
(*p_tree)[(*p_tree)[nid].cright()].set_leaf(0.0f, 0);
} else {
(*p_tree)[nid].set_leaf(e.weight * param.learning_rate);
}
}
}
// reset position of each data points after split is created in the tree
inline void ResetPosition(const std::vector<int> &qexpand, IFMatrix *p_fmat, const RegTree &tree) {
// set the positions in the nondefault
this->SetNonDefaultPosition(qexpand, p_fmat, tree);
// set rest of instances to default position
const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
// set default direct nodes to default
// for leaf nodes that are not fresh, mark then to ~nid,
// so that they are ignored in future statistics collection
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < ndata; ++i) {
const bst_uint ridx = rowset[i];
const int nid = this->DecodePosition(ridx);
if (tree[nid].is_leaf()) {
// mark finish when it is not a fresh leaf
if (tree[nid].cright() == -1) {
position[ridx] = ~nid;
}
if (param.need_backward_search(p_fmat->GetColDensity(fid))) {
this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1,
fid, gpair, info, stemp[tid]);
} else {
// push to default branch
if (tree[nid].default_left()) {
this->SetEncodePosition(ridx, tree[nid].cleft());
} else {
this->SetEncodePosition(ridx, tree[nid].cright());
}
}
}
// after this each thread's stemp will get the best candidates, aggregate results
}
// customization part
// synchronize the best solution of each node
virtual void SyncBestSolution(const std::vector<int> &qexpand) {
for (size_t i = 0; i < qexpand.size(); ++i) {
const int nid = qexpand[i];
NodeEntry &e = snode[nid];
for (int tid = 0; tid < this->nthread; ++tid) {
e.best.Update(stemp[tid][nid].best);
}
// now we know the solution in snode[nid], set split
if (e.best.loss_chg > rt_eps) {
p_tree->AddChilds(nid);
(*p_tree)[nid].set_split(e.best.split_index(), e.best.split_value, e.best.default_left());
} else {
(*p_tree)[nid].set_leaf(e.weight * param.learning_rate);
}
}
}
// reset position of each data points after split is created in the tree
inline void ResetPosition(const std::vector<int> &qexpand, IFMatrix *p_fmat, const RegTree &tree) {
const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
// step 1, set default direct nodes to default, and leaf nodes to -1
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < ndata; ++i) {
const bst_uint ridx = rowset[i];
const int nid = position[ridx];
if (nid >= 0) {
if (tree[nid].is_leaf()) {
position[ridx] = -1;
} else {
// push to default branch, correct latter
position[ridx] = tree[nid].default_left() ? tree[nid].cleft(): tree[nid].cright();
}
}
}
// step 2, classify the non-default data into right places
virtual void SetNonDefaultPosition(const std::vector<int> &qexpand,
IFMatrix *p_fmat, const RegTree &tree) {
// step 1, classify the non-default data into right places
std::vector<unsigned> fsplits;
for (size_t i = 0; i < qexpand.size(); ++i) {
const int nid = qexpand[i];
if (!tree[nid].is_leaf()) fsplits.push_back(tree[nid].split_index());
if (!tree[nid].is_leaf()) {
fsplits.push_back(tree[nid].split_index());
}
}
std::sort(fsplits.begin(), fsplits.end());
fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
@ -364,21 +547,33 @@ class ColMaker: public IUpdater {
for (bst_omp_uint j = 0; j < ndata; ++j) {
const bst_uint ridx = col[j].index;
const float fvalue = col[j].fvalue;
int nid = position[ridx];
if (nid == -1) continue;
const int nid = this->DecodePosition(ridx);
// go back to parent, correct those who are not default
nid = tree[nid].parent();
if (tree[nid].split_index() == fid) {
if (fvalue < tree[nid].split_cond()) {
position[ridx] = tree[nid].cleft();
if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
if(fvalue < tree[nid].split_cond()) {
this->SetEncodePosition(ridx, tree[nid].cleft());
} else {
position[ridx] = tree[nid].cright();
this->SetEncodePosition(ridx, tree[nid].cright());
}
}
}
}
}
}
// utils to get/set position, with encoded format
// return decoded position
inline int DecodePosition(bst_uint ridx) const{
const int pid = position[ridx];
return pid < 0 ? ~pid : pid;
}
// encode the encoded position value for ridx
inline void SetEncodePosition(bst_uint ridx, int nid) {
if (position[ridx] < 0) {
position[ridx] = ~nid;
} else {
position[ridx] = nid;
}
}
//--data fields--
const TrainParam &param;
// number of omp thread used during training

View File

@ -0,0 +1,169 @@
#ifndef XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_
#define XGBOOST_TREE_UPDATER_DISTCOL_INL_HPP_
/*!
* \file updater_distcol-inl.hpp
* \brief beta distributed version that takes a sub-column
* and construct a tree
* \author Tianqi Chen
*/
#include "../sync/sync.h"
#include "../utils/bitmap.h"
#include "../utils/io.h"
#include "./updater_colmaker-inl.hpp"
#include "./updater_prune-inl.hpp"
namespace xgboost {
namespace tree {
template<typename TStats>
class DistColMaker : public ColMaker<TStats> {
public:
DistColMaker(void) : builder(param) {}
virtual ~DistColMaker(void) {}
// set training parameter
virtual void SetParam(const char *name, const char *val) {
param.SetParam(name, val);
pruner.SetParam(name, val);
}
virtual void Update(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat,
const BoosterInfo &info,
const std::vector<RegTree*> &trees) {
TStats::CheckInfo(info);
utils::Check(trees.size() == 1, "DistColMaker: only support one tree at a time");
// build the tree
builder.Update(gpair, p_fmat, info, trees[0]);
//// prune the tree, note that pruner will sync the tree
pruner.Update(gpair, p_fmat, info, trees);
// update position after the tree is pruned
builder.UpdatePosition(p_fmat, *trees[0]);
}
virtual const int* GetLeafPosition(void) const {
return builder.GetLeafPosition();
}
private:
struct Builder : public ColMaker<TStats>::Builder {
public:
Builder(const TrainParam &param)
: ColMaker<TStats>::Builder(param) {
}
inline void UpdatePosition(IFMatrix *p_fmat, const RegTree &tree) {
const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < ndata; ++i) {
const bst_uint ridx = rowset[i];
int nid = this->DecodePosition(ridx);
while (tree[nid].is_deleted()) {
nid = tree[nid].parent();
utils::Assert(nid >=0, "distributed learning error");
}
this->position[ridx] = nid;
}
}
virtual const int* GetLeafPosition(void) const {
return BeginPtr(this->position);
}
protected:
virtual void SetNonDefaultPosition(const std::vector<int> &qexpand,
IFMatrix *p_fmat, const RegTree &tree) {
// step 2, classify the non-default data into right places
std::vector<unsigned> fsplits;
for (size_t i = 0; i < qexpand.size(); ++i) {
const int nid = qexpand[i];
if (!tree[nid].is_leaf()) {
fsplits.push_back(tree[nid].split_index());
}
}
// get the candidate split index
std::sort(fsplits.begin(), fsplits.end());
fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
while (fsplits.size() != 0 && fsplits.back() >= p_fmat->NumCol()) {
fsplits.pop_back();
}
// bitmap is only word concurrent, set to bool first
{
bst_omp_uint ndata = static_cast<bst_omp_uint>(this->position.size());
boolmap.resize(ndata);
#pragma omp parallel for schedule(static)
for (bst_omp_uint j = 0; j < ndata; ++j) {
boolmap[j] = 0;
}
}
utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
while (iter->Next()) {
const ColBatch &batch = iter->Value();
for (size_t i = 0; i < batch.size; ++i) {
ColBatch::Inst col = batch[i];
const bst_uint fid = batch.col_index[i];
const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
#pragma omp parallel for schedule(static)
for (bst_omp_uint j = 0; j < ndata; ++j) {
const bst_uint ridx = col[j].index;
const float fvalue = col[j].fvalue;
const int nid = this->DecodePosition(ridx);
if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
if (fvalue < tree[nid].split_cond()) {
if (!tree[nid].default_left()) boolmap[ridx] = 1;
} else {
if (tree[nid].default_left()) boolmap[ridx] = 1;
}
}
}
}
}
bitmap.InitFromBool(boolmap);
// communicate bitmap
rabit::Allreduce<rabit::op::BitOR>(BeginPtr(bitmap.data), bitmap.data.size());
const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
// get the new position
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < ndata; ++i) {
const bst_uint ridx = rowset[i];
const int nid = this->DecodePosition(ridx);
if (bitmap.Get(ridx)) {
utils::Assert(!tree[nid].is_leaf(), "inconsistent reduce information");
if (tree[nid].default_left()) {
this->SetEncodePosition(ridx, tree[nid].cright());
} else {
this->SetEncodePosition(ridx, tree[nid].cleft());
}
}
}
}
// synchronize the best solution of each node
virtual void SyncBestSolution(const std::vector<int> &qexpand) {
std::vector<SplitEntry> vec;
for (size_t i = 0; i < qexpand.size(); ++i) {
const int nid = qexpand[i];
for (int tid = 0; tid < this->nthread; ++tid) {
this->snode[nid].best.Update(this->stemp[tid][nid].best);
}
vec.push_back(this->snode[nid].best);
}
// TODO, lazy version
// communicate best solution
reducer.Allreduce(BeginPtr(vec), vec.size());
// assign solution back
for (size_t i = 0; i < qexpand.size(); ++i) {
const int nid = qexpand[i];
this->snode[nid].best = vec[i];
}
}
private:
utils::BitMap bitmap;
std::vector<int> boolmap;
rabit::Reducer<SplitEntry, SplitEntry::Reduce> reducer;
};
// we directly introduce pruner here
TreePruner pruner;
// training parameter
TrainParam param;
// pointer to the builder
Builder builder;
};
} // namespace tree
} // namespace xgboost
#endif

View File

@ -0,0 +1,701 @@
#ifndef XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_
#define XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_
/*!
* \file updater_histmaker-inl.hpp
* \brief use histogram counting to construct a tree
* \author Tianqi Chen
*/
#include <vector>
#include <algorithm>
#include "../sync/sync.h"
#include "../utils/quantile.h"
#include "../utils/group_data.h"
#include "./updater_basemaker-inl.hpp"
namespace xgboost {
namespace tree {
template<typename TStats>
class HistMaker: public BaseMaker {
public:
virtual ~HistMaker(void) {}
virtual void Update(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat,
const BoosterInfo &info,
const std::vector<RegTree*> &trees) {
TStats::CheckInfo(info);
// rescale learning rate according to size of trees
float lr = param.learning_rate;
param.learning_rate = lr / trees.size();
// build tree
for (size_t i = 0; i < trees.size(); ++i) {
this->Update(gpair, p_fmat, info, trees[i]);
}
param.learning_rate = lr;
}
protected:
/*! \brief a single histogram */
struct HistUnit {
/*! \brief cutting point of histogram, contains maximum point */
const bst_float *cut;
/*! \brief content of statistics data */
TStats *data;
/*! \brief size of histogram */
unsigned size;
// default constructor
HistUnit(void) {}
// constructor
HistUnit(const bst_float *cut, TStats *data, unsigned size)
: cut(cut), data(data), size(size) {}
/*! \brief add a histogram to data */
inline void Add(bst_float fv,
const std::vector<bst_gpair> &gpair,
const BoosterInfo &info,
const bst_uint ridx) {
unsigned i = std::upper_bound(cut, cut + size, fv) - cut;
utils::Assert(size != 0, "try insert into size=0");
utils::Assert(i < size,
"maximum value must be in cut, fv = %g, cutmax=%g", fv, cut[size-1]);
data[i].Add(gpair, info, ridx);
}
};
/*! \brief a set of histograms from different index */
struct HistSet {
/*! \brief the index pointer of each histunit */
const unsigned *rptr;
/*! \brief cutting points in each histunit */
const bst_float *cut;
/*! \brief data in different hist unit */
std::vector<TStats> data;
/*! \brief */
inline HistUnit operator[](size_t fid) {
return HistUnit(cut + rptr[fid],
&data[0] + rptr[fid],
rptr[fid+1] - rptr[fid]);
}
};
// thread workspace
struct ThreadWSpace {
/*! \brief actual unit pointer */
std::vector<unsigned> rptr;
/*! \brief cut field */
std::vector<bst_float> cut;
// per thread histset
std::vector<HistSet> hset;
// initialize the hist set
inline void Init(const TrainParam &param, int nthread) {
hset.resize(nthread);
// cleanup statistics
for (int tid = 0; tid < nthread; ++tid) {
for (size_t i = 0; i < hset[tid].data.size(); ++i) {
hset[tid].data[i].Clear();
}
hset[tid].rptr = BeginPtr(rptr);
hset[tid].cut = BeginPtr(cut);
hset[tid].data.resize(cut.size(), TStats(param));
}
}
// aggregate all statistics to hset[0]
inline void Aggregate(void) {
bst_omp_uint nsize = static_cast<bst_omp_uint>(cut.size());
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nsize; ++i) {
for (size_t tid = 1; tid < hset.size(); ++tid) {
hset[0].data[i].Add(hset[tid].data[i]);
}
}
}
/*! \brief clear the workspace */
inline void Clear(void) {
cut.clear(); rptr.resize(1); rptr[0] = 0;
}
/*! \brief total size */
inline size_t Size(void) const {
return rptr.size() - 1;
}
};
// workspace of thread
ThreadWSpace wspace;
// reducer for histogram
rabit::Reducer<TStats, TStats::Reduce> histred;
// set of working features
std::vector<bst_uint> fwork_set;
// update function implementation
virtual void Update(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat,
const BoosterInfo &info,
RegTree *p_tree) {
this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
this->InitWorkSet(p_fmat, *p_tree, &fwork_set);
for (int depth = 0; depth < param.max_depth; ++depth) {
// reset and propose candidate split
this->ResetPosAndPropose(gpair, p_fmat, info, fwork_set, *p_tree);
// create histogram
this->CreateHist(gpair, p_fmat, info, fwork_set, *p_tree);
// find split based on histogram statistics
this->FindSplit(depth, gpair, p_fmat, info, fwork_set, p_tree);
// reset position after split
this->ResetPositionAfterSplit(p_fmat, *p_tree);
this->UpdateQueueExpand(*p_tree);
// if nothing left to be expand, break
if (qexpand.size() == 0) break;
}
for (size_t i = 0; i < qexpand.size(); ++i) {
const int nid = qexpand[i];
(*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
}
}
// this function does two jobs
// (1) reset the position in array position, to be the latest leaf id
// (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly
virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat,
const BoosterInfo &info,
const std::vector <bst_uint> &fset,
const RegTree &tree) = 0;
// initialize the current working set of features in this round
virtual void InitWorkSet(IFMatrix *p_fmat,
const RegTree &tree,
std::vector<bst_uint> *p_fset) {
p_fset->resize(tree.param.num_feature);
for (size_t i = 0; i < p_fset->size(); ++i) {
(*p_fset)[i] = static_cast<unsigned>(i);
}
}
// reset position after split, this is not a must, depending on implementation
virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
const RegTree &tree) {
}
virtual void CreateHist(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat,
const BoosterInfo &info,
const std::vector <bst_uint> &fset,
const RegTree &tree) = 0;
private:
inline void EnumerateSplit(const HistUnit &hist,
const TStats &node_sum,
bst_uint fid,
SplitEntry *best,
TStats *left_sum) {
if (hist.size == 0) return;
double root_gain = node_sum.CalcGain(param);
TStats s(param), c(param);
for (bst_uint i = 0; i < hist.size; ++i) {
s.Add(hist.data[i]);
if (s.sum_hess >= param.min_child_weight) {
c.SetSubstract(node_sum, s);
if (c.sum_hess >= param.min_child_weight) {
double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
if (best->Update((float)loss_chg, fid, hist.cut[i], false)) {
*left_sum = s;
}
}
}
}
s.Clear();
for (bst_uint i = hist.size - 1; i != 0; --i) {
s.Add(hist.data[i]);
if (s.sum_hess >= param.min_child_weight) {
c.SetSubstract(node_sum, s);
if (c.sum_hess >= param.min_child_weight) {
double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
if (best->Update((float)loss_chg, fid, hist.cut[i-1], true)) {
*left_sum = c;
}
}
}
}
}
inline void FindSplit(int depth,
const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat,
const BoosterInfo &info,
const std::vector <bst_uint> &fset,
RegTree *p_tree) {
const size_t num_feature = fset.size();
// get the best split condition for each node
std::vector<SplitEntry> sol(qexpand.size());
std::vector<TStats> left_sum(qexpand.size());
bst_omp_uint nexpand = static_cast<bst_omp_uint>(qexpand.size());
#pragma omp parallel for schedule(dynamic, 1)
for (bst_omp_uint wid = 0; wid < nexpand; ++ wid) {
const int nid = qexpand[wid];
utils::Assert(node2workindex[nid] == static_cast<int>(wid),
"node2workindex inconsistent");
SplitEntry &best = sol[wid];
TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0];
for (size_t i = 0; i < fset.size(); ++ i) {
EnumerateSplit(this->wspace.hset[0][i + wid * (num_feature+1)],
node_sum, fset[i], &best, &left_sum[wid]);
}
}
// get the best result, we can synchronize the solution
for (bst_omp_uint wid = 0; wid < nexpand; ++ wid) {
const int nid = qexpand[wid];
const SplitEntry &best = sol[wid];
const TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0];
this->SetStats(p_tree, nid, node_sum);
// set up the values
p_tree->stat(nid).loss_chg = best.loss_chg;
// now we know the solution in snode[nid], set split
if (best.loss_chg > rt_eps) {
p_tree->AddChilds(nid);
(*p_tree)[nid].set_split(best.split_index(),
best.split_value, best.default_left());
// mark right child as 0, to indicate fresh leaf
(*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0);
(*p_tree)[(*p_tree)[nid].cright()].set_leaf(0.0f, 0);
// right side sum
TStats right_sum;
right_sum.SetSubstract(node_sum, left_sum[wid]);
this->SetStats(p_tree, (*p_tree)[nid].cleft(), left_sum[wid]);
this->SetStats(p_tree, (*p_tree)[nid].cright(), right_sum);
} else {
(*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
}
}
}
inline void SetStats(RegTree *p_tree, int nid, const TStats &node_sum) {
p_tree->stat(nid).base_weight = static_cast<float>(node_sum.CalcWeight(param));
p_tree->stat(nid).sum_hess = static_cast<float>(node_sum.sum_hess);
node_sum.SetLeafVec(param, p_tree->leafvec(nid));
}
};
template<typename TStats>
class CQHistMaker: public HistMaker<TStats> {
protected:
struct HistEntry {
typename HistMaker<TStats>::HistUnit hist;
unsigned istart;
/*!
* \brief add a histogram to data,
* do linear scan, start from istart
*/
inline void Add(bst_float fv,
const std::vector<bst_gpair> &gpair,
const BoosterInfo &info,
const bst_uint ridx) {
while (istart < hist.size && !(fv < hist.cut[istart])) ++istart;
utils::Assert(istart != hist.size, "the bound variable must be max");
hist.data[istart].Add(gpair, info, ridx);
}
};
// sketch type used for this
typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
// initialize the work set of tree
virtual void InitWorkSet(IFMatrix *p_fmat,
const RegTree &tree,
std::vector<bst_uint> *p_fset) {
feat_helper.InitByCol(p_fmat, tree);
feat_helper.SampleCol(this->param.colsample_bytree, p_fset);
}
// code to create histogram
virtual void CreateHist(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat,
const BoosterInfo &info,
const std::vector<bst_uint> &fset,
const RegTree &tree) {
// fill in reverse map
feat2workindex.resize(tree.param.num_feature);
std::fill(feat2workindex.begin(), feat2workindex.end(), -1);
for (size_t i = 0; i < fset.size(); ++i) {
feat2workindex[fset[i]] = static_cast<int>(i);
}
// start to work
this->wspace.Init(this->param, 1);
// if it is C++11, use lazy evaluation for Allreduce,
// to gain speedup in recovery
#if __cplusplus >= 201103L
auto lazy_get_hist = [&]()
#endif
{
thread_hist.resize(this->get_nthread());
// start accumulating statistics
utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fset);
iter->BeforeFirst();
while (iter->Next()) {
const ColBatch &batch = iter->Value();
// start enumeration
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(dynamic, 1)
for (bst_omp_uint i = 0; i < nsize; ++i) {
int offset = feat2workindex[batch.col_index[i]];
if (offset >= 0) {
this->UpdateHistCol(gpair, batch[i], info, tree,
fset, offset,
&thread_hist[omp_get_thread_num()]);
}
}
}
for (size_t i = 0; i < this->qexpand.size(); ++i) {
const int nid = this->qexpand[i];
const int wid = this->node2workindex[nid];
this->wspace.hset[0][fset.size() + wid * (fset.size()+1)]
.data[0] = node_stats[nid];
}
};
// sync the histogram
// if it is C++11, use lazy evaluation for Allreduce
#if __cplusplus >= 201103L
this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data),
this->wspace.hset[0].data.size(), lazy_get_hist);
#else
this->histred.Allreduce(BeginPtr(this->wspace.hset[0].data), this->wspace.hset[0].data.size());
#endif
}
virtual void ResetPositionAfterSplit(IFMatrix *p_fmat,
const RegTree &tree) {
this->ResetPositionCol(this->qexpand, p_fmat, tree);
}
virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat,
const BoosterInfo &info,
const std::vector<bst_uint> &fset,
const RegTree &tree) {
// fill in reverse map
feat2workindex.resize(tree.param.num_feature);
std::fill(feat2workindex.begin(), feat2workindex.end(), -1);
freal_set.clear();
for (size_t i = 0; i < fset.size(); ++i) {
if (feat_helper.Type(fset[i]) == 2) {
feat2workindex[fset[i]] = static_cast<int>(freal_set.size());
freal_set.push_back(fset[i]);
} else {
feat2workindex[fset[i]] = -2;
}
}
this->GetNodeStats(gpair, *p_fmat, tree, info,
&thread_stats, &node_stats);
sketchs.resize(this->qexpand.size() * freal_set.size());
for (size_t i = 0; i < sketchs.size(); ++i) {
sketchs[i].Init(info.num_row, this->param.sketch_eps);
}
// intitialize the summary array
summary_array.resize(sketchs.size());
// setup maximum size
unsigned max_size = this->param.max_sketch_size();
for (size_t i = 0; i < sketchs.size(); ++i) {
summary_array[i].Reserve(max_size);
}
// if it is C++11, use lazy evaluation for Allreduce
#if __cplusplus >= 201103L
auto lazy_get_summary = [&]()
#endif
{// get smmary
thread_sketch.resize(this->get_nthread());
// number of rows in
const size_t nrows = p_fmat->buffered_rowset().size();
// start accumulating statistics
utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(freal_set);
iter->BeforeFirst();
while (iter->Next()) {
const ColBatch &batch = iter->Value();
// start enumeration
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(dynamic, 1)
for (bst_omp_uint i = 0; i < nsize; ++i) {
int offset = feat2workindex[batch.col_index[i]];
if (offset >= 0) {
this->UpdateSketchCol(gpair, batch[i], tree,
node_stats,
freal_set, offset,
batch[i].length == nrows,
&thread_sketch[omp_get_thread_num()]);
}
}
}
for (size_t i = 0; i < sketchs.size(); ++i) {
utils::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
sketchs[i].GetSummary(&out);
summary_array[i].SetPrune(out, max_size);
}
utils::Assert(summary_array.size() == sketchs.size(), "shape mismatch");
};
if (summary_array.size() != 0) {
size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
#if __cplusplus >= 201103L
sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size(), lazy_get_summary);
#else
sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size());
#endif
}
// now we get the final result of sketch, setup the cut
this->wspace.cut.clear();
this->wspace.rptr.clear();
this->wspace.rptr.push_back(0);
for (size_t wid = 0; wid < this->qexpand.size(); ++wid) {
for (size_t i = 0; i < fset.size(); ++i) {
int offset = feat2workindex[fset[i]];
if (offset >= 0) {
const WXQSketch::Summary &a = summary_array[wid * freal_set.size() + offset];
for (size_t i = 1; i < a.size; ++i) {
bst_float cpt = a.data[i].value - rt_eps;
if (i == 1 || cpt > this->wspace.cut.back()) {
this->wspace.cut.push_back(cpt);
}
}
// push a value that is greater than anything
if (a.size != 0) {
bst_float cpt = a.data[a.size - 1].value;
// this must be bigger than last value in a scale
bst_float last = cpt + fabs(cpt) + rt_eps;
this->wspace.cut.push_back(last);
}
this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
} else {
utils::Assert(offset == -2, "BUG in mark");
bst_float cpt = feat_helper.MaxValue(fset[i]);
this->wspace.cut.push_back(cpt + fabs(cpt) + rt_eps);
this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
}
}
// reserve last value for global statistics
this->wspace.cut.push_back(0.0f);
this->wspace.rptr.push_back(static_cast<unsigned>(this->wspace.cut.size()));
}
utils::Assert(this->wspace.rptr.size() ==
(fset.size() + 1) * this->qexpand.size() + 1,
"cut space inconsistent");
}
private:
inline void UpdateHistCol(const std::vector<bst_gpair> &gpair,
const ColBatch::Inst &c,
const BoosterInfo &info,
const RegTree &tree,
const std::vector<bst_uint> &fset,
bst_uint fid_offset,
std::vector<HistEntry> *p_temp) {
if (c.length == 0) return;
// initialize sbuilder for use
std::vector<HistEntry> &hbuilder = *p_temp;
hbuilder.resize(tree.param.num_nodes);
for (size_t i = 0; i < this->qexpand.size(); ++i) {
const unsigned nid = this->qexpand[i];
const unsigned wid = this->node2workindex[nid];
hbuilder[nid].istart = 0;
hbuilder[nid].hist = this->wspace.hset[0][fid_offset + wid * (fset.size()+1)];
}
for (bst_uint j = 0; j < c.length; ++j) {
const bst_uint ridx = c[j].index;
const int nid = this->position[ridx];
if (nid >= 0) {
hbuilder[nid].Add(c[j].fvalue, gpair, info, ridx);
}
}
}
inline void UpdateSketchCol(const std::vector<bst_gpair> &gpair,
const ColBatch::Inst &c,
const RegTree &tree,
const std::vector<TStats> &nstats,
const std::vector<bst_uint> &frealset,
bst_uint offset,
bool col_full,
std::vector<BaseMaker::SketchEntry> *p_temp) {
if (c.length == 0) return;
// initialize sbuilder for use
std::vector<BaseMaker::SketchEntry> &sbuilder = *p_temp;
sbuilder.resize(tree.param.num_nodes);
for (size_t i = 0; i < this->qexpand.size(); ++i) {
const unsigned nid = this->qexpand[i];
const unsigned wid = this->node2workindex[nid];
sbuilder[nid].sum_total = 0.0f;
sbuilder[nid].sketch = &sketchs[wid * frealset.size() + offset];
}
if (!col_full) {
// first pass, get sum of weight, TODO, optimization to skip first pass
for (bst_uint j = 0; j < c.length; ++j) {
const bst_uint ridx = c[j].index;
const int nid = this->position[ridx];
if (nid >= 0) {
sbuilder[nid].sum_total += gpair[ridx].hess;
}
}
} else {
for (size_t i = 0; i < this->qexpand.size(); ++i) {
const unsigned nid = this->qexpand[i];
sbuilder[nid].sum_total = static_cast<bst_float>(nstats[nid].sum_hess);
}
}
// if only one value, no need to do second pass
if (c[0].fvalue == c[c.length-1].fvalue) {
for (size_t i = 0; i < this->qexpand.size(); ++i) {
const int nid = this->qexpand[i];
sbuilder[nid].sketch->Push(c[0].fvalue, sbuilder[nid].sum_total);
}
return;
}
// two pass scan
unsigned max_size = this->param.max_sketch_size();
for (size_t i = 0; i < this->qexpand.size(); ++i) {
const int nid = this->qexpand[i];
sbuilder[nid].Init(max_size);
}
// second pass, build the sketch
for (bst_uint j = 0; j < c.length; ++j) {
const bst_uint ridx = c[j].index;
const int nid = this->position[ridx];
if (nid >= 0) {
sbuilder[nid].Push(c[j].fvalue, gpair[ridx].hess, max_size);
}
}
for (size_t i = 0; i < this->qexpand.size(); ++i) {
const int nid = this->qexpand[i];
sbuilder[nid].Finalize(max_size);
}
}
// feature helper
BaseMaker::FMetaHelper feat_helper;
// temp space to map feature id to working index
std::vector<int> feat2workindex;
// set of index from fset that are real
std::vector<bst_uint> freal_set;
// thread temp data
std::vector< std::vector<BaseMaker::SketchEntry> > thread_sketch;
// used to hold statistics
std::vector< std::vector<TStats> > thread_stats;
// used to hold start pointer
std::vector< std::vector<HistEntry> > thread_hist;
// node statistics
std::vector<TStats> node_stats;
// summary array
std::vector<WXQSketch::SummaryContainer> summary_array;
// reducer for summary
rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
// per node, per feature sketch
std::vector< utils::WXQuantileSketch<bst_float, bst_float> > sketchs;
};
template<typename TStats>
class QuantileHistMaker: public HistMaker<TStats> {
protected:
typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
virtual void ResetPosAndPropose(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat,
const BoosterInfo &info,
const std::vector <bst_uint> &fset,
const RegTree &tree) {
// initialize the data structure
int nthread = BaseMaker::get_nthread();
sketchs.resize(this->qexpand.size() * tree.param.num_feature);
for (size_t i = 0; i < sketchs.size(); ++i) {
sketchs[i].Init(info.num_row, this->param.sketch_eps);
}
// start accumulating statistics
utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
iter->BeforeFirst();
while (iter->Next()) {
const RowBatch &batch = iter->Value();
// parallel convert to column major format
utils::ParallelGroupBuilder<SparseBatch::Entry> builder(&col_ptr, &col_data, &thread_col_ptr);
builder.InitBudget(tree.param.num_feature, nthread);
const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nbatch; ++i) {
RowBatch::Inst inst = batch[i];
const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
int nid = this->position[ridx];
if (nid >= 0) {
if (!tree[nid].is_leaf()) {
this->position[ridx] = nid = HistMaker<TStats>::NextLevel(inst, tree, nid);
}
if (this->node2workindex[nid] < 0) {
this->position[ridx] = ~nid;
} else{
for (bst_uint j = 0; j < inst.length; ++j) {
builder.AddBudget(inst[j].index, omp_get_thread_num());
}
}
}
}
builder.InitStorage();
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nbatch; ++i) {
RowBatch::Inst inst = batch[i];
const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
const int nid = this->position[ridx];
if (nid >= 0) {
for (bst_uint j = 0; j < inst.length; ++j) {
builder.Push(inst[j].index,
SparseBatch::Entry(nid, inst[j].fvalue),
omp_get_thread_num());
}
}
}
// start putting things into sketch
const bst_omp_uint nfeat = col_ptr.size() - 1;
#pragma omp parallel for schedule(dynamic, 1)
for (bst_omp_uint k = 0; k < nfeat; ++k) {
for (size_t i = col_ptr[k]; i < col_ptr[k+1]; ++i) {
const SparseBatch::Entry &e = col_data[i];
const int wid = this->node2workindex[e.index];
sketchs[wid * tree.param.num_feature + k].Push(e.fvalue, gpair[e.index].hess);
}
}
}
// setup maximum size
unsigned max_size = this->param.max_sketch_size();
// synchronize sketch
summary_array.resize(sketchs.size());
for (size_t i = 0; i < sketchs.size(); ++i) {
utils::WQuantileSketch<bst_float, bst_float>::SummaryContainer out;
sketchs[i].GetSummary(&out);
summary_array[i].Reserve(max_size);
summary_array[i].SetPrune(out, max_size);
}
size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size);
sreducer.Allreduce(BeginPtr(summary_array), nbytes, summary_array.size());
// now we get the final result of sketch, setup the cut
this->wspace.cut.clear();
this->wspace.rptr.clear();
this->wspace.rptr.push_back(0);
for (size_t wid = 0; wid < this->qexpand.size(); ++wid) {
for (int fid = 0; fid < tree.param.num_feature; ++fid) {
const WXQSketch::Summary &a = summary_array[wid * tree.param.num_feature + fid];
for (size_t i = 1; i < a.size; ++i) {
bst_float cpt = a.data[i].value - rt_eps;
if (i == 1 || cpt > this->wspace.cut.back()) {
this->wspace.cut.push_back(cpt);
}
}
// push a value that is greater than anything
if (a.size != 0) {
bst_float cpt = a.data[a.size - 1].value;
// this must be bigger than last value in a scale
bst_float last = cpt + fabs(cpt) + rt_eps;
this->wspace.cut.push_back(last);
}
this->wspace.rptr.push_back(this->wspace.cut.size());
}
// reserve last value for global statistics
this->wspace.cut.push_back(0.0f);
this->wspace.rptr.push_back(this->wspace.cut.size());
}
utils::Assert(this->wspace.rptr.size() ==
(tree.param.num_feature + 1) * this->qexpand.size() + 1,
"cut space inconsistent");
}
private:
// summary array
std::vector<WXQSketch::SummaryContainer> summary_array;
// reducer for summary
rabit::SerializeReducer<WXQSketch::SummaryContainer> sreducer;
// local temp column data structure
std::vector<size_t> col_ptr;
// local storage of column data
std::vector<SparseBatch::Entry> col_data;
std::vector< std::vector<size_t> > thread_col_ptr;
// per node, per feature sketch
std::vector< utils::WQuantileSketch<bst_float, bst_float> > sketchs;
};
} // namespace tree
} // namespace xgboost
#endif // XGBOOST_TREE_UPDATER_HISTMAKER_INL_HPP_

View File

@ -8,6 +8,7 @@
#include <vector>
#include "./param.h"
#include "./updater.h"
#include "./updater_sync-inl.hpp"
namespace xgboost {
namespace tree {
@ -19,6 +20,7 @@ class TreePruner: public IUpdater {
virtual void SetParam(const char *name, const char *val) {
using namespace std;
param.SetParam(name, val);
syncher.SetParam(name, val);
if (!strcmp(name, "silent")) silent = atoi(val);
}
// update the tree, do pruning
@ -33,8 +35,8 @@ class TreePruner: public IUpdater {
this->DoPrune(*trees[i]);
}
param.learning_rate = lr;
syncher.Update(gpair, p_fmat, info, trees);
}
private:
// try to prune off current leaf
inline int TryPruneLeaf(RegTree &tree, int nid, int depth, int npruned) {
@ -70,6 +72,8 @@ class TreePruner: public IUpdater {
}
private:
// synchronizer
TreeSyncher syncher;
// shutup
int silent;
// training parameter

View File

@ -7,6 +7,7 @@
*/
#include <vector>
#include <limits>
#include "../sync/sync.h"
#include "./param.h"
#include "./updater.h"
#include "../utils/omp.h"
@ -26,7 +27,7 @@ class TreeRefresher: public IUpdater {
virtual void Update(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat,
const BoosterInfo &info,
const std::vector<RegTree*> &trees) {
const std::vector<RegTree*> &trees) {
if (trees.size() == 0) return;
// number of threads
// thread temporal space
@ -39,54 +40,71 @@ class TreeRefresher: public IUpdater {
nthread = omp_get_num_threads();
}
fvec_temp.resize(nthread, RegTree::FVec());
stemp.resize(trees.size() * nthread, std::vector<TStats>());
stemp.resize(nthread, std::vector<TStats>());
#pragma omp parallel
{
int tid = omp_get_thread_num();
int num_nodes = 0;
for (size_t i = 0; i < trees.size(); ++i) {
std::vector<TStats> &vec = stemp[tid * trees.size() + i];
vec.resize(trees[i]->param.num_nodes, TStats(param));
std::fill(vec.begin(), vec.end(), TStats(param));
num_nodes += trees[i]->param.num_nodes;
}
stemp[tid].resize(num_nodes, TStats(param));
std::fill(stemp[tid].begin(), stemp[tid].end(), TStats(param));
fvec_temp[tid].Init(trees[0]->param.num_feature);
}
// start accumulating statistics
utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
iter->BeforeFirst();
while (iter->Next()) {
const RowBatch &batch = iter->Value();
utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
"too large batch size ");
const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nbatch; ++i) {
RowBatch::Inst inst = batch[i];
const int tid = omp_get_thread_num();
const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
RegTree::FVec &feats = fvec_temp[tid];
feats.Fill(inst);
for (size_t j = 0; j < trees.size(); ++j) {
AddStats(*trees[j], feats, gpair, info, ridx,
&stemp[tid * trees.size() + j]);
// if it is C++11, use lazy evaluation for Allreduce,
// to gain speedup in recovery
#if __cplusplus >= 201103L
auto lazy_get_stats = [&]()
#endif
{
// start accumulating statistics
utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
iter->BeforeFirst();
while (iter->Next()) {
const RowBatch &batch = iter->Value();
utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
"too large batch size ");
const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nbatch; ++i) {
RowBatch::Inst inst = batch[i];
const int tid = omp_get_thread_num();
const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
RegTree::FVec &feats = fvec_temp[tid];
feats.Fill(inst);
int offset = 0;
for (size_t j = 0; j < trees.size(); ++j) {
AddStats(*trees[j], feats, gpair, info, ridx,
BeginPtr(stemp[tid]) + offset);
offset += trees[j]->param.num_nodes;
}
feats.Drop(inst);
}
feats.Drop(inst);
}
}
// start update the trees using the statistics
// aggregate the statistics
int num_nodes = static_cast<int>(stemp[0].size());
#pragma omp parallel for schedule(static)
for (int nid = 0; nid < num_nodes; ++nid) {
for (int tid = 1; tid < nthread; ++tid) {
stemp[0][nid].Add(stemp[tid][nid]);
}
}
};
#if __cplusplus >= 201103L
reducer.Allreduce(BeginPtr(stemp[0]), stemp[0].size(), lazy_get_stats);
#else
reducer.Allreduce(BeginPtr(stemp[0]), stemp[0].size());
#endif
// rescale learning rate according to size of trees
float lr = param.learning_rate;
param.learning_rate = lr / trees.size();
for (size_t i = 0; i < trees.size(); ++i) {
// aggregate
#pragma omp parallel for schedule(static)
for (int nid = 0; nid < trees[i]->param.num_nodes; ++nid) {
for (int tid = 1; tid < nthread; ++tid) {
stemp[i][nid].Add(stemp[tid * trees.size() + i][nid]);
}
}
int offset = 0;
for (size_t i = 0; i < trees.size(); ++i) {
for (int rid = 0; rid < trees[i]->param.num_roots; ++rid) {
this->Refresh(stemp[i], rid, trees[i]);
this->Refresh(BeginPtr(stemp[0]) + offset, rid, trees[i]);
}
offset += trees[i]->param.num_nodes;
}
// set learning rate back
param.learning_rate = lr;
@ -98,8 +116,7 @@ class TreeRefresher: public IUpdater {
const std::vector<bst_gpair> &gpair,
const BoosterInfo &info,
const bst_uint ridx,
std::vector<TStats> *p_gstats) {
std::vector<TStats> &gstats = *p_gstats;
TStats *gstats) {
// start from groups that belongs to current data
int pid = static_cast<int>(info.GetRoot(ridx));
gstats[pid].Add(gpair, info, ridx);
@ -110,7 +127,7 @@ class TreeRefresher: public IUpdater {
gstats[pid].Add(gpair, info, ridx);
}
}
inline void Refresh(const std::vector<TStats> &gstats,
inline void Refresh(const TStats *gstats,
int nid, RegTree *p_tree) {
RegTree &tree = *p_tree;
tree.stat(nid).base_weight = static_cast<float>(gstats[nid].CalcWeight(param));
@ -129,6 +146,8 @@ class TreeRefresher: public IUpdater {
}
// training parameter
TrainParam param;
// reducer
rabit::Reducer<TStats, TStats::Reduce> reducer;
};
} // namespace tree

View File

@ -0,0 +1,393 @@
#ifndef XGBOOST_TREE_UPDATER_SKMAKER_INL_HPP_
#define XGBOOST_TREE_UPDATER_SKMAKER_INL_HPP_
/*!
* \file updater_skmaker-inl.hpp
* \brief use approximation sketch to construct a tree,
a refresh is needed to make the statistics exactly correct
* \author Tianqi Chen
*/
#include <vector>
#include <algorithm>
#include <rabit.h>
#include "../utils/quantile.h"
#include "./updater_basemaker-inl.hpp"
namespace xgboost {
namespace tree {
class SketchMaker: public BaseMaker {
public:
virtual ~SketchMaker(void) {}
virtual void Update(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat,
const BoosterInfo &info,
const std::vector<RegTree*> &trees) {
// rescale learning rate according to size of trees
float lr = param.learning_rate;
param.learning_rate = lr / trees.size();
// build tree
for (size_t i = 0; i < trees.size(); ++i) {
this->Update(gpair, p_fmat, info, trees[i]);
}
param.learning_rate = lr;
}
protected:
inline void Update(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat,
const BoosterInfo &info,
RegTree *p_tree) {
this->InitData(gpair, *p_fmat, info.root_index, *p_tree);
for (int depth = 0; depth < param.max_depth; ++depth) {
this->GetNodeStats(gpair, *p_fmat, *p_tree, info,
&thread_stats, &node_stats);
this->BuildSketch(gpair, p_fmat, info, *p_tree);
this->SyncNodeStats();
this->FindSplit(depth, gpair, p_fmat, info, p_tree);
this->ResetPositionCol(qexpand, p_fmat, *p_tree);
this->UpdateQueueExpand(*p_tree);
// if nothing left to be expand, break
if (qexpand.size() == 0) break;
}
if (qexpand.size() != 0) {
this->GetNodeStats(gpair, *p_fmat, *p_tree, info,
&thread_stats, &node_stats);
this->SyncNodeStats();
}
// set all statistics correctly
for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) {
this->SetStats(nid, node_stats[nid], p_tree);
if (!(*p_tree)[nid].is_leaf()) {
p_tree->stat(nid).loss_chg =
node_stats[(*p_tree)[nid].cleft()].CalcGain(param) +
node_stats[(*p_tree)[nid].cright()].CalcGain(param) -
node_stats[nid].CalcGain(param);
}
}
// set left leaves
for (size_t i = 0; i < qexpand.size(); ++i) {
const int nid = qexpand[i];
(*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
}
}
// define the sketch we want to use
typedef utils::WXQuantileSketch<bst_float, bst_float> WXQSketch;
private:
// statistics needed in the gradient calculation
struct SKStats {
/*! \brief sum of all positive gradient */
double pos_grad;
/*! \brief sum of all negative gradient */
double neg_grad;
/*! \brief sum of hessian statistics */
double sum_hess;
explicit SKStats(void) {}
// constructor
explicit SKStats(const TrainParam &param) {
this->Clear();
}
/*! \brief clear the statistics */
inline void Clear(void) {
neg_grad = pos_grad = sum_hess = 0.0f;
}
// accumulate statistics
inline void Add(const std::vector<bst_gpair> &gpair,
const BoosterInfo &info,
bst_uint ridx) {
const bst_gpair &b = gpair[ridx];
if (b.grad >= 0.0f) {
pos_grad += b.grad;
} else {
neg_grad -= b.grad;
}
sum_hess += b.hess;
}
/*! \brief calculate gain of the solution */
inline double CalcGain(const TrainParam &param) const {
return param.CalcGain(pos_grad - neg_grad, sum_hess);
}
/*! \brief set current value to a - b */
inline void SetSubstract(const SKStats &a, const SKStats &b) {
pos_grad = a.pos_grad - b.pos_grad;
neg_grad = a.neg_grad - b.neg_grad;
sum_hess = a.sum_hess - b.sum_hess;
}
// calculate leaf weight
inline double CalcWeight(const TrainParam &param) const {
return param.CalcWeight(pos_grad - neg_grad, sum_hess);
}
/*! \brief add statistics to the data */
inline void Add(const SKStats &b) {
pos_grad += b.pos_grad;
neg_grad += b.neg_grad;
sum_hess += b.sum_hess;
}
/*! \brief same as add, reduce is used in All Reduce */
inline void Reduce(const SKStats &b) {
this->Add(b);
}
/*! \brief set leaf vector value based on statistics */
inline void SetLeafVec(const TrainParam &param, bst_float *vec) const {
}
};
inline void BuildSketch(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat,
const BoosterInfo &info,
const RegTree &tree) {
sketchs.resize(this->qexpand.size() * tree.param.num_feature * 3);
for (size_t i = 0; i < sketchs.size(); ++i) {
sketchs[i].Init(info.num_row, this->param.sketch_eps);
}
thread_sketch.resize(this->get_nthread());
// number of rows in
const size_t nrows = p_fmat->buffered_rowset().size();
// start accumulating statistics
utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
iter->BeforeFirst();
while (iter->Next()) {
const ColBatch &batch = iter->Value();
// start enumeration
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(dynamic, 1)
for (bst_omp_uint i = 0; i < nsize; ++i) {
this->UpdateSketchCol(gpair, batch[i], tree,
node_stats,
batch.col_index[i],
batch[i].length == nrows,
&thread_sketch[omp_get_thread_num()]);
}
}
// setup maximum size
unsigned max_size = param.max_sketch_size();
// synchronize sketch
summary_array.Init(sketchs.size(), max_size);
for (size_t i = 0; i < sketchs.size(); ++i) {
utils::WXQuantileSketch<bst_float, bst_float>::SummaryContainer out;
sketchs[i].GetSummary(&out);
summary_array.Set(i, out);
}
size_t nbytes = summary_array.MemSize();;
sketch_reducer.Allreduce(&summary_array, nbytes);
}
// update sketch information in column fid
inline void UpdateSketchCol(const std::vector<bst_gpair> &gpair,
const ColBatch::Inst &c,
const RegTree &tree,
const std::vector<SKStats> &nstats,
bst_uint fid,
bool col_full,
std::vector<SketchEntry> *p_temp) {
if (c.length == 0) return;
// initialize sbuilder for use
std::vector<SketchEntry> &sbuilder = *p_temp;
sbuilder.resize(tree.param.num_nodes * 3);
for (size_t i = 0; i < this->qexpand.size(); ++i) {
const unsigned nid = this->qexpand[i];
const unsigned wid = this->node2workindex[nid];
for (int k = 0; k < 3; ++k) {
sbuilder[3 * nid + k].sum_total = 0.0f;
sbuilder[3 * nid + k].sketch = &sketchs[(wid * tree.param.num_feature + fid) * 3 + k];
}
}
if (!col_full) {
for (bst_uint j = 0; j < c.length; ++j) {
const bst_uint ridx = c[j].index;
const int nid = this->position[ridx];
if (nid >= 0) {
const bst_gpair &e = gpair[ridx];
if (e.grad >= 0.0f) {
sbuilder[3 * nid + 0].sum_total += e.grad;
} else {
sbuilder[3 * nid + 1].sum_total -= e.grad;
}
sbuilder[3 * nid + 2].sum_total += e.hess;
}
}
} else {
for (size_t i = 0; i < this->qexpand.size(); ++i) {
const unsigned nid = this->qexpand[i];
sbuilder[3 * nid + 0].sum_total = nstats[nid].pos_grad;
sbuilder[3 * nid + 1].sum_total = nstats[nid].neg_grad;
sbuilder[3 * nid + 2].sum_total = nstats[nid].sum_hess;
}
}
// if only one value, no need to do second pass
if (c[0].fvalue == c[c.length-1].fvalue) {
for (size_t i = 0; i < this->qexpand.size(); ++i) {
const int nid = this->qexpand[i];
for (int k = 0; k < 3; ++k) {
sbuilder[3 * nid + k].sketch->Push(c[0].fvalue, sbuilder[3 * nid + k].sum_total);
}
}
return;
}
// two pass scan
unsigned max_size = param.max_sketch_size();
for (size_t i = 0; i < this->qexpand.size(); ++i) {
const int nid = this->qexpand[i];
for (int k = 0; k < 3; ++k) {
sbuilder[3 * nid + k].Init(max_size);
}
}
// second pass, build the sketch
for (bst_uint j = 0; j < c.length; ++j) {
const bst_uint ridx = c[j].index;
const int nid = this->position[ridx];
if (nid >= 0) {
const bst_gpair &e = gpair[ridx];
if (e.grad >= 0.0f) {
sbuilder[3 * nid + 0].Push(c[j].fvalue, e.grad, max_size);
} else {
sbuilder[3 * nid + 1].Push(c[j].fvalue, -e.grad, max_size);
}
sbuilder[3 * nid + 2].Push(c[j].fvalue, e.hess, max_size);
}
}
for (size_t i = 0; i < this->qexpand.size(); ++i) {
const int nid = this->qexpand[i];
for (int k = 0; k < 3; ++k) {
sbuilder[3 * nid + k].Finalize(max_size);
}
}
}
inline void SyncNodeStats(void) {
utils::Assert(qexpand.size() != 0, "qexpand must not be empty");
std::vector<SKStats> tmp(qexpand.size());
for (size_t i = 0; i < qexpand.size(); ++i) {
tmp[i] = node_stats[qexpand[i]];
}
stats_reducer.Allreduce(BeginPtr(tmp), tmp.size());
for (size_t i = 0; i < qexpand.size(); ++i) {
node_stats[qexpand[i]] = tmp[i];
}
}
inline void FindSplit(int depth,
const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat,
const BoosterInfo &info,
RegTree *p_tree) {
const bst_uint num_feature = p_tree->param.num_feature;
// get the best split condition for each node
std::vector<SplitEntry> sol(qexpand.size());
bst_omp_uint nexpand = static_cast<bst_omp_uint>(qexpand.size());
#pragma omp parallel for schedule(dynamic, 1)
for (bst_omp_uint wid = 0; wid < nexpand; ++ wid) {
const int nid = qexpand[wid];
utils::Assert(node2workindex[nid] == static_cast<int>(wid),
"node2workindex inconsistent");
SplitEntry &best = sol[wid];
for (bst_uint fid = 0; fid < num_feature; ++ fid) {
unsigned base = (wid * p_tree->param.num_feature + fid) * 3;
EnumerateSplit(summary_array[base + 0],
summary_array[base + 1],
summary_array[base + 2],
node_stats[nid], fid, &best);
}
}
// get the best result, we can synchronize the solution
for (bst_omp_uint wid = 0; wid < nexpand; ++ wid) {
const int nid = qexpand[wid];
const SplitEntry &best = sol[wid];
// set up the values
p_tree->stat(nid).loss_chg = best.loss_chg;
this->SetStats(nid, node_stats[nid], p_tree);
// now we know the solution in snode[nid], set split
if (best.loss_chg > rt_eps) {
p_tree->AddChilds(nid);
(*p_tree)[nid].set_split(best.split_index(),
best.split_value, best.default_left());
// mark right child as 0, to indicate fresh leaf
(*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0);
(*p_tree)[(*p_tree)[nid].cright()].set_leaf(0.0f, 0);
} else {
(*p_tree)[nid].set_leaf(p_tree->stat(nid).base_weight * param.learning_rate);
}
}
}
// set statistics on ptree
inline void SetStats(int nid, const SKStats &node_sum, RegTree *p_tree) {
p_tree->stat(nid).base_weight = node_sum.CalcWeight(param);
p_tree->stat(nid).sum_hess = static_cast<float>(node_sum.sum_hess);
node_sum.SetLeafVec(param, p_tree->leafvec(nid));
}
inline void EnumerateSplit(const WXQSketch::Summary &pos_grad,
const WXQSketch::Summary &neg_grad,
const WXQSketch::Summary &sum_hess,
const SKStats &node_sum,
bst_uint fid,
SplitEntry *best) {
if (sum_hess.size == 0) return;
double root_gain = node_sum.CalcGain(param);
std::vector<bst_float> fsplits;
for (size_t i = 0; i < pos_grad.size; ++i) {
fsplits.push_back(pos_grad.data[i].value);
}
for (size_t i = 0; i < neg_grad.size; ++i) {
fsplits.push_back(neg_grad.data[i].value);
}
for (size_t i = 0; i < sum_hess.size; ++i) {
fsplits.push_back(sum_hess.data[i].value);
}
std::sort(fsplits.begin(), fsplits.end());
fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
// sum feature
SKStats feat_sum;
feat_sum.pos_grad = pos_grad.data[pos_grad.size - 1].rmax;
feat_sum.neg_grad = neg_grad.data[neg_grad.size - 1].rmax;
feat_sum.sum_hess = sum_hess.data[sum_hess.size - 1].rmax;
size_t ipos = 0, ineg = 0, ihess = 0;
for (size_t i = 1; i < fsplits.size(); ++i) {
WXQSketch::Entry pos = pos_grad.Query(fsplits[i], ipos);
WXQSketch::Entry neg = neg_grad.Query(fsplits[i], ineg);
WXQSketch::Entry hess = sum_hess.Query(fsplits[i], ihess);
SKStats s, c;
s.pos_grad = 0.5f * (pos.rmin + pos.rmax - pos.wmin);
s.neg_grad = 0.5f * (neg.rmin + neg.rmax - neg.wmin);
s.sum_hess = 0.5f * (hess.rmin + hess.rmax - hess.wmin);
c.SetSubstract(node_sum, s);
// forward
if (s.sum_hess >= param.min_child_weight &&
c.sum_hess >= param.min_child_weight) {
double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
best->Update(loss_chg, fid, fsplits[i], false);
}
// backward
c.SetSubstract(feat_sum, s);
s.SetSubstract(node_sum, c);
if (s.sum_hess >= param.min_child_weight &&
c.sum_hess >= param.min_child_weight) {
double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
best->Update(loss_chg, fid, fsplits[i], true);
}
}
{// all including
SKStats s = feat_sum, c;
c.SetSubstract(node_sum, s);
if (s.sum_hess >= param.min_child_weight &&
c.sum_hess >= param.min_child_weight) {
bst_float cpt = fsplits.back();
double loss_chg = s.CalcGain(param) + c.CalcGain(param) - root_gain;
best->Update(loss_chg, fid, cpt + fabsf(cpt) + 1.0f, true);
}
}
}
// thread temp data
// used to hold temporal sketch
std::vector< std::vector<SketchEntry> > thread_sketch;
// used to hold statistics
std::vector< std::vector<SKStats> > thread_stats;
// node statistics
std::vector<SKStats> node_stats;
// summary array
WXQSketch::SummaryArray summary_array;
// reducer for summary
rabit::Reducer<SKStats> stats_reducer;
// reducer for summary
rabit::SerializeReducer<WXQSketch::SummaryArray> sketch_reducer;
// per node, per feature sketch
std::vector< utils::WXQuantileSketch<bst_float, bst_float> > sketchs;
};
} // tree
} // xgboost
#endif

View File

@ -0,0 +1,53 @@
#ifndef XGBOOST_TREE_UPDATER_SYNC_INL_HPP_
#define XGBOOST_TREE_UPDATER_SYNC_INL_HPP_
/*!
* \file updater_sync-inl.hpp
* \brief synchronize the tree in all distributed nodes
* \author Tianqi Chen
*/
#include <vector>
#include <limits>
#include "../sync/sync.h"
#include "./updater.h"
namespace xgboost {
namespace tree {
/*!
* \brief syncher that synchronize the tree in all distributed nodes
* can implement various strategies, so far it is always set to node 0's tree
*/
class TreeSyncher: public IUpdater {
public:
virtual ~TreeSyncher(void) {}
virtual void SetParam(const char *name, const char *val) {
}
// update the tree, do pruning
virtual void Update(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat,
const BoosterInfo &info,
const std::vector<RegTree*> &trees) {
this->SyncTrees(trees);
}
private:
// synchronize the trees in different nodes, take tree from rank 0
inline void SyncTrees(const std::vector<RegTree *> &trees) {
if (rabit::GetWorldSize() == 1) return;
std::string s_model;
utils::MemoryBufferStream fs(&s_model);
int rank = rabit::GetRank();
if (rank == 0) {
for (size_t i = 0; i < trees.size(); ++i) {
trees[i]->SaveModel(fs);
}
}
fs.Seek(0);
rabit::Broadcast(&s_model, 0);
for (size_t i = 0; i < trees.size(); ++i) {
trees[i]->LoadModel(fs);
}
}
};
} // namespace tree
} // namespace xgboost
#endif // XGBOOST_TREE_UPDATER_SYNC_INL_HPP_

205
src/utils/base64.h Normal file
View File

@ -0,0 +1,205 @@
#ifndef XGBOOST_UTILS_BASE64_H_
#define XGBOOST_UTILS_BASE64_H_
/*!
* \file base64.h
* \brief data stream support to input and output from/to base64 stream
* base64 is easier to store and pass as text format in mapreduce
* \author Tianqi Chen
*/
#include <cctype>
#include <cstdio>
#include "./utils.h"
#include "./io.h"
namespace xgboost {
namespace utils {
/*! \brief namespace of base64 decoding and encoding table */
namespace base64 {
const char DecodeTable[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
62, // '+'
0, 0, 0,
63, // '/'
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, // '0'-'9'
0, 0, 0, 0, 0, 0, 0,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, // 'A'-'Z'
0, 0, 0, 0, 0, 0,
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, // 'a'-'z'
};
static const char EncodeTable[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
} // namespace base64
/*! \brief the stream that reads from base64, note we take from file pointers */
class Base64InStream: public IStream {
public:
explicit Base64InStream(FILE *fp) : fp(fp) {
num_prev = 0; tmp_ch = 0;
}
/*!
* \brief initialize the stream position to beginning of next base64 stream
* call this function before actually start read
*/
inline void InitPosition(void) {
// get a charater
do {
tmp_ch = fgetc(fp);
} while (isspace(tmp_ch));
}
/*! \brief whether current position is end of a base64 stream */
inline bool IsEOF(void) const {
return num_prev == 0 && (tmp_ch == EOF || isspace(tmp_ch));
}
virtual size_t Read(void *ptr, size_t size) {
using base64::DecodeTable;
if (size == 0) return 0;
// use tlen to record left size
size_t tlen = size;
unsigned char *cptr = static_cast<unsigned char*>(ptr);
// if anything left, load from previous buffered result
if (num_prev != 0) {
if (num_prev == 2) {
if (tlen >= 2) {
*cptr++ = buf_prev[0];
*cptr++ = buf_prev[1];
tlen -= 2;
num_prev = 0;
} else {
// assert tlen == 1
*cptr++ = buf_prev[0]; --tlen;
buf_prev[0] = buf_prev[1];
num_prev = 1;
}
} else {
// assert num_prev == 1
*cptr++ = buf_prev[0]; --tlen; num_prev = 0;
}
}
if (tlen == 0) return size;
int nvalue;
// note: everything goes with 4 bytes in Base64
// so we process 4 bytes a unit
while (tlen && tmp_ch != EOF && !isspace(tmp_ch)) {
// first byte
nvalue = DecodeTable[tmp_ch] << 18;
{
// second byte
Check((tmp_ch = fgetc(fp), tmp_ch != EOF && !isspace(tmp_ch)),
"invalid base64 format");
nvalue |= DecodeTable[tmp_ch] << 12;
*cptr++ = (nvalue >> 16) & 0xFF; --tlen;
}
{
// third byte
Check((tmp_ch = fgetc(fp), tmp_ch != EOF && !isspace(tmp_ch)),
"invalid base64 format");
// handle termination
if (tmp_ch == '=') {
Check((tmp_ch = fgetc(fp), tmp_ch == '='), "invalid base64 format");
Check((tmp_ch = fgetc(fp), tmp_ch == EOF || isspace(tmp_ch)),
"invalid base64 format");
break;
}
nvalue |= DecodeTable[tmp_ch] << 6;
if (tlen) {
*cptr++ = (nvalue >> 8) & 0xFF; --tlen;
} else {
buf_prev[num_prev++] = (nvalue >> 8) & 0xFF;
}
}
{
// fourth byte
Check((tmp_ch = fgetc(fp), tmp_ch != EOF && !isspace(tmp_ch)),
"invalid base64 format");
if (tmp_ch == '=') {
Check((tmp_ch = fgetc(fp), tmp_ch == EOF || isspace(tmp_ch)),
"invalid base64 format");
break;
}
nvalue |= DecodeTable[tmp_ch];
if (tlen) {
*cptr++ = nvalue & 0xFF; --tlen;
} else {
buf_prev[num_prev ++] = nvalue & 0xFF;
}
}
// get next char
tmp_ch = fgetc(fp);
}
if (kStrictCheck) {
Check(tlen == 0, "Base64InStream: read incomplete");
}
return size - tlen;
}
virtual void Write(const void *ptr, size_t size) {
utils::Error("Base64InStream do not support write");
}
private:
FILE *fp;
unsigned char tmp_ch;
int num_prev;
unsigned char buf_prev[2];
// whether we need to do strict check
static const bool kStrictCheck = false;
};
/*! \brief the stream that write to base64, note we take from file pointers */
class Base64OutStream: public IStream {
public:
explicit Base64OutStream(FILE *fp) : fp(fp) {
buf_top = 0;
}
virtual void Write(const void *ptr, size_t size) {
using base64::EncodeTable;
size_t tlen = size;
const unsigned char *cptr = static_cast<const unsigned char*>(ptr);
while (tlen) {
while (buf_top < 3 && tlen != 0) {
buf[++buf_top] = *cptr++; --tlen;
}
if (buf_top == 3) {
// flush 4 bytes out
fputc(EncodeTable[buf[1] >> 2], fp);
fputc(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F], fp);
fputc(EncodeTable[((buf[2] << 2) | (buf[3] >> 6)) & 0x3F], fp);
fputc(EncodeTable[buf[3] & 0x3F], fp);
buf_top = 0;
}
}
}
virtual size_t Read(void *ptr, size_t size) {
Error("Base64OutStream do not support read");
return 0;
}
/*!
* \brief finish writing of all current base64 stream, do some post processing
* \param endch charater to put to end of stream, if it is EOF, then nothing will be done
*/
inline void Finish(char endch = EOF) {
using base64::EncodeTable;
if (buf_top == 1) {
fputc(EncodeTable[buf[1] >> 2], fp);
fputc(EncodeTable[(buf[1] << 4) & 0x3F], fp);
fputc('=', fp);
fputc('=', fp);
}
if (buf_top == 2) {
fputc(EncodeTable[buf[1] >> 2], fp);
fputc(EncodeTable[((buf[1] << 4) | (buf[2] >> 4)) & 0x3F], fp);
fputc(EncodeTable[(buf[2] << 2) & 0x3F], fp);
fputc('=', fp);
}
buf_top = 0;
if (endch != EOF) fputc(endch, fp);
}
private:
FILE *fp;
int buf_top;
unsigned char buf[4];
};
} // namespace utils
} // namespace xgboost
#endif // XGBOOST_UTILS_BASE64_H_

66
src/utils/bitmap.h Normal file
View File

@ -0,0 +1,66 @@
#ifndef XGBOOST_UTILS_BITMAP_H_
#define XGBOOST_UTILS_BITMAP_H_
/*!
* \file bitmap.h
* \brief a simple implement of bitmap
* NOTE: bitmap is only threadsafe per word access, remember this when using bitmap
* \author Tianqi Chen
*/
#include <vector>
#include "./utils.h"
#include "./omp.h"
namespace xgboost {
namespace utils {
/*! \brief bit map that contains set of bit indicators */
struct BitMap {
/*! \brief internal data structure */
std::vector<uint32_t> data;
/*!
* \brief resize the bitmap to be certain size
* \param size the size of bitmap
*/
inline void Resize(size_t size) {
data.resize((size + 31U) >> 5, 0);
}
/*!
* \brief query the i-th position of bitmap
* \param i the position in
*/
inline bool Get(size_t i) const {
return (data[i >> 5] >> (i & 31U)) & 1U;
}
/*!
* \brief set i-th position to true
* \param i position index
*/
inline void SetTrue(size_t i) {
data[i >> 5] |= (1 << (i & 31U));
}
/*! \brief initialize the value of bit map from vector of bool*/
inline void InitFromBool(const std::vector<int> &vec) {
this->Resize(vec.size());
// parallel over the full cases
bst_omp_uint nsize = static_cast<bst_omp_uint>(vec.size() / 32);
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nsize; ++i) {
uint32_t res = 0;
for (int k = 0; k < 32; ++k) {
int bit = vec[(i << 5) | k];
res |= (bit << k);
}
data[i] = res;
}
if (nsize != vec.size()) data.back() = 0;
for (size_t i = nsize; i < vec.size(); ++i) {
if (vec[i]) this->SetTrue(i);
}
}
/*! \brief clear the bitmap, set all places to false */
inline void Clear(void) {
std::fill(data.begin(), data.end(), 0U);
}
};
} // namespace utils
} // namespace xgboost
#endif

111
src/utils/group_data.h Normal file
View File

@ -0,0 +1,111 @@
#ifndef XGBOOST_UTILS_GROUP_DATA_H_
#define XGBOOST_UTILS_GROUP_DATA_H_
/*!
* \file group_data.h
* \brief this file defines utils to group data by integer keys
* Input: given input sequence (key,value), (k1,v1), (k2,v2)
* Ouptupt: an array of values data = [v1,v2,v3 .. vn]
* and a group pointer ptr,
* data[ptr[k]:ptr[k+1]] contains values that corresponds to key k
*
* This can be used to construct CSR/CSC matrix from un-ordered input
* The major algorithm is a two pass linear scan algorithm that requires two pass scan over the data
* \author Tianqi Chen
*/
namespace xgboost {
namespace utils {
/*!
* \brief multi-thread version of group builder
* \tparam ValueType type of entries in the sparse matrix
* \tparam SizeType type of the index range holder
*/
template<typename ValueType, typename SizeType = size_t>
struct ParallelGroupBuilder {
public:
// parallel group builder of data
ParallelGroupBuilder(std::vector<SizeType> *p_rptr,
std::vector<ValueType> *p_data)
: rptr(*p_rptr), data(*p_data), thread_rptr(tmp_thread_rptr) {
}
ParallelGroupBuilder(std::vector<SizeType> *p_rptr,
std::vector<ValueType> *p_data,
std::vector< std::vector<SizeType> > *p_thread_rptr)
: rptr(*p_rptr), data(*p_data), thread_rptr(*p_thread_rptr) {
}
public:
/*!
* \brief step 1: initialize the helper, with hint of number keys
* and thread used in the construction
* \param nkeys number of keys in the matrix, can be smaller than expected
* \param nthread number of thread that will be used in construction
*/
inline void InitBudget(size_t nkeys = 0, int nthread = 1) {
thread_rptr.resize(nthread);
for (size_t i = 0; i < thread_rptr.size(); ++i) {
thread_rptr[i].resize(nkeys);
std::fill(thread_rptr[i].begin(), thread_rptr[i].end(), 0);
}
}
/*!
* \brief step 2: add budget to each key
* \param key the key
* \param threadid the id of thread that calls this function
* \param nelem number of element budget add to this row
*/
inline void AddBudget(size_t key, int threadid = 0, SizeType nelem = 1) {
std::vector<SizeType> &trptr = thread_rptr[threadid];
if (trptr.size() < key + 1) {
trptr.resize(key + 1, 0);
}
trptr[key] += nelem;
}
/*! \brief step 3: initialize the necessary storage */
inline void InitStorage(void) {
// set rptr to correct size
for (size_t tid = 0; tid < thread_rptr.size(); ++tid) {
if (rptr.size() <= thread_rptr[tid].size()) {
rptr.resize(thread_rptr[tid].size()+1);
}
}
// initialize rptr to be beginning of each segment
size_t start = 0;
for (size_t i = 0; i + 1 < rptr.size(); ++i) {
for (size_t tid = 0; tid < thread_rptr.size(); ++tid) {
std::vector<SizeType> &trptr = thread_rptr[tid];
if (i < trptr.size()) {
size_t ncnt = trptr[i];
trptr[i] = start;
start += ncnt;
}
}
rptr[i + 1] = start;
}
data.resize(start);
}
/*!
* \brief step 4: add data to the allocated space,
* the calls to this function should be exactly match previous call to AddBudget
*
* \param key the key of
* \param threadid the id of thread that calls this function
*/
inline void Push(size_t key, ValueType value, int threadid = 0) {
SizeType &rp = thread_rptr[threadid][key];
data[rp++] = value;
}
private:
/*! \brief pointer to the beginning and end of each continuous key */
std::vector<SizeType> &rptr;
/*! \brief index of nonzero entries in each row */
std::vector<ValueType> &data;
/*! \brief thread local data structure */
std::vector< std::vector<SizeType> > &thread_rptr;
/*! \brief local temp thread ptr, use this if not specified by the constructor */
std::vector< std::vector<SizeType> > tmp_thread_rptr;
};
} // namespace utils
} // namespace xgboost
#endif

View File

@ -88,12 +88,98 @@ class IStream {
}
};
/*! \brief implementation of file i/o stream */
class FileStream : public IStream {
private:
std::FILE *fp;
/*! \brief interface of i/o stream that support seek */
class ISeekStream: public IStream {
public:
explicit FileStream(std::FILE *fp) : fp(fp) {
/*! \brief seek to certain position of the file */
virtual void Seek(size_t pos) = 0;
/*! \brief tell the position of the stream */
virtual size_t Tell(void) = 0;
};
/*! \brief fixed size memory buffer */
struct MemoryFixSizeBuffer : public ISeekStream {
public:
MemoryFixSizeBuffer(void *p_buffer, size_t buffer_size)
: p_buffer_(reinterpret_cast<char*>(p_buffer)), buffer_size_(buffer_size) {
curr_ptr_ = 0;
}
virtual ~MemoryFixSizeBuffer(void) {}
virtual size_t Read(void *ptr, size_t size) {
utils::Assert(curr_ptr_ + size <= buffer_size_,
"read can not have position excceed buffer length");
size_t nread = std::min(buffer_size_ - curr_ptr_, size);
if (nread != 0) memcpy(ptr, p_buffer_ + curr_ptr_, nread);
curr_ptr_ += nread;
return nread;
}
virtual void Write(const void *ptr, size_t size) {
if (size == 0) return;
utils::Assert(curr_ptr_ + size <= buffer_size_,
"write position exceed fixed buffer size");
memcpy(p_buffer_ + curr_ptr_, ptr, size);
curr_ptr_ += size;
}
virtual void Seek(size_t pos) {
curr_ptr_ = static_cast<size_t>(pos);
}
virtual size_t Tell(void) {
return curr_ptr_;
}
private:
/*! \brief in memory buffer */
char *p_buffer_;
/*! \brief current pointer */
size_t buffer_size_;
/*! \brief current pointer */
size_t curr_ptr_;
}; // class MemoryFixSizeBuffer
/*! \brief a in memory buffer that can be read and write as stream interface */
struct MemoryBufferStream : public ISeekStream {
public:
MemoryBufferStream(std::string *p_buffer)
: p_buffer_(p_buffer) {
curr_ptr_ = 0;
}
virtual ~MemoryBufferStream(void) {}
virtual size_t Read(void *ptr, size_t size) {
utils::Assert(curr_ptr_ <= p_buffer_->length(),
"read can not have position excceed buffer length");
size_t nread = std::min(p_buffer_->length() - curr_ptr_, size);
if (nread != 0) memcpy(ptr, &(*p_buffer_)[0] + curr_ptr_, nread);
curr_ptr_ += nread;
return nread;
}
virtual void Write(const void *ptr, size_t size) {
if (size == 0) return;
if (curr_ptr_ + size > p_buffer_->length()) {
p_buffer_->resize(curr_ptr_+size);
}
memcpy(&(*p_buffer_)[0] + curr_ptr_, ptr, size);
curr_ptr_ += size;
}
virtual void Seek(size_t pos) {
curr_ptr_ = static_cast<size_t>(pos);
}
virtual size_t Tell(void) {
return curr_ptr_;
}
private:
/*! \brief in memory buffer */
std::string *p_buffer_;
/*! \brief current pointer */
size_t curr_ptr_;
}; // class MemoryBufferStream
/*! \brief implementation of file i/o stream */
class FileStream : public ISeekStream {
public:
explicit FileStream(FILE *fp) : fp(fp) {}
explicit FileStream(void) {
this->fp = NULL;
}
virtual size_t Read(void *ptr, size_t size) {
return std::fread(ptr, size, 1, fp);
@ -101,14 +187,21 @@ class FileStream : public IStream {
virtual void Write(const void *ptr, size_t size) {
std::fwrite(ptr, size, 1, fp);
}
inline void Seek(size_t pos) {
std::fseek(fp, 0, SEEK_SET);
virtual void Seek(size_t pos) {
std::fseek(fp, static_cast<long>(pos), SEEK_SET);
}
virtual size_t Tell(void) {
return std::ftell(fp);
}
inline void Close(void) {
std::fclose(fp);
if (fp != NULL){
std::fclose(fp); fp = NULL;
}
}
};
private:
FILE *fp;
};
} // namespace utils
} // namespace xgboost
#endif

View File

@ -6,8 +6,11 @@
* \author Tianqi Chen
*/
#include <vector>
#include <utility>
#include <algorithm>
#include "./io.h"
#include "./utils.h"
#include "./omp.h"
namespace xgboost {
namespace utils {
@ -118,6 +121,140 @@ struct SparseCSRMBuilder {
}
};
/*!
* \brief a class used to help construct CSR format matrix file
* \tparam IndexType type of index used to store the index position
* \tparam SizeType type of size used in row pointer
*/
template<typename IndexType, typename SizeType = size_t>
struct SparseCSRFileBuilder {
public:
explicit SparseCSRFileBuilder(utils::ISeekStream *fo, size_t buffer_size)
: fo(fo), buffer_size(buffer_size) {
}
/*!
* \brief step 1: initialize the number of rows in the data, not necessary exact
* \nrows number of rows in the matrix, can be smaller than expected
*/
inline void InitBudget(size_t nrows = 0) {
rptr.clear();
rptr.resize(nrows + 1, 0);
}
/*!
* \brief step 2: add budget to each rows
* \param row_id the id of the row
* \param nelem number of element budget add to this row
*/
inline void AddBudget(size_t row_id, SizeType nelem = 1) {
if (rptr.size() < row_id + 2) {
rptr.resize(row_id + 2, 0);
}
rptr[row_id + 1] += nelem;
}
/*! \brief step 3: initialize the necessary storage */
inline void InitStorage(void) {
SizeType nelem = 0;
for (size_t i = 1; i < rptr.size(); i++) {
nelem += rptr[i];
rptr[i] = nelem;
}
begin_data = static_cast<SizeType>(fo->Tell()) + sizeof(SizeType);
SizeType begin_meta = begin_data + nelem * sizeof(IndexType);
fo->Write(&begin_meta, sizeof(begin_meta));
fo->Seek(begin_meta);
fo->Write(rptr);
// setup buffer space
buffer_rptr.resize(rptr.size());
buffer_temp.reserve(buffer_size);
buffer_data.resize(buffer_size);
saved_offset = rptr;
saved_offset.resize(rptr.size() - 1);
this->ClearBuffer();
}
/*! \brief step 4: push element into buffer */
inline void PushElem(SizeType row_id, IndexType col_id) {
if (buffer_temp.size() == buffer_size) {
this->WriteBuffer();
this->ClearBuffer();
}
buffer_rptr[row_id + 1] += 1;
buffer_temp.push_back(std::make_pair(row_id, col_id));
}
/*! \brief finalize the construction */
inline void Finalize(void) {
this->WriteBuffer();
for (size_t i = 0; i < saved_offset.size(); ++i) {
utils::Assert(saved_offset[i] == rptr[i+1], "some block not write out");
}
}
/*! \brief content must be in wb+ */
template<typename Comparator>
inline void SortRows(Comparator comp, size_t step) {
for (size_t i = 0; i < rptr.size() - 1; i += step) {
bst_omp_uint begin = static_cast<bst_omp_uint>(i);
bst_omp_uint end = static_cast<bst_omp_uint>(std::min(rptr.size() - 1, i + step));
if (rptr[end] != rptr[begin]) {
fo->Seek(begin_data + rptr[begin] * sizeof(IndexType));
buffer_data.resize(rptr[end] - rptr[begin]);
fo->Read(BeginPtr(buffer_data), (rptr[end] - rptr[begin]) * sizeof(IndexType));
// do parallel sorting
#pragma omp parallel for schedule(static)
for (bst_omp_uint j = begin; j < end; ++j) {
std::sort(&buffer_data[0] + rptr[j] - rptr[begin],
&buffer_data[0] + rptr[j+1] - rptr[begin],
comp);
}
fo->Seek(begin_data + rptr[begin] * sizeof(IndexType));
fo->Write(BeginPtr(buffer_data), (rptr[end] - rptr[begin]) * sizeof(IndexType));
}
}
}
protected:
inline void WriteBuffer(void) {
SizeType start = 0;
for (size_t i = 1; i < buffer_rptr.size(); ++i) {
size_t rlen = buffer_rptr[i];
buffer_rptr[i] = start;
start += rlen;
}
for (size_t i = 0; i < buffer_temp.size(); ++i) {
SizeType &rp = buffer_rptr[buffer_temp[i].first + 1];
buffer_data[rp++] = buffer_temp[i].second;
}
// write out
for (size_t i = 0; i < buffer_rptr.size() - 1; ++i) {
size_t nelem = buffer_rptr[i+1] - buffer_rptr[i];
if (nelem != 0) {
utils::Assert(saved_offset[i] + nelem <= rptr[i+1], "data exceed bound");
fo->Seek(saved_offset[i] * sizeof(IndexType) + begin_data);
fo->Write(&buffer_data[0] + buffer_rptr[i], nelem * sizeof(IndexType));
saved_offset[i] += nelem;
}
}
}
inline void ClearBuffer(void) {
buffer_temp.clear();
std::fill(buffer_rptr.begin(), buffer_rptr.end(), 0);
}
private:
/*! \brief output file pointer the data */
utils::ISeekStream *fo;
/*! \brief pointer to each of the row */
std::vector<SizeType> rptr;
/*! \brief saved top space of each item */
std::vector<SizeType> saved_offset;
/*! \brief beginning position of data */
size_t begin_data;
// ----- the following are buffer space
/*! \brief maximum size of content buffer*/
size_t buffer_size;
/*! \brief store the data content */
std::vector< std::pair<SizeType, IndexType> > buffer_temp;
/*! \brief saved top space of each item */
std::vector<SizeType> buffer_rptr;
/*! \brief saved top space of each item */
std::vector<IndexType> buffer_data;
};
} // namespace utils
} // namespace xgboost
#endif

747
src/utils/quantile.h Normal file
View File

@ -0,0 +1,747 @@
#ifndef XGBOOST_UTILS_QUANTILE_H_
#define XGBOOST_UTILS_QUANTILE_H_
/*!
* \file quantile.h
* \brief util to compute quantiles
* \author Tianqi Chen
*/
#include <cmath>
#include <vector>
#include <cstring>
#include <algorithm>
#include <iostream>
#include "./io.h"
#include "./utils.h"
namespace xgboost {
namespace utils {
/*!
* \brief experimental wsummary
* \tparam DType type of data content
* \tparam RType type of rank
*/
template<typename DType, typename RType>
struct WQSummary {
/*! \brief an entry in the sketch summary */
struct Entry {
/*! \brief minimum rank */
RType rmin;
/*! \brief maximum rank */
RType rmax;
/*! \brief maximum weight */
RType wmin;
/*! \brief the value of data */
DType value;
// constructor
Entry(void) {}
// constructor
Entry(RType rmin, RType rmax, RType wmin, DType value)
: rmin(rmin), rmax(rmax), wmin(wmin), value(value) {}
/*!
* \brief debug function, check Valid
* \param eps the tolerate level for violating the relation
*/
inline void CheckValid(RType eps = 0) const {
utils::Assert(rmin >= 0 && rmax >= 0 && wmin >= 0, "nonneg constraint");
utils::Assert(rmax- rmin - wmin > -eps, "relation constraint: min/max");
}
/*! \return rmin estimation for v strictly bigger than value */
inline RType rmin_next(void) const {
return rmin + wmin;
}
/*! \return rmax estimation for v strictly smaller than value */
inline RType rmax_prev(void) const {
return rmax - wmin;
}
};
/*! \brief input data queue before entering the summary */
struct Queue {
// entry in the queue
struct QEntry {
// value of the instance
DType value;
// weight of instance
RType weight;
// default constructor
QEntry(void) {}
// constructor
QEntry(DType value, RType weight)
: value(value), weight(weight) {}
// comparator on value
inline bool operator<(const QEntry &b) const {
return value < b.value;
}
};
// the input queue
std::vector<QEntry> queue;
// end of the queue
size_t qtail;
// push data to the queue
inline void Push(DType x, RType w) {
if (qtail == 0 || queue[qtail - 1].value != x) {
queue[qtail++] = QEntry(x, w);
} else {
queue[qtail - 1].weight += w;
}
}
inline void MakeSummary(WQSummary *out) {
std::sort(queue.begin(), queue.begin() + qtail);
out->size = 0;
// start update sketch
RType wsum = 0;
// construct data with unique weights
for (size_t i = 0; i < qtail;) {
size_t j = i + 1;
RType w = queue[i].weight;
while (j < qtail && queue[j].value == queue[i].value) {
w += queue[j].weight; ++j;
}
out->data[out->size++] = Entry(wsum, wsum + w, w, queue[i].value);
wsum += w; i = j;
}
}
};
/*! \brief data field */
Entry *data;
/*! \brief number of elements in the summary */
size_t size;
// constructor
WQSummary(Entry *data, size_t size)
: data(data), size(size) {}
/*!
* \return the maximum error of the Summary
*/
inline RType MaxError(void) const {
RType res = data[0].rmax - data[0].rmin - data[0].wmin;
for (size_t i = 1; i < size; ++i) {
res = std::max(data[i].rmax_prev() - data[i - 1].rmin_next(), res);
res = std::max(data[i].rmax - data[i].rmin - data[i].wmin, res);
}
return res;
}
/*!
* \brief query qvalue, start from istart
* \param qvalue the value we query for
* \param istart starting position
*/
inline Entry Query(DType qvalue, size_t &istart) const {
while (istart < size && qvalue > data[istart].value) {
++istart;
}
if (istart == size) {
RType rmax = data[size - 1].rmax;
return Entry(rmax, rmax, 0.0f, qvalue);
}
if (qvalue == data[istart].value) {
return data[istart];
} else {
if (istart == 0) {
return Entry(0.0f, 0.0f, 0.0f, qvalue);
} else {
return Entry(data[istart - 1].rmin_next(),
data[istart].rmax_prev(),
0.0f, qvalue);
}
}
}
/*! \return maximum rank in the summary */
inline RType MaxRank(void) const {
return data[size - 1].rmax;
}
/*!
* \brief copy content from src
* \param src source sketch
*/
inline void CopyFrom(const WQSummary &src) {
size = src.size;
std::memcpy(data, src.data, sizeof(Entry) * size);
}
/*!
* \brief debug function, validate whether the summary
* run consistency check to check if it is a valid summary
* \param eps the tolerate error level, used when RType is floating point and
* some inconsistency could occur due to rounding error
*/
inline void CheckValid(RType eps) const {
for (size_t i = 0; i < size; ++i) {
data[i].CheckValid(eps);
if (i != 0) {
utils::Assert(data[i].rmin >= data[i - 1].rmin + data[i - 1].wmin, "rmin range constraint");
utils::Assert(data[i].rmax >= data[i - 1].rmax + data[i].wmin, "rmax range constraint");
}
}
}
/*! \brief used for debug purpose, print the summary */
inline void Print(void) const {
for (size_t i = 0; i < size; ++i) {
std::cout << "x=" << data[i].value << "\t"
<< "[" << data[i].rmin << "," << data[i].rmax << "]"
<< " wmin=" << data[i].wmin << std::endl;
}
}
/*!
* \brief set current summary to be pruned summary of src
* assume data field is already allocated to be at least maxsize
* \param src source summary
* \param maxsize size we can afford in the pruned sketch
*/
inline void SetPrune(const WQSummary &src, size_t maxsize) {
if (src.size <= maxsize) {
this->CopyFrom(src); return;
}
const RType begin = src.data[0].rmax;
const RType range = src.data[src.size - 1].rmin - src.data[0].rmax;
const size_t n = maxsize - 1;
data[0] = src.data[0];
this->size = 1;
// lastidx is used to avoid duplicated records
size_t i = 1, lastidx = 0;
for (size_t k = 1; k < n; ++k) {
RType dx2 = 2 * ((k * range) / n + begin);
// find first i such that d < (rmax[i+1] + rmin[i+1]) / 2
while (i < src.size - 1
&& dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
utils::Assert(i != src.size - 1, "this cannot happen");
if (dx2 < src.data[i].rmin_next() + src.data[i + 1].rmax_prev()) {
if (i != lastidx) {
data[size++] = src.data[i]; lastidx = i;
}
} else {
if (i + 1 != lastidx) {
data[size++] = src.data[i + 1]; lastidx = i + 1;
}
}
}
if (lastidx != src.size - 1) {
data[size++] = src.data[src.size - 1];
}
}
/*!
* \brief set current summary to be merged summary of sa and sb
* \param sa first input summary to be merged
* \param sb second input summar to be merged
*/
inline void SetCombine(const WQSummary &sa,
const WQSummary &sb) {
if (sa.size == 0) {
this->CopyFrom(sb); return;
}
if (sb.size == 0) {
this->CopyFrom(sa); return;
}
utils::Assert(sa.size > 0 && sb.size > 0, "invalid input for merge");
const Entry *a = sa.data, *a_end = sa.data + sa.size;
const Entry *b = sb.data, *b_end = sb.data + sb.size;
// extended rmin value
RType aprev_rmin = 0, bprev_rmin = 0;
Entry *dst = this->data;
while (a != a_end && b != b_end) {
// duplicated value entry
if (a->value == b->value) {
*dst = Entry(a->rmin + b->rmin,
a->rmax + b->rmax,
a->wmin + b->wmin, a->value);
aprev_rmin = a->rmin_next();
bprev_rmin = b->rmin_next();
++dst; ++a; ++b;
} else if (a->value < b->value) {
*dst = Entry(a->rmin + bprev_rmin,
a->rmax + b->rmax_prev(),
a->wmin, a->value);
aprev_rmin = a->rmin_next();
++dst; ++a;
} else {
*dst = Entry(b->rmin + aprev_rmin,
b->rmax + a->rmax_prev(),
b->wmin, b->value);
bprev_rmin = b->rmin_next();
++dst; ++b;
}
}
if (a != a_end) {
RType brmax = (b_end - 1)->rmax;
do {
*dst = Entry(a->rmin + bprev_rmin, a->rmax + brmax, a->wmin, a->value);
++dst; ++a;
} while (a != a_end);
}
if (b != b_end) {
RType armax = (a_end - 1)->rmax;
do {
*dst = Entry(b->rmin + aprev_rmin, b->rmax + armax, b->wmin, b->value);
++dst; ++b;
} while (b != b_end);
}
this->size = dst - data;
utils::Assert(size <= sa.size + sb.size, "bug in combine");
}
};
/*! \brief try to do efficient prunning */
template<typename DType, typename RType>
struct WXQSummary : public WQSummary<DType, RType> {
// redefine entry type
typedef typename WQSummary<DType, RType>::Entry Entry;
// constructor
WXQSummary(Entry *data, size_t size)
: WQSummary<DType, RType>(data, size) {}
// check if the block is large chunk
inline static bool CheckLarge(const Entry &e, RType chunk) {
return e.rmin_next() > e.rmax_prev() + chunk;
}
// set prune
inline void SetPrune(const WQSummary<DType, RType> &src, size_t maxsize) {
if (src.size <= maxsize) {
this->CopyFrom(src); return;
}
RType begin = src.data[0].rmax;
size_t n = maxsize - 1, nbig = 0;
const RType range = src.data[src.size - 1].rmin - begin;
const RType chunk = 2 * range / n;
// minimized range
RType mrange = 0;
{
// first scan, grab all the big chunk
// moviing block index
size_t bid = 0;
for (size_t i = 1; i < src.size; ++i) {
if (CheckLarge(src.data[i], chunk)) {
if (bid != i - 1) {
mrange += src.data[i].rmax_prev() - src.data[bid].rmin_next();
}
bid = i; ++nbig;
}
}
if (bid != src.size - 2) {
mrange += src.data[src.size-1].rmax_prev() - src.data[bid].rmin_next();
}
}
utils::Assert(nbig < n - 1, "too many large chunk");
this->data[0] = src.data[0];
this->size = 1;
// use smaller size
n = n - nbig;
// find the rest of point
size_t bid = 0, k = 1, lastidx = 0;
for (size_t end = 1; end < src.size; ++end) {
if (end == src.size - 1 || CheckLarge(src.data[end], chunk)) {
if (bid != end - 1) {
size_t i = bid;
RType maxdx2 = src.data[end].rmax_prev() * 2;
for (; k < n; ++k) {
RType dx2 = 2 * ((k * mrange) / n + begin);
if (dx2 >= maxdx2) break;
while (i < end &&
dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
if (dx2 < src.data[i].rmin_next() + src.data[i + 1].rmax_prev()) {
if (i != lastidx) {
this->data[this->size++] = src.data[i]; lastidx = i;
}
} else {
if (i + 1 != lastidx) {
this->data[this->size++] = src.data[i + 1]; lastidx = i + 1;
}
}
}
}
if (lastidx != end) {
this->data[this->size++] = src.data[end];
lastidx = end;
}
bid = end;
// shift base by the gap
begin += src.data[bid].rmin_next() - src.data[bid].rmax_prev();
}
}
}
};
/*!
* \brief traditional GK summary
*/
template<typename DType, typename RType>
struct GKSummary {
/*! \brief an entry in the sketch summary */
struct Entry {
/*! \brief minimum rank */
RType rmin;
/*! \brief maximum rank */
RType rmax;
/*! \brief the value of data */
DType value;
// constructor
Entry(void) {}
// constructor
Entry(RType rmin, RType rmax, DType value)
: rmin(rmin), rmax(rmax), value(value) {}
};
/*! \brief input data queue before entering the summary */
struct Queue {
// the input queue
std::vector<DType> queue;
// end of the queue
size_t qtail;
// push data to the queue
inline void Push(DType x, RType w) {
queue[qtail++] = x;
}
inline void MakeSummary(GKSummary *out) {
std::sort(queue.begin(), queue.begin() + qtail);
out->size = qtail;
for (size_t i = 0; i < qtail; ++i) {
out->data[i] = Entry(i + 1, i + 1, queue[i]);
}
}
};
/*! \brief data field */
Entry *data;
/*! \brief number of elements in the summary */
size_t size;
GKSummary(Entry *data, size_t size)
: data(data), size(size) {}
/*! \brief the maximum error of the summary */
inline RType MaxError(void) const {
RType res = 0;
for (size_t i = 1; i < size; ++i) {
res = std::max(data[i].rmax - data[i-1].rmin, res);
}
return res;
}
/*! \return maximum rank in the summary */
inline RType MaxRank(void) const {
return data[size - 1].rmax;
}
/*!
* \brief copy content from src
* \param src source sketch
*/
inline void CopyFrom(const GKSummary &src) {
size = src.size;
std::memcpy(data, src.data, sizeof(Entry) * size);
}
inline void CheckValid(RType eps) const {
// assume always valid
}
/*! \brief used for debug purpose, print the summary */
inline void Print(void) const {
for (size_t i = 0; i < size; ++i) {
std::cout << "x=" << data[i].value << "\t"
<< "[" << data[i].rmin << "," << data[i].rmax << "]"
<< std::endl;
}
}
/*!
* \brief set current summary to be pruned summary of src
* assume data field is already allocated to be at least maxsize
* \param src source summary
* \param maxsize size we can afford in the pruned sketch
*/
inline void SetPrune(const GKSummary &src, size_t maxsize) {
if (src.size <= maxsize) {
this->CopyFrom(src); return;
}
const RType max_rank = src.MaxRank();
this->size = maxsize;
data[0] = src.data[0];
size_t n = maxsize - 1;
RType top = 1;
for (size_t i = 1; i < n; ++i) {
RType k = (i * max_rank) / n;
while (k > src.data[top + 1].rmax) ++top;
// assert src.data[top].rmin <= k
// because k > src.data[top].rmax >= src.data[top].rmin
if ((k - src.data[top].rmin) < (src.data[top+1].rmax - k)) {
data[i] = src.data[top];
} else {
data[i] = src.data[top + 1];
}
}
data[n] = src.data[src.size - 1];
}
inline void SetCombine(const GKSummary &sa,
const GKSummary &sb) {
if (sa.size == 0) {
this->CopyFrom(sb); return;
}
if (sb.size == 0) {
this->CopyFrom(sa); return;
}
utils::Assert(sa.size > 0 && sb.size > 0, "invalid input for merge");
const Entry *a = sa.data, *a_end = sa.data + sa.size;
const Entry *b = sb.data, *b_end = sb.data + sb.size;
this->size = sa.size + sb.size;
RType aprev_rmin = 0, bprev_rmin = 0;
Entry *dst = this->data;
while (a != a_end && b != b_end) {
if (a->value < b->value) {
*dst = Entry(bprev_rmin + a->rmin,
a->rmax + b->rmax - 1, a->value);
aprev_rmin = a->rmin;
++dst; ++a;
} else {
*dst = Entry(aprev_rmin + b->rmin,
b->rmax + a->rmax - 1, b->value);
bprev_rmin = b->rmin;
++dst; ++b;
}
}
if (a != a_end) {
RType bprev_rmax = (b_end - 1)->rmax;
do {
*dst = Entry(bprev_rmin + a->rmin, bprev_rmax + a->rmax, a->value);
++dst; ++a;
} while (a != a_end);
}
if (b != b_end) {
RType aprev_rmax = (a_end - 1)->rmax;
do {
*dst = Entry(aprev_rmin + b->rmin, aprev_rmax + b->rmax, b->value);
++dst; ++b;
} while (b != b_end);
}
utils::Assert(dst == data + size, "bug in combine");
}
};
/*!
* \brief template for all quantle sketch algorithm
* that uses merge/prune scheme
* \tparam DType type of data content
* \tparam RType type of rank
* \tparam TSummary actual summary data structure it uses
*/
template<typename DType, typename RType, class TSummary>
class QuantileSketchTemplate {
public:
/*! \brief type of summary type */
typedef TSummary Summary;
/*! \brief the entry type */
typedef typename Summary::Entry Entry;
/*! \brief same as summary, but use STL to backup the space */
struct SummaryContainer : public Summary {
std::vector<Entry> space;
SummaryContainer(const SummaryContainer &src) : Summary(NULL, src.size) {
this->space = src.space;
this->data = BeginPtr(this->space);
}
SummaryContainer(void) : Summary(NULL, 0) {
}
/*! \brief reserve space for summary */
inline void Reserve(size_t size) {
if (size > space.size()) {
space.resize(size);
this->data = BeginPtr(space);
}
}
/*!
* \brief set the space to be merge of all Summary arrays
* \param begin begining position in th summary array
* \param end ending position in the Summary array
*/
inline void SetMerge(const Summary *begin,
const Summary *end) {
utils::Assert(begin < end, "can not set combine to empty instance");
size_t len = end - begin;
if (len == 1) {
this->Reserve(begin[0].size);
this->CopyFrom(begin[0]);
} else if (len == 2) {
this->Reserve(begin[0].size + begin[1].size);
this->SetMerge(begin[0], begin[1]);
} else {
// recursive merge
SummaryContainer lhs, rhs;
lhs.SetCombine(begin, begin + len / 2);
rhs.SetCombine(begin + len / 2, end);
this->Reserve(lhs.size + rhs.size);
this->SetCombine(lhs, rhs);
}
}
/*!
* \brief do elementwise combination of summary array
* this[i] = combine(this[i], src[i]) for each i
* \param src the source summary
* \param max_nbyte, maximum number of byte allowed in here
*/
inline void Reduce(const Summary &src, size_t max_nbyte) {
this->Reserve((max_nbyte - sizeof(this->size)) / sizeof(Entry));
SummaryContainer temp;
temp.Reserve(this->size + src.size);
temp.SetCombine(*this, src);
this->SetPrune(temp, space.size());
}
/*! \brief return the number of bytes this data structure cost in serialization */
inline static size_t CalcMemCost(size_t nentry) {
return sizeof(size_t) + sizeof(Entry) * nentry;
}
/*! \brief save the data structure into stream */
template<typename TStream>
inline void Save(TStream &fo) const {
fo.Write(&(this->size), sizeof(this->size));
if (this->size != 0) {
fo.Write(this->data, this->size * sizeof(Entry));
}
}
/*! \brief load data structure from input stream */
template<typename TStream>
inline void Load(TStream &fi) {
utils::Check(fi.Read(&this->size, sizeof(this->size)) != 0, "invalid SummaryArray 1");
this->Reserve(this->size);
if (this->size != 0) {
utils::Check(fi.Read(this->data, this->size * sizeof(Entry)) != 0, "invalid SummaryArray 2");
}
}
};
/*!
* \brief intialize the quantile sketch, given the performance specification
* \param maxn maximum number of data points can be feed into sketch
* \param eps accuracy level of summary
*/
inline void Init(size_t maxn, double eps) {
nlevel = 1;
while (true) {
limit_size = static_cast<size_t>(ceil(nlevel / eps)) + 1;
size_t n = (1UL << nlevel);
if (n * limit_size >= maxn) break;
++nlevel;
}
// check invariant
size_t n = (1UL << nlevel);
utils::Assert(n * limit_size >= maxn, "invalid init parameter");
utils::Assert(nlevel <= limit_size * eps, "invalid init parameter");
// lazy reserve the space, if there is only one value, no need to allocate space
inqueue.queue.resize(1);
inqueue.qtail = 0;
data.clear();
level.clear();
}
/*!
* \brief add an element to a sketch
* \param x the elemented added to the sketch
*/
inline void Push(DType x, RType w = 1) {
if (inqueue.qtail == inqueue.queue.size()) {
// jump from lazy one value to limit_size * 2
if (inqueue.queue.size() == 1) {
inqueue.queue.resize(limit_size * 2);
} else {
temp.Reserve(limit_size * 2);
inqueue.MakeSummary(&temp);
// cleanup queue
inqueue.qtail = 0;
this->PushTemp();
}
}
inqueue.Push(x, w);
}
/*! \brief push up temp */
inline void PushTemp(void) {
temp.Reserve(limit_size * 2);
for (size_t l = 1; true; ++l) {
this->InitLevel(l + 1);
// check if level l is empty
if (level[l].size == 0) {
level[l].SetPrune(temp, limit_size);
break;
} else {
// level 0 is actually temp space
level[0].SetPrune(temp, limit_size);
temp.SetCombine(level[0], level[l]);
if (temp.size > limit_size) {
// try next level
level[l].size = 0;
} else {
// if merged record is still smaller, no need to send to next level
level[l].CopyFrom(temp); break;
}
}
}
}
/*! \brief get the summary after finalize */
inline void GetSummary(SummaryContainer *out) {
if (level.size() != 0) {
out->Reserve(limit_size * 2);
} else {
out->Reserve(inqueue.queue.size());
}
inqueue.MakeSummary(out);
if (level.size() != 0) {
level[0].SetPrune(*out, limit_size);
for (size_t l = 1; l < level.size(); ++l) {
if (level[l].size == 0) continue;
if (level[0].size == 0) {
level[0].CopyFrom(level[l]);
} else {
out->SetCombine(level[0], level[l]);
level[0].SetPrune(*out, limit_size);
}
}
out->CopyFrom(level[0]);
} else {
if (out->size > limit_size) {
temp.Reserve(limit_size);
temp.SetPrune(*out, limit_size);
out->CopyFrom(temp);
}
}
}
// used for debug, check if the sketch is valid
inline void CheckValid(RType eps) const {
for (size_t l = 1; l < level.size(); ++l) {
level[l].CheckValid(eps);
}
}
// initialize level space to at least nlevel
inline void InitLevel(size_t nlevel) {
if (level.size() >= nlevel) return;
data.resize(limit_size * nlevel);
level.resize(nlevel, Summary(NULL, 0));
for (size_t l = 0; l < level.size(); ++l) {
level[l].data = BeginPtr(data) + l * limit_size;
}
}
// input data queue
typename Summary::Queue inqueue;
// number of levels
size_t nlevel;
// size of summary in each level
size_t limit_size;
// the level of each summaries
std::vector<Summary> level;
// content of the summary
std::vector<Entry> data;
// temporal summary, used for temp-merge
SummaryContainer temp;
};
/*!
* \brief Quantile sketch use WQSummary
* \tparam DType type of data content
* \tparam RType type of rank
*/
template<typename DType, typename RType=unsigned>
class WQuantileSketch :
public QuantileSketchTemplate<DType, RType, WQSummary<DType, RType> >{
};
/*!
* \brief Quantile sketch use WXQSummary
* \tparam DType type of data content
* \tparam RType type of rank
*/
template<typename DType, typename RType=unsigned>
class WXQuantileSketch :
public QuantileSketchTemplate<DType, RType, WXQSummary<DType, RType> >{
};
/*!
* \brief Quantile sketch use WQSummary
* \tparam DType type of data content
* \tparam RType type of rank
*/
template<typename DType, typename RType=unsigned>
class GKQuantileSketch :
public QuantileSketchTemplate<DType, RType, GKSummary<DType, RType> >{
};
} // utils
} // xgboost
#endif

146
src/utils/thread.h Normal file
View File

@ -0,0 +1,146 @@
#ifndef XGBOOST_UTILS_THREAD_H
#define XGBOOST_UTILS_THREAD_H
/*!
* \file thread.h
* \brief this header include the minimum necessary resource for multi-threading
* \author Tianqi Chen
* Acknowledgement: this file is adapted from SVDFeature project, by same author.
* The MAC support part of this code is provided by Artemy Kolchinsky
*/
#ifdef _MSC_VER
#include "utils.h"
#include <windows.h>
#include <process.h>
namespace xgboost {
namespace utils {
/*! \brief simple semaphore used for synchronization */
class Semaphore {
public :
inline void Init(int init_val) {
sem = CreateSemaphore(NULL, init_val, 10, NULL);
utils::Assert(sem != NULL, "create Semaphore error");
}
inline void Destroy(void) {
CloseHandle(sem);
}
inline void Wait(void) {
utils::Assert(WaitForSingleObject(sem, INFINITE) == WAIT_OBJECT_0, "WaitForSingleObject error");
}
inline void Post(void) {
utils::Assert(ReleaseSemaphore(sem, 1, NULL) != 0, "ReleaseSemaphore error");
}
private:
HANDLE sem;
};
/*! \brief simple thread that wraps windows thread */
class Thread {
private:
HANDLE thread_handle;
unsigned thread_id;
public:
inline void Start(unsigned int __stdcall entry(void*), void *param) {
thread_handle = (HANDLE)_beginthreadex(NULL, 0, entry, param, 0, &thread_id);
}
inline int Join(void) {
WaitForSingleObject(thread_handle, INFINITE);
return 0;
}
};
/*! \brief exit function called from thread */
inline void ThreadExit(void *status) {
_endthreadex(0);
}
#define XGBOOST_THREAD_PREFIX unsigned int __stdcall
} // namespace utils
} // namespace xgboost
#else
// thread interface using g++
#include <semaphore.h>
#include <pthread.h>
namespace xgboost {
namespace utils {
/*!\brief semaphore class */
class Semaphore {
#ifdef __APPLE__
private:
sem_t* semPtr;
char sema_name[20];
private:
inline void GenRandomString(char *s, const int len) {
static const char alphanum[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" ;
for (int i = 0; i < len; ++i) {
s[i] = alphanum[rand() % (sizeof(alphanum) - 1)];
}
s[len] = 0;
}
public:
inline void Init(int init_val) {
sema_name[0]='/';
sema_name[1]='s';
sema_name[2]='e';
sema_name[3]='/';
GenRandomString(&sema_name[4], 16);
if((semPtr = sem_open(sema_name, O_CREAT, 0644, init_val)) == SEM_FAILED) {
perror("sem_open");
exit(1);
}
utils::Assert(semPtr != NULL, "create Semaphore error");
}
inline void Destroy(void) {
if (sem_close(semPtr) == -1) {
perror("sem_close");
exit(EXIT_FAILURE);
}
if (sem_unlink(sema_name) == -1) {
perror("sem_unlink");
exit(EXIT_FAILURE);
}
}
inline void Wait(void) {
sem_wait(semPtr);
}
inline void Post(void) {
sem_post(semPtr);
}
#else
private:
sem_t sem;
public:
inline void Init(int init_val) {
sem_init(&sem, 0, init_val);
}
inline void Destroy(void) {
sem_destroy(&sem);
}
inline void Wait(void) {
sem_wait(&sem);
}
inline void Post(void) {
sem_post(&sem);
}
#endif
};
/*!\brief simple thread class */
class Thread {
private:
pthread_t thread;
public :
inline void Start(void * entry(void*), void *param) {
pthread_attr_t attr;
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
pthread_create(&thread, &attr, entry, param);
}
inline int Join(void) {
void *status;
return pthread_join(thread, &status);
}
};
inline void ThreadExit(void *status) {
pthread_exit(status);
}
} // namespace utils
} // namespace xgboost
#define XGBOOST_THREAD_PREFIX void *
#endif
#endif

203
src/utils/thread_buffer.h Normal file
View File

@ -0,0 +1,203 @@
#ifndef XGBOOST_UTILS_THREAD_BUFFER_H_
#define XGBOOST_UTILS_THREAD_BUFFER_H_
/*!
* \file thread_buffer.h
* \brief multi-thread buffer, iterator, can be used to create parallel pipeline
* \author Tianqi Chen
*/
#include <vector>
#include <cstring>
#include <cstdlib>
#include "./utils.h"
#include "./thread.h"
namespace xgboost {
namespace utils {
/*!
* \brief buffered loading iterator that uses multithread
* this template method will assume the following paramters
* \tparam Elem elememt type to be buffered
* \tparam ElemFactory factory type to implement in order to use thread buffer
*/
template<typename Elem, typename ElemFactory>
class ThreadBuffer {
public:
/*!\brief constructor */
ThreadBuffer(void) {
this->init_end = false;
this->buf_size = 30;
}
~ThreadBuffer(void) {
if(init_end) this->Destroy();
}
/*!\brief set parameter, will also pass the parameter to factory */
inline void SetParam(const char *name, const char *val) {
if (!strcmp( name, "buffer_size")) buf_size = atoi(val);
factory.SetParam(name, val);
}
/*!
* \brief initalize the buffered iterator
* \param param a initialize parameter that will pass to factory, ignore it if not necessary
* \return false if the initlization can't be done, e.g. buffer file hasn't been created
*/
inline bool Init(void) {
if (!factory.Init()) return false;
for (int i = 0; i < buf_size; ++i) {
bufA.push_back(factory.Create());
bufB.push_back(factory.Create());
}
this->init_end = true;
this->StartLoader();
return true;
}
/*!\brief place the iterator before first value */
inline void BeforeFirst(void) {
// wait till last loader end
loading_end.Wait();
// critcal zone
current_buf = 1;
factory.BeforeFirst();
// reset terminate limit
endA = endB = buf_size;
// wake up loader for first part
loading_need.Post();
// wait til first part is loaded
loading_end.Wait();
// set current buf to right value
current_buf = 0;
// wake loader for next part
data_loaded = false;
loading_need.Post();
// set buffer value
buf_index = 0;
}
/*! \brief destroy the buffer iterator, will deallocate the buffer */
inline void Destroy(void) {
// wait until the signal is consumed
this->destroy_signal = true;
loading_need.Post();
loader_thread.Join();
loading_need.Destroy();
loading_end.Destroy();
for (size_t i = 0; i < bufA.size(); ++i) {
factory.FreeSpace(bufA[i]);
}
for (size_t i = 0; i < bufB.size(); ++i) {
factory.FreeSpace(bufB[i]);
}
bufA.clear(); bufB.clear();
factory.Destroy();
this->init_end = false;
}
/*!
* \brief get the next element needed in buffer
* \param elem element to store into
* \return whether reaches end of data
*/
inline bool Next(Elem &elem) {
// end of buffer try to switch
if (buf_index == buf_size) {
this->SwitchBuffer();
buf_index = 0;
}
if (buf_index >= (current_buf ? endA : endB)) {
return false;
}
std::vector<Elem> &buf = current_buf ? bufA : bufB;
elem = buf[buf_index];
++buf_index;
return true;
}
/*!
* \brief get the factory object
*/
inline ElemFactory &get_factory(void) {
return factory;
}
inline const ElemFactory &get_factory(void) const{
return factory;
}
// size of buffer
int buf_size;
private:
// factory object used to load configures
ElemFactory factory;
// index in current buffer
int buf_index;
// indicate which one is current buffer
int current_buf;
// max limit of visit, also marks termination
int endA, endB;
// double buffer, one is accessed by loader
// the other is accessed by consumer
// buffer of the data
std::vector<Elem> bufA, bufB;
// initialization end
bool init_end;
// singal whether the data is loaded
bool data_loaded;
// signal to kill the thread
bool destroy_signal;
// thread object
Thread loader_thread;
// signal of the buffer
Semaphore loading_end, loading_need;
/*!
* \brief slave thread
* this implementation is like producer-consumer style
*/
inline void RunLoader(void) {
while(!destroy_signal) {
// sleep until loading is needed
loading_need.Wait();
std::vector<Elem> &buf = current_buf ? bufB : bufA;
int i;
for (i = 0; i < buf_size ; ++i) {
if (!factory.LoadNext(buf[i])) {
int &end = current_buf ? endB : endA;
end = i; // marks the termination
break;
}
}
// signal that loading is done
data_loaded = true;
loading_end.Post();
}
}
/*!\brief entry point of loader thread */
inline static XGBOOST_THREAD_PREFIX LoaderEntry(void *pthread) {
static_cast< ThreadBuffer<Elem,ElemFactory>* >(pthread)->RunLoader();
ThreadExit(NULL);
return NULL;
}
/*!\brief start loader thread */
inline void StartLoader(void) {
destroy_signal = false;
// set param
current_buf = 1;
loading_need.Init(1);
loading_end .Init(0);
// reset terminate limit
endA = endB = buf_size;
loader_thread.Start(LoaderEntry, this);
// wait until first part of data is loaded
loading_end.Wait();
// set current buf to right value
current_buf = 0;
// wake loader for next part
data_loaded = false;
loading_need.Post();
buf_index = 0;
}
/*!\brief switch double buffer */
inline void SwitchBuffer(void) {
loading_end.Wait();
// loader shall be sleep now, critcal zone!
current_buf = !current_buf;
// wake up loader
data_loaded = false;
loading_need.Post();
}
};
} // namespace utils
} // namespace xgboost
#endif

View File

@ -1,9 +1,10 @@
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#define NOMINMAX
#include <ctime>
#include <string>
#include <cstring>
#include "./sync/sync.h"
#include "io/io.h"
#include "utils/utils.h"
#include "utils/config.h"
@ -13,13 +14,13 @@ namespace xgboost {
/*!
* \brief wrapping the training process
*/
class BoostLearnTask{
class BoostLearnTask {
public:
inline int Run(int argc, char *argv[]) {
if (argc < 2) {
printf("Usage: <config>\n");
return 0;
}
}
utils::ConfigIterator itr(argv[1]);
while (itr.Next()) {
this->SetParam(itr.name(), itr.val());
@ -30,8 +31,36 @@ class BoostLearnTask{
this->SetParam(name, val);
}
}
// do not save anything when save to stdout
if (model_out == "stdout" || name_pred == "stdout") {
this->SetParam("silent", "1");
save_period = 0;
}
// whether need data rank
bool need_data_rank = strchr(train_path.c_str(), '%') != NULL;
// if need data rank in loading, initialize rabit engine before load data
// otherwise, initialize rabit engine after loading data
// lazy initialization of rabit engine can be helpful in speculative execution
if (need_data_rank) rabit::Init(argc, argv);
this->InitData();
this->InitLearner();
if (!need_data_rank) rabit::Init(argc, argv);
if (rabit::IsDistributed()) {
std::string pname = rabit::GetProcessorName();
fprintf(stderr, "start %s:%d\n", pname.c_str(), rabit::GetRank());
}
if (rabit::IsDistributed() && data_split == "NONE") {
this->SetParam("dsplit", "row");
}
if (rabit::GetRank() != 0) {
this->SetParam("silent", "2");
}
if (task == "train") {
// if task is training, will try recover from checkpoint
this->TaskTrain();
return 0;
} else {
this->InitLearner();
}
if (task == "dump") {
this->TaskDump(); return 0;
}
@ -40,8 +69,6 @@ class BoostLearnTask{
}
if (task == "pred") {
this->TaskPred();
} else {
this->TaskTrain();
}
return 0;
}
@ -62,6 +89,7 @@ class BoostLearnTask{
if (!strcmp("fmap", name)) name_fmap = val;
if (!strcmp("name_dump", name)) name_dump = val;
if (!strcmp("name_pred", name)) name_pred = val;
if (!strcmp("dsplit", name)) data_split = val;
if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val);
if (!strncmp("eval[", name, 5)) {
char evname[256];
@ -89,6 +117,8 @@ class BoostLearnTask{
name_pred = "pred.txt";
name_dump = "dump.txt";
model_dir_path = "./";
data_split = "NONE";
load_part = 0;
data = NULL;
}
~BoostLearnTask(void){
@ -99,13 +129,20 @@ class BoostLearnTask{
}
private:
inline void InitData(void) {
if (strchr(train_path.c_str(), '%') != NULL) {
char s_tmp[256];
utils::SPrintf(s_tmp, sizeof(s_tmp), train_path.c_str(), rabit::GetRank());
train_path = s_tmp;
load_part = 1;
}
if (name_fmap != "NULL") fmap.LoadText(name_fmap.c_str());
if (task == "dump") return;
if (task == "pred") {
data = io::LoadDataMatrix(test_path.c_str(), silent != 0, use_buffer != 0);
} else {
// training
data = io::LoadDataMatrix(train_path.c_str(), silent != 0, use_buffer != 0);
data = io::LoadDataMatrix(train_path.c_str(), silent != 0 && load_part == 0, use_buffer != 0);
utils::Assert(eval_data_names.size() == eval_data_paths.size(), "BUG");
for (size_t i = 0; i < eval_data_names.size(); ++i) {
deval.push_back(io::LoadDataMatrix(eval_data_paths[i].c_str(), silent != 0, use_buffer != 0));
@ -120,35 +157,61 @@ class BoostLearnTask{
learner.SetCacheData(dcache);
// add training set to evaluation set if needed
if( eval_train != 0 ) {
if (eval_train != 0) {
devalall.push_back(data);
eval_data_names.push_back(std::string("train"));
}
}
}
inline void InitLearner(void) {
if (model_in != "NULL"){
utils::FileStream fi(utils::FopenCheck(model_in.c_str(), "rb"));
learner.LoadModel(fi);
fi.Close();
if (model_in != "NULL") {
learner.LoadModel(model_in.c_str());
} else {
utils::Assert(task == "train", "model_in not specified");
learner.InitModel();
}
}
inline void TaskTrain(void) {
int version = rabit::LoadCheckPoint(&learner);
if (version == 0) this->InitLearner();
const time_t start = time(NULL);
unsigned long elapsed = 0;
learner.CheckInit(data);
for (int i = 0; i < num_round; ++i) {
bool allow_lazy = learner.AllowLazyCheckPoint();
for (int i = version / 2; i < num_round; ++i) {
elapsed = (unsigned long)(time(NULL) - start);
if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
learner.UpdateOneIter(i, *data);
if (version % 2 == 0) {
if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
learner.UpdateOneIter(i, *data);
if (allow_lazy) {
rabit::LazyCheckPoint(&learner);
} else {
rabit::CheckPoint(&learner);
}
version += 1;
}
utils::Assert(version == rabit::VersionNumber(), "consistent check");
std::string res = learner.EvalOneIter(i, devalall, eval_data_names);
fprintf(stderr, "%s\n", res.c_str());
if (rabit::IsDistributed()){
if (rabit::GetRank() == 0) {
rabit::TrackerPrintf("%s\n", res.c_str());
}
} else {
if (silent < 2) {
fprintf(stderr, "%s\n", res.c_str());
}
}
if (save_period != 0 && (i + 1) % save_period == 0) {
this->SaveModel(i);
}
if (allow_lazy) {
rabit::LazyCheckPoint(&learner);
} else {
rabit::CheckPoint(&learner);
}
version += 1;
utils::Assert(version == rabit::VersionNumber(), "consistent check");
elapsed = (unsigned long)(time(NULL) - start);
}
// always save final round
@ -176,9 +239,8 @@ class BoostLearnTask{
fclose(fo);
}
inline void SaveModel(const char *fname) const {
utils::FileStream fo(utils::FopenCheck(fname, "wb"));
learner.SaveModel(fo);
fo.Close();
if (rabit::GetRank() != 0) return;
learner.SaveModel(fname);
}
inline void SaveModel(int i) const {
char fname[256];
@ -189,16 +251,23 @@ class BoostLearnTask{
std::vector<float> preds;
if (!silent) printf("start prediction...\n");
learner.Predict(*data, pred_margin != 0, &preds, ntree_limit);
if (!silent) printf("writing prediction to %s\n", name_pred.c_str());
FILE *fo = utils::FopenCheck(name_pred.c_str(), "w");
for (size_t i = 0; i < preds.size(); i++) {
fprintf(fo, "%f\n", preds[i]);
if (!silent) printf("writing prediction to %s\n", name_pred.c_str());
FILE *fo;
if (name_pred != "stdout") {
fo = utils::FopenCheck(name_pred.c_str(), "w");
} else {
fo = stdout;
}
fclose(fo);
for (size_t i = 0; i < preds.size(); ++i) {
fprintf(fo, "%g\n", preds[i]);
}
if (fo != stdout) fclose(fo);
}
private:
/*! \brief whether silent */
int silent;
/*! \brief special load */
int load_part;
/*! \brief whether use auto binary buffer */
int use_buffer;
/*! \brief whether evaluate training statistics */
@ -219,6 +288,8 @@ class BoostLearnTask{
std::string task;
/*! \brief name of predict file */
std::string name_pred;
/*! \brief data split mode */
std::string data_split;
/*!\brief limit number of trees in prediction */
int ntree_limit;
/*!\brief whether to directly output margin value */
@ -243,7 +314,9 @@ class BoostLearnTask{
}
int main(int argc, char *argv[]){
xgboost::random::Seed(0);
xgboost::BoostLearnTask tsk;
return tsk.Run(argc, argv);
tsk.SetParam("seed", "0");
int ret = tsk.Run(argc, argv);
rabit::Finalize();
return ret;
}

5
subtree/README.md Normal file
View File

@ -0,0 +1,5 @@
This folder contains git subtree projects of xgboost.
Do not make changes to the subtree projects in xgboost,
push changes to the original project instead and changes will be pulled back to this folder
* rabit: https://github.com/tqchen/rabit

Some files were not shown because too many files have changed in this diff Show More