diff --git a/.gitignore b/.gitignore
index ee5928043..789c6b7c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,7 @@
*.slo
*.lo
*.o
-
+*.page
# Compiled Dynamic libraries
*.so
*.dylib
@@ -45,5 +45,13 @@ Debug
*save
*csv
.Rproj.user
+*.cpage.col
+*.cpage
xgboost
+xgboost.mpi
xgboost.mock
+train*
+rabit
+.Rbuildignore
+R-package.Rproj
+
diff --git a/CHANGES.md b/CHANGES.md
index 027a077c6..d834ce79d 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -20,3 +20,9 @@ xgboost-0.3
* Linear booster is now parallelized, using parallel coordinate descent.
* Add [Code Guide](src/README.md) for customizing objective function and evaluation
* Add R module
+
+in progress version
+=====
+* Distributed version
+* Feature importance visualization in R module, thanks to Michael Benesty
+* Predict leaf index
diff --git a/Makefile b/Makefile
index 3230661d4..a4bbe876f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
export CC = gcc
export CXX = g++
+export MPICXX = mpicxx
export LDFLAGS= -pthread -lm
-
-export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -pedantic
+export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC
ifeq ($(no_omp),1)
CFLAGS += -DDISABLE_OPENMP
@@ -10,56 +10,90 @@ else
CFLAGS += -fopenmp
endif
+# build with C++11 when cxx11=1 is passed to make
+ifeq ($(cxx11),1)
+  CFLAGS += -std=c++11
+endif
+
# specify tensor path
BIN = xgboost
-OBJ = updater.o gbm.o io.o
+MOCKBIN = xgboost.mock
+OBJ = updater.o gbm.o io.o main.o
+MPIBIN = xgboost.mpi
SLIB = wrapper/libxgboostwrapper.so
-.PHONY: clean all python Rpack
+.PHONY: clean all mpi python Rpack
-all: $(BIN) $(OBJ) $(SLIB)
+all: $(BIN) $(OBJ) $(SLIB) $(MOCKBIN)
+mpi: $(MPIBIN)
python: wrapper/libxgboostwrapper.so
# now the wrapper takes in two files. io and wrapper part
-wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp $(OBJ)
-updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h
-gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
+updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h src/utils/*.h
+gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
-xgboost: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
-wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
+main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h
+xgboost.mpi: updater.o gbm.o io.o main.o subtree/rabit/lib/librabit_mpi.a
+xgboost.mock: updater.o gbm.o io.o main.o subtree/rabit/lib/librabit_mock.a
+xgboost: updater.o gbm.o io.o main.o subtree/rabit/lib/librabit.a
+wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h updater.o gbm.o io.o subtree/rabit/lib/librabit.a
+
+# dependency on rabit
+subtree/rabit/lib/librabit.a: subtree/rabit/src/engine.cc
+ cd subtree/rabit;make lib/librabit.a; cd ../..
+subtree/rabit/lib/librabit_empty.a: subtree/rabit/src/engine_empty.cc
+ cd subtree/rabit;make lib/librabit_empty.a; cd ../..
+subtree/rabit/lib/librabit_mock.a: subtree/rabit/src/engine_mock.cc
+ cd subtree/rabit;make lib/librabit_mock.a; cd ../..
+subtree/rabit/lib/librabit_mpi.a: subtree/rabit/src/engine_mpi.cc
+ cd subtree/rabit;make lib/librabit_mpi.a; cd ../..
$(BIN) :
- $(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
+ $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
+
+$(MOCKBIN) :
+ $(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
$(SLIB) :
- $(CXX) $(CFLAGS) -fPIC $(LDFLAGS) -shared -o $@ $(filter %.cpp %.o %.c, $^)
+ $(CXX) $(CFLAGS) -fPIC -shared -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS)
$(OBJ) :
- $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
+ $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )
+
+$(MPIOBJ) :
+ $(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
+
+$(MPIBIN) :
+ $(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)
install:
cp -f -r $(BIN) $(INSTALL_PATH)
Rpack:
make clean
+ cd subtree/rabit;make clean;cd ..
rm -rf xgboost xgboost*.tar.gz
cp -r R-package xgboost
rm -rf xgboost/inst/examples/*.buffer
rm -rf xgboost/inst/examples/*.model
rm -rf xgboost/inst/examples/dump*
rm -rf xgboost/src/*.o xgboost/src/*.so xgboost/src/*.dll
+ rm -rf subtree/rabit/src/*.o
rm -rf xgboost/demo/*.model xgboost/demo/*.buffer xgboost/demo/*.txt
rm -rf xgboost/demo/runall.R
cp -r src xgboost/src/src
+ cp -r subtree xgboost/src/subtree
mkdir xgboost/src/wrapper
cp wrapper/xgboost_wrapper.h xgboost/src/wrapper
cp wrapper/xgboost_wrapper.cpp xgboost/src/wrapper
cp ./LICENSE xgboost
cat R-package/src/Makevars|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars
- cat R-package/src/Makevars.win|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars.win
+ cp xgboost/src/Makevars xgboost/src/Makevars.win
R CMD build xgboost
rm -rf xgboost
R CMD check --as-cran xgboost*.tar.gz
clean:
- $(RM) $(OBJ) $(BIN) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~
+ $(RM) $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~
+ cd subtree/rabit; make clean; cd ..
diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index cc1c22087..63ed9581c 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -22,7 +22,7 @@ Depends:
Imports:
Matrix (>= 1.1-0),
methods,
- data.table (>= 1.9),
+ data.table (>= 1.9.4),
magrittr (>= 1.5),
stringr,
- DiagrammeR
\ No newline at end of file
+ DiagrammeR
diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index d29ad7a18..12225c966 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -1,4 +1,4 @@
-# Generated by roxygen2 (4.1.0): do not edit by hand
+# Generated by roxygen2 (4.0.1): do not edit by hand
export(getinfo)
export(setinfo)
diff --git a/R-package/R/getinfo.xgb.DMatrix.R b/R-package/R/getinfo.xgb.DMatrix.R
index ed61ba654..6e291fe62 100644
--- a/R-package/R/getinfo.xgb.DMatrix.R
+++ b/R-package/R/getinfo.xgb.DMatrix.R
@@ -32,10 +32,15 @@ setMethod("getinfo", signature = "xgb.DMatrix",
if (class(object) != "xgb.DMatrix") {
stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix")
}
- if (name != "label" && name != "weight" && name != "base_margin") {
+ if (name != "label" && name != "weight" &&
+ name != "base_margin" && name != "nrow") {
stop(paste("xgb.getinfo: unknown info name", name))
}
- ret <- .Call("XGDMatrixGetInfo_R", object, name, PACKAGE = "xgboost")
+ if (name != "nrow"){
+ ret <- .Call("XGDMatrixGetInfo_R", object, name, PACKAGE = "xgboost")
+ } else {
+ ret <- xgb.numrow(object)
+ }
return(ret)
})
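A minimal usage sketch of the new `"nrow"` query (not part of the patch; assumes the package is built from this branch):

```r
# Query the number of rows through getinfo(); "nrow" is answered by
# xgb.numrow() rather than the C-level XGDMatrixGetInfo_R getter.
library(xgboost)
data(agaricus.train, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
getinfo(dtrain, "nrow")  # number of rows in the training matrix
```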
diff --git a/R-package/R/predict.xgb.Booster.R b/R-package/R/predict.xgb.Booster.R
index d57017b65..1e458e708 100644
--- a/R-package/R/predict.xgb.Booster.R
+++ b/R-package/R/predict.xgb.Booster.R
@@ -7,6 +7,8 @@ setClass("xgb.Booster")
#' @param object Object of class "xgb.Booster"
#' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or
#' \code{xgb.DMatrix}.
+#' @param missing Missing is only used when input is dense matrix, pick a float
+#' value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.
#' @param outputmargin whether the prediction should be shown in the original
#' value of sum of functions, when outputmargin=TRUE, the prediction is
#' untransformed margin value. In logistic regression, outputmargin=T will
@@ -14,6 +16,7 @@ setClass("xgb.Booster")
#' @param ntreelimit limit number of trees used in prediction, this parameter is
#' only valid for gbtree, but not for gblinear. set it to be value bigger
#' than 0. It will use all trees by default.
+#' @param predleaf whether to predict leaf indices instead. If set to TRUE, the output will be a matrix object.
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
@@ -25,7 +28,8 @@ setClass("xgb.Booster")
#' @export
#'
setMethod("predict", signature = "xgb.Booster",
- definition = function(object, newdata, missing = NULL, outputmargin = FALSE, ntreelimit = NULL) {
+ definition = function(object, newdata, missing = NULL,
+ outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE) {
if (class(newdata) != "xgb.DMatrix") {
if (is.null(missing)) {
newdata <- xgb.DMatrix(newdata)
@@ -40,7 +44,24 @@ setMethod("predict", signature = "xgb.Booster",
stop("predict: ntreelimit must be equal to or greater than 1")
}
}
- ret <- .Call("XGBoosterPredict_R", object, newdata, as.integer(outputmargin), as.integer(ntreelimit), PACKAGE = "xgboost")
+ option = 0
+ if (outputmargin) {
+ option <- option + 1
+ }
+ if (predleaf) {
+ option <- option + 2
+ }
+ ret <- .Call("XGBoosterPredict_R", object, newdata, as.integer(option),
+ as.integer(ntreelimit), PACKAGE = "xgboost")
+ if (predleaf){
+ len <- getinfo(newdata, "nrow")
+ if (length(ret) == len){
+ ret <- matrix(ret,ncol = 1)
+ } else {
+ ret <- matrix(ret, ncol = len)
+ ret <- t(ret)
+ }
+ }
return(ret)
})
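A short sketch (not part of the patch) of how the new option bitmask plays out from the user side: `outputmargin` contributes 1, `predleaf` contributes 2, and with `predleaf = TRUE` the result is reshaped into one row per observation and one column per tree. Assumes a build that includes this change:

```r
library(xgboost)
data(agaricus.train, package = 'xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")
# leaf indices instead of probabilities: an nrow x ntree matrix
leaves <- predict(bst, agaricus.train$data, predleaf = TRUE)
dim(leaves)
```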
diff --git a/R-package/R/slice.xgb.DMatrix.R b/R-package/R/slice.xgb.DMatrix.R
index 419170a66..b70a8ee92 100644
--- a/R-package/R/slice.xgb.DMatrix.R
+++ b/R-package/R/slice.xgb.DMatrix.R
@@ -28,6 +28,18 @@ setMethod("slice", signature = "xgb.DMatrix",
if (class(object) != "xgb.DMatrix") {
stop("slice: first argument dtrain must be xgb.DMatrix")
}
- ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, PACKAGE = "xgboost")
+ ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset,
+ PACKAGE = "xgboost")
+
+ attr_list <- attributes(object)
+ nr <- xgb.numrow(object)
+ len <- sapply(attr_list,length)
+ ind <- which(len==nr)
+ if (length(ind)>0) {
+ nms <- names(attr_list)[ind]
+ for (i in 1:length(ind)) {
+ attr(ret,nms[i]) <- attr(object,nms[i])[idxset]
+ }
+ }
return(structure(ret, class = "xgb.DMatrix"))
})
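A sketch of the new attribute propagation (the `row_id` attribute below is hypothetical, purely for illustration): any attribute whose length equals the number of rows is subset along with the rows.

```r
library(xgboost)
data(agaricus.train, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
# attach a hypothetical row-aligned attribute, e.g. an external observation id
attr(dtrain, "row_id") <- seq_len(getinfo(dtrain, "nrow"))
dsub <- slice(dtrain, 1:100)
attr(dsub, "row_id")  # the first 100 ids, carried through the slice
```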
diff --git a/R-package/R/utils.R b/R-package/R/utils.R
index 34ce003db..412132891 100644
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@@ -131,7 +131,7 @@ xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) {
}
# iteratively evaluate one iteration
-xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
+xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = FALSE) {
if (class(booster) != "xgb.Booster") {
stop("xgb.eval: first argument must be type xgb.Booster")
}
@@ -169,18 +169,27 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
} else {
msg <- ""
}
+ if (prediction){
+ preds <- predict(booster,watchlist[[2]])
+ return(list(msg,preds))
+ }
return(msg)
-}
+}
#------------------------------------------
# helper functions for cross validation
#
xgb.cv.mknfold <- function(dall, nfold, param) {
- randidx <- sample(1 : xgb.numrow(dall))
- kstep <- length(randidx) / nfold
- idset <- list()
- for (i in 1:nfold) {
- idset[[i]] <- randidx[ ((i-1) * kstep + 1) : min(i * kstep, length(randidx)) ]
+ if (nfold <= 1) {
+ stop("nfold must be bigger than 1")
}
+ randidx <- sample(1 : xgb.numrow(dall))
+ kstep <- length(randidx) %/% nfold
+ idset <- list()
+  for (i in 1:(nfold-1)) {
+    idset[[i]] <- randidx[1:kstep]
+    randidx <- setdiff(randidx, idset[[i]])
+  }
+  idset[[nfold]] <- randidx
ret <- list()
for (k in 1:nfold) {
dtest <- slice(dall, idset[[k]])
@@ -193,7 +202,7 @@ xgb.cv.mknfold <- function(dall, nfold, param) {
dtrain <- slice(dall, didx)
bst <- xgb.Booster(param, list(dtrain, dtest))
watchlist = list(train=dtrain, test=dtest)
- ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist)
+ ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=idset[[k]])
}
return (ret)
}
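The reworked fold construction uses integer division and hands the remainder to the last fold, so every row lands in exactly one fold (the old `/` arithmetic could produce fractional step sizes). A standalone sketch of the same logic, outside the package:

```r
# Mirror of the new xgb.cv.mknfold index split.
set.seed(1)
n <- 10; nfold <- 3
randidx <- sample(1:n)
kstep <- length(randidx) %/% nfold
idset <- list()
for (i in 1:(nfold - 1)) {
  idset[[i]] <- randidx[1:kstep]          # take the next kstep indices
  randidx <- setdiff(randidx, idset[[i]]) # drop them from the pool
}
idset[[nfold]] <- randidx                 # last fold absorbs the remainder
sapply(idset, length)                     # 3 3 4: all rows covered exactly once
```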
diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R
index b7a5a9897..8c3ea80bc 100644
--- a/R-package/R/xgb.DMatrix.R
+++ b/R-package/R/xgb.DMatrix.R
@@ -6,7 +6,7 @@
#' indicating the data file.
#' @param info a list of information of the xgb.DMatrix object
#' @param missing Missing is only used when input is dense matrix, pick a float
-# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
+#' value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.
#
#' @param ... other information to pass to \code{info}.
#'
diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R
index b071f08a7..ed088df52 100644
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -31,6 +31,9 @@
#' @param nrounds the max number of iterations
#' @param nfold number of folds used
#' @param label option field, when data is Matrix
+#' @param missing Missing is only used when input is dense matrix, pick a float
+#' value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.
+#' @param prediction A logical value indicating whether to return the prediction vector.
#' @param showsd \code{boolean}, whether show standard deviation of cross validation
#' @param metrics, list of evaluation metrics to be used in cross validation,
#' when it is not specified, the evaluation metric is chosen according to objective function.
@@ -47,8 +50,6 @@
#' @param feval customized evaluation function. Returns
#' \code{list(metric='metric-name', value='metric-value')} with given
#' prediction and dtrain,
-#' @param missing Missing is only used when input is dense matrix, pick a float
-# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' @param verbose \code{boolean}, print the statistics during the process.
#' @param ... other parameters to pass to \code{params}.
#'
@@ -71,7 +72,8 @@
#' @export
#'
xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL,
- showsd = TRUE, metrics=list(), obj = NULL, feval = NULL, verbose = T,...) {
+ prediction = FALSE, showsd = TRUE, metrics=list(),
+ obj = NULL, feval = NULL, verbose = T,...) {
if (typeof(params) != "list") {
stop("xgb.cv: first argument params must be list")
}
@@ -90,13 +92,20 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
}
folds <- xgb.cv.mknfold(dtrain, nfold, params)
+ predictValues <- rep(0,xgb.numrow(dtrain))
history <- c()
for (i in 1:nrounds) {
msg <- list()
for (k in 1:nfold) {
fd <- folds[[k]]
- succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
- msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
+ succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
+ if (!prediction){
+ msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
+ } else {
+ res <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval, prediction)
+ predictValues[fd$index] <- res[[2]]
+ msg[[k]] <- res[[1]] %>% str_split("\t") %>% .[[1]]
+ }
}
ret <- xgb.cv.aggcv(msg, showsd)
history <- c(history, ret)
@@ -115,5 +124,14 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
split <- str_split(string = history, pattern = "\t")
for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.list %>% {vec <- .; rbindlist(list(dt, vec), use.names = F, fill = F)}
- dt
-}
\ No newline at end of file
+
+ if (prediction) {
+ return(list(dt = dt,pred = predictValues))
+ }
+ return(dt)
+}
+
+# Avoid error messages during CRAN check.
+# These variables are never declared; they are mainly
+# column names inferred by data.table.
+globalVariables(".")
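With `prediction = TRUE`, `xgb.cv` now returns both the evaluation history and the out-of-fold predictions, stitched together through the per-fold `index` saved by `xgb.cv.mknfold`. A hedged usage sketch, assuming a build that includes this change:

```r
library(xgboost)
data(agaricus.train, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
param <- list(max.depth = 2, eta = 1, silent = 1,
              objective = "binary:logistic")
res <- xgb.cv(param, dtrain, nrounds = 5, nfold = 5, prediction = TRUE)
res$dt            # per-round evaluation history as a data.table
length(res$pred)  # one out-of-fold prediction per training row
```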
diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R
index 3df8c9605..a0938ded1 100644
--- a/R-package/R/xgb.dump.R
+++ b/R-package/R/xgb.dump.R
@@ -29,7 +29,7 @@
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
#' # save the model in file 'xgb.model.dump'
-#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
+#' xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)
#'
#' # print the model without saving it to a file
#' print(xgb.dump(bst))
@@ -37,11 +37,15 @@
#'
xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) {
if (class(model) != "xgb.Booster") {
- stop("xgb.dump: first argument must be type xgb.Booster")
+ stop("model: argument must be type xgb.Booster")
}
- if (!class(fname) %in% c("character", "NULL")) {
- stop("xgb.dump: second argument must be type character when provided")
+ if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1)) {
+ stop("fname: argument must be type character (when provided)")
}
+  if (!(class(fmap) %in% c("character", "NULL") && length(fmap) <= 1)) {
+ stop("fmap: argument must be type character (when provided)")
+ }
+
result <- .Call("XGBoosterDumpModel_R", model, fmap, as.integer(with.stats), PACKAGE = "xgboost")
if(is.null(fname)) {
@@ -50,4 +54,9 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) {
result %>% str_split("\n") %>% unlist %>% Filter(function(x) x != "", .) %>% writeLines(fname)
return(TRUE)
}
-}
\ No newline at end of file
+}
+
+# Avoid error messages during CRAN check.
+# These variables are never declared; they are mainly
+# column names inferred by data.table.
+globalVariables(".")
\ No newline at end of file
diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R
index 189ee03b4..094171382 100644
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -9,6 +9,7 @@
#' @importFrom magrittr %>%
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).
+#' @param model the model generated by the \code{xgb.train} function. Avoids the creation of a dump file.
#'
#' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
#'
@@ -31,41 +32,57 @@
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#'
-#' #Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
+#' #Both dataset are list with two items, a sparse matrix and labels
+#' #(labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#' test <- agaricus.test
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
-#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#'
#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix.
-#' xgb.importance(agaricus.test$data@@Dimnames[[2]], 'xgb.model.dump')
+#' xgb.importance(agaricus.test$data@@Dimnames[[2]], model = bst)
#'
#' @export
-xgb.importance <- function(feature_names = NULL, filename_dump = NULL){
+xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = NULL){
if (!class(feature_names) %in% c("character", "NULL")) {
stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
}
- if (class(filename_dump) != "character" || !file.exists(filename_dump)) {
+
+ if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
stop("filename_dump: Has to be a path to the model dump file.")
}
- text <- readLines(filename_dump)
+
+ if (!class(model) %in% c("xgb.Booster", "NULL")) {
+    stop("model: Has to be an object of class xgb.Booster, generated by the xgb.train function.")
+ }
+
+ if(is.null(model)){
+ text <- readLines(filename_dump)
+ } else {
+    text <- xgb.dump(model = model, with.stats = TRUE)
+ }
+
if(text[2] == "bias:"){
- result <- linearDump(feature_names, text)
+    result <- linearDump(feature_names, text = text)
} else {
- result <- treeDump(feature_names, text)
+ result <- treeDump(feature_names, text = text)
}
result
}
treeDump <- function(feature_names, text){
- result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[Feature!="Leaf",][,.(sum(Quality), sum(Cover), .N),by = Feature][,V1:=V1/sum(V1)][,V2:=V2/sum(V2)][,N:=N/sum(N)][order(-rank(V1))]
- setnames(result, c("Feature", "Gain", "Cover", "Frequence"))
+ result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[Feature!="Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N), by = Feature][,`:=`(Gain=Gain/sum(Gain),Cover=Cover/sum(Cover),Frequence=Frequence/sum(Frequence))][order(-Gain)]
+
result
}
linearDump <- function(feature_names, text){
which(text == "weight:") %>% {a=.+1;text[a:length(text)]} %>% as.numeric %>% data.table(Feature = feature_names, Weight = .)
-}
\ No newline at end of file
+}
+
+# Avoid error messages during CRAN check.
+# These variables are never declared; they are mainly
+# column names inferred by data.table.
+globalVariables(".")
\ No newline at end of file
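A sketch of the new `model` path in `xgb.importance`, which dumps the booster in memory instead of requiring a dump file (assumes a build that includes this change):

```r
library(xgboost)
data(agaricus.train, package = 'xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")
# no xgb.dump() to disk needed: pass the booster directly
xgb.importance(agaricus.train$data@Dimnames[[2]], model = bst)
```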
diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R
index 3e0723c61..930538c5b 100644
--- a/R-package/R/xgb.model.dt.tree.R
+++ b/R-package/R/xgb.model.dt.tree.R
@@ -16,6 +16,8 @@
#' @importFrom stringr str_trim
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
+#' @param model the model generated by the \code{xgb.train} function. Avoids the creation of a dump file.
+#' @param text the dump generated by the \code{xgb.dump} function. Avoids the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
#'
#' @return A \code{data.table} of the features used in the model with their gain, cover and few other thing.
@@ -40,38 +42,47 @@
#' @examples
#' data(agaricus.train, package='xgboost')
#'
-#' #Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
+#' #Both dataset are list with two items, a sparse matrix and labels
+#' #(labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
-#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
+#' xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)
#'
#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix.
-#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], 'xgb.model.dump')
+#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], filename_dump = 'xgb.model.dump')
#'
#' @export
-xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, text = NULL, n_first_tree = NULL){
+xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, text = NULL, n_first_tree = NULL){
if (!class(feature_names) %in% c("character", "NULL")) {
stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.")
}
- if (!class(filename_dump) %in% c("character", "NULL")) {
- stop("filename_dump: Has to be a character vector representing the path to the model dump file.")
- } else if (class(filename_dump) == "character" && !file.exists(filename_dump)) {
+ if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) {
+ stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.")
+ } else if (!is.null(filename_dump) && !file.exists(filename_dump)) {
stop("filename_dump: path to the model doesn't exist.")
- } else if(is.null(filename_dump) & is.null(text)){
- stop("filename_dump: no path and no string version of the model dump have been provided.")
+ } else if(is.null(filename_dump) && is.null(model) && is.null(text)){
+    stop("filename_dump & model & text: no path to a model dump, no model, and no text dump have been provided.")
}
- if (!class(text) %in% c("character", "NULL")) {
+
+ if (!class(model) %in% c("xgb.Booster", "NULL")) {
+    stop("model: Has to be an object of class xgb.Booster, generated by the xgb.train function.")
+ }
+
+ if (!class(text) %in% c("character", "NULL")) {
stop("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.")
}
+
if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) {
stop("n_first_tree: Has to be a numeric vector of size 1.")
}
- if(is.null(text)){
+ if(!is.null(model)){
+    text <- xgb.dump(model = model, with.stats = TRUE)
+ } else if(!is.null(filename_dump)){
text <- readLines(filename_dump) %>% str_trim(side = "both")
}
@@ -89,6 +100,9 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, text =
tree <- text[(position[i]+1):(position[i+1]-1)]
+    # skip trees that consist of a single leaf (no split)
+    if(length(tree) < 2) next
+
treeID <- i-1
notLeaf <- str_match(tree, "leaf") %>% is.na
@@ -119,7 +133,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, text =
}
yes <- allTrees[!is.na(Yes),Yes]
-
+
set(allTrees, i = which(allTrees[,Feature]!= "Leaf"),
j = "Yes.Feature",
value = allTrees[ID == yes,Feature])
@@ -148,3 +162,8 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, text =
allTrees
}
+
+# Avoid error messages during CRAN check.
+# These variables are never declared; they are mainly
+# column names inferred by data.table.
+globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequence"))
\ No newline at end of file
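The same `model` shortcut applies to `xgb.model.dt.tree`; a sketch assuming a build with this change:

```r
library(xgboost)
data(agaricus.train, package = 'xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")
dt <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst)
head(dt)  # one row per node, with Quality (gain) and Cover statistics
```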
diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R
index 1a8a04e8a..662ccb21b 100644
--- a/R-package/R/xgb.plot.tree.R
+++ b/R-package/R/xgb.plot.tree.R
@@ -17,9 +17,12 @@
#' @importFrom stringr str_trim
#' @importFrom DiagrammeR DiagrammeR
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
-#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
+#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).
+#' @param model the model generated by the \code{xgb.train} function. Avoids the creation of a dump file.
#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
-#' @param style a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.
+#' @param CSSstyle a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.
+#' @param width the width of the diagram in pixels.
+#' @param height the height of the diagram in pixels.
#'
#' @return A \code{DiagrammeR} of the model.
#'
@@ -39,39 +42,53 @@
#' @examples
#' data(agaricus.train, package='xgboost')
#'
-#' #Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
+#' #Both dataset are list with two items, a sparse matrix and labels
+#' #(labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nround = 2,objective = "binary:logistic")
-#' xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#'
#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix.
-#' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], 'xgb.model.dump')
+#' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], model = bst)
#'
#' @export
-xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, n_first_tree = NULL, styles = NULL){
+#'
+xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, n_first_tree = NULL, CSSstyle = NULL, width = NULL, height = NULL){
- if (!class(styles) %in% c("character", "NULL") | length(styles) > 1) {
+ if (!(class(CSSstyle) %in% c("character", "NULL") && length(CSSstyle) <= 1)) {
stop("style: Has to be a character vector of size 1.")
}
+
+ if (!class(model) %in% c("xgb.Booster", "NULL")) {
+    stop("model: Has to be an object of class xgb.Booster, generated by the xgb.train function.")
+ }
- allTrees <- xgb.model.dt.tree(feature_names, filename_dump, n_first_tree)
+ if(is.null(model)){
+ allTrees <- xgb.model.dt.tree(feature_names = feature_names, filename_dump = filename_dump, n_first_tree = n_first_tree)
+ } else {
+ allTrees <- xgb.model.dt.tree(feature_names = feature_names, model = model, n_first_tree = n_first_tree)
+ }
allTrees[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "
Cover: ", Cover, "
Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")]
allTrees[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")]
- if(is.null(styles)){
- styles <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px"
+ if(is.null(CSSstyle)){
+ CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px"
}
yes <- allTrees[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "")
no <- allTrees[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "")
- path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(styles, yes, no, sep = ";")
- DiagrammeR(path)
+ path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(CSSstyle, yes, no, sep = ";")
+ DiagrammeR(path, width, height)
}
+
+# Avoid error messages during CRAN check.
+# These variables are never declared; they are mainly
+# column names inferred by data.table.
+globalVariables(c("Feature", "yesPath", "ID", "Cover", "Quality", "Split", "Yes", "Yes.Feature", "noPath", "No", "No.Feature", "."))
\ No newline at end of file
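A usage sketch for the renamed `CSSstyle` argument and the new `width`/`height` pass-through to `DiagrammeR` (not part of the patch; assumes DiagrammeR is installed):

```r
library(xgboost)
data(agaricus.train, package = 'xgboost')
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")
# plot straight from the model object, with an explicit canvas size
xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst,
              width = 750, height = 500)
```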
diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R
index 02a554f68..c72c4d5b0 100644
--- a/R-package/R/xgboost.R
+++ b/R-package/R/xgboost.R
@@ -5,7 +5,7 @@
#' @param data takes \code{matrix}, \code{dgCMatrix}, local data file or
#' \code{xgb.DMatrix}.
#' @param label the response variable. User should not set this field,
-# if data is local data file or \code{xgb.DMatrix}.
+#' if data is local data file or \code{xgb.DMatrix}.
#' @param params the list of parameters. Commonly used ones are:
#' \itemize{
#' \item \code{objective} objective function, common ones are
@@ -24,8 +24,8 @@
#' @param verbose If 0, xgboost will stay silent. If 1, xgboost will print
#' information of performance. If 2, xgboost will print information of both
#' performance and construction progress information
-#' @param missing Missing is only used when input is dense matrix, pick a float
-# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
+#' @param missing Missing is only used when input is dense matrix, pick a float
+#' value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.
#' @param ... other parameters to pass to \code{params}.
#'
#' @details
diff --git a/R-package/data/agaricus.test.rda b/R-package/data/agaricus.test.rda
index bffe6de21..ad8d50af7 100644
Binary files a/R-package/data/agaricus.test.rda and b/R-package/data/agaricus.test.rda differ
diff --git a/R-package/data/agaricus.train.rda b/R-package/data/agaricus.train.rda
index c471d0173..3f5f24144 100644
Binary files a/R-package/data/agaricus.train.rda and b/R-package/data/agaricus.train.rda differ
diff --git a/R-package/demo/00Index b/R-package/demo/00Index
index 345d7ca4f..969da0d91 100644
--- a/R-package/demo/00Index
+++ b/R-package/demo/00Index
@@ -4,4 +4,5 @@ boost_from_prediction Boosting from existing prediction
predict_first_ntree Predicting using first n trees
generalized_linear_model Generalized Linear Model
cross_validation Cross validation
-create_sparse_matrix
+create_sparse_matrix Create Sparse Matrix
+predict_leaf_indices Predicting the corresponding leaves
diff --git a/R-package/demo/create_sparse_matrix.R b/R-package/demo/create_sparse_matrix.R
index 4060d1c48..ac96510a3 100644
--- a/R-package/demo/create_sparse_matrix.R
+++ b/R-package/demo/create_sparse_matrix.R
@@ -1,7 +1,7 @@
require(xgboost)
require(Matrix)
require(data.table)
-require(vcd) #Available in Cran. Used for its dataset with categorical values.
+if (!require(vcd)) { install.packages('vcd'); library(vcd) } # Available on CRAN; used for its dataset with categorical values.
# According to its documentation, Xgboost works only on numbers.
# Sometimes the dataset we have to work on have categorical data.
@@ -86,4 +86,4 @@ print(chisq.test(df$AgeCat, df$Y))
# As you can see, in general destroying information by simplifying it won't improve your model. Chi2 just demonstrates that. But in more complex cases, creating a new feature based on existing ones which makes the link with the outcome more obvious may help the algorithm and improve the model. The case studied here is not complex enough to show that. Check the Kaggle forum for some challenging datasets.
# However it's almost always worse when you add some arbitrary rules.
-# Moreover, you can notice that even if we have added some not useful new features highly correlated with other features, the boosting tree algorithm have been able to choose the best one, which in this case is the Age. Linear model may not be that strong in these scenario.
\ No newline at end of file
+# Moreover, you can notice that even if we have added some not-so-useful new features highly correlated with other features, the boosting tree algorithm has still been able to choose the best one, which in this case is the Age. A linear model may not be that strong in this scenario.
diff --git a/R-package/demo/cross_validation.R b/R-package/demo/cross_validation.R
index c7e7ba537..47a0adea0 100644
--- a/R-package/demo/cross_validation.R
+++ b/R-package/demo/cross_validation.R
@@ -45,3 +45,7 @@ param <- list(max.depth=2,eta=1,silent=1)
xgb.cv(param, dtrain, nround, nfold = 5,
obj = logregobj, feval=evalerror)
+# do cross validation with prediction values for each fold
+res <- xgb.cv(param, dtrain, nround, nfold=5, prediction = TRUE)
+res$dt
+length(res$pred)
diff --git a/R-package/demo/predict_leaf_indices.R b/R-package/demo/predict_leaf_indices.R
new file mode 100644
index 000000000..480578c1d
--- /dev/null
+++ b/R-package/demo/predict_leaf_indices.R
@@ -0,0 +1,21 @@
+require(xgboost)
+# load in the agaricus dataset
+data(agaricus.train, package='xgboost')
+data(agaricus.test, package='xgboost')
+dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
+dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+
+param <- list(max.depth=2,eta=1,silent=1,objective='binary:logistic')
+watchlist <- list(eval = dtest, train = dtrain)
+nround = 5
+
+# train the model for nround rounds
+bst <- xgb.train(param, dtrain, nround, watchlist)
+cat('start testing prediction from first n trees\n')
+
+### predict leaf indices using the first 2 trees
+pred_with_leaf <- predict(bst, dtest, ntreelimit = 2, predleaf = TRUE)
+head(pred_with_leaf)
+# by default, we predict using all the trees
+pred_with_leaf <- predict(bst, dtest, predleaf = TRUE)
+head(pred_with_leaf)
diff --git a/R-package/man/agaricus.test.Rd b/R-package/man/agaricus.test.Rd
index 556425379..c050d3ecd 100644
--- a/R-package/man/agaricus.test.Rd
+++ b/R-package/man/agaricus.test.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgboost.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{data}
\name{agaricus.test}
\alias{agaricus.test}
diff --git a/R-package/man/agaricus.train.Rd b/R-package/man/agaricus.train.Rd
index 879b3d5df..02571cf54 100644
--- a/R-package/man/agaricus.train.Rd
+++ b/R-package/man/agaricus.train.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgboost.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{data}
\name{agaricus.train}
\alias{agaricus.train}
diff --git a/R-package/man/getinfo.Rd b/R-package/man/getinfo.Rd
index 37e0ad0be..23e3adc84 100644
--- a/R-package/man/getinfo.Rd
+++ b/R-package/man/getinfo.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/getinfo.xgb.DMatrix.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{getinfo}
\alias{getinfo}
@@ -13,9 +12,9 @@ getinfo(object, ...)
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
-\item{...}{other parameters}
-
\item{name}{the name of the field to get}
+
+\item{...}{other parameters}
}
\description{
Get information of an xgb.DMatrix object
diff --git a/R-package/man/predict-xgb.Booster-method.Rd b/R-package/man/predict-xgb.Booster-method.Rd
index afa0c70a5..204a8167f 100644
--- a/R-package/man/predict-xgb.Booster-method.Rd
+++ b/R-package/man/predict-xgb.Booster-method.Rd
@@ -1,12 +1,11 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/predict.xgb.Booster.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{predict,xgb.Booster-method}
\alias{predict,xgb.Booster-method}
\title{Predict method for eXtreme Gradient Boosting model}
\usage{
\S4method{predict}{xgb.Booster}(object, newdata, missing = NULL,
- outputmargin = FALSE, ntreelimit = NULL)
+ outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE)
}
\arguments{
\item{object}{Object of class "xgb.Booster"}
@@ -14,6 +13,9 @@
\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or
\code{xgb.DMatrix}.}
+\item{missing}{Missing is only used when input is dense matrix, pick a float
+value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.}
+
\item{outputmargin}{whether the prediction should be shown in the original
value of sum of functions, when outputmargin=TRUE, the prediction is
untransformed margin value. In logistic regression, outputmargin=T will
@@ -22,6 +24,8 @@ output value before logistic transformation.}
\item{ntreelimit}{limit number of trees used in prediction, this parameter is
only valid for gbtree, but not for gblinear. set it to be value bigger
than 0. It will use all trees by default.}
+
+\item{predleaf}{whether to predict leaf indices instead. If set to TRUE, the output will be a matrix object.}
}
\description{
Predicted values based on xgboost model object.
diff --git a/R-package/man/setinfo.Rd b/R-package/man/setinfo.Rd
index 4ed262b46..7ea992110 100644
--- a/R-package/man/setinfo.Rd
+++ b/R-package/man/setinfo.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/setinfo.xgb.DMatrix.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{setinfo}
\alias{setinfo}
@@ -13,11 +12,11 @@ setinfo(object, ...)
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
-\item{...}{other parameters}
-
\item{name}{the name of the field to get}
\item{info}{the specific field of information to set}
+
+\item{...}{other parameters}
}
\description{
Set information of an xgb.DMatrix object
diff --git a/R-package/man/slice.Rd b/R-package/man/slice.Rd
index a7812e886..a749aa8ff 100644
--- a/R-package/man/slice.Rd
+++ b/R-package/man/slice.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/slice.xgb.DMatrix.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\docType{methods}
\name{slice}
\alias{slice}
@@ -14,9 +13,9 @@ slice(object, ...)
\arguments{
\item{object}{Object of class "xgb.DMatrix"}
-\item{...}{other parameters}
-
\item{idxset}{a integer vector of indices of rows needed}
+
+\item{...}{other parameters}
}
\description{
Get a new DMatrix containing the specified rows of
diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd
index 86000220f..31efde687 100644
--- a/R-package/man/xgb.DMatrix.Rd
+++ b/R-package/man/xgb.DMatrix.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgb.DMatrix.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.DMatrix}
\alias{xgb.DMatrix}
\title{Contruct xgb.DMatrix object}
@@ -12,7 +11,8 @@ indicating the data file.}
\item{info}{a list of information of the xgb.DMatrix object}
-\item{missing}{Missing is only used when input is dense matrix, pick a float}
+\item{missing}{Missing is only used when input is dense matrix, pick a float
+value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.}
\item{...}{other information to pass to \code{info}.}
}
diff --git a/R-package/man/xgb.DMatrix.save.Rd b/R-package/man/xgb.DMatrix.save.Rd
index 6bbc277b3..803de912b 100644
--- a/R-package/man/xgb.DMatrix.save.Rd
+++ b/R-package/man/xgb.DMatrix.save.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgb.DMatrix.save.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.DMatrix.save}
\alias{xgb.DMatrix.save}
\title{Save xgb.DMatrix object to binary file}
diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd
index 7ba5eb727..149ec392f 100644
--- a/R-package/man/xgb.cv.Rd
+++ b/R-package/man/xgb.cv.Rd
@@ -1,12 +1,11 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgb.cv.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.cv}
\alias{xgb.cv}
\title{Cross Validation}
\usage{
xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
- missing = NULL, showsd = TRUE, metrics = list(), obj = NULL,
- feval = NULL, verbose = T, ...)
+ missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(),
+ obj = NULL, feval = NULL, verbose = T, ...)
}
\arguments{
\item{params}{the list of parameters. Commonly used ones are:
@@ -32,7 +31,10 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL,
\item{label}{option field, when data is Matrix}
-\item{missing}{Missing is only used when input is dense matrix, pick a float}
+\item{missing}{Missing is only used when input is dense matrix, pick a float
+value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.}
+
+\item{prediction}{A logical value indicating whether to return the prediction vector.}
\item{showsd}{\code{boolean}, whether show standard deviation of cross validation}
diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd
index 473227357..d1968217b 100644
--- a/R-package/man/xgb.dump.Rd
+++ b/R-package/man/xgb.dump.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgb.dump.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.dump}
\alias{xgb.dump}
\title{Save xgboost model to text file}
@@ -37,7 +36,7 @@ test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")
# save the model in file 'xgb.model.dump'
-xgb.dump(bst, 'xgb.model.dump', with.stats = T)
+xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)
# print the model without saving it to a file
print(xgb.dump(bst))
diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd
index 78be4b91b..1588639b4 100644
--- a/R-package/man/xgb.importance.Rd
+++ b/R-package/man/xgb.importance.Rd
@@ -1,15 +1,16 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgb.importance.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.importance}
\alias{xgb.importance}
\title{Show importance of features in a model}
\usage{
-xgb.importance(feature_names = NULL, filename_dump = NULL)
+xgb.importance(feature_names = NULL, filename_dump = NULL, model = NULL)
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).}
+
+\item{model}{the model generated by the \code{xgb.train} function. Avoids the creation of a dump file.}
}
\value{
A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
@@ -36,16 +37,16 @@ There are 3 columns :
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
-#Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
+#Both dataset are list with two items, a sparse matrix and labels
+#(labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train
test <- agaricus.test
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")
-xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
-xgb.importance(agaricus.test$data@Dimnames[[2]], 'xgb.model.dump')
+xgb.importance(agaricus.test$data@Dimnames[[2]], model = bst)
}
diff --git a/R-package/man/xgb.load.Rd b/R-package/man/xgb.load.Rd
index 433b38c79..d2c5d94b6 100644
--- a/R-package/man/xgb.load.Rd
+++ b/R-package/man/xgb.load.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgb.load.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.load}
\alias{xgb.load}
\title{Load xgboost model from binary file}
diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd
index 2bc48c4d0..51c965970 100644
--- a/R-package/man/xgb.model.dt.tree.Rd
+++ b/R-package/man/xgb.model.dt.tree.Rd
@@ -1,17 +1,20 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgb.model.dt.tree.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.model.dt.tree}
\alias{xgb.model.dt.tree}
\title{Convert tree model dump to data.table}
\usage{
-xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL, text = NULL,
- n_first_tree = NULL)
+xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL,
+ model = NULL, text = NULL, n_first_tree = NULL)
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
+\item{model}{the model generated by the \code{xgb.train} function. Avoids the creation of a dump file.}
+
+\item{text}{the dump generated by the \code{xgb.dump} function. Avoids the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
+
\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}
}
\value{
@@ -40,15 +43,16 @@ The content of the \code{data.table} is organised that way:
\examples{
data(agaricus.train, package='xgboost')
-#Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
+#Both dataset are list with two items, a sparse matrix and labels
+#(labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")
-xgb.dump(bst, 'xgb.model.dump', with.stats = T)
+xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE)
#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
-xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], 'xgb.model.dump')
+xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], filename_dump = 'xgb.model.dump')
}
diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd
index ba65cdd7c..dc95dfec0 100644
--- a/R-package/man/xgb.plot.tree.Rd
+++ b/R-package/man/xgb.plot.tree.Rd
@@ -1,20 +1,25 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgb.plot.tree.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.plot.tree}
\alias{xgb.plot.tree}
\title{Plot a boosted tree model}
\usage{
-xgb.plot.tree(feature_names = NULL, filename_dump = NULL,
- n_first_tree = NULL, styles = NULL)
+xgb.plot.tree(feature_names = NULL, filename_dump = NULL, model = NULL,
+ n_first_tree = NULL, CSSstyle = NULL, width = NULL, height = NULL)
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
-\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
+\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).}
+
+\item{model}{the model generated by the \code{xgb.train} function. Avoids the creation of a dump file.}
\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}
-\item{style}{a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.}
+\item{CSSstyle}{a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.}
+
+\item{width}{the width of the diagram in pixels.}
+
+\item{height}{the height of the diagram in pixels.}
}
\value{
A \code{DiagrammeR} of the model.
@@ -38,15 +43,15 @@ It uses \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpos
\examples{
data(agaricus.train, package='xgboost')
-#Both dataset are list with two items, a sparse matrix and labels (labels = outcome column which will be learned).
+#Both dataset are list with two items, a sparse matrix and labels
+#(labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nround = 2,objective = "binary:logistic")
-xgb.dump(bst, 'xgb.model.dump', with.stats = T)
#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
-xgb.plot.tree(agaricus.train$data@Dimnames[[2]], 'xgb.model.dump')
+xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst)
}
diff --git a/R-package/man/xgb.save.Rd b/R-package/man/xgb.save.Rd
index ded444446..0ccdf13da 100644
--- a/R-package/man/xgb.save.Rd
+++ b/R-package/man/xgb.save.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgb.save.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.save}
\alias{xgb.save}
\title{Save xgboost model to binary file}
diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd
index 58ef94135..a05e2eeb9 100644
--- a/R-package/man/xgb.train.Rd
+++ b/R-package/man/xgb.train.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgb.train.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgb.train}
\alias{xgb.train}
\title{eXtreme Gradient Boosting Training}
diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd
index 21b1ad220..035eec9e7 100644
--- a/R-package/man/xgboost.Rd
+++ b/R-package/man/xgboost.Rd
@@ -1,5 +1,4 @@
-% Generated by roxygen2 (4.1.0): do not edit by hand
-% Please edit documentation in R/xgboost.R
+% Generated by roxygen2 (4.0.1): do not edit by hand
\name{xgboost}
\alias{xgboost}
\title{eXtreme Gradient Boosting (Tree) library}
@@ -11,9 +10,8 @@ xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
\item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or
\code{xgb.DMatrix}.}
-\item{label}{the response variable. User should not set this field,}
-
-\item{missing}{Missing is only used when input is dense matrix, pick a float}
+\item{label}{the response variable. User should not set this field,
+if data is local data file or \code{xgb.DMatrix}.}
\item{params}{the list of parameters. Commonly used ones are:
\itemize{
@@ -36,6 +34,9 @@ xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
information of performance. If 2, xgboost will print information of both
performance and construction progress information}
+\item{missing}{Missing is only used when input is dense matrix, pick a float
+value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.}
+
\item{...}{other parameters to pass to \code{params}.}
}
\description{
diff --git a/R-package/src/Makevars b/R-package/src/Makevars
index 44dce490e..cc933f099 100644
--- a/R-package/src/Makevars
+++ b/R-package/src/Makevars
@@ -1,9 +1,7 @@
# package root
PKGROOT=../../
# _*_ mode: Makefile; _*_
-PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -I$(PKGROOT)
+PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT)
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
-OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o
-
-
+OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o
diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win
index 289f1a15a..0f7bc06ec 100644
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -1,7 +1,19 @@
# package root
-PKGROOT=../../
+PKGROOT=./
# _*_ mode: Makefile; _*_
-PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -I$(PKGROOT)
+
+# This file is only used for windows compilation from github
+# It will be replaced by Makevars in CRAN version
+.PHONY: all xgblib
+all: $(SHLIB)
+$(SHLIB): xgblib
+xgblib:
+ cp -r ../../src .
+ cp -r ../../wrapper .
+ cp -r ../../subtree .
+
+PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT) -I../..
PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
-OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o
+OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o
+$(OBJECTS) : xgblib
diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp
index 9320547df..b4757542d 100644
--- a/R-package/src/xgboost_R.cpp
+++ b/R-package/src/xgboost_R.cpp
@@ -3,10 +3,12 @@
#include
#include
#include
-#include "xgboost_R.h"
+#include
#include "wrapper/xgboost_wrapper.h"
#include "src/utils/utils.h"
#include "src/utils/omp.h"
+#include "xgboost_R.h"
+
using namespace std;
using namespace xgboost;
@@ -246,12 +248,12 @@ extern "C" {
asInteger(iter),
BeginPtr(vec_dmats), BeginPtr(vec_sptr), len));
}
- SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin, SEXP ntree_limit) {
+ SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask, SEXP ntree_limit) {
_WrapperBegin();
bst_ulong olen;
const float *res = XGBoosterPredict(R_ExternalPtrAddr(handle),
R_ExternalPtrAddr(dmat),
- asInteger(output_margin),
+ asInteger(option_mask),
asInteger(ntree_limit),
&olen);
SEXP ret = PROTECT(allocVector(REALSXP, olen));
@@ -280,14 +282,13 @@ extern "C" {
asInteger(with_stats),
&olen);
SEXP out = PROTECT(allocVector(STRSXP, olen));
- char buffer [2000];
for (size_t i = 0; i < olen; ++i) {
- memset(buffer, 0, sizeof buffer);
-      sprintf (buffer, "booster[%u]:\n%s", static_cast<unsigned>(i), res[i]);
- SET_STRING_ELT(out, i, mkChar(buffer));
+ stringstream stream;
+ stream << "booster["<```
+ - mushroom-col-rabit.sh starts xgboost job using rabit's allreduce
+* run ```bash mushroom-col-rabit-mock.sh ```
+ - mushroom-col-rabit-mock.sh starts xgboost job using rabit's allreduce, inserts suicide signal at certain point and test recovery
+
+How to Use
+====
+* First split the data by column
+* In the config, specify the data file with a wildcard %d, where %d will be replaced by the rank of the node; each node then loads its own part of the data (see the sketch after the notes below)
+* Enable column split mode by ```dsplit=col```
+
+Notes
+====
+* The code is multi-threaded, so you will want to run one process per node
+* The code will work correctly as long as the union of the column subsets covers all the columns we are interested in.
+  - The column subsets can overlap with each other.
+* It uses exactly the same algorithm as the single-node version and examines all potential split points.
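The %d wildcard rule above can be made concrete with a minimal sketch (ours, not part of the patch). It assumes the process rank comes from rabit's `rabit::GetRank()`, which the tracker scripts used by these examples provide; the buffer size and file naming are illustrative.

```cpp
// Illustrative only: resolve the %d wildcard in the configured data path
// with this process's rank, so e.g. rank 2 opens "train.col2".
#include <cstdio>
#include <rabit.h>

int main(int argc, char *argv[]) {
  rabit::Init(argc, argv);
  char fname[256];
  std::snprintf(fname, sizeof(fname), "train.col%d", rabit::GetRank());
  std::printf("rank %d loads %s\n", rabit::GetRank(), fname);
  rabit::Finalize();
  return 0;
}
```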
diff --git a/multi-node/col-split/mushroom-col-rabit-mock.sh b/multi-node/col-split/mushroom-col-rabit-mock.sh
new file mode 100755
index 000000000..b4208f04c
--- /dev/null
+++ b/multi-node/col-split/mushroom-col-rabit-mock.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: <nprocess>"
+    exit -1
+fi
+
+#
+# This script is the same as mushroom-col except that it uses xgboost instead of xgboost-mpi
+# xgboost uses the built-in TCP-based allreduce module, and can run in more environments, so long as we know how to start the job by modifying ../submit_job_tcp.py
+#
+rm -rf train.col* *.model
+k=$1
+
+# split the libsvm file into k subfiles
+python splitsvm.py ../../demo/data/agaricus.txt.train train $k
+
+# run xgboost with rabit's mock engine to simulate failures
+../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost.mock mushroom-col.conf dsplit=col mock=0,2,0,0 mock=1,2,0,0 mock=2,2,8,0 mock=2,3,0,0
+
+# the model can be directly loaded by single machine xgboost solver, as usual
+#../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
+
+
+#cat dump.nice.$k.txt
diff --git a/multi-node/col-split/mushroom-col-rabit.sh b/multi-node/col-split/mushroom-col-rabit.sh
new file mode 100755
index 000000000..77e0c904c
--- /dev/null
+++ b/multi-node/col-split/mushroom-col-rabit.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: <nprocess>"
+    exit -1
+fi
+
+#
+# This script is the same as mushroom-col except that it uses xgboost instead of xgboost-mpi
+# xgboost uses the built-in TCP-based allreduce module, and can run in more environments, so long as we know how to start the job by modifying ../submit_job_tcp.py
+#
+rm -rf train.col* *.model
+k=$1
+
+# split the libsvm file into k subfiles
+python splitsvm.py ../../demo/data/agaricus.txt.train train $k
+
+# run xgboost using the rabit tracker
+../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf dsplit=col
+
+# the model can be directly loaded by single machine xgboost solver, as usual
+../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
+
+# run for one round, and continue training
+../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf dsplit=col num_round=1
+../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf dsplit=col model_in=0001.model
+
+cat dump.nice.$k.txt
diff --git a/multi-node/col-split/mushroom-col.conf b/multi-node/col-split/mushroom-col.conf
new file mode 100644
index 000000000..2c779a44d
--- /dev/null
+++ b/multi-node/col-split/mushroom-col.conf
@@ -0,0 +1,35 @@
+# General Parameters, see comment for each definition
+# choose the booster, can be gbtree or gblinear
+booster = gbtree
+# choose logistic regression loss function for binary classification
+objective = binary:logistic
+
+# Tree Booster Parameters
+# step size shrinkage
+eta = 1.0
+# minimum loss reduction required to make a further partition
+gamma = 1.0
+# minimum sum of instance weight(hessian) needed in a child
+min_child_weight = 1
+# maximum depth of a tree
+max_depth = 3
+
+# Task Parameters
+# the number of rounds to do boosting
+num_round = 2
+# 0 means do not save any model except the final round model
+save_period = 0
+use_buffer = 0
+
+# The path of training data; %d is a wildcard replaced by the rank of the process
+# The idea is that each process takes a feature matrix with a subset of columns
+#
+data = "train.col%d"
+
+# The path of validation data, used to monitor the training process; here [test] sets the name of the validation set
+eval[test] = "../../demo/data/agaricus.txt.test"
+# evaluate on training data as well each round
+eval_train = 1
+
+# The path of test data; prediction needs the full test data, so try not to use it, or keep a subsampled version
+test:data = "../../demo/data/agaricus.txt.test"
diff --git a/multi-node/col-split/splitsvm.py b/multi-node/col-split/splitsvm.py
new file mode 100644
index 000000000..365aef610
--- /dev/null
+++ b/multi-node/col-split/splitsvm.py
@@ -0,0 +1,32 @@
+#!/usr/bin/python
+import sys
+import random
+
+# split libsvm file into different subcolumns
+if len(sys.argv) < 4:
+    print ('Usage: <input> <output> k')
+    exit(0)
+
+random.seed(10)
+fmap = {}
+
+k = int(sys.argv[3])
+fi = open( sys.argv[1], 'r' )
+fos = []
+
+for i in range(k):
+ fos.append(open( sys.argv[2]+'.col%d' % i, 'w' ))
+
+for l in open(sys.argv[1]):
+ arr = l.split()
+ for f in fos:
+ f.write(arr[0])
+ for it in arr[1:]:
+ fid = int(it.split(':')[0])
+ if fid not in fmap:
+ fmap[fid] = random.randint(0, k-1)
+ fos[fmap[fid]].write(' '+it)
+ for f in fos:
+ f.write('\n')
+for f in fos:
+ f.close()
diff --git a/multi-node/hadoop/README.md b/multi-node/hadoop/README.md
new file mode 100644
index 000000000..d1dde8ba3
--- /dev/null
+++ b/multi-node/hadoop/README.md
@@ -0,0 +1,43 @@
+Distributed XGBoost: Hadoop Version
+====
+* The scripts in this folder show an example of how to run distributed xgboost on the hadoop platform.
+* It relies on [Rabit Library](https://github.com/tqchen/rabit) (Reliable Allreduce and Broadcast Interface) and Hadoop Streaming. Rabit provides an interface to aggregate gradient values and split statistics, which allows xgboost to run reliably on hadoop. You do not need to care about how the model is updated in each iteration; just use the script ```rabit_hadoop.py```. For those who want to know how it exactly works, please refer to the main page of [Rabit](https://github.com/tqchen/rabit).
+* Quick start: run ```bash run_mushroom.sh <nworkers> <nthreads> <path_in_HDFS>```
+  - This is the hadoop version of the binary classification example in the demo folder.
+  - More information on the usage of xgboost can be found on the [wiki page](https://github.com/tqchen/xgboost/wiki)
+
+Before you run the script
+====
+* Make sure you have set up the hadoop environment.
+* If you want to only use single machine multi-threading, try single machine examples in the [demo folder](../../demo).
+* Build: run ```bash build.sh``` in the root folder; it will automatically download rabit and build xgboost.
+* Check whether the environment variable $HADOOP_HOME exists (e.g. run ```echo $HADOOP_HOME```). If not, please set up hadoop-streaming.jar path in rabit_hadoop.py.
+
+How to Use
+====
+* Input data format: LIBSVM format. The example here uses generated data in demo/data folder.
+* Put the training data in HDFS (hadoop distributed file system).
+* Use the script ```rabit_hadoop.py``` to submit a training task to hadoop and save the final model file.
+* Get the final model file from HDFS, then do prediction as well as visualization of the model locally.
+
+Single machine vs Hadoop version
+====
+If you have used xgboost (single machine version) before, this section will show you how to run xgboost on hadoop with a slight modification of the conf file.
+* The Hadoop version requires you to specify up front how many slave nodes/machines/workers you would like to use.
+* IO: instead of reading and writing files locally, the hadoop version uses "stdin" to read the training file and "stdout" to store the final model file. Therefore, you should change the parameters "data" and "model_out" in the conf file to ```data=stdin``` and ```model_out=stdout```.
+* File cache: ```rabit_hadoop.py``` also provides several ways to cache necessary files, including the binary file (xgboost), the conf file, small datasets used for evaluation during training, and so on.
+  - Any file used in the config file, excluding stdin, should be cached in the script. ```rabit_hadoop.py``` will automatically cache files in the command line. For example, ```rabit_hadoop.py -n 3 -i $hdfsPath/agaricus.txt.train -o $hdfsPath/mushroom.final.model $localPath/xgboost mushroom.hadoop.conf``` will cache "xgboost" and "mushroom.hadoop.conf".
+  - You could also use "-f" to manually cache one or more files, like ```-f file1 -f file2``` or ```-f file1#file2``` (use "#" to split file names).
+  - The local path of cached files in the command is "./".
+  - Since the cached files will be packaged and delivered to hadoop slave nodes, they should not be large. For instance, trying to cache files of GB size may reduce performance.
+* The hadoop version also supports evaluating each training round. You just need to set the parameter "eval_train".
+* More details of job submission can be found in the usage of ```rabit_hadoop.py```.
+* The model saved by the hadoop version is compatible with the single machine version.
+
+Notes
+====
+* The code has been tested on MapReduce 1 (MRv1) and YARN.
+  - We recommend running it on MapReduce 2 (MRv2, YARN) so that multi-threading can be enabled.
+* The code is optimized with multi-threading, so you will want to run one xgboost per node/worker for best performance.
+  - You will want to set <nthreads> to be the number of cores you have on each machine.
+  - You will need YARN to specify the number of cores for each worker.
diff --git a/multi-node/hadoop/mushroom.hadoop.conf b/multi-node/hadoop/mushroom.hadoop.conf
new file mode 100644
index 000000000..a4e885d54
--- /dev/null
+++ b/multi-node/hadoop/mushroom.hadoop.conf
@@ -0,0 +1,39 @@
+# General Parameters, see comment for each definition
+# choose the booster, can be gbtree or gblinear
+booster = gbtree
+# choose logistic regression loss function for binary classification
+objective = binary:logistic
+
+# Tree Booster Parameters
+# step size shrinkage
+eta = 1.0
+# minimum loss reduction required to make a further partition
+gamma = 1.0
+# minimum sum of instance weight(hessian) needed in a child
+min_child_weight = 1
+# maximum depth of a tree
+max_depth = 3
+
+# Task Parameters
+# the number of rounds to do boosting
+num_round = 2
+# 0 means do not save any model except the final round model
+save_period = 0
+# evaluate on training data as well each round
+# eval_train = 1
+# The path of validation data, used to monitor the training process; here [test] sets the name of the validation set
+# eval[test] = "agaricus.txt.test"
+
+# Please do not modify the following parameters
+# The path of training data
+data = stdin
+# The path of model file
+model_out = stdout
+# split pattern of xgboost
+dsplit = row
+
+# evaluate on training data as well each round
+eval_train = 1
diff --git a/multi-node/hadoop/run_mushroom.sh b/multi-node/hadoop/run_mushroom.sh
new file mode 100755
index 000000000..9cb73ec25
--- /dev/null
+++ b/multi-node/hadoop/run_mushroom.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+if [ "$#" -lt 3 ];
+then
+ echo "Usage: "
+ exit -1
+fi
+
+# put the local training file to HDFS
+hadoop fs -mkdir $3/data
+hadoop fs -put ../../demo/data/agaricus.txt.train $3/data
+
+../../subtree/rabit/tracker/rabit_hadoop.py -n $1 -nt $2 -i $3/data/agaricus.txt.train -o $3/mushroom.final.model ../../xgboost mushroom.hadoop.conf nthread=$2
+
+# get the final model file
+hadoop fs -get $3/mushroom.final.model/part-00000 ./final.model
+
+# output prediction task=pred
+../../xgboost mushroom.hadoop.conf task=pred model_in=final.model test:data=../../demo/data/agaricus.txt.test
+# print the boosters of final.model in dump.raw.txt
+../../xgboost mushroom.hadoop.conf task=dump model_in=final.model name_dump=dump.raw.txt
+# use the feature map in printing for better visualization
+../../xgboost mushroom.hadoop.conf task=dump model_in=final.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.txt
+cat dump.nice.txt
diff --git a/multi-node/row-split/README.md b/multi-node/row-split/README.md
new file mode 100644
index 000000000..30e2528d3
--- /dev/null
+++ b/multi-node/row-split/README.md
@@ -0,0 +1,18 @@
+Distributed XGBoost: Row Split Version
+====
+* You might be interested in checking out the [Hadoop example](../hadoop)
+* Machine Rabit: run ```bash machine-row-rabit.sh <nprocess>```
+  - machine-row-rabit.sh starts an xgboost job using rabit
+
+How to Use
+====
+* First split the data by rows
+* In the config, specify the data file with a wildcard %d, where %d will be replaced by the rank of the node; each node then loads its own part of the data
+* Enable row split mode by ```dsplit=row```
+
+Notes
+====
+* The code is multi-threaded, so you will want to run one process per node
+* The row-based solver splits data by row; each node works on a subset of rows. It uses an approximate histogram counting algorithm,
+  and will only examine a subset of the potential split points, as opposed to all split points.
+
diff --git a/multi-node/row-split/machine-row-rabit-mock.sh b/multi-node/row-split/machine-row-rabit-mock.sh
new file mode 100755
index 000000000..ed1178dc9
--- /dev/null
+++ b/multi-node/row-split/machine-row-rabit-mock.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: <nprocess>"
+    exit -1
+fi
+
+rm -rf train-machine.row* *.model
+k=$1
+# make machine data
+cd ../../demo/regression/
+python mapfeat.py
+python mknfold.py machine.txt 1
+cd -
+
+# split the libsvm file into k subfiles
+python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
+
+# run xgboost with rabit's mock engine to simulate failures
+../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost.mock machine-row.conf dsplit=row num_round=3 mock=1,1,1,0 mock=0,0,3,0 mock=2,2,3,0
diff --git a/multi-node/row-split/machine-row-rabit.sh b/multi-node/row-split/machine-row-rabit.sh
new file mode 100755
index 000000000..fb3e3ba60
--- /dev/null
+++ b/multi-node/row-split/machine-row-rabit.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+if [[ $# -ne 1 ]]
+then
+    echo "Usage: <nprocess>"
+    exit -1
+fi
+
+rm -rf train-machine.row* *.model
+k=$1
+# make machine data
+cd ../../demo/regression/
+python mapfeat.py
+python mknfold.py machine.txt 1
+cd -
+
+# split the libsvm file into k subfiles
+python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
+
+# run xgboost using the rabit tracker
+../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=3 eval_train=1
+
+# run xgboost to save model 0001, then continue training from the existing model
+../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=1
+../../subtree/rabit/tracker/rabit_demo.py -n $k ../../xgboost machine-row.conf dsplit=row num_round=2 model_in=0001.model
diff --git a/multi-node/row-split/machine-row.conf b/multi-node/row-split/machine-row.conf
new file mode 100644
index 000000000..c0cba3da8
--- /dev/null
+++ b/multi-node/row-split/machine-row.conf
@@ -0,0 +1,30 @@
+# General Parameters, see comment for each definition
+# choose the tree booster, can also change to gblinear
+booster = gbtree
+# this is the only difference with classification, use reg:linear to do linear regression
+# when labels are in [0,1] we can also use reg:logistic
+objective = reg:linear
+
+# Tree Booster Parameters
+# step size shrinkage
+eta = 1.0
+# minimum loss reduction required to make a further partition
+gamma = 1.0
+# minimum sum of instance weight(hessian) needed in a child
+min_child_weight = 1
+# maximum depth of a tree
+max_depth = 3
+# Task parameters
+# the number of rounds to do boosting
+num_round = 2
+# 0 means do not save any model except the final round model
+save_period = 0
+use_buffer = 0
+
+# The path of training data
+data = "train-machine.row%d"
+# The path of validation data, used to monitor the training process; here [test] sets the name of the validation set
+eval[test] = "../../demo/regression/machine.txt.test"
+# The path of test data
+test:data = "../../demo/regression/machine.txt.test"
+
diff --git a/multi-node/row-split/splitrows.py b/multi-node/row-split/splitrows.py
new file mode 100644
index 000000000..2e9d1184d
--- /dev/null
+++ b/multi-node/row-split/splitrows.py
@@ -0,0 +1,24 @@
+#!/usr/bin/python
+import sys
+import random
+
+# randomly split rows of a libsvm file into k subfiles
+if len(sys.argv) < 4:
+    print ('Usage: <input> <output> k')
+ exit(0)
+
+random.seed(10)
+
+k = int(sys.argv[3])
+fi = open( sys.argv[1], 'r' )
+fos = []
+
+for i in range(k):
+ fos.append(open( sys.argv[2]+'.row%d' % i, 'w' ))
+
+for l in open(sys.argv[1]):
+ i = random.randint(0, k-1)
+ fos[i].write(l)
+
+for f in fos:
+ f.close()
diff --git a/src/data.h b/src/data.h
index 2ea5f222a..162a31bfe 100644
--- a/src/data.h
+++ b/src/data.h
@@ -138,9 +138,10 @@ class IFMatrix {
  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) = 0;
/*!
* \brief check if column access is supported, if not, initialize column access
+ * \param enabled whether certain feature should be included in column access
* \param subsample subsample ratio when generating column access
*/
- virtual void InitColAccess(float subsample) = 0;
+  virtual void InitColAccess(const std::vector<bool> &enabled, float subsample) = 0;
// the following are column meta data, should be able to answer them fast
/*! \return whether column access is enabled */
virtual bool HaveColAccess(void) const = 0;
diff --git a/src/gbm/gblinear-inl.hpp b/src/gbm/gblinear-inl.hpp
index 473914b6e..de9ee6173 100644
--- a/src/gbm/gblinear-inl.hpp
+++ b/src/gbm/gblinear-inl.hpp
@@ -33,16 +33,17 @@ class GBLinear : public IGradBooster {
model.param.SetParam(name, val);
}
}
- virtual void LoadModel(utils::IStream &fi) {
+ virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) {
model.LoadModel(fi);
}
- virtual void SaveModel(utils::IStream &fo) const {
+ virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const {
model.SaveModel(fo);
}
virtual void InitModel(void) {
model.InitModel();
}
virtual void DoBoost(IFMatrix *p_fmat,
+ int64_t buffer_offset,
const BoosterInfo &info,
                       std::vector<bst_gpair> *in_gpair) {
    std::vector<bst_gpair> &gpair = *in_gpair;
@@ -135,8 +136,22 @@ class GBLinear : public IGradBooster {
}
}
}
-
-  virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
+  virtual void Predict(const SparseBatch::Inst &inst,
+                       std::vector<float> *out_preds,
+                       unsigned ntree_limit,
+                       unsigned root_index) {
+ const int ngroup = model.param.num_output_group;
+ for (int gid = 0; gid < ngroup; ++gid) {
+ this->Pred(inst, BeginPtr(*out_preds));
+ }
+ }
+  virtual void PredictLeaf(IFMatrix *p_fmat,
+                           const BoosterInfo &info,
+                           std::vector<float> *out_preds,
+                           unsigned ntree_limit = 0) {
+ utils::Error("gblinear does not support predict leaf index");
+ }
+  virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
std::stringstream fo("");
fo << "bias:\n";
for (int i = 0; i < model.param.num_output_group; ++i) {
@@ -151,8 +166,8 @@ class GBLinear : public IGradBooster {
    std::vector<std::string> v;
v.push_back(fo.str());
return v;
- }
-
+ }
+
protected:
inline void Pred(const RowBatch::Inst &inst, float *preds) {
for (int gid = 0; gid < model.param.num_output_group; ++gid) {
diff --git a/src/gbm/gbm.cpp b/src/gbm/gbm.cpp
index e280fdd4a..fe8d778e4 100644
--- a/src/gbm/gbm.cpp
+++ b/src/gbm/gbm.cpp
@@ -1,5 +1,6 @@
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
+#define NOMINMAX
 #include <cstring>
#include "./gbm.h"
#include "./gbtree-inl.hpp"
diff --git a/src/gbm/gbm.h b/src/gbm/gbm.h
index 07dade4ac..f07d277ac 100644
--- a/src/gbm/gbm.h
+++ b/src/gbm/gbm.h
@@ -27,25 +27,44 @@ class IGradBooster {
/*!
* \brief load model from stream
* \param fi input stream
+ * \param with_pbuffer whether the incoming data contains pbuffer
*/
- virtual void LoadModel(utils::IStream &fi) = 0;
+ virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) = 0;
/*!
* \brief save model to stream
* \param fo output stream
+ * \param with_pbuffer whether save out pbuffer
*/
- virtual void SaveModel(utils::IStream &fo) const = 0;
+ virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const = 0;
/*!
* \brief initialize the model
*/
virtual void InitModel(void) = 0;
+ /*!
+ * \brief reset the predict buffer
+ * this will invalidate all the previous cached results
+ * and recalculate from scratch
+ */
+ virtual void ResetPredBuffer(size_t num_pbuffer) {}
+ /*!
+ * \brief whether the model allow lazy checkpoint
+ * return true if model is only updated in DoBoost
+ * after all Allreduce calls
+ */
+ virtual bool AllowLazyCheckPoint(void) const {
+ return false;
+ }
/*!
* \brief peform update to the model(boosting)
* \param p_fmat feature matrix that provide access to features
+ * \param buffer_offset buffer index offset of these instances, if equals -1
+ * this means we do not have buffer index allocated to the gbm
* \param info meta information about training
* \param in_gpair address of the gradient pair statistics of the data
* the booster may change content of gpair
*/
virtual void DoBoost(IFMatrix *p_fmat,
+ int64_t buffer_offset,
const BoosterInfo &info,
                       std::vector<bst_gpair> *in_gpair) = 0;
/*!
@@ -64,7 +83,36 @@ class IGradBooster {
int64_t buffer_offset,
const BoosterInfo &info,
                       std::vector<float> *out_preds,
- unsigned ntree_limit = 0) = 0;
+ unsigned ntree_limit = 0) = 0;
+ /*!
+   * \brief online prediction function, predict score for one instance at a time
+ * NOTE: use the batch prediction interface if possible, batch prediction is usually
+ * more efficient than online prediction
+ * This function is NOT threadsafe, make sure you only call from one thread
+ *
+ * \param inst the instance you want to predict
+ * \param out_preds output vector to hold the predictions
+ * \param ntree_limit limit the number of trees used in prediction
+ * \param root_index the root index
+ * \sa Predict
+ */
+  virtual void Predict(const SparseBatch::Inst &inst,
+                       std::vector<float> *out_preds,
+                       unsigned ntree_limit = 0,
+                       unsigned root_index = 0) = 0;
+ /*!
+ * \brief predict the leaf index of each tree, the output will be nsample * ntree vector
+ * this is only valid in gbtree predictor
+ * \param p_fmat feature matrix
+ * \param info extra side information that may be needed for prediction
+ * \param out_preds output vector to hold the predictions
+ * \param ntree_limit limit the number of trees used in prediction, when it equals 0, this means
+ * we do not limit number of trees, this parameter is only valid for gbtree, but not for gblinear
+ */
+  virtual void PredictLeaf(IFMatrix *p_fmat,
+                           const BoosterInfo &info,
+                           std::vector<float> *out_preds,
+                           unsigned ntree_limit = 0) = 0;
/*!
* \brief dump the model in text format
* \param fmap feature map that may help give interpretations of feature
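To see how the declarations above fit together, here is a hedged usage sketch; `DemoPredict` and its arguments are hypothetical helpers, and only the interfaces declared in this header are assumed.

```cpp
#include <vector>
#include "src/gbm/gbm.h"

// Hypothetical helper (not in the patch): drive the new prediction APIs
// on an already-constructed booster.
void DemoPredict(xgboost::gbm::IGradBooster *gbm,
                 const xgboost::SparseBatch::Inst &inst,
                 xgboost::IFMatrix *fmat,
                 const xgboost::BoosterInfo &info) {
  std::vector<float> preds;
  gbm->Predict(inst, &preds);                 // online single-instance path
  std::vector<float> leaf_index;
  gbm->PredictLeaf(fmat, info, &leaf_index);  // nsample * ntree leaf ids
}
```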
diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp
index ed52afa7d..66b03dd87 100644
--- a/src/gbm/gbtree-inl.hpp
+++ b/src/gbm/gbtree-inl.hpp
@@ -19,6 +19,8 @@ namespace gbm {
*/
class GBTree : public IGradBooster {
public:
+ GBTree(void) {
+ }
virtual ~GBTree(void) {
this->Clear();
}
@@ -37,7 +39,7 @@ class GBTree : public IGradBooster {
tparam.SetParam(name, val);
if (trees.size() == 0) mparam.SetParam(name, val);
}
- virtual void LoadModel(utils::IStream &fi) {
+ virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) {
this->Clear();
utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
"GBTree: invalid model file");
@@ -51,7 +53,7 @@ class GBTree : public IGradBooster {
utils::Check(fi.Read(&tree_info[0], sizeof(int) * mparam.num_trees) != 0,
"GBTree: invalid model file");
}
- if (mparam.num_pbuffer != 0) {
+ if (mparam.num_pbuffer != 0 && with_pbuffer) {
pred_buffer.resize(mparam.PredBufferSize());
pred_counter.resize(mparam.PredBufferSize());
utils::Check(fi.Read(&pred_buffer[0], pred_buffer.size() * sizeof(float)) != 0,
@@ -60,7 +62,7 @@ class GBTree : public IGradBooster {
"GBTree: invalid model file");
}
}
- virtual void SaveModel(utils::IStream &fo) const {
+ virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const {
    utils::Assert(mparam.num_trees == static_cast<int>(trees.size()), "GBTree");
fo.Write(&mparam, sizeof(ModelParam));
for (size_t i = 0; i < trees.size(); ++i) {
@@ -69,7 +71,7 @@ class GBTree : public IGradBooster {
if (tree_info.size() != 0) {
fo.Write(&tree_info[0], sizeof(int) * tree_info.size());
}
- if (mparam.num_pbuffer != 0) {
+ if (mparam.num_pbuffer != 0 && with_pbuffer) {
fo.Write(&pred_buffer[0], pred_buffer.size() * sizeof(float));
fo.Write(&pred_counter[0], pred_counter.size() * sizeof(unsigned));
}
@@ -82,12 +84,23 @@ class GBTree : public IGradBooster {
utils::Assert(mparam.num_trees == 0, "GBTree: model already initialized");
utils::Assert(trees.size() == 0, "GBTree: model already initialized");
}
+ virtual void ResetPredBuffer(size_t num_pbuffer) {
+    mparam.num_pbuffer = static_cast<int64_t>(num_pbuffer);
+ pred_buffer.clear(); pred_counter.clear();
+ pred_buffer.resize(mparam.PredBufferSize(), 0.0f);
+ pred_counter.resize(mparam.PredBufferSize(), 0);
+ }
+ virtual bool AllowLazyCheckPoint(void) const {
+ return !(tparam.distcol_mode != 0 && mparam.num_output_group != 1);
+ }
virtual void DoBoost(IFMatrix *p_fmat,
+ int64_t buffer_offset,
const BoosterInfo &info,
                       std::vector<bst_gpair> *in_gpair) {
    const std::vector<bst_gpair> &gpair = *in_gpair;
-    if (mparam.num_output_group == 1) {
-      this->BoostNewTrees(gpair, p_fmat, info, 0);
+    std::vector<std::vector<tree::RegTree*> > new_trees;
+    if (mparam.num_output_group == 1) {
+      new_trees.push_back(BoostNewTrees(gpair, p_fmat, buffer_offset, info, 0));
} else {
const int ngroup = mparam.num_output_group;
utils::Check(gpair.size() % ngroup == 0,
@@ -99,15 +112,18 @@ class GBTree : public IGradBooster {
for (bst_omp_uint i = 0; i < nsize; ++i) {
tmp[i] = gpair[i * ngroup + gid];
}
- this->BoostNewTrees(tmp, p_fmat, info, gid);
+ new_trees.push_back(BoostNewTrees(tmp, p_fmat, buffer_offset, info, gid));
}
}
+ for (int gid = 0; gid < mparam.num_output_group; ++gid) {
+ this->CommitModel(new_trees[gid], gid);
+ }
}
virtual void Predict(IFMatrix *p_fmat,
int64_t buffer_offset,
const BoosterInfo &info,
                       std::vector<float> *out_preds,
- unsigned ntree_limit = 0) {
+ unsigned ntree_limit = 0) {
int nthread;
#pragma omp parallel
{
@@ -117,7 +133,6 @@ class GBTree : public IGradBooster {
for (int i = 0; i < nthread; ++i) {
thread_temp[i].Init(mparam.num_feature);
}
-
    std::vector<float> &preds = *out_preds;
const size_t stride = info.num_row * mparam.num_output_group;
preds.resize(stride * (mparam.size_leaf_vector+1));
@@ -144,6 +159,38 @@ class GBTree : public IGradBooster {
}
}
}
+ }
+  virtual void Predict(const SparseBatch::Inst &inst,
+                       std::vector<float> *out_preds,
+                       unsigned ntree_limit,
+                       unsigned root_index) {
+ if (thread_temp.size() == 0) {
+ thread_temp.resize(1, tree::RegTree::FVec());
+ thread_temp[0].Init(mparam.num_feature);
+ }
+ out_preds->resize(mparam.num_output_group * (mparam.size_leaf_vector+1));
+ // loop over output groups
+ for (int gid = 0; gid < mparam.num_output_group; ++gid) {
+ this->Pred(inst, -1, gid, root_index, &thread_temp[0],
+ &(*out_preds)[gid], mparam.num_output_group,
+ ntree_limit);
+ }
+ }
+  virtual void PredictLeaf(IFMatrix *p_fmat,
+                           const BoosterInfo &info,
+                           std::vector<float> *out_preds,
+                           unsigned ntree_limit) {
+ int nthread;
+ #pragma omp parallel
+ {
+ nthread = omp_get_num_threads();
+ }
+ thread_temp.resize(nthread, tree::RegTree::FVec());
+ for (int i = 0; i < nthread; ++i) {
+ thread_temp[i].Init(mparam.num_feature);
+ }
+ this->PredPath(p_fmat, info, out_preds, ntree_limit);
+
}
  virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
    std::vector<std::string> dump;
@@ -184,13 +231,15 @@ class GBTree : public IGradBooster {
tparam.updater_initialized = 1;
}
// do group specific group
-  inline void BoostNewTrees(const std::vector<bst_gpair> &gpair,
-                            IFMatrix *p_fmat,
-                            const BoosterInfo &info,
-                            int bst_group) {
+  inline std::vector<tree::RegTree*>
+  BoostNewTrees(const std::vector<bst_gpair> &gpair,
+                IFMatrix *p_fmat,
+                int64_t buffer_offset,
+                const BoosterInfo &info,
+                int bst_group) {
+    std::vector<tree::RegTree*> new_trees;
this->InitUpdater();
// create the trees
-    std::vector<tree::RegTree*> new_trees;
for (int i = 0; i < tparam.num_parallel_tree; ++i) {
new_trees.push_back(new tree::RegTree());
for (size_t j = 0; j < cfg.size(); ++j) {
@@ -201,13 +250,52 @@ class GBTree : public IGradBooster {
// update the trees
for (size_t i = 0; i < updaters.size(); ++i) {
updaters[i]->Update(gpair, p_fmat, info, new_trees);
+ }
+ // optimization, update buffer, if possible
+ // this is only under distributed column mode
+ // for safety check of lazy checkpoint
+ if (
+ buffer_offset >= 0 &&
+ new_trees.size() == 1 && updaters.size() > 0 &&
+ updaters.back()->GetLeafPosition() != NULL) {
+ utils::Check(info.num_row == p_fmat->buffered_rowset().size(),
+ "distributed mode is not compatible with prob_buffer_row");
+ this->UpdateBufferByPosition(p_fmat,
+ buffer_offset, bst_group,
+ *new_trees[0],
+ updaters.back()->GetLeafPosition());
}
- // push back to model
+ return new_trees;
+ }
+ // commit new trees all at once
+  inline void CommitModel(const std::vector<tree::RegTree*> &new_trees, int bst_group) {
for (size_t i = 0; i < new_trees.size(); ++i) {
trees.push_back(new_trees[i]);
tree_info.push_back(bst_group);
}
- mparam.num_trees += tparam.num_parallel_tree;
+    mparam.num_trees += static_cast<int>(new_trees.size());
+ }
+ // update buffer by pre-cached position
+ inline void UpdateBufferByPosition(IFMatrix *p_fmat,
+ int64_t buffer_offset,
+ int bst_group,
+ const tree::RegTree &new_tree,
+ const int* leaf_position) {
+    const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+ #pragma omp parallel for schedule(static)
+ for (bst_omp_uint i = 0; i < ndata; ++i) {
+ const bst_uint ridx = rowset[i];
+ const int64_t bid = mparam.BufferOffset(buffer_offset + ridx, bst_group);
+ const int tid = leaf_position[ridx];
+ utils::Assert(pred_counter[bid] == trees.size(), "cached buffer not up to date");
+ utils::Assert(tid >= 0, "invalid leaf position");
+ pred_buffer[bid] += new_tree[tid].leaf_value();
+      for (int j = 0; j < mparam.size_leaf_vector; ++j) {
+        pred_buffer[bid + j + 1] += new_tree.leafvec(tid)[j];
+ }
+ pred_counter[bid] += tparam.num_parallel_tree;
+ }
}
// make a prediction for a single instance
inline void Pred(const RowBatch::Inst &inst,
@@ -215,7 +303,8 @@ class GBTree : public IGradBooster {
int bst_group,
unsigned root_index,
tree::RegTree::FVec *p_feats,
- float *out_pred, size_t stride, unsigned ntree_limit) {
+ float *out_pred, size_t stride,
+ unsigned ntree_limit) {
size_t itop = 0;
float psum = 0.0f;
// sum of leaf vector
@@ -258,6 +347,39 @@ class GBTree : public IGradBooster {
out_pred[stride * (i + 1)] = vec_psum[i];
}
}
+ // predict independent leaf index
+  inline void PredPath(IFMatrix *p_fmat,
+                       const BoosterInfo &info,
+                       std::vector<float> *out_preds,
+                       unsigned ntree_limit) {
+ // number of valid trees
+ if (ntree_limit == 0 || ntree_limit > trees.size()) {
+      ntree_limit = static_cast<unsigned>(trees.size());
+ }
+    std::vector<float> &preds = *out_preds;
+ preds.resize(info.num_row * ntree_limit);
+ // start collecting the prediction
+    utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
+ iter->BeforeFirst();
+ while (iter->Next()) {
+ const RowBatch &batch = iter->Value();
+ // parallel over local batch
+      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+ #pragma omp parallel for schedule(static)
+ for (bst_omp_uint i = 0; i < nsize; ++i) {
+        const int tid = omp_get_thread_num();
+        int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
+        tree::RegTree::FVec &feats = thread_temp[tid];
+        feats.Fill(batch[i]);
+        for (unsigned j = 0; j < ntree_limit; ++j) {
+          int leaf_id = trees[j]->GetLeafIndex(feats, info.GetRoot(ridx));
+          preds[ridx * ntree_limit + j] = static_cast<float>(leaf_id);
+ }
+ feats.Drop(batch[i]);
+ }
+ }
+ }
+
// --- data structure ---
/*! \brief training parameters */
struct TrainParam {
@@ -270,6 +392,8 @@ class GBTree : public IGradBooster {
int num_parallel_tree;
/*! \brief whether updater is already initialized */
int updater_initialized;
+ /*! \brief distributed column mode */
+ int distcol_mode;
/*! \brief tree updater sequence */
std::string updater_seq;
// construction
@@ -278,6 +402,7 @@ class GBTree : public IGradBooster {
updater_seq = "grow_colmaker,prune";
num_parallel_tree = 1;
updater_initialized = 0;
+ distcol_mode = 0;
}
inline void SetParam(const char *name, const char *val){
using namespace std;
@@ -286,6 +411,9 @@ class GBTree : public IGradBooster {
updater_seq = val;
updater_initialized = 0;
}
+ if (!strcmp(name, "dsplit") && !strcmp(val, "col")) {
+ distcol_mode = 1;
+ }
if (!strcmp(name, "nthread")) {
omp_set_num_threads(nthread = atoi(val));
}
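Stepping back from the individual hunks: DoBoost now builds all trees first and commits them afterwards. The block below is our comment-level summary of that flow, not code from the patch, and it is what makes the AllowLazyCheckPoint contract hold.

```cpp
// Our reading of the new DoBoost control flow (illustrative only):
//   for each output group g:
//     new_trees[g] = BoostNewTrees(gpair_g, p_fmat, buffer_offset, info, g);
//     // all updater work, including any Allreduce traffic, happens here
//   for each output group g:
//     CommitModel(new_trees[g], g);  // trees enter the model only now
// Because the model mutates strictly after communication finishes,
// AllowLazyCheckPoint() lets rabit skip eagerly serializing the model.
```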
diff --git a/src/io/io.cpp b/src/io/io.cpp
index d251d7a96..0072618c6 100644
--- a/src/io/io.cpp
+++ b/src/io/io.cpp
@@ -1,15 +1,32 @@
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
+#define NOMINMAX
 #include <string>
#include "./io.h"
#include "../utils/io.h"
#include "../utils/utils.h"
#include "simple_dmatrix-inl.hpp"
+#include "page_dmatrix-inl.hpp"
+#include "page_fmatrix-inl.hpp"
+
// implements data loads using dmatrix simple for now
namespace xgboost {
namespace io {
DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
+ if (!strcmp(fname, "stdin")) {
+ DMatrixSimple *dmat = new DMatrixSimple();
+ dmat->LoadText(fname, silent);
+ return dmat;
+ }
+ std::string tmp_fname;
+ const char *fname_ext = NULL;
+ if (strchr(fname, ';') != NULL) {
+ tmp_fname = fname;
+ char *ptr = strchr(&tmp_fname[0], ';');
+ ptr[0] = '\0'; fname = &tmp_fname[0];
+ fname_ext = ptr + 1;
+ }
int magic;
utils::FileStream fs(utils::FopenCheck(fname, "rb"));
utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
@@ -20,7 +37,27 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
dmat->LoadBinary(fs, silent, fname);
fs.Close();
return dmat;
- }
+ }
+ if (magic == DMatrixPage::kMagic) {
+ if (fname_ext == NULL) {
+ DMatrixPage *dmat = new DMatrixPage();
+ dmat->Load(fs, silent, fname);
+ return dmat;
+ } else {
+ DMatrixColPage *dmat = new DMatrixColPage(fname_ext);
+ dmat->Load(fs, silent, fname, true);
+ return dmat;
+ }
+ }
+ if (magic == DMatrixColPage::kMagic) {
+ std::string sfname = fname;
+ if (fname_ext == NULL) {
+ sfname += ".col"; fname_ext = sfname.c_str();
+ }
+ DMatrixColPage *dmat = new DMatrixColPage(fname_ext);
+ dmat->Load(fs, silent, fname);
+ return dmat;
+ }
fs.Close();
DMatrixSimple *dmat = new DMatrixSimple();
@@ -29,11 +66,21 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
}
void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) {
+ if (!strcmp(fname + strlen(fname) - 5, ".page")) {
+ DMatrixPage::Save(fname, dmat, silent);
+ return;
+ }
+ if (!strcmp(fname + strlen(fname) - 6, ".cpage")) {
+ DMatrixColPage::Save(fname, dmat, silent);
+ return;
+ }
if (dmat.magic == DMatrixSimple::kMagic) {
    const DMatrixSimple *p_dmat = static_cast<const DMatrixSimple*>(&dmat);
p_dmat->SaveBinary(fname, silent);
} else {
- utils::Error("not implemented");
+ DMatrixSimple smat;
+ smat.CopyFrom(dmat);
+ smat.SaveBinary(fname, silent);
}
}
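The dispatch that LoadDataMatrix now performs can be summarized with a small usage sketch; the file names below are hypothetical, and the behaviors in the comments are read directly from the code above.

```cpp
#include "src/io/io.h"

// Hypothetical loader call (not in the patch). Dispatch as implemented above:
//   "stdin"              -> DMatrixSimple streamed from standard input
//   DMatrixPage magic    -> DMatrixPage (row pages on disk)
//   DMatrixColPage magic -> DMatrixColPage (uses a "<name>.col" page file)
//   "name;ext"           -> 'ext' names an external column page file
//   anything else        -> DMatrixSimple (text load with binary buffer)
int main(void) {
  xgboost::DataMatrix *dmat =
      xgboost::io::LoadDataMatrix("train.page;train.col", false, true);
  delete dmat;
  return 0;
}
```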
diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp
new file mode 100644
index 000000000..4f70ff2e9
--- /dev/null
+++ b/src/io/page_dmatrix-inl.hpp
@@ -0,0 +1,278 @@
+#ifndef XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
+#define XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
+/*!
+ * \file page_row_iter-inl.hpp
+ * row iterator based on sparse page
+ * \author Tianqi Chen
+ */
+#include <vector>
+#include "../data.h"
+#include "../utils/iterator.h"
+#include "../utils/thread_buffer.h"
+#include "./simple_fmatrix-inl.hpp"
+
+namespace xgboost {
+namespace io {
+/*! \brief page structure that can be used to store a rowbatch */
+struct RowBatchPage {
+ public:
+ explicit RowBatchPage(size_t page_size) : kPageSize(page_size) {
+ data_ = new int[kPageSize];
+ utils::Assert(data_ != NULL, "fail to allocate row batch page");
+ this->Clear();
+ }
+ ~RowBatchPage(void) {
+ if (data_ != NULL) delete [] data_;
+ }
+ /*!
+ * \brief Push one row into page
+ * \param row an instance row
+   * \return true if the row fits and was pushed, false if the page is full
+ */
+ inline bool PushRow(const RowBatch::Inst &row) {
+ const size_t dsize = row.length * sizeof(RowBatch::Entry);
+    if (FreeBytes() < dsize + sizeof(int)) return false;
+    row_ptr(Size() + 1) = row_ptr(Size()) + row.length;
+    memcpy(data_ptr(row_ptr(Size())), row.data, dsize);
+ ++data_[0];
+ return true;
+ }
+ /*!
+ * \brief get a row batch representation from the page
+ * \param p_rptr a temporal space that can be used to provide
+ * ind_ptr storage for RowBatch
+ * \return a new RowBatch object
+ */
+  inline RowBatch GetRowBatch(std::vector<size_t> *p_rptr, size_t base_rowid) {
+ RowBatch batch;
+ batch.base_rowid = base_rowid;
+ batch.data_ptr = this->data_ptr(0);
+    batch.size = static_cast<size_t>(this->Size());
+    std::vector<size_t> &rptr = *p_rptr;
+    rptr.resize(this->Size() + 1);
+    for (size_t i = 0; i < rptr.size(); ++i) {
+      rptr[i] = static_cast<size_t>(this->row_ptr(static_cast<int>(i)));
+ }
+ batch.ind_ptr = &rptr[0];
+ return batch;
+ }
+ /*! \brief get i-th row from the batch */
+ inline RowBatch::Inst operator[](int i) {
+    return RowBatch::Inst(data_ptr(0) + row_ptr(i),
+                          static_cast<bst_uint>(row_ptr(i+1) - row_ptr(i)));
+ }
+ /*!
+ * \brief clear the page, cleanup the content
+ */
+ inline void Clear(void) {
+ memset(&data_[0], 0, sizeof(int) * kPageSize);
+ }
+ /*!
+   * \brief load one page from input stream
+ * \return true if loading is successful
+ */
+ inline bool Load(utils::IStream &fi) {
+ return fi.Read(&data_[0], sizeof(int) * kPageSize) != 0;
+ }
+ /*! \brief save one page into outstream */
+ inline void Save(utils::IStream &fo) {
+ fo.Write(&data_[0], sizeof(int) * kPageSize);
+ }
+ /*! \return number of elements */
+ inline int Size(void) const {
+ return data_[0];
+ }
+
+ protected:
+  /*! \return number of free bytes left in the page */
+  inline size_t FreeBytes(void) {
+ return (kPageSize - (Size() + 2)) * sizeof(int) -
+ row_ptr(Size()) * sizeof(RowBatch::Entry);
+ }
+ /*! \brief equivalent row pointer at i */
+ inline int& row_ptr(int i) {
+ return data_[kPageSize - i - 1];
+ }
+ inline RowBatch::Entry* data_ptr(int i) {
+ return (RowBatch::Entry*)(&data_[1]) + i;
+ }
+ // content of data
+ int *data_;
+ // page size
+ const size_t kPageSize;
+};
+/*! \brief thread buffer iterator */
+class ThreadRowPageIterator: public utils::IIterator<RowBatch> {
+ public:
+ ThreadRowPageIterator(void) {
+ itr.SetParam("buffer_size", "2");
+ page_ = NULL;
+ base_rowid_ = 0;
+ }
+ virtual ~ThreadRowPageIterator(void) {}
+ virtual void Init(void) {
+ }
+ virtual void BeforeFirst(void) {
+ itr.BeforeFirst();
+ base_rowid_ = 0;
+ }
+ virtual bool Next(void) {
+ if (!itr.Next(page_)) return false;
+ out_ = page_->GetRowBatch(&tmp_ptr_, base_rowid_);
+ base_rowid_ += out_.size;
+ return true;
+ }
+ virtual const RowBatch &Value(void) const {
+ return out_;
+ }
+ /*! \brief load and initialize the iterator with fi */
+ inline void Load(const utils::FileStream &fi) {
+ itr.get_factory().SetFile(fi);
+ itr.Init();
+ this->BeforeFirst();
+ }
+ /*!
+ * \brief save a row iterator to output stream, in row iterator format
+ */
+  inline static void Save(utils::IIterator<RowBatch> *iter,
+                          utils::IStream &fo) {
+ RowBatchPage page(kPageSize);
+ iter->BeforeFirst();
+ while (iter->Next()) {
+ const RowBatch &batch = iter->Value();
+ for (size_t i = 0; i < batch.size; ++i) {
+ if (!page.PushRow(batch[i])) {
+ page.Save(fo);
+ page.Clear();
+ utils::Check(page.PushRow(batch[i]), "row is too big");
+ }
+ }
+ }
+ if (page.Size() != 0) page.Save(fo);
+ }
+ /*! \brief page size 64 MB */
+ static const size_t kPageSize = 64 << 18;
+
+ private:
+ // base row id
+ size_t base_rowid_;
+ // temporal ptr
+  std::vector<size_t> tmp_ptr_;
+ // output data
+ RowBatch out_;
+ // page pointer type
+ typedef RowBatchPage* PagePtr;
+ // loader factory for page
+ struct Factory {
+ public:
+ size_t file_begin_;
+ utils::FileStream fi;
+ Factory(void) {}
+ inline void SetFile(const utils::FileStream &fi) {
+ this->fi = fi;
+ file_begin_ = this->fi.Tell();
+ }
+ inline bool Init(void) {
+ return true;
+ }
+ inline void SetParam(const char *name, const char *val) {}
+ inline bool LoadNext(PagePtr &val) {
+ return val->Load(fi);
+ }
+ inline PagePtr Create(void) {
+ PagePtr a = new RowBatchPage(kPageSize);
+ return a;
+ }
+ inline void FreeSpace(PagePtr &a) {
+ delete a;
+ }
+ inline void Destroy(void) {
+ fi.Close();
+ }
+ inline void BeforeFirst(void) {
+ fi.Seek(file_begin_);
+ }
+ };
+
+ protected:
+ PagePtr page_;
+  utils::ThreadBuffer<PagePtr, Factory> itr;
+};
+
+/*! \brief data matrix using page */
+template<int TKMagic>
+class DMatrixPageBase : public DataMatrix {
+ public:
+ DMatrixPageBase(void) : DataMatrix(kMagic) {
+ iter_ = new ThreadRowPageIterator();
+ }
+ // virtual destructor
+ virtual ~DMatrixPageBase(void) {
+ // do not delete row iterator, since it is owned by fmat
+ // to be cleaned up in a more clear way
+ }
+ /*! \brief load and initialize the iterator with fi */
+ inline void Load(utils::FileStream &fi,
+ bool silent = false,
+ const char *fname = NULL,
+ bool skip_magic_check = false) {
+ int tmagic;
+ utils::Check(fi.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
+ if (!skip_magic_check) {
+ utils::Check(tmagic == magic, "invalid format,magic number mismatch");
+ }
+ this->info.LoadBinary(fi);
+ iter_->Load(fi);
+ if (!silent) {
+ utils::Printf("DMatrixPage: %lux%lu matrix is loaded",
+ static_cast(info.num_row()),
+ static_cast(info.num_col()));
+ if (fname != NULL) {
+ utils::Printf(" from %s\n", fname);
+ } else {
+ utils::Printf("\n");
+ }
+ if (info.group_ptr.size() != 0) {
+ utils::Printf("data contains %u groups\n", (unsigned)info.group_ptr.size() - 1);
+ }
+ }
+ }
+ /*! \brief save a DataMatrix as DMatrixPage*/
+ inline static void Save(const char* fname, const DataMatrix &mat, bool silent) {
+ utils::FileStream fs(utils::FopenCheck(fname, "wb"));
+ int magic = kMagic;
+ fs.Write(&magic, sizeof(magic));
+ mat.info.SaveBinary(fs);
+ ThreadRowPageIterator::Save(mat.fmat()->RowIterator(), fs);
+ fs.Close();
+ if (!silent) {
+ utils::Printf("DMatrixPage: %lux%lu is saved to %s\n",
+ static_cast(mat.info.num_row()),
+ static_cast(mat.info.num_col()), fname);
+ }
+ }
+ /*! \brief magic number used to identify DMatrix */
+ static const int kMagic = TKMagic;
+
+ protected:
+ /*! \brief row iterator */
+ ThreadRowPageIterator *iter_;
+};
+
+class DMatrixPage : public DMatrixPageBase<0xffffab02> {
+ public:
+ DMatrixPage(void) {
+ fmat_ = new FMatrixS(iter_);
+ }
+ virtual ~DMatrixPage(void) {
+ delete fmat_;
+ }
+ virtual IFMatrix *fmat(void) const {
+ return fmat_;
+ }
+ /*! \brief the real fmatrix */
+ IFMatrix *fmat_;
+};
+} // namespace io
+} // namespace xgboost
+#endif // XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_
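The RowBatchPage above packs two regions into a single int buffer; the annotated layout below is our reading of PushRow, row_ptr, and FreeBytes, not text from the patch.

```cpp
// Layout of RowBatchPage::data_ (kPageSize ints), as we read the code:
//
//   data_[0]                number of rows currently stored in the page
//   data_[1 ..]             RowBatch::Entry payload, growing forward
//   data_[kPageSize-1 ..]   row offsets growing backward from the end:
//                           row_ptr(i) lives at data_[kPageSize - i - 1]
//
// PushRow succeeds while the forward payload plus one more backward
// offset still fit between the two regions, which is what FreeBytes()
// computes; a full page is flushed with Save() and reset with Clear().
```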
diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp
new file mode 100644
index 000000000..44cb9abdc
--- /dev/null
+++ b/src/io/page_fmatrix-inl.hpp
@@ -0,0 +1,382 @@
+#ifndef XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
+#define XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
+/*!
+ * \file page_fmatrix-inl.hpp
+ * sparse page manager for fmatrix
+ * \author Tianqi Chen
+ */
+#include <vector>
+#include <algorithm>
+#include <string>
+#include "../data.h"
+#include "../utils/iterator.h"
+#include "../utils/io.h"
+#include "../utils/matrix_csr.h"
+#include "../utils/thread_buffer.h"
+namespace xgboost {
+namespace io {
+class CSCMatrixManager {
+ public:
+ /*! \brief in memory page */
+ struct Page {
+ public:
+ /*! \brief initialize the page */
+ explicit Page(size_t size) {
+ buffer.resize(size);
+ col_index.reserve(10);
+ col_data.reserve(10);
+ }
+ /*! \brief clear the page */
+ inline void Clear(void) {
+ num_entry = 0;
+ col_index.clear();
+ col_data.clear();
+ }
+ /*! \brief number of used entries */
+ size_t num_entry;
+ /*! \brief column index */
+    std::vector<bst_uint> col_index;
+ /*! \brief column data */
+    std::vector<ColBatch::Inst> col_data;
+ /*! \brief number of free entries */
+ inline size_t NumFreeEntry(void) const {
+ return buffer.size() - num_entry;
+ }
+ inline ColBatch::Entry* AllocEntry(size_t len) {
+ ColBatch::Entry *p_data = &buffer[0] + num_entry;
+ num_entry += len;
+ return p_data;
+ }
+ /*! \brief get underlying batch */
+ inline ColBatch GetBatch(void) const {
+ ColBatch batch;
+ batch.size = col_index.size();
+ batch.col_index = BeginPtr(col_index);
+ batch.col_data = BeginPtr(col_data);
+ return batch;
+ }
+
+ private:
+ /*! \brief buffer space, not to be changed since ready */
+    std::vector<ColBatch::Entry> buffer;
+ };
+ /*! \brief define type of page pointer */
+ typedef Page *PagePtr;
+ // constructor
+ CSCMatrixManager(void) {
+ fi_ = NULL;
+ }
+ /*! \brief get column pointer */
+  inline const std::vector<size_t> &col_ptr(void) const {
+ return col_ptr_;
+ }
+ inline void SetParam(const char *name, const char *val) {
+ }
+ inline PagePtr Create(void) {
+ return new Page(page_size_);
+ }
+ inline void FreeSpace(PagePtr &a) {
+ delete a;
+ }
+ inline void Destroy(void) {
+ }
+ inline void BeforeFirst(void) {
+ col_index_ = col_todo_;
+ read_top_ = 0;
+ }
+ inline bool LoadNext(PagePtr &val) {
+ val->Clear();
+ if (read_top_ >= col_index_.size()) return false;
+ while (read_top_ < col_index_.size()) {
+ if (!this->TryFill(col_index_[read_top_], val)) {
+ return true;
+ }
+ ++read_top_;
+ }
+ return true;
+ }
+ inline bool Init(void) {
+ this->BeforeFirst();
+ return true;
+ }
+ inline void Setup(utils::ISeekStream *fi, double page_ratio) {
+ fi_ = fi;
+    fi_->Read(&begin_meta_, sizeof(begin_meta_));
+    begin_data_ = static_cast<size_t>(fi->Tell());
+ fi_->Seek(begin_meta_);
+ fi_->Read(&col_ptr_);
+ size_t psmax = 0;
+ for (size_t i = 0; i < col_ptr_.size() - 1; ++i) {
+ psmax = std::max(psmax, col_ptr_[i+1] - col_ptr_[i]);
+ }
+ utils::Check(page_ratio >= 1.0f, "col_page_ratio must be at least 1");
+    page_size_ = std::max(static_cast<size_t>(psmax * page_ratio), psmax);
+ }
+  inline void SetColSet(const std::vector<bst_uint> &cset, bool setall) {
+    if (!setall) {
+      col_todo_.resize(0);
+      for (size_t i = 0; i < cset.size(); ++i) {
+        if (cset[i] < static_cast<bst_uint>(col_ptr_.size() - 1)) {
+ col_todo_.push_back(cset[i]);
+ }
+ }
+ std::sort(col_todo_.begin(), col_todo_.end());
+ } else {
+ col_todo_.resize(col_ptr_.size()-1);
+ for (size_t i = 0; i < col_todo_.size(); ++i) {
+      col_todo_[i] = static_cast<bst_uint>(i);
+ }
+ }
+ }
+
+ private:
+  /*! \brief fill a page with the content of column cidx */
+ inline bool TryFill(size_t cidx, Page *p_page) {
+ size_t len = col_ptr_[cidx+1] - col_ptr_[cidx];
+ if (p_page->NumFreeEntry() < len) return false;
+ ColBatch::Entry *p_data = p_page->AllocEntry(len);
+ fi_->Seek(col_ptr_[cidx] * sizeof(ColBatch::Entry) + begin_data_);
+ utils::Check(fi_->Read(p_data, sizeof(ColBatch::Entry) * len) != 0,
+ "invalid column buffer format");
+    p_page->col_data.push_back(ColBatch::Inst(p_data, static_cast<bst_uint>(len)));
+    p_page->col_index.push_back(static_cast<bst_uint>(cidx));
+ return true;
+ }
+ // the following are in memory auxiliary data structure
+ /*! \brief top of reader position */
+ size_t read_top_;
+ /*! \brief size of page */
+ size_t page_size_;
+ /*! \brief column index to be loaded */
+  std::vector<bst_uint> col_index_;
+  /*! \brief column indices to be visited after calling BeforeFirst */
+  std::vector<bst_uint> col_todo_;
+ // the following are input content
+ /*! \brief beginning position of data content */
+ size_t begin_data_;
+  /*! \brief beginning position of meta content */
+  size_t begin_meta_;
+ /*! \brief input stream */
+ utils::ISeekStream *fi_;
+ /*! \brief column pointer of CSC format */
+  std::vector<size_t> col_ptr_;
+};
+
+class ThreadColPageIterator : public utils::IIterator<ColBatch> {
+ public:
+ explicit ThreadColPageIterator(utils::ISeekStream *fi,
+ float page_ratio, bool silent) {
+ itr_.SetParam("buffer_size", "2");
+ itr_.get_factory().Setup(fi, page_ratio);
+ itr_.Init();
+ if (!silent) {
+ utils::Printf("ThreadColPageIterator: finish initialzing, %u columns\n",
+ static_cast(col_ptr().size() - 1));
+ }
+ }
+ virtual ~ThreadColPageIterator(void) {
+ }
+ virtual void BeforeFirst(void) {
+ itr_.BeforeFirst();
+ }
+ virtual bool Next(void) {
+ // page to be loaded
+ CSCMatrixManager::PagePtr page;
+ if (!itr_.Next(page)) return false;
+ out_ = page->GetBatch();
+ return true;
+ }
+ virtual const ColBatch &Value(void) const {
+ return out_;
+ }
+  inline const std::vector<size_t> &col_ptr(void) const {
+ return itr_.get_factory().col_ptr();
+ }
+  inline void SetColSet(const std::vector<bst_uint> &cset,
+                        bool setall = false) {
+ itr_.get_factory().SetColSet(cset, setall);
+ }
+
+ private:
+ // output data
+ ColBatch out_;
+ // internal iterator
+  utils::ThreadBuffer<CSCMatrixManager::PagePtr, CSCMatrixManager> itr_;
+};
+/*!
+ * \brief sparse matrix that support column access
+ */
+class FMatrixPage : public IFMatrix {
+ public:
+ /*! \brief constructor */
+  FMatrixPage(utils::IIterator<RowBatch> *iter, std::string fname_buffer)
+ : fname_cbuffer_(fname_buffer) {
+ this->row_iter_ = iter;
+ this->col_iter_ = NULL;
+ this->fi_ = NULL;
+ }
+ // destructor
+ virtual ~FMatrixPage(void) {
+ if (row_iter_ != NULL) delete row_iter_;
+ if (col_iter_ != NULL) delete col_iter_;
+ if (fi_ != NULL) {
+ fi_->Close(); delete fi_;
+ }
+ }
+ /*! \return whether column access is enabled */
+ virtual bool HaveColAccess(void) const {
+ return col_iter_ != NULL;
+ }
+  /*! \brief get number of columns */
+ virtual size_t NumCol(void) const {
+ utils::Check(this->HaveColAccess(), "NumCol:need column access");
+ return col_iter_->col_ptr().size() - 1;
+ }
+ /*! \brief get number of buffered rows */
+  virtual const std::vector<bst_uint> &buffered_rowset(void) const {
+ return buffered_rowset_;
+ }
+ /*! \brief get column size */
+ virtual size_t GetColSize(size_t cidx) const {
+    const std::vector<size_t> &col_ptr = col_iter_->col_ptr();
+ return col_ptr[cidx+1] - col_ptr[cidx];
+ }
+ /*! \brief get column density */
+ virtual float GetColDensity(size_t cidx) const {
+    const std::vector<size_t> &col_ptr = col_iter_->col_ptr();
+ size_t nmiss = buffered_rowset_.size() - (col_ptr[cidx+1] - col_ptr[cidx]);
+    return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
+ }
+  virtual void InitColAccess(const std::vector<bool> &enabled, float pkeep = 1.0f) {
+ if (this->HaveColAccess()) return;
+ utils::Printf("start to initialize page col access\n");
+ if (this->LoadColData()) {
+ utils::Printf("loading previously saved col data\n");
+ return;
+ }
+ this->InitColData(pkeep, fname_cbuffer_.c_str(),
+ 1 << 30, 5);
+ utils::Check(this->LoadColData(), "fail to read in column data");
+ utils::Printf("finish initialize page col access\n");
+ }
+ /*!
+ * \brief get the row iterator associated with FMatrix
+ */
+  virtual utils::IIterator<RowBatch>* RowIterator(void) {
+ row_iter_->BeforeFirst();
+ return row_iter_;
+ }
+ /*!
+ * \brief get the column based iterator
+ */
+  virtual utils::IIterator<ColBatch>* ColIterator(void) {
+    std::vector<bst_uint> cset;
+ col_iter_->SetColSet(cset, true);
+ col_iter_->BeforeFirst();
+ return col_iter_;
+ }
+ /*!
+   * \brief column based iterator
+ */
+  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
+ col_iter_->SetColSet(fset, false);
+ col_iter_->BeforeFirst();
+ return col_iter_;
+ }
+
+ protected:
+ /*!
+ * \brief try load column data from file
+ */
+ inline bool LoadColData(void) {
+ FILE *fp = fopen64(fname_cbuffer_.c_str(), "rb");
+ if (fp == NULL) return false;
+ fi_ = new utils::FileStream(fp);
+    static_cast<utils::IStream*>(fi_)->Read(&buffered_rowset_);
+ col_iter_ = new ThreadColPageIterator(fi_, 2.0f, false);
+ return true;
+ }
+ /*!
+   * \brief initialize column data
+ * \param pkeep probability to keep a row
+ */
+ inline void InitColData(float pkeep, const char *fname,
+ size_t buffer_size, size_t col_step) {
+ buffered_rowset_.clear();
+ utils::FileStream fo(utils::FopenCheck(fname, "wb+"));
+ // use 64M buffer
+ utils::SparseCSRFileBuilder builder(&fo, buffer_size);
+ // start working
+ row_iter_->BeforeFirst();
+ while (row_iter_->Next()) {
+ const RowBatch &batch = row_iter_->Value();
+ for (size_t i = 0; i < batch.size; ++i) {
+ if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
+          buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
+ RowBatch::Inst inst = batch[i];
+ for (bst_uint j = 0; j < inst.length; ++j) {
+ builder.AddBudget(inst[j].index);
+ }
+ }
+ }
+ }
+ // write buffered rowset
+    static_cast<utils::IStream*>(&fo)->Write(buffered_rowset_);
+ builder.InitStorage();
+ row_iter_->BeforeFirst();
+ size_t ktop = 0;
+ while (row_iter_->Next()) {
+ const RowBatch &batch = row_iter_->Value();
+ for (size_t i = 0; i < batch.size; ++i) {
+ if (ktop < buffered_rowset_.size() &&
+ buffered_rowset_[ktop] == batch.base_rowid + i) {
+ ++ktop;
+ RowBatch::Inst inst = batch[i];
+ for (bst_uint j = 0; j < inst.length; ++j) {
+ builder.PushElem(inst[j].index,
+ ColBatch::Entry((bst_uint)(batch.base_rowid+i),
+ inst[j].fvalue));
+ }
+ if (ktop % 100000 == 0) {
+ utils::Printf("\r \r");
+ utils::Printf("InitCol: %lu rows ", static_cast(ktop));
+ }
+ }
+ }
+ }
+ builder.Finalize();
+ builder.SortRows(ColBatch::Entry::CmpValue, col_step);
+ fo.Close();
+ }
+
+ private:
+ // row iterator
+  utils::IIterator<RowBatch> *row_iter_;
+ // column iterator
+ ThreadColPageIterator *col_iter_;
+ // file pointer to data
+ utils::FileStream *fi_;
+ // file name of column buffer
+ std::string fname_cbuffer_;
+ /*! \brief list of row index that are buffered */
+  std::vector<bst_uint> buffered_rowset_;
+};
+
+class DMatrixColPage : public DMatrixPageBase<0xffffab03> {
+ public:
+ explicit DMatrixColPage(const char *fname) {
+ fmat_ = new FMatrixPage(iter_, fname);
+ }
+ virtual ~DMatrixColPage(void) {
+ delete fmat_;
+ }
+ virtual IFMatrix *fmat(void) const {
+ return fmat_;
+ }
+ /*! \brief the real fmatrix */
+ IFMatrix *fmat_;
+};
+
+} // namespace io
+} // namespace xgboost
+#endif // XGBOOST_IO_PAGE_FMATRIX_INL_HPP_
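InitColData above builds the column pages in two passes over the row iterator: AddBudget counts the entries per column, then PushElem scatters them into place. Below is a self-contained sketch of the same two-pass idea, with hypothetical names and plain std containers rather than the utils builders.

```cpp
#include <cstddef>
#include <utility>
#include <vector>

// Two-pass CSC construction mirroring AddBudget/PushElem (illustrative).
// 'entries' holds (column index, value) pairs in row order.
void BuildCSC(const std::vector<std::pair<size_t, float> > &entries,
              size_t ncol,
              std::vector<size_t> *col_ptr,
              std::vector<float> *col_data) {
  col_ptr->assign(ncol + 1, 0);
  for (size_t i = 0; i < entries.size(); ++i)  // pass 1: budget per column
    ++(*col_ptr)[entries[i].first + 1];
  for (size_t c = 0; c < ncol; ++c)            // prefix sum gives offsets
    (*col_ptr)[c + 1] += (*col_ptr)[c];
  col_data->resize(entries.size());
  std::vector<size_t> fill(*col_ptr);          // per-column write cursors
  for (size_t i = 0; i < entries.size(); ++i)  // pass 2: scatter values
    (*col_data)[fill[entries[i].first]++] = entries[i].second;
}
```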
diff --git a/src/io/simple_dmatrix-inl.hpp b/src/io/simple_dmatrix-inl.hpp
index 374d621e9..a793c779f 100644
--- a/src/io/simple_dmatrix-inl.hpp
+++ b/src/io/simple_dmatrix-inl.hpp
@@ -44,8 +44,8 @@ class DMatrixSimple : public DataMatrix {
}
/*! \brief copy content data from source matrix */
inline void CopyFrom(const DataMatrix &src) {
- this->info = src.info;
this->Clear();
+ this->info = src.info;
    // clone data content from the source matrix
    utils::IIterator<RowBatch> *iter = src.fmat()->RowIterator();
iter->BeforeFirst();
@@ -84,7 +84,12 @@ class DMatrixSimple : public DataMatrix {
inline void LoadText(const char* fname, bool silent = false) {
using namespace std;
this->Clear();
- FILE* file = utils::FopenCheck(fname, "r");
+ FILE* file;
+ if (!strcmp(fname, "stdin")) {
+ file = stdin;
+ } else {
+ file = utils::FopenCheck(fname, "r");
+ }
float label; bool init = true;
char tmp[1024];
    std::vector<RowBatch::Entry> feats;
@@ -112,7 +117,9 @@ class DMatrixSimple : public DataMatrix {
                      static_cast<unsigned long>(info.num_col()),
                      static_cast<unsigned long>(row_data_.size()), fname);
}
- fclose(file);
+ if (file != stdin) {
+ fclose(file);
+ }
// try to load in additional file
std::string name = fname;
std::string gname = name + ".group";
@@ -152,7 +159,7 @@ class DMatrixSimple : public DataMatrix {
inline void LoadBinary(utils::IStream &fs, bool silent = false, const char *fname = NULL) {
int tmagic;
utils::Check(fs.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format");
- utils::Check(tmagic == kMagic, "invalid format,magic number mismatch");
+ utils::Check(tmagic == kMagic, "\"%s\" invalid format, magic number mismatch", fname == NULL ? "" : fname);
info.LoadBinary(fs);
FMatrixS::LoadBinary(fs, &row_ptr_, &row_data_);
diff --git a/src/io/simple_fmatrix-inl.hpp b/src/io/simple_fmatrix-inl.hpp
index 7c8631a29..08e25e28b 100644
--- a/src/io/simple_fmatrix-inl.hpp
+++ b/src/io/simple_fmatrix-inl.hpp
@@ -48,9 +48,10 @@ class FMatrixS : public IFMatrix{
size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]);
    return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
}
- virtual void InitColAccess(float pkeep = 1.0f) {
+  virtual void InitColAccess(const std::vector<bool> &enabled,
+ float pkeep = 1.0f) {
if (this->HaveColAccess()) return;
- this->InitColData(pkeep);
+ this->InitColData(pkeep, enabled);
}
/*!
* \brief get the row iterator associated with FMatrix
@@ -75,7 +76,11 @@ class FMatrixS : public IFMatrix{
* \brief colmun based iterator
*/
  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
- col_iter_.col_index_ = fset;
+ size_t ncol = this->NumCol();
+ col_iter_.col_index_.resize(0);
+ for (size_t i = 0; i < fset.size(); ++i) {
+ if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]);
+ }
col_iter_.SetBatch(col_ptr_, col_data_);
return &col_iter_;
}
@@ -141,7 +146,7 @@ class FMatrixS : public IFMatrix{
* \brief intialize column data
* \param pkeep probability to keep a row
*/
- inline void InitColData(float pkeep) {
+  inline void InitColData(float pkeep, const std::vector<bool> &enabled) {
buffered_rowset_.clear();
// note: this part of code is serial, todo, parallelize this transformer
    utils::SparseCSRMBuilder<Entry> builder(col_ptr_, col_data_);
@@ -150,12 +155,14 @@ class FMatrixS : public IFMatrix{
iter_->BeforeFirst();
while (iter_->Next()) {
const RowBatch &batch = iter_->Value();
- for (size_t i = 0; i < batch.size; ++i) {
+ for (size_t i = 0; i < batch.size; ++i) {
if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
          buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
RowBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) {
- builder.AddBudget(inst[j].index);
+            if (enabled[inst[j].index]) {
+ builder.AddBudget(inst[j].index);
+ }
}
}
}
@@ -172,9 +179,11 @@ class FMatrixS : public IFMatrix{
++ktop;
RowBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) {
- builder.PushElem(inst[j].index,
- Entry((bst_uint)(batch.base_rowid+i),
- inst[j].fvalue));
+ if (enabled[inst[j].index]) {
+ builder.PushElem(inst[j].index,
+ Entry((bst_uint)(batch.base_rowid+i),
+ inst[j].fvalue));
+ }
}
}
}
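The new enabled mask is consulted symmetrically in both passes of InitColData, so a disabled feature neither reserves budget nor receives entries, and column statistics such as GetColDensity then reflect only enabled features. A hedged sketch of the underlying invariant, with hypothetical names:

    #include <cstddef>
    #include <vector>

    // The same predicate must gate both AddBudget and PushElem; filtering
    // only one pass would desynchronize the CSC offsets from the fill.
    struct Triple { unsigned row, findex; float value; };

    inline std::vector<Triple> FilterEnabled(const std::vector<Triple> &in,
                                             const std::vector<bool> &enabled) {
      std::vector<Triple> out;
      for (size_t i = 0; i < in.size(); ++i) {
        if (in[i].findex < enabled.size() && enabled[in[i].findex]) {
          out.push_back(in[i]);
        }
      }
      return out;
    }
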
diff --git a/src/learner/evaluation-inl.hpp b/src/learner/evaluation-inl.hpp
index fb0b8953d..60a8da8f1 100644
--- a/src/learner/evaluation-inl.hpp
+++ b/src/learner/evaluation-inl.hpp
@@ -11,6 +11,7 @@
#include
#include
#include
+#include "../sync/sync.h"
#include "./evaluation.h"
#include "./helper_utils.h"
@@ -23,7 +24,8 @@ namespace learner {
template<typename Derived>
struct EvalEWiseBase : public IEvaluator {
  virtual float Eval(const std::vector<float> &preds,
- const MetaInfo &info) const {
+ const MetaInfo &info,
+ bool distributed) const {
utils::Check(info.labels.size() != 0, "label set cannot be empty");
utils::Check(preds.size() % info.labels.size() == 0,
"label and prediction size not match");
@@ -37,7 +39,11 @@ struct EvalEWiseBase : public IEvaluator {
sum += Derived::EvalRow(info.labels[i], preds[i]) * wt;
wsum += wt;
}
- return Derived::GetFinal(sum, wsum);
+    float dat[2]; dat[0] = sum; dat[1] = wsum;
+    if (distributed) {
+      rabit::Allreduce<rabit::op::Sum>(dat, 2);
+ }
+ return Derived::GetFinal(dat[0], dat[1]);
}
/*!
* \brief to be implemented by subclass,
@@ -113,7 +119,9 @@ struct EvalCTest: public IEvaluator {
return name_.c_str();
}
  virtual float Eval(const std::vector<float> &preds,
- const MetaInfo &info) const {
+ const MetaInfo &info,
+ bool distributed) const {
+ utils::Check(!distributed, "metric %s do not support distributed evaluation", name_.c_str());
utils::Check(preds.size() % info.labels.size() == 0,
"label and prediction size not match");
size_t ngroup = preds.size() / info.labels.size() - 1;
@@ -150,7 +158,9 @@ struct EvalAMS : public IEvaluator {
utils::Check(std::sscanf(name, "ams@%f", &ratio_) == 1, "invalid ams format");
}
  virtual float Eval(const std::vector<float> &preds,
- const MetaInfo &info) const {
+ const MetaInfo &info,
+ bool distributed) const {
+ utils::Check(!distributed, "metric AMS do not support distributed evaluation");
using namespace std;
    const bst_omp_uint ndata = static_cast<bst_omp_uint>(info.labels.size());
@@ -212,7 +222,9 @@ struct EvalPrecisionRatio : public IEvaluator{
}
}
  virtual float Eval(const std::vector<float> &preds,
- const MetaInfo &info) const {
+ const MetaInfo &info,
+ bool distributed) const {
+ utils::Check(!distributed, "metric %s do not support distributed evaluation", Name());
utils::Check(info.labels.size() != 0, "label set cannot be empty");
utils::Assert(preds.size() % info.labels.size() == 0,
"label size predict size not match");
@@ -252,7 +264,8 @@ struct EvalPrecisionRatio : public IEvaluator{
/*! \brief Area under curve, for both classification and rank */
struct EvalAuc : public IEvaluator {
  virtual float Eval(const std::vector<float> &preds,
- const MetaInfo &info) const {
+ const MetaInfo &info,
+ bool distributed) const {
utils::Check(info.labels.size() != 0, "label set cannot be empty");
utils::Check(preds.size() % info.labels.size() == 0,
"label size predict size not match");
@@ -299,8 +312,16 @@ struct EvalAuc : public IEvaluator {
sum_auc += sum_pospair / (sum_npos*sum_nneg);
}
}
- // return average AUC over list
-    return static_cast<float>(sum_auc) / ngroup;
+ if (distributed) {
+      float dat[2];
+      dat[0] = static_cast<float>(sum_auc);
+      dat[1] = static_cast<float>(ngroup);
+      // approximately estimate auc using mean
+      rabit::Allreduce<rabit::op::Sum>(dat, 2);
+      return dat[0] / dat[1];
+    } else {
+      return static_cast<float>(sum_auc) / ngroup;
+ }
}
virtual const char *Name(void) const {
return "auc";
@@ -311,7 +332,8 @@ struct EvalAuc : public IEvaluator {
struct EvalRankList : public IEvaluator {
public:
  virtual float Eval(const std::vector<float> &preds,
- const MetaInfo &info) const {
+ const MetaInfo &info,
+ bool distributed) const {
utils::Check(preds.size() == info.labels.size(),
"label size predict size not match");
// quick consistency when group is not available
@@ -336,7 +358,16 @@ struct EvalRankList : public IEvaluator {
sum_metric += this->EvalMetric(rec);
}
}
-    return static_cast<float>(sum_metric) / ngroup;
+ if (distributed) {
+      float dat[2];
+      dat[0] = static_cast<float>(sum_metric);
+      dat[1] = static_cast<float>(ngroup);
+      // approximately estimate the metric using the mean
+      rabit::Allreduce<rabit::op::Sum>(dat, 2);
+      return dat[0] / dat[1];
+    } else {
+      return static_cast<float>(sum_metric) / ngroup;
+ }
}
virtual const char *Name(void) const {
return name_.c_str();
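Every metric that supports distribution above follows the same shape: compute a local (numerator, denominator) pair, Allreduce it with a sum, and divide once at the end. For EvalEWiseBase this is exact; the AUC and ranking variants reduce per-group means, which only approximates the global list metric, as their comments note. A reduced sketch of the pattern, assuming only the rabit calls already used above:

    #include <rabit.h>

    // Each worker holds a partial numerator/denominator; a single Sum
    // Allreduce makes the global ratio available on every worker.
    inline float DistributedRatio(float local_sum, float local_weight) {
      float dat[2] = {local_sum, local_weight};
      rabit::Allreduce<rabit::op::Sum>(dat, 2);
      return dat[0] / dat[1];
    }
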
diff --git a/src/learner/evaluation.h b/src/learner/evaluation.h
index 33370e706..4d59e270a 100644
--- a/src/learner/evaluation.h
+++ b/src/learner/evaluation.h
@@ -19,9 +19,13 @@ struct IEvaluator{
* \brief evaluate a specific metric
* \param preds prediction
* \param info information, including label etc.
+   * \param distributed whether a call to Allreduce is needed to gather
+   *        the average statistics across all the nodes;
+   *        this is only supported by some metrics
*/
  virtual float Eval(const std::vector<float> &preds,
- const MetaInfo &info) const = 0;
+ const MetaInfo &info,
+ bool distributed = false) const = 0;
/*! \return name of metric */
virtual const char *Name(void) const = 0;
/*! \brief virtual destructor */
@@ -70,10 +74,11 @@ class EvalSet{
}
inline std::string Eval(const char *evname,
                          const std::vector<float> &preds,
- const MetaInfo &info) const {
+ const MetaInfo &info,
+ bool distributed = false) {
std::string result = "";
for (size_t i = 0; i < evals_.size(); ++i) {
- float res = evals_[i]->Eval(preds, info);
+ float res = evals_[i]->Eval(preds, info, distributed);
char tmp[1024];
utils::SPrintf(tmp, sizeof(tmp), "\t%s-%s:%f", evname, evals_[i]->Name(), res);
result += tmp;
diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp
index 88026975d..616cf03e9 100644
--- a/src/learner/learner-inl.hpp
+++ b/src/learner/learner-inl.hpp
@@ -10,6 +10,9 @@
#include
#include
#include
+#include "../sync/sync.h"
+#include "../utils/io.h"
+#include "../utils/base64.h"
#include "./objective.h"
#include "./evaluation.h"
#include "../gbm/gbm.h"
@@ -21,7 +24,7 @@ namespace learner {
 * \brief learner that performs gradient boosting on a specific objective function,
 *  handling both training and prediction
*/
-class BoostLearner {
+class BoostLearner : public rabit::ISerializable {
public:
BoostLearner(void) {
obj_ = NULL;
@@ -30,8 +33,13 @@ class BoostLearner {
name_gbm_ = "gbtree";
silent= 0;
prob_buffer_row = 1.0f;
+ distributed_mode = 0;
+ pred_buffer_size = 0;
+ seed_per_iteration = 0;
+ seed = 0;
+ save_base64 = 0;
}
- ~BoostLearner(void) {
+ virtual ~BoostLearner(void) {
if (obj_ != NULL) delete obj_;
if (gbm_ != NULL) delete gbm_;
}
@@ -44,11 +52,9 @@ class BoostLearner {
* \param mats array of pointers to matrix whose prediction result need to be cached
*/
inline void SetCacheData(const std::vector& mats) {
- // estimate feature bound
- unsigned num_feature = 0;
+ utils::Assert(cache_.size() == 0, "can only call cache data once");
// assign buffer index
size_t buffer_size = 0;
- utils::Assert(cache_.size() == 0, "can only call cache data once");
for (size_t i = 0; i < mats.size(); ++i) {
bool dupilicate = false;
for (size_t j = 0; j < i; ++j) {
@@ -59,19 +65,12 @@ class BoostLearner {
mats[i]->cache_learner_ptr_ = this;
cache_.push_back(CacheEntry(mats[i], buffer_size, mats[i]->info.num_row()));
buffer_size += mats[i]->info.num_row();
-      num_feature = std::max(num_feature, static_cast<unsigned>(mats[i]->info.num_col()));
}
char str_temp[25];
- if (num_feature > mparam.num_feature) {
- utils::SPrintf(str_temp, sizeof(str_temp), "%u", num_feature);
- this->SetParam("bst:num_feature", str_temp);
- }
-    utils::SPrintf(str_temp, sizeof(str_temp), "%lu",
-                   static_cast<unsigned long>(buffer_size));
+    utils::SPrintf(str_temp, sizeof(str_temp), "%lu",
+                   static_cast<unsigned long>(buffer_size));
this->SetParam("num_pbuffer", str_temp);
- if (!silent) {
- utils::Printf("buffer_size=%ld\n", static_cast(buffer_size));
- }
+ this->pred_buffer_size = buffer_size;
}
/*!
* \brief set parameters from outside
@@ -86,9 +85,29 @@ class BoostLearner {
this->SetParam(n.c_str(), val);
}
if (!strcmp(name, "silent")) silent = atoi(val);
- if (!strcmp(name, "prob_buffer_row")) prob_buffer_row = static_cast(atof(val));
+ if (!strcmp(name, "dsplit")) {
+ if (!strcmp(val, "col")) {
+ this->SetParam("updater", "distcol");
+ distributed_mode = 1;
+ } else if (!strcmp(val, "row")) {
+ this->SetParam("updater", "grow_histmaker,prune");
+ distributed_mode = 2;
+ } else {
+ utils::Error("%s is invalid value for dsplit, should be row or col", val);
+ }
+ }
+ if (!strcmp(name, "prob_buffer_row")) {
+      prob_buffer_row = static_cast<float>(atof(val));
+ utils::Check(distributed_mode == 0,
+ "prob_buffer_row can only be used in single node mode so far");
+ this->SetParam("updater", "grow_colmaker,refresh,prune");
+ }
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
- if (!strcmp("seed", name)) random::Seed(atoi(val));
+ if (!strcmp("seed", name)) {
+ this->seed = seed; random::Seed(atoi(val));
+ }
+ if (!strcmp("seed_per_iter", name)) seed_per_iteration = atoi(val);
+ if (!strcmp("save_base64", name)) save_base64 = atoi(val);
if (!strcmp(name, "num_class")) this->SetParam("num_output_group", val);
if (!strcmp(name, "nthread")) {
omp_set_num_threads(atoi(val));
@@ -104,10 +123,29 @@ class BoostLearner {
cfg_.push_back(std::make_pair(std::string(name), std::string(val)));
}
}
+ // this is an internal function
+ // initialize the trainer, called at InitModel and LoadModel
+ inline void InitTrainer(bool calc_num_feature = true) {
+ if (calc_num_feature) {
+ // estimate feature bound
+ unsigned num_feature = 0;
+ for (size_t i = 0; i < cache_.size(); ++i) {
+ num_feature = std::max(num_feature,
+                               static_cast<unsigned>(cache_[i].mat_->info.num_col()));
+ }
+ // run allreduce on num_feature to find the maximum value
+      rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
+ if (num_feature > mparam.num_feature) mparam.num_feature = num_feature;
+ }
+ char str_temp[25];
+ utils::SPrintf(str_temp, sizeof(str_temp), "%d", mparam.num_feature);
+ this->SetParam("bst:num_feature", str_temp);
+ }
/*!
* \brief initialize the model
*/
inline void InitModel(void) {
+ this->InitTrainer();
// initialize model
this->InitObjGBM();
// reset the base score
@@ -118,8 +156,10 @@ class BoostLearner {
/*!
* \brief load model from stream
* \param fi input stream
+ * \param with_pbuffer whether to load with predict buffer
+ * \param calc_num_feature whether call InitTrainer with calc_num_feature
*/
- inline void LoadModel(utils::IStream &fi) {
+ inline void LoadModel(utils::IStream &fi, bool with_pbuffer = true, bool calc_num_feature = true) {
utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
"BoostLearner: wrong model format");
utils::Check(fi.Read(&name_obj_), "BoostLearner: wrong model format");
@@ -127,32 +167,90 @@ class BoostLearner {
// delete existing gbm if any
if (obj_ != NULL) delete obj_;
if (gbm_ != NULL) delete gbm_;
+ this->InitTrainer(calc_num_feature);
this->InitObjGBM();
- gbm_->LoadModel(fi);
+ gbm_->LoadModel(fi, with_pbuffer);
+ if (!with_pbuffer || distributed_mode == 2) {
+ gbm_->ResetPredBuffer(pred_buffer_size);
+ }
+ }
+ // rabit load model from rabit checkpoint
+ virtual void Load(rabit::IStream &fi) {
+ RabitStreamAdapter fs(fi);
+ // for row split, we should not keep pbuffer
+ this->LoadModel(fs, distributed_mode != 2, false);
+ }
+ // rabit save model to rabit checkpoint
+ virtual void Save(rabit::IStream &fo) const {
+ RabitStreamAdapter fs(fo);
+ // for row split, we should not keep pbuffer
+ this->SaveModel(fs, distributed_mode != 2);
}
/*!
* \brief load model from file
* \param fname file name
*/
inline void LoadModel(const char *fname) {
- utils::FileStream fi(utils::FopenCheck(fname, "rb"));
+ FILE *fp = utils::FopenCheck(fname, "rb");
+ std::string header; header.resize(4);
+ utils::FileStream fi(fp);
+ // check header for different binary encode
+ // can be base64 or binary
+ if (fi.Read(&header[0], 4) != 0) {
+ // base64 format
+ if (header == "bs64") {
+ utils::Base64InStream bsin(fp);
+ bsin.InitPosition();
+ this->LoadModel(bsin);
+ fclose(fp);
+ return;
+ }
+ if (header == "binf") {
+ this->LoadModel(fi);
+ fclose(fp);
+ return;
+ }
+ }
+ fi.Seek(0);
this->LoadModel(fi);
- fi.Close();
+ fclose(fp);
}
- inline void SaveModel(utils::IStream &fo) const {
+ inline void SaveModel(utils::IStream &fo, bool with_pbuffer = true) const {
fo.Write(&mparam, sizeof(ModelParam));
fo.Write(name_obj_);
fo.Write(name_gbm_);
- gbm_->SaveModel(fo);
+ gbm_->SaveModel(fo, with_pbuffer);
}
/*!
* \brief save model into file
* \param fname file name
*/
inline void SaveModel(const char *fname) const {
- utils::FileStream fo(utils::FopenCheck(fname, "wb"));
- this->SaveModel(fo);
- fo.Close();
+ FILE *fp;
+    bool use_stdout = false;
+#ifndef XGBOOST_STRICT_CXX98_
+ if (!strcmp(fname, "stdout")) {
+ fp = stdout;
+ use_stdout = true;
+ } else
+#endif
+ {
+ fp = utils::FopenCheck(fname, "wb");
+ }
+ utils::FileStream fo(fp);
+    if (save_base64 != 0 || use_stdout) {
+ fo.Write("bs64\t", 5);
+ utils::Base64OutStream bout(fp);
+ this->SaveModel(bout);
+ bout.Finish('\n');
+ } else {
+ fo.Write("binf", 4);
+ this->SaveModel(fo);
+ }
+ if (!use_stdout) {
+ fclose(fp);
+ }
}
/*!
* \brief check if data matrix is ready to be used by training,
@@ -160,7 +258,10 @@ class BoostLearner {
* \param p_train pointer to the matrix used by training
*/
inline void CheckInit(DMatrix *p_train) {
- p_train->fmat()->InitColAccess(prob_buffer_row);
+    int ncol = static_cast<int>(p_train->info.info.num_col);
+    std::vector<bool> enabled(ncol, true);
+ // initialize column access
+ p_train->fmat()->InitColAccess(enabled, prob_buffer_row);
}
/*!
* \brief update the model for one iteration
@@ -168,9 +269,18 @@ class BoostLearner {
* \param p_train pointer to the data matrix
*/
inline void UpdateOneIter(int iter, const DMatrix &train) {
+ if (seed_per_iteration || rabit::IsDistributed()) {
+ random::Seed(this->seed * kRandSeedMagic);
+ }
this->PredictRaw(train, &preds_);
obj_->GetGradient(preds_, train.info, iter, &gpair_);
- gbm_->DoBoost(train.fmat(), train.info.info, &gpair_);
+ gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_);
+ }
+ /*!
+ * \brief whether model allow lazy checkpoint
+ */
+ inline bool AllowLazyCheckPoint(void) const {
+ return gbm_->AllowLazyCheckPoint();
}
/*!
* \brief evaluate the model for specific iteration
@@ -189,7 +299,7 @@ class BoostLearner {
for (size_t i = 0; i < evals.size(); ++i) {
this->PredictRaw(*evals[i], &preds_);
obj_->EvalTransform(&preds_);
- res += evaluator_.Eval(evname[i].c_str(), preds_, evals[i]->info);
+ res += evaluator_.Eval(evname[i].c_str(), preds_, evals[i]->info, distributed_mode == 2);
}
return res;
}
@@ -217,10 +327,41 @@ class BoostLearner {
* predictor, when it equals 0, this means we are using all the trees
*/
inline void Predict(const DMatrix &data,
+ bool output_margin,
+                      std::vector<float> *out_preds,
+ unsigned ntree_limit = 0,
+ bool pred_leaf = false
+ ) const {
+ if (pred_leaf) {
+ gbm_->PredictLeaf(data.fmat(), data.info.info, out_preds, ntree_limit);
+ } else {
+ this->PredictRaw(data, out_preds, ntree_limit);
+ if (!output_margin) {
+ obj_->PredTransform(out_preds);
+ }
+ }
+ }
+ /*!
+   * \brief online prediction function, predicts the score for one instance at a time
+   * NOTE: use the batch prediction interface if possible, batch prediction is usually
+   *       more efficient than online prediction
+   * This function is NOT thread-safe, make sure you only call it from one thread
+ *
+ * \param inst the instance you want to predict
+ * \param output_margin whether to only predict margin value instead of transformed prediction
+ * \param out_preds output vector to hold the predictions
+ * \param ntree_limit limit the number of trees used in prediction
+ * \param root_index the root index
+ * \sa Predict
+ */
+ inline void Predict(const SparseBatch::Inst &inst,
bool output_margin,
                      std::vector<float> *out_preds,
unsigned ntree_limit = 0) const {
- this->PredictRaw(data, out_preds, ntree_limit);
+ gbm_->Predict(inst, out_preds, ntree_limit);
+ if (out_preds->size() == 1) {
+ (*out_preds)[0] += mparam.base_score;
+ }
if (!output_margin) {
obj_->PredTransform(out_preds);
}
@@ -240,6 +381,7 @@ class BoostLearner {
utils::Assert(gbm_ == NULL, "GBM and obj should be NULL");
obj_ = CreateObjFunction(name_obj_.c_str());
gbm_ = gbm::CreateGradBooster(name_gbm_.c_str());
+
for (size_t i = 0; i < cfg_.size(); ++i) {
obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
@@ -287,7 +429,7 @@ class BoostLearner {
/* \brief number of class, if it is multi-class classification */
int num_class;
/*! \brief reserved field */
- int reserved[32];
+ int reserved[31];
/*! \brief constructor */
ModelParam(void) {
base_score = 0.5f;
@@ -308,14 +450,26 @@ class BoostLearner {
}
};
// data fields
+ // stored random seed
+ int seed;
+  // whether to seed the PRNG each iteration
+  // this is important when restarting from existing iterations
+  // off by default, but automatically enabled in distributed mode
+ int seed_per_iteration;
+ // save model in base64 encoding
+ int save_base64;
// silent during training
int silent;
+ // distributed learning mode, if any, 0:none, 1:col, 2:row
+ int distributed_mode;
+ // cached size of predict buffer
+ size_t pred_buffer_size;
  // probability of buffering a row for column access
float prob_buffer_row;
// evaluation set
EvalSet evaluator_;
// model parameter
- ModelParam mparam;
+ ModelParam mparam;
// gbm model that back everything
gbm::IGradBooster *gbm_;
// name of gbm model used for training
@@ -331,7 +485,9 @@ class BoostLearner {
// gradient pairs
  std::vector<bst_gpair> gpair_;
- private:
+ protected:
+ // magic number to transform random seed
+ const static int kRandSeedMagic = 127;
// cache entry object that helps handle feature caching
struct CacheEntry {
const DMatrix *mat_;
@@ -354,6 +510,23 @@ class BoostLearner {
// data structure field
/*! \brief the entries indicates that we have internal prediction cache */
std::vector cache_;
+
+ private:
+ // adapt rabit stream to utils stream
+ struct RabitStreamAdapter : public utils::IStream {
+ // rabit stream
+ rabit::IStream &fs;
+    // constructor
+ RabitStreamAdapter(rabit::IStream &fs) : fs(fs) {}
+ // destructor
+ virtual ~RabitStreamAdapter(void){}
+ virtual size_t Read(void *ptr, size_t size) {
+ return fs.Read(ptr, size);
+ }
+ virtual void Write(const void *ptr, size_t size) {
+ fs.Write(ptr, size);
+ }
+ };
};
} // namespace learner
} // namespace xgboost
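The save path above now writes a 4-byte magic in front of every model ("binf" for raw binary, "bs64" for base64; base64 is forced for stdout so the stream stays printable), and the load path sniffs those bytes, rewinding for legacy headerless files. A compact sketch of the sniff-then-rewind idea, with hypothetical names rather than the xgboost classes:

    #include <cstdio>
    #include <cstring>

    enum ModelEncoding { kBinary, kBase64, kLegacy };

    // Peek at the 4-byte magic; rewind when the file predates the header.
    inline ModelEncoding SniffModel(FILE *fp) {
      char magic[4];
      if (std::fread(magic, 1, 4, fp) == 4) {
        if (std::memcmp(magic, "bs64", 4) == 0) return kBase64;
        if (std::memcmp(magic, "binf", 4) == 0) return kBinary;
      }
      std::rewind(fp);  // legacy model: no header, parse from byte 0
      return kLegacy;
    }
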
diff --git a/src/learner/objective-inl.hpp b/src/learner/objective-inl.hpp
index 96aacf12d..9887e7a05 100644
--- a/src/learner/objective-inl.hpp
+++ b/src/learner/objective-inl.hpp
@@ -41,6 +41,25 @@ struct LossType {
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
+ /*!
+ * \brief check if label range is valid
+ */
+ inline bool CheckLabel(float x) const {
+ if (loss_type != kLinearSquare) {
+ return x >= 0.0f && x <= 1.0f;
+ }
+ return true;
+ }
+ /*!
+ * \brief error message displayed when check label fail
+ */
+ inline const char * CheckLabelErrorMsg(void) const {
+ if (loss_type != kLinearSquare) {
+ return "label must be in [0,1] for logistic regression";
+ } else {
+ return "";
+ }
+ }
/*!
* \brief calculate first order gradient of loss, given transformed prediction
* \param predt transformed prediction
@@ -115,6 +134,8 @@ class RegLossObj : public IObjFunction{
"labels are not correctly provided");
    std::vector<bst_gpair> &gpair = *out_gpair;
gpair.resize(preds.size());
+ // check if label in range
+ bool label_correct = true;
// start calculating gradient
    const unsigned nstep = static_cast<unsigned>(info.labels.size());
    const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
@@ -124,9 +145,11 @@ class RegLossObj : public IObjFunction{
float p = loss.PredTransform(preds[i]);
float w = info.GetWeight(j);
if (info.labels[j] == 1.0f) w *= scale_pos_weight;
+ if (!loss.CheckLabel(info.labels[j])) label_correct = false;
gpair[i] = bst_gpair(loss.FirstOrderGradient(p, info.labels[j]) * w,
loss.SecondOrderGradient(p, info.labels[j]) * w);
}
+ utils::Check(label_correct, loss.CheckLabelErrorMsg());
}
virtual const char* DefaultEvalMetric(void) const {
return loss.DefaultEvalMetric();
@@ -183,7 +206,8 @@ class SoftmaxMultiClassObj : public IObjFunction {
Softmax(&rec);
const unsigned j = i % nstep;
      int label = static_cast<int>(info.labels[j]);
- utils::Check(label < nclass, "SoftmaxMultiClassObj: label exceed num_class");
+ utils::Check(label >= 0 && label < nclass,
+ "SoftmaxMultiClassObj: label must be in [0, num_class)");
const float wt = info.GetWeight(j);
for (int k = 0; k < nclass; ++k) {
float p = rec[k];
@@ -325,9 +349,9 @@ class LambdaRankObj : public IObjFunction {
float h = loss.SecondOrderGradient(p, 1.0f);
// accumulate gradient and hessian in both pid, and nid
gpair[pos.rindex].grad += g * w;
- gpair[pos.rindex].hess += 2.0f * h;
+ gpair[pos.rindex].hess += 2.0f * w * h;
gpair[neg.rindex].grad -= g * w;
- gpair[neg.rindex].hess += 2.0f * h;
+ gpair[neg.rindex].hess += 2.0f * w * h;
}
}
}
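Note the shape of the new label check in RegLossObj above: aborting (utils::Check) inside an omp parallel for would be unsafe, so the loop only lowers a flag and the single Check fires after the parallel region. A reduced, self-contained sketch of that pattern:

    #include <cassert>
    #include <vector>

    // Validate inside a parallel loop by accumulating a flag, then fail
    // once outside the region. All racing writes store the same value
    // (false), the same benign pattern the code above relies on.
    inline void CheckLabels01(const std::vector<float> &labels) {
      bool ok = true;
      #pragma omp parallel for schedule(static)
      for (long i = 0; i < static_cast<long>(labels.size()); ++i) {
        if (labels[i] < 0.0f || labels[i] > 1.0f) ok = false;
      }
      assert(ok && "label must be in [0,1] for logistic regression");
    }
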
diff --git a/src/sync/sync.h b/src/sync/sync.h
new file mode 100644
index 000000000..aec5e2abd
--- /dev/null
+++ b/src/sync/sync.h
@@ -0,0 +1,12 @@
+#ifndef XGBOOST_SYNC_H_
+#define XGBOOST_SYNC_H_
+/*!
+ * \file sync.h
+ * \brief the synchronization module of rabit
+ * redirects to subtree rabit header
+ * \author Tianqi Chen
+ */
+#include "../../subtree/rabit/include/rabit.h"
+#endif // XGBOOST_SYNC_H_
+
+
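For readers new to rabit, the header re-exported here is the whole synchronization surface this patch uses: Init/Finalize, rank queries, Allreduce, Broadcast, and checkpointing. A minimal, hedged usage sketch built only from the standard rabit entry points:

    #include <cstdio>
    #include <rabit.h>

    int main(int argc, char *argv[]) {
      rabit::Init(argc, argv);                  // join the job set up by the tracker
      float x = static_cast<float>(rabit::GetRank()) + 1.0f;
      rabit::Allreduce<rabit::op::Sum>(&x, 1);  // every node now holds the global sum
      if (rabit::GetRank() == 0) {
        std::printf("sum over %d nodes = %f\n", rabit::GetWorldSize(), x);
      }
      rabit::Finalize();
      return 0;
    }
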
diff --git a/src/tree/model.h b/src/tree/model.h
index aa9ad2794..f3575488a 100644
--- a/src/tree/model.h
+++ b/src/tree/model.h
@@ -68,8 +68,9 @@ class TreeModel {
}
};
/*! \brief tree node */
- class Node{
+ class Node {
public:
+ Node(void) : sindex_(0) {}
/*! \brief index of left child */
inline int cleft(void) const {
return this->cleft_;
@@ -110,6 +111,10 @@ class TreeModel {
inline bool is_left_child(void) const {
return (parent_ & (1U << 31)) != 0;
}
+ /*! \brief whether this node is deleted */
+ inline bool is_deleted(void) const {
+      return sindex_ == std::numeric_limits<unsigned>::max();
+ }
/*! \brief whether current node is root */
inline bool is_root(void) const {
return parent_ == -1;
@@ -144,7 +149,11 @@ class TreeModel {
this->cleft_ = -1;
this->cright_ = right;
}
-
+ /*! \brief mark that this node is deleted */
+ inline void mark_delete(void) {
+      this->sindex_ = std::numeric_limits<unsigned>::max();
+ }
+
private:
friend class TreeModel;
/*!
@@ -197,11 +206,11 @@ class TreeModel {
leaf_vector.resize(param.num_nodes * param.size_leaf_vector);
return nd;
}
- // delete a tree node
+ // delete a tree node, keep the parent field to allow trace back
inline void DeleteNode(int nid) {
utils::Assert(nid >= param.num_roots, "can not delete root");
deleted_nodes.push_back(nid);
- nodes[nid].set_parent(-1);
+ nodes[nid].mark_delete();
++param.num_deleted;
}
@@ -296,11 +305,12 @@ class TreeModel {
}
// chg deleted nodes
deleted_nodes.resize(0);
- for (int i = param.num_roots; i < param.num_nodes; i ++) {
- if (nodes[i].is_root()) deleted_nodes.push_back(i);
+ for (int i = param.num_roots; i < param.num_nodes; ++i) {
+ if (nodes[i].is_deleted()) deleted_nodes.push_back(i);
}
utils::Assert(static_cast(deleted_nodes.size()) == param.num_deleted,
- "number of deleted nodes do not match");
+ "number of deleted nodes do not match, num_deleted=%d, dnsize=%lu, num_nodes=%d",
+ param.num_deleted, deleted_nodes.size(), param.num_nodes);
}
/*!
* \brief save model to stream
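The switch from set_parent(-1) to mark_delete above is deliberate: a deleted node now keeps its parent link so positions can still be traced back during distributed construction, and deletion is encoded in sindex_ instead. The sentinel is safe because UINT_MAX can never be a live value: the top bit of sindex_ stores the default direction and the low 31 bits the split feature index. A reduced mock of the bookkeeping:

    #include <limits>

    // Deletion sentinel lives in the split index rather than the parent
    // link, so a deleted child remains traceable to its parent.
    struct MiniNode {
      int parent;
      unsigned sindex;  // default-direction bit + 31-bit split feature index
      void MarkDelete() { sindex = std::numeric_limits<unsigned>::max(); }
      bool IsDeleted() const {
        return sindex == std::numeric_limits<unsigned>::max();
      }
    };
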
diff --git a/src/tree/param.h b/src/tree/param.h
index 04ea5277f..2c2362095 100644
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -36,8 +36,14 @@ struct TrainParam{
float colsample_bytree;
// speed optimization for dense column
float opt_dense_col;
+ // accuracy of sketch
+ float sketch_eps;
+  // budget multiplier on the sketch size
+ float sketch_ratio;
// leaf vector size
- int size_leaf_vector;
+ int size_leaf_vector;
+ // option for parallelization
+ int parallel_option;
// number of threads to be used for tree construction,
// if OpenMP is enabled, if equals 0, use system default
int nthread;
@@ -55,6 +61,9 @@ struct TrainParam{
opt_dense_col = 1.0f;
nthread = 0;
size_leaf_vector = 0;
+ parallel_option = 2;
+ sketch_eps = 0.1f;
+ sketch_ratio = 2.0f;
}
/*!
* \brief set parameters from outside
@@ -76,10 +85,13 @@ struct TrainParam{
if (!strcmp(name, "subsample")) subsample = static_cast(atof(val));
if (!strcmp(name, "colsample_bylevel")) colsample_bylevel = static_cast(atof(val));
if (!strcmp(name, "colsample_bytree")) colsample_bytree = static_cast(atof(val));
+ if (!strcmp(name, "sketch_eps")) sketch_eps = static_cast(atof(val));
+ if (!strcmp(name, "sketch_ratio")) sketch_ratio = static_cast(atof(val));
if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast(atof(val));
if (!strcmp(name, "size_leaf_vector")) size_leaf_vector = atoi(val);
if (!strcmp(name, "max_depth")) max_depth = atoi(val);
if (!strcmp(name, "nthread")) nthread = atoi(val);
+ if (!strcmp(name, "parallel_option")) parallel_option = atoi(val);
if (!strcmp(name, "default_direction")) {
if (!strcmp(val, "learn")) default_direction = 0;
if (!strcmp(val, "left")) default_direction = 1;
@@ -132,6 +144,12 @@ struct TrainParam{
inline bool cannot_split(double sum_hess, int depth) const {
return sum_hess < this->min_child_weight * 2.0;
}
+ /*! \brief maximum sketch size */
+ inline unsigned max_sketch_size(void) const {
+    unsigned ret = static_cast<unsigned>(sketch_ratio / sketch_eps);
+ utils::Check(ret > 0, "sketch_ratio/sketch_eps must be bigger than 1");
+ return ret;
+ }
protected:
// functions for L1 cost
@@ -186,6 +204,10 @@ struct GradStats {
inline void Add(const GradStats &b) {
this->Add(b.sum_grad, b.sum_hess);
}
+  /*! \brief same as Add; Reduce is used by Allreduce */
+ inline static void Reduce(GradStats &a, const GradStats &b) {
+ a.Add(b);
+ }
/*! \brief set current value to a - b */
inline void SetSubstract(const GradStats &a, const GradStats &b) {
sum_grad = a.sum_grad - b.sum_grad;
@@ -262,6 +284,10 @@ struct CVGradStats : public GradStats {
valid[i].Add(b.valid[i]);
}
}
+  /*! \brief same as Add; Reduce is used by Allreduce */
+ inline static void Reduce(CVGradStats &a, const CVGradStats &b) {
+ a.Add(b);
+ }
/*! \brief set current value to a - b */
inline void SetSubstract(const CVGradStats &a, const CVGradStats &b) {
GradStats::SetSubstract(a, b);
@@ -341,6 +367,10 @@ struct SplitEntry{
return false;
}
}
+  /*! \brief same as Update; Reduce is used by Allreduce */
+ inline static void Reduce(SplitEntry &dst, const SplitEntry &src) {
+ dst.Update(src);
+ }
/*!\return feature index to split on */
inline unsigned split_index(void) const {
return sindex & ((1U << 31) - 1U);
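max_sketch_size ties the two new knobs together: with the defaults sketch_eps=0.1 and sketch_ratio=2.0, each per-node, per-feature quantile sketch is capped at 2.0 / 0.1 = 20 summary entries. The static Reduce(dst, src) helpers added to GradStats, CVGradStats, and SplitEntry exist so these plain structs can be combined by rabit's custom reducers; a hedged sketch of that wiring (rabit::Reducer as provided by the subtree, include path assumed):

    #include <vector>
    #include <rabit.h>
    #include "param.h"  // assumed path to the SplitEntry definition above

    // Combine per-node best splits across workers: afterwards every worker
    // holds, for each node, the split candidate with the largest loss_chg.
    inline void SyncBestSplits(std::vector<xgboost::tree::SplitEntry> *splits) {
      if (splits->empty()) return;
      rabit::Reducer<xgboost::tree::SplitEntry,
                     xgboost::tree::SplitEntry::Reduce> reducer;
      reducer.Allreduce(&(*splits)[0], splits->size());
    }
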
diff --git a/src/tree/updater.cpp b/src/tree/updater.cpp
index 2cb6552fe..53b3d6aa1 100644
--- a/src/tree/updater.cpp
+++ b/src/tree/updater.cpp
@@ -1,10 +1,16 @@
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
+#define NOMINMAX
#include
#include "./updater.h"
#include "./updater_prune-inl.hpp"
#include "./updater_refresh-inl.hpp"
#include "./updater_colmaker-inl.hpp"
+#ifndef XGBOOST_STRICT_CXX98_
+#include "./updater_sync-inl.hpp"
+#include "./updater_distcol-inl.hpp"
+#include "./updater_histmaker-inl.hpp"
+#endif
namespace xgboost {
namespace tree {
@@ -13,6 +19,11 @@ IUpdater* CreateUpdater(const char *name) {
if (!strcmp(name, "prune")) return new TreePruner();
if (!strcmp(name, "refresh")) return new TreeRefresher();
if (!strcmp(name, "grow_colmaker")) return new ColMaker();
+#ifndef XGBOOST_STRICT_CXX98_
+ if (!strcmp(name, "sync")) return new TreeSyncher();
+ if (!strcmp(name, "grow_histmaker")) return new CQHistMaker();
+ if (!strcmp(name, "distcol")) return new DistColMaker();
+#endif
utils::Error("unknown updater:%s", name);
return NULL;
}
diff --git a/src/tree/updater.h b/src/tree/updater.h
index e3a05c84f..49adc8dca 100644
--- a/src/tree/updater.h
+++ b/src/tree/updater.h
@@ -37,6 +37,16 @@ class IUpdater {
IFMatrix *p_fmat,
const BoosterInfo &info,
                      const std::vector<RegTree*> &trees) = 0;
+
+ /*!
+ * \brief this is simply a function for optimizing performance
+ * this function asks the updater to return the leaf position of each instance in the p_fmat,
+ * if it is cached in the updater, if it is not available, return NULL
+ * \return array of leaf position of each instance in the last updated tree
+ */
+ virtual const int* GetLeafPosition(void) const {
+ return NULL;
+ }
// destructor
virtual ~IUpdater(void) {}
};
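GetLeafPosition lets the booster reuse leaf assignments the updater already computed while growing the tree, instead of pushing every cached row through the new tree again; NULL means "not cached, re-predict". A hedged sketch of the intended consumer, with hypothetical names independent of the gbm classes:

    #include <cstddef>
    #include <vector>

    // Update a prediction cache from cached leaf positions when available.
    // leaf_position may be NULL, in which case the caller must fall back
    // to walking the tree for each row (omitted here).
    inline void UpdateCacheFromLeaves(const int *leaf_position,
                                      const std::vector<unsigned> &rowset,
                                      const std::vector<float> &leaf_value,
                                      std::vector<float> *pred_cache) {
      if (leaf_position == NULL) return;  // fallback path not shown
      for (size_t i = 0; i < rowset.size(); ++i) {
        (*pred_cache)[rowset[i]] += leaf_value[leaf_position[rowset[i]]];
      }
    }
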
diff --git a/src/tree/updater_basemaker-inl.hpp b/src/tree/updater_basemaker-inl.hpp
new file mode 100644
index 000000000..f8816dd6e
--- /dev/null
+++ b/src/tree/updater_basemaker-inl.hpp
@@ -0,0 +1,409 @@
+#ifndef XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
+#define XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
+/*!
+ * \file updater_basemaker-inl.hpp
+ * \brief implement a common tree constructor
+ * \author Tianqi Chen
+ */
+#include
+#include
+#include
+#include "../sync/sync.h"
+#include "../utils/random.h"
+#include "../utils/quantile.h"
+
+namespace xgboost {
+namespace tree {
+/*!
+ * \brief base tree maker class that defines common operations
+ *  needed in tree making
+ */
+class BaseMaker: public IUpdater {
+ public:
+ // destructor
+ virtual ~BaseMaker(void) {}
+ // set training parameter
+ virtual void SetParam(const char *name, const char *val) {
+ param.SetParam(name, val);
+ }
+
+ protected:
+ // helper to collect and query feature meta information
+ struct FMetaHelper {
+ public:
+ /*! \brief find type of each feature, use column format */
+ inline void InitByCol(IFMatrix *p_fmat,
+ const RegTree &tree) {
+ fminmax.resize(tree.param.num_feature * 2);
+ std::fill(fminmax.begin(), fminmax.end(),
+                -std::numeric_limits<bst_float>::max());
+ // start accumulating statistics
+      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
+ iter->BeforeFirst();
+ while (iter->Next()) {
+ const ColBatch &batch = iter->Value();
+ for (bst_uint i = 0; i < batch.size; ++i) {
+ const bst_uint fid = batch.col_index[i];
+ const ColBatch::Inst &c = batch[i];
+ if (c.length != 0) {
+ fminmax[fid * 2 + 0] = std::max(-c[0].fvalue, fminmax[fid * 2 + 0]);
+ fminmax[fid * 2 + 1] = std::max(c[c.length - 1].fvalue, fminmax[fid * 2 + 1]);
+ }
+ }
+ }
+      rabit::Allreduce<rabit::op::Max>(BeginPtr(fminmax), fminmax.size());
+ }
+ // get feature type, 0:empty 1:binary 2:real
+ inline int Type(bst_uint fid) const {
+ utils::Assert(fid * 2 + 1 < fminmax.size(),
+ "FeatHelper fid exceed query bound ");
+ bst_float a = fminmax[fid * 2];
+ bst_float b = fminmax[fid * 2 + 1];
+      if (a == -std::numeric_limits<bst_float>::max()) return 0;
+ if (-a == b) return 1;
+ else return 2;
+ }
+ inline bst_float MaxValue(bst_uint fid) const {
+      return fminmax[fid * 2 + 1];
+ }
+    inline void SampleCol(float p, std::vector<bst_uint> *p_findex) const {
+      std::vector<bst_uint> &findex = *p_findex;
+ findex.clear();
+ for (size_t i = 0; i < fminmax.size(); i += 2) {
+        const bst_uint fid = static_cast<bst_uint>(i / 2);
+ if (this->Type(fid) != 0) findex.push_back(fid);
+ }
+      unsigned n = static_cast<unsigned>(p * findex.size());
+ random::Shuffle(findex);
+ findex.resize(n);
+ // sync the findex if it is subsample
+ std::string s_cache;
+ utils::MemoryBufferStream fc(&s_cache);
+ utils::IStream &fs = fc;
+ if (rabit::GetRank() == 0) {
+ fs.Write(findex);
+ }
+ rabit::Broadcast(&s_cache, 0);
+ fs.Read(&findex);
+ }
+
+ private:
+    std::vector<bst_float> fminmax;
+ };
+ // ------static helper functions ------
+ // helper function to get to next level of the tree
+  /*! \brief helper function for row-based data */
+ inline static int NextLevel(const RowBatch::Inst &inst, const RegTree &tree, int nid) {
+ const RegTree::Node &n = tree[nid];
+ bst_uint findex = n.split_index();
+ for (unsigned i = 0; i < inst.length; ++i) {
+ if (findex == inst[i].index) {
+ if (inst[i].fvalue < n.split_cond()) {
+ return n.cleft();
+ } else {
+ return n.cright();
+ }
+ }
+ }
+ return n.cdefault();
+ }
+ /*! \brief get number of omp thread in current context */
+ inline static int get_nthread(void) {
+ int nthread;
+ #pragma omp parallel
+ {
+ nthread = omp_get_num_threads();
+ }
+ return nthread;
+ }
+ // ------class member helpers---------
+ /*! \brief initialize temp data structure */
+  inline void InitData(const std::vector<bst_gpair> &gpair,
+                       const IFMatrix &fmat,
+                       const std::vector<unsigned> &root_index,
+ const RegTree &tree) {
+ utils::Assert(tree.param.num_nodes == tree.param.num_roots,
+ "TreeMaker: can only grow new tree");
+ {// setup position
+ position.resize(gpair.size());
+ if (root_index.size() == 0) {
+ std::fill(position.begin(), position.end(), 0);
+ } else {
+ for (size_t i = 0; i < position.size(); ++i) {
+ position[i] = root_index[i];
+ utils::Assert(root_index[i] < (unsigned)tree.param.num_roots,
+ "root index exceed setting");
+ }
+ }
+      // mark the deleted rows (negative hessian)
+ for (size_t i = 0; i < position.size(); ++i) {
+ if (gpair[i].hess < 0.0f) position[i] = ~position[i];
+ }
+ // mark subsample
+ if (param.subsample < 1.0f) {
+ for (size_t i = 0; i < position.size(); ++i) {
+ if (gpair[i].hess < 0.0f) continue;
+ if (random::SampleBinary(param.subsample) == 0) position[i] = ~position[i];
+ }
+ }
+ }
+ {// expand query
+ qexpand.reserve(256); qexpand.clear();
+ for (int i = 0; i < tree.param.num_roots; ++i) {
+ qexpand.push_back(i);
+ }
+ this->UpdateNode2WorkIndex(tree);
+ }
+ }
+ /*! \brief update queue expand add in new leaves */
+ inline void UpdateQueueExpand(const RegTree &tree) {
+    std::vector<int> newnodes;
+ for (size_t i = 0; i < qexpand.size(); ++i) {
+ const int nid = qexpand[i];
+ if (!tree[nid].is_leaf()) {
+ newnodes.push_back(tree[nid].cleft());
+ newnodes.push_back(tree[nid].cright());
+ }
+ }
+ // use new nodes for qexpand
+ qexpand = newnodes;
+ this->UpdateNode2WorkIndex(tree);
+ }
+ // return decoded position
+  inline int DecodePosition(bst_uint ridx) const {
+ const int pid = position[ridx];
+ return pid < 0 ? ~pid : pid;
+ }
+ // encode the encoded position value for ridx
+ inline void SetEncodePosition(bst_uint ridx, int nid) {
+ if (position[ridx] < 0) {
+ position[ridx] = ~nid;
+ } else {
+ position[ridx] = nid;
+ }
+ }
+ /*!
+   * \brief helper function that uses the column-based data structure
+   *  to reset the positions to the latest ones
+ * \param nodes the set of nodes that contains the split to be used
+ * \param p_fmat feature matrix needed for tree construction
+ * \param tree the regression tree structure
+ */
+  inline void ResetPositionCol(const std::vector<int> &nodes, IFMatrix *p_fmat, const RegTree &tree) {
+ // set the positions in the nondefault
+ this->SetNonDefaultPositionCol(nodes, p_fmat, tree);
+ // set rest of instances to default position
+    const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
+ // set default direct nodes to default
+    // for leaf nodes that are not fresh, mark them as ~nid,
+ // so that they are ignored in future statistics collection
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+
+ #pragma omp parallel for schedule(static)
+ for (bst_omp_uint i = 0; i < ndata; ++i) {
+ const bst_uint ridx = rowset[i];
+ const int nid = this->DecodePosition(ridx);
+ if (tree[nid].is_leaf()) {
+ // mark finish when it is not a fresh leaf
+ if (tree[nid].cright() == -1) {
+ position[ridx] = ~nid;
+ }
+ } else {
+ // push to default branch
+ if (tree[nid].default_left()) {
+ this->SetEncodePosition(ridx, tree[nid].cleft());
+ } else {
+ this->SetEncodePosition(ridx, tree[nid].cright());
+ }
+ }
+ }
+ }
+ /*!
+   * \brief helper function that uses the column-based data structure
+   *  to update all positions into the non-default branch, if any, ignoring the default branch
+ * \param nodes the set of nodes that contains the split to be used
+ * \param p_fmat feature matrix needed for tree construction
+ * \param tree the regression tree structure
+ */
+  virtual void SetNonDefaultPositionCol(const std::vector<int> &nodes,
+ IFMatrix *p_fmat, const RegTree &tree) {
+ // step 1, classify the non-default data into right places
+    std::vector<bst_uint> fsplits;
+ for (size_t i = 0; i < nodes.size(); ++i) {
+ const int nid = nodes[i];
+ if (!tree[nid].is_leaf()) {
+ fsplits.push_back(tree[nid].split_index());
+ }
+ }
+ std::sort(fsplits.begin(), fsplits.end());
+ fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
+
+    utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(fsplits);
+ while (iter->Next()) {
+ const ColBatch &batch = iter->Value();
+ for (size_t i = 0; i < batch.size; ++i) {
+ ColBatch::Inst col = batch[i];
+ const bst_uint fid = batch.col_index[i];
+        const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
+ #pragma omp parallel for schedule(static)
+ for (bst_omp_uint j = 0; j < ndata; ++j) {
+ const bst_uint ridx = col[j].index;
+ const float fvalue = col[j].fvalue;
+ const int nid = this->DecodePosition(ridx);
+ // go back to parent, correct those who are not default
+ if (!tree[nid].is_leaf() && tree[nid].split_index() == fid) {
+            if (fvalue < tree[nid].split_cond()) {
+ this->SetEncodePosition(ridx, tree[nid].cleft());
+ } else {
+ this->SetEncodePosition(ridx, tree[nid].cright());
+ }
+ }
+ }
+ }
+ }
+ }
+ /*! \brief helper function to get statistics from a tree */
+  template<typename TStats>
+  inline void GetNodeStats(const std::vector<bst_gpair> &gpair,
+ const IFMatrix &fmat,
+ const RegTree &tree,
+ const BoosterInfo &info,
+                           std::vector< std::vector<TStats> > *p_thread_temp,
+                           std::vector<TStats> *p_node_stats) {
+    std::vector< std::vector<TStats> > &thread_temp = *p_thread_temp;
+ thread_temp.resize(this->get_nthread());
+ p_node_stats->resize(tree.param.num_nodes);
+ #pragma omp parallel
+ {
+ const int tid = omp_get_thread_num();
+ thread_temp[tid].resize(tree.param.num_nodes, TStats(param));
+ for (size_t i = 0; i < qexpand.size(); ++i) {
+ const unsigned nid = qexpand[i];
+ thread_temp[tid][nid].Clear();
+ }
+ }
+    const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
+ // setup position
+    const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
+ #pragma omp parallel for schedule(static)
+ for (bst_omp_uint i = 0; i < ndata; ++i) {
+ const bst_uint ridx = rowset[i];
+ const int nid = position[ridx];
+ const int tid = omp_get_thread_num();
+ if (nid >= 0) {
+ thread_temp[tid][nid].Add(gpair, info, ridx);
+ }
+ }
+ // sum the per thread statistics together
+ for (size_t j = 0; j < qexpand.size(); ++j) {
+ const int nid = qexpand[j];
+ TStats &s = (*p_node_stats)[nid];
+ s.Clear();
+ for (size_t tid = 0; tid < thread_temp.size(); ++tid) {
+ s.Add(thread_temp[tid][nid]);
+ }
+ }
+ }
+ /*! \brief common helper data structure to build sketch*/
+ struct SketchEntry {
+ /*! \brief total sum of amount to be met */
+ bst_float sum_total;
+ /*! \brief statistics used in the sketch */
+ bst_float rmin, wmin;
+ /*! \brief last seen feature value */
+ bst_float last_fvalue;
+ /*! \brief current size of sketch */
+ bst_float next_goal;
+ // pointer to the sketch to put things in
+    utils::WXQuantileSketch<bst_float, bst_float> *sketch;
+ // initialize the space
+ inline void Init(unsigned max_size) {
+ next_goal = -1.0f;
+ rmin = wmin = 0.0f;
+ sketch->temp.Reserve(max_size + 1);
+ sketch->temp.size = 0;
+ }
+ /*!
+ * \brief push a new element to sketch
+ * \param fvalue feature value, comes in sorted ascending order
+ * \param w weight
+     * \param max_size maximum number of summary entries kept in the sketch
+ */
+ inline void Push(bst_float fvalue, bst_float w, unsigned max_size) {
+ if (next_goal == -1.0f) {
+ next_goal = 0.0f;
+ last_fvalue = fvalue;
+ wmin = w;
+ return;
+ }
+ if (last_fvalue != fvalue) {
+ bst_float rmax = rmin + wmin;
+ if (rmax >= next_goal) {
+ if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
+ // push to sketch
+          sketch->temp.data[sketch->temp.size] =
+              utils::WXQuantileSketch<bst_float, bst_float>::
+              Entry(rmin, rmax, wmin, last_fvalue);
+ utils::Assert(sketch->temp.size < max_size,
+ "invalid maximum size max_size=%u, stemp.size=%lu\n",
+ max_size, sketch->temp.size);
+ ++sketch->temp.size;
+ }
+ if (sketch->temp.size == max_size) {
+ next_goal = sum_total * 2.0f + 1e-5f;
+          } else {
+            next_goal = static_cast<bst_float>(sketch->temp.size * sum_total / max_size);
+ }
+ }
+ rmin = rmax;
+ wmin = w;
+ last_fvalue = fvalue;
+ } else {
+ wmin += w;
+ }
+ }
+ /*! \brief push final unfinished value to the sketch */
+ inline void Finalize(unsigned max_size) {
+ bst_float rmax = rmin + wmin;
+ if (sketch->temp.size == 0 || last_fvalue > sketch->temp.data[sketch->temp.size-1].value) {
+        utils::Assert(sketch->temp.size <= max_size,
+                      "Finalize: invalid maximum size, max_size=%u, stemp.size=%lu",
+                      max_size, sketch->temp.size);
+ // push to sketch
+        sketch->temp.data[sketch->temp.size] =
+            utils::WXQuantileSketch<bst_float, bst_float>::
+            Entry(rmin, rmax, wmin, last_fvalue);
+ ++sketch->temp.size;
+ }
+ sketch->PushTemp();
+ }
+ };
+ /*! \brief training parameter of tree grower */
+ TrainParam param;
+ /*! \brief queue of nodes to be expanded */
+  std::vector<int> qexpand;
+ /*!
+   * \brief map an active node to its working index offset in qexpand;
+   *  can be -1, which means the node is not actively expanding
+ */
+  std::vector<int> node2workindex;
+ /*!
+ * \brief position of each instance in the tree
+ * can be negative, which means this position is no longer expanding
+ * see also Decode/EncodePosition
+ */
+  std::vector<int> position;
+
+ private:
+ inline void UpdateNode2WorkIndex(const RegTree &tree) {
+ // update the node2workindex
+ std::fill(node2workindex.begin(), node2workindex.end(), -1);
+ node2workindex.resize(tree.param.num_nodes);
+ for (size_t i = 0; i < qexpand.size(); ++i) {
+ node2workindex[qexpand[i]] = static_cast(i);
+ }
+ }
+};
+} // namespace tree
+} // namespace xgboost
+#endif // XGBOOST_TREE_UPDATER_BASEMAKER_INL_HPP_
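The position array above packs two facts into one int: which node a row sits at, and whether the row is still active. Inactive rows (negative hessian, subsampled out, or settled in a stale leaf) store the bitwise complement ~nid, so the node id survives and the sign bit marks activity. A self-contained sketch of the encoding:

    #include <cassert>

    // Active row: position == nid (>= 0). Inactive row: position == ~nid (< 0).
    // Since ~nid == -(nid + 1), the node id is always recoverable.
    inline int DecodePos(int position) { return position < 0 ? ~position : position; }
    inline int Deactivate(int position) { return position < 0 ? position : ~position; }

    int main() {
      int p = 5;          // row currently at node 5, active
      p = Deactivate(p);  // now ~5 == -6, flagged inactive
      assert(DecodePos(p) == 5);
      return 0;
    }
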
diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp
index 2d7c5311e..bbf6242c5 100644
--- a/src/tree/updater_colmaker-inl.hpp
+++ b/src/tree/updater_colmaker-inl.hpp
@@ -14,7 +14,7 @@
namespace xgboost {
namespace tree {
-/*! \brief pruner that prunes a tree after growing finishs */
+/*! \brief column-wise updater to construct a tree */
template<typename TStats>
class ColMaker: public IUpdater {
public:
@@ -36,24 +36,29 @@ class ColMaker: public IUpdater {
Builder builder(param);
builder.Update(gpair, p_fmat, info, trees[i]);
}
+
param.learning_rate = lr;
}
- private:
+ protected:
// training parameter
TrainParam param;
// data structure
/*! \brief per thread x per node entry to store tmp data */
struct ThreadEntry {
- /*! \brief statistics of data*/
+ /*! \brief statistics of data */
TStats stats;
+ /*! \brief extra statistics of data */
+ TStats stats_extra;
/*! \brief last feature value scanned */
float last_fvalue;
+ /*! \brief first feature value scanned */
+ float first_fvalue;
/*! \brief current best solution */
SplitEntry best;
// constructor
    explicit ThreadEntry(const TrainParam &param)
- : stats(param) {
+ : stats(param), stats_extra(param) {
}
};
struct NodeEntry {
@@ -104,7 +109,7 @@ class ColMaker: public IUpdater {
}
}
- private:
+ protected:
// initialize temp data structure
    inline void InitData(const std::vector<bst_gpair> &gpair,
const IFMatrix &fmat,
@@ -127,17 +132,17 @@ class ColMaker: public IUpdater {
      // mark the deleted rows (negative hessian)
for (size_t i = 0; i < rowset.size(); ++i) {
const bst_uint ridx = rowset[i];
- if (gpair[ridx].hess < 0.0f) position[ridx] = -1;
+ if (gpair[ridx].hess < 0.0f) position[ridx] = ~position[ridx];
}
// mark subsample
if (param.subsample < 1.0f) {
for (size_t i = 0; i < rowset.size(); ++i) {
const bst_uint ridx = rowset[i];
if (gpair[ridx].hess < 0.0f) continue;
- if (random::SampleBinary(param.subsample) == 0) position[ridx] = -1;
+ if (random::SampleBinary(param.subsample) == 0) position[ridx] = ~position[ridx];
}
}
- }
+ }
{
// initialize feature index
      unsigned ncol = static_cast<unsigned>(fmat.NumCol());
@@ -148,7 +153,8 @@ class ColMaker: public IUpdater {
}
      unsigned n = static_cast<unsigned>(param.colsample_bytree * feat_index.size());
random::Shuffle(feat_index);
- utils::Check(n > 0, "colsample_bytree is too small that no feature can be included");
+      utils::Check(n > 0, "colsample_bytree=%g is too small, no feature can be included", param.colsample_bytree);
feat_index.resize(n);
}
{// setup temp space for each thread
@@ -219,7 +225,138 @@ class ColMaker: public IUpdater {
}
// use new nodes for qexpand
qexpand = newnodes;
- }
+ }
+ // parallel find the best split of current fid
+ // this function does not support nested functions
+ inline void ParallelFindSplit(const ColBatch::Inst &col,
+ bst_uint fid,
+ const IFMatrix &fmat,
+                                 const std::vector<bst_gpair> &gpair,
+ const BoosterInfo &info) {
+ bool need_forward = param.need_forward_search(fmat.GetColDensity(fid));
+ bool need_backward = param.need_backward_search(fmat.GetColDensity(fid));
+      const std::vector<int> &qexpand = qexpand_;
+ int nthread;
+ #pragma omp parallel
+ {
+ const int tid = omp_get_thread_num();
+        std::vector<ThreadEntry> &temp = stemp[tid];
+ // cleanup temp statistics
+ for (size_t j = 0; j < qexpand.size(); ++j) {
+ temp[qexpand[j]].stats.Clear();
+ }
+ nthread = omp_get_num_threads();
+ bst_uint step = (col.length + nthread - 1) / nthread;
+ bst_uint end = std::min(col.length, step * (tid + 1));
+ for (bst_uint i = tid * step; i < end; ++i) {
+ const bst_uint ridx = col[i].index;
+ const int nid = position[ridx];
+ if (nid < 0) continue;
+ const float fvalue = col[i].fvalue;
+ if (temp[nid].stats.Empty()) {
+ temp[nid].first_fvalue = fvalue;
+ }
+ temp[nid].stats.Add(gpair, info, ridx);
+ temp[nid].last_fvalue = fvalue;
+ }
+ }
+ // start collecting the partial sum statistics
+      bst_omp_uint nnode = static_cast<bst_omp_uint>(qexpand.size());
+ #pragma omp parallel for schedule(static)
+ for (bst_omp_uint j = 0; j < nnode; ++j) {
+ const int nid = qexpand[j];
+ TStats sum(param), tmp(param), c(param);
+ for (int tid = 0; tid < nthread; ++tid) {
+ tmp = stemp[tid][nid].stats;
+ stemp[tid][nid].stats = sum;
+ sum.Add(tmp);
+ if (tid != 0) {
+ std::swap(stemp[tid - 1][nid].last_fvalue, stemp[tid][nid].first_fvalue);
+ }
+ }
+ for (int tid = 0; tid < nthread; ++tid) {
+ stemp[tid][nid].stats_extra = sum;
+ ThreadEntry &e = stemp[tid][nid];
+ float fsplit;
+ if (tid != 0) {
+            if (fabsf(stemp[tid - 1][nid].last_fvalue - e.first_fvalue) > rt_2eps) {
+              fsplit = (stemp[tid - 1][nid].last_fvalue + e.first_fvalue) * 0.5f;
+ } else {
+ continue;
+ }
+ } else {
+ fsplit = e.first_fvalue - rt_eps;
+ }
+ if (need_forward && tid != 0) {
+ c.SetSubstract(snode[nid].stats, e.stats);
+ if (c.sum_hess >= param.min_child_weight && e.stats.sum_hess >= param.min_child_weight) {
+              bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+ e.best.Update(loss_chg, fid, fsplit, false);
+ }
+ }
+ if (need_backward) {
+ tmp.SetSubstract(sum, e.stats);
+ c.SetSubstract(snode[nid].stats, tmp);
+ if (c.sum_hess >= param.min_child_weight && tmp.sum_hess >= param.min_child_weight) {
+            bst_float loss_chg = static_cast<bst_float>(tmp.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+ e.best.Update(loss_chg, fid, fsplit, true);
+ }
+ }
+ }
+ if (need_backward) {
+ tmp = sum;
+ ThreadEntry &e = stemp[nthread-1][nid];
+ c.SetSubstract(snode[nid].stats, tmp);
+ if (c.sum_hess >= param.min_child_weight && tmp.sum_hess >= param.min_child_weight) {
+          bst_float loss_chg = static_cast<bst_float>(tmp.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+ e.best.Update(loss_chg, fid, e.last_fvalue + rt_eps, true);
+ }
+ }
+ }
+ // rescan, generate candidate split
+ #pragma omp parallel
+ {
+ TStats c(param), cright(param);
+ const int tid = omp_get_thread_num();
+        std::vector<ThreadEntry> &temp = stemp[tid];
+        nthread = static_cast<int>(omp_get_num_threads());
+ bst_uint step = (col.length + nthread - 1) / nthread;
+ bst_uint end = std::min(col.length, step * (tid + 1));
+ for (bst_uint i = tid * step; i < end; ++i) {
+ const bst_uint ridx = col[i].index;
+ const int nid = position[ridx];
+ if (nid < 0) continue;
+ const float fvalue = col[i].fvalue;
+ // get the statistics of nid
+ ThreadEntry &e = temp[nid];
+ if (e.stats.Empty()) {
+ e.stats.Add(gpair, info, ridx);
+ e.first_fvalue = fvalue;
+ } else {
+ // forward default right
+            if (fabsf(fvalue - e.first_fvalue) > rt_2eps) {
+ if (need_forward) {
+ c.SetSubstract(snode[nid].stats, e.stats);
+ if (c.sum_hess >= param.min_child_weight && e.stats.sum_hess >= param.min_child_weight) {
+                  bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+ e.best.Update(loss_chg, fid, (fvalue + e.first_fvalue) * 0.5f, false);
+ }
+ }
+ if (need_backward) {
+ cright.SetSubstract(e.stats_extra, e.stats);
+ c.SetSubstract(snode[nid].stats, cright);
+ if (c.sum_hess >= param.min_child_weight && cright.sum_hess >= param.min_child_weight) {
+                  bst_float loss_chg = static_cast<bst_float>(cright.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
+ e.best.Update(loss_chg, fid, (fvalue + e.first_fvalue) * 0.5f, true);
+ }
+ }
+ }
+ e.stats.Add(gpair, info, ridx);
+ e.first_fvalue = fvalue;
+ }
+ }
+ }
+ }
// enumerate the split values of specific feature
inline void EnumerateSplit(const ColBatch::Entry *begin,
const ColBatch::Entry *end,
@@ -273,6 +410,42 @@ class ColMaker: public IUpdater {
}
}
}
+ // update the solution candidate
+ virtual void UpdateSolution(const ColBatch &batch,
+                                const std::vector<bst_gpair> &gpair,
+ const IFMatrix &fmat,
+ const BoosterInfo &info) {
+ // start enumeration
+      const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
+ #if defined(_OPENMP)
+      const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
+ #endif
+ int poption = param.parallel_option;
+ if (poption == 2) {
+ poption = nsize * 2 < nthread ? 1 : 0;
+ }
+ if (poption == 0) {
+ #pragma omp parallel for schedule(dynamic, batch_size)
+ for (bst_omp_uint i = 0; i < nsize; ++i) {
+ const bst_uint fid = batch.col_index[i];
+ const int tid = omp_get_thread_num();
+ const ColBatch::Inst c = batch[i];
+ if (param.need_forward_search(fmat.GetColDensity(fid))) {
+ this->EnumerateSplit(c.data, c.data + c.length, +1,
+ fid, gpair, info, stemp[tid]);
+ }
+ if (param.need_backward_search(fmat.GetColDensity(fid))) {
+ this->EnumerateSplit(c.data + c.length - 1, c.data - 1, -1,
+ fid, gpair, info, stemp[tid]);
+ }
+ }
+ } else {
+ for (bst_omp_uint i = 0; i < nsize; ++i) {
+ this->ParallelFindSplit(batch[i], batch.col_index[i],
+ fmat, gpair, info);
+ }
+ }
+ }
// find splits at current level, do split per level
inline void FindSplit(int depth,
                          const std::vector<int> &qexpand,
@@ -289,66 +462,76 @@ class ColMaker: public IUpdater {
}
      utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(feat_set);
while (iter->Next()) {
- const ColBatch &batch = iter->Value();
- // start enumeration
-        const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
- #if defined(_OPENMP)
-        const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
- #endif
- #pragma omp parallel for schedule(dynamic, batch_size)
- for (bst_omp_uint i = 0; i < nsize; ++i) {
- const bst_uint fid = batch.col_index[i];
- const int tid = omp_get_thread_num();
- const ColBatch::Inst c = batch[i];
- if (param.need_forward_search(p_fmat->GetColDensity(fid))) {
- this->EnumerateSplit(c.data, c.data + c.length, +1,
- fid, gpair, info, stemp[tid]);
+ this->UpdateSolution(iter->Value(), gpair, *p_fmat, info);
+ }
+ // after this each thread's stemp will get the best candidates, aggregate results
+ this->SyncBestSolution(qexpand);
+ // get the best result, we can synchronize the solution
+ for (size_t i = 0; i < qexpand.size(); ++i) {
+ const int nid = qexpand[i];
+ NodeEntry &e = snode[nid];
+ // now we know the solution in snode[nid], set split
+ if (e.best.loss_chg > rt_eps) {
+ p_tree->AddChilds(nid);
+ (*p_tree)[nid].set_split(e.best.split_index(), e.best.split_value, e.best.default_left());
+ // mark right child as 0, to indicate fresh leaf
+ (*p_tree)[(*p_tree)[nid].cleft()].set_leaf(0.0f, 0);
+ (*p_tree)[(*p_tree)[nid].cright()].set_leaf(0.0f, 0);
+ } else {
+ (*p_tree)[nid].set_leaf(e.weight * param.learning_rate);
+ }
+ }
+ }
+ // reset position of each data points after split is created in the tree
+    inline void ResetPosition(const std::vector<int> &qexpand, IFMatrix *p_fmat, const RegTree &tree) {
+ // set the positions in the nondefault
+ this->SetNonDefaultPosition(qexpand, p_fmat, tree);
+ // set rest of instances to default position
+      const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
+ // set default direct nodes to default
+      // for leaf nodes that are not fresh, mark them as ~nid,
+ // so that they are ignored in future statistics collection
+      const bst_omp_uint ndata = static_cast<bst_omp_uint>